Example #1
File: Att.py Project: KuoHaoZeng/VH
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    test_data = val_data
    ixtoword = pd.Series(np.load('./data'+str(gpu_id)+'/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0,
            bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
    
    # scale the LSTM weight matrices by (1 - 0.5), presumably to compensate for the
    # dropout (drop_out_rate=0.5) that was enabled during training
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    
    [pred_sent, gt_sent] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, caption_tf)
    #np.savez('Att_result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
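Note: every example on this page ends up calling COCOScorer().score(gts, samples, ids) with the same data layout: two dicts keyed by video id, each value a list of {'image_id', 'caption'} entries, plus the list of ids to evaluate. A minimal standalone sketch of that layout is below; the cocoeval import path is an assumption (each project above ships its own copy of the scorer), and some versions additionally return a per-category SPICE score, as in Example #11.

from cocoeval import COCOScorer  # import path is an assumption; each project bundles its own scorer

# ground-truth references: one dict per reference caption, grouped by video id
gts = {
    'vid1': [{'image_id': 'vid1', 'caption': 'a man is playing a guitar'},
             {'image_id': 'vid1', 'caption': 'someone plays a guitar'}],
}
# model predictions: a single generated caption per video id
samples = {
    'vid1': [{'image_id': 'vid1', 'caption': 'a man plays a guitar'}],
}

scorer = COCOScorer()
scores = scorer.score(gts, samples, list(gts.keys()))  # metrics such as Bleu_4, METEOR, CIDEr
print(scores)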
Example #2
File: HS.py Project: KuoHaoZeng/VH
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    #test_data = train_data
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0,
            bias_init_vector=None)

    video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    for ind, row in enumerate(lstmRNN_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

    [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
    np.savez('HS_result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent,mp=mp,HLness=HLness)
    total_score = np.mean(mp)
    print model_path.split('/')[1]+' mAP: ' + str(total_score)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
Example #3
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    #    test_data = val_data   # to evaluate on testing data or validation data
    ixtoword = pd.Series(np.load(home_folder + 'data0/ixtoword.npy').tolist())

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(ixtoword),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    drop_out_rate=0,
                                    bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator(
    )
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.multiply(row, 1 - 0.5))
            sess.run(assign_op)

    [pred_sent, gt_sent, id_list, gt_dict,
     pred_dict] = testing_all(sess, test_data, ixtoword, video_tf,
                              video_mask_tf, caption_tf)
    #np.savez('Att_result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent)
    scorer = COCOScorer()
    total_score = scorer.score(gt_dict, pred_dict, id_list)
    return total_score
Example #4
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    ip_json = open(opt['input_json'])
    gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    ip_json.close()
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # forward the model to get loss
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()

        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2,
                                         video_feat,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    return valid_score
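Examples #4, #5, #8, and #19 build the ground-truth dict through a convert_data_to_coco_scorer_format helper that is not reproduced on this page. A hypothetical sketch of what such a helper has to do, under the assumption that the json_normalize'd 'sentences' records carry video_id and caption fields:

def convert_data_to_coco_scorer_format(data_frame):
    # group the flattened caption rows into the ground-truth layout COCOScorer.score expects
    gts = {}
    for row in data_frame.itertuples():
        vid = row.video_id  # assumed field name in the 'sentences' records
        gts.setdefault(vid, []).append({'image_id': vid, 'caption': row.caption})  # 'caption' is also assumed
    return gts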
Example #5
def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # forward the model to get loss
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    validation_file_name = opt['model_directory'].split('/')[-1]+'_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1]+': '+json.dumps(results[0]) + "\n")
Example #6
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for ID in engine.val_data_ids:
            vidID, capID = ID.split('|')
            words = engine.get_cap_tokens(vidID, int(capID), mode='val')
            caption = ' '.join(words)
            if vidID in gts_valid:
                gts_valid[vidID].append({'image_id': vidID, 'caption': caption, 'cap_id': capID})
            else:
                gts_valid[vidID] = [{'image_id': vidID, 'caption': caption, 'cap_id': capID}]
        valid_score = scorer.score(gts_valid, samples_valid, gts_valid.keys())
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for ID in engine.test_data_ids:
            vidID, capID = ID.split('|')
            words = engine.get_cap_tokens(vidID, int(capID), mode='test')
            caption = ' '.join(words)
            if vidID in gts_test:
                gts_test[vidID].append({'image_id': vidID, 'caption': caption, 'cap_id': capID})
            else:
                gts_test[vidID] = [{'image_id': vidID, 'caption': caption, 'cap_id': capID}]
        test_score = scorer.score(gts_test, samples_test, gts_test.keys())
    else:
        test_score = None
    return valid_score, test_score
Example #7
def score_with_cocoeval(samples_test, engine, ids):
    scorer = COCOScorer()
    gts_test = OrderedDict()
    for vidID in ids:
        gts_test[vidID] = engine.CAP[vidID]
    test_score = scorer.score(gts_test, samples_test, ids)
    return test_score
Example #8
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # forward the model to get loss
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()

        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4,
                                         audio_fc2,
                                         sem_feats,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"],
                           'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
    '''
    return valid_score
Example #9
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for vidID in engine.valid_ids:
            gts_valid[vidID] = engine.CAP[vidID]
        valid_score = scorer.score(gts_valid, samples_valid, engine.valid_ids)
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for vidID in engine.test_ids:
            gts_test[vidID] = engine.CAP[vidID]
        test_score = scorer.score(gts_test, samples_test, engine.test_ids)
    else:
        test_score = None
    return valid_score, test_score
Example #10
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for vidID in engine.valid_ids:
            gts_valid[vidID] = engine.CAP[vidID]
        valid_score = scorer.score(gts_valid, samples_valid, engine.valid_ids)
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for vidID in engine.test_ids:
            gts_test[vidID] = engine.CAP[vidID]
        test_score = scorer.score(gts_test, samples_test, engine.test_ids)
    else:
        test_score = None
    return valid_score, test_score
Example #11
def evaluate(opt, net, eval_range, prediction_txt_path, reference):
    eval_loader = get_eval_loader(eval_range, opt.feature_h5_path,
                                  opt.region_feature_h5_path,
                                  opt.test_batch_size)

    result = {}
    for i, (frames, regions, spatials,
            video_ids) in tqdm(enumerate(eval_loader)):
        frames = frames.to(DEVICE)
        regions = regions.to(DEVICE)
        spatials = spatials.to(DEVICE)

        outputs, _ = net(frames, regions, spatials, None)
        for (tokens, vid) in zip(outputs, video_ids):
            if opt.use_multi_gpu:
                s = net.module.decoder.decode_tokens(tokens.data)
            else:
                s = net.decoder.decode_tokens(tokens.data)
            result[vid] = s

    with open(prediction_txt_path, 'w') as f:
        for vid, s in result.items():
            f.write('%d\t%s\n' % (vid, s))

    prediction_json = convert_prediction(prediction_txt_path)

    # compute scores
    scorer = COCOScorer()
    with suppress_stdout_stderr():
        scores, sub_category_score = scorer.score(reference, prediction_json,
                                                  prediction_json.keys())
    for metric, score in scores.items():
        print('%s: %.6f' % (metric, score * 100))

    if sub_category_score is not None:
        print('Sub Category Score in Spice:')
        for category, score in sub_category_score.items():
            print('%s: %.6f' % (category, score * 100))
    return scores
Example #12
def score_with_cocoeval(samples_valid, samples_test, valid, test):
    scorer = COCOScorer()

    if samples_valid:

        gts_valid, hypo_valid, valid_ids = make_template(samples_valid, valid)
        print 'compute validation set score:'
        valid_score = scorer.score(gts_valid, hypo_valid, valid_ids)

    else:
        valid_score = None

    if samples_test:

        gts_test, hypo_test, test_ids = make_template(samples_test, test)
        print 'compute test set score:'
        test_score = scorer.score(gts_test, hypo_test, test_ids)

    else:
        test_score = None

    return valid_score, test_score
Example #13
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    test_data = val_data
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            dim_tracker=dim_tracker,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            tracker_cnt=tracker_cnt,
            drop_out_rate=0,
            bias_init_vector=None)

    video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    for ind, row in enumerate(lstm1_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    for ind, row in enumerate(lstm2_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

#    [pred_sent, gt_sent] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
#    scorer = COCOScorer()
#    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, test_data, ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
    np.savez('result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent,fname=fnamelist)
    scorer = COCOScorer()
    total_score = scorer.score(gt_dict, pred_dict, id_list)
    return total_score
Example #14
def get_demo_sentence(sess, n_steps, ixtoword, caption_tf, name_tf,
                      result_file):
    [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
     fname_list] = testing_all(sess, n_steps, ixtoword, caption_tf, name_tf)
    scorer = COCOScorer()
    scores = scorer.score(gt_dict, pred_dict, id_list, return_img_score=True)
    bleus = []
    for i, idx in enumerate(id_list):
        bleus.append((scores[idx]['Bleu_4'], fname_list[i], idx))
    sorted_bleus = sorted(bleus, key=lambda x: x[0], reverse=True)
    video_names = []
    with open(result_file, 'w') as result:
        for i in xrange(40):
            fname = sorted_bleus[i][1]
            video_names.append(fname)
            idx = sorted_bleus[i][2]
            result.write(fname + '\n')
            for ele in gt_dict[idx]:
                result.write('GT: ' + ele['caption'] + '\n')
            result.write('PD: ' + pred_dict[idx][0]['caption'] + '\n\n\n')
        print 'result saved to', result_file
    with open(result_file + '.videos', "wb") as fp:
        pickle.dump(video_names, fp)
        print 'video names saved to', result_file + '.videos'
Example #15
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

#    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
#    np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
#    sys.exit()
    ixtoword=pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix=pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            dim_tracker=dim_tracker,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            tracker_cnt=tracker_cnt,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask= model.build_model()
    #loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = int(3)
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        ## init queue
        data_queue = mp.Queue(nr_prefetch)
#        tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
#        t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
#        t2.start()
        t3.start()
        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))

#            current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch_data[:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                        continue
                current_video_masks[ind,idx[-1]] = 1

            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            current_batch.close()


            _, loss_val= sess.run(
                [train_op, tf_loss],
                feed_dict={
                tf_video: current_feats,
                tf_video_mask : current_video_masks,
                tf_tracker : current_tracker,
                tf_tracker_mask : current_tracker_mask,
                tf_caption: current_caption_matrix,
                tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        t1.join()
#       t2.join()
        t3.join()
        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
#            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
#            for key in pred_dict.keys():
#                for ele in gt_dict[key]:
#                    print "GT:  " + ele['caption']
#                print "PD:  " + pred_dict[key][0]['caption']
#                print '-------'

            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)

            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print "Finally, saving the model ..."
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
Example #16
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test
    )
    captions = meta_data["Description"].values
    captions = map(lambda x: x.replace(".", ""), captions)
    captions = map(lambda x: x.replace(",", ""), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save("./data" + str(gpu_id) + "/ixtoword", ixtoword)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0.5,
        bias_init_vector=None,
    )

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/tf_log", sess.graph_def)

    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, "models_SS_youtube_notest_dummy/model-20")

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch["data"][:, ind, :]
                idx = np.where(current_batch["label"][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, idx[-1]] = 1

            current_captions = current_batch["title"]
            current_caption_ind = map(
                lambda cap: [wordtoix[word] for word in cap.lower().split(" ") if word in wordtoix], current_captions
            )

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding="post", maxlen=16 - 1)
            current_caption_matrix = np.hstack(
                [current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]
            ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))

            for ind, row in enumerate(current_caption_masks):
                row[: nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                [train_op, tf_loss, merged],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks,
                },
            )
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            # print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            # print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 10) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, "model"), global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load("./data" + str(gpu_id) + "/ixtoword.npy").tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT:  " + gt_sent[idx][0]["caption"]
                print "PD:  " + pred_sent[idx][0]["caption"]
                print "-------"
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, "model"), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
Example #17
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

#    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
#    np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
#    sys.exit()
    ixtoword=pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix=pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            dim_tracker=dim_tracker,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            tracker_cnt=tracker_cnt,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask= model.build_model()
    #loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = int(3)
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        ## init queue
        data_queue = mp.Queue(nr_prefetch)
#        tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
#        t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
#        t2.start()
        t3.start()
        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))

#            current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in range(batch_size):
                current_feats[ind,:,:] = current_batch_data[:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                        continue
                current_video_masks[ind,idx[-1]] = 1

            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            current_batch.close()


            _, loss_val= sess.run(
                [train_op, tf_loss],
                feed_dict={
                tf_video: current_feats,
                tf_video_mask : current_video_masks,
                tf_tracker : current_tracker,
                tf_tracker_mask : current_tracker_mask,
                tf_caption: current_caption_matrix,
                tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        t1.join()
#       t2.join()
        t3.join()
        print ("Epoch:", epoch, " done. Loss:", np.mean(loss_epoch))
        tStop_epoch = time.time()
        print ("Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s")
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print ("Epoch ", epoch, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
#            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
#            for key in pred_dict.keys():
#                for ele in gt_dict[key]:
#                    print "GT:  " + ele['caption']
#                print "PD:  " + pred_dict[key][0]['caption']
#                print '-------'

            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)

            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print ("Finally, saving the model ...")
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s")
Example #18
File: Att.py Project: KuoHaoZeng/VH
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask= model.build_model()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models_Att_update_new/model-30')

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch['data'][:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind,:idx[-1]+1] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=16-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val = sess.run(
                    [train_op, tf_loss],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask : current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks
                        })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"

        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device("/cpu:0"):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data'+str(gpu_id)+'/ixtoword.npy').tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT:  " + gt_sent[idx][0]['caption']
                print "PD:  " + pred_sent[idx][0]['caption']
                print '-------'
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
        sys.stdout.flush()

    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
Example #19
def test(saved_model=''):
    scorer = COCOScorer()
    ixtoword = pd.Series(np.load(cfg.vocab_path + 'ixtoword.npy').tolist())
    combine_features = load_flickr30k_features if cfg.id == "Flickr30k" else load_msr_vtt_features

    model = s2vt(dim_image=cfg.dim_image,
                 n_words=len(ixtoword),
                 dim_hidden=cfg.dim_hidden,
                 batch_size=cfg.batch_size,
                 n_frame_steps=cfg.n_frame_step,
                 n_lstm_steps=cfg.n_lstm_step,
                 dim_word_emb=cfg.dim_word_emb,
                 cell_clip=cfg.cell_clip,
                 forget_bias=cfg.forget_bias,
                 input_keep_prob=cfg.input_keep_prob,
                 output_keep_prob=cfg.output_keep_prob,
                 bias_init_vector=None)

    _, video_tf, caption_tf, _, _ = model.build_model("inference")
    session = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=gpu_options))
    saver = tf.train.Saver()
    saver.restore(session, saved_model)

    if cfg.id == "Flickr30k":
        _, _, test_data = get_flickr30k_data(cfg)
    elif cfg.id == "MSR-VTT":
        _, _, test_data = get_msr_vtt_data(cfg)

    splits = []

    splits.append((test_data['video_path'].unique(), test_data))
    results = []
    for split, gt_dataframe in splits:
        gts = convert_data_to_coco_scorer_format(gt_dataframe)
        samples = {}
        for start, end in zip(
                range(0, len(split), cfg.batch_size),
                range(cfg.batch_size,
                      len(split) + cfg.batch_size, cfg.batch_size)):

            current_batch = split[start:end]
            current_feats = np.zeros(
                (cfg.batch_size, cfg.n_frame_step, cfg.dim_image))
            current_feats_vals = [
                combine_features(vid) for vid in current_batch
            ]

            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat

            generated_word_index = session.run(
                caption_tf, feed_dict={video_tf: current_feats})
            generated_word_index = np.asarray(generated_word_index).transpose()
            periods = np.argmax(generated_word_index == 0, axis=1) + 1
            periods[
                periods ==
                0] = cfg.n_lstm_step  #take the whole sequence if a period was not produced
            for i in range(len(current_batch)):
                generated_sentence = ' '.join(
                    ixtoword[generated_word_index[i, :periods[i] - 1]])
                video_id = current_batch[i].split("/")[-1].split("_")[
                    0]  #+ ".jpg"
                samples[video_id] = [{
                    u'image_id': video_id,
                    u'caption': generated_sentence
                }]

        with suppress_stdout_stderr():
            valid_score = scorer.score(gts, samples, samples.keys())
        results.append(valid_score)
        print valid_score

    print len(samples)
    if not os.path.exists(cfg.results_path):
        os.makedirs(cfg.results_path)

    with open(cfg.results_path + "scores.txt", 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(cfg.results_path + saved_model.split("/")[-1] + ".json",
              'w') as prediction_results:
        json.dump({
            "predictions": samples,
            "scores": valid_score
        }, prediction_results)
Example #20
def train():
    assert os.path.isdir(home_folder)
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    print 'load meta data...'
    wordtoix = np.load(home_folder + 'data0/wordtoix.npy').tolist()
    print 'build model and session...'
    # place shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps
    # preprocessing on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1 = \
            tf.train.batch([val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1], batch_size=batch_size, num_threads=1, capacity=2*batch_size)
    # operation on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video, tf_output_semantic = model.build_model(
            train_data, train_video_label, train_caption_id,
            train_caption_id_1, train_caption_label)
        val_caption_tf, val_lstm3_variables_tf = model.build_sent_generator(
            val_data, val_video_label)
        val_video_tf, val_lstm4_variables_tf = model.build_video_generator(
            val_caption_id_1)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # check for model file
    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())
    # train on the GPU
    with tf.device("/gpu:0"):
        #        train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_loss)
        # when variable is not related to the loss, grad returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in gvs if grad is not None]
        train_op = optimizer.apply_gradients(gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        _, loss_val, loss_cap, loss_lat, loss_vid, sem = sess.run([
            train_op, tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video,
            tf_output_semantic
        ])
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap, "loss_lat:", loss_lat, "loss_vid:", loss_vid
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val

        if step % n_epoch_steps == 0:
            epoch = step / n_epoch_steps
            loss_epoch /= n_epoch_steps
            with tf.device("/cpu:0"):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)
            print 'epoch:', epoch, 'loss:', loss_epoch, 'loss_cap:', loss_cap, 'loss_lat:', loss_lat, 'loss_vid:', loss_vid
            print 'sem:', sem[0, :10]
            loss_epoch = 0
            ######### test sentence generation ##########
            ixtoword = pd.Series(
                np.load(home_folder + 'data0/ixtoword.npy').tolist())
            n_val_steps = int(n_val_samples / batch_size)
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, 1, ixtoword, val_caption_tf,
                                      val_fname)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, n_val_steps, ixtoword,
                                      val_caption_tf, val_fname)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)
            ######### test video generation #############
            mse = test_all_videos(sess, n_val_steps, val_data, val_video_tf)
            sys.stdout.flush()

        sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
Example #21
def train():
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    assert os.path.isfile(wordtoix_file)
    assert os.path.isfile(ixtoword_file)
    assert os.path.isfile(bias_init_vector_file)
    assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep']
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    bias_init_vector = np.load(bias_init_vector_file)
    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=bias_init_vector)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps
    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2* batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, _ = model.build_v2s_generator(val_data)
        val_s2s_tf, _ = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, _ = model.build_s2v_generator(val_caption_id_1)
        val_v2v_tf, _ = model.build_v2v_generator(val_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # check for model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    global_step = 0
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        #        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        global_step = get_model_step(ckpt.model_checkpoint_path)
        print 'global_step:', global_step
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())
    # train on the GPU
    with tf.device("/gpu:0"):
        ## 1. weight decay
        for var in tf.trainable_variables():
            decay_loss = tf.multiply(tf.nn.l2_loss(var),
                                     0.0004,
                                     name='weight_loss')
            tf.add_to_collection('losses', decay_loss)
        tf.add_to_collection('losses', tf_loss)
        tf_total_loss = tf.add_n(tf.get_collection('losses'),
                                 name='total_loss')
        ## 2. gradient clip
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_total_loss)
        # when variable is not related to the loss, grad returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in gvs if grad is not None]
        for grad, var in gvs:
            if grad is not None:
                tf.summary.histogram(var.name + '/grad', grad)
                tf.summary.histogram(var.name + '/data', var)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    loss_epoch_cap = 0
    loss_epoch_vid = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ##### add summaries ######
    tf.summary.histogram('video_h', tf_v_h)
    tf.summary.histogram('sent_h', tf_s_h)
    tf.summary.scalar('loss_vid', tf_loss_vid)
    tf.summary.scalar('loss_lat', tf_loss_lat)
    tf.summary.scalar('loss_caption', tf_loss_cap)
    #    for var in tf.trainable_variables():
    #        summaries.append(tf.histogram_summary(var.op.name, var))
    summary_op = tf.summary.merge_all()
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)
    epoch = global_step
    video_label = sess.run(train_video_label)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        if drop_strategy == 'keep':
            drop_type = 0
        elif drop_strategy == 'block_sentence':
            drop_type = 1
        elif drop_strategy == 'block_video':
            drop_type = 2
        else:
            drop_type = random.randint(0, 2)
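        # drop_type selects which modality build_model blocks on this step, following the
        # strategy names above: 0 keeps both inputs, 1 blocks the sentence branch,
        # 2 blocks the video branch (the exact handling lives inside model.build_model).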

        _, loss_val, loss_cap, loss_lat, loss_vid = sess.run(
            [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid],
            feed_dict={tf_drop_type: drop_type})
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val
        loss_epoch_cap += loss_cap
        loss_epoch_vid += loss_vid

        if step % n_epoch_steps == 0:
            #        if step % 3 == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            loss_epoch_cap /= n_epoch_steps
            loss_epoch_vid /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)


#            print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, "loss_lat:", loss_lat, "loss_vid:", loss_epoch_vid
            loss_epoch = 0
            loss_epoch_cap = 0
            loss_epoch_vid = 0
            ######### test sentence generation ##########
            n_val_steps = int(n_val_samples / batch_size)
            #            n_val_steps = 3
            ### TODO: sometimes COCO test show exceptions in the beginning of training ####
            if test_v2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict,
                     pred_dict] = testing_all(sess, 1, ixtoword, val_v2s_tf,
                                              val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT:  " + ele['caption']
                        print "PD:  " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## video to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict,
                     pred_dict] = testing_all(sess, n_val_steps, ixtoword,
                                              val_v2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## video to sentence result #################'
                except Exception, e:
                    print 'epoch:', epoch, 'v2s Bleu test exception'

            if test_s2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict,
                     pred_dict] = testing_all(sess, 1, ixtoword, val_s2s_tf,
                                              val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT:  " + ele['caption']
                        print "PD:  " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## sentence to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict,
                     pred_dict] = testing_all(sess, n_val_steps, ixtoword,
                                              val_s2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## sentence to sentence result #################'
                except Exception, e:
                    print 'epoch', epoch, 's2s Bleu test exception'

            ######### test video generation #############
            if test_v2v:
                mse_v2v = test_all_videos(sess, n_val_steps, val_data,
                                          val_v2v_tf, val_video_label, None)
                print 'epoch', epoch, 'video2video mse:', mse_v2v
            if test_s2v:
                mse_s2v = test_all_videos(sess, n_val_steps, val_data,
                                          val_s2v_tf, val_video_label, None)
                print 'epoch', epoch, 'caption2video mse:', mse_s2v
            sys.stdout.flush()

            ###### summary ######
            if epoch % 2 == 0:
                summary = sess.run(summary_op)
                summary_writer.add_summary(summary, epoch)
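
The weight-decay and gradient-clipping setup used above can be isolated into a small helper. The sketch below is not part of the repository; it restates the same pattern under TensorFlow 1.x, and the default values for learning_rate, decay and clip_norm are placeholders rather than the project's settings.

import tensorflow as tf

def build_train_op(base_loss, learning_rate=1e-4, decay=0.0004, clip_norm=10.0):
    # add an L2 penalty for every trainable variable, then sum with the task loss
    for var in tf.trainable_variables():
        tf.add_to_collection('losses', tf.multiply(tf.nn.l2_loss(var), decay))
    tf.add_to_collection('losses', base_loss)
    total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
    # clip each gradient by its norm; variables unrelated to the loss come back with grad None
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gvs = optimizer.compute_gradients(total_loss)
    clipped = [(tf.clip_by_norm(g, clip_norm), v) for g, v in gvs if g is not None]
    return optimizer.apply_gradients(clipped)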
Example #22
0
File: HS.py Project: KuoHaoZeng/VH
def train():
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save('./data/ixtoword', ixtoword)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_video_len, tf_caption, tf_caption_mask, tf_HLness, tf_HLness_mask, tf_HLness_att_mask= model.build_model()
    loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession()
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_HLness = np.zeros((batch_size, n_frame_step))
            current_HLness_masks = np.zeros((batch_size, n_frame_step))
            current_HLness_att_masks = np.zeros((batch_size, n_frame_step))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch['data'][:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                idy = np.where(current_batch['label'][:,ind] == 1)[0]
                if len(idy) == 0:
                    continue
                current_HLness[ind,idx] = current_batch['label'][idx,ind]
                current_HLness_masks[ind,idx] = 1
                current_video_masks[ind,idy[-1]] = 1
                current_video_len[ind] = idx[-1] + 1
                current_HLness_att_masks[ind,idy] = 1
                if idy[0] > 4:
                    current_HLness_att_masks[ind,idy[0]-5:idy[0]] = 1
                else:
                    current_HLness_att_masks[ind,0:idy[0]] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=15-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                    [train_op, tf_loss, merged],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask: current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks,
                        tf_HLness: current_HLness,
                        tf_HLness_mask: current_HLness_masks,
                        tf_HLness_att_mask: current_HLness_att_masks
                        })
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"

        if np.mod(epoch, 20) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

            current_batch = h5py.File(test_data[np.random.randint(0,len(test_data))])
            video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
            #[mp, pred_sent, gt_sent, HLness] = testing_one(sess, current_batch, ixtoword,video_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            #for xxx in xrange(current_batch['label'].shape[1]):
            #    print gt_sent[xxx]
            #    print pred_sent[xxx]
            total_score = np.mean(mp)
            print total_score
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
Example #23
0
def test(model_path=None,
         video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples):
    #    test_data = val_data   # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data],
                batch_size=batch_size, num_threads=1, capacity=2* batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, v2s_lstm3_vars_tf = model.build_v2s_generator(val_data)
        val_s2s_tf, s2s_lstm2_vars_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(
            val_caption_id_1)
        val_s2v_tf, s2v_lstm2_vars_tf, s2v_lstm4_vars_tf = model.build_s2v_generator(
            val_caption_id_1, val_frame_data)
        val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator(
            val_data, val_frame_data)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path


#    print 'halve the dropout weights..'
#    for ind, row in enumerate(v2s_lstm3_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2s_lstm2_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2v_lstm4_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes COCO test show exceptions in the beginning of training ####
    if test_v2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT:  " + ele['caption']
            print "PD:  " + pred_dict[key][0]['caption']
            print '-------'
        print '############## video to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         _] = testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_1 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## video to sentence result #################'

    if test_s2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT:  " + ele['caption']
            print "PD:  " + pred_dict[key][0]['caption']
            print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'

    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_frame_data,
                                  val_v2v_tf, val_video_label,
                                  pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_frame_data,
                                  val_s2v_tf, val_video_label,
                                  pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_v2s_tf,
                          val_fname,
                          result_file=home_folder + 'demo_v2s.txt')
    if save_demo_sent_s2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_s2s_tf,
                          val_fname,
                          result_file=home_folder + 'demo_s2s.txt')
    if save_demo_video_v2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_v2v_tf,
                       val_video_label, val_fname, home_folder + 'demo_v2v/',
                       pixel_scale_factor)
    if save_demo_video_s2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_s2v_tf,
                       val_video_label, val_fname, home_folder + 'demo_s2v/',
                       pixel_scale_factor)

    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
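
Both test routines above drive their input pipelines with a coordinator and queue-runner threads. The helper below is a minimal sketch of that pattern, not a function from the repository: start the queue threads, run a fixed number of batched fetches, and always shut the threads down, even if a step fails.

import tensorflow as tf

def run_queued_eval(sess, fetch_op, n_steps):
    # n_steps would typically be n_test_samples / batch_size, as above
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    outputs = []
    try:
        for _ in range(n_steps):
            outputs.append(sess.run(fetch_op))
    finally:
        coord.request_stop()
        coord.join(threads)
    return outputs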
Example #24
0
    def sampling( self, condition_net, image_encoder, image_generator, 
                gen_in_layer, gen_out_layer, start_code, 
                n_iters, lr, lr_end, threshold, 
                layer, conditions, #units=None, xy=0, 
                epsilon1=1, epsilon2=1, epsilon3=1e-10,
                inpainting=None, # in-painting args
                output_dir=None, reset_every=0, save_every=1, n_gram='Bleu_1'):

        # Get the input and output sizes
        image_shape = condition_net.blobs['data'].data.shape
        generator_output_shape = image_generator.blobs[gen_out_layer].data.shape
        encoder_input_shape = image_encoder.blobs['data'].data.shape

        # Calculate the difference between the input image of the condition net 
        # and the output image from the generator
        image_size = util.get_image_size(image_shape)
        generator_output_size = util.get_image_size(generator_output_shape)
        encoder_input_size = util.get_image_size(encoder_input_shape)

        # The top left offset to crop the output image to get a 227x227 image
        topleft = util.compute_topleft(image_size, generator_output_size)
        topleft_DAE = util.compute_topleft(encoder_input_size, generator_output_size)

        src = image_generator.blobs[gen_in_layer]     # the input feature layer of the generator
        
        # Make sure the layer size and initial vector size match
        assert src.data.shape == start_code.shape

        # Variables to store the best sample
        last_xx = np.zeros(image_shape)    # best image
        last_prob = -sys.maxint                 # highest probability 

        h = start_code.copy()

        condition_idx = 0 
        list_samples = []
        i = 0
        scorer = COCOScorer()
        while True:

            step_size = lr + ((lr_end - lr) * i) / n_iters
            condition = conditions[condition_idx]  # Select a class

            # 1. Compute the epsilon1 term ---
            # compute gradient d log(p(h)) / dh per DAE results in Alain & Bengio 2014
            d_prior = self.h_autoencoder_grad(h=h, encoder=image_generator, decoder=image_encoder, gen_out_layer=gen_out_layer, topleft=topleft_DAE, inpainting=inpainting)

            # 2. Compute the epsilon2 term ---
            # Push the code through the generator to get an image x
            image_generator.blobs["feat"].data[:] = h
            generated = image_generator.forward()
            x = generated[gen_out_layer].copy()       # 256x256

            # Crop from 256x256 to 227x227
            cropped_x = x[:,:,topleft[0]:topleft[0]+image_size[0], topleft[1]:topleft[1]+image_size[1]]
            cropped_x_copy = cropped_x.copy()
            
            if inpainting is not None:
                cropped_x = util.apply_mask(img=cropped_x, mask=inpainting['mask'], context=inpainting['image'])

            # Forward pass the image x to the condition net up to an unit k at the given layer
            # Backprop the gradient through the condition net to the image layer to get a gradient image 
            d_condition_x, prob, info = self.forward_backward_from_x_to_condition(net=condition_net, end=layer, image=cropped_x, condition=condition, scorer=scorer, n_gram=n_gram) 

            if inpainting is not None:
                # Mask out the class gradient image
                d_condition_x[:] *= inpainting["mask"]

                # An additional objective for matching the context image
                d_context_x256 = np.zeros_like(x.copy())
                d_context_x256[:,:,topleft[0]:topleft[0]+image_size[0], topleft[1]:topleft[1]+image_size[1]] = (inpainting["image"] - cropped_x_copy) * inpainting["mask_neg"]
                d_context_h = self.backward_from_x_to_h(generator=image_generator, diff=d_context_x256, start=gen_in_layer, end=gen_out_layer)

            # Put the gradient back in the 256x256 format 
            d_condition_x256 = np.zeros_like(x)
            d_condition_x256[:,:,topleft[0]:topleft[0]+image_size[0], topleft[1]:topleft[1]+image_size[1]] = d_condition_x.copy()

            # Backpropagate the above gradient all the way to h (through generator)
            # This gradient 'd_condition' is d log(p(y|h)) / dh (the epsilon2 term in Eq. 11 in the paper)
            d_condition = self.backward_from_x_to_h(generator=image_generator, diff=d_condition_x256, start=gen_in_layer, end=gen_out_layer)

            self.print_progress(i, info, condition, prob, d_condition)

            # 3. Compute the epsilon3 term ---
            noise = np.zeros_like(h)
            if epsilon3 > 0:
                noise = np.random.normal(0, epsilon3, h.shape)  # Gaussian noise

            # Update h according to Eq.11 in the paper 
            d_h = epsilon1 * d_prior + epsilon2 * d_condition + noise

            # Plus the optional epsilon4 for matching the context region when in-painting
            if inpainting is not None:
                d_h += inpainting["epsilon4"] * d_context_h 

            h += step_size/np.abs(d_h).mean() * d_h

            h = np.clip(h, a_min=0, a_max=30)
            hm = h
            # Reset the code every N iters (for diversity when running a long sampling chain)
            if reset_every > 0 and i % reset_every == 0 and i > 0: 
                h = np.random.normal(0, 1, h.shape)

                # Experimental: For sample diversity, it's a good idea to randomly pick epsilon1 as well
                epsilon1 = np.random.uniform(low=1e-6, high=1e-2)

            # Save every sample
            last_xx = cropped_x.copy()
            last_prob = prob

            # Filter samples based on threshold or every N iterations
            if save_every > 0 and i % save_every == 0 and prob > threshold:
                name = "%s/samples/%05d.jpg" % (output_dir, i)

                label = self.get_label(condition)
                list_samples.append( (last_xx.copy(), name, label) ) 

            # Stop if grad is 0
            if norm(d_h) == 0:
                print " d_h is 0"
                break

            # Randomly sample a class every N iterations
            if i > 0 and i % n_iters == 0:
                condition_idx += 1
                if condition_idx == len(conditions):
                    break

            i += 1  # Next iter

        # returning the last sample
        print "-------------------------"
        print "Last sample: prob [%s] " % last_prob

        return last_xx, list_samples
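
The heart of the loop above is the code update: combine the prior gradient, the condition gradient and Gaussian noise, scale the step by the mean absolute gradient, and clip the code. The numpy sketch below restates just that update; the epsilon defaults are placeholders and the function name is made up for illustration.

import numpy as np

def update_code(h, d_prior, d_condition, step_size,
                epsilon1=1.0, epsilon2=1.0, epsilon3=1e-10):
    # epsilon3 term: small Gaussian noise on the latent code
    noise = np.random.normal(0, epsilon3, h.shape) if epsilon3 > 0 else np.zeros_like(h)
    # epsilon1 and epsilon2 terms: prior and condition gradients
    d_h = epsilon1 * d_prior + epsilon2 * d_condition + noise
    h = h + step_size / np.abs(d_h).mean() * d_h
    return np.clip(h, a_min=0, a_max=30)   # same clipping range as the snippet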
Example #25
0
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=1)

    np.save('./data0/ixtoword', ixtoword)

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(wordtoix),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    drop_out_rate=0.5,
                                    bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model(
    )
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch['data'][:n_frame_step,
                                                                 ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, :idx[-1] + 1] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(
                lambda cap: [
                    wordtoix[word] for word in cap.lower().split(' ')
                    if word in wordtoix
                ], current_captions)

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=n_caption_step - 1)
            current_caption_matrix = np.hstack([
                current_caption_matrix,
                np.zeros([len(current_caption_matrix), 1])
            ]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0],
                                              current_caption_matrix.shape[1]))
            nonzeros = np.array(
                map(lambda x: (x != 0).sum() + 1, current_caption_matrix))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
        #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"

        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device("/cpu:0"):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(
                0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator(
            )
            ixtoword = pd.Series(np.load('./data0/ixtoword.npy').tolist())
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, train_data[-2:], ixtoword,
                                      video_tf, video_mask_tf, caption_tf)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, val_data, ixtoword, video_tf,
                                      video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)
        sys.stdout.flush()

    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
Example #26
0
def test(model_path=None,
         video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples,
         video_name=None):
    #    test_data = val_data   # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2* batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss = model.build_model(train_caption_id, train_caption_id_1,
                                    train_caption_label)
        val_s2s_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(
            val_caption_id_1)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes COCO test show exceptions in the beginning of training ####
    if test_s2s:
        #        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        #        for i, key in enumerate(pred_dict.keys()):
        #            print 'video:', flist[i]
        #            for ele in gt_dict[key]:
        #                print "GT:  " + ele['caption']
        #            print "PD:  " + pred_dict[key][0]['caption']
        #            print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf,
                              val_fname)
        if os.path.isfile('demo_s2s.txt.videos'):
            video_name = pickle.load(open('demo_s2s.txt.videos', "rb"))
        if video_name:
            for i, key in enumerate(pred_dict.keys()):
                if flist[i] in video_name:
                    print flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-----------'
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'

    if save_demo_sent_s2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_s2s_tf,
                          val_fname,
                          result_file='demo_s2s.txt')

    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
            print '############## video to sentence result #################'
        except Exception, e:
            print 'v2s bleu test exception'

    if test_s2s:
        try:
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
            for i, key in enumerate(pred_dict.keys()):
                print 'video:', flist[i]
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            print '############## sentence to sentence result #################'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
            scorer = COCOScorer()
            total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
            print '############## sentence to sentence result #################'
        except Exception, e:
            print 's2s bleu test exception'

    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_data, val_v2v_tf, val_video_label, pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_data, val_s2v_tf, val_video_label, pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname, result_file='demo_v2s.txt')
    if save_demo_sent_s2s: