def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    test_data = val_data
    ixtoword = pd.Series(np.load('./data' + str(gpu_id) + '/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0,
        bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    # scale every fourth LSTM variable by the keep probability (1 - 0.5) to
    # compensate for dropout being disabled at test time
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

    [pred_sent, gt_sent] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, caption_tf)
    #np.savez('Att_result/'+model_path.split('/')[1], gt=gt_sent, pred=pred_sent)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    #test_data = train_data
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0,
        bias_init_vector=None)

    video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    for ind, row in enumerate(lstmRNN_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

    [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
    np.savez('HS_result/' + model_path.split('/')[1], gt=gt_sent, pred=pred_sent, mp=mp, HLness=HLness)
    total_score = np.mean(mp)
    print model_path.split('/')[1] + ' mAP: ' + str(total_score)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    # test_data = val_data  # to evaluate on testing data or validation data
    ixtoword = pd.Series(np.load(home_folder + 'data0/ixtoword.npy').tolist())

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(ixtoword),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    drop_out_rate=0,
                                    bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.multiply(row, 1 - 0.5))
            sess.run(assign_op)

    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, caption_tf)
    #np.savez('Att_result/'+model_path.split('/')[1], gt=gt_sent, pred=pred_sent)
    scorer = COCOScorer()
    total_score = scorer.score(gt_dict, pred_dict, id_list)
    return total_score
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    ip_json = open(opt['input_json'])
    gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    ip_json.close()
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # forward the model to get loss
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2, video_feat, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    return valid_score
def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # forward the model to get loss
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    validation_file_name = opt['model_directory'].split('/')[-1] + '_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1] + ': ' + json.dumps(results[0]) + "\n")
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for ID in engine.val_data_ids:
            vidID, capID = ID.split('|')
            words = engine.get_cap_tokens(vidID, int(capID), mode='val')
            caption = ' '.join(words)
            if gts_valid.has_key(vidID):
                gts_valid[vidID].append({'image_id': vidID, 'caption': caption, 'cap_id': capID})
            else:
                gts_valid[vidID] = [{'image_id': vidID, 'caption': caption, 'cap_id': capID}]
        valid_score = scorer.score(gts_valid, samples_valid, gts_valid.keys())
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for ID in engine.test_data_ids:
            vidID, capID = ID.split('|')
            words = engine.get_cap_tokens(vidID, int(capID), mode='test')
            caption = ' '.join(words)
            if gts_test.has_key(vidID):
                gts_test[vidID].append({'image_id': vidID, 'caption': caption, 'cap_id': capID})
            else:
                gts_test[vidID] = [{'image_id': vidID, 'caption': caption, 'cap_id': capID}]
        test_score = scorer.score(gts_test, samples_test, gts_test.keys())
    else:
        test_score = None
    return valid_score, test_score
def score_with_cocoeval(samples_test, engine, ids):
    scorer = COCOScorer()
    gts_test = OrderedDict()
    for vidID in ids:
        gts_test[vidID] = engine.CAP[vidID]
    test_score = scorer.score(gts_test, samples_test, ids)
    return test_score
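# Note: every example in this listing feeds COCOScorer.score(gts, samples, ids)
# the same structure: both dicts map a video ID to a list of records with
# 'image_id' and 'caption' keys, with one generated hypothesis per ID in
# `samples` and one or more references per ID in `gts`. The sketch below is a
# minimal illustration of that contract with made-up IDs and captions; the
# COCOScorer import path varies by project and is assumed here.
def cocoscorer_format_example():
    scorer = COCOScorer()
    # one or more ground-truth references per video ID
    gts = {
        'vid1': [{'image_id': 'vid1', 'caption': 'a man is cooking'},
                 {'image_id': 'vid1', 'caption': 'someone prepares food'}],
    }
    # exactly one generated hypothesis per video ID
    samples = {
        'vid1': [{'image_id': 'vid1', 'caption': 'a man cooks food'}],
    }
    # returns a dict of metric name -> value, e.g. Bleu_4, METEOR, CIDEr
    return scorer.score(gts, samples, gts.keys())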
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # forward the model to get loss
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4, audio_fc2, sem_feats, mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"], 'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
    '''
    return valid_score
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for vidID in engine.valid_ids:
            gts_valid[vidID] = engine.CAP[vidID]
        valid_score = scorer.score(gts_valid, samples_valid, engine.valid_ids)
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for vidID in engine.test_ids:
            gts_test[vidID] = engine.CAP[vidID]
        test_score = scorer.score(gts_test, samples_test, engine.test_ids)
    else:
        test_score = None
    return valid_score, test_score
def evaluate(opt, net, eval_range, prediction_txt_path, reference):
    eval_loader = get_eval_loader(eval_range, opt.feature_h5_path, opt.region_feature_h5_path, opt.test_batch_size)

    result = {}
    for i, (frames, regions, spatials, video_ids) in tqdm(enumerate(eval_loader)):
        frames = frames.to(DEVICE)
        regions = regions.to(DEVICE)
        spatials = spatials.to(DEVICE)

        outputs, _ = net(frames, regions, spatials, None)
        for (tokens, vid) in zip(outputs, video_ids):
            if opt.use_multi_gpu:
                s = net.module.decoder.decode_tokens(tokens.data)
            else:
                s = net.decoder.decode_tokens(tokens.data)
            result[vid] = s

    with open(prediction_txt_path, 'w') as f:
        for vid, s in result.items():
            f.write('%d\t%s\n' % (vid, s))

    prediction_json = convert_prediction(prediction_txt_path)

    # compute scores
    scorer = COCOScorer()
    with suppress_stdout_stderr():
        scores, sub_category_score = scorer.score(reference, prediction_json, prediction_json.keys())
    for metric, score in scores.items():
        print('%s: %.6f' % (metric, score * 100))
    if sub_category_score is not None:
        print('Sub Category Score in Spice:')
        for category, score in sub_category_score.items():
            print('%s: %.6f' % (category, score * 100))
    return scores
def score_with_cocoeval(samples_valid, samples_test, valid, test):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid, hypo_valid, valid_ids = make_template(samples_valid, valid)
        print 'compute validation set score:'
        valid_score = scorer.score(gts_valid, hypo_valid, valid_ids)
    else:
        valid_score = None
    if samples_test:
        gts_test, hypo_test, test_ids = make_template(samples_test, test)
        print 'compute test set score:'
        test_score = scorer.score(gts_test, hypo_test, test_ids)
    else:
        test_score = None
    return valid_score, test_score
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    test_data = val_data
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        dim_tracker=dim_tracker,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        tracker_cnt=tracker_cnt,
        drop_out_rate=0,
        bias_init_vector=None)

    video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    for ind, row in enumerate(lstm1_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    for ind, row in enumerate(lstm2_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

    # [pred_sent, gt_sent] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
    # scorer = COCOScorer()
    # total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, test_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
    np.savez('result/' + model_path.split('/')[1], gt=gt_sent, pred=pred_sent, fname=fnamelist)
    scorer = COCOScorer()
    total_score = scorer.score(gt_dict, pred_dict, id_list)
    return total_score
def get_demo_sentence(sess, n_steps, ixtoword, caption_tf, name_tf, result_file):
    [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fname_list] = testing_all(sess, n_steps, ixtoword, caption_tf, name_tf)
    scorer = COCOScorer()
    scores = scorer.score(gt_dict, pred_dict, id_list, return_img_score=True)
    bleus = []
    for i, idx in enumerate(id_list):
        bleus.append((scores[idx]['Bleu_4'], fname_list[i], idx))
    sorted_bleus = sorted(bleus, key=lambda x: x[0], reverse=True)
    video_names = []
    with open(result_file, 'w') as result:
        for i in xrange(40):
            fname = sorted_bleus[i][1]
            video_names.append(fname)
            idx = sorted_bleus[i][2]
            result.write(fname + '\n')
            for ele in gt_dict[idx]:
                result.write('GT: ' + ele['caption'] + '\n')
            result.write('PD: ' + pred_dict[idx][0]['caption'] + '\n\n\n')
    print 'result saved to', result_file
    with open(result_file + '.videos', "wb") as fp:
        pickle.dump(video_names, fp)
    print 'video names saved to', result_file + '.videos'
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)
    # np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
    # np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
    # sys.exit()
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix = pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        dim_tracker=dim_tracker,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        tracker_cnt=tracker_cnt,
        drop_out_rate=0.5,
        bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask = model.build_model()
    #loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = int(3)
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))

        ## init queue
        data_queue = mp.Queue(nr_prefetch)
        # tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
        # t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
        # t2.start()
        t3.start()

        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))
            # current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch_data[:, ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, idx[-1]] = 1
            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)
            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35 - 1)
            current_caption_matrix = np.hstack([current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1
            current_batch.close()

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_tracker: current_tracker,
                    tf_tracker_mask: current_tracker_mask,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart, 2), "s"

        t1.join()
        # t2.join()
        t3.join()
        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
            # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
            # for key in pred_dict.keys():
            #     for ele in gt_dict[key]:
            #         print "GT: " + ele['caption']
            #     print "PD: " + pred_dict[key][0]['caption']
            #     print '-------'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print "Finally, saving the model ..."
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data["Description"].values
    captions = map(lambda x: x.replace(".", ""), captions)
    captions = map(lambda x: x.replace(",", ""), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)
    np.save("./data" + str(gpu_id) + "/ixtoword", ixtoword)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0.5,
        bias_init_vector=None,
    )

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/tf_log", sess.graph_def)
    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, "models_SS_youtube_notest_dummy/model-20")

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch["data"][:, ind, :]
                idx = np.where(current_batch["label"][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, idx[-1]] = 1
            current_captions = current_batch["title"]
            current_caption_ind = map(
                lambda cap: [wordtoix[word] for word in cap.lower().split(" ") if word in wordtoix],
                current_captions)
            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding="post", maxlen=16 - 1)
            current_caption_matrix = np.hstack(
                [current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                [train_op, tf_loss, merged],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks,
                },
            )
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            # print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            # print "Time Cost:", round(tStop - tStart, 2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 10) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, "model"), global_step=epoch)
            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load("./data" + str(gpu_id) + "/ixtoword.npy").tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT: " + gt_sent[idx][0]["caption"]
                print "PD: " + pred_sent[idx][0]["caption"]
                print "-------"
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, "model"), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)
    # np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
    # np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
    # sys.exit()
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix = pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        dim_tracker=dim_tracker,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        tracker_cnt=tracker_cnt,
        drop_out_rate=0.5,
        bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask = model.build_model()
    #loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = int(3)
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))

        ## init queue
        data_queue = mp.Queue(nr_prefetch)
        # tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
        # t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
        # t2.start()
        t3.start()

        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))
            # current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in range(batch_size):
                current_feats[ind, :, :] = current_batch_data[:, ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, idx[-1]] = 1
            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)
            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35 - 1)
            current_caption_matrix = np.hstack([current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1
            current_batch.close()

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_tracker: current_tracker,
                    tf_tracker_mask: current_tracker_mask,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart, 2), "s"

        t1.join()
        # t2.join()
        t3.join()
        print("Epoch:", epoch, " done. Loss:", np.mean(loss_epoch))
        tStop_epoch = time.time()
        print("Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s")
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print("Epoch ", epoch, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
            # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
            # for key in pred_dict.keys():
            #     for ele in gt_dict[key]:
            #         print "GT: " + ele['caption']
            #     print "PD: " + pred_dict[key][0]['caption']
            #     print '-------'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print("Finally, saving the model ...")
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print("Total Time Cost:", round(tStop_total - tStart_total, 2), "s")
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)
    np.save('./data' + str(gpu_id) + '/ixtoword', ixtoword)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0.5,
        bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models_Att_update_new/model-30')

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch['data'][:, ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, :idx[-1] + 1] = 1
            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)
            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=16 - 1)
            current_caption_matrix = np.hstack([current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart, 2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"

        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device("/cpu:0"):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data' + str(gpu_id) + '/ixtoword.npy').tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT: " + gt_sent[idx][0]['caption']
                print "PD: " + pred_sent[idx][0]['caption']
                print '-------'
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
            sys.stdout.flush()

    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def test(saved_model=''):
    scorer = COCOScorer()
    ixtoword = pd.Series(np.load(cfg.vocab_path + 'ixtoword.npy').tolist())
    combine_features = load_flickr30k_features if cfg.id == "Flickr30k" else load_msr_vtt_features

    model = s2vt(dim_image=cfg.dim_image,
                 n_words=len(ixtoword),
                 dim_hidden=cfg.dim_hidden,
                 batch_size=cfg.batch_size,
                 n_frame_steps=cfg.n_frame_step,
                 n_lstm_steps=cfg.n_lstm_step,
                 dim_word_emb=cfg.dim_word_emb,
                 cell_clip=cfg.cell_clip,
                 forget_bias=cfg.forget_bias,
                 input_keep_prob=cfg.input_keep_prob,
                 output_keep_prob=cfg.output_keep_prob,
                 bias_init_vector=None)

    _, video_tf, caption_tf, _, _ = model.build_model("inference")
    session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    saver = tf.train.Saver()
    saver.restore(session, saved_model)

    if cfg.id == "Flickr30k":
        _, _, test_data = get_flickr30k_data(cfg)
    elif cfg.id == "MSR-VTT":
        _, _, test_data = get_msr_vtt_data(cfg)
    splits = []
    splits.append((test_data['video_path'].unique(), test_data))

    results = []
    for split, gt_dataframe in splits:
        gts = convert_data_to_coco_scorer_format(gt_dataframe)
        samples = {}
        for start, end in zip(
                range(0, len(split), cfg.batch_size),
                range(cfg.batch_size, len(split) + cfg.batch_size, cfg.batch_size)):
            current_batch = split[start:end]
            current_feats = np.zeros((cfg.batch_size, cfg.n_frame_step, cfg.dim_image))
            current_feats_vals = [combine_features(vid) for vid in current_batch]
            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat
            generated_word_index = session.run(caption_tf, feed_dict={video_tf: current_feats})
            generated_word_index = np.asarray(generated_word_index).transpose()
            periods = np.argmax(generated_word_index == 0, axis=1) + 1
            periods[periods == 0] = cfg.n_lstm_step  # take the whole sequence if a period was not produced
            for i in range(len(current_batch)):
                generated_sentence = ' '.join(ixtoword[generated_word_index[i, :periods[i] - 1]])
                video_id = current_batch[i].split("/")[-1].split("_")[0]  # + ".jpg"
                samples[video_id] = [{u'image_id': video_id, u'caption': generated_sentence}]
        with suppress_stdout_stderr():
            valid_score = scorer.score(gts, samples, samples.keys())
        results.append(valid_score)
        print valid_score
        print len(samples)

    if not os.path.exists(cfg.results_path):
        os.makedirs(cfg.results_path)
    with open(cfg.results_path + "scores.txt", 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(cfg.results_path + saved_model.split("/")[-1] + ".json", 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
def train():
    assert os.path.isdir(home_folder)
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    print 'load meta data...'
    wordtoix = np.load(home_folder + 'data0/wordtoix.npy').tolist()
    print 'build model and session...'

    # place shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps

    # preprocessing on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)

    # operation on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video, tf_output_semantic = model.build_model(
            train_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_caption_tf, val_lstm3_variables_tf = model.build_sent_generator(val_data, val_video_label)
        val_video_tf, val_lstm4_variables_tf = model.build_video_generator(val_caption_id_1)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))

    # check for model file
    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())

    # train on the GPU
    with tf.device("/gpu:0"):
        # train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_loss)
        # when a variable is not related to the loss, its grad is returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None]
        train_op = optimizer.apply_gradients(gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)

    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        _, loss_val, loss_cap, loss_lat, loss_vid, sem = sess.run([
            train_op, tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video, tf_output_semantic])
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap, "loss_lat:", loss_lat, "loss_vid:", loss_vid
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val

        if step % n_epoch_steps == 0:
            epoch = step / n_epoch_steps
            loss_epoch /= n_epoch_steps
            with tf.device("/cpu:0"):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            print 'epoch:', epoch, 'loss:', loss_epoch, 'loss_cap:', loss_cap, 'loss_lat:', loss_lat, 'loss_vid:', loss_vid
            print 'sem:', sem[0, :10]
            loss_epoch = 0

            ######### test sentence generation ##########
            ixtoword = pd.Series(np.load(home_folder + 'data0/ixtoword.npy').tolist())
            n_val_steps = int(n_val_samples / batch_size)
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, 1, ixtoword, val_caption_tf, val_fname)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT: " + ele['caption']
                print "PD: " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, n_val_steps, ixtoword, val_caption_tf, val_fname)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

            ######### test video generation #############
            mse = test_all_videos(sess, n_val_steps, val_data, val_video_tf)
            sys.stdout.flush()

    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
def train():
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    assert os.path.isfile(wordtoix_file)
    assert os.path.isfile(ixtoword_file)
    assert os.path.isfile(bias_init_vector_file)
    assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep']
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    bias_init_vector = np.load(bias_init_vector_file)

    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=bias_init_vector)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)

    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, _ = model.build_v2s_generator(val_data)
        val_s2s_tf, _ = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, _ = model.build_s2v_generator(val_caption_id_1)
        val_v2v_tf, _ = model.build_v2v_generator(val_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False))

    # check for model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    global_step = 0
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        # print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        global_step = get_model_step(ckpt.model_checkpoint_path)
        print 'global_step:', global_step
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())

    # train on the GPU
    with tf.device("/gpu:0"):
        ## 1. weight decay
        for var in tf.trainable_variables():
            decay_loss = tf.multiply(tf.nn.l2_loss(var), 0.0004, name='weight_loss')
            tf.add_to_collection('losses', decay_loss)
        tf.add_to_collection('losses', tf_loss)
        tf_total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        ## 2. gradient clip
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_total_loss)
        # when a variable is not related to the loss, its grad is returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None]
        for grad, var in gvs:
            if grad is not None:
                tf.summary.histogram(var.name + '/grad', grad)
                tf.summary.histogram(var.name + '/data', var)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    loss_epoch_cap = 0
    loss_epoch_vid = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    ##### add summaries ######
    tf.summary.histogram('video_h', tf_v_h)
    tf.summary.histogram('sent_h', tf_s_h)
    tf.summary.scalar('loss_vid', tf_loss_vid)
    tf.summary.scalar('loss_lat', tf_loss_lat)
    tf.summary.scalar('loss_caption', tf_loss_cap)
    # for var in tf.trainable_variables():
    #     summaries.append(tf.histogram_summary(var.op.name, var))
    summary_op = tf.summary.merge_all()
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)

    epoch = global_step
    video_label = sess.run(train_video_label)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        # note: the assert above allows 'block_sent', which does not match the
        # 'block_sentence' branch below and therefore falls through to random
        if drop_strategy == 'keep':
            drop_type = 0
        elif drop_strategy == 'block_sentence':
            drop_type = 1
        elif drop_strategy == 'block_video':
            drop_type = 2
        else:
            drop_type = random.randint(0, 2)

        _, loss_val, loss_cap, loss_lat, loss_vid = sess.run(
            [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid],
            feed_dict={tf_drop_type: drop_type})
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val
        loss_epoch_cap += loss_cap
        loss_epoch_vid += loss_vid

        if step % n_epoch_steps == 0:
            # if step % 3 == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            loss_epoch_cap /= n_epoch_steps
            loss_epoch_vid /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            # print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, "loss_lat:", loss_lat, "loss_vid:", loss_epoch_vid
            loss_epoch = 0
            loss_epoch_cap = 0
            loss_epoch_vid = 0

            ######### test sentence generation ##########
            n_val_steps = int(n_val_samples / batch_size)
            # n_val_steps = 3
            ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
            if test_v2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT: " + ele['caption']
                        print "PD: " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## video to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, n_val_steps, ixtoword, val_v2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## video to sentence result #################'
                except Exception, e:
                    print 'epoch:', epoch, 'v2s Bleu test exception'

            if test_s2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT: " + ele['caption']
                        print "PD: " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## sentence to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, n_val_steps, ixtoword, val_s2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## sentence to sentence result #################'
                except Exception, e:
                    print 'epoch', epoch, 's2s Bleu test exception'

            ######### test video generation #############
            if test_v2v:
                mse_v2v = test_all_videos(sess, n_val_steps, val_data, val_v2v_tf, val_video_label, None)
                print 'epoch', epoch, 'video2video mse:', mse_v2v
            if test_s2v:
                mse_s2v = test_all_videos(sess, n_val_steps, val_data, val_s2v_tf, val_video_label, None)
                print 'epoch', epoch, 'caption2video mse:', mse_s2v
            sys.stdout.flush()

            ###### summary ######
            if epoch % 2 == 0:
                summary = sess.run(summary_op)
                summary_writer.add_summary(summary, epoch)
def train():
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)
    np.save('./data/ixtoword', ixtoword)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0.5,
        bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_video_len, tf_caption, tf_caption_mask, tf_HLness, tf_HLness_mask, tf_HLness_att_mask = model.build_model()
    loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession()
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)
    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_HLness = np.zeros((batch_size, n_frame_step))
            current_HLness_masks = np.zeros((batch_size, n_frame_step))
            current_HLness_att_masks = np.zeros((batch_size, n_frame_step))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch['data'][:, ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                idy = np.where(current_batch['label'][:, ind] == 1)[0]
                if len(idy) == 0:
                    continue
                current_HLness[ind, idx] = current_batch['label'][idx, ind]
                current_HLness_masks[ind, idx] = 1
                current_video_masks[ind, idy[-1]] = 1
                current_video_len[ind] = idx[-1] + 1
                current_HLness_att_masks[ind, idy] = 1
                if idy[0] > 4:
                    current_HLness_att_masks[ind, idy[0] - 5:idy[0]] = 1
                else:
                    current_HLness_att_masks[ind, 0:idy[0]] = 1
            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)
            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=15 - 1)
            current_caption_matrix = np.hstack([current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                [train_op, tf_loss, merged],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks,
                    tf_HLness: current_HLness,
                    tf_HLness_mask: current_HLness_masks,
                    tf_HLness_att_mask: current_HLness_att_masks
                })
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart, 2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"

        if np.mod(epoch, 20) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            current_batch = h5py.File(test_data[np.random.randint(0, len(test_data))])
            video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
            #[mp, pred_sent, gt_sent, HLness] = testing_one(sess, current_batch, ixtoword, video_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            #for xxx in xrange(current_batch['label'].shape[1]):
            #    print gt_sent[xxx]
            #    print pred_sent[xxx]
            total_score = np.mean(mp)
            print total_score
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def test(model_path=None, video_data_path_test=video_data_path_val, n_test_samples=n_val_samples): # test_data = val_data # to evaluate on testing data or validation data wordtoix = np.load(wordtoix_file).tolist() ixtoword = pd.Series(np.load(ixtoword_file).tolist()) with tf.device("/gpu:0"): model = Video_Caption_Generator(dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_caption_steps=n_caption_steps, n_video_steps=n_video_steps, drop_out_rate=0.5, bias_init_vector=None) # preprocess on the CPU with tf.device('/cpu:0'): train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \ _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train) val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \ _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_test) train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \ tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data], batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples) val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \ tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data], batch_size=batch_size, num_threads=1, capacity=2* batch_size) # graph on the GPU with tf.device("/gpu:0"): tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \ = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label) val_v2s_tf, v2s_lstm3_vars_tf = model.build_v2s_generator(val_data) val_s2s_tf, s2s_lstm2_vars_tf, s2s_lstm3_vars_tf = model.build_s2s_generator( val_caption_id_1) val_s2v_tf, s2v_lstm2_vars_tf, s2v_lstm4_vars_tf = model.build_s2v_generator( val_caption_id_1, val_frame_data) val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator( val_data, val_frame_data) sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True)) with tf.device(cpu_device): saver = tf.train.Saver() saver.restore(sess, model_path) print 'load parameters from:', model_path # print 'halve the dropout weights..' # for ind, row in enumerate(v2s_lstm3_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) # for ind, row in enumerate(s2s_lstm2_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) # for ind, row in enumerate(s2v_lstm4_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) ######### test sentence generation ########## print 'testing...' 
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
    if test_v2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT: " + ele['caption']
            print "PD: " + pred_dict[key][0]['caption']
            print '-------'
        print '############## video to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_1 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## video to sentence result #################'
    if test_s2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT: " + ele['caption']
            print "PD: " + pred_dict[key][0]['caption']
            print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'
    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_frame_data, val_v2v_tf, val_video_label, pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_frame_data, val_s2v_tf, val_video_label, pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname,
                          result_file=home_folder + 'demo_v2s.txt')
    if save_demo_sent_s2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname,
                          result_file=home_folder + 'demo_s2s.txt')
    if save_demo_video_v2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_v2v_tf, val_video_label,
                       val_fname, home_folder + 'demo_v2v/', pixel_scale_factor)
    if save_demo_video_s2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_s2v_tf, val_video_label,
                       val_fname, home_folder + 'demo_s2v/', pixel_scale_factor)
    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
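# Side note (sketch, not from the original script): int(n_test_samples /
# batch_size) floors, so the loop above silently skips up to batch_size - 1
# trailing validation samples.
def eval_coverage(n_test_samples, batch_size):
    n_steps = int(n_test_samples / batch_size)
    return n_steps, n_steps * batch_size  # (steps run, samples actually scored)
# eval_coverage(1000, 64) -> (15, 960): the last 40 clips are never dequeued.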
def sampling(self, condition_net, image_encoder, image_generator,
             gen_in_layer, gen_out_layer, start_code,
             n_iters, lr, lr_end, threshold,
             layer, conditions,  # units=None, xy=0,
             epsilon1=1, epsilon2=1, epsilon3=1e-10,
             inpainting=None,  # in-painting args
             output_dir=None, reset_every=0, save_every=1, n_gram='Bleu_1'):

    # Get the input and output sizes
    image_shape = condition_net.blobs['data'].data.shape
    generator_output_shape = image_generator.blobs[gen_out_layer].data.shape
    encoder_input_shape = image_encoder.blobs['data'].data.shape

    # Calculate the difference between the input image of the condition net
    # and the output image from the generator
    image_size = util.get_image_size(image_shape)
    generator_output_size = util.get_image_size(generator_output_shape)
    encoder_input_size = util.get_image_size(encoder_input_shape)

    # The top-left offset to crop the output image to get a 227x227 image
    topleft = util.compute_topleft(image_size, generator_output_size)
    topleft_DAE = util.compute_topleft(encoder_input_size, generator_output_size)

    src = image_generator.blobs[gen_in_layer]  # the input feature layer of the generator

    # Make sure the layer size and the initial vector size match
    assert src.data.shape == start_code.shape

    # Variables to store the best sample
    last_xx = np.zeros(image_shape)  # best image
    last_prob = -sys.maxint          # highest probability seen so far

    h = start_code.copy()
    condition_idx = 0
    list_samples = []
    i = 0
    scorer = COCOScorer()

    while True:
        step_size = lr + ((lr_end - lr) * i) / n_iters
        condition = conditions[condition_idx]  # select a class

        # 1. Compute the epsilon1 term ---
        # gradient d log(p(h)) / dh per the DAE result in Alain & Bengio 2014
        d_prior = self.h_autoencoder_grad(h=h, encoder=image_generator,
                                          decoder=image_encoder,
                                          gen_out_layer=gen_out_layer,
                                          topleft=topleft_DAE,
                                          inpainting=inpainting)

        # 2. Compute the epsilon2 term ---
        # Push the code through the generator to get an image x
        image_generator.blobs["feat"].data[:] = h
        generated = image_generator.forward()
        x = generated[gen_out_layer].copy()  # 256x256

        # Crop from 256x256 to 227x227
        cropped_x = x[:, :, topleft[0]:topleft[0] + image_size[0],
                      topleft[1]:topleft[1] + image_size[1]]
        cropped_x_copy = cropped_x.copy()
        if inpainting is not None:
            cropped_x = util.apply_mask(img=cropped_x, mask=inpainting['mask'],
                                        context=inpainting['image'])

        # Forward the image x through the condition net up to a unit k at the
        # given layer, then backprop the gradient through the condition net to
        # the image layer to get a gradient image
        d_condition_x, prob, info = self.forward_backward_from_x_to_condition(
            net=condition_net, end=layer, image=cropped_x,
            condition=condition, scorer=scorer, n_gram=n_gram)

        if inpainting is not None:
            # Mask out the class gradient image
            d_condition_x[:] *= inpainting["mask"]
            # An additional objective for matching the context image
            d_context_x256 = np.zeros_like(x)
            d_context_x256[:, :, topleft[0]:topleft[0] + image_size[0],
                           topleft[1]:topleft[1] + image_size[1]] = \
                (inpainting["image"] - cropped_x_copy) * inpainting["mask_neg"]
            d_context_h = self.backward_from_x_to_h(generator=image_generator,
                                                    diff=d_context_x256,
                                                    start=gen_in_layer,
                                                    end=gen_out_layer)

        # Put the gradient back in the 256x256 format
        d_condition_x256 = np.zeros_like(x)
        d_condition_x256[:, :, topleft[0]:topleft[0] + image_size[0],
                         topleft[1]:topleft[1] + image_size[1]] = d_condition_x.copy()

        # Backpropagate the above gradient all the way to h (through the generator).
        # This gradient 'd_condition' is d log(p(y|h)) / dh (the epsilon2 term in Eq. 11 in the paper)
        d_condition = self.backward_from_x_to_h(generator=image_generator,
                                                diff=d_condition_x256,
                                                start=gen_in_layer,
                                                end=gen_out_layer)

        self.print_progress(i, info, condition, prob, d_condition)

        # 3. Compute the epsilon3 term ---
        noise = np.zeros_like(h)
        if epsilon3 > 0:
            noise = np.random.normal(0, epsilon3, h.shape)  # Gaussian noise

        # Update h according to Eq. 11 in the paper
        d_h = epsilon1 * d_prior + epsilon2 * d_condition + noise
        # Plus the optional epsilon4 term for matching the context region when in-painting
        if inpainting is not None:
            d_h += inpainting["epsilon4"] * d_context_h

        h += step_size / np.abs(d_h).mean() * d_h
        h = np.clip(h, a_min=0, a_max=30)
        hm = h

        # Reset the code every N iters (for diversity when running a long sampling chain)
        if reset_every > 0 and i % reset_every == 0 and i > 0:
            h = np.random.normal(0, 1, h.shape)
            # Experimental: for sample diversity, it is a good idea to randomly pick epsilon1 as well
            epsilon1 = np.random.uniform(low=1e-6, high=1e-2)

        # Save every sample
        last_xx = cropped_x.copy()
        last_prob = prob

        # Filter samples based on the threshold or every N iterations
        if save_every > 0 and i % save_every == 0 and prob > threshold:
            name = "%s/samples/%05d.jpg" % (output_dir, i)
            label = self.get_label(condition)
            list_samples.append((last_xx.copy(), name, label))

        # Stop if the gradient is 0
        if norm(d_h) == 0:
            print " d_h is 0"
            break

        # Move to the next class every N iterations
        if i > 0 and i % n_iters == 0:
            condition_idx += 1
            if condition_idx == len(conditions):
                break

        i += 1  # next iter

    # Return the last sample
    print "-------------------------"
    print "Last sample: prob [%s] " % last_prob

    return last_xx, list_samples
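# Stand-alone numpy sketch of the update performed in the loop above (Eq. 11
# in the paper): d_h combines the prior, condition, and noise terms, the step
# is normalized by the mean absolute gradient, and the code h is clipped to
# [0, 30]. The epsilon defaults here are illustrative, not the paper's values.
def sampler_step_sketch(h, d_prior, d_condition, step_size,
                        epsilon1=1e-5, epsilon2=1.0, epsilon3=1e-17):
    noise = np.random.normal(0, epsilon3, h.shape) if epsilon3 > 0 else 0
    d_h = epsilon1 * d_prior + epsilon2 * d_condition + noise
    h = h + step_size / np.abs(d_h).mean() * d_h
    return np.clip(h, a_min=0, a_max=30)  # same code-range clipping as above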
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=1)
    np.save('./data0/ixtoword', ixtoword)

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(wordtoix),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    drop_out_rate=0.5,
                                    bias_init_vector=None)
    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch['data'][:n_frame_step, ind, :]
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, :idx[-1] + 1] = 1
            current_captions = current_batch['title']
            current_caption_ind = map(
                lambda cap: [wordtoix[word] for word in cap.lower().split(' ')
                             if word in wordtoix],
                current_captions)
            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=n_caption_step - 1)
            current_caption_matrix = np.hstack(
                [current_caption_matrix,
                 np.zeros([len(current_caption_matrix), 1])]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0],
                                              current_caption_matrix.shape[1]))
            nonzeros = np.array(
                map(lambda x: (x != 0).sum() + 1, current_caption_matrix))
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1
            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart, 2), "s"
        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"
        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) current_batch = h5py.File(val_data[np.random.randint( 0, len(val_data))]) video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator( ) ixtoword = pd.Series(np.load('./data0/ixtoword.npy').tolist()) [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf) for key in pred_dict.keys(): for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) sys.stdout.flush() print "Finally, saving the model ..." with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def test(model_path=None, video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples, video_name=None):
    # test_data = val_data  # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label,
                 train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss = model.build_model(train_caption_id, train_caption_id_1, train_caption_label)
        val_s2s_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(val_caption_id_1)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
    if test_s2s:
        # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        # for i, key in enumerate(pred_dict.keys()):
        #     print 'video:', flist[i]
        #     for ele in gt_dict[key]:
        #         print "GT: " + ele['caption']
        #     print "PD: " + pred_dict[key][0]['caption']
        #     print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(
            sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
        if os.path.isfile('demo_s2s.txt.videos'):
            video_name = pickle.load(open('demo_s2s.txt.videos', "rb"))
        if video_name:
            for i, key in enumerate(pred_dict.keys()):
                if flist[i] in video_name:
                    print flist[i]
                    for ele in gt_dict[key]:
                        print "GT: " + ele['caption']
                    print "PD: " + pred_dict[key][0]['caption']
                    print '-----------'
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'
    if save_demo_sent_s2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname,
                          result_file='demo_s2s.txt')
    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
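# Shape sketch of the dictionaries handed to COCOScorer.score above
# (assumption inferred from how testing_all is consumed here: each id maps to
# a list of records carrying an 'image_id' and a 'caption'; the ids and
# captions below are illustrative only).
gt_dict_example = {'vid1': [{'image_id': 'vid1', 'caption': 'a man is cooking'},
                            {'image_id': 'vid1', 'caption': 'someone cooks food'}]}
pred_dict_example = {'vid1': [{'image_id': 'vid1', 'caption': 'a man cooks'}]}
id_list_example = ['vid1']
# scorer = COCOScorer(); scorer.score(gt_dict_example, pred_dict_example, id_list_example)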
            print '############## video to sentence result #################'
        except Exception, e:
            print 'v2s bleu test exception'
    if test_s2s:
        try:
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
            for i, key in enumerate(pred_dict.keys()):
                print 'video:', flist[i]
                for ele in gt_dict[key]:
                    print "GT: " + ele['caption']
                print "PD: " + pred_dict[key][0]['caption']
                print '-------'
            print '############## sentence to sentence result #################'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
            scorer = COCOScorer()
            total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
            print '############## sentence to sentence result #################'
        except Exception, e:
            print 's2s bleu test exception'
    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_data, val_v2v_tf, val_video_label, pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_data, val_s2v_tf, val_video_label, pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname,
                          result_file='demo_v2s.txt')
    if save_demo_sent_s2s: