def train():
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    assert os.path.isfile(wordtoix_file)
    assert os.path.isfile(ixtoword_file)
    assert os.path.isfile(bias_init_vector_file)
    assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep']
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    bias_init_vector = np.load(bias_init_vector_file)

    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_caption_steps=n_caption_steps,
            n_video_steps=n_video_steps,
            drop_out_rate=0.5,
            bias_init_vector=bias_init_vector)

    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)

    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type = \
            model.build_model(train_data, train_video_label, train_caption_id,
                              train_caption_id_1, train_caption_label)
        val_v2s_tf, _ = model.build_v2s_generator(val_data)
        val_s2s_tf, _ = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, _ = model.build_s2v_generator(val_caption_id_1)
        val_v2v_tf, _ = model.build_v2v_generator(val_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    # check for an existing model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.get_checkpoint_state(model_path)
        global_step = 0
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            # print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
            global_step = get_model_step(ckpt.model_checkpoint_path)
            print 'global_step:', global_step
        else:
            print("Created model with fresh parameters.")
            sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())

    # train on the GPU
    with tf.device("/gpu:0"):
        ## 1. weight decay
        for var in tf.trainable_variables():
            decay_loss = tf.multiply(tf.nn.l2_loss(var), 0.0004, name='weight_loss')
            tf.add_to_collection('losses', decay_loss)
        tf.add_to_collection('losses', tf_loss)
        tf_total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        ## 2. gradient clipping
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_total_loss)
        # when a variable is not related to the loss, its gradient is returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None]
        for grad, var in gvs:
            if grad is not None:
                tf.summary.histogram(var.name + '/grad', grad)
                tf.summary.histogram(var.name + '/data', var)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize the variables added by the optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize the epoch variable in the queue reader
    sess.run(tf.local_variables_initializer())

    loss_epoch = 0
    loss_epoch_cap = 0
    loss_epoch_lat = 0
    loss_epoch_vid = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    ##### add summaries ######
    tf.summary.histogram('video_h', tf_v_h)
    tf.summary.histogram('sent_h', tf_s_h)
    tf.summary.scalar('loss_vid', tf_loss_vid)
    tf.summary.scalar('loss_lat', tf_loss_lat)
    tf.summary.scalar('loss_caption', tf_loss_cap)
    # for var in tf.trainable_variables():
    #     summaries.append(tf.histogram_summary(var.op.name, var))
    summary_op = tf.summary.merge_all()
    # write the graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)

    epoch = global_step
    video_label = sess.run(train_video_label)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        if drop_strategy == 'keep':
            drop_type = 0
        elif drop_strategy == 'block_sent':
            drop_type = 1
        elif drop_strategy == 'block_video':
            drop_type = 2
        else:
            drop_type = random.randint(0, 2)
        _, loss_val, loss_cap, loss_lat, loss_vid = sess.run(
            [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid],
            feed_dict={tf_drop_type: drop_type})
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, \
            "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val
        loss_epoch_cap += loss_cap
        loss_epoch_lat += loss_lat
        loss_epoch_vid += loss_vid

        # if step % 3 == 0:
        if step % n_epoch_steps == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            loss_epoch_cap /= n_epoch_steps
            loss_epoch_lat /= n_epoch_steps
            loss_epoch_vid /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            # print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, \
                "loss_lat:", loss_epoch_lat, "loss_vid:", loss_epoch_vid
            loss_epoch = 0
            loss_epoch_cap = 0
            loss_epoch_lat = 0
            loss_epoch_vid = 0

            ######### test sentence generation ##########
            n_val_steps = int(n_val_samples / batch_size)
            # n_val_steps = 3
            ### TODO: the COCO test sometimes raises exceptions at the beginning of training ####
            if test_v2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                        testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT: " + ele['caption']
                        print "PD: " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## video to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                        testing_all(sess, n_val_steps, ixtoword, val_v2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## video to sentence result #################'
                except Exception, e:
                    print 'epoch:', epoch, 'v2s Bleu test exception'
            if test_s2s:
                try:
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                        testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
                    for key in pred_dict.keys():
                        for ele in gt_dict[key]:
                            print "GT: " + ele['caption']
                        print "PD: " + pred_dict[key][0]['caption']
                        print '-------'
                    print '############## sentence to sentence result #################'
                    [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                        testing_all(sess, n_val_steps, ixtoword, val_s2s_tf, val_fname)
                    scorer = COCOScorer()
                    total_score = scorer.score(gt_dict, pred_dict, id_list)
                    print '############## sentence to sentence result #################'
                except Exception, e:
                    print 'epoch', epoch, 's2s Bleu test exception'

            ######### test video generation #############
            if test_v2v:
                mse_v2v = test_all_videos(sess, n_val_steps, val_data, val_v2v_tf, val_video_label, None)
                print 'epoch', epoch, 'video2video mse:', mse_v2v
            if test_s2v:
                mse_s2v = test_all_videos(sess, n_val_steps, val_data, val_s2v_tf, val_video_label, None)
                print 'epoch', epoch, 'caption2video mse:', mse_s2v
            sys.stdout.flush()

            ###### summary ######
            if epoch % 2 == 0:
                summary = sess.run(summary_op)
                summary_writer.add_summary(summary, epoch)
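
# get_model_step() is used above to recover the epoch counter from a restored
# checkpoint but is not defined in this section. A minimal sketch, assuming
# checkpoints are written as '<model_path>/model-<epoch>' by
# saver.save(..., global_step=epoch); this is a hypothetical helper and the
# real implementation may differ:
def get_model_step(checkpoint_path):
    # take the digits after the last '-', e.g. '.../model-12' -> 12
    suffix = checkpoint_path.split('-')[-1]
    return int(suffix) if suffix.isdigit() else 0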
def train():
    assert os.path.isdir(home_folder)
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)

    print 'load meta data...'
    wordtoix = np.load(home_folder + 'data0/wordtoix.npy').tolist()

    print 'build model and session...'
    # place shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_caption_steps=n_caption_steps,
            n_video_steps=n_video_steps,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps

    # preprocessing on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_encode_data, val_video_label, val_fname, val_caption_id, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)

    # operations on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video, tf_output_semantic = model.build_model(
            train_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_caption_tf, val_lstm3_variables_tf = model.build_sent_generator(val_data, val_video_label)
        val_video_tf, val_lstm4_variables_tf = model.build_video_generator(val_caption_id_1)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    # check for an existing model file
    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        else:
            print("Created model with fresh parameters.")
            sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())

    # train on the GPU
    with tf.device("/gpu:0"):
        # train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_loss)
        # when a variable is not related to the loss, its gradient is returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None]
        # apply the clipped gradients
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize the variables added by the optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize the epoch variable in the queue reader
    sess.run(tf.local_variables_initializer())

    loss_epoch = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # write the graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)

    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        _, loss_val, loss_cap, loss_lat, loss_vid, sem = sess.run([
            train_op, tf_loss, tf_loss_caption, tf_loss_latent, tf_loss_video, tf_output_semantic])
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap, "loss_lat:", loss_lat, "loss_vid:", loss_vid
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val

        if step % n_epoch_steps == 0:
            epoch = step / n_epoch_steps
            loss_epoch /= n_epoch_steps
            with tf.device("/cpu:0"):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
            print 'epoch:', epoch, 'loss:', loss_epoch, 'loss_cap:', loss_cap, 'loss_lat:', loss_lat, 'loss_vid:', loss_vid
            print 'sem:', sem[0, :10]
            loss_epoch = 0

            ######### test sentence generation ##########
            ixtoword = pd.Series(np.load(home_folder + 'data0/ixtoword.npy').tolist())
            n_val_steps = int(n_val_samples / batch_size)
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                testing_all(sess, 1, ixtoword, val_caption_tf, val_fname)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT: " + ele['caption']
                print "PD: " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = \
                testing_all(sess, n_val_steps, ixtoword, val_caption_tf, val_fname)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

            ######### test video generation #############
            mse = test_all_videos(sess, n_val_steps, val_data, val_video_tf)
            sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
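
# For reference: testing_all() and COCOScorer.score() above exchange captions in
# the standard coco-caption evaluation format. A sketch of the assumed shapes
# (inferred from usage, not taken from this file):
#   gt_dict   = {vid_id: [{'image_id': vid_id, 'caption': 'a man is cooking'}, ...]}
#   pred_dict = {vid_id: [{'image_id': vid_id, 'caption': 'a man cooks'}]}
#   id_list   = gt_dict.keys()
# scorer.score(gt_dict, pred_dict, id_list) then reports the caption metrics
# (Bleu, METEOR, CIDEr, ...) over the ids in id_list.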
def test(model_path=None, video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples, video_name=None):
    # test_data = val_data  # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())

    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_caption_steps=n_caption_steps,
            n_video_steps=n_video_steps,
            drop_out_rate=0.5,
            bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch(
                [train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads,
                capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch(
                [val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)

    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type = \
            model.build_model(train_data, train_video_label, train_caption_id,
                              train_caption_id_1, train_caption_label)
        val_v2s_tf, v2s_lstm3_vars_tf = model.build_v2s_generator(val_data)
        val_s2s_tf, s2s_lstm2_vars_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, s2v_lstm2_vars_tf, s2v_lstm4_vars_tf = model.build_s2v_generator(val_caption_id_1)
        val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator(val_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path

    # print 'halve the dropout weights..'
    # for var_list in [v2s_lstm3_vars_tf, s2s_lstm2_vars_tf, s2s_lstm3_vars_tf,
    #                  s2v_lstm2_vars_tf, s2v_lstm4_vars_tf, v2v_lstm4_vars_tf]:
    #     for ind, row in enumerate(var_list):
    #         if ind % 4 == 0:
    #             assign_op = row.assign(tf.multiply(row, 1 - 0.5))
    #             sess.run(assign_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: the COCO test sometimes raises exceptions at the beginning of training ####
    if test_v2s:
        # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
        # for i, key in enumerate(pred_dict.keys()):
        #     print 'video:', flist[i]
        #     for ele in gt_dict[key]:
        #         print "GT: " + ele['caption']
        #     print "PD: " + pred_dict[key][0]['caption']
        #     print '-------'
        print '############## video to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = \
            testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname)
        if os.path.isfile('demo_v2s.txt.videos'):
            video_name = pickle.load(open('demo_v2s.txt.videos', "rb"))
        if video_name:
            for i, key in enumerate(pred_dict.keys()):
                if flist[i] in video_name:
                    print flist[i]
                    for ele in gt_dict[key]:
                        print "GT: " + ele['caption']
                    print "PD: " + pred_dict[key][0]['caption']
                    print '-----------'
        scorer = COCOScorer()
        total_score_1 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## video to sentence result #################'
    if test_s2s:
        # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        # for i, key in enumerate(pred_dict.keys()):
        #     print 'video:', flist[i]
        #     for ele in gt_dict[key]:
        #         print "GT: " + ele['caption']
        #     print "PD: " + pred_dict[key][0]['caption']
        #     print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = \
            testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
        if os.path.isfile('demo_s2s.txt.videos'):
            video_name = pickle.load(open('demo_s2s.txt.videos', "rb"))
        if video_name:
            for i, key in enumerate(pred_dict.keys()):
                if flist[i] in video_name:
                    print flist[i]
                    for ele in gt_dict[key]:
                        print "GT: " + ele['caption']
                    print "PD: " + pred_dict[key][0]['caption']
                    print '-----------'
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'

    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_data, val_v2v_tf, val_video_label, feat_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_data, val_s2v_tf, val_video_label, feat_scale_factor)
        print 'caption2video mse:', mse_s2v

    if save_demo_sent_v2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname, result_file='demo_v2s.txt')
    if save_demo_sent_s2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname, result_file='demo_s2s.txt')
    if save_demo_video_v2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_v2v_tf, val_video_label, val_fname,
                       'demo_v2v/', pixel_scale_factor)
    if save_demo_video_s2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_s2v_tf, val_video_label, val_fname,
                       'demo_s2v/', pixel_scale_factor)
    sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
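
# A minimal command-line entry point sketch, assuming train() and test() are
# driven by the module-level config above; the '--task' and '--model' flags
# are hypothetical and not part of the original file:
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', default='train', choices=['train', 'test'])
    parser.add_argument('--model', default=None,
                        help='checkpoint to restore when task is test')
    args = parser.parse_args()
    if args.task == 'train':
        train()
    else:
        test(model_path=args.model)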