def reinforce():
    if tf:
        global tf_step
    for src, tgt in tqdm(training_data,
                         mininterval=1, desc="Reinforce-train", leave=False):
        rl_optimizer.zero_grad()

        # Greedy decoding provides the baseline; sampling provides the exploration policy.
        max_words = model.sample(src)
        s_words, props = model.sample(src, False)

        reward = rouge_l(s_words, tgt)
        baseline = rouge_l(max_words, tgt)
        advantage = reward - baseline

        loss = rl_criterion(props, s_words, tgt, advantage)
        loss.backward()
        rl_optimizer.step()

        if tf is not None:
            add_summary_value("reinforce loss", loss.data[0])
            add_summary_value("reinforce advantage", advantage.mean().data)
            add_summary_value("reinforce baseline", baseline.mean().data)
            add_summary_value("reinforce reward", reward.mean().data)
            tf_step += 1
            if tf_step % 100 == 0:
                tf_summary_writer.flush()
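# `rl_criterion` is not defined in this section. A minimal REINFORCE-style sketch,
# assuming `props` holds per-step log-probabilities of shape (batch, seq_len, vocab),
# `words` the sampled ids, and 0 the padding id -- all assumptions about the real
# implementation. The target `tgt` is unused here because the reward is already folded
# into `advantage`.
def rl_criterion_sketch(props, words, tgt, advantage, pad_id=0):
    log_p = props.gather(2, words.unsqueeze(2)).squeeze(2)        # log-prob of each sampled word
    mask = (words != pad_id).float()                              # ignore padding steps
    per_seq = (log_p * mask).sum(1) / mask.sum(1).clamp(min=1.0)  # length-normalised log-likelihood
    # Maximise advantage-weighted log-likelihood of the sampled sequences.
    return -(advantage.detach() * per_seq).mean()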
def train_actor_critic():
    loss_A = loss_C = .0
    actor.train()
    critic.train()
    global tf_step

    for imgs, labels in tqdm(training_data,
                             mininterval=1, desc="Actor-Critic Training", leave=False):
        optim_A.zero_grad()
        optim_C.zero_grad()

        enc = actor.encode(imgs)
        hidden_A = actor.feed_enc(enc)
        props_A, words_A = actor(hidden_A, labels)
        # Detached copy so the critic loss does not back-propagate into the actor.
        fixed_props_A = fix_variable(props_A)

        hidden_C = critic.feed_enc(enc)
        props_C, words_C = critic(words_A, hidden_C)

        scores_A, scores_C = rouge_l(words_A[:, 1:], labels), rouge_l(words_C, labels)
        fix_mask_rewards_A = mask_score(fixed_props_A, words_A[:, 1:], scores_A)
        mask_rewards_C = mask_score(props_C, words_C, scores_C)

        # Critic update.
        loss_c = critic.td_error(fix_mask_rewards_A, mask_rewards_C, criterion_C)
        loss_c.backward()
        loss_C += loss_c.data
        optim_C.clip_grad_norm()
        optim_C.step()

        # Actor update on freshly sampled sequences; the score difference acts as the advantage.
        _, sample_words, sample_props = actor.speak(hidden_A)
        loss_a, reward = criterion_AC(sample_props, sample_words, scores_C - scores_A)
        loss_a.backward()
        loss_A += loss_a.data
        optim_A.clip_grad_norm()
        optim_A.step()

        if tf is not None:
            add_summary_value("train critic loss", loss_c.data[0])
            add_summary_value("train actor loss", loss_a.data[0])
            add_summary_value("train actor reward", reward.data[0])
            tf_step += 1
            if tf_step % 100 == 0:
                tf_summary_writer.flush()

    loss_A = loss_A[0] / training_data.sents_size
    loss_C = loss_C[0] / training_data.sents_size
    return loss_A, loss_C
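# `fix_variable` is not shown here, but the later variants of this function inline the
# same pattern; a sketch consistent with that code (a detached copy, so the critic loss
# cannot back-propagate into the actor):
from torch.autograd import Variable


def fix_variable(v):
    fixed = Variable(v.data.new(*v.size()), requires_grad=False)
    fixed.data.copy_(v.data)
    return fixed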
def train_actor_critic():
    loss_A = loss_C = .0
    actor.train()
    critic.train()

    for imgs, labels in tqdm(training_data,
                             mininterval=1, desc="Actor-Critic Training", leave=False):
        optim_A.zero_grad()
        optim_C.zero_grad()

        enc = actor.encode(imgs)
        hidden_A = actor.feed_enc(enc)
        props_A, words_A = actor(hidden_A, labels)
        # Detached copy so the critic loss does not back-propagate into the actor.
        fixed_props_A = fix_variable(props_A)

        hidden_C = critic.feed_enc(enc)
        props_C, words_C = critic(words_A, hidden_C)

        scores_A, scores_C = rouge_l(words_A[:, 1:], labels), rouge_l(words_C, labels)
        fix_mask_rewards_A = mask_score(fixed_props_A, words_A[:, 1:], scores_A)
        mask_rewards_C = mask_score(props_C, words_C, scores_C)

        # Critic update.
        loss_c = critic.td_error(fix_mask_rewards_A, mask_rewards_C, criterion_C)
        loss_c.backward()
        loss_C += loss_c.data
        optim_C.clip_grad_norm()
        optim_C.step()

        # Actor update: the score difference weights the actor's probabilities.
        _g = mask_score(props_A, words_A[:, 1:], scores_A - scores_C)
        loss_a = criterion_A(_g, labels.view(-1))
        loss_a.backward()
        loss_A += loss_a.data
        optim_A.clip_grad_norm()
        optim_A.step()

    loss_A = loss_A[0] / training_data.sents_size
    loss_C = loss_C[0] / training_data.sents_size
    return loss_A, loss_C
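# `mask_score` is not defined in this section. One plausible reading -- broadcasting a
# per-sequence score over the non-padding time steps so it can feed the critic's
# td_error -- with padding id 0 as an assumption; the real implementation presumably
# also makes use of the probabilities it receives:
def mask_score_sketch(props, words, scores, pad_id=0):
    mask = (words != pad_id).float()      # (batch, seq_len) validity mask
    return scores.unsqueeze(1) * mask     # score repeated over valid steps only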
def train_actor_critic():
    loss_A = loss_C = .0
    actor.train()
    critic.train()

    # for imgs, labels in tqdm(training_data,
    #                          mininterval=1, desc="Actor-Critic Training", leave=False):
    for imgs, labels in training_data:
        optim_A.zero_grad()
        optim_C.zero_grad()

        enc = encode(imgs)[0]
        hidden_A = actor.feed_enc(enc)
        props_A, words_A = actor(hidden_A)
        # Detached copy of the actor's probabilities for the critic's target.
        fixed_props_A = Variable(props_A.data.new(*props_A.size()), requires_grad=False)
        fixed_props_A.data.copy_(props_A.data)

        hidden_C = critic.feed_enc(enc)
        props_C, words_C = critic(words_A, hidden_C)

        scores_A, scores_C = rouge_l(words_A[:, 1:], labels), rouge_l(words_C, labels)

        # Critic update.
        loss_c = critic.td_error(scores_A, scores_C, fixed_props_A, props_C, criterion_C)
        loss_c.backward()
        optim_C.step()
        loss_C += loss_c.data

        # Actor update: cross-entropy scaled by the mean score difference.
        base = (scores_A - scores_C).mean()
        loss_a = criterion_A(props_A.view(-1, props_A.size(2)), labels.view(-1)) * base
        loss_a.backward()
        optim_A.step()
        loss_A += loss_a.data

    loss_A = loss_A[0] / training_data.sents_size
    loss_C = loss_C[0] / training_data.sents_size
    return loss_A, loss_C
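# The driver loop is not shown in this section. A sketch of how these routines could be
# wired together, assuming an `args.epochs` option (an assumption) and the variant of
# eval() that returns a single ROUGE-L score:
def main():
    pre_train_critic()
    for epoch in range(args.epochs):
        loss_a, loss_c = train_actor_critic()
        score = eval()
        print("epoch {}: actor loss {:.4f} | critic loss {:.4f} | ROUGE-L {:.4f}".format(
            epoch + 1, loss_a, loss_c, score))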
def pre_train_critic():
    iterations, total_loss = 0, .0
    actor.eval()
    critic.train()
    global tf_step

    for imgs, labels in tqdm(training_data,
                             mininterval=1, desc="Pre-train Critic", leave=False):
        optim_pre_C.zero_grad()

        enc = actor.encode(imgs)
        hidden_A = actor.feed_enc(enc)
        props_A, words_A = actor(hidden_A, labels)
        # Detached copy: the fixed actor only provides targets here.
        fixed_props_A = fix_variable(props_A)

        hidden_C = critic.feed_enc(enc)
        props_C, words_C = critic(words_A, hidden_C)

        scores_A, scores_C = rouge_l(words_A[:, 1:], labels), rouge_l(words_C, labels)
        mask_rewards_A = mask_score(fixed_props_A, words_A[:, 1:], scores_A)
        mask_rewards_C = mask_score(props_C, words_C, scores_C)

        loss = critic.td_error(mask_rewards_A, mask_rewards_C, criterion_C)
        loss.backward()
        total_loss += loss.data
        optim_pre_C.clip_grad_norm()
        optim_pre_C.step()
        iterations += 1

        if tf is not None:
            add_summary_value("pre-train critic loss", loss.data[0])
            tf_step += 1
            if tf_step % 100 == 0:
                tf_summary_writer.flush()

        if iterations == args.iterations:
            break

    return total_loss[0] / args.iterations
def pre_train_critic():
    iterations, total_loss = 0, .0
    actor.eval()
    critic.train()

    # for imgs, labels in tqdm(training_data,
    #                          mininterval=1, desc="Pre-train Critic", leave=False):
    for imgs, labels in training_data:
        optim_pre_C.zero_grad()

        enc = encode(imgs)[0]
        hidden_A = actor.feed_enc(enc)
        props_A, words_A = actor(hidden_A)
        fixed_props_A = Variable(props_A.data.new(*props_A.size()), requires_grad=False)
        fixed_props_A.data.copy_(props_A.data)

        hidden_C = critic.feed_enc(enc)
        props_C, words_C = critic(words_A, hidden_C)

        scores_A, scores_C = rouge_l(words_A[:, 1:], labels), rouge_l(words_C, labels)
        loss = critic.td_error(scores_A, scores_C, fixed_props_A, props_C, criterion_C)
        loss.backward()
        optim_pre_C.step()
        total_loss += loss.data
        iterations += 1

        if iterations == args.iterations:
            break

    return total_loss[0] / args.iterations
def rouge_score(session):
    assert nb_batch * conf.batch_size % conf.batch_size == 0
    pred_sum = []
    for m in range(0, nb_batch * conf.batch_size, conf.batch_size):
        pred = session.run(decoder_prediction, feed_dict={
            encoder_inputs: test_doc2id[m:m + conf.batch_size],
            query_inputs: test_query2id[m:m + conf.batch_size],
            decoder_targets: test_summ2id[m:m + conf.batch_size],
            encoder_inputs_length: test_doc_len[m:m + conf.batch_size],
            query_inputs_length: test_que_len[m:m + conf.batch_size],
            decoder_targets_length: test_sum_len[m:m + conf.batch_size],
            sum_mask_tf: test_sum_mask[m:m + conf.batch_size],
            doc_mask_tf: test_doc_mask[m:m + conf.batch_size],
            que_mask_tf: test_query_mask[m:m + conf.batch_size],
            # embedding_placeholder: embedding_weights,
            is_training: False,
        })
        pred_sum.extend(pred.tolist())
    assert len(pred_sum) == nb_batch * conf.batch_size

    rouge1_sum, rouge2_sum, rougel_sum = [], [], []
    for i in range(nb_batch * conf.batch_size):
        pred_temp, ref_temp = [], []
        # Truncate prediction and reference at the end-of-sequence id (assumed to be 1).
        for id_ in pred_sum[i]:
            if id_ == 1:
                break
            pred_temp.append(str(id_))
        for id_ in test_summ2id[i]:
            if id_ == 1:
                break
            ref_temp.append(str(id_))
        if pred_temp == [] or ref_temp == []:
            continue
        rouge1_sum.append(rouge.rouge_n(pred_temp, ref_temp, n=1)[-1])
        rouge2_sum.append(rouge.rouge_n(pred_temp, ref_temp, n=2)[-1])
        rougel_sum.append(rouge.rouge_l(pred_temp, ref_temp))

    return np.mean(rouge1_sum), np.mean(rouge2_sum), np.mean(rougel_sum), \
        np.std(rouge1_sum), np.std(rouge2_sum), np.std(rougel_sum), pred_sum
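# A sketch of how `rouge_score` could be invoked once the graph is built, using the
# standard TF 1.x session idiom (the surrounding training/evaluation script is not
# shown in this section):
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    r1, r2, rl, r1_std, r2_std, rl_std, predictions = rouge_score(sess)
    print("ROUGE-1 %.4f  ROUGE-2 %.4f  ROUGE-L %.4f" % (r1, r2, rl))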
def train_actor_critic():
    actor.train()
    critic.train()
    if tf:
        global tf_step

    for imgs, labels in tqdm(training_data,
                             mininterval=1, desc="Actor-Critic Training", leave=False):
        optim_A.zero_grad()
        optim_C.zero_grad()

        enc = actor.encode(imgs)
        hidden_A = actor.feed_enc(enc)
        target, words = actor(hidden_A)
        policy_values = rouge_l(words, labels)

        hidden_C = critic.feed_enc(enc)
        estimated_values = critic(words, hidden_C)

        # Critic update: regress the value estimate towards the observed ROUGE-L.
        loss_c = criterion_C(estimated_values, policy_values)
        loss_c.backward()
        optim_C.clip_grad_norm()
        optim_C.step()

        # Actor update: cross-entropy step weighted by the advantage-style reward.
        reward = torch.mean(policy_values - estimated_values)
        loss_a = criterion_A(target.view(-1, target.size(2)), labels.view(-1))
        loss_a.backward()
        optim_A.clip_grad_norm()
        optim_A.step(reward)

        if tf is not None:
            add_summary_value("train critic loss", loss_c.data[0])
            add_summary_value("train actor loss", loss_a.data[0])
            add_summary_value("train actor reward", reward.data[0])
            add_summary_value("train critic score", estimated_values.data.mean())
            add_summary_value("train actor score", policy_values.data.mean())
            tf_step += 1
            if tf_step % 100 == 0:
                tf_summary_writer.flush()
def eval():
    actor.eval()
    eval_score = .0

    for imgs, labels in tqdm(validation_data,
                             mininterval=1, desc="Actor-Critic Eval", leave=False):
        enc = actor.encode(imgs)
        hidden = actor.feed_enc(enc)
        words, _ = actor.speak(hidden)
        scores = rouge_l(words, labels)
        eval_score += scores.sum().data

    eval_score = eval_score[0] / validation_data.sents_size
    return eval_score
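# `rouge_l` is used as the reward and evaluation metric throughout but is not defined
# in this section. A minimal sentence-level sketch based on the longest common
# subsequence, returning one F-score per (hypothesis, reference) pair of token lists;
# how the real implementation batches id tensors and weights precision against recall
# (beta) is an assumption.
def _lcs_len(a, b):
    # Classic dynamic-programming LCS length.
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a, 1):
        for j, y in enumerate(b, 1):
            dp[i][j] = dp[i - 1][j - 1] + 1 if x == y else max(dp[i - 1][j], dp[i][j - 1])
    return dp[len(a)][len(b)]


def rouge_l_sketch(hyp, ref, beta=1.2):
    if not hyp or not ref:
        return 0.0
    lcs = _lcs_len(hyp, ref)
    if lcs == 0:
        return 0.0
    prec, rec = lcs / float(len(hyp)), lcs / float(len(ref))
    # ROUGE-L F-score: (1 + beta^2) * P * R / (R + beta^2 * P)
    return (1 + beta ** 2) * prec * rec / (rec + beta ** 2 * prec)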
def eval():
    actor.eval()
    eval_loss = eval_score = .0

    # for imgs, labels in tqdm(validation_data,
    #                          mininterval=1, desc="Actor-Critic Eval", leave=False):
    for imgs, labels in validation_data:
        enc = encode(imgs)[0]
        hidden = actor.feed_enc(enc)
        props, words = actor(hidden)

        loss = criterion_A(props.view(-1, props.size(2)), labels.view(-1))
        scores = rouge_l(words[:, 1:], labels)

        eval_loss += loss.data
        eval_score += scores.sum()

    eval_loss = eval_loss[0] / validation_data.sents_size
    eval_score = eval_score[0] / validation_data.sents_size
    return eval_loss, eval_score
def pre_train_critic():
    iterations = 0
    actor.eval()
    critic.train()
    if tf:
        global tf_step

    for imgs, labels in tqdm(training_data,
                             mininterval=1, desc="Pre-train Critic", leave=False):
        optim_pre_C.zero_grad()

        enc = actor.encode(imgs)
        hidden_A = actor.feed_enc(enc)
        # Pre-train the critic by feeding it sampled actions from the fixed,
        # pre-trained actor.
        _, words = actor(hidden_A)
        policy_values = rouge_l(words, labels)

        hidden_C = critic.feed_enc(enc)
        estimated_values = critic(words, hidden_C)

        loss = criterion_C(estimated_values, policy_values)
        loss.backward()
        optim_pre_C.clip_grad_norm()
        optim_pre_C.step()
        iterations += 1

        if tf is not None:
            add_summary_value("pre-train critic loss", loss.data[0])
            tf_step += 1
            if tf_step % 100 == 0:
                tf_summary_writer.flush()

        if iterations == args.iterations:
            break
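# `add_summary_value`, `tf_summary_writer`, and `tf_step` come from the TensorBoard
# logging setup, which is not included in this section. A sketch using the TF 1.x
# summary API (the log directory name is an assumption):
import tensorflow as tf

tf_summary_writer = tf.summary.FileWriter("logs")
tf_step = 0


def add_summary_value(key, value):
    summary = tf.Summary(value=[tf.Summary.Value(tag=key, simple_value=value)])
    tf_summary_writer.add_summary(summary, tf_step)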