def cal_bleu_rouge(keys, values, rank=None, num_print=10000):
    start = rank * num_print if rank is not None else 0
    end = start + num_print
    # values = [val[start:end] for val in values]
    ref_answers, pred_answers = [], []
    for ex_idx in range(len(values[0])):
        for key_idx, key in enumerate(keys):
            value = values[key_idx][ex_idx]
            v = value[0] if isinstance(value, list) else value
            # print(f'{key}: {repr(v)}')
            if key == 'greedy':
                pred_answers.append({'answers': [str(v)]})
            elif key == 'answer':
                ref_answers.append({
                    'answers': [str(v)],
                    'question_id': ex_idx
                })
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                # normalize strings to space-joined chars (from the utils package)
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        # pred_dict holds predictions, ref_dict holds ground truth;
        # calculate bleu and rouge metrics via the utils package
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return bleu_rouge
def run_eval(pred_answers_list, ref_answers_list):
    """
    Run eval.
    """
    pred_answers = pred_answers_list
    ref_answers = ref_answers_list
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        assert len(pred_answers) == len(ref_answers), \
            "length mismatch: (pred_len={}) != (ref_len={})".format(len(pred_answers), len(ref_answers))
        pred_ids, ref_ids = [], []
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id_pred = pred['question_id']
            question_id_ref = ref['question_id']
            pred_ids.append(question_id_pred)
            ref_ids.append(question_id_ref)
            if len(ref['answers']) > 0:
                pred_dict[question_id_pred] = normalize(pred['answers'])
                ref_dict[question_id_ref] = normalize(ref['answers'])
        assert set(pred_ids) == set(ref_ids), "The two files contain different question ids."
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return bleu_rouge
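# --- Illustrative sketch (not part of the original snippets) ---
# Every function in this section assumes `normalize` and `compute_bleu_rouge` helpers from a
# DuReader-style utils module. The real implementations delegate to dedicated BLEU/ROUGE
# scorers; the minimal stand-ins below only show the expected interfaces: normalize() maps a
# list of answer strings to space-joined characters, and compute_bleu_rouge() maps
# {question_id: [answer]} dicts to a metric dict such as {'Bleu-1': ..., 'Rouge-L': ...}.
def normalize_sketch(answers):
    """Normalize a list of answer strings to space-joined chars (placeholder logic)."""
    normalized = []
    for ans in answers:
        chars = [ch for ch in ans if not ch.isspace()]
        normalized.append(' '.join(chars))
    return normalized

def compute_bleu_rouge_sketch(pred_dict, ref_dict):
    """Return a metric dict keyed like the real helper; the values here are placeholders."""
    assert set(pred_dict.keys()) == set(ref_dict.keys()), 'pred and ref question ids differ'
    return {'Bleu-1': 0.0, 'Bleu-2': 0.0, 'Bleu-3': 0.0, 'Bleu-4': 0.0, 'Rouge-L': 0.0}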
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    for b_itx, batch in enumerate(eval_batches):
        feed_dict = {self.p: batch['passage_token_ids'],
                     self.q: batch['question_token_ids'],
                     self.p_length: batch['passage_length'],
                     self.q_length: batch['question_length'],
                     self.start_label: batch['start_id'],
                     self.end_label: batch['end_id'],
                     self.dropout: 0.0}
        start_probs, end_probs, loss = self.sess.run(
            [self.start_probs, self.end_probs, self.loss], feed_dict)
        total_loss += loss * len(batch['raw_data'])
        total_num += len(batch['raw_data'])
        padded_p_len = len(batch['passage_token_ids'][0])
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
            best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
            if save_full_info:
                sample['pred_answers'] = [best_answer]
                pred_answers.append(sample)
            else:
                pred_answers.append({'question_id': sample['question_id'],
                                     'question_type': sample['question_type'],
                                     'answers': [best_answer],
                                     'entity_answers': [[]],
                                     'yesno_answers': []})
            if 'answers' in sample:
                ref_answers.append({'question_id': sample['question_id'],
                                    'question_type': sample['question_type'],
                                    'answers': sample['answers'],
                                    'entity_answers': [[]],
                                    'yesno_answers': []})
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w', encoding='utf-8') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    # this average loss is invalid on test set, since we don't have true start_id and end_id
    ave_loss = 1.0 * total_loss / total_num
    # compute the bleu and rouge scores if reference answers is provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
def get_score(result_dir):
    pred_answers = []
    with open(result_dir) as fin:
        for lidx_a, line_a in enumerate(fin):
            em_answer = json.loads(line_a.strip())
            pred_answers.append(em_answer)
    # compute the bleu and rouge scores if reference answers are provided
    pred_dict, ref_dict = {}, {}
    for answer in pred_answers:
        question_id = answer['question_id']
        if len(answer['ref_answers']) > 0:
            pred_dict[question_id] = normalize(answer['answers'])
            ref_dict[question_id] = normalize(answer['ref_answers'])
    bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    return bleu_rouge
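# --- Illustrative input (hypothetical values, not from the original data) ---
# get_score() reads a JSON-lines result file where each line carries at least 'question_id',
# 'answers' (the predictions), and 'ref_answers' (the references), e.g.:
#
#     {"question_id": 42, "answers": ["predicted answer text"], "ref_answers": ["reference answer text"]}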
def eval(path1, path2):
    # with open(path2, encoding='utf-8') as f:
    #     data = json.load(f)
    #     id = list(int(i) for i in data.keys())
    with open(path1, encoding='utf-8') as f:
        pred_answers, ref_answers = [], []
        for lidx, line in enumerate(f):
            sample = json.loads(line.strip())
            pred_answers.append({
                'question_id': sample['question_id'],
                'question': sample['question'],
                'question_type': sample['question_type'],
                'answers': ["No Answer Present."],
                'ref_answers': sample['ref_answers'],
                'entity_answers': [[]],
                'yesno_answers': []
            })
            ref_answers.append({
                'question_id': sample['question_id'],
                'question': sample['question'],
                'question_type': sample['question_type'],
                'answers': sample['ref_answers'],
                'ref_answers': sample['ref_answers'],
                'entity_answers': [[]],
                'yesno_answers': []
            })
        pred_dict, ref_dict = {}, {}
        F1 = 0
        count = 0
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
                F = local_prf(pred['answers'][0].split(), ref['answers'][0].split())
                F1 += F
                count += 1
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        F1_avg = F1 / count
        return bleu_rouge, F1_avg
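# --- Illustrative sketch (not part of the original snippets) ---
# eval() above also relies on a `local_prf` helper that is not shown here. Judging from its
# use (a scalar accumulated into F1), it returns a token-overlap F1 score between a predicted
# token list and a reference token list. A minimal stand-in could look like this:
from collections import Counter

def local_prf_sketch(pred_tokens, ref_tokens):
    """Token-overlap F1 between two token lists (placeholder for the real helper)."""
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0 or not pred_tokens or not ref_tokens:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)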
def evaluate(self, infer_file, ret=None, from_file=False):
    """
    Processes and evaluates the inferred result.

    Args:
        infer_file: A file name to store or read from the inferred results.
        ret: The information returned by the inferring operation, which contains
             the batch-level input and the batch-level inferring result.
        from_file: If True, the time-consuming inferring process will be skipped,
                   and this method takes the content of infer_file as input for evaluation.
                   If False, this method takes the ret as input for evaluation.
    """
    def _merge_and_normalize(obj_list):
        ret = {}
        for obj in obj_list:
            normalized = {k: normalize(v) for k, v in obj.items()}
            ret.update(normalized)
        return ret

    pred_list = []
    ref_list = []
    objs = []
    if from_file:
        ref_list, pred_list = self._read_list(infer_file)
    else:
        ref_list, pred_list, objs = self._parse_infer_ret(ret)
        with open(infer_file, 'w') as of:
            for o in objs:
                print >> of, json.dumps(o, ensure_ascii=False).encode('utf8')
    metrics = compute_bleu_rouge(
        _merge_and_normalize(pred_list),
        _merge_and_normalize(ref_list))
    res_str = '{} {}'.format(
        infer_file,
        ' '.join('{}={}'.format(k, v) for k, v in metrics.items()))
    logger.info(res_str)
def run_eval(pred_answers_list, ref_answers_list):
    """
    Run eval.
    """
    pred_answers = pred_answers_list
    ref_answers = ref_answers_list
    # compute the bleu and rouge scores if reference answers is provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return bleu_rouge
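# --- Illustrative usage (hypothetical data, not from the original code) ---
# Both run_eval() variants in this section take parallel lists of dicts keyed by
# 'question_id' and 'answers'; a minimal call could look like:
#
#     preds = [{'question_id': 1, 'answers': ['predicted answer text']}]
#     refs = [{'question_id': 1, 'answers': ['reference answer text']}]
#     metrics = run_eval(preds, refs)   # e.g. {'Bleu-1': ..., 'Bleu-4': ..., 'Rouge-L': ...}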
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
    """
    Evaluates the model performance on eval_batches and results are saved if specified
    Args:
        eval_batches: iterable batch data
        result_dir: directory to save predicted answers, answers will not be saved if None
        # prefix of the result file
        result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None
        save_full_info: if True, the pred_answers will be added to raw sample and saved
    """
    # predicted answers and reference answers
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    line_number = 1  # start reading from line 1
    for bitx in range(1, 72):
        batch, line_number = self.brc_data.load_batch_size_data_set(
            line_number, '../data/devset/search.dev.json', train=True)
        feed_dict = {
            self.p: batch['passage_token_ids'],
            self.q: batch['question_token_ids'],
            self.p_length: batch['passage_length'],
            self.q_length: batch['question_length'],
            self.start_label: batch['start_id'],
            self.end_label: batch['end_id'],
            self.dropout_keep_prob: 1.0
        }
        start_probs, end_probs, loss = self.sess.run(
            [self.start_probs, self.end_probs, self.loss], feed_dict)
        total_loss += loss * len(batch['raw_data'])
        total_num += len(batch['raw_data'])
        padded_p_len = len(batch['passage_token_ids'][0])
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
            best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
            # if full info should be saved, store the predicted answer in the raw sample and save it
            if save_full_info:
                sample['pred_answers'] = [best_answer]
                pred_answers.append(sample)
            else:
                pred_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': [best_answer],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })
            if 'answers' in sample:
                ref_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': sample['answers'],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })
    # save the predicted answers
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    # compute the loss and the bleu and rouge scores
    # this average loss is invalid on test set, since we don't have true start_id and end_id
    ave_loss = 1.0 * total_loss / total_num
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        # dump the predictions and the reference answers
        pickle.dump(pred_dict, open('./logs/pred_dict.txt', 'wb'))
        pickle.dump(ref_dict, open('./logs/ref_dict.txt', 'wb'))
        result_file = os.path.join('./logs', 'pred_dict' + '.json')
        with open(result_file, 'w') as fout:
            fout.write(json.dumps(pred_dict, ensure_ascii=False) + '\n')
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
    """
    Evaluates the model performance on eval_batches and results are saved if specified
    Args:
        eval_batches: iterable batch data
        result_dir: directory to save predicted answers, answers will not be saved if None
        result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None
        save_full_info: if True, the pred_answers will be added to raw sample and saved
    """
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    for b_itx, batch in enumerate(eval_batches):
        feed_dict = {self.p: batch['passage_token_ids'],
                     self.q: batch['question_token_ids'],
                     self.p_length: batch['passage_length'],
                     self.q_length: batch['question_length'],
                     self.start_label: batch['start_id'],
                     self.end_label: batch['end_id'],
                     self.dropout_keep_prob: 1.0}
        start_probs, end_probs, loss = self.sess.run(
            [self.start_probs, self.end_probs, self.loss], feed_dict)
        total_loss += loss * len(batch['raw_data'])
        total_num += len(batch['raw_data'])
        padded_p_len = len(batch['passage_token_ids'][0])
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
            best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
            if save_full_info:
                sample['pred_answers'] = [best_answer]
                pred_answers.append(sample)
            else:
                pred_answers.append({'question_id': sample['question_id'],
                                     'question_type': sample['question_type'],
                                     'answers': [best_answer],
                                     'entity_answers': [[]],
                                     'yesno_answers': []})
            if 'answers' in sample:
                ref_answers.append({'question_id': sample['question_id'],
                                    'question_type': sample['question_type'],
                                    'answers': sample['answers'],
                                    'entity_answers': [[]],
                                    'yesno_answers': []})
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    # this average loss is invalid on test set, since we don't have true start_id and end_id
    ave_loss = 1.0 * total_loss / total_num
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
def train_analysis(self, step): pred_answers, ref_answers = [], [] fake_answers = [] ref_answers.append({ 'question_id': self.data['question_id'], 'question_type': self.data['question_type'], 'answers': self.data['ref_answers'] }) listSelectedSet = [] all_set = [] # print '+++++++++++++++++++++++++++++++++++++++++++' # print ('question_id', list2string(self.tfg.vocab.recover_from_ids(self.data['question_token_ids']))) for p_idx, is_selected in enumerate( self.data['passage_is_selected_list'], 0): all_set += self.data['passage_token_ids_list'][p_idx] if is_selected == True: # print 'is True' # print('title ', self.data['passage_title_token_ids_list'][p_idx]) #print('title', list2string(self.tfg.vocab.recover_from_ids(self.data['passage_title_token_ids_list'][p_idx]))) listSelectedSet += self.data['passage_token_ids_list'][p_idx] pred_answer_str = '' str123_list = self.tfg.vocab.recover_from_ids(listSelectedSet) all_set_list = self.tfg.vocab.recover_from_ids(all_set) for s in str123_list: pred_answer_str += s selected_recall_score = 1 selected_f1_score = 0 all_racall_score = 1 all_f1_score = 0 if len(self.data['segmented_answers']) > 0 and len(str123_list) > 0: selected_recall_score = metric_max_over_ground_truths( recall, str123_list, self.data['segmented_answers']) selected_f1_score = metric_max_over_ground_truths( f1_score, str123_list, self.data['segmented_answers']) if len(self.data['segmented_answers']) > 0 and len(str123_list) > 0: all_racall_score = metric_max_over_ground_truths( recall, all_set_list, self.data['segmented_answers']) all_f1_score = metric_max_over_ground_truths( f1_score, all_set_list, self.data['segmented_answers']) # print pred_answer_str # print ('ref_answer',self.data['ref_answers'] ) # print('fake_answers', self.data['fake_answers']) # print('pre_answer', [''.join(pred_answer_str)]) pred_answer = { 'question_id': self.data['question_id'], 'question_type': self.data['question_type'], 'answers': [''.join(pred_answer_str)] } pred_answers.append(pred_answer) fake_answer = { 'question_id': self.data['question_id'], 'question_type': self.data['question_type'], 'answers': self.data['fake_answers'] } fake_answers.append(fake_answer) # pre VS ref if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None value_with_mcts = bleu_rouge # print 'ref VS pre: ' # print value_with_mcts # pre VS fac if len(ref_answers) > 0 and len(fake_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, fake_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None value_with_mcts = bleu_rouge # print 'pre VS fac: ' # print value_with_mcts # pre VS fac if len(ref_answers) > 0 and len(self.data['fake_answers']) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(fake_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None value_with_mcts = bleu_rouge # print 'fac VS ref: ' # print value_with_mcts 
match_score = [ selected_recall_score, selected_f1_score, all_racall_score, all_f1_score ] #print ('match_score', match_score) return pred_answer, fake_answer, match_score
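# --- Illustrative sketch (not part of the original snippets) ---
# train_analysis() above calls `metric_max_over_ground_truths`, `recall`, and `f1_score`,
# which are not shown in this section. They follow the usual SQuAD-style pattern: score the
# prediction against every ground-truth answer and keep the best value. A minimal stand-in,
# assuming token-list inputs:
from collections import Counter

def metric_max_over_ground_truths_sketch(metric_fn, prediction_tokens, ground_truth_token_lists):
    """Apply metric_fn to the prediction against each ground truth and keep the best score."""
    scores = [metric_fn(prediction_tokens, gt_tokens) for gt_tokens in ground_truth_token_lists]
    return max(scores) if scores else 0.0

def recall_sketch(prediction_tokens, ground_truth_tokens):
    """Fraction of ground-truth tokens covered by the prediction (placeholder logic)."""
    if not ground_truth_tokens:
        return 0.0
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    return sum(common.values()) / len(ground_truth_tokens)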
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, result_name='', save_full_info=False): """ Evaluates the model performance on eval_batches and results are saved if specified Args: eval_batches: iterable batch data result_dir: directory to save predicted answers, answers will not be saved if None result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None save_full_info: if True, the pred_answers will be added to raw sample and saved """ pred_answers, ref_answers = [], [] total_loss, total_num = 0, 0 n_batch_loss = 0.0 n_batch = 0 for b_itx, batch in enumerate(eval_batches, 1): feed_dict = { self.p: batch['passage_token_ids'], self.q: batch['question_token_ids'], self.p_length: batch['passage_length'], self.q_length: batch['question_length'], self.start_label: batch['start_id'], self.end_label: batch['end_id'], self.dropout_keep_prob: 1.0 } if self.debug_print: if self.simple_net in [0, 5]: res = self.sess.run([ self.loss, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.start_probs ], feed_dict) names = 'self.loss, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.start_probs'.split( ',') if self.simple_net in [1, 2]: res = self.sess.run([ self.loss, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, self.gm1, self.gm2, self.start_probs, self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn ], feed_dict) names = 'self.loss, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, \ self.gm1, self.gm2, self.start_probs, self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn'.split( ',') if self.simple_net in [3, 4]: res = self.sess.run([ self.loss, self.start_probs, self.end_probs, self.loss2, self.start_label, self.end_label, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn, self.pn_init_state, self.pn_f0, self.pn_f1, self.pn_b0, self.pn_b1 ], feed_dict) names = 'self.loss, self.start_probs, self.end_probs, self.loss2, self.start_label, self.end_label, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, \ self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn, self.pn_init_state, self.pn_f0, self.pn_f1, self.pn_b0, self.pn_b1'.split( ',') loss, start_probs, end_probs = res[0:3] for i in range(1, len(res)): p_name = names[i] p_array = res[i] self.var_print('var', p_array, p_name, p_name) else: start_probs, end_probs, loss = self.sess.run( [self.start_probs, self.end_probs, self.loss], feed_dict) total_loss += loss * len(batch['raw_data']) total_num += len(batch['raw_data']) n_batch_loss = loss * len(batch['raw_data']) n_batch += len(batch['raw_data']) if self.log_interval > 0 and b_itx % self.log_interval == 0: #self.print_num_of_total_parameters(True, True) self.logger.info( 'Average dev loss from batch {} to {} is {}'.format( b_itx - self.log_interval + 1, b_itx, "%.10f" % (n_batch_loss / n_batch))) n_batch_loss = 0.0 n_batch = 0 padded_p_len = len(batch['passage_token_ids'][0]) for sample, start_prob, end_prob 
in zip(batch['raw_data'], start_probs, end_probs): best_answer, best_span = self.find_best_answer( sample, start_prob, end_prob, padded_p_len) if save_full_info: sample['pred_answers'] = [best_answer] pred_answers.append(sample) else: pred = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': [best_span] } pred_answers.append(pred) if self.debug_print: self.logger.info('pred=' + json.dumps(pred, ensure_ascii=False)) if 'answers' in sample: ref = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': [best_span] } ref_answers.append(ref) if self.debug_print: self.logger.info('ref=' + json.dumps(ref, ensure_ascii=False)) if result_dir is not None and result_prefix is not None: result_file = os.path.join(result_dir, result_prefix + result_name + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write( json.dumps(pred_answer, ensure_ascii=False) + '\n') self.logger.info('Saving {} results to {}'.format( result_prefix, result_file)) #exit() # this average loss is invalid on test set, since we don't have true start_id and end_id ave_loss = 1.0 * total_loss / total_num # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
    """
    Evaluates the model performance on eval_batches and results are saved if specified
    Args:
        eval_batches: iterable batch data
        result_dir: directory to save predicted answers, answers will not be saved if None
        result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None
        save_full_info: if True, the pred_answers will be added to raw sample and saved
    """
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    for b_itx, batch in enumerate(eval_batches):
        feed_dict = {
            self.p: batch['passage_token_ids'],
            self.q: batch['question_token_ids'],
            self.p_length: batch['passage_length'],
            self.q_length: batch['question_length'],
            self.start_label: batch['start_id'],
            self.end_label: batch['end_id'],
            self.dropout_keep_prob: 1.0
        }  # no dropout during evaluation
        start_probs, end_probs, loss = self.sess.run(
            [self.start_probs, self.end_probs, self.loss], feed_dict)
        # self.logger.debug(start_probs)
        total_loss += loss * len(batch['raw_data'])
        total_num += len(batch['raw_data'])
        padded_p_len = len(batch['passage_token_ids'][0])
        # length of the longest (padded) sample in self.p; batch['passage_token_ids'] should be
        # a multi-dimensional np.ndarray, but it may be handled as a list of lists, so this
        # takes the first row, i.e. the first sample
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
            best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
            # this is how start_prob and end_prob are used for inference during evaluation and testing
            if save_full_info:
                sample['pred_answers'] = [best_answer]
                pred_answers.append(sample)
            else:
                pred_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'question': sample['question'],
                    'answers': [best_answer],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })
            if 'answers' in sample:
                ref_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': sample['answers'],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w', encoding='utf8') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    # this average loss is invalid on test set,
    # since we don't have true start_id and end_id
    ave_loss = 1.0 * total_loss / total_num
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                # normalize strings to space-joined chars (from the utils package)
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        # pred_dict holds predictions, ref_dict holds ground truth;
        # calculate bleu and rouge metrics via the utils package
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None):
    """
    Evaluates the model on the dev set; results are saved if result_dir and result_prefix are specified
    """
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    count = 0
    for b_itx, batch in enumerate(eval_batches):
        count += 1
        if count % 100 == 0:
            self.logger.info(count)
        if batch['passage_length'][0] <= 0:
            continue
        feed_dict = {
            self.p: batch['passage_token_ids'],
            self.q: batch['question_token_ids'],
            self.p_length: batch['passage_length'],
            self.q_length: batch['question_length'],
            self.start_label: batch['start_id'],
            self.end_label: batch['end_id'],
            self.dropout_keep_prob: 1.0
        }
        (start_probs, end_probs, loss, fuse_value, context2question_attn,
         question2context_attn, match_value) = self.sess.run(
            [self.start_probs, self.end_probs, self.loss, self.fuse_value,
             self.context2question_attn, self.question2context_attn, self.match_value],
            feed_dict)
        total_loss += loss * len(batch['raw_data'])
        total_num += len(batch['raw_data'])
        padded_p_len = len(batch['passage_token_ids'][0])
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
            best_answer, seg_answers, best_score, q2c_match_score, c2q_match_score = \
                self.find_best_answer(sample, start_prob, end_prob, padded_p_len,
                                      context2question_attn, question2context_attn)
            if 'answers' in sample:
                ref_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': sample['answers'],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })
                pred_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'question_tokens': sample['question'],
                    'ref_answers': sample['answers'],
                    'best_score': str(best_score),
                    'answers': [best_answer],
                    'entity_answers': [[]],
                    'yesno_answers': [],
                    'match_value': str(match_value),
                    'fuse_value': str(fuse_value),
                    'q2c_match_score': str(q2c_match_score),
                    'c2q_match_score': str(c2q_match_score),
                    'seg_answers': [seg_answers]
                })
            else:
                pred_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'question_tokens': sample['question'],
                    'answers': [best_answer],
                    'best_score': str(best_score),
                    'match_value': str(match_value),
                    'entity_answers': [[]],
                    'yesno_answers': [],
                    'fuse_value': str(fuse_value),
                    'q2c_match_score': str(q2c_match_score),
                    'c2q_match_score': str(c2q_match_score),
                    'seg_answers': [seg_answers]
                })
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))
    # this average loss is invalid on the test set, since the test set has no annotated answers
    ave_loss = 1.0 * total_loss / total_num
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False): """ Evaluates the model performance on eval_batches and results are saved if specified Args: eval_batches: iterable batch data result_dir: directory to save predicted answers, answers will not be saved if None result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None save_full_info: if True, the pred_answers will be added to raw sample and saved """ pred_answers, ref_answers = [], [] total_loss, total_num = 0, 0 for b_itx, batch in enumerate(eval_batches): #p_allennlpd = [get_str_num(ids) for ids in batch['passage_token_ids']] #q_allennlpd = [get_str_num(ids) for ids in batch['question_token_ids']] #p_allennlpd = get_allennlp_vec(p_allennlpd) #q_allennlpd = get_allennlp_vec(q_allennlpd) # shape[batch,sequence_len , 3 * 1024 = 3072] # print(p_allennlpd.shape) # [32 * 5 = 160 , 500 , 3 * 1024 = 3072] # print(q_allennlpd.shape) # [32 * 5 , 500 , 3 * 1024 = 3072] feed_dict = { self.p: batch['passage_token_ids'], self.q: batch['question_token_ids'], #self.p_allennlp: p_allennlpd, #self.q_allennlp: q_allennlpd, self.p_length: batch['passage_length'], self.q_length: batch['question_length'], self.start_label: batch['start_id'], self.end_label: batch['end_id'], self.real_pass: batch['real_pass'], self.sequence_label: batch['sequence_label'], self.dropout_keep_prob: 1.0 } start_probs, end_probs, yes_probs, pred_pass_probs, loss = self.sess.run( [ self.start_probs, self.end_probs, self.yes_probs, self.pred_pass_probs, self.loss ], feed_dict) total_loss += loss * len(batch['raw_data']) total_num += len(batch['raw_data']) padded_p_len = len(batch['passage_token_ids'][0]) for sample, start_prob, end_prob, yes_prob, pred_pass_prob in zip( batch['raw_data'], start_probs, end_probs, yes_probs, pred_pass_probs): best_answer = self.find_best_answer(sample, start_prob, end_prob, yes_prob, pred_pass_prob, padded_p_len) if save_full_info: sample['pred_answers'] = [best_answer] pred_answers.append(sample) else: pred_answers.append({ 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': [] }) if 'answers' in sample: ref_answers.append({ 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': [] }) if result_dir is not None and result_prefix is not None: result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write( json.dumps(pred_answer, ensure_ascii=False) + '\n') self.logger.info('Saving {} results to {}'.format( result_prefix, result_file)) # this average loss is invalid on test set, since we don't have true start_id and end_id ave_loss = 1.0 * total_loss / total_num # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge
def search(self, start_node_id): #print '----tree search' tmp_node = self.tree.get_node(start_node_id) #print tmp_node.data.num has_visit_num = tmp_node.data.num - 1 self.count = has_visit_num if int(self.max_search_time - has_visit_num) > 0: start_node_search_time = int(self.max_search_time - has_visit_num) else: start_node_search_time = 0 for tm in range(start_node_search_time): if tm % 10 == 0: batch_start_time = time.time() #print ('search time',tm) search_list = [start_node_id] tmp_node = self.tree.get_node(start_node_id) #print 'search time :'+ str(time) while not tmp_node.is_leaf(): max_score = float("-inf") max_id = -1 for child_id in tmp_node.fpointer: child_node = self.tree.get_node(child_id) score = self.beta * child_node.data.p * ( (tmp_node.data.num)**0.5 / (1 + child_node.data.num)) #print 'child_node.data.Q: ' #print child_node.data.Q score += child_node.data.Q #print 'score: ' #print score #print '**************' if score > max_score: max_id = child_id max_score = score search_list.append(max_id) tmp_node = self.tree.get_node(max_id) if not tmp_node.data.value == None: v = tmp_node.data.value else: if tmp_node.data.sen[-1] == str(self.l_passages - 1): pred_answer = tmp_node.data.sen # print 'search to end pred_answer: ' # print pred_answer # print 'listSelectedSet' listSelectedSet_sens = [] listSelectedSet = map(eval, pred_answer) # print listSelectedSet for idx in listSelectedSet: listSelectedSet_sens.append(self.p_sen_list[idx]) # print 'pred_answer ' pred_answer_str = '' for sen in listSelectedSet_sens: str123_list = self.carpe_diem.vocab.recover_from_ids( sen, 0) for s in str123_list: pred_answer_str += s # print 'pred_answer_str: ' # print pred_answer_str # print 'ref_answer_str: ' # print list2string(self.ref_answer[0]['answers']) pred_answers = [] pred_answers.append({ 'question_id': [self.q_id], 'question_type': [], 'answers': [''.join(pred_answer_str)], 'entity_answers': [[]], 'yesno_answers': [] }) if len(self.ref_answer) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, self.ref_answer): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize( pred['answers']) ref_dict[question_id] = normalize( ref['answers']) # print '========compare in tree=======' # print pred_dict[question_id] # print '----------------------' # print ref_dict[question_id] bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None # print 'last words ++++++++++++++ ' # print bleu_rouge v = input_v = bleu_rouge['Rouge-L'] * self.m_value['Rouge-L'] \ + bleu_rouge['Bleu-4'] * self.m_value['Bleu-4'] \ + bleu_rouge['Bleu-1'] * self.m_value['Bleu-1'] \ + bleu_rouge['Bleu-3'] * self.m_value['Bleu-3'] \ + bleu_rouge['Bleu-2'] * self.m_value['Bleu-2'] else: v = self.carpe_diem.value_function(tmp_node.data.sen)[0][0] tmp_node.data.value = v # if tmp_node.data.sen[-1] == str(self.l_passages - 1): # pred_answer = tmp_node.data.sen # listSelectedSet_sens = [] # listSelectedSet = map(eval, pred_answer) # # print listSelectedSet # for idx in listSelectedSet: # listSelectedSet_sens.append(self.p_sen_list[idx]) # # print 'pred_answer ' # pred_answer_str = '' # for sen in listSelectedSet_sens: # str123_list = self.carpe_diem.vocab.recover_from_ids(sen, 0) # for s in str123_list: # pred_answer_str += s # # pred_answers = [] # # pred_answers.append({'question_id': [self.q_id], # 'question_type': [], # 'answers': [''.join(pred_answer_str)], # 'entity_answers': [[]], # 'yesno_answers': []}) # if len(self.ref_answer) > 0: # pred_dict, 
ref_dict = {}, {} # for pred, ref in zip(pred_answers, self.ref_answer): # question_id = ref['question_id'] # if len(ref['answers']) > 0: # pred_dict[question_id] = normalize(pred['answers']) # ref_dict[question_id] = normalize(ref['answers']) # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) # else: # bleu_rouge = None # v = bleu_rouge['Rouge-L'] * self.m_value['Rouge-L'] \ # + bleu_rouge['Bleu-4'] * self.m_value['Bleu-4'] \ # + bleu_rouge['Bleu-1'] * self.m_value['Bleu-1'] \ # + bleu_rouge['Bleu-3'] * self.m_value['Bleu-3'] \ # + bleu_rouge['Bleu-2'] * self.m_value['Bleu-2'] # else: # v = self.carpe_diem.value_function(tmp_node.data.sen)[0][0] self.update(search_list, v) self.count += 1 if tmp_node.is_leaf() and ( self.tree.depth(tmp_node) < self.max_depth ) and tmp_node.data.sen[-1] != str(self.l_passages - 1): self.expand(tmp_node) # if tm %10 == 0: # print ('==================== search 10 time = %3.2f s ====================' % (time.time() - batch_start_time)) ########### '''
#!/usr/bin/python3
def _analysis(self, step , train_batches, dropout_keep_prob): """ Trains the model for a single epoch. Args: train_batches: iterable batch data for training dropout_keep_prob: float value indicating dropout keep probability """ total_loss = 0 num_loss = 0 total_recall = [0.0,0.0,0.0,0.0] num_recall =0 batch_start_time = 0 batch_start_time = time.time() pred_answers, ref_answers = [], [] fake_answers = [] for fbitx, batch in enumerate(train_batches, 1): step += 1 if fbitx % 1000 == 0: print '------ Batch Question: ' + str(fbitx) trees = [] batch_tree_set = [] batch_size = len(batch['question_ids']) #print ('batch_size)', batch_size) for bitx in range(batch_size): tree = {'question_id': batch['question_ids'][bitx], 'question_token_ids': batch['question_token_ids'][bitx], 'q_length': batch['question_length'][bitx], 'passage_token_ids_list': batch['passage_token_ids_list'][bitx], 'passage_title_token_ids_list': batch['passage_title_token_ids_list'][bitx], 'passage_title_length_list': batch['passage_title_length_list'][bitx], 'passage_sentence_token_ids_list': batch['passage_sentence_token_ids_list'][bitx], 'passage_sen_length': batch['passage_sen_length_list'][bitx], #'p_length': batch['passage_length'][bitx], 'passage_is_selected_list': batch['passage_is_selected_list'][bitx], 'question_type': batch['question_types'][bitx], 'ref_answers': batch['ref_answers'][bitx], 'fake_answers': batch['fake_answers'][bitx], 'segmented_answers': batch['segmented_answers'][bitx] } ref_answers.append({'question_id': tree['question_id'], 'question_type': tree['question_type'], 'answers': tree['ref_answers']}) trees.append(tree) #print batch batch_tree = SearchTree(self.tfg, tree, self.max_a_len, self.search_time, self.beta, self.m_value, dropout_keep_prob) batch_tree_set.append(batch_tree) # for every data in batch do training process for idx, batch_tree in enumerate(batch_tree_set,1): pred_answer, fake_answer, recall = batch_tree.train_analysis(step) pred_answers.append(pred_answer) fake_answers.append(fake_answer) total_recall[0] += recall[0] total_recall[1] += recall[1] total_recall[2] += recall[2] total_recall[3] += recall[3] num_recall += 1 print('ave select recall', total_recall[0] / num_recall) print('ave select f1', total_recall[1] / num_recall) print('ave all recall', total_recall[2] / num_recall) print('ave all f1', total_recall[3] / num_recall) ii = 0 if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): ii += 1 question_id = ref['question_id'] #print('type', question_id) if len(ref['answers']) > 0: ref_dict[question_id] = normalize(ref['answers']) pred_dict[question_id] = normalize(pred['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None value_with_mcts = bleu_rouge print ('pre_scor',value_with_mcts) #return 1.0 * total_loss / num_loss, step return 0, step
def validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args): """ do inference with given inference_program """ parallel_executor = fluid.ParallelExecutor(main_program=inference_program, use_cuda=bool(args.use_gpu), loss_name=avg_cost.name) print_para(inference_program, parallel_executor, logger, args) # Use test set as validation each pass total_loss = 0.0 count = 0 n_batch_cnt = 0 n_batch_loss = 0.0 pred_answers, ref_answers = [], [] val_feed_list = [ inference_program.global_block().var(var_name) for var_name in feed_order ] val_feeder = fluid.DataFeeder(val_feed_list, place) pad_id = vocab.get_id(vocab.pad_token) dev_reader = lambda: brc_data.gen_mini_batches( 'dev', args.batch_size, pad_id, shuffle=False) dev_reader = read_multiple(dev_reader, dev_count) for batch_id, batch_list in enumerate(dev_reader(), 1): feed_data = batch_reader(batch_list, args) val_fetch_outs = parallel_executor.run( feed=list(val_feeder.feed_parallel(feed_data, dev_count)), fetch_list=[avg_cost.name, s_probs.name, e_probs.name, match.name], return_numpy=False) total_loss += np.array(val_fetch_outs[0]).sum() start_probs_m = LodTensor_Array(val_fetch_outs[1]) end_probs_m = LodTensor_Array(val_fetch_outs[2]) match_lod = val_fetch_outs[3].lod() count += len(np.array(val_fetch_outs[0])) n_batch_cnt += len(np.array(val_fetch_outs[0])) n_batch_loss += np.array(val_fetch_outs[0]).sum() log_every_n_batch = args.log_interval if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0: logger.info('Average dev loss from batch {} to {} is {}'.format( batch_id - log_every_n_batch + 1, batch_id, "%.10f" % (n_batch_loss / n_batch_cnt))) n_batch_loss = 0.0 n_batch_cnt = 0 batch_offset = 0 for idx, batch in enumerate(batch_list): #one batch batch_size = len(batch['raw_data']) batch_range = match_lod[0][batch_offset:batch_offset + batch_size + 1] batch_lod = [[batch_range[x], batch_range[x + 1]] for x in range(len(batch_range[:-1]))] start_prob_batch = start_probs_m[batch_offset:batch_offset + batch_size + 1] end_prob_batch = end_probs_m[batch_offset:batch_offset + batch_size + 1] for sample, start_prob_inst, end_prob_inst, inst_range in zip( batch['raw_data'], start_prob_batch, end_prob_batch, batch_lod): #one instance inst_lod = match_lod[1][inst_range[0]:inst_range[1] + 1] best_answer, best_span = find_best_answer_for_inst( sample, start_prob_inst, end_prob_inst, inst_lod) pred = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': [] } pred_answers.append(pred) if 'answers' in sample: ref = { 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': [] } ref_answers.append(ref) batch_offset = batch_offset + batch_size result_dir = args.result_dir result_prefix = args.result_name if result_dir is not None and result_prefix is not None: if not os.path.exists(args.result_dir): os.makedirs(args.result_dir) result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') logger.info('Saving {} results to {}'.format(result_prefix, result_file)) ave_loss = 1.0 * total_loss / count # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, 
ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge
def do_eval(model, batcher, settings, result_dir=None, result_prefix=None, save_full_info=False): """ """ pred_answers, ref_answers = [], [] total_loss, total_num = 0, 0 count = 0 while True: # batch = batcher.get_next_batch() if batch is None: break # results = model.run_eval_one_batch(batch) count += 1 print(count) # loss = results["loss_optim"] idx_passage = results["idx_passage"] idx_start = results["idx_start"] idx_end = results["idx_end"] # pred_prob = results["pred_prob"] # batch_size = len(idx_passage) total_loss += loss * batch_size total_num += batch_size # sidx = 0 for sidx in range(batch_size): # sample = batch['data_raw'][sidx] idx_p_curr = idx_passage[sidx] idx_s_curr = idx_start[sidx] idx_e_curr = idx_end[sidx] # prob_curr = pred_prob[sidx] # pred_a = ''.join(sample['passages'][idx_p_curr]['passage_tokens'][idx_s_curr: idx_e_curr + 1]) # if save_full_info: sample['pred_answers'] = [pred_a] pred_answers.append(sample) else: pred_answers.append({'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [ pred_a ], 'entity_answers': [[]], 'yesno_answers': []}) if 'answers' in sample: ref_answers.append({'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': []}) # # saving if result_dir is not None and result_prefix is not None: result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w', encoding="utf-8") as fout: for pred_answer in pred_answers: fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') # model.logger.info('saving {} results to {}'.format(result_prefix, result_file)) # # # metric # this average loss is invalid on test set, since we don't have true start_id and end_id ave_loss = 1.0 * total_loss / total_num # if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) # bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None # print("ave_loss: %g" % ave_loss) print("bleu_rouge:") print(bleu_rouge) # model.logger.info('ave_loss: {}'.format(ave_loss)) model.logger.info('bleu_rouge: {}'.format(bleu_rouge)) # return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False): """ Evaluates the model performance on eval_batches and results are saved if specified Args: eval_batches: iterable batch data result_dir: directory to save predicted answers, answers will not be saved if None result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None save_full_info: if True, the pred_answers will be added to raw sample and saved """ pred_answers, ref_answers = [], [] total_loss, total_num = 0, 0 for b_itx, batch in enumerate(eval_batches): passage_len = len(batch['passage_token_ids'][0]) label_batch = len(batch['start_id']) all_passage = len(batch['passage_token_ids']) concat_passage_len = all_passage / label_batch * passage_len feed_dict = { self.p: batch['passage_token_ids'], self.q: batch['question_token_ids'], self.p_length: batch['passage_length'], self.q_length: batch['question_length'], self.start_label: batch['start_id'], self.end_label: batch['end_id'], self.start_label_probs: self._get_label_probs(batch['start_id'], concat_passage_len), self.end_label_probs: self._get_label_probs(batch['end_id'], concat_passage_len), self.dropout_keep_prob: 1.0 } if hasattr(self, 'char_vocab'): char_input = { self.p_char: batch['passage_char_ids'], self.q_char: batch['question_char_ids'], self.p_char_length: batch['passage_char_length'], self.q_char_length: batch['question_char_length'], } feed_dict.update(char_input) start_probs, end_probs, loss = self.sess.run( [self.start_probs, self.end_probs, self.loss], feed_dict) total_loss += loss * len(batch['raw_data']) total_num += len(batch['raw_data']) padded_p_len = len(batch['passage_token_ids'][0]) for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs): best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len) if save_full_info: sample['pred_answers'] = [best_answer] pred_answers.append(sample) else: pred_answers.append({ 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': [] }) if 'answers' in sample: ref_answers.append({ 'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': [] }) if result_dir is not None and result_prefix is not None: result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write( json.dumps(pred_answer, ensure_ascii=False) + '\n') self.logger.info('Saving {} results to {}'.format( result_prefix, result_file)) # this average loss is invalid on test set, since we don't have true start_id and end_id ave_loss = 1.0 * total_loss / total_num # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge
def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False): """ Evaluates the model performance on eval_batches and results are saved if specified Args: eval_batches: iterable batch data result_dir: directory to save predicted answers, answers will not be saved if None result_prefix: prefix of the file for saving predicted answers, answers will not be saved if None save_full_info: if True, the pred_answers will be added to raw sample and saved """ pred_answers, ref_answers = [], [] total_loss, total_num = 0, 0 for b_itx, batch in enumerate(eval_batches): feed_dict = {self.p: batch['passage_token_ids'], self.q: batch['question_token_ids'], self.p_length: batch['passage_length'], self.q_length: batch['question_length'], self.start_label: batch['start_id'], self.end_label: batch['end_id'], self.dropout_keep_prob: 1.0, self.em: batch['exact_match']} batch_size = len(batch['start_id']) padded_p_len = len(batch['passage_token_ids'][0]) padded_p_num = len(batch['passage_token_ids']) / batch_size para_ids = [] for start_id in batch['start_id']: para_ids.append(start_id // padded_p_len) feed_dict[self.para_label] = para_ids content_label = np.zeros([batch_size, padded_p_num * padded_p_len], dtype=int) for s_idx, (start_id, end_id) in enumerate(zip(batch['start_id'], batch['end_id'])): content_label[s_idx, start_id: end_id+1] = 1 feed_dict[self.content_label] = content_label start_probs, end_probs, content_scores, verif_scores, loss = self.sess.run([self.start_probs, self.end_probs, self.concat_content_score, self.reshaped_ans_verif_score, self.loss], feed_dict) total_loss += loss * len(batch['raw_data']) total_num += len(batch['raw_data']) padded_p_len = len(batch['passage_token_ids'][0]) for s_idx, sample in enumerate(batch['raw_data']): start_prob = start_probs[s_idx] end_prob = end_probs[s_idx] content_score = content_scores[s_idx] verif_score = verif_scores[s_idx] best_answer = self.find_best_answer_with_verif(sample, start_prob, end_prob, content_score, verif_score, padded_p_len) if save_full_info: sample['pred_answers'] = [best_answer] pred_answers.append(sample) else: pred_answers.append({'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': [best_answer], 'entity_answers': [[]], 'yesno_answers': []}) if 'answers' in sample: ref_answers.append({'question_id': sample['question_id'], 'question_type': sample['question_type'], 'answers': sample['answers'], 'entity_answers': [[]], 'yesno_answers': []}) if result_dir is not None and result_prefix is not None: result_file = os.path.join(result_dir, result_prefix + '.json') with open(result_file, 'w') as fout: for pred_answer in pred_answers: fout.write(json.dumps(pred_answer, encoding='utf8', ensure_ascii=False) + '\n') self.logger.info('Saving {} results to {}'.format(result_prefix, result_file)) # this average loss is invalid on test set, since we don't have true start_id and end_id ave_loss = 1.0 * total_loss / total_num # compute the bleu and rouge scores if reference answers is provided if len(ref_answers) > 0: pred_dict, ref_dict = {}, {} for pred, ref in zip(pred_answers, ref_answers): question_id = ref['question_id'] if len(ref['answers']) > 0: pred_dict[question_id] = normalize(pred['answers']) ref_dict[question_id] = normalize(ref['answers']) bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict) else: bleu_rouge = None return ave_loss, bleu_rouge