def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=100, print_to_screen=False):
    """
    Sample from the provided (train/dev) set.
    For each sample, calculate F1 and EM score.
    Return average F1 and EM score for all samples.
    Optionally pretty-print examples.

    Note: This function is not quite the same as the F1/EM numbers you get from
    "official_eval" mode. This function uses the pre-processed version of the e.g.
    dev set for speed, whereas "official_eval" mode uses the original JSON. Therefore:
      1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
         whereas this function compares to just the first answer (which is what's
         saved in the preprocessed data)
      2. Our preprocessed version of the dev set is missing some examples due to
         tokenization issues (see squad_preprocess.py). "official_eval" includes
         all examples.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
      print_to_screen: if True, pretty-prints each example to screen

    Returns:
      F1 and EM: Scalars. The average across the sampled examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    f1_total = 0.
    em_total = 0.
    example_num = 0

    tic = time.time()

    # Note here we select discard_long=False because we want to sample from the entire dataset.
    # That means we're truncating, rather than discarding, examples with too-long context or questions.
    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):

        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch, dataset)

        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size

        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1

            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)

            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em

            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx],
                              batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    f1_total /= example_num
    em_total /= example_num

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    return f1_total, em_total
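

# The f1_score and exact_match_score helpers used above are imported from this
# repo's evaluation module. As a reference, here is a minimal sketch of how they
# are commonly defined, following the official SQuAD evaluation script; treat the
# exact normalization details as an assumption rather than a guarantee about this
# codebase. The _sketch suffix marks these as illustrative stand-ins.
import re
import string
from collections import Counter


def normalize_answer_sketch(s):
    """Lowercase, strip punctuation and articles, and collapse whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def f1_score_sketch(prediction, ground_truth):
    """Token-level F1 between a predicted answer string and a reference answer string."""
    pred_tokens = normalize_answer_sketch(prediction).split()
    true_tokens = normalize_answer_sketch(ground_truth).split()
    common = Counter(pred_tokens) & Counter(true_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(pred_tokens)
    recall = 1.0 * num_same / len(true_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score_sketch(prediction, ground_truth):
    """True iff the normalized prediction matches the normalized reference exactly."""
    return normalize_answer_sketch(prediction) == normalize_answer_sketch(ground_truth)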
def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=100, print_to_screen=False):
    """
    Sample from the provided (train/dev) set.
    For each sample, calculate F1 and EM score.
    Return average F1 and EM score for all samples.
    Optionally pretty-print examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    f1_total = 0.
    em_total = 0.
    example_num = 0

    tic = time.time()

    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):

        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size

        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1

            # Get the predicted answer
            pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)

            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em

            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx],
                              batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    f1_total /= example_num
    em_total /= example_num

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    return f1_total, em_total
def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=10, print_to_screen=False, write_out=False,
                file_out=None, shuffle=True):
    """
    Sample from the provided (train/dev) set.
    For each sample, calculate F1, EM, edit-distance and goal-match scores.
    Return the average of each score over all samples.
    Optionally pretty-print examples and write predictions/scores to file.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
      print_to_screen: if True, pretty-prints each example to screen
      write_out: if True, write predictions and per-example EM/ED/GM scores to file
      file_out: path for the prediction file (score files are prefixed with em_/ed_/gm_)
      shuffle: whether the batch generator shuffles the data

    Returns:
      F1, EM, ED and rough EM (goal match): Scalars. The averages across the sampled examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    f1_total = 0.
    em_total = 0.
    ed_total = 0.
    rough_em_total = 0.
    example_num = 0

    tic = time.time()
    ans_list = []
    graph_route_info = []
    # Per-example scores, accumulated across batches so that write_out covers
    # every evaluated example (parallel to ans_list).
    f1_scores, em_scores, ed_scores, gm_scores = [], [], [], []

    # Note here we select discard_long=False because we want to sample from the entire dataset.
    # That means we're truncating, rather than discarding, examples with too-long context or questions.
    for batch in get_batch_generator(self.word2id, self.context2id, self.ans2id,
                                     context_path, qn_path, ans_path,
                                     self.FLAGS.batch_size, self.graph_vocab_class,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     answer_len=self.FLAGS.answer_len,
                                     discard_long=False,
                                     use_raw_graph=self.FLAGS.use_raw_graph,
                                     shuffle=shuffle,
                                     show_start_tokens=self.FLAGS.show_start_tokens,
                                     output_goal=True):

        train_ids, pred_ids, dev_final_states, pred_logits = self.get_prob_dists(session, batch)
        start_ids = batch.ans_ids[:, 0].reshape(-1)
        graph_length = np.sum(batch.context_mask, axis=1)

        if self.FLAGS.pred_method != 'beam':
            pred_ids, confidence_score, ans_str = verify_route(
                start_ids, pred_logits, batch.context_tokens, self.ans2id,
                self.id2ans, self.FLAGS.answer_len)

        pred_ids = pred_ids.tolist()  # output of the inference (test-time) network

        for ex_idx, (pred_ans_list, true_ans_tokens) in enumerate(
                zip(pred_ids, list(batch.ans_tokens))):
            example_num += 1

            # Convert predicted ids to tokens, stopping at the first PAD.
            pred_ans_tokens = []
            for id in pred_ans_list:
                if id == PAD_ID:
                    break
                else:
                    pred_ans_tokens.append(self.id2ans[id])
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens[:])

            # Calculate metrics
            f1, em, edit_dist, goal_match = compute_all_metrics(pred_ans_tokens, true_ans_tokens)
            f1_scores.append(f1)
            em_scores.append(em)
            ed_scores.append(edit_dist)
            gm_scores.append(goal_match)
            f1_total += f1
            em_total += em
            ed_total += edit_dist
            rough_em_total += goal_match

            ans_list.append(pred_answer)
            graph_route_info.append((str(int(graph_length[ex_idx])),
                                     str(len(true_ans_tokens[1:-1])),
                                     str(int(em))))

            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, self.context2id, self.ans2id,
                              batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx], true_answer, pred_answer,
                              f1, em, edit_dist, confidence_score[ex_idx])

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    f1_total /= example_num
    em_total /= example_num
    ed_total /= example_num
    rough_em_total /= example_num

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    if write_out:
        logging.info("Writing the prediction to {}".format(file_out))
        with open(file_out, 'w') as f:
            for line, extra_info in zip(ans_list, graph_route_info):
                f.write(line + " " + " ".join(extra_info) + '\n')
        print("Wrote predictions to %s" % file_out)

        em_file = "em_" + str(file_out)
        logging.info("Writing EM scores to {}".format(em_file))
        with open(em_file, 'w') as f:
            for em in em_scores:
                f.write(str(em) + '\n')
        print("Wrote EM Scores to %s" % em_file)

        ed_file = "ed_" + str(file_out)
        logging.info("Writing ED scores to {}".format(ed_file))
        with open(ed_file, 'w') as f:
            for ed in ed_scores:
                f.write(str(ed) + '\n')
        print("Wrote ED Scores to %s" % ed_file)

        gm_file = "gm_" + str(file_out)
        logging.info("Writing GM scores to {}".format(gm_file))
        with open(gm_file, 'w') as f:
            for gm in gm_scores:
                f.write(str(gm) + '\n')
        print("Wrote GM Scores to %s" % gm_file)

    return f1_total, em_total, ed_total, rough_em_total
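

# compute_all_metrics is defined elsewhere in this repo and returns four numbers per
# example: F1, exact match, edit distance, and a "goal match" score (accumulated above
# as rough_em_total). A minimal self-contained sketch is given below, assuming:
# token-level F1, EM as exact sequence equality, edit distance as token-level
# Levenshtein distance, and goal match meaning the predicted route ends at the same
# node as the reference route. The actual definitions in this repo may differ.
from collections import Counter


def compute_all_metrics_sketch(pred_tokens, true_tokens):
    """Hypothetical stand-in for compute_all_metrics: returns (f1, em, edit_dist, goal_match)."""
    # Token-level F1.
    common = Counter(pred_tokens) & Counter(true_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        f1 = 0.0
    else:
        precision = 1.0 * num_same / len(pred_tokens)
        recall = 1.0 * num_same / len(true_tokens)
        f1 = (2 * precision * recall) / (precision + recall)

    # Exact match on the full token sequence.
    em = 1.0 if list(pred_tokens) == list(true_tokens) else 0.0

    # Token-level Levenshtein edit distance via dynamic programming.
    m, n = len(pred_tokens), len(true_tokens)
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dist[i][0] = i
    for j in range(n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred_tokens[i - 1] == true_tokens[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    edit_dist = float(dist[m][n])

    # "Goal match": does the predicted route reach the same final node as the reference?
    goal_match = 1.0 if (pred_tokens and true_tokens and pred_tokens[-1] == true_tokens[-1]) else 0.0

    return f1, em, edit_dist, goal_match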
def demo(self, session, context_path, qn_path, ans_path, dataset,
         num_samples=10, print_to_screen=False, write_out=False,
         file_out=None, shuffle=True):
    """
    Sample from the provided (train/dev) set and run the model in demo mode.
    For each sample, compute the metrics, optionally pretty-print the example,
    and draw the decoder attention map for the predicted answer.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
      print_to_screen: if True, pretty-prints each example to screen
      write_out: if True, write the predictions to file_out
      file_out: path for the prediction file
      shuffle: whether the batch generator shuffles the data

    Returns:
      None.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    example_num = 0
    tic = time.time()
    ans_list = []
    graph_route_info = []

    for batch in get_batch_generator(self.word2id, self.context2id, self.ans2id,
                                     context_path, qn_path, ans_path,
                                     self.FLAGS.batch_size, self.graph_vocab_class,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     answer_len=self.FLAGS.answer_len,
                                     discard_long=False,
                                     use_raw_graph=self.FLAGS.use_raw_graph,
                                     shuffle=shuffle,
                                     show_start_tokens=self.FLAGS.show_start_tokens,
                                     output_goal=True):

        train_ids, pred_ids, dev_final_states, pred_logits = self.get_prob_dists(session, batch)
        start_ids = batch.ans_ids[:, 0].reshape(-1)

        if self.FLAGS.pred_method != 'beam':
            pred_ids, confidence_score, ans_str = output_route(
                start_ids, pred_logits, batch.context_tokens, self.ans2id,
                self.id2ans, self.FLAGS.answer_len)

        pred_ids = pred_ids.tolist()  # output of the inference (test-time) network

        dev_attention_map = create_attention_images_summary(dev_final_states)
        print "dev_attention_map", dev_attention_map.shape
        dev_attention_map = dev_attention_map.eval().tolist()
        # The training network output, where the true token is fed as the input of
        # the next RNN step, is also available for debugging.

        for ex_idx, (pred_ans_list, true_ans_tokens, attention_map) in enumerate(
                zip(pred_ids, list(batch.ans_tokens), dev_attention_map)):
            example_num += 1

            # Convert predicted ids to tokens, stopping at the first PAD.
            pred_ans_tokens = []
            for id in pred_ans_list:
                if id == PAD_ID:
                    break
                else:
                    pred_ans_tokens.append(self.id2ans[id])
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens[:])

            # Calculate metrics
            f1, em, edit_dist, rough_em = compute_all_metrics(pred_ans_tokens, true_ans_tokens)

            ans_list.append(pred_answer)

            if print_to_screen:
                print_example(self.word2id, self.context2id, self.ans2id,
                              batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx], true_answer, pred_answer,
                              f1, em, edit_dist, confidence_score[ex_idx])

            # Draw attention map
            draw_attention(batch, ex_idx, attention_map, pred_ans_tokens)

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    if write_out:
        logging.info("Writing the prediction to {}".format(file_out))
        with open(file_out, 'w') as f:
            for line, extra_info in zip(ans_list, graph_route_info):
                f.write(line + " " + " ".join(extra_info) + '\n')
        print("Wrote predictions to %s" % file_out)

    return
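

# draw_attention is defined elsewhere in this repo. Below is a minimal matplotlib
# sketch of what such a helper might do, assuming attention_map for one example is a
# [num_decoder_steps x context_len] matrix of attention weights. The function name,
# the output path, and the shape assumptions are illustrative only.
import matplotlib
matplotlib.use('Agg')  # render to file; no display needed
import matplotlib.pyplot as plt
import numpy as np


def draw_attention_sketch(batch, ex_idx, attention_map, pred_ans_tokens, out_path='attention.png'):
    """Hypothetical attention heatmap: decoder steps (rows) vs. context tokens (columns)."""
    context_tokens = batch.context_tokens[ex_idx]
    # Trim to the actual predicted answer length and the actual context length.
    attn = np.array(attention_map)[:len(pred_ans_tokens), :len(context_tokens)]

    fig, ax = plt.subplots(figsize=(max(6, len(context_tokens) * 0.3),
                                    max(3, len(pred_ans_tokens) * 0.3)))
    ax.imshow(attn, aspect='auto', cmap='viridis')
    ax.set_xticks(np.arange(attn.shape[1]))
    ax.set_xticklabels(context_tokens[:attn.shape[1]], rotation=90, fontsize=6)
    ax.set_yticks(np.arange(attn.shape[0]))
    ax.set_yticklabels(pred_ans_tokens[:attn.shape[0]], fontsize=6)
    ax.set_xlabel('context tokens')
    ax.set_ylabel('predicted answer tokens')
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)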
def get_error_stats(self, session, context_path, qn_path, ans_path, dataset,
                    num_samples=10, print_to_screen=False):
    """
    Sample from the provided (train/dev) set.
    For each sample, calculate F1 and EM score, and aggregate error statistics by the
    first token of each question (e.g. "what", "who", "when").
    Return average F1 and EM score for all samples.
    Optionally pretty-print examples.

    Note: This function is not quite the same as the F1/EM numbers you get from
    "official_eval" mode. This function uses the pre-processed version of the e.g.
    dev set for speed, whereas "official_eval" mode uses the original JSON. Therefore:
      1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
         whereas this function compares to just the first answer (which is what's
         saved in the preprocessed data)
      2. Our preprocessed version of the dev set is missing some examples due to
         tokenization issues (see squad_preprocess.py). "official_eval" includes
         all examples.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
      print_to_screen: if True, pretty-prints each example to screen

    Returns:
      F1 and EM: Scalars. The average across the sampled examples.
    """
    logging.info("Calculating Error stats for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    f1_total = 0.
    em_total = 0.
    example_num = 0

    tic = time.time()

    # Note here we select discard_long=False because we want to sample from the entire dataset.
    # That means we're truncating, rather than discarding, examples with too-long context or questions.
    first_token_qn_dict_wrong = defaultdict(float)
    first_token_qn_dict_total = defaultdict(float)
    first_token_qn_dict_f1 = defaultdict(float)

    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):

        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size

        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1

            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)

            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)

            first_token_qn = batch.qn_tokens[ex_idx][0]
            first_token_qn_dict_total[first_token_qn] += 1
            #print 'example_num: ', example_num
            #print 'total words seen in first_token_qn_dict: ', sum(first_token_qn_dict_total.itervalues())

            if not em:
                # We have found an error: record the first token of the question.
                first_token_qn_dict_wrong[first_token_qn] += 1

            f1_total += f1
            first_token_qn_dict_f1[first_token_qn] += f1
            em_total += em

            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx],
                              batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    f1_total /= example_num
    em_total /= example_num

    print 'total words: ', sum(first_token_qn_dict_total.itervalues())

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    final_freq_dict = {}
    total_tokens = sum(first_token_qn_dict_total.itervalues())
    for token, count in sorted(first_token_qn_dict_total.iteritems(),
                               key=lambda (k, v): (v, k)):
        # key is the first token of the question, value is how many times that token occurs
        freq = first_token_qn_dict_wrong[token] / first_token_qn_dict_total[token]
        f1 = first_token_qn_dict_f1[token] / first_token_qn_dict_total[token]
        print "When first token is: [", token, "] f1:", f1, \
            "We got:", first_token_qn_dict_wrong[token], \
            "wrong exact match out of", first_token_qn_dict_total[token], \
            "percentage of 1st tokens that are this token:", \
            first_token_qn_dict_total[token] / total_tokens, \
            "percentage of this token WRONG:", freq

    print('em_total:', em_total)
    print('f1_total:', f1_total)

    return f1_total, em_total
def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=100, print_to_screen=False):
    """
    Sample from the provided (train/dev) set.
    For each sample, calculate F1 and EM score.
    Return average F1 and EM score for all samples.
    Optionally pretty-print examples.

    Note: This function is not quite the same as the F1/EM numbers you get from
    "official_eval" mode. This function uses the pre-processed version of the e.g.
    dev set for speed, whereas "official_eval" mode uses the original JSON. Therefore:
      1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
         whereas this function compares to just the first answer (which is what's
         saved in the preprocessed data)
      2. Our preprocessed version of the dev set is missing some examples due to
         tokenization issues (see squad_preprocess.py). "official_eval" includes
         all examples.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
      print_to_screen: if True, pretty-prints each example to screen

    Returns:
      F1 and EM: Scalars. The average across the sampled examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))

    f1_total = 0.
    em_total = 0.
    example_num = 0

    tic = time.time()

    # Note here we select discard_long=False because we want to sample from the entire dataset.
    # That means we're truncating, rather than discarding, examples with too-long context or questions.
    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):

        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size

        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1

            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)

            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)

            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em

            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx],
                              batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)

            if num_samples != 0 and example_num >= num_samples:
                break

        if num_samples != 0 and example_num >= num_samples:
            break

    f1_total /= example_num
    em_total /= example_num

    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" %
                 (example_num, dataset, toc - tic))

    return f1_total, em_total
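

# For reference, a sketch of how these evaluation entry points might be driven from a
# main script once a model has been trained. Everything below is an assumption made
# for illustration (TF 1.x API, a qa_model object exposing the methods above, and a
# FLAGS object with data_dir/train_dir attributes); the repo's real driver code may differ.
import logging
import os

import tensorflow as tf


def run_dev_eval_sketch(qa_model, FLAGS):
    """Hypothetical driver: restore the latest checkpoint and score the whole dev set."""
    dev_context_path = os.path.join(FLAGS.data_dir, "dev.context")
    dev_qn_path = os.path.join(FLAGS.data_dir, "dev.question")
    dev_ans_path = os.path.join(FLAGS.data_dir, "dev.answer")

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        # Assumes the model's variables live in the default graph and a checkpoint
        # exists under FLAGS.train_dir.
        ckpt = tf.train.latest_checkpoint(FLAGS.train_dir)
        tf.train.Saver().restore(sess, ckpt)

        # num_samples=0 evaluates the entire dev set.
        dev_f1, dev_em = qa_model.check_f1_em(sess, dev_context_path, dev_qn_path,
                                              dev_ans_path, "dev", num_samples=0)
        logging.info("Dev F1: %f, Dev EM: %f" % (dev_f1, dev_em))
    return dev_f1, dev_em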