def exact_match_score(prediction, ground_truth, lang="Ja", lower=False):
    if lang == "Ja":
        return prediction.split() == \
            tokenize_preprocess_japanese_sent(ground_truth).split()
    else:
        return prediction.split() == ground_truth.lower().split() or \
            prediction == ground_truth.lower()


def get_japanese_answers_with_attention(self, id2answerindices_dict):
    """Recover each predicted answer span in the source-language context by
    taking, for every target-side answer token, the source token with the
    highest attention weight."""
    ja_ans = {}
    for k, v in id2answerindices_dict.items():
        start, end = v
        title = self.question_id2title[k]["title"]
        para_idx = self.question_id2title[k]["para_idx"]
        attention_weights_list = self.title2attention_dic[title][para_idx]
        ans = []
        ans_idx = []
        paragraph = {}
        paragraph_sent_length = []
        for i, sent_idx in enumerate(self.sent_idx_dic[title][para_idx]):
            paragraph[i] = self.trans_lines[sent_idx]
            paragraph_sent_length.append(
                len(self.trans_lines[sent_idx].split()))
        for i in range(int(start), int(end) + 1):
            # Map the paragraph-level token index i to (sentence m, offset n).
            m, n = 0, 0
            prev_idx = 0
            for j in range(len(paragraph_sent_length)):
                if prev_idx + (paragraph_sent_length[j] - 1) < i:
                    prev_idx += paragraph_sent_length[j]
                else:
                    m = j
                    n = i - prev_idx
                    break
            sentence_idx = self.sent_idx_dic[title][para_idx][m]
            # TODO: Fix code not to use the tokenizer here for multilingual
            # adaptation.
            if self.lang == "Fr":
                source_tokens = self.source_lines[sentence_idx].split()
            elif self.lang == "Ja":
                source_tokens = tokenize_preprocess_japanese_sent(
                    self.source_lines[sentence_idx]).split(" ")
            attention_weight_vector = attention_weights_list[m][n]
            source_idx = np.argmax(attention_weight_vector)
            if source_idx == len(attention_weight_vector) - 1 and \
                    n != len(self.trans_lines[m]) - 1:
                source_idx = np.argsort(attention_weight_vector)[::-1][1]
            ans_token = source_tokens[source_idx].replace("\n", "")
            ans_idx.append(source_idx)
        if len(ans_idx) == 0:
            ja_ans[k] = ""
        else:
            start = min(ans_idx)
            end = max(ans_idx)
            ja_ans[k] = " ".join(source_tokens[start:end + 1])
    return ja_ans
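

# Hedged sketch (illustrative only, not used by the code above): the inner
# loops of get_japanese_answers_with_attention map a paragraph-level token
# index onto a (sentence index, in-sentence offset) pair using the per-sentence
# token counts of the translated paragraph. The helper name is hypothetical.
def _demo_flat_index_to_sentence_offset(paragraph_sent_length, i):
    """Return (m, n) such that token i of the concatenated paragraph is token n
    of sentence m; falls back to (0, 0) as the original loop does."""
    prev_idx = 0
    for j, sent_length in enumerate(paragraph_sent_length):
        if prev_idx + (sent_length - 1) < i:
            prev_idx += sent_length
        else:
            return j, i - prev_idx
    return 0, 0


# Example: _demo_flat_index_to_sentence_offset([3, 4, 2], 5) returns (1, 2).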


def f1_score(prediction, ground_truth, lang="Ja", lower=False):
    prediction_tokens = prediction.split()
    if lang == 'Ja':
        ground_truth_tokens = tokenize_preprocess_japanese_sent(
            ground_truth).split()
    else:
        ground_truth_tokens = ground_truth.lower().split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    if prediction == ground_truth.lower():
        return 1.0
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
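

# Hedged usage sketch for the two metrics above. lang="Fr" is used so that the
# Japanese tokenizer is not needed; the strings are hypothetical.
def _demo_f1_em_metrics():
    prediction = "the answer is 42"
    ground_truth = "The answer is 42"
    # exact_match_score lowercases the gold side, so this prints True.
    print("EM:", exact_match_score(prediction, ground_truth, lang="Fr"))
    # The lowercased strings match exactly, so f1_score returns 1.0.
    print("F1:", f1_score(prediction, ground_truth, lang="Fr"))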


def get_japanese_answers_with_attention_use_soft_alignment(
        self, id2answerindices_dict, alpha=0.3):
    """Recover answer spans with soft alignment: a source token is kept when
    1 - prod_t (1 - attention[t][j]) exceeds ``alpha`` over the target-side
    answer tokens t; falls back to hard (argmax) attention when no source
    token clears the threshold."""
    ja_ans = {}
    for k, v in id2answerindices_dict.items():
        start, end = v
        title = self.question_id2title[k]["title"]
        para_idx = self.question_id2title[k]["para_idx"]
        attention_weights_list = self.title2attention_dic[title][para_idx]
        ans = []
        ans_idx = []
        paragraph = {}
        paragraph_sent_length = []
        paragraph_source_tokens = []
        non_answer_scores = []
        for i, sent_idx in enumerate(self.sent_idx_dic[title][para_idx]):
            paragraph[i] = self.trans_lines[sent_idx]
            paragraph_sent_length.append(
                len(self.trans_lines[sent_idx].split()))
            if self.lang == "Fr":
                source_tokens = self.source_lines[sent_idx].split()
            elif self.lang == "Ja":
                source_tokens = tokenize_preprocess_japanese_sent(
                    self.source_lines[sent_idx]).split(" ")
            # [[1.0, 1.0, ...]], shape: (num_sent_paragraph x num_source_token)
            non_answer_scores.append(
                [1.0 for source_token in source_tokens])
            # [[私は, ...]], shape: (num_sent_paragraph x num_source_token)
            paragraph_source_tokens.append(source_tokens)
        for i in range(int(start), int(end) + 1):
            m, n = 0, 0
            prev_idx = 0
            for j in range(len(paragraph_sent_length)):
                if prev_idx + (paragraph_sent_length[j] - 1) < i:
                    prev_idx += paragraph_sent_length[j]
                else:
                    m = j
                    n = i - prev_idx
                    break
            for j in range(len(paragraph_source_tokens[m])):
                if len(attention_weights_list[m][n]) <= j:
                    print(paragraph_source_tokens)
                    continue
                non_answer_scores[m][j] *= (
                    1 - attention_weights_list[m][n][j])
        answer_scores = []
        for m in range(len(non_answer_scores)):
            answer_scores.append([
                1.0 - non_answer_score
                for non_answer_score in non_answer_scores[m]
            ])
        ans_indices = {}
        ans_tokens = []
        for m in range(len(non_answer_scores)):
            for j in range(len(non_answer_scores[m])):
                if answer_scores[m][j] > alpha:
                    ans_indices.setdefault(m, [])
                    ans_indices[m].append(j)
        for sent_index, token_indices in ans_indices.items():
            m_start, m_end = min(token_indices), max(token_indices)
            ans_tokens.extend(
                paragraph_source_tokens[sent_index][m_start:m_end + 1])
        print(ans_tokens)
        if len(ans_tokens) == 0:
            # No source token cleared the threshold; fall back to the hard
            # (argmax) attention alignment.
            for i, sent_idx in enumerate(
                    self.sent_idx_dic[title][para_idx]):
                paragraph[i] = self.trans_lines[sent_idx]
                paragraph_sent_length.append(
                    len(self.trans_lines[sent_idx].split()))
            for i in range(int(start), int(end) + 1):
                m, n = 0, 0
                prev_idx = 0
                for j in range(len(paragraph_sent_length)):
                    if prev_idx + (paragraph_sent_length[j] - 1) < i:
                        prev_idx += paragraph_sent_length[j]
                    else:
                        m = j
                        n = i - prev_idx
                        break
                sentence_idx = self.sent_idx_dic[title][para_idx][m]
                # TODO: Fix code not to use the tokenizer here for
                # multilingual adaptation.
                if self.lang == "Fr":
                    source_tokens = self.source_lines[sentence_idx].split()
                elif self.lang == "Ja":
                    source_tokens = tokenize_preprocess_japanese_sent(
                        self.source_lines[sentence_idx]).split(" ")
                attention_weight_vector = attention_weights_list[m][n]
                source_idx = np.argmax(attention_weight_vector)
                if source_idx == len(attention_weight_vector) - 1 and \
                        n != len(self.trans_lines[m]) - 1:
                    source_idx = np.argsort(
                        attention_weight_vector)[::-1][1]
                ans_token = source_tokens[source_idx].replace("\n", "")
                ans_idx.append(source_idx)
            start = min(ans_idx)
            end = max(ans_idx)
            ans_tokens = source_tokens[start:end + 1]
        ja_ans[k] = " ".join(ans_tokens)
    return ja_ans
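

# Hedged sketch (illustrative only) of the soft-alignment rule used above:
# source token j is selected when 1 - prod_t (1 - attention[t][j]) > alpha,
# where t ranges over the target-side tokens of the predicted answer span.
# The helper name is hypothetical.
def _demo_soft_alignment_scores(attention_rows, alpha=0.3):
    """``attention_rows`` holds one attention vector per target answer token,
    all over the same source tokens; returns the selected source indices."""
    num_source_tokens = len(attention_rows[0])
    non_answer_scores = [1.0] * num_source_tokens
    for row in attention_rows:
        for j in range(num_source_tokens):
            non_answer_scores[j] *= (1.0 - row[j])
    return [j for j in range(num_source_tokens)
            if 1.0 - non_answer_scores[j] > alpha]


# Example: _demo_soft_alignment_scores([[0.1, 0.8, 0.1], [0.2, 0.7, 0.1]])
# returns [1].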


def evaluate_mlqa_bing_translate(model, instances, data_iterator, cuda_device,
                                 lang, version=4, embedding_name="bing",
                                 back_trans=True):
    """Evaluate MLQA predictions, either by retrieving answers through the
    Bing alignment information (``back_trans=False``) or by translating the
    predicted English answers back into the target language with Bing."""
    answer_retreival = SQuADMLAnswerRetreivalBing(lang, version,
                                                  embedding_name)
    model.eval()
    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(
        iterator, total=data_iterator.get_num_batches(instances))
    question_idx = []
    ground_truth_answers = {}
    predicted_ans = []
    predicted_ans_str = []
    ja_anss = {}
    en_anss = {}
    id2answer_dict = {}
    if back_trans == False:
        for batch in generator_tqdm:
            result = model(**batch)
            anss = result['best_span']
            ans_strs = result['best_span_str']
            num = len(batch["metadata"])
            for i in range(num):
                question_idx.append(batch["metadata"][i]["question_id"])
                ground_truth_answers[int(question_idx[-1])] = \
                    batch["metadata"][i]["answer_texts"]
            for ans in ans_strs:
                predicted_ans_str.append(ans)
            for ans in anss:
                predicted_ans.append(ans)
        id2answerindices_dict = \
            answer_retreival.get_id2answerindices_dict(
                predicted_ans, question_idx)
        id2answer_dict = \
            answer_retreival.get_id2answerindices_dict(
                predicted_ans_str, question_idx)
        ja_anss.update(
            answer_retreival.get_japanese_answers_with_attention(
                id2answerindices_dict, id2answer_dict))
    else:
        for batch in generator_tqdm:
            result = model(**batch)
            anss = result['best_span']
            ans_strs = result['best_span_str']
            num = len(batch["metadata"])
            for i in range(num):
                question_idx.append(batch["metadata"][i]["question_id"])
                ground_truth_answers[int(
                    question_idx[-1])] = batch["metadata"][i]["answer_texts"]
            for ans in ans_strs:
                predicted_ans_str.append(ans)
            id2answer_batch = {
                int(id_): span
                for id_, span in zip(question_idx, predicted_ans_str)
            }
            id2answer_dict.update(id2answer_batch)
        if lang == "Ja":
            ja_anss = {
                k: bing_translate(v, "en", "ja")
                for k, v in id2answer_dict.items()
            }
            ja_anss = {
                k: tokenize_preprocess_japanese_sent(v)
                for k, v in ja_anss.items()
            }
            print(ja_anss)
        elif lang == "Fr":
            ja_anss = {
                k: bing_translate(v, "en", "fr")
                for k, v in id2answer_dict.items()
            }
            ja_anss = {
                k: tokenize_preprocess_japanese_sent(v)
                for k, v in ja_anss.items()
            }
        for k, v in ja_anss.items():
            print("{0}:<JA>{1}, <EN>{2}".format(k, ja_anss[k],
                                                id2answer_dict[k]))
    save_path = 'japanese_ans_predicted.json'
    f = open(save_path, "w")
    json.dump(ja_anss, f)
    f.close()
    save_path_answer_in_trans = 'predicted_ans_english.json'
    f = open(save_path_answer_in_trans, "w")
    json.dump(id2answer_dict, f)
    f.close()
    eval_dict = evaluate(ground_truth_answers, ja_anss, lang)
    print(eval_dict)
    return {"F1": eval_dict['f1'], "EM": eval_dict['exact_match']}


def evaluate_mlqa_google_translate(model, instances, data_iterator,
                                   cuda_device, lang="Ja", version=3):
    """Evaluate MLQA predictions by translating the predicted English answers
    back into the target language with Google Translate."""
    model.eval()
    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(
        iterator, total=data_iterator.get_num_batches(instances))
    question_idx = []
    ground_truth_answers = {}
    predicted_ans = []
    predicted_ans_str = []
    ja_anss = {}
    en_anss = {}
    id2answer_dict = {}
    for batch in generator_tqdm:
        result = model(**batch)
        anss = result['best_span']
        ans_strs = result['best_span_str']
        num = len(batch["metadata"])
        for i in range(num):
            question_idx.append(batch["metadata"][i]["question_id"])
            ground_truth_answers[int(
                question_idx[-1])] = batch["metadata"][i]["answer_texts"]
        for ans in ans_strs:
            predicted_ans_str.append(ans)
        id2answer_batch = {
            int(id_): span
            for id_, span in zip(question_idx, predicted_ans_str)
        }
        id2answer_dict.update(id2answer_batch)
    if lang == "Ja":
        ja_anss = {
            k: google_translate(v, toJa=True)
            for k, v in id2answer_dict.items()
        }
        ja_anss = {
            k: tokenize_preprocess_japanese_sent(v)
            for k, v in ja_anss.items()
        }
    elif lang == "Fr":
        ja_anss = {
            k: google_translate_to_fr(v, True)
            for k, v in id2answer_dict.items()
        }
    save_path = 'japanese_ans_predicted.json'
    f = open(save_path, "w")
    json.dump(ja_anss, f)
    f.close()
    eval_dict = evaluate(ground_truth_answers, ja_anss)
    print(eval_dict)
    return {"F1": eval_dict['f1'], "EM": eval_dict['exact_match']}
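

# Hedged usage sketch for the two evaluation entry points above; the model,
# dev instances, and data iterator are assumed to come from the surrounding
# AllenNLP training setup, so this is left as a comment rather than runnable
# code.
#
#     metrics = evaluate_mlqa_google_translate(
#         model, dev_instances, iterator, cuda_device=0, lang="Ja")
#     print(metrics["F1"], metrics["EM"])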


def _read(self, question_file_path):
    """Read the MLQA questions and contexts, translate them into English
    (online via Google/Bing/the NMT model, or from previously saved
    translations), and yield one instance per question."""
    sent_idx_dic = create_sent_idx_dic(self.source_context_file_path)
    if self.online_trans:
        # 1. Create tmp japanese question files.
        ja_q_tmp = open(self.japanese_question_file_path, "w")
        with open(self.question_file_path, newline='') as f:
            dataReader = csv.reader(f)
            header = next(dataReader)
            for row in dataReader:
                if self.lang == "Ja":
                    question = tokenize_preprocess_japanese_sent(row[3])
                    if self.use_question_tag == True:
                        ja_q_tmp.write(question + " <QS>\n")
                    else:
                        ja_q_tmp.write(question + "\n")
                elif self.lang == "Fr":
                    ja_q_tmp.write(row[3].lower() + "\n")
        ja_q_tmp.close()
        if self.lang == "Fr":
            normalize_tokenized_sent(self.japanese_question_file_path,
                                     self.lang)
        # 2. Create tmp japanese context files.
        ja_c_tmp = open(self.japanese_context_file_path, "w")
        with open(self.source_context_file_path, newline='') as f:
            dataReader = csv.reader(f)
            header = next(dataReader)
            for row in dataReader:
                if self.lang == "Ja":
                    if self.use_bing_translate == True:
                        context_sent = row[3]
                    else:
                        context_sent = \
                            tokenize_preprocess_japanese_sent(row[3])
                    ja_c_tmp.write(context_sent + "\n")
                # For French and German, the tokenization and normalization
                # are executed later.
                elif self.lang == "Fr":
                    ja_c_tmp.write(row[3].lower() + "\n")
        ja_c_tmp.close()
        if self.lang == "Fr":
            normalize_tokenized_sent(self.japanese_context_file_path,
                                     self.lang)
        # 3. Get the translated results and attention scores.
        # Question Trans
        if self.use_google_translate:
            questions_sources = open(self.japanese_question_file_path,
                                     "r").readlines()
            translated_questions = [
                google_translate(sentence, False, self.lang)
                for sentence in questions_sources
            ]
            translated_questions = [
                tokenize_preprocess_english_sent(sentence)
                for sentence in translated_questions
            ]
            translated_questions = [
                q.replace("'", "'").replace("\u200b\u200b", "")
                for q in translated_questions
            ]
            google_trans_path = os.path.join("google_trans", self.lang)
            if not os.path.exists(google_trans_path):
                os.makedirs(google_trans_path)
            self.question_trans_file_path = os.path.join(
                google_trans_path, "TRANS.question.txt.new")
        elif self.use_bing_translate:
            questions_sources = open(self.japanese_question_file_path,
                                     "r").readlines()
            translated_questions = [
                bing_translate(sentence, self.lang, 'en')
                for sentence in questions_sources
            ]
            translated_questions = [
                tokenize_preprocess_english_sent(sentence)
                for sentence in translated_questions
            ]
            translated_questions = [
                q.replace("'", "'").replace("\u200b\u200b", "")
                for q in translated_questions
            ]
            bing_trans_path = os.path.join('trans_result', self.lang.lower(),
                                           'v' + str(self.version), 'bing')
            if not os.path.exists(bing_trans_path):
                os.makedirs(bing_trans_path)
            self.question_trans_file_path = os.path.join(
                bing_trans_path, "TRANS.question.txt.new")
        else:
            if self.beam_search == True:
                if self.soft == True:
                    translated_questions, attention_scores_questions = \
                        trans_from_files_beam(
                            self.japanese_question_file_path,
                            self.japanese_question_file_path,
                            self.trans_train_source,
                            self.trans_train_target,
                            self.trans_embedding_model,
                            self.trans_encdec_model, seed, 5, True)
                    trans_dir = os.path.split(
                        self.question_trans_file_path)[0]
                    trans_beam_dir = os.path.join(trans_dir, 'beam')
                    if not os.path.exists(trans_beam_dir):
                        os.makedirs(trans_beam_dir)
                    self.question_trans_file_path = os.path.join(
                        trans_beam_dir, "TRANS.question.txt.new")
                else:
                    trans_from_files_beam(
                        self.japanese_question_file_path,
                        self.japanese_question_file_path,
                        self.trans_train_source,
                        self.trans_train_target,
                        self.trans_embedding_model,
                        self.trans_encdec_model, seed, 5, False)
                    translated_questions = open("trans.txt",
                                                'r').read().splitlines()
                    # Save translated questions.
                    trans_dir = os.path.split(
                        self.question_trans_file_path)[0]
                    trans_beam_dir = os.path.join(trans_dir, 'beam')
                    if not os.path.exists(trans_beam_dir):
                        os.makedirs(trans_beam_dir)
                    self.question_trans_file_path = os.path.join(
                        trans_beam_dir, "TRANS.question.txt.new")
            else:
                translated_questions, attention_scores_questions = \
                    trans_from_files(self.japanese_question_file_path,
                                     self.japanese_question_file_path,
                                     self.trans_train_source,
                                     self.trans_train_target,
                                     self.trans_embedding_model,
                                     self.trans_encdec_model, seed,
                                     trans_mode=True,
                                     save_attention_weights=True,
                                     replace_UNK=self.replace_UNK)
        trans_q_lines = translated_questions
        question_trans = open(self.question_trans_file_path, "w")
        for question in trans_q_lines:
            question_trans.write(question + "\n")
        question_trans.close()
        # Context Trans
        if self.use_google_translate:
            context_sources = open(self.japanese_context_file_path,
                                   "r").readlines()
            translated_context = [
                google_translate(sentence, False, self.lang)
                for sentence in context_sources
            ]
            translated_context = [
                tokenize_preprocess_english_sent(sentence)
                for sentence in translated_context
            ]
            translated_context = [
                c.replace("'", "'").replace("\u200b\u200b", "")
                for c in translated_context
            ]
            google_trans_path = os.path.join("google_trans", self.lang)
            if not os.path.exists(google_trans_path):
                os.makedirs(google_trans_path)
            self.context_trans_file_path = os.path.join(
                google_trans_path, "TRANS.txt.new")
            trans_c_lines = translated_context
        elif self.use_bing_translate:
            context_sources = open(self.japanese_context_file_path,
                                   "r").readlines()
            bing_translate_result = [
                bing_translate(sentence, self.lang, 'en', True)
                for sentence in context_sources
            ]
            translated_context = [
                result[0] for result in bing_translate_result
            ]
            alignment_info = [
                result[1] for result in bing_translate_result
            ]
            translated_context = [
                sentence.lower().replace("\n", "")
                for sentence in translated_context
            ]
            bing_trans_path = os.path.join('trans_result', self.lang.lower(),
                                           'v' + str(self.version), 'bing')
            if not os.path.exists(bing_trans_path):
                os.makedirs(bing_trans_path)
            self.context_trans_file_path = os.path.join(
                bing_trans_path, "TRANS.txt.new")
            self.context_attention_file_path = os.path.join(
                bing_trans_path, "ATTN.txt.new")
            trans_c_lines = translated_context
            trans_a_lines = alignment_info
        else:
            if self.beam_search == True:
                if self.soft == True:
                    translated_context, attention_scores_context = \
                        trans_from_files_beam(
                            self.japanese_context_file_path,
                            self.japanese_context_file_path,
                            self.trans_train_source,
                            self.trans_train_target,
                            self.trans_embedding_model,
                            self.trans_encdec_model, seed, 5, True)
                    trans_c_lines = translated_context
                    trans_a_lines = attention_scores_context
                    # Reset the saved dir name.
                    trans_dir = os.path.split(
                        self.question_trans_file_path)[0]
                    self.context_trans_file_path = os.path.join(
                        trans_dir, "TRANS.txt.new")
                    self.context_attention_file_path = os.path.join(
                        trans_dir, "ATTN.txt.new")
                else:
                    trans_from_files_beam(
                        self.japanese_context_file_path,
                        self.japanese_context_file_path,
                        self.trans_train_source,
                        self.trans_train_target,
                        self.trans_embedding_model,
                        self.trans_encdec_model, seed, 5, False)
                    translated_context = open("trans.txt",
                                              'r').read().splitlines()
                    attention_scores_context = open(
                        "attn.txt", 'r').read().splitlines()
                    # Reset the saved dir name.
                    trans_dir = os.path.split(
                        self.question_trans_file_path)[0]
                    self.context_trans_file_path = os.path.join(
                        trans_dir, "TRANS.txt.new")
                    self.context_attention_file_path = os.path.join(
                        trans_dir, "ATTN.txt.new")
            else:
                translated_context, attention_scores_context = \
                    trans_from_files(self.japanese_context_file_path,
                                     self.japanese_context_file_path,
                                     self.trans_train_source,
                                     self.trans_train_target,
                                     self.trans_embedding_model,
                                     self.trans_encdec_model, seed,
                                     trans_mode=True,
                                     save_attention_weights=True,
                                     replace_UNK=self.replace_UNK)
                trans_c_lines = translated_context
                trans_a_lines = attention_scores_context
        if self.use_google_translate == False and \
                self.use_bing_translate == False:
            trans_c_lines = translated_context
            trans_a_lines = attention_scores_context
            context_attention = open(self.context_attention_file_path, "w")
            context_trans = open(self.context_trans_file_path, "w")
            if self.beam_search == True and self.soft == False:
                # Save context.
                for trans_context in trans_c_lines:
                    context_trans.write(trans_context + "\n")
                context_trans.close()
                # Save attention.
                for trans_attention_index in trans_a_lines:
                    context_attention.write(trans_attention_index + "\n")
                context_attention.close()
            else:
                for trans_context, attention_score in zip(
                        trans_c_lines, attention_scores_context):
                    context_trans.write(trans_context + "\n")
                    for i in range(len(attention_score)):
                        attention_weight = [
                            str(float(weight))
                            for weight in attention_score[i]
                        ]
                        context_attention.write(
                            " ".join(attention_weight) + "\n")
                    context_attention.write("\n")
                context_trans.close()
                context_attention.close()
        else:
            context_trans = open(self.context_trans_file_path, "w")
            for trans_context in trans_c_lines:
                context_trans.write(trans_context + "\n")
            if self.use_bing_translate == True:
                context_attention = open(self.context_attention_file_path,
                                         "w")
                for trans_attention in trans_a_lines:
                    context_attention.write(trans_attention + "\n")
                context_attention.close()
            context_trans.close()
    else:
        # This is for quick evaluation.
        # The translated contexts and questions are saved under the directory
        # `trans_result`, and when the `--online_trans` option is set to
        # False, the system loads the contexts and questions that have been
        # translated beforehand.
        if self.use_google_translate == True:
            google_trans_path = os.path.join("google_trans", self.lang)
            if not os.path.exists(google_trans_path):
                os.makedirs(google_trans_path)
            self.context_trans_file_path = os.path.join(
                google_trans_path, "TRANS.txt.new")
            self.question_trans_file_path = os.path.join(
                google_trans_path, "TRANS.question.txt.new")
        elif self.use_bing_translate == True:
            bing_trans_path = os.path.join('trans_result', self.lang.lower(),
                                           'v' + str(self.version), 'bing')
            if not os.path.exists(bing_trans_path):
                os.makedirs(bing_trans_path)
            self.context_trans_file_path = os.path.join(
                bing_trans_path, "TRANS.txt.new")
            self.question_trans_file_path = os.path.join(
                bing_trans_path, "TRANS.question.txt.new")
        if self.beam_search == True:
            trans_dir = os.path.split(self.question_trans_file_path)[0]
            trans_dir = os.path.join(trans_dir, "beam")
            self.question_trans_file_path = os.path.join(
                trans_dir, "TRANS.question.txt.new")
            self.context_trans_file_path = os.path.join(
                trans_dir, "TRANS.txt.new")
        trans_context_f = open(self.context_trans_file_path)
        trans_context = trans_context_f.read()
        trans_context_f.close()
        trans_c_lines = trans_context.split('\n')
        trans_question_f = open(self.question_trans_file_path)
        trans_question = trans_question_f.read()
        trans_question_f.close()
        trans_q_lines = trans_question.split('\n')
    with open(self.question_file_path, newline='') as f:
        dataReader = csv.reader(f)
        header = next(dataReader)
        for row in dataReader:
            question_id, title, paragraph_id, question = \
                int(row[0]), row[1], int(row[2]), row[3]
            if self.lang == "Ja":
                answer_texts = [
                    tokenize_preprocess_japanese_sent(row[4]),
                    tokenize_preprocess_japanese_sent(row[5]),
                    tokenize_preprocess_japanese_sent(row[6])
                ]
            elif self.lang == "Fr" or self.lang == "De":
                answer_texts = normalize_tokenized_answers(
                    row[4], row[5], row[6], self.lang)
            sent_indices = sent_idx_dic[title][paragraph_id]
            paragraph_tokens = []
            for sent_idx in sent_indices:
                paragraph_tokens.extend(trans_c_lines[sent_idx].split())
            paragraph = " ".join(paragraph_tokens)
            tokenized_paragraph = self._tokenizer.tokenize(paragraph)
            question_text = trans_q_lines[question_id]
            instance = self.text_to_instance(question_text, paragraph,
                                             answer_texts,
                                             tokenized_paragraph, question_id)
            yield instance
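

# Hedged sketch (illustrative only) of how _read assembles a translated
# paragraph: sent_idx_dic maps title -> paragraph_id -> line indices into the
# translated context file, and the paragraph is the concatenation of those
# translated sentence lines. The helper name is hypothetical.
def _demo_reassemble_paragraph(sent_idx_dic, trans_c_lines, title,
                               paragraph_id):
    paragraph_tokens = []
    for sent_idx in sent_idx_dic[title][paragraph_id]:
        paragraph_tokens.extend(trans_c_lines[sent_idx].split())
    return " ".join(paragraph_tokens)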