def exact_match_score(prediction, ground_truth, lang="Ja", lower=False):
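    """Return True when the prediction exactly matches the ground truth.

    For Japanese, the ground truth is tokenized with
    tokenize_preprocess_japanese_sent before comparison; otherwise the
    ground truth is lowercased and compared token-wise or as a whole string.
    """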
    if lang == "Ja":
        return prediction.split() == tokenize_preprocess_japanese_sent(
            ground_truth).split()
    else:
        return prediction.split() == ground_truth.lower().split(
        ) or prediction == ground_truth.lower()
    def get_japanese_answers_with_attention(self, id2answerindices_dict):
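        """Project predicted answer spans in the translated (English) context
        back onto source-language tokens via the hard argmax of the stored NMT
        attention weights, returning a dict of question id to answer string.
        """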
        ja_ans = {}
        for k, v in id2answerindices_dict.items():
            start, end = v
            title = self.question_id2title[k]["title"]
            para_idx = self.question_id2title[k]["para_idx"]
            attention_weights_list = self.title2attention_dic[title][para_idx]

            ans = []
            ans_idx = []

            paragraph = {}
            paragraph_sent_length = []

            for i, sent_idx in enumerate(self.sent_idx_dic[title][para_idx]):
                paragraph[i] = self.trans_lines[sent_idx]
                paragraph_sent_length.append(
                    len(self.trans_lines[sent_idx].split()))
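            # Map each paragraph-level token index i of the predicted span to
            # (m, n): the m-th sentence of the paragraph and the n-th token
            # inside that sentence.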
            for i in range(int(start), int(end) + 1):
                m, n = 0, 0
                prev_idx = 0
                for j in range(len(paragraph_sent_length)):
                    if prev_idx + (paragraph_sent_length[j] - 1) < i:
                        prev_idx += paragraph_sent_length[j]
                    else:
                        m = j
                        n = i - prev_idx
                        break
                sentence_idx = self.sent_idx_dic[title][para_idx][m]
                # TODO: Fix code not to use tokenizer here for multilingual
                # adaptation.
                if self.lang == "Fr":
                    source_tokens = self.source_lines[sentence_idx].split()
                elif self.lang == "Ja":
                    source_tokens = tokenize_preprocess_japanese_sent(
                        self.source_lines[sentence_idx]).split(" ")
                attention_weight_vector = attention_weights_list[m][n]
                source_idx = np.argmax(attention_weight_vector)
                # If the most-attended source position is the final one (often
                # EOS) and the target token is not the last token of its
                # sentence, back off to the second-most-attended position.
                if source_idx == len(attention_weight_vector) - 1 and \
                        n != paragraph_sent_length[m] - 1:
                    source_idx = np.argsort(attention_weight_vector)[::-1][1]

                ans_token = source_tokens[source_idx].replace("\n", "")
                ans_idx.append(source_idx)

            if len(ans_idx) == 0:
                ja_ans[k] = ""
            else:
                start = min(ans_idx)
                end = max(ans_idx)
                ja_ans[k] = " ".join(source_tokens[start:end + 1])

        return ja_ans
def f1_score(prediction, ground_truth, lang="Ja", lower=False):
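    """Compute the token-level F1 between a prediction and a ground truth.

    Both strings are whitespace-split; Japanese ground truths are first run
    through tokenize_preprocess_japanese_sent.
    """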
    prediction_tokens = prediction.split()
    if lang == 'Ja':
        ground_truth_tokens = tokenize_preprocess_japanese_sent(
            ground_truth).split()
    else:
        ground_truth_tokens = ground_truth.lower().split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    if prediction == ground_truth.lower():
        return 1.0
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1
    def get_japanese_answers_with_attention_use_soft_alignment(
            self, id2answerindices_dict, alpha=0.3):
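        """Project predicted answer spans back onto source-language tokens with
        soft alignment: a source token is selected when its accumulated answer
        score 1 - prod(1 - attention) exceeds alpha; if nothing passes the
        threshold, fall back to the hard argmax heuristic.
        """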
        ja_ans = {}

        for k, v in id2answerindices_dict.items():
            start, end = v
            title = self.question_id2title[k]["title"]
            para_idx = self.question_id2title[k]["para_idx"]
            attention_weights_list = self.title2attention_dic[title][para_idx]

            ans = []
            ans_idx = []

            paragraph = {}
            paragraph_sent_length = []
            paragraph_source_tokens = []

            non_answer_scores = []
            for i, sent_idx in enumerate(self.sent_idx_dic[title][para_idx]):
                paragraph[i] = self.trans_lines[sent_idx]
                paragraph_sent_length.append(
                    len(self.trans_lines[sent_idx].split()))

                if self.lang == "Fr":
                    source_tokens = self.source_lines[sent_idx].split()
                elif self.lang == "Ja":
                    source_tokens = tokenize_preprocess_japanese_sent(
                        self.source_lines[sent_idx]).split(" ")

                non_answer_scores.append(
                    [1.0 for source_token in source_tokens])
                # [[1.0,1.0, ....]], shape : (num_sent_paragraph x num_source_token)
                paragraph_source_tokens.append(source_tokens)
                # [[私は,, ....]], shape : (num_sent_paragraph x num_source_token)

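            # For every source token, accumulate the probability that it is NOT
            # part of the answer as the product of (1 - attention weight) over
            # all predicted answer positions; the answer score used below is
            # 1 minus that product.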
            for i in range(int(start), int(end) + 1):
                m, n = 0, 0
                prev_idx = 0
                for j in range(len(paragraph_sent_length)):
                    if prev_idx + (paragraph_sent_length[j] - 1) < i:
                        prev_idx += paragraph_sent_length[j]
                    else:
                        m = j
                        n = i - prev_idx
                        break

                for j in range(len(paragraph_source_tokens[m])):
                    if len(attention_weights_list[m][n]) <= j:
                        print(paragraph_source_tokens)
                        continue
                    non_answer_scores[m][j] *= (
                        1 - attention_weights_list[m][n][j])

            answer_scores = []
            for m in range(len(non_answer_scores)):
                answer_scores.append([
                    1.0 - non_answer_score
                    for non_answer_score in non_answer_scores[m]
                ])

            ans_indices = {}
            ans_tokens = []

            for m in range(len(non_answer_scores)):
                for j in range(len(non_answer_scores[m])):
                    if answer_scores[m][j] > alpha:
                        ans_indices.setdefault(m, [])
                        ans_indices[m].append(j)

            for sent_index, token_indices in ans_indices.items():
                m_start, m_end = min(token_indices), max(token_indices)
                ans_tokens.extend(
                    paragraph_source_tokens[sent_index][m_start:m_end + 1])
            print(ans_tokens)

            if len(ans_tokens) == 0:
                # Fall back to the hard argmax heuristic. paragraph and
                # paragraph_sent_length were already built above, so they are
                # not recomputed here.
                for i in range(int(start), int(end) + 1):
                    m, n = 0, 0
                    prev_idx = 0
                    for j in range(len(paragraph_sent_length)):
                        if prev_idx + (paragraph_sent_length[j] - 1) < i:
                            prev_idx += paragraph_sent_length[j]
                        else:
                            m = j
                            n = i - prev_idx
                            break
                    sentence_idx = self.sent_idx_dic[title][para_idx][m]
                    # TODO: Fix code not to use tokenizer here for multilingual
                    # adaptation.
                    if self.lang == "Fr":
                        source_tokens = self.source_lines[sentence_idx].split()
                    elif self.lang == "Ja":
                        source_tokens = tokenize_preprocess_japanese_sent(
                            self.source_lines[sentence_idx]).split(" ")

                    attention_weight_vector = attention_weights_list[m][n]
                    source_idx = np.argmax(attention_weight_vector)
                    if source_idx == len(attention_weight_vector) - 1 and \
                            n != paragraph_sent_length[m] - 1:
                        source_idx = np.argsort(
                            attention_weight_vector)[::-1][1]

                    ans_token = source_tokens[source_idx].replace("\n", "")
                    ans_idx.append(source_idx)
                    start = min(ans_idx)
                    end = max(ans_idx)
                    ans_tokens = source_tokens[start:end + 1]

            ja_ans[k] = " ".join(ans_tokens)

        return ja_ans
def evaluate_mlqa_bing_translate(model,
                                 instances,
                                 data_iterator,
                                 cuda_device,
                                 lang,
                                 version=4,
                                 embedding_name="bing",
                                 back_trans=True):
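    """Evaluate the model on translated data and report F1/EM, assuming the
    Bing translation pipeline.

    When back_trans is True, the predicted English answer strings are
    translated back into the target language with bing_translate; otherwise
    the answers are projected back through the stored attention/alignment
    information of SQuADMLAnswerRetreivalBing.
    """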
    answer_retreival = SQuADMLAnswerRetreivalBing(lang, version,
                                                  embedding_name)

    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))

    question_idx = []
    ground_truth_answers = {}
    predicted_ans = []
    predicted_ans_str = []
    ja_anss = {}
    en_anss = {}

    id2answer_dict = {}

    if back_trans == False:
        for batch in generator_tqdm:
            result = model(**batch)
            anss = result['best_span']
            ans_strs = result['best_span_str']
            num = len(batch["metadata"])
            for i in range(num):
                question_idx.append(batch["metadata"][i]["question_id"])
                ground_truth_answers[int(question_idx[-1])] = \
                    batch["metadata"][i]["answer_texts"]
            for ans in ans_strs:
                predicted_ans_str.append(ans)
            for ans in anss:
                predicted_ans.append(ans)

            id2answerindices_dict = \
                answer_retreival.get_id2answerindices_dict(
                    predicted_ans, question_idx)
            id2answer_dict = \
                answer_retreival.get_id2answerindices_dict(
                    predicted_ans_str, question_idx)

            ja_anss.update(
                answer_retreival.get_japanese_answers_with_attention(
                    id2answerindices_dict, id2answer_dict))
    else:
        for batch in generator_tqdm:
            result = model(**batch)
            anss = result['best_span']
            ans_strs = result['best_span_str']
            num = len(batch["metadata"])
            for i in range(num):
                question_idx.append(batch["metadata"][i]["question_id"])
                ground_truth_answers[int(
                    question_idx[-1])] = batch["metadata"][i]["answer_texts"]
            for ans in ans_strs:
                predicted_ans_str.append(ans)

            id2answer_batch = {
                int(id_): span
                for id_, span in zip(question_idx, predicted_ans_str)
            }

            id2answer_dict.update(id2answer_batch)

        if lang == "Ja":
            ja_anss = {
                k: bing_translate(v, "en", "ja")
                for k, v in id2answer_dict.items()
            }
            ja_anss = {
                k: tokenize_preprocess_japanese_sent(v)
                for k, v in ja_anss.items()
            }
            print(ja_anss)
        elif lang == "Fr":
            ja_anss = {
                k: bing_translate(v, "en", "fr")
                for k, v in id2answer_dict.items()
            }

    if lang == "Ja":
        ja_anss = {
            k: tokenize_preprocess_japanese_sent(v)
            for k, v in ja_anss.items()
        }
    for k, v in ja_anss.items():
        print("{0}:<JA>{1}, <EN>{2}".format(k, ja_anss[k], id2answer_dict[k]))

    save_path = 'japanese_ans_predicted.json'
    f = open(save_path, "w")
    json.dump(ja_anss, f)
    f.close()

    save_path_answer_in_trans = 'predicted_ans_english.json'

    f = open(save_path_answer_in_trans, "w")
    json.dump(id2answer_dict, f)
    f.close()

    eval_dict = evaluate(ground_truth_answers, ja_anss, lang)
    print(eval_dict)

    return {"F1": eval_dict['f1'], "EM": eval_dict['exact_match']}
def evaluate_mlqa_google_translate(model,
                                   instances,
                                   data_iterator,
                                   cuda_device,
                                   lang="Ja",
                                   version=3):
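    """Evaluate the model by translating its predicted English answer strings
    back into the target language with Google Translate and scoring them
    against the gold answers.
    """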
    model.eval()
    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))

    question_idx = []
    ground_truth_answers = {}
    predicted_ans = []
    predicted_ans_str = []
    ja_anss = {}
    en_anss = {}

    id2answer_dict = {}

    for batch in generator_tqdm:
        result = model(**batch)
        anss = result['best_span']
        ans_strs = result['best_span_str']
        num = len(batch["metadata"])
        for i in range(num):
            question_idx.append(batch["metadata"][i]["question_id"])
            ground_truth_answers[int(
                question_idx[-1])] = batch["metadata"][i]["answer_texts"]
        for ans in ans_strs:
            predicted_ans_str.append(ans)

        id2answer_batch = {
            int(id_): span
            for id_, span in zip(question_idx, predicted_ans_str)
        }

        id2answer_dict.update(id2answer_batch)

    if lang == "Ja":
        ja_anss = {
            k: google_translate(v, toJa=True)
            for k, v in id2answer_dict.items()
        }
        ja_anss = {
            k: tokenize_preprocess_japanese_sent(v)
            for k, v in ja_anss.items()
        }
    elif lang == "Fr":
        ja_anss = {
            k: google_translate_to_fr(v, True)
            for k, v in id2answer_dict.items()
        }

    save_path = 'japanese_ans_predicted.json'
    f = open(save_path, "w")
    json.dump(ja_anss, f)
    f.close()

    eval_dict = evaluate(ground_truth_answers, ja_anss)
    print(eval_dict)

    return {"F1": eval_dict['f1'], "EM": eval_dict['exact_match']}
    def _read(self, question_file_path):
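        """Read the question CSV, obtain English translations of the questions
        and contexts (online via Google, Bing, or the attention-equipped NMT
        model, or from files cached under trans_result / google_trans when
        online_trans is False), and yield one instance per question.
        """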
        sent_idx_dic = create_sent_idx_dic(self.source_context_file_path)

        if self.online_trans:
            # 1. Create tmp japanese question files.
            ja_q_tmp = open(self.japanese_question_file_path, "w")

            with open(self.question_file_path, newline='') as f:
                dataReader = csv.reader(f)
                header = next(dataReader)
                for row in dataReader:
                    if self.lang == "Ja":
                        question = tokenize_preprocess_japanese_sent(row[3])
                        if self.use_question_tag == True:
                            ja_q_tmp.write(question + " <QS>\n")
                        else:
                            ja_q_tmp.write(question + "\n")
                    elif self.lang == "Fr":
                        ja_q_tmp.write(row[3].lower() + "\n")

            ja_q_tmp.close()
            if self.lang == "Fr":
                normalize_tokenized_sent(self.japanese_question_file_path,
                                         self.lang)

            # 2. Create tmp japanese context files.
            ja_c_tmp = open(self.japanese_context_file_path, "w")
            with open(self.source_context_file_path, newline='') as f:
                dataReader = csv.reader(f)
                header = next(dataReader)
                for row in dataReader:
                    if self.lang == "Ja":
                        if self.use_bing_translate == True:
                            context_sent = row[3]
                        else:
                            context_sent = \
                                tokenize_preprocess_japanese_sent(row[3])
                        ja_c_tmp.write(context_sent + "\n")
                    # For French and German, the tokenization and normalization
                    # would be executed later.
                    elif self.lang == "Fr":
                        ja_c_tmp.write(row[3].lower() + "\n")

            ja_c_tmp.close()
            if self.lang == "Fr":
                normalize_tokenized_sent(self.japanese_context_file_path,
                                         self.lang)

            # 3. Get the translated results and attention scores.
            if self.use_google_translate:
                questions_sources = open(self.japanese_question_file_path,
                                         "r").readlines()
                translated_questions = \
                    [google_translate(sentence, False, self.lang)
                     for sentence in questions_sources]
                translated_questions = \
                    [tokenize_preprocess_english_sent(
                        sentence) for sentence in translated_questions]
                translated_questions = [
                    q.replace("&#39;", "'").replace("\u200b\u200b", "")
                    for q in translated_questions
                ]

                google_trans_path = os.path.join("google_trans", self.lang)
                if not os.path.exists(google_trans_path):
                    os.makedirs(google_trans_path)
                self.question_trans_file_path = os.path.join(
                    google_trans_path, "TRANS.question.txt.new")

            elif self.use_bing_translate:
                questions_sources = open(self.japanese_question_file_path,
                                         "r").readlines()
                translated_questions = \
                    [bing_translate(sentence, self.lang, 'en')
                     for sentence in questions_sources]

                translated_questions = \
                    [tokenize_preprocess_english_sent(
                        sentence) for sentence in translated_questions]
                translated_questions = [
                    q.replace("&#39;", "'").replace("\u200b\u200b", "")
                    for q in translated_questions
                ]

                bing_trans_path = os.path.join('trans_result',
                                               self.lang.lower(),
                                               'v' + str(self.version), 'bing')
                if not os.path.exists(bing_trans_path):
                    os.makedirs(bing_trans_path)
                self.question_trans_file_path = os.path.join(
                    bing_trans_path, "TRANS.question.txt.new")

            else:
                if self.beam_search == True:
                    if self.soft == True:
                        translated_questions, attention_scores_questions = \
                            trans_from_files_beam(self.japanese_question_file_path, self.japanese_question_file_path,
                                                  self.trans_train_source, self.trans_train_target,
                                                  self.trans_embedding_model, self.trans_encdec_model, seed, 5, True)
                        trans_dir = os.path.split(
                            self.question_trans_file_path)[0]
                        trans_beam_dir = os.path.join(trans_dir, 'beam')
                        if not os.path.exists(trans_beam_dir):
                            os.makedirs(trans_beam_dir)
                        self.question_trans_file_path = os.path.join(
                            trans_beam_dir, "TRANS.question.txt.new")

                    else:
                        trans_from_files_beam(self.japanese_question_file_path,
                                              self.japanese_question_file_path,
                                              self.trans_train_source,
                                              self.trans_train_target,
                                              self.trans_embedding_model,
                                              self.trans_encdec_model, seed, 5,
                                              False)

                        translated_questions = open("trans.txt",
                                                    'r').read().splitlines()
                        # Save translated questions.
                        trans_dir = os.path.split(
                            self.question_trans_file_path)[0]
                        trans_beam_dir = os.path.join(trans_dir, 'beam')
                        if not os.path.exists(trans_beam_dir):
                            os.makedirs(trans_beam_dir)
                        self.question_trans_file_path = os.path.join(
                            trans_beam_dir, "TRANS.question.txt.new")

                else:
                    translated_questions, attention_scores_questions = \
                        trans_from_files(self.japanese_question_file_path, self.japanese_question_file_path,
                                         self.trans_train_source, self.trans_train_target,
                                         self.trans_embedding_model, self.trans_encdec_model,
                                         seed, trans_mode=True, save_attention_weights=True,
                                         replace_UNK=self.replace_UNK)

            trans_q_lines = translated_questions

            question_trans = open(self.question_trans_file_path, "w")

            for question in trans_q_lines:
                question_trans.write(question + "\n")
            question_trans.close()

            # Context Trans
            if self.use_google_translate:
                context_sources = open(self.japanese_context_file_path,
                                       "r").readlines()
                translated_context = \
                    [google_translate(sentence, False, self.lang)
                     for sentence in context_sources]
                translated_context = \
                    [tokenize_preprocess_english_sent(
                        sentence) for sentence in translated_context]
                translated_context = [
                    c.replace("&#39;", "'").replace("\u200b\u200b", "")
                    for c in translated_context
                ]

                google_trans_path = os.path.join("google_trans", self.lang)
                if not os.path.exists(google_trans_path):
                    os.makedirs(google_trans_path)
                self.context_trans_file_path = os.path.join(
                    google_trans_path, "TRANS.txt.new")

                trans_c_lines = translated_context

            elif self.use_bing_translate:
                context_sources = open(self.japanese_context_file_path,
                                       "r").readlines()

                bing_translate_result = [
                    bing_translate(sentence, self.lang, 'en', True)
                    for sentence in context_sources
                ]
                translated_context = [
                    result[0] for result in bing_translate_result
                ]
                alignment_info = [
                    result[1] for result in bing_translate_result
                ]

                translated_context = [
                    sentence.lower().replace("\n", "")
                    for sentence in translated_context
                ]

                bing_trans_path = os.path.join('trans_result',
                                               self.lang.lower(),
                                               'v' + str(self.version), 'bing')
                if not os.path.exists(bing_trans_path):
                    os.makedirs(bing_trans_path)
                self.context_trans_file_path = os.path.join(
                    bing_trans_path, "TRANS.txt.new")
                self.context_attention_file_path = os.path.join(
                    bing_trans_path, "ATTN.txt.new")

                trans_c_lines = translated_context
                trans_a_lines = alignment_info

            else:
                if self.beam_search == True:
                    if self.soft == True:
                        translated_context, attention_scores_context = \
                            trans_from_files_beam(self.japanese_context_file_path, self.japanese_context_file_path,
                                                  self.trans_train_source, self.trans_train_target,
                                                  self.trans_embedding_model, self.trans_encdec_model, seed, 5, True)

                        trans_c_lines = translated_context
                        trans_a_lines = attention_scores_context

                        # Reset the saved dir name.
                        trans_dir = os.path.split(
                            self.question_trans_file_path)[0]
                        self.context_trans_file_path = os.path.join(
                            trans_dir, "TRANS.txt.new")
                        self.context_attention_file_path = os.path.join(
                            trans_dir, "ATTN.txt.new")
                    else:
                        trans_from_files_beam(self.japanese_context_file_path,
                                              self.japanese_context_file_path,
                                              self.trans_train_source,
                                              self.trans_train_target,
                                              self.trans_embedding_model,
                                              self.trans_encdec_model, seed, 5,
                                              False)

                        translated_context = open("trans.txt",
                                                  'r').read().splitlines()
                        attention_scores_context = open(
                            "attn.txt", 'r').read().splitlines()

                        # Reset the saved dir name.
                        trans_dir = os.path.split(
                            self.question_trans_file_path)[0]
                        self.context_trans_file_path = os.path.join(
                            trans_dir, "TRANS.txt.new")
                        self.context_attention_file_path = os.path.join(
                            trans_dir, "ATTN.txt.new")

                else:
                    translated_context, attention_scores_context = \
                        trans_from_files(self.japanese_context_file_path, self.japanese_context_file_path,
                                         self.trans_train_source, self.trans_train_target,
                                         self.trans_embedding_model, self.trans_encdec_model,
                                         seed, trans_mode=True, save_attention_weights=True,
                                         replace_UNK=self.replace_UNK)

                    trans_c_lines = translated_context
                    trans_a_lines = attention_scores_context

            if self.use_google_translate == False and self.use_bing_translate == False:
                trans_c_lines = translated_context
                trans_a_lines = attention_scores_context
                context_attention = open(self.context_attention_file_path, "w")
                context_trans = open(self.context_trans_file_path, "w")
                if self.beam_search == True and self.soft == False:
                    # save context
                    for trans_context in trans_c_lines:
                        context_trans.write(trans_context + "\n")
                    context_trans.close()

                    # save attention
                    for trans_attention_index in trans_a_lines:
                        context_attention.write(trans_attention_index + "\n")
                    context_attention.close()

                else:
                    for trans_context, attention_score in zip(
                            trans_c_lines, attention_scores_context):
                        context_trans.write(trans_context + "\n")
                        for i in range(len(attention_score)):
                            attention_weight = [
                                str(float(weight))
                                for weight in attention_score[i]
                            ]
                            context_attention.write(
                                " ".join(attention_weight) + "\n")
                        context_attention.write("\n")
                    context_trans.close()
                    context_attention.close()
            else:
                context_trans = open(self.context_trans_file_path, "w")

                for trans_context in trans_c_lines:
                    context_trans.write(trans_context + "\n")

                if self.use_bing_translate == True:
                    context_attention = open(self.context_attention_file_path,
                                             "w")

                    for trans_attention in trans_a_lines:
                        context_attention.write(trans_attention + "\n")

                    context_attention.close()

                context_trans.close()

        else:
            # This is for quick evaluation.
            # The translated contexts and questions are saved under the
            # directory `trans_result`, and when the `--online_trans` option is
            # set to False, the system loads the contexts and questions that
            # have been translated beforehand.
            if self.use_google_translate == True:
                google_trans_path = os.path.join("google_trans", self.lang)
                if not os.path.exists(google_trans_path):
                    os.makedirs(google_trans_path)
                self.context_trans_file_path = os.path.join(
                    google_trans_path, "TRANS.txt.new")
                self.question_trans_file_path = os.path.join(
                    google_trans_path, "TRANS.question.txt.new")

            elif self.use_bing_translate == True:
                bing_trans_path = os.path.join('trans_result',
                                               self.lang.lower(),
                                               'v' + str(self.version), 'bing')
                if not os.path.exists(bing_trans_path):
                    os.makedirs(bing_trans_path)
                self.context_trans_file_path = os.path.join(
                    bing_trans_path, "TRANS.txt.new")
                self.question_trans_file_path = os.path.join(
                    bing_trans_path, "TRANS.question.txt.new")

            if self.beam_search == True:
                trans_dir = os.path.split(self.question_trans_file_path)[0]
                trans_dir = os.path.join(trans_dir, "beam")
                self.question_trans_file_path = os.path.join(
                    trans_dir, "TRANS.question.txt.new")
                self.context_trans_file_path = os.path.join(
                    trans_dir, "TRANS.txt.new")

            trans_context_f = open(self.context_trans_file_path)
            trans_context = trans_context_f.read()
            trans_context_f.close()
            trans_c_lines = trans_context.split('\n')

            trans_question_f = open(self.question_trans_file_path)
            trans_question = trans_question_f.read()
            trans_question_f.close()
            trans_q_lines = trans_question.split('\n')

        with open(self.question_file_path, newline='') as f:
            dataReader = csv.reader(f)
            header = next(dataReader)
            for row in dataReader:
                question_id, title, paragraph_id, question = int(
                    row[0]), row[1], int(row[2]), row[3]
                if self.lang == "Ja":
                    answer_texts = [
                        tokenize_preprocess_japanese_sent(row[4]),
                        tokenize_preprocess_japanese_sent(row[5]),
                        tokenize_preprocess_japanese_sent(row[6])
                    ]
                elif self.lang == "Fr" or self.lang == "De":
                    answer_texts = normalize_tokenized_answers(
                        row[4], row[5], row[6], self.lang)

                sent_indices = sent_idx_dic[title][paragraph_id]
                paragraph_tokens = []

                for sent_idx in sent_indices:
                    paragraph_tokens.extend(trans_c_lines[sent_idx].split())

                paragraph = " ".join(paragraph_tokens)

                tokenized_paragraph = self._tokenizer.tokenize(paragraph)
                question_text = trans_q_lines[question_id]
                instance = self.text_to_instance(question_text, paragraph,
                                                 answer_texts,
                                                 tokenized_paragraph,
                                                 question_id)
                yield instance