Example No. 1
def __init__(self):
    self.tokenizer = SpacyTokenizer()
Example No. 2
# Standard-library and third-party imports used below; BaseReader and
# SpacyTokenizer are assumed to be importable from the surrounding project.
import json
import logging
from collections import OrderedDict

from tqdm import tqdm


class DuReaderV2Reader(BaseReader):
    def __init__(self):
        self.tokenizer = SpacyTokenizer()

    def read(self, file_path):
        logging.info("Reading file at %s", file_path)
        logging.info("Processing the dataset.")
        instances = self._read(file_path)
        instances = [instance for instance in tqdm(instances)]
        return instances

    def _read(self, file_path):
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                context = paragraph["context"]
                context_tokens, context_token_spans = self.tokenizer.word_tokenizer(
                    context)
                for question_answer in paragraph['qas']:
                    question = question_answer["question"].strip()
                    question_tokens, _ = self.tokenizer.word_tokenizer(
                        question)

                    answers, span_starts, span_ends = [], [], []
                    if "answers" in question_answer:
                        answers = [
                            answer['text']
                            for answer in question_answer['answers']
                        ]
                        span_starts = [
                            answer['answer_start']
                            for answer in question_answer['answers']
                        ]
                        span_ends = [
                            start + len(answer)
                            for start, answer in zip(span_starts, answers)
                        ]
                    # Unanswerable questions (SQuAD 2.0-style) get a dummy (0, 0) span.
                    if question_answer.get('is_impossible'):
                        span_starts = [0]
                        span_ends = [0]

                    answer_char_spans = (zip(span_starts, span_ends)
                                         if span_starts and span_ends else None)
                    answers = answers if answers else [""]
                    # Encode is_impossible as 0/1, or None when the field is absent.
                    is_impossible = (int(question_answer['is_impossible'])
                                     if 'is_impossible' in question_answer else None)
                    qid = question_answer['id']
                    yield self._make_instance(context, context_tokens,
                                              context_token_spans, question,
                                              question_tokens,
                                              answer_char_spans, answers,
                                              is_impossible, qid)

    def _make_instance(self,
                       context,
                       context_tokens,
                       context_token_spans,
                       question,
                       question_tokens,
                       answer_char_spans=None,
                       answers=None,
                       is_impossible=None,
                       qid=None):
        answer_token_starts, answer_token_ends = [], []
        # Map character-level answer spans onto token indices.
        if answer_char_spans is not None:
            for answer_char_start, answer_char_end in answer_char_spans:
                answer_token_span = []
                for idx, span in enumerate(context_token_spans):
                    if not (answer_char_end <= span[0]
                            or answer_char_start >= span[1]):
                        answer_token_span.append(idx)

                if len(answer_token_span) == 0:
                    # No token overlaps this answer span; skip the remaining answers.
                    break
                answer_token_starts.append(answer_token_span[0])
                answer_token_ends.append(answer_token_span[-1])
        abstractive_answer_mask = [0]
        if is_impossible is not None and is_impossible:
            answer_token_starts = []
            answer_token_ends = []
            answer_token_starts.append(0)
            answer_token_ends.append(0)
            abstractive_answer_mask = [1]

        return OrderedDict({
            "context": context,
            "context_tokens": context_tokens,
            "context_token_spans": context_token_spans,
            "is_impossible": is_impossible,
            "question": question,
            "qid": qid,
            "question_tokens": question_tokens,
            "answer": answers[0] if answers is not None else [],
            "answer_start": answer_token_starts[0] if answer_token_starts else None,
            "answer_end": answer_token_ends[0] if answer_token_ends else None,
            "abstractive_answer_mask": abstractive_answer_mask
        })
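A minimal usage sketch for DuReaderV2Reader, assuming the class and its dependencies (BaseReader, SpacyTokenizer) are importable from the surrounding project; the file path below is a placeholder for a SQuAD-style JSON file:

reader = DuReaderV2Reader()
instances = reader.read("data/dureader_dev.json")  # placeholder path; returns a list of OrderedDict instances
first = instances[0]
print(first["qid"], first["question"])
print(first["answer"], first["answer_start"], first["answer_end"])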
Example No. 3
def __init__(self, history, question_sep="<q>", answer_sep="<a>"):
    self.tokenizer = SpacyTokenizer()
    self.history = history  # prepend dialog history; -1 means all previous turns
    self.question_sep = question_sep
    self.answer_sep = answer_sep
Example No. 4
# Standard-library and third-party imports used below; BaseReader, SpacyTokenizer
# and CoQAEvaluator are assumed to be importable from the surrounding project.
import json
import logging
import time
from collections import Counter, OrderedDict

from tqdm import tqdm


class CoQAReader(BaseReader):
    def __init__(self, history, question_sep="<q>", answer_sep="<a>"):
        self.tokenizer = SpacyTokenizer()
        self.history = history  # prepend dialog history; -1 means all previous turns
        self.question_sep = question_sep
        self.answer_sep = answer_sep

    def read(self, file_path, data_type):
        if data_type not in ["train", "dev", "test"]:
            raise ValueError(
                "data_type must be one of 'train', 'dev' or 'test', got %r" % data_type)
        logging.info("Reading file at %s", file_path)
        logging.info("Processing the dataset.")
        t0 = time.time()
        instances = self._read(file_path, data_type)
        cost = time.time() - t0
        logging.info("cost=%.3f" % cost)
        return instances

    def _read(self, source, data_type):
        with open(source, 'r', encoding='utf-8') as f:
            source_data = json.load(f)

        storys = []
        questions = []
        answers = []
        indexs = [0]  # cumulative question count per dialog, used to slice the flat token lists below
        for i, dialog in enumerate(source_data["data"]):
            storys.append(dialog["story"])
            for question in dialog["questions"]:
                questions.append(question["input_text"])
            for answer in dialog["answers"]:
                answers.append(answer["input_text"])
            indexs.append(indexs[-1] + len(dialog["questions"]))

        all_story_tokens, all_story_token_spans = self.tokenizer.word_tokenizer_parallel(
            storys)
        all_question_tokens, _ = self.tokenizer.word_tokenizer_parallel(
            questions)
        all_answer_tokens, _ = self.tokenizer.word_tokenizer_parallel(answers)

        i = 0
        instances = []
        for dialog, story_tokens, story_token_spans in tqdm(
                zip(source_data["data"], all_story_tokens, all_story_token_spans),
                ncols=80):
            s = indexs[i]
            e = indexs[i + 1]
            # if data_type == "train":
            self.process_dialog(dialog, story_tokens, story_token_spans,
                                all_answer_tokens[s:e])
            self._make_instances(instances, dialog, story_tokens,
                                 story_token_spans, all_question_tokens[s:e],
                                 all_answer_tokens[s:e], data_type)
            i += 1

        return instances

    def process_dialog(self, dialog, story_tokens, story_token_spans,
                       answer_tokens):
        story = dialog["story"]
        story_id = dialog["id"]
        questions = dialog["questions"]
        answers = dialog["answers"]
        if len(answer_tokens) != len(answers):
            raise ValueError(
                "Number of answer token turns does not match number of answers: {} {}\n".format(
                    len(answer_tokens), len(answers)))

        for i, answer in enumerate(answers):
            if i + 1 != answer["turn_id"]:
                raise ValueError(
                    "Answer turn_id does not match its position in the dialog: {} {}\n".format(
                        story_id, answer["turn_id"]))
            answer_type = self.get_answer_type(answer["input_text"])
            answer["answer_type"] = answer_type
            self.find_extractive_span(story, story_tokens, story_token_spans,
                                      answer, answer_tokens[i], answer_type)

    def get_answer_type(self, answer):
        norm_ans = CoQAEvaluator.normalize_answer(answer)
        answer_type = "extractive"
        if norm_ans in ["unknown", "yes", "no"]:
            answer_type = norm_ans
        return answer_type

    def find_extractive_span(self, story, story_tokens, story_token_spans,
                             answer, answer_tokens, answer_type):

        # Token indices covered by the annotated rationale (empty when no span was given).
        word_ixs = []
        if answer["span_start"] != -1:
            rationale = answer["span_text"]
            norm_rationale = rationale.strip().strip("'").strip()
            idx = rationale.find(norm_rationale)
            start = answer["span_start"] + idx
            end = start + len(norm_rationale)
            word_ixs = self.get_word_span(story_token_spans, start, end)
            answer["rationale"] = story[story_token_spans[word_ixs[0]][0]:
                                        story_token_spans[word_ixs[-1]][1]]
            answer["rationale_char_start"] = story_token_spans[word_ixs[0]][0]
            answer["rationale_char_end"] = story_token_spans[word_ixs[-1]][1]
            answer["rationale_start"] = word_ixs[0]
            answer["rationale_end"] = word_ixs[-1]

        if answer_type != "extractive":
            return

        # Find the contiguous token span with the best F1 against the free-form
        # answer, first within the rationale, then over the full passage.
        best_f1 = 0.0
        best_span = None
        ground_truth = []
        for w in answer_tokens:
            w = CoQAEvaluator.normalize_answer(w)
            if w != "":
                ground_truth.append(w)

        if not ground_truth:
            # input_text is not extractive in the context
            answer["best_f1_answer"] = ""
            answer["answer_type"] = "skip"
            return

        rationale_tokens = [
            CoQAEvaluator.normalize_answer(story_tokens[i]) for i in word_ixs
        ]
        ls = [
            i for i in range(len(rationale_tokens))
            if rationale_tokens[i] in ground_truth
        ]

        for i in range(len(ls)):
            for j in range(i, len(ls)):
                pred = []
                k = ls[i]
                while k <= ls[j]:
                    if rationale_tokens[k] != "":
                        pred.append(rationale_tokens[k])
                    k += 1
                if not pred:
                    continue
                common = Counter(pred) & Counter(ground_truth)
                num_same = sum(common.values())
                if num_same > 0:
                    precision = 1.0 * num_same / len(pred)
                    recall = 1.0 * num_same / len(ground_truth)
                    f1 = (2 * precision * recall) / (precision + recall)
                    if f1 > best_f1:
                        best_f1 = f1
                        best_span = (word_ixs[ls[i]], word_ixs[ls[j]])

        if best_f1 > 0.:
            i, j = best_span
            start = story_token_spans[i][0]
            end = story_token_spans[j][1]
            answer["best_f1_answer_char_start"] = start
            answer["best_f1_answer_char_end"] = end
            answer["best_f1_answer_start"] = i
            answer["best_f1_answer_end"] = j
            answer["best_f1_answer"] = story[start:end]
            answer["best_f1"] = best_f1
        else:
            # search full passage
            tokens_norm = [
                CoQAEvaluator.normalize_answer(w) for w in story_tokens
            ]
            ls = [
                i for i in range(len(tokens_norm))
                if tokens_norm[i] in ground_truth
            ]
            for i in range(len(ls)):
                for j in range(i, len(ls)):
                    pred = []
                    k = ls[i]
                    while k <= ls[j]:
                        if tokens_norm[k] != "":
                            pred.append(tokens_norm[k])
                        k += 1

                    common = Counter(pred) & Counter(ground_truth)
                    num_same = sum(common.values())
                    if num_same > 0:
                        precision = 1.0 * num_same / len(pred)
                        recall = 1.0 * num_same / len(ground_truth)
                        f1 = (2 * precision * recall) / (precision + recall)
                        if f1 > best_f1:
                            best_f1 = f1
                            best_span = (ls[i], ls[j])

            if best_f1 > 0.:
                i, j = best_span
                start = story_token_spans[i][0]
                end = story_token_spans[j][1]
                answer["best_f1_answer_char_start"] = start
                answer["best_f1_answer_char_end"] = end
                answer["best_f1_answer_start"] = i
                answer["best_f1_answer_end"] = j
                answer["best_f1_answer"] = story[start:end]
                answer["best_f1"] = best_f1

                answer["rationale"] = story[start:end]
                answer["rationale_char_start"] = start
                answer["rationale_char_end"] = end
                answer["rationale_start"] = i
                answer["rationale_end"] = j
            else:
                # input_text is not extractive in the context
                answer["best_f1_answer"] = ""
                answer["answer_type"] = "skip"
        return

    def get_word_span(self, spans, start, end):
        idxs = []
        for word_ix, (s, e) in enumerate(spans):
            if e > start:
                if s < end:
                    idxs.append(word_ix)
                else:
                    break
        return idxs

    def get_concat_questions(self, questions, question_tokens, answers,
                             answer_tokens, skips):

        question_sep = self.question_sep
        answer_sep = self.answer_sep

        questions_with_history = []
        question_tokens_with_history = []
        i = 0
        while i < len(questions):
            start = 0
            if self.history >= 0:
                start = i - self.history
                if start < 0:
                    start = 0

            concat_ = ""
            concat_tokens = []
            while start < i:
                if not skips[start]:
                    concat_ += question_sep
                    concat_ += questions[start]["input_text"]
                    concat_ += answer_sep
                    concat_ += answers[start]

                    concat_tokens.append(question_sep)
                    concat_tokens += question_tokens[start]
                    concat_tokens.append(answer_sep)
                    concat_tokens += answer_tokens[start]
                start += 1
            concat_ += question_sep
            concat_tokens.append(question_sep)
            concat_ += questions[i]["input_text"]
            concat_tokens += question_tokens[i]

            questions_with_history.append(concat_)
            question_tokens_with_history.append(concat_tokens)
            i += 1

        return questions_with_history, question_tokens_with_history

    def _make_instances(self, instances, dialog, story_tokens,
                        story_token_spans, question_tokens, answer_tokens,
                        data_type):

        if len(dialog["questions"]) != len(question_tokens):
            raise ValueError(
                "Number of question token turns does not match number of questions: {} {}\n".format(
                    len(question_tokens), len(dialog["questions"])))
        if len(dialog["answers"]) != len(answer_tokens):
            raise ValueError(
                "Number of answer token turns does not match number of answers: {} {}\n".format(
                    len(answer_tokens), len(dialog["answers"])))

        story_id = dialog["id"]
        story = dialog["story"]

        arr = []
        input_answers = []
        skips = [False] * len(dialog["answers"])
        # Map non-extractive answer types to slots in the abstractive answer mask.
        answer_types_id = {
            "unknown": 0,
            "yes": 1,
            "no": 2,
            "extractive": 3,
            "skip": 0
        }
        for idx, answer in enumerate(dialog["answers"]):
            input_answers.append(answer["input_text"])
            answer_type = answer["answer_type"]
            instance = OrderedDict({})
            arr.append(instance)
            instance["answer"] = answer["input_text"]
            instance["answer_start"] = 0
            instance["answer_end"] = 0
            answer_type_one_hot = [0, 0, 0]
            if answer_type != "extractive":
                answer_type_id = answer_types_id[answer_type]
                answer_type_one_hot[answer_type_id] = 1
            instance["abstractive_answer_mask"] = answer_type_one_hot
            instance["answer_type"] = answer_type
            if answer_type == "skip":
                instance["answer_type"] = "unknown"

            instance["rationale"] = None
            instance["rationale_start"] = 0
            instance["rationale_end"] = 0
            if data_type == "train" and answer_type == "skip":
                skips[idx] = True
                # continue
            if answer_type == "unknown":
                instance["rationale"] = "unknown"
                instance["rationale_start"] = 0
                instance["rationale_end"] = 0
            else:
                if "rationale" in answer:
                    instance["rationale"] = answer["rationale"]
                    instance["rationale_start"] = answer["rationale_start"]
                    instance["rationale_end"] = answer["rationale_end"]
                else:
                    raise ValueError(story_id, idx + 1, "no rationale")

            if "best_f1_answer_start" in answer:
                instance["answer"] = answer["best_f1_answer"]
                instance["answer_start"] = answer["best_f1_answer_start"]
                instance["answer_end"] = answer["best_f1_answer_end"]
            elif answer_type == "extractive":
                raise ValueError(story_id, idx + 1, "no ext ans")

        questions = dialog["questions"]
        questions_with_history, question_tokens_with_history = self.get_concat_questions(
            questions, question_tokens, input_answers, answer_tokens, skips)

        idx = 0
        while idx < len(questions_with_history):
            if data_type == "train" and skips[idx]:
                idx += 1
                continue
            question = questions_with_history[idx]
            instance = arr[idx]
            instance["context"] = story
            instance["context_tokens"] = story_tokens
            instance["context_token_spans"] = story_token_spans
            instance["question"] = questions_with_history[idx]
            instance["question_tokens"] = question_tokens_with_history[idx]
            instance["qid"] = story_id + '|' + str(
                dialog["questions"][idx]["turn_id"])
            instance["context_word_len"] = [
                len(word) for word in instance["context_tokens"]
            ]
            instance["question_word_len"] = [
                len(word) for word in instance["question_tokens"]
            ]
            instances.append(instance)
            idx += 1
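A minimal usage sketch for CoQAReader, assuming the class and its dependencies (BaseReader, SpacyTokenizer, CoQAEvaluator) are importable from the surrounding project; the history value and file path are placeholders:

reader = CoQAReader(history=2)  # prepend the two previous Q/A turns; -1 would prepend all of them
train_instances = reader.read("data/coqa-train-v1.0.json", "train")  # placeholder path
sample = train_instances[0]
print(sample["qid"])        # "<story_id>|<turn_id>"
print(sample["question"])   # question text with "<q>"/"<a>"-separated history
print(sample["answer"], sample["answer_type"])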
Example No. 5
def __init__(self, fine_grained=False):
    self.tokenizer = SpacyTokenizer(fine_grained)
Example No. 6
# Standard-library and third-party imports used below; BaseReader and
# SpacyTokenizer are assumed to be importable from the surrounding project.
import json
import logging
from collections import OrderedDict

from tqdm import tqdm


class SquadReader(BaseReader):
    def __init__(self, fine_grained=False):
        self.tokenizer = SpacyTokenizer(fine_grained)

    def read(self, file_path):
        logging.info("Reading file at %s", file_path)
        logging.info("Processing the dataset.")
        instances = self._read(file_path)
        instances = [instance for instance in tqdm(instances)]
        return instances

    def _read(self, file_path, context_limit=-1):
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                context = paragraph["context"]
                context_tokens, context_token_spans = self.tokenizer.word_tokenizer(
                    context)
                for question_answer in paragraph['qas']:
                    question = question_answer["question"].strip()
                    question_tokens, _ = self.tokenizer.word_tokenizer(
                        question)

                    answers, span_starts, span_ends = [], [], []
                    if "answers" in question_answer:
                        answers = [
                            answer['text']
                            for answer in question_answer['answers']
                        ]
                        span_starts = [
                            answer['answer_start']
                            for answer in question_answer['answers']
                        ]
                        span_ends = [
                            start + len(answer)
                            for start, answer in zip(span_starts, answers)
                        ]

                    answer_char_spans = (zip(span_starts, span_ends)
                                         if span_starts and span_ends else None)
                    answers = answers if answers else None
                    qid = question_answer['id']
                    instance = self._make_instance(context, context_tokens,
                                                   context_token_spans,
                                                   question, question_tokens,
                                                   answer_char_spans, answers,
                                                   qid)
                    # Optionally truncate overly long contexts; drop the instance
                    # if the answer would fall outside the truncated window.
                    if 0 < context_limit < len(instance['context_tokens']):
                        if (instance['answer_start'] > context_limit
                                or instance['answer_end'] > context_limit):
                            continue
                        instance['context_tokens'] = instance['context_tokens'][:context_limit]
                    yield instance

    def _make_instance(self,
                       context,
                       context_tokens,
                       context_token_spans,
                       question,
                       question_tokens,
                       answer_char_spans=None,
                       answers=None,
                       qid=None):
        answer_token_starts, answer_token_ends = [], []
        if answers is not None:
            for answer_char_start, answer_char_end in answer_char_spans:
                answer_token_span = []
                for idx, span in enumerate(context_token_spans):
                    if not (answer_char_end <= span[0]
                            or answer_char_start >= span[1]):
                        answer_token_span.append(idx)

                assert len(answer_token_span) > 0
                answer_token_starts.append(answer_token_span[0])
                answer_token_ends.append(answer_token_span[-1])

        return OrderedDict({
            "context": context,
            "context_tokens": context_tokens,
            "context_token_spans": context_token_spans,
            "context_word_len": [len(word) for word in context_tokens],
            "question_word_len": [len(word) for word in question_tokens],
            "question": question,
            "qid": qid,
            "question_tokens": question_tokens,
            "answer": answers[0] if answers is not None else None,
            "answer_start": answer_token_starts[0] if answers is not None else None,
            "answer_end": answer_token_ends[0] if answers is not None else None,
        })
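A minimal usage sketch for SquadReader, assuming the class and SpacyTokenizer are importable from the surrounding project; the path is a placeholder for a SQuAD v1.1-style file:

reader = SquadReader(fine_grained=False)
instances = reader.read("data/dev-v1.1.json")  # placeholder path
example = instances[0]
print(example["question"])
print(example["answer"], example["answer_start"], example["answer_end"])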