Example 1
import json
import logging
from collections import OrderedDict

from tqdm import tqdm


# BaseReader and SpacyTokenizer are assumed to come from the surrounding project.
class DuReaderV2Reader(BaseReader):
    def __init__(self):
        self.tokenizer = SpacyTokenizer()

    def read(self, file_path):
        logging.info("Reading file at %s", file_path)
        logging.info("Processing the dataset.")
        # Materialize the generator with a progress bar.
        instances = list(tqdm(self._read(file_path)))
        return instances

    def _read(self, file_path):
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                context = paragraph["context"]
                # Token spans are (char_start, char_end) offsets into context.
                context_tokens, context_token_spans = self.tokenizer.word_tokenizer(context)
                for question_answer in paragraph['qas']:
                    question = question_answer["question"].strip()
                    question_tokens, _ = self.tokenizer.word_tokenizer(question)

                    answers, span_starts, span_ends = [], [], []
                    if "answers" in question_answer:
                        answers = [
                            answer['text']
                            for answer in question_answer['answers']
                        ]
                        span_starts = [
                            answer['answer_start']
                            for answer in question_answer['answers']
                        ]
                        # Character-level end offsets derived from the starts
                        # and the answer texts.
                        span_ends = [
                            start + len(answer)
                            for start, answer in zip(span_starts, answers)
                        ]
                    # SQuAD-v2-style unanswerable questions get a dummy (0, 0) span.
                    if question_answer.get('is_impossible'):
                        span_starts = [0]
                        span_ends = [0]

                    answer_char_spans = (list(zip(span_starts, span_ends))
                                         if span_starts and span_ends else None)
                    answers = answers if answers else [""]
                    is_impossible = (int(question_answer['is_impossible'])
                                     if 'is_impossible' in question_answer else None)
                    qid = question_answer['id']
                    yield self._make_instance(context, context_tokens,
                                              context_token_spans, question,
                                              question_tokens,
                                              answer_char_spans, answers,
                                              is_impossible, qid)

    def _make_instance(self,
                       context,
                       context_tokens,
                       context_token_spans,
                       question,
                       question_tokens,
                       answer_char_spans=None,
                       answers=None,
                       is_impossible=None,
                       qid=None):
        answer_token_starts, answer_token_ends = [], []
        # Guard on the spans actually iterated: answers defaults to [""] even
        # when no character spans were found, so answer_char_spans may be None.
        if answer_char_spans is not None:
            for answer_char_start, answer_char_end in answer_char_spans:
                # Indices of every token overlapping the answer's character span.
                answer_token_span = []
                for idx, span in enumerate(context_token_spans):
                    if not (answer_char_end <= span[0]
                            or answer_char_start >= span[1]):
                        answer_token_span.append(idx)

                if len(answer_token_span) == 0:
                    # The answer maps onto no token; stop collecting spans.
                    break
                answer_token_starts.append(answer_token_span[0])
                answer_token_ends.append(answer_token_span[-1])
        abstractive_answer_mask = [0]
        if is_impossible:
            # Unanswerable questions point at the dummy (0, 0) token span.
            answer_token_starts = [0]
            answer_token_ends = [0]
            abstractive_answer_mask = [1]

        return OrderedDict({
            "context": context,
            "context_tokens": context_tokens,
            "context_token_spans": context_token_spans,
            "is_impossible": is_impossible,
            "question": question,
            "qid": qid,
            "question_tokens": question_tokens,
            "answer": answers[0] if answers is not None else "",
            "answer_start": answer_token_starts[0] if len(answer_token_starts) > 0 else None,
            "answer_end": answer_token_ends[0] if len(answer_token_ends) > 0 else None,
            "abstractive_answer_mask": abstractive_answer_mask,
        })
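
Both readers consume SQuAD-style JSON: a top-level "data" list of articles, each holding paragraphs with a context string and a list of qas. A minimal usage sketch, assuming SpacyTokenizer and BaseReader are importable from the surrounding project (the sample data and temporary file below are illustrative only):

import json
import tempfile

# A miniature SQuAD-v2-style dataset with one answerable question.
sample = {
    "data": [{
        "paragraphs": [{
            "context": "The Eiffel Tower is in Paris.",
            "qas": [{
                "id": "q1",
                "question": "Where is the Eiffel Tower?",
                "answers": [{"text": "Paris", "answer_start": 23}],
                "is_impossible": False,
            }],
        }],
    }]
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(sample, f)

reader = DuReaderV2Reader()
instances = reader.read(f.name)
# answer_start/answer_end are token indices mapped from the character span.
print(instances[0]["answer"], instances[0]["answer_start"], instances[0]["answer_end"])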
Example 2
import json
import logging
from collections import OrderedDict

from tqdm import tqdm


# BaseReader and SpacyTokenizer are assumed to come from the surrounding project.
class SquadReader(BaseReader):
    def __init__(self, fine_grained=False):
        self.tokenizer = SpacyTokenizer(fine_grained)

    def read(self, file_path):
        logging.info("Reading file at %s", file_path)
        logging.info("Processing the dataset.")
        # Materialize the generator with a progress bar.
        instances = list(tqdm(self._read(file_path)))
        return instances

    def _read(self, file_path, context_limit=-1):
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                context = paragraph["context"]
                # Token spans are (char_start, char_end) offsets into context.
                context_tokens, context_token_spans = self.tokenizer.word_tokenizer(context)
                for question_answer in paragraph['qas']:
                    question = question_answer["question"].strip()
                    question_tokens, _ = self.tokenizer.word_tokenizer(question)

                    answers, span_starts, span_ends = [], [], []
                    if "answers" in question_answer:
                        answers = [
                            answer['text']
                            for answer in question_answer['answers']
                        ]
                        span_starts = [
                            answer['answer_start']
                            for answer in question_answer['answers']
                        ]
                        # Character-level end offsets derived from the starts
                        # and the answer texts.
                        span_ends = [
                            start + len(answer)
                            for start, answer in zip(span_starts, answers)
                        ]

                    answer_char_spans = (list(zip(span_starts, span_ends))
                                         if span_starts and span_ends else None)
                    answers = answers if answers else None
                    qid = question_answer['id']
                    instance = self._make_instance(context, context_tokens,
                                                   context_token_spans,
                                                   question, question_tokens,
                                                   answer_char_spans, answers,
                                                   qid)
                    if context_limit > 0 and len(instance['context_tokens']) > context_limit:
                        # Skip labeled instances whose answer span lies past the
                        # limit; answer_start is None for unlabeled instances.
                        if instance['answer_start'] is not None and (
                                instance['answer_start'] > context_limit
                                or instance['answer_end'] > context_limit):
                            continue
                        instance['context_tokens'] = instance['context_tokens'][:context_limit]
                    yield instance

    def _make_instance(self,
                       context,
                       context_tokens,
                       context_token_spans,
                       question,
                       question_tokens,
                       answer_char_spans=None,
                       answers=None,
                       qid=None):
        answer_token_starts, answer_token_ends = [], []
        if answers is not None:
            for answer_char_start, answer_char_end in answer_char_spans:
                # Indices of every token overlapping the answer's character span.
                answer_token_span = []
                for idx, span in enumerate(context_token_spans):
                    if not (answer_char_end <= span[0]
                            or answer_char_start >= span[1]):
                        answer_token_span.append(idx)

                # Every labeled answer must map onto at least one token.
                assert len(answer_token_span) > 0
                answer_token_starts.append(answer_token_span[0])
                answer_token_ends.append(answer_token_span[-1])

        return OrderedDict({
            "context": context,
            "context_tokens": context_tokens,
            "context_token_spans": context_token_spans,
            "context_word_len": [len(word) for word in context_tokens],
            "question_word_len": [len(word) for word in question_tokens],
            "question": question,
            "qid": qid,
            "question_tokens": question_tokens,
            "answer": answers[0] if answers is not None else None,
            "answer_start": answer_token_starts[0] if answers is not None else None,
            "answer_end": answer_token_ends[0] if answers is not None else None,
        })
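
SquadReader differs from the first reader mainly in the per-word length features it emits and the optional context_limit on _read, which drops labeled instances whose answer span falls past the limit and truncates the remaining long contexts. Note that read() does not expose context_limit, so filtering requires driving _read directly. A sketch (the file path is hypothetical):

reader = SquadReader(fine_grained=False)
for instance in reader._read("train-v1.1.json", context_limit=400):
    # After filtering/truncation, no context exceeds 400 tokens.
    assert len(instance["context_tokens"]) <= 400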