Example #1
    def preprocess_instance(
            self,
            question: QASetting,
            answers: Optional[List[Answer]] = None) -> XQAAnnotation:
        has_answers = answers is not None

        q_tokenized, q_ids, _, q_length, s_tokenized, s_ids, _, s_length, \
        word_in_question, token_offsets, answer_spans = prepare_data(
            question, answers, self.vocab, self.config.get("lowercase", False),
            with_answers=has_answers, max_support_length=self.config.get("max_support_length", None))

        # Pre-allocate embedding matrices, then fill them with per-token lookups.
        emb_support = np.zeros([s_length, self.emb_matrix.shape[1]])
        emb_question = np.zeros([q_length, self.emb_matrix.shape[1]])

        for k in range(len(s_ids)):
            emb_support[k] = self._get_emb(s_ids[k])
        for k in range(len(q_ids)):
            emb_question[k] = self._get_emb(q_ids[k])

        return XQAAnnotation(
            question_tokens=q_tokenized,
            question_ids=q_ids,
            question_length=q_length,
            question_embeddings=emb_question,
            support_tokens=s_tokenized,
            support_ids=s_ids,
            support_length=s_length,
            support_embeddings=emb_support,
            word_in_question=word_in_question,
            token_offsets=token_offsets,
            answer_spans=answer_spans if has_answers else None,
        )
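
The embedding lookup above fills one matrix row per token id. Below is a minimal self-contained sketch of the same pattern, assuming (as the example suggests) that emb_matrix is a [vocab_size, dim] numpy array and that _get_emb reduces to a plain row lookup; the real _get_emb may additionally handle out-of-vocabulary ids.

import numpy as np

emb_matrix = np.random.rand(100, 4)   # toy [vocab_size, dim] embedding matrix
s_ids = [6, 2, 7, 8, 9]               # token ids for one support document

emb_support = np.zeros([len(s_ids), emb_matrix.shape[1]])
for k in range(len(s_ids)):
    emb_support[k] = emb_matrix[s_ids[k]]

# The loop is equivalent to a single fancy-indexing lookup:
assert np.array_equal(emb_support, emb_matrix[s_ids])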
Example #2
def test_prepare_data():
    # qa_setting and answers are assumed to be defined elsewhere in the test
    # module; a fresh Vocab assigns ids incrementally as tokens are seen.
    result = prepare_data(qa_setting, answers, Vocab(), with_answers=True)

    question_tokens, question_ids, question_lemmas, question_length, \
    support_tokens, support_ids, support_lemmas, support_length, \
    word_in_question, token_offsets, answer_spans = result

    assert question_tokens == ['What', 'is', 'the', 'answer', '?']
    assert question_ids == [1, 2, 3, 4, 5]
    assert question_lemmas is None
    assert question_length == 5

    assert support_tokens == [[
        'It',
        'is',
        'not',
        'A',
        '.',
    ], ['It', 'is', 'B', '.']]
    assert support_ids == [[6, 2, 7, 8, 9], [6, 2, 10, 9]]
    assert support_lemmas == [None, None]
    assert support_length == [5, 4]
    assert word_in_question == [[0.0, 1.0, 0.0, 0.0, 0.0],
                                [0.0, 1.0, 0.0, 0.0]]
    assert token_offsets == [[0, 3, 6, 10, 11], [0, 3, 6, 7]]
    assert answer_spans == [[], [(2, 2)]]
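
The two derived features the test checks can be mirrored without the jack dependency. Below is a minimal sketch of both, with hypothetical helper names, validated against the test's own data; the real prepare_data additionally supports lowercasing, content-word filtering, and lemmas.

def word_in_question_feature(question_tokens, support_tokens):
    # 1.0 for each support token that also occurs in the question.
    q_set = set(question_tokens)
    return [1.0 if tok in q_set else 0.0 for tok in support_tokens]

def char_token_offsets(text, tokens):
    # Character offset of each token within the original support text.
    offsets, pos = [], 0
    for tok in tokens:
        pos = text.index(tok, pos)
        offsets.append(pos)
        pos += len(tok)
    return offsets

question = ['What', 'is', 'the', 'answer', '?']
support = ['It', 'is', 'B', '.']
assert word_in_question_feature(question, support) == [0.0, 1.0, 0.0, 0.0]
assert char_token_offsets("It is B.", support) == [0, 3, 6, 7]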
Example #3
    def preprocess_instance(self, question, answers=None):
        has_answers = answers is not None

        q_tokenized, q_ids, q_lemmas, q_length, s_tokenized, s_ids, s_lemmas, s_length, \
        word_in_question, token_offsets, answer_spans = prepare_data(
            question, answers, self.vocab, self.config.get("lowercase", False),
            with_answers=has_answers, max_support_length=self.config.get("max_support_length", None),
            spacy_nlp=True, with_lemmas=True)

        max_num_support = self.config.get(
            "max_num_support", len(question.support))  # by default, keep all supports

        # Rank supports by TF-IDF similarity to the question and keep the top
        # max_num_support (further subsampled at batch-creation time),
        # following https://arxiv.org/pdf/1710.10723.pdf
        if len(question.support) > 1:
            scores = sort_by_tfidf(' '.join(q_tokenized),
                                   [' '.join(s) for s in s_tokenized])
            selected_supports = [
                s_idx for s_idx, _ in scores[:max_num_support]
            ]
            s_tokenized = [s_tokenized[s_idx] for s_idx in selected_supports]
            s_lemmas = [s_lemmas[s_idx] for s_idx in selected_supports]
            s_ids = [s_ids[s_idx] for s_idx in selected_supports]
            s_length = [s_length[s_idx] for s_idx in selected_supports]
            word_in_question = [
                word_in_question[s_idx] for s_idx in selected_supports
            ]
            token_offsets = [
                token_offsets[s_idx] for s_idx in selected_supports
            ]
            answer_spans = [answer_spans[s_idx] for s_idx in selected_supports]
        else:
            selected_supports = list(range(len(question.support)))

        return XQAAssertionAnnotation(
            question_tokens=q_tokenized,
            question_lemmas=q_lemmas,
            question_ids=q_ids,
            question_length=q_length,
            support_tokens=s_tokenized,
            support_lemmas=s_lemmas,
            support_ids=s_ids,
            support_length=s_length,
            word_in_question=word_in_question,
            token_offsets=token_offsets,
            answer_spans=answer_spans if has_answers else None,
            selected_supports=selected_supports,
        )
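
sort_by_tfidf is part of the jack codebase; a stand-in with the same interface (index/score pairs, best match first) can be sketched with scikit-learn. This is an illustrative substitute, not the original implementation.

from sklearn.feature_extraction.text import TfidfVectorizer

def sort_by_tfidf(reference, documents):
    # Rank documents by TF-IDF cosine similarity to the reference string.
    tfidf = TfidfVectorizer().fit_transform([reference] + documents)
    # Rows are L2-normalized by default, so the dot product is the cosine.
    scores = (tfidf[0] @ tfidf[1:].T).toarray().ravel()
    return sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

scores = sort_by_tfidf("What is the answer?", ["It is not A.", "It is B."])
selected = [idx for idx, _ in scores[:1]]  # keep the single best support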
Example #4
    def preprocess_instance(self, question: QASetting,
                            answers: Optional[List[Answer]],
                            is_eval: bool) -> Optional[CBowAnnotation]:
        has_answers = answers is not None

        q_tokenized, q_ids, _, q_length, s_tokenized, s_ids, _, s_length, \
        word_in_question, token_offsets, answer_spans = \
            prepare_data(question, answers, self.vocab, self.config.get("lowercase", False),
                         with_answers=has_answers, wiq_contentword=True, spacy_nlp=False,
                         max_support_length=self.config.get("max_support_length", None))

        # Discard instances where every gold answer span exceeds the maximum
        # span size; short-circuiting on has_answers ensures answer_spans is
        # only inspected when answers are present.
        if has_answers and all(end - start > _max_span_size
                               for start, end in answer_spans):
            return None

        # Pre-allocate embedding matrices, then fill them with per-token lookups.
        emb_support = np.zeros([s_length, self.emb_matrix.shape[1]])
        emb_question = np.zeros([q_length, self.emb_matrix.shape[1]])

        answertype_span = self.__extract_answertype_span(q_tokenized)

        for k in range(len(s_ids)):
            emb_support[k] = self._get_emb(s_ids[k])
        for k in range(len(q_ids)):
            emb_question[k] = self._get_emb(q_ids[k])

        return CBowAnnotation(
            question_tokens=q_tokenized,
            question_ids=q_ids,
            question_length=q_length,
            question_embeddings=emb_question,
            support_tokens=s_tokenized,
            support_ids=s_ids,
            support_length=s_length,
            support_embeddings=emb_support,
            word_in_question=word_in_question,
            token_offsets=token_offsets,
            answertype_span=answertype_span,
            answer_spans=answer_spans if has_answers else None,
        )
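
The all(...) filter above drops a training instance only when every gold span is longer than the maximum span size. A toy check of that behaviour (_max_span_size is a module-level constant in the original; 10 is an assumed value for illustration):

_max_span_size = 10
spans = [(3, 20), (5, 40)]  # every span exceeds the limit -> instance dropped
assert all(end - start > _max_span_size for start, end in spans)
spans = [(3, 20), (5, 9)]   # (5, 9) fits -> instance kept
assert not all(end - start > _max_span_size for start, end in spans)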