Ejemplo n.º 1
0
    def preprocess_instance(
            self,
            idd: int,
            question: QASetting,
            answers: Optional[List[Answer]] = None) -> MCAnnotation:
        """Turn one QA instance into an ``MCAnnotation``.

        Tokenizes the question and the first support document, maps both to
        vocabulary ids, and looks up the label id of the first answer (0 when
        no answers are given, e.g. at inference time).

        Args:
            idd: numeric id stored on the resulting annotation.
            question: the question together with its support documents.
            answers: optional gold answers; only the first one is used.

        Returns:
            A fully populated ``MCAnnotation``.
        """
        vocab = self.shared_resources.vocab
        lowercase = self.shared_resources.config.get('lowercase', True)

        q_tokens, q_ids, q_len, _, _ = preprocessing.nlp_preprocess(
            question.question, vocab, lowercase=lowercase)
        s_tokens, s_ids, s_len, _, _ = preprocessing.nlp_preprocess(
            question.support[0], vocab, lowercase=lowercase)

        if answers is None:
            label = 0
        else:
            label = self.shared_resources.answer_vocab(answers[0].text)

        return MCAnnotation(question_tokens=q_tokens,
                            question_ids=q_ids,
                            question_length=q_len,
                            support_tokens=s_tokens,
                            support_ids=s_ids,
                            support_length=s_len,
                            answer=label,
                            id=idd)
Ejemplo n.º 2
0
    def preprocess(self,
                   questions: List[QASetting],
                   answers: Optional[List[List[Answer]]] = None,
                   is_eval: bool = False) -> List[Mapping[str, any]]:
        """Preprocess a batch of QA instances into feature dictionaries.

        For each instance, the question and the first support document are
        tokenized and mapped to vocabulary ids. When ``answers`` is given, an
        ``"answers"`` key holding the first answer's label id is added.

        Args:
            questions: the QA instances to preprocess.
            answers: optional per-instance answer lists, parallel to
                ``questions``.
            is_eval: unused here; kept for interface compatibility.

        Returns:
            One mapping per instance with support/question id sequences,
            their lengths, and the instance index under ``'ids'``.
        """
        vocab = self.shared_resources.vocab
        lowercase = self.shared_resources.config.get('lowercase', True)

        batch = []
        for idx, setting in enumerate(questions):
            _, q_ids, q_len, _, _ = preprocessing.nlp_preprocess(
                setting.question, vocab, lowercase=lowercase)
            _, s_ids, s_len, _, _ = preprocessing.nlp_preprocess(
                setting.support[0], vocab, lowercase=lowercase)

            entry = {
                'supports': s_ids,
                'question': q_ids,
                'support_lengths': s_len,
                'question_lengths': q_len,
                'ids': idx,
            }
            if answers is not None:
                entry["answers"] = self.shared_resources.answer_vocab(
                    answers[idx][0].text)
            batch.append(entry)

        return batch
Ejemplo n.º 3
0
    def preprocess_instance(
            self,
            idd: int,
            question: QASetting,
            answers: Optional[List[Answer]] = None) -> Optional[MCAnnotation]:
        """Turn one QA instance into an ``MCAnnotation``.

        Two modes, selected by the ``use_dep_sa`` config flag:

        * dependency mode: the instance is expected to carry pre-computed
          tokenization and dependency arcs (``q_tokenized``, ``q_dep_i``,
          ...), which are copied straight into the annotation; no vocabulary
          id lookup happens (``*_ids`` are ``None``).
        * plain mode: the question and the first support are tokenized and
          mapped to vocabulary ids here; all dependency fields are ``None``.

        The answer label is the id of the first answer's text, or 0 when no
        answers are supplied.
        """
        if answers is None:
            label = 0
        else:
            label = self.shared_resources.answer_vocab(answers[0].text)

        if self.shared_resources.config.get("use_dep_sa", False):
            # Pre-tokenized input with dependency structure attached.
            return MCAnnotation(
                question_tokens=question.q_tokenized,
                question_ids=None,
                question_length=len(question.q_tokenized),
                support_tokens=question.s_tokenized,
                support_ids=None,
                support_length=len(question.s_tokenized),
                answer=label,
                id=idd,
                question_dep_i=question.q_dep_i,
                question_dep_j=question.q_dep_j,
                question_dep_type=question.q_dep_type,
                support_dep_i=question.s_dep_i,
                support_dep_j=question.s_dep_j,
                support_dep_type=question.s_dep_type,
            )

        lowercase = self.shared_resources.config.get('lowercase', True)
        vocab = self.shared_resources.vocab

        q_tokens, q_ids, q_len, _, _ = preprocessing.nlp_preprocess(
            question.question, vocab, lowercase=lowercase)
        s_tokens, s_ids, s_len, _, _ = preprocessing.nlp_preprocess(
            question.support[0], vocab, lowercase=lowercase)

        return MCAnnotation(
            question_tokens=q_tokens,
            question_ids=q_ids,
            question_length=q_len,
            support_tokens=s_tokens,
            support_ids=s_ids,
            support_length=s_len,
            answer=label,
            id=idd,
            question_dep_i=None,
            question_dep_j=None,
            question_dep_type=None,
            support_dep_i=None,
            support_dep_j=None,
            support_dep_type=None,
        )
Ejemplo n.º 4
0
    def preprocess(self, questions: List[QASetting], answers: Optional[List[List[Answer]]] = None,
                   is_eval: bool = False) -> List[Mapping[str, any]]:
        """Preprocess a batch of QA instances with spaCy tokens and lemmas.

        Each instance's question and first support document are tokenized
        with spaCy (lower-cased, lemmas included). When ``answers`` is given,
        an ``"answers"`` key holding the first answer's label id is added.

        Args:
            questions: the QA instances to preprocess.
            answers: optional per-instance answer lists, parallel to
                ``questions``.
            is_eval: unused here; kept for interface compatibility.

        Returns:
            One mapping per instance with tokens, lemmas, lengths, and the
            instance index under ``'ids'``.
        """
        vocab = self.shared_resources.vocab

        batch = []
        for idx, setting in enumerate(questions):
            q_tokens, _, q_len, q_lemmas, _ = preprocessing.nlp_preprocess(
                setting.question, vocab, lowercase=True, with_lemmas=True, use_spacy=True)
            s_tokens, _, s_len, s_lemmas, _ = preprocessing.nlp_preprocess(
                setting.support[0], vocab, lowercase=True, with_lemmas=True, use_spacy=True)

            entry = {
                'support_tokens': s_tokens,
                'support_lemmas': s_lemmas,
                'support_lengths': s_len,
                'question_tokens': q_tokens,
                'question_lemmas': q_lemmas,
                'question_lengths': q_len,
                'ids': idx,
            }
            if answers is not None:
                entry["answers"] = self.shared_resources.answer_vocab(answers[idx][0].text)
            batch.append(entry)

        return batch
Ejemplo n.º 5
0
def test_vocab():
    """Smoke test: build a vocabulary from one training example, prune it,
    and encode the example's question with the pruned vocabulary."""
    train_data = [
        QASetting(question='A person is training his horse for a competition.',
                  support=['A person on a horse jumps over a broken down airplane.'],
                  candidates=['entailment', 'neutral', 'contradiction'])
    ]

    print('build vocab based on train data')
    train_vocab = preprocessing.fill_vocab(train_data)
    train_vocab.freeze()
    pprint(train_vocab._sym2freqs)
    pprint(train_vocab._sym2id)

    # Keep only symbols seen at least twice, capped at 10 vocabulary entries.
    min_freq, max_cnt = 2, 10
    train_vocab = train_vocab.prune(min_freq, max_cnt)

    pprint(train_vocab._sym2freqs)
    pprint(train_vocab._sym2id)

    print('encode train data')
    train_data = preprocessing.nlp_preprocess(train_data[0].question, train_vocab)[0]
    print(train_data)
Ejemplo n.º 6
0
def prepare_data(qa_setting: QASetting,
                 answers: Optional[List[Answer]],
                 vocab: Vocab,
                 lowercase: bool = False,
                 with_answers: bool = False,
                 wiq_contentword: bool = False,
                 spacy_nlp: bool = False,
                 max_support_length: int = None,
                 lemmatize=False,
                 with_lemmas=False) \
        -> Tuple[List[str], List[int], Optional[List[int]], int,
                 List[List[str]], List[List[int]], Optional[List[List[int]]], List[int],
                 List[List[float]], List[List[int]], List[List[Tuple[int, int]]]]:
    """Preprocesses a question and (optionally) answers:
    The steps include tokenization, lower-casing, translation to IDs,
    computing the word-in-question feature, computing token offsets,
    truncating supports, and computing answer spans.

    All per-support outputs are parallel lists with one entry per document
    in ``qa_setting.support``.

    Args:
        qa_setting: question plus its support documents.
        answers: gold answers with character spans; only read when
            ``with_answers`` is set.
        vocab: maps tokens to ids.
        lowercase: lower-case text before id lookup.
        with_answers: compute token-level answer spans from ``answers``.
        wiq_contentword: restrict the word-in-question feature to
            alphanumeric (content) tokens.
        spacy_nlp: tokenize with spaCy.
        max_support_length: if a positive int, supports longer than this are
            cropped around the answers; ``None`` disables cropping.
        lemmatize: forwarded to ``preprocessing.nlp_preprocess``.
        with_lemmas: also produce lemmas and base word-in-question on them.

    Returns:
        Question tokens/ids/lemmas/length, followed by per-support lists of
        tokens, ids, lemmas, lengths, word-in-question features, token
        offsets, and (inclusive) answer spans.
    """
    supports = qa_setting.support
    question = qa_setting.question

    question_tokens, question_ids, question_length, question_lemmas, _ = preprocessing.nlp_preprocess(
        question, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=False)
    question_tokens_set = set(t.lower() for t in question_tokens)

    # Supports additionally need token offsets so character-level answer
    # spans can be mapped back to token positions.
    preprocessed_supports = [
        preprocessing.nlp_preprocess(
            support, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
            lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=True)
        for support in supports]

    # Transpose the per-support result tuples into parallel lists.
    all_support_tokens = [s[0] for s in preprocessed_supports]
    all_support_ids = [s[1] for s in preprocessed_supports]
    all_support_length = [s[2] for s in preprocessed_supports]
    all_support_lemmas = [s[3] for s in preprocessed_supports]
    all_token_offsets = [s[4] for s in preprocessed_supports]

    # Fixed seed: the random crop margin below is reproducible per call.
    rng = random.Random(12345)

    # Word-in-question: 1.0 for each support token/lemma that also occurs in
    # the question (optionally restricted to content words).
    all_word_in_question = []
    if with_lemmas:
        assert all_support_lemmas is not None
        for support_lemmas in all_support_lemmas:
            all_word_in_question.append([])
            if with_lemmas:  # always true here; redundant re-check of the outer condition
                for lemma in support_lemmas:
                    # NOTE(review): ``is_stop`` is a spaCy Token attribute. If
                    # ``support_lemmas`` holds plain strings this raises
                    # AttributeError when wiq_contentword is set — confirm the
                    # element type produced by nlp_preprocess.
                    all_word_in_question[-1].append(float(
                        lemma in question_lemmas and (not wiq_contentword or (lemma.isalnum() and not lemma.is_stop))))
    else:
        for support_tokens in all_support_tokens:
            all_word_in_question.append([])
            for token in support_tokens:
                all_word_in_question[-1].append(
                    float(token.lower() in question_tokens_set and (not wiq_contentword or token.isalnum())))

    all_answer_spans = []
    for doc_idx, support_tokens in enumerate(all_support_tokens):
        # Earliest answer start / latest answer end in this document, used to
        # place the crop window when truncating below.
        min_answer = len(support_tokens)
        max_answer = 0
        token_offsets = all_token_offsets[doc_idx]

        answer_spans = []
        if with_answers:
            assert isinstance(answers, list)
            for a in answers:
                # Each answer carries the index of the support it belongs to.
                if a.doc_idx != doc_idx:
                    continue

                # First token whose character offset reaches the answer start.
                start = 0
                while start < len(token_offsets) and token_offsets[start] < a.span[0]:
                    start += 1

                # Answer starts past the last token: cannot be mapped — skip.
                if start == len(token_offsets):
                    continue

                # Inclusive end token: advance while the next token still
                # begins before the answer's character end.
                end = start
                while end + 1 < len(token_offsets) and token_offsets[end + 1] < a.span[1]:
                    end += 1

                if (start, end) not in answer_spans:
                    answer_spans.append((start, end))
                    min_answer = min(min_answer, start)
                    max_answer = max(max_answer, end)

        # cut support whenever there is a maximum allowed length and recompute answer spans
        support_length = all_support_length[doc_idx]
        if max_support_length is not None and support_length > max_support_length > 0:
            if max_answer < max_support_length:
                # All answers fit inside a prefix: plain truncation.
                # Find new start and end in the flattened support
                new_start = 0
                new_end = max_support_length
            else:
                # Otherwise choose a window around the answers with a small
                # random margin, dropping the latest-ending answers until the
                # window fits.
                # NOTE(review): the filter below can empty answer_spans (if
                # every remaining span ends at new_end), in which case max()
                # raises ValueError — appears unhandled.
                offset = rng.randint(1, 11)
                new_end = max_answer
                new_start = max(0, min(min_answer, new_end + 2 * offset - max_support_length))
                while new_end - new_start > max_support_length - 2 * offset:
                    answer_spans = [(s, e) for s, e in answer_spans if e < new_end]
                    new_end = max(answer_spans, key=lambda span: span[1])[1]
                    new_start = max(0, min(min_answer, new_end + 2 * offset - max_support_length))
                new_end = min(new_end + offset, support_length)
                new_start = max(new_start - offset, 0)

            # Crop support according to new start and end pointers
            all_support_tokens[doc_idx] = support_tokens[new_start:new_end]
            all_support_ids[doc_idx] = all_support_ids[doc_idx][new_start:new_end]
            if with_lemmas:
                all_support_lemmas[doc_idx] = all_support_lemmas[doc_idx][new_start:new_end]
            # Shift the surviving spans into cropped-support coordinates.
            answer_spans = [(s - new_start, e - new_start) for s, e in answer_spans]
            all_word_in_question[doc_idx] = all_word_in_question[doc_idx][new_start:new_end]
            all_support_length[doc_idx] = new_end - new_start
            all_token_offsets[doc_idx] = token_offsets[new_start:new_end]
        all_answer_spans.append(answer_spans)

    return question_tokens, question_ids, question_lemmas, question_length, \
           all_support_tokens, all_support_ids, all_support_lemmas, all_support_length, \
           all_word_in_question, all_token_offsets, all_answer_spans
Ejemplo n.º 7
0
def prepare_data(qa_setting: QASetting,
                 answers: Optional[List[Answer]],
                 vocab: Vocab,
                 lowercase: bool = False,
                 with_answers: bool = False,
                 wiq_contentword: bool = False,
                 spacy_nlp: bool = False,
                 max_support_length: int = -1,
                 lemmatize=False,
                 with_lemmas=False) \
        -> Tuple[List[str], List[int], Optional[List[int]], int,
                     List[str], List[int], Optional[List[int]], int,
                     List[float], List[int], List[Tuple[int, int]]]:
    """Preprocesses a question and (optionally) answers:
    The steps include tokenization, lower-casing, translation to IDs,
    computing the word-in-question feature, computing token offsets,
    truncating supports, and computing answer spans.

    All support documents are joined into one space-separated string and
    processed as a single sequence; answer character spans are therefore
    interpreted relative to the joined text.

    Args:
        qa_setting: question plus its support documents.
        answers: gold answers with character spans; only read when
            ``with_answers`` is set.
        vocab: maps tokens to ids.
        lowercase: lower-case text before id lookup.
        with_answers: compute token-level answer spans from ``answers``.
        wiq_contentword: restrict the word-in-question feature to
            alphanumeric (content) tokens.
        spacy_nlp: tokenize with spaCy.
        max_support_length: if a positive int, supports longer than this are
            cropped around the answers; ``None``/-1 disables cropping.
        lemmatize: forwarded to ``preprocessing.nlp_preprocess``.
        with_lemmas: also produce lemmas and base word-in-question on them.

    Returns:
        Question tokens/ids/lemmas/length, then support tokens, ids, lemmas,
        length, word-in-question features, token offsets, and (inclusive)
        answer spans.
    """
    support = " ".join(qa_setting.support)
    question = qa_setting.question

    question_tokens, question_ids, question_length, question_lemmas, _ = preprocessing.nlp_preprocess(
        question, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=False)

    # Token offsets are needed to map character answer spans to tokens.
    support_tokens, support_ids, support_length, support_lemmas, token_offsets = preprocessing.nlp_preprocess(
        support, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=True)

    # Fixed seed: the random crop margin below is reproducible per call.
    rng = random.Random(12345)

    word_in_question = []

    if with_lemmas:
        assert support_lemmas is not None
        for lemma in support_lemmas:
            # NOTE(review): ``is_stop`` is a spaCy Token attribute. If the
            # lemmas are plain strings this raises AttributeError when
            # wiq_contentword is set — confirm the element type.
            word_in_question.append(float(lemma in question_lemmas and
                                          (not wiq_contentword or (lemma.isalnum() and not lemma.is_stop))))
    else:
        # NOTE(review): membership test is case-sensitive and linear in the
        # question length (list, not set) — the multi-support variant of this
        # function lower-cases and uses a set; confirm which is intended.
        for token in support_tokens:
            word_in_question.append(float(token in question_tokens and (not wiq_contentword or token.isalnum())))

    # Earliest answer start / latest answer end, used to place the crop window.
    min_answer = len(support_tokens)
    max_answer = 0

    answer_spans = []
    if with_answers:
        assert isinstance(answers, list)
        for a in answers:
            # First token whose character offset reaches the answer start.
            start = 0
            while start < len(token_offsets) and token_offsets[start] < a.span[0]:
                start += 1

            # Answer starts past the last token: cannot be mapped — skip.
            if start == len(token_offsets):
                continue

            # Inclusive end token: advance while the next token still begins
            # before the answer's character end.
            end = start
            while end + 1 < len(token_offsets) and token_offsets[end + 1] < a.span[1]:
                end += 1
            if (start, end) not in answer_spans:
                answer_spans.append((start, end))
                min_answer = min(min_answer, start)
                max_answer = max(max_answer, end)

    # cut support whenever there is a maximum allowed length and recompute answer spans
    if max_support_length is not None and len(support_tokens) > max_support_length > 0:
        # NOTE(review): support_length is fixed to max_support_length here,
        # but the window crop in the else-branch can keep fewer tokens
        # (new_end - new_start may be smaller) — verify consumers tolerate
        # the mismatch.
        support_length = max_support_length
        if max_answer < max_support_length:
            # All answers fit inside a prefix: plain truncation; spans keep
            # their coordinates.
            support_tokens = support_tokens[:max_support_length]
            support_ids = support_ids[:max_support_length]
            if with_lemmas:
                support_lemmas = support_lemmas[:max_support_length]
            word_in_question = word_in_question[:max_support_length]
        else:
            # Otherwise choose a window around the answers with a small
            # random margin, dropping the latest-ending answers until the
            # window fits.
            # NOTE(review): the filter below can empty answer_spans, in which
            # case max() raises ValueError — appears unhandled.
            offset = rng.randint(1, 11)
            new_end = max_answer + offset
            new_start = max(0, min(min_answer - offset, new_end - max_support_length))
            while new_end - new_start > max_support_length:
                answer_spans = [(s, e) for s, e in answer_spans if e < (new_end - offset)]
                new_end = max(answer_spans, key=lambda span: span[1])[1] + offset
                new_start = max(0, min(min_answer - offset, new_end - max_support_length))
            support_tokens = support_tokens[new_start:new_end]
            support_ids = support_ids[new_start:new_end]
            if with_lemmas:
                support_lemmas = support_lemmas[new_start:new_end]
            # Shift the surviving spans into cropped-support coordinates.
            answer_spans = [(s - new_start, e - new_start) for s, e in answer_spans]
            word_in_question = word_in_question[new_start:new_end]

    # NOTE(review): token_offsets is returned uncropped even when the support
    # was truncated above, so offsets then index the original joined text —
    # confirm callers expect this (the multi-support variant crops them).
    return question_tokens, question_ids, question_lemmas, question_length, \
           support_tokens, support_ids, support_lemmas, support_length, \
           word_in_question, token_offsets, answer_spans