Example #1
    def _get_answer(self, context, context_ids, answer_start, answer_end):
        """Recovers the answer text from a subtoken-level span.

        Decodes `context_ids` back into subtoken strings, aligns them to the
        original `context`, and returns the substring of `context` covered by
        the subtokens from `answer_start` to `answer_end` (inclusive).
        """
        encoder = load_encoder(self.config.vocab_path)
        subtokens = [
            encoder._subtoken_id_to_subtoken_string(s) for s in context_ids
        ]

        # Normalize everything to unicode (Python 2) before matching.
        if not isinstance(subtokens[0], unicode):
            subtokens = [x.decode('utf-8') for x in subtokens]
        if not isinstance(context, unicode):
            context = context.decode('utf-8')
        assert isinstance(context, unicode)
        assert isinstance(subtokens[0], unicode)

        # One (start, end) character span per subtoken, in order.
        spans = tokenizer_util.match_subtokens_to_string(context, subtokens)

        # Slice from the first character of the start subtoken to the last
        # character of the end subtoken.
        start = spans[answer_start][0]
        end = spans[answer_end][1]  # + 1
        text = context[start:end]
        return text
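
Both examples lean on tokenizer_util.match_subtokens_to_string, which is assumed to return one (start, end) character span per subtoken, in order. The stand-in below is a minimal, hypothetical sketch of that contract for the easy case where each subtoken appears verbatim in the context; the real helper presumably also handles tokenizer normalization and whitespace markers.

def match_subtokens_to_string(context, subtokens):
    """Hypothetical stand-in: greedily locates each subtoken in `context`
    and returns one (start, end) character span per subtoken."""
    spans = []
    cursor = 0
    for subtoken in subtokens:
        start = context.find(subtoken, cursor)
        assert start != -1, 'subtoken %r not found in context' % subtoken
        end = start + len(subtoken)
        spans.append((start, end))
        cursor = end
    return spans
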
Example #2
def get_answer_index(context, context_tokens, answer_start, answer):
    """Maps a character-level answer span to token-level indices.

    `answer_start` is the character offset of `answer` inside `context`;
    returns the indices of the first and last tokens overlapping the answer.
    """
    assert isinstance(answer, unicode)
    assert isinstance(context, unicode)
    assert isinstance(context_tokens[0], unicode)
    # One (start, end) character span per token, in order.
    spans = tokenizer_util.match_subtokens_to_string(context, context_tokens)

    answer_end = answer_start + len(answer)
    word_answer_start = None
    word_answer_end = None
    for word_idx, (start, _) in enumerate(spans):
        if (start <= answer_start and
                # Check that we aren't part of the same token.
                (word_answer_start is None or
                 spans[word_answer_start][0] != start)):
            word_answer_start = word_idx
        if start < answer_end:
            word_answer_end = word_idx
    assert word_answer_start <= word_answer_end, (context, context_tokens,
                                                  answer_start, answer)
    return word_answer_start, word_answer_end
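
A hedged usage sketch for Example #2, assuming a plain whitespace tokenization in which every token appears verbatim in the context (so the span matcher behaves like the sketch after Example #1):

context = u'the quick brown fox'
context_tokens = [u'the', u'quick', u'brown', u'fox']
answer = u'quick brown'
answer_start = context.index(answer)  # character offset 4

word_start, word_end = get_answer_index(context, context_tokens,
                                        answer_start, answer)
assert (word_start, word_end) == (1, 2)  # 'quick' .. 'brown'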