Example #1
    def read_one_example(self, inputs):
        """ inputs keys: question, context """
        context_text = inputs["context"].replace("``",
                                                 '" ').replace("''", '" ')
        tokenized_context = self.word_tokenizer.tokenize(context_text)
        context_spans, char_to_word_offset = self._convert_to_spans(
            context_text, tokenized_context)
        context_tokens = [
            Token(text, span)
            for (text, span) in zip(tokenized_context, context_spans)
        ]

        # Expand each word token into sub-word tokens; each sub-token keeps
        # the character span of its parent word token.
        context_sub_tokens = []
        for token in context_tokens:
            for sub_token in self.sub_level_tokenizer.tokenize(token.text):
                context_sub_tokens.append(Token(sub_token, token.text_span))

        question_text = inputs["question"]
        question_text = " ".join(self.word_tokenizer.tokenize(question_text))
        question_sub_tokens = [
            Token(sub_token)
            for sub_token in self.sub_level_tokenizer.tokenize(question_text)
        ]

        bert_tokens, _ = self._make_features_and_labels(
            context_sub_tokens, question_sub_tokens, -1, -1)

        features = []
        helper = Helper(
            bert_token=[],
            tokenized_context=tokenized_context,
            token_key="tokenized_context",  # helper key used for single-example inference latency
        )

        # Build one feature dict per group of BERT tokens produced above.
        for bert_token in bert_tokens:
            bert_input = [token.text for token in bert_token]

            bert_feature = BertFeature()
            bert_feature.set_input(bert_input)

            features.append(bert_feature.to_dict())
            helper.bert_token.append(bert_token)
        return features, helper.to_dict()
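
The span bookkeeping above is easier to see in isolation: every sub-word token carries the character span of the word it came from, so predictions made over sub-tokens can be mapped back onto the original context string. Below is a minimal, self-contained sketch of that idea; the `Token` namedtuple and `toy_subword_tokenize` are illustrative stand-ins, not the framework's own `Token` class or `sub_level_tokenizer`.

from collections import namedtuple

# Illustrative stand-ins only; the real Token class and tokenizers come from
# the surrounding framework.
Token = namedtuple("Token", ["text", "text_span"], defaults=[None])

def toy_subword_tokenize(word):
    # Toy WordPiece-like splitter: 4-character pieces, continuations get "##".
    pieces = [word[i:i + 4] for i in range(0, len(word), 4)]
    return [pieces[0]] + ["##" + p for p in pieces[1:]]

context = "unbelievable story"
word_tokens, spans, offset = [], [], 0
for word in context.split():
    start = context.index(word, offset)
    spans.append((start, start + len(word)))
    word_tokens.append(word)
    offset = start + len(word)

# Each sub-token inherits the character span of its parent word token, so
# sub-token-level predictions can be mapped back to the raw context text.
context_sub_tokens = [
    Token(piece, span)
    for word, span in zip(word_tokens, spans)
    for piece in toy_subword_tokenize(word)
]
print(context_sub_tokens[:2])
# [Token(text='unbe', text_span=(0, 12)), Token(text='##liev', text_span=(0, 12))]
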
Example #2
    def read_one_example(self, inputs):
        """ inputs keys: sequence """
        sequence_text = inputs["sequence"].strip().replace("\n", "")
        sequence_tokens = self.word_tokenizer.tokenize(sequence_text)
        naive_tokens = sequence_text.split()
        is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens)

        sequence_sub_tokens = []
        tagged_sub_token_idxs = []  # index of the first sub-token of each head word
        curr_sub_token_idx = 1  # skip CLS_TOKEN
        for token_idx, token in enumerate(sequence_tokens):
            for sub_token_pos, sub_token in enumerate(
                    self.subword_tokenizer.tokenize(token, unit="word")):
                sequence_sub_tokens.append(sub_token)
                if is_head_word[token_idx] and sub_token_pos == 0:
                    tagged_sub_token_idxs.append(curr_sub_token_idx)
                curr_sub_token_idx += 1

        if len(sequence_sub_tokens) > self.sequence_max_length:
            sequence_sub_tokens = sequence_sub_tokens[:self.sequence_max_length]

        bert_input = [self.cls_token] + sequence_sub_tokens + [self.sep_token]
        assert len(naive_tokens) == len(tagged_sub_token_idxs), \
            f"""Wrong tagged_sub_token_idxs: the following do not match.
            naive_tokens: {naive_tokens}
            sequence_sub_tokens: {sequence_sub_tokens}
            tagged_sub_token_idxs: {tagged_sub_token_idxs}"""

        bert_feature = BertFeature()
        bert_feature.set_input(bert_input)
        bert_feature.set_feature("tagged_sub_token_idxs",
                                 tagged_sub_token_idxs)
        bert_feature.set_feature("num_tokens", len(naive_tokens))

        features = [bert_feature.to_dict()]
        helper = {}
        return features, helper
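
The head-word tagging step in this example records, for each whitespace token, the position (offset by one for the leading [CLS]) of the first sub-token of its head word, which is why the assert requires exactly one tagged index per naive token. Below is a minimal sketch of that bookkeeping, using a toy splitter in place of the framework's `word_tokenizer`, `subword_tokenizer`, and `utils.get_is_head_of_word`.

def toy_subword_tokenize(word):
    # Toy WordPiece-like splitter: 3-character pieces, continuations get "##".
    pieces = [word[i:i + 3] for i in range(0, len(word), 3)]
    return [pieces[0]] + ["##" + p for p in pieces[1:]]

sequence = "tagging example"
naive_tokens = sequence.split()
word_tokens = naive_tokens                 # toy word tokenizer: whitespace split
is_head_word = [True] * len(word_tokens)   # here every word starts a whitespace token

sequence_sub_tokens, tagged_sub_token_idxs = [], []
curr_sub_token_idx = 1  # position 0 is reserved for [CLS]
for token_idx, token in enumerate(word_tokens):
    for sub_token_pos, sub_token in enumerate(toy_subword_tokenize(token)):
        sequence_sub_tokens.append(sub_token)
        if is_head_word[token_idx] and sub_token_pos == 0:
            tagged_sub_token_idxs.append(curr_sub_token_idx)
        curr_sub_token_idx += 1

assert len(naive_tokens) == len(tagged_sub_token_idxs)  # one index per whitespace token
print(sequence_sub_tokens)    # ['tag', '##gin', '##g', 'exa', '##mpl', '##e']
print(tagged_sub_token_idxs)  # [1, 4]
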