def read_one_example(self, inputs): """ inputs keys: question, context """ context_text = inputs["context"].replace("``", '" ').replace("''", '" ') tokenized_context = self.word_tokenizer.tokenize(context_text) context_spans, char_to_word_offset = self._convert_to_spans( context_text, tokenized_context) context_tokens = [ Token(text, span) for (text, span) in zip(tokenized_context, context_spans) ] context_sub_tokens = [] for token in context_tokens: for sub_token in self.sub_level_tokenizer.tokenize(token.text): context_sub_tokens.append(Token(sub_token, token.text_span)) question_text = inputs["question"] question_text = " ".join(self.word_tokenizer.tokenize(question_text)) question_sub_tokens = [ Token(sub_token) for sub_token in self.sub_level_tokenizer.tokenize(question_text) ] bert_tokens, _ = self._make_features_and_labels( context_sub_tokens, question_sub_tokens, -1, -1) features = [] helper = Helper( **{ "bert_token": [], "tokenized_context": tokenized_context, "token_key": "tokenized_context" # for 1-example inference latency key }) for bert_token in bert_tokens: bert_input = [token.text for token in bert_token] bert_feature = BertFeature() bert_feature.set_input(bert_input) features.append(bert_feature.to_dict()) helper.bert_token.append(bert_token) return features, helper.to_dict()
def read_one_example(self, inputs): """ inputs keys: sequence_a and sequence_b """ sequence_a = utils.get_sequence_a(inputs) sequence_b = inputs.get("sequence_b", None) bert_feature = BertFeature() bert_feature.set_input_with_speical_token( sequence_a, sequence_b, self.tokenizer, max_seq_length=self.sequence_max_length, data_type="predict", cls_token=self.cls_token, sep_token=self.sep_token, input_type=self.input_type, ) features = [bert_feature.to_dict()] helper = {} return features, helper
def read_one_example(self, inputs): """ inputs keys: sequence """ sequence_text = inputs["sequence"].strip().replace("\n", "") sequence_tokens = self.word_tokenizer.tokenize(sequence_text) naive_tokens = sequence_text.split() is_head_word = utils.get_is_head_of_word(naive_tokens, sequence_tokens) sequence_sub_tokens = [] tagged_sub_token_idxs = [] curr_sub_token_idx = 1 # skip CLS_TOKEN for token_idx, token in enumerate(sequence_tokens): for sub_token_pos, sub_token in enumerate( self.subword_tokenizer.tokenize(token, unit="word")): sequence_sub_tokens.append(sub_token) if is_head_word[token_idx] and sub_token_pos == 0: tagged_sub_token_idxs.append(curr_sub_token_idx) curr_sub_token_idx += 1 if len(sequence_sub_tokens) > self.sequence_max_length: sequence_sub_tokens = sequence_sub_tokens[:self. sequence_max_length] bert_input = [self.cls_token] + sequence_sub_tokens + [self.sep_token] assert len(naive_tokens) == len(tagged_sub_token_idxs), \ f"""Wrong tagged_sub_token_idxs: followings mismatch. naive_tokens: {naive_tokens} sequence_sub_tokens: {sequence_sub_tokens} tagged_sub_token_idxs: {tagged_sub_token_idxs}""" bert_feature = BertFeature() bert_feature.set_input(bert_input) bert_feature.set_feature("tagged_sub_token_idxs", tagged_sub_token_idxs) bert_feature.set_feature("num_tokens", len(naive_tokens)) features = [bert_feature.to_dict()] helper = {} return features, helper