def _tokenize(self, utterance):
  """Tokenize the utterance using word-piece tokenization used by BERT.

  Args:
    utterance: A string containing the utterance to be tokenized.

  Returns:
    bert_tokens: A list of tokens obtained by word-piece tokenization of the
      utterance.
    alignments: A dict mapping indices of characters corresponding to start
      and end positions of words (not subwords) to corresponding indices in
      bert_tokens list.
    inverse_alignments: A list of size equal to bert_tokens. Each element is
      a tuple containing the index of the starting and inclusive ending
      character of the word corresponding to the subword. This list is used
      during inference to map word-piece indices to spans in the original
      utterance.
  """
  utterance = tokenization.convert_to_unicode(utterance)
  # After _naive_tokenize, spaces and punctuation marks are all retained,
  # i.e. direct concatenation of all the tokens in the sequence will be the
  # original string.
  tokens = _naive_tokenize(utterance)
  # Filter out empty tokens and obtain the aligned character index for each
  # token.
  alignments = {}
  char_index = 0
  bert_tokens = []
  # These lists store inverse alignments to be used during inference.
  bert_tokens_start_chars = []
  bert_tokens_end_chars = []
  for token in tokens:
    if token.strip():
      subwords = self._tokenizer.tokenize(token)
      # Store the alignment for the index of the starting character and the
      # inclusive ending character of the token.
      alignments[char_index] = len(bert_tokens)
      bert_tokens_start_chars.extend([char_index] * len(subwords))
      bert_tokens.extend(subwords)
      # The inclusive ending character index corresponding to the word.
      inclusive_char_end = char_index + len(token) - 1
      alignments[inclusive_char_end] = len(bert_tokens) - 1
      bert_tokens_end_chars.extend([inclusive_char_end] * len(subwords))
    # char_index advances for every token, including the whitespace tokens
    # that are skipped above, so it always points into the original string.
    char_index += len(token)
  inverse_alignments = list(
      zip(bert_tokens_start_chars, bert_tokens_end_chars))
  return bert_tokens, alignments, inverse_alignments
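
# Illustrative sketch (not part of the original module): with a standard
# lowercasing BERT wordpiece vocab, the utterance "San Jose?" would roughly
# produce the outputs below. The exact subword splits depend on the vocab
# file in use, so treat the concrete values as assumptions.
#
#   bert_tokens, alignments, inverse_alignments = self._tokenize("San Jose?")
#   # _naive_tokenize -> ["San", " ", "Jose", "?"]; the space token yields no
#   # subwords but still advances char_index.
#   # bert_tokens        -> ["san", "jose", "?"]
#   # alignments         -> {0: 0, 2: 0, 4: 1, 7: 1, 8: 2}
#   # inverse_alignments -> [(0, 2), (4, 7), (8, 8)]
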
def _create_feature(self,
                    input_line,
                    embedding_tensor_name,
                    service_id,
                    intent_or_slot_id,
                    value_id=-1):
  """Create a single InputFeatures instance."""
  seq_length = self._max_seq_length
  line = tokenization.convert_to_unicode(input_line)
  line = line.strip()
  text_a = None
  text_b = None
  m = re.match(r"^(.*) \|\|\| (.*)$", line)
  if m is None:
    text_a = line
  else:
    text_a = m.group(1)
    text_b = m.group(2)

  tokens_a = self._tokenizer.tokenize(text_a)
  tokens_b = None
  if text_b:
    tokens_b = self._tokenizer.tokenize(text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3".
    data_utils.truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2".
    if len(tokens_a) > seq_length - 2:
      tokens_a = tokens_a[0:(seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it
  # makes it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.
  tokens = []
  input_type_ids = []
  tokens.append("[CLS]")
  input_type_ids.append(0)
  for token in tokens_a:
    tokens.append(token)
    input_type_ids.append(0)
  tokens.append("[SEP]")
  input_type_ids.append(0)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      input_type_ids.append(1)
    tokens.append("[SEP]")
    input_type_ids.append(1)

  input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  while len(input_ids) < seq_length:
    input_ids.append(0)
    input_mask.append(0)
    input_type_ids.append(0)

  assert len(input_ids) == seq_length
  assert len(input_mask) == seq_length
  assert len(input_type_ids) == seq_length

  return InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      input_type_ids=input_type_ids,
      embedding_tensor_name=embedding_tensor_name,
      service_id=service_id,
      intent_or_slot_id=intent_or_slot_id,
      value_id=value_id)
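
# Illustrative sketch (assumed values, not from the original module): for an
# input line "hello there ||| general kenobi" with _max_seq_length = 10, and
# assuming "kenobi" splits into ["ken", "##obi"] under the vocab in use, the
# resulting feature would be laid out roughly as follows:
#
#   tokens:         [CLS] hello there [SEP] general ken ##obi [SEP] (+2 pad)
#   input_type_ids: 0     0     0     0     1       1   1     1     0  0
#   input_mask:     1     1     1     1     1       1   1     1     0  0
#
# input_ids holds the vocab ids of `tokens`, zero-padded out to seq_length,
# so all three lists have exactly _max_seq_length elements.
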