def test_xlm_token_tensorizer(self):
    vocab = self._mock_vocab()

    xlm = ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=vocab,
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=256,
        default_language="en",
    )
    rand_tokens = [
        [str(random.randint(100, 200)) for i in range(20)],
        [str(random.randint(100, 200)) for i in range(10)],
    ]

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    tokens = tokens.tolist()
    # eos token
    self.assertEqual(tokens[0][0], 202)
    self.assertEqual(tokens[0][-1], 202)
    # pad token
    self.assertEqual(tokens[1][12:], [200] * 10)

    languages = languages.tolist()
    self.assertEqual(languages[0], [2] * len(tokens[0]))
    self.assertEqual(languages[1][12:], [0] * 10)

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens),
        languages=squeeze_1d(["cn", "en"]),
    )
    languages = languages.tolist()
    self.assertEqual(languages[0][:], [1] * len(tokens[0]))
    self.assertEqual(languages[1][:12], [2] * 12)
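# Illustrative sketch (not in the original tests): the language-id layout the
# assertions above check. With language_vocab ["ar", "cn", "en"], "cn" maps to
# 1 and "en" to 2; each row's language id is broadcast over its real tokens
# (including the two EOS positions) and the padded tail falls back to 0.
# The helper name below is a placeholder, not PyText API.
def _expected_language_row(language_id, real_len, padded_len):
    return [language_id] * real_len + [0] * (padded_len - real_len)

# second row: 10 tokens + 2 EOS = 12 real positions, padded to 22 like row one
assert _expected_language_row(2, 12, 22) == [2] * 12 + [0] * 10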
def _mock_xlm_tensorizer(self, max_seq_len=256):
    return ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=self._mock_vocab(),
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=max_seq_len,
        default_language="en",
    )
def torchscriptify(self):
    return ScriptDoNothingTokenizer()
class TokenTensorizerScriptImpl(TensorizerScriptImpl):
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()
        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None
        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len

    def get_texts_by_index(
        self, texts: Optional[List[List[str]]], index: int
    ) -> Optional[str]:
        if texts is None or len(texts) == 0:
            return None
        # TokenTensorizer only works with a single text per row, stick with that
        return texts[index][0]

    def get_tokens_by_index(
        self, tokens: Optional[List[List[List[str]]]], index: int
    ) -> Optional[List[str]]:
        if tokens is None or len(tokens) == 0:
            return None
        # TokenTensorizer only works with a single text per row, stick with that
        return tokens[index][0]

    def _lookup_tokens_1d(
        self, tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], List[int], List[int]]:
        return self.vocab_lookup_1d(
            tokens,
            bos_idx=self.vocab.bos_idx if self.add_bos_token else None,
            eos_idx=self.vocab.eos_idx if self.add_eos_token else None,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    def tokenize(
        self, row_text: Optional[str], row_pre_tokenized: Optional[List[str]]
    ) -> List[Tuple[str, int, int]]:
        tokens: List[Tuple[str, int, int]] = []
        if row_text is not None:
            if self.tokenizer is not None:
                tokens = self.tokenizer.tokenize(row_text)
        elif row_pre_tokenized is not None:
            for token in row_pre_tokenized:
                tokens.extend(self.do_nothing_tokenizer.tokenize(token))
        return tokens

    def numberize(
        self, text_tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], int, List[Tuple[int, int]]]:
        token_indices: List[int] = []
        token_starts: List[int] = []
        token_ends: List[int] = []
        token_indices, token_starts, token_ends = self._lookup_tokens_1d(text_tokens)
        token_ranges: List[Tuple[int, int]] = []
        for s, e in zip(token_starts, token_ends):
            token_ranges.append((s, e))
        return token_indices, len(token_indices), token_ranges

    def tensorize(
        self,
        tokens_2d: List[List[int]],
        seq_lens_1d: List[int],
        positions_2d: List[List[Tuple[int, int]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        token_indices_tensor: torch.Tensor = torch.tensor(
            pad_2d(tokens_2d, seq_lens=seq_lens_1d, pad_idx=self.vocab.pad_idx),
            dtype=torch.long,
        )
        token_starts_2d: List[List[int]] = []
        token_ends_2d: List[List[int]] = []
        for position_list in positions_2d:
            token_starts_2d.append([x[0] for x in position_list])
            token_ends_2d.append([x[1] for x in position_list])
        token_positions_tensor = torch.stack(
            [
                torch.tensor(
                    pad_2d(token_starts_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
                torch.tensor(
                    pad_2d(token_ends_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
            ],
            dim=2,
        )
        return (
            token_indices_tensor,
            torch.tensor(seq_lens_1d, dtype=torch.long),
            token_positions_tensor,
        )

    def forward(
        self, inputs: ScriptBatchInput
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        tokens_2d: List[List[int]] = []
        seq_lens_1d: List[int] = []
        positions_2d: List[List[Tuple[int, int]]] = []
        for idx in range(self.batch_size(inputs)):
            tokens: List[Tuple[str, int, int]] = self.tokenize(
                self.get_texts_by_index(inputs.texts, idx),
                self.get_tokens_by_index(inputs.tokens, idx),
            )
            numberized: Tuple[List[int], int, List[Tuple[int, int]]] = self.numberize(
                tokens
            )
            tokens_2d.append(numberized[0])
            seq_lens_1d.append(numberized[1])
            positions_2d.append(numberized[2])
        return self.tensorize(tokens_2d, seq_lens_1d, positions_2d)
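# Illustrative sketch (not part of the original source): driving the scripted
# impl above end to end in pre-tokenized mode. `vocab` stands for any object
# exposing get_pad_index()/get_bos_index()/get_eos_index(), and the
# ScriptBatchInput construction below (texts / tokens / languages fields) is
# an assumption about that batch structure, not verbatim API.
def _example_token_tensorizer_usage(vocab):
    impl = TokenTensorizerScriptImpl(
        add_bos_token=True,
        add_eos_token=True,
        use_eos_token_for_bos=False,
        max_seq_len=16,
        vocab=vocab,
        tokenizer=None,  # no scripted tokenizer: rely on pre-tokenized input
    )
    batch = ScriptBatchInput(
        texts=None,
        tokens=[[["hello", "world"]], [["world"]]],  # one inner list per row
        languages=None,
    )
    # forward() tokenizes (here: passes the pre-tokenized rows through),
    # numberizes each row, then pads and stacks the batch into
    # (token_ids, seq_lens, token_ranges) tensors.
    token_ids, seq_lens, token_ranges = impl.forward(batch)
    return token_ids, seq_lens, token_ranges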
def _mock_roberta_tensorizer(self, max_seq_len=100):
    return ScriptRoBERTaTensorizerWithIndices(
        tokenizer=ScriptDoNothingTokenizer(),
        vocab=self._mock_vocab(),
        max_seq_len=max_seq_len,
    )
def test_xlm_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
    vocab = self._mock_vocab()
    xlm = ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=vocab,
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=20,
        default_language="en",
    )
    seq_padding_control = [0, 32, 256]
    xlm.set_padding_control("sequence_length", seq_padding_control)

    rand_tokens = [
        [str(random.randint(100, 200)) for i in range(30)],
        [str(random.randint(100, 200)) for i in range(20)],
        [str(random.randint(100, 200)) for i in range(10)],
    ]

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens),
    )
    # each row gains two EOS tokens (one at each end)
    token_count = [len(t) + 2 for t in rand_tokens]
    expected_batch_size = len(rand_tokens)
    expected_token_size = min(
        max(max(token_count), seq_padding_control[1]), xlm.max_seq_len
    )
    expected_padding_count = [
        max(0, expected_token_size - cnt) for cnt in token_count
    ]
    token_count = [expected_token_size - cnt for cnt in expected_padding_count]

    # verify tensorized tokens padding
    tokens = tokens.tolist()
    self.assertEqual(len(tokens), expected_batch_size)
    self.assertEqual(max(len(t) for t in tokens), expected_token_size)
    self.assertEqual(min(len(t) for t in tokens), expected_token_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            tokens[i][token_count[i]:], [200] * expected_padding_count[i]
        )

    # verify tensorized languages
    languages = languages.tolist()
    self.assertEqual(len(languages), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(languages[i][: token_count[i]], [2] * token_count[i])
        self.assertEqual(
            languages[i][token_count[i]:], [0] * expected_padding_count[i]
        )

    # verify tensorized positions
    positions = positions.tolist()
    self.assertEqual(len(positions), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            positions[i][token_count[i]:], [0] * expected_padding_count[i]
        )

    # verify pad_masks
    pad_masks = pad_masks.tolist()
    self.assertEqual(len(pad_masks), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(pad_masks[i][: token_count[i]], [1] * token_count[i])
        self.assertEqual(
            pad_masks[i][token_count[i]:], [0] * expected_padding_count[i]
        )
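# Illustrative sketch (not in the original tests): the padded-length rule the
# test above exercises, pulled out as a standalone helper. `bucket` plays the
# role of seq_padding_control[1]; the names are placeholders, not PyText API.
def _expected_padded_length(token_counts, bucket, max_seq_len):
    # pad the batch up to the padding-control bucket, but never past
    # max_seq_len; rows longer than max_seq_len are truncated to it
    return min(max(max(token_counts), bucket), max_seq_len)

# e.g. rows of 32/22/12 tokens (incl. two EOS), bucket 32, max_seq_len 20
# -> every row in the batch is tensorized to length 20
assert _expected_padded_length([32, 22, 12], bucket=32, max_seq_len=20) == 20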