Example #1
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()

        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None

        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)

        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len
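
A hypothetical tokenizer without a TorchScript port would hit the NotImplementedError branch above, leaving self.tokenizer as None; the exported module then only accepts pre-tokenized input. A minimal sketch of such a tokenizer (not part of PyText, names are illustrative):

class WhitespaceTokenizer:
    # Eager-mode tokenizer used only before export.
    def tokenize(self, text):
        return text.split()

    def torchscriptify(self):
        # No scripted counterpart exists, so export falls back to
        # self.tokenizer = None (pre-tokenized mode only).
        raise NotImplementedError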
Example #2
    def test_xlm_token_tensorizer(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=256,
            default_language="en",
        )
        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token at both sentence boundaries (XLM convention)
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
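
The language ids asserted above follow from the order of ScriptVocabulary(["ar", "cn", "en"]), assuming no special tokens are prepended to the language vocabulary:

# Assumed index mapping (vocabulary order, no specials prepended).
language_vocab = ["ar", "cn", "en"]
assert language_vocab.index("en") == 2   # rows filled with the default language
assert language_vocab.index("cn") == 1   # first row after the per-row override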
Example #3
    def _mock_xlm_tensorizer(self, max_seq_len=256):
        return ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=self._mock_vocab(),
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=max_seq_len,
            default_language="en",
        )
Example #4
    def torchscriptify(self):
        return ScriptDoNothingTokenizer()
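
This hook is how an eager-mode tokenizer advertises its scripted counterpart (compare the torchscriptify call in Examples #1 and #5). A minimal usage sketch, assuming the do-nothing tokenizer passes each pre-split token through as a single (token, start, end) triple; the -1 offsets below are an assumption, not confirmed by the source:

# Sketch only: the exact offsets returned by ScriptDoNothingTokenizer are assumed.
tokenizer = ScriptDoNothingTokenizer()
pieces = []
for token in ["hello", "world"]:
    pieces.extend(tokenizer.tokenize(token))
# expected shape: [("hello", -1, -1), ("world", -1, -1)]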
Example #5
class TokenTensorizerScriptImpl(TensorizerScriptImpl):
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()

        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None

        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)

        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len

    def get_texts_by_index(self, texts: Optional[List[List[str]]],
                           index: int) -> Optional[str]:
        if texts is None or len(texts) == 0:
            return None

        # TokenTensorizer only works with a single text per row, stick with that
        return texts[index][0]

    def get_tokens_by_index(self, tokens: Optional[List[List[List[str]]]],
                            index: int) -> Optional[List[str]]:
        if tokens is None or len(tokens) == 0:
            return None

        # TokenTensorizer only works with a single text per row, stick with that
        return tokens[index][0]

    def _lookup_tokens_1d(
        self, tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], List[int], List[int]]:
        return self.vocab_lookup_1d(
            tokens,
            bos_idx=self.vocab.bos_idx if self.add_bos_token else None,
            eos_idx=self.vocab.eos_idx if self.add_eos_token else None,
            use_eos_token_for_bos=self.use_eos_token_for_bos,
            max_seq_len=self.max_seq_len,
        )

    def tokenize(
            self, row_text: Optional[str],
            row_pre_tokenized: Optional[List[str]]
    ) -> List[Tuple[str, int, int]]:

        tokens: List[Tuple[str, int, int]] = []
        if row_text is not None:
            if self.tokenizer is not None:
                tokens = self.tokenizer.tokenize(row_text)
        elif row_pre_tokenized is not None:
            for token in row_pre_tokenized:
                tokens.extend(self.do_nothing_tokenizer.tokenize(token))

        return tokens

    def numberize(
        self, text_tokens: List[Tuple[str, int, int]]
    ) -> Tuple[List[int], int, List[Tuple[int, int]]]:
        token_indices: List[int] = []
        token_starts: List[int] = []
        token_ends: List[int] = []

        token_indices, token_starts, token_ends = self._lookup_tokens_1d(
            text_tokens)

        token_ranges: List[Tuple[int, int]] = []

        for s, e in zip(token_starts, token_ends):
            token_ranges.append((s, e))

        return token_indices, len(token_indices), token_ranges

    def tensorize(
        self,
        tokens_2d: List[List[int]],
        seq_lens_1d: List[int],
        positions_2d: List[List[Tuple[int, int]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

        token_indices_tensor: torch.Tensor = torch.tensor(
            pad_2d(tokens_2d, seq_lens=seq_lens_1d,
                   pad_idx=self.vocab.pad_idx),
            dtype=torch.long,
        )

        token_starts_2d: List[List[int]] = []
        token_ends_2d: List[List[int]] = []

        for position_list in positions_2d:
            token_starts_2d.append([x[0] for x in position_list])
            token_ends_2d.append([x[1] for x in position_list])

        token_positions_tensor = torch.stack(
            [
                torch.tensor(
                    pad_2d(token_starts_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
                torch.tensor(
                    pad_2d(token_ends_2d, seq_lens=seq_lens_1d, pad_idx=-1),
                    dtype=torch.long,
                ),
            ],
            dim=2,
        )

        return (
            token_indices_tensor,
            torch.tensor(seq_lens_1d, dtype=torch.long),
            token_positions_tensor,
        )

    def forward(
        self, inputs: ScriptBatchInput
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

        tokens_2d: List[List[int]] = []
        seq_lens_1d: List[int] = []
        positions_2d: List[List[Tuple[int, int]]] = []

        for idx in range(self.batch_size(inputs)):
            tokens: List[Tuple[str, int, int]] = self.tokenize(
                self.get_texts_by_index(inputs.texts, idx),
                self.get_tokens_by_index(inputs.tokens, idx),
            )

            numberized: Tuple[List[int], int,
                              List[Tuple[int, int]]] = self.numberize(tokens)
            tokens_2d.append(numberized[0])
            seq_lens_1d.append(numberized[1])
            positions_2d.append(numberized[2])

        return self.tensorize(tokens_2d, seq_lens_1d, positions_2d)
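
forward() above is just a per-row loop over tokenize, numberize and tensorize. A minimal sketch of that pipeline for a pre-tokenized batch, assuming impl is an already-constructed TokenTensorizerScriptImpl instance:

# Sketch of the per-row pipeline that forward() runs; `impl` is assumed to be
# a constructed TokenTensorizerScriptImpl and the batch arrives pre-tokenized.
pre_tokenized = [["the", "cat", "sat"], ["hello", "world"]]

tokens_2d, seq_lens_1d, positions_2d = [], [], []
for row in pre_tokenized:
    pieces = impl.tokenize(row_text=None, row_pre_tokenized=row)
    token_indices, seq_len, token_ranges = impl.numberize(pieces)
    tokens_2d.append(token_indices)
    seq_lens_1d.append(seq_len)
    positions_2d.append(token_ranges)

token_tensor, seq_len_tensor, position_tensor = impl.tensorize(
    tokens_2d, seq_lens_1d, positions_2d)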
Example #6
    def _mock_roberta_tensorizer(self, max_seq_len=100):
        return ScriptRoBERTaTensorizerWithIndices(
            tokenizer=ScriptDoNothingTokenizer(),
            vocab=self._mock_vocab(),
            max_seq_len=max_seq_len,
        )
Example #7
    def test_xlm_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=20,
            default_language="en",
        )

        seq_padding_control = [0, 32, 256]
        xlm.set_padding_control("sequence_length", seq_padding_control)

        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(30)],
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        token_count = [len(t) + 2 for t in rand_tokens]
        expected_batch_size = len(rand_tokens)
        expected_token_size = min(
            max(max(token_count), seq_padding_control[1]), xlm.max_seq_len)
        expected_padding_count = [
            max(0, expected_token_size - cnt) for cnt in token_count
        ]
        token_count = [
            expected_token_size - cnt for cnt in expected_padding_count
        ]

        # verify tensorized tokens padding
        tokens = tokens.tolist()
        self.assertEqual(len(tokens), expected_batch_size)
        # every row should be padded (or truncated) to the same expected length
        self.assertEqual(max(len(t) for t in tokens), expected_token_size)
        self.assertEqual(min(len(t) for t in tokens), expected_token_size)
        for i in range(expected_batch_size):
            self.assertEqual(tokens[i][token_count[i]:],
                             [200] * expected_padding_count[i])

        # verify tensorized languages
        languages = languages.tolist()
        self.assertEqual(len(languages), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(languages[i][:token_count[i]],
                             [2] * token_count[i])
            self.assertEqual(languages[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify tensorized positions
        positions = positions.tolist()
        self.assertEqual(len(positions), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(positions[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify pad_masks
        pad_masks = pad_masks.tolist()
        self.assertEqual(len(pad_masks), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(pad_masks[i][:token_count[i]],
                             [1] * token_count[i])
            self.assertEqual(pad_masks[i][token_count[i]:],
                             [0] * expected_padding_count[i])
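
The expected sequence length in this test is the interplay of three values: the longest row after the two special tokens (32), the padding-control bucket the test reads (seq_padding_control[1] = 32), and max_seq_len (20), which caps the result. Spelled out with the concrete numbers used above:

# Worked example of the padding-size arithmetic from the test above.
token_count = [30 + 2, 20 + 2, 10 + 2]          # [32, 22, 12]
seq_padding_control = [0, 32, 256]
max_seq_len = 20
expected_token_size = min(max(max(token_count), seq_padding_control[1]), max_seq_len)
assert expected_token_size == 20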