Example #1
    def test_xlm_token_tensorizer(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=256,
            default_language="en",
        )
        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
Example #2
    def forward(
        self,
        right_texts: Optional[List[str]] = None,
        left_texts: Optional[List[str]] = None,
        right_tokens: Optional[List[List[str]]] = None,
        left_tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        right_dense_feat: Optional[List[List[float]]] = None,
        left_dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if right_dense_feat is None or left_dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        right_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(right_texts),
            tokens=squeeze_2d(right_tokens),
            languages=squeeze_1d(languages),
        )
        left_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(left_texts),
            tokens=squeeze_2d(left_tokens),
            languages=squeeze_1d(languages),
        )

        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)

        sentence_embedding = self._forward(right_inputs, left_inputs,
                                           right_dense_tensor,
                                           left_dense_tensor)
        return sentence_embedding
Example #3
    def test_xlm_tensorizer_input_sequence_exceeds_max_seq_len(self):
        xlm = self._mock_xlm_tensorizer(max_seq_len=20)
        rand_tokens = self.get_rand_tokens([30, 10])

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        sig_idxs = [len(t) + 2 for t in rand_tokens]
        expected_token_size = min(max(sig_idxs), xlm.max_seq_len)
        expected_token_padding = [
            max(0, expected_token_size - cnt) for cnt in sig_idxs
        ]
        sig_idxs = [
            expected_token_size - cnt for cnt in expected_token_padding
        ]

        padding_key = {
            tokens: 200,
            pad_masks: 0,
            languages: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=len(rand_tokens),
                expected_token_padding=expected_token_padding,
            )
Example #4
 def forward(self,
             tokens: List[List[str]],
             languages: Optional[List[str]] = None):
     input_tensors = self.tensorizer(pre_tokenized=squeeze_2d(tokens),
                                     languages=squeeze_1d(languages))
     logits = self.model(input_tensors)
     return self.output_layer(logits)
Example #5
    def forward(
        self,
        right_dense_feat: List[List[float]],
        left_dense_feat: List[List[float]],
        texts: Optional[List[str]] = None,
        # multi_texts is of shape [batch_size, num_columns]
        multi_texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(texts, multi_texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        input_tensors = self.tensorizer(inputs)
        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)

        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
        if self.tensorizer.device != "":
            right_dense_tensor = right_dense_tensor.to(self.tensorizer.device)
            left_dense_tensor = left_dense_tensor.to(self.tensorizer.device)
        logits = self.model(input_tensors, right_dense_tensor,
                            left_dense_tensor)
        return self.output_layer(logits)
Example #6
    def test_roberta_tensorizer_input_exceeds_max_seq_len(self):
        roberta = self._mock_roberta_tensorizer(max_seq_len=28)

        rand_tokens = self.get_rand_tokens([25, 15, 5, 30])
        expected_batch_size = 4
        expected_token_size = 28

        tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
            tokens=squeeze_2d(rand_tokens))

        sig_idxs = [1 + len(t) + 1 for t in rand_tokens]
        # clamp at zero so a row longer than max_seq_len gets no negative padding
        expected_token_padding = [
            max(0, expected_token_size - num) for num in sig_idxs
        ]
        sig_idxs = [
            expected_token_size - num for num in expected_token_padding
        ]

        padding_key = {
            tokens: 200,
            pad_mask: 0,
            start_indices: 0,
            end_indices: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=expected_batch_size,
                expected_token_padding=expected_token_padding,
            )
Example #7
    def forward(
        self,
        texts: Optional[List[str]] = None,
        # multi_texts is of shape [batch_size, num_columns]
        multi_texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(texts, multi_texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        # call model
        dense_feat = self.normalizer.normalize(dense_feat)
        dense_tensor = torch.tensor(dense_feat, dtype=torch.float)

        sentence_embedding = self._forward(inputs, dense_tensor)
        if self.concat_dense:
            return torch.cat([sentence_embedding, dense_tensor], 1)
        else:
            return sentence_embedding
Example #8
    def test_roberta_tensorizer_default_padding(self):
        roberta = self._mock_roberta_tensorizer()
        rand_tokens = self.get_rand_tokens([20, 5, 15])

        start_placeholder = 1
        end_placeholder = 1
        # number of indices holding significant values for each elem in rand_tokens, i.e. [22, 7, 17]
        sig_idxs = [
            start_placeholder + len(t) + end_placeholder for t in rand_tokens
        ]
        # padding needed to bring each row up to the longest significant length, i.e. [0, 15, 5]
        expected_token_padding = [max(sig_idxs) - num for num in sig_idxs]

        tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
            tokens=squeeze_2d(rand_tokens))

        padding_key = {
            tokens: 200,
            pad_mask: 0,
            start_indices: 0,
            end_indices: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=len(rand_tokens),
                expected_token_padding=expected_token_padding,
            )
Example #9
    def test_xlm_tensorizer_batch_padding(self):
        xlm = self._mock_xlm_tensorizer()

        batch_padding_control = [0, 3, 6]
        xlm.set_padding_control("batch_length", batch_padding_control)

        rand_tokens = self.get_rand_tokens([20, 10])

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        sig_idxs = [len(t) + 2 for t in rand_tokens] + [0]
        expected_token_size = max(sig_idxs)
        expected_batch_size = min(
            max(len(rand_tokens), batch_padding_control[1]), xlm.max_seq_len)
        expected_token_padding = [
            expected_token_size - cnt for cnt in sig_idxs
        ]

        padding_key = {
            tokens: 200,
            pad_masks: 0,
            languages: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=expected_batch_size,
                expected_token_padding=expected_token_padding,
            )
Example #10
 def forward(
     self,
     tokens: List[List[str]],
     dense_feat: List[List[float]],
     languages: Optional[List[str]] = None,
 ):
     input_tensors = self.tensorizer(pre_tokenized=squeeze_2d(tokens),
                                     languages=squeeze_1d(languages))
     logits = self.model(input_tensors, torch.tensor(dense_feat).float())
     return self.output_layer(logits)
Example #11
 def forward(
     self,
     right_texts: Optional[List[str]] = None,
     left_texts: Optional[List[str]] = None,
     right_tokens: Optional[List[List[str]]] = None,
     left_tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ) -> torch.Tensor:
     right_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(right_texts),
         tokens=squeeze_2d(right_tokens),
         languages=squeeze_1d(languages),
     )
     left_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(left_texts),
         tokens=squeeze_2d(left_tokens),
         languages=squeeze_1d(languages),
     )
     return self._forward(right_inputs, left_inputs)
Example #12
    def forward(
        self,
        right_dense_feat: List[List[float]],
        left_dense_feat: List[List[float]],
        right_texts: Optional[List[str]] = None,
        left_texts: Optional[List[str]] = None,
        right_tokens: Optional[List[List[str]]] = None,
        left_tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        right_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(right_texts),
            tokens=squeeze_2d(right_tokens),
            languages=squeeze_1d(languages),
        )
        right_input_tensors = self.right_tensorizer(right_inputs)
        left_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(left_texts),
            tokens=squeeze_2d(left_tokens),
            languages=squeeze_1d(languages),
        )
        left_input_tensors = self.left_tensorizer(left_inputs)

        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
        if self.right_tensorizer.device != "":
            right_dense_tensor = right_dense_tensor.to(
                self.right_tensorizer.device)
        if self.left_tensorizer.device != "":
            left_dense_tensor = left_dense_tensor.to(
                self.left_tensorizer.device)

        logits = self.model(
            right_input_tensors,
            left_input_tensors,
            right_dense_tensor,
            left_dense_tensor,
        )
        return self.output_layer(logits)
Example #13
 def forward(
     self,
     # first input
     texts1: Optional[List[str]] = None,
     tokens1: Optional[List[List[str]]] = None,
     # second input
     texts2: Optional[List[str]] = None,
     tokens2: Optional[List[List[str]]] = None,
 ):
     inputs1: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts1),
         tokens=squeeze_2d(tokens1),
         languages=None,
     )
     inputs2: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts2),
         tokens=squeeze_2d(tokens2),
         languages=None,
     )
     input_tensors1 = self.tensorizer1(inputs1)
     input_tensors2 = self.tensorizer2(inputs2)
     return self.model(input_tensors1, input_tensors2)
Example #14
    def test_xlm_token_tensorizer(self):
        xlm = self._mock_xlm_tensorizer()
        rand_tokens = self.get_rand_tokens([20, 10])

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
Example #15
 def forward(
     self,
     right_texts: Optional[List[str]] = None,
     left_texts: Optional[List[str]] = None,
     right_tokens: Optional[List[List[str]]] = None,
     left_tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     right_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(right_texts),
         tokens=squeeze_2d(right_tokens),
         languages=squeeze_1d(languages),
     )
     right_input_tensors = self.right_tensorizer(right_inputs)
     left_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(left_texts),
         tokens=squeeze_2d(left_tokens),
         languages=squeeze_1d(languages),
     )
     left_input_tensors = self.left_tensorizer(left_inputs)
     logits = self.model(right_input_tensors, left_input_tensors)
     return self.output_layer(logits)
Example #16
 def forward(
     self,
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     logits = self.model(input_tensors)
     return self.output_layer(logits)
Example #17
    def tensorize_1d(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
    ):
        """
        Process raw inputs(single sentence) into model input tensors, it
        supports two input formats:
            1) multiple rows of single sentence
            2) multiple rows of single sentence pre-processed tokens

        This function should handle the logic of calling numberize() and also
        padding the numberized result.
        """
        return self.tensorize(squeeze_1d(texts), squeeze_2d(tokens))
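A minimal usage sketch of the two input formats the docstring above describes. The `tensorizer` instance and the example sentences are illustrative assumptions, not taken from the source:

# Hypothetical usage of tensorize_1d; `tensorizer` is assumed to be an instance
# of the class that defines tensorize_1d in the example above.
# 1) multiple rows of raw single sentences
tensors = tensorizer.tensorize_1d(texts=["hello world", "good morning"])
# 2) multiple rows of pre-processed tokens for single sentences
tensors = tensorizer.tensorize_1d(tokens=[["hello", "world"], ["good", "morning"]])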
Example #18
 def forward(
     self,
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
     dense_feat: Optional[List[List[float]]] = None,
 ) -> torch.Tensor:
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     # call model
     return self.model(input_tensors)[self.index]
Example #19
 def forward(
     self,
     texts: Optional[List[str]] = None,
     # multi_texts is of shape [batch_size, num_columns]
     multi_texts: Optional[List[List[str]]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
     dense_feat: Optional[List[List[float]]] = None,
 ) -> List[torch.Tensor]:
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(texts, multi_texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     return self._forward(inputs)
Example #20
    def test_xlm_tensorizer_default_padding(self):
        xlm = self._mock_xlm_tensorizer()
        rand_tokens = self.get_rand_tokens([20, 10])

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        sig_idxs = [len(t) + 2 for t in rand_tokens]
        expected_token_size = max(sig_idxs)
        expected_token_padding = [
            expected_token_size - cnt for cnt in sig_idxs
        ]
        expected_batch_size = len(rand_tokens)

        # verify tensorized tokens padding
        tokens = tokens.tolist()
        self.assertEqual(len(tokens), expected_batch_size)
        # every row should be padded out to the same expected token size
        self.assertEqual(max(len(t) for t in tokens), expected_token_size)
        self.assertEqual(min(len(t) for t in tokens), expected_token_size)
        for i in range(expected_batch_size):
            self.assertEqual(tokens[i][sig_idxs[i]:],
                             [200] * expected_token_padding[i])

        # verify tensorized languages
        languages = languages.tolist()
        self.assertEqual(len(languages), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(languages[i][:sig_idxs[i]], [2] * sig_idxs[i])
            self.assertEqual(languages[i][sig_idxs[i]:],
                             [0] * expected_token_padding[i])

        # verify tensorized positions
        positions = positions.tolist()
        self.assertEqual(len(positions), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(positions[i][sig_idxs[i]:],
                             [0] * expected_token_padding[i])

        # verify pad_masks
        pad_masks = pad_masks.tolist()
        self.assertEqual(len(pad_masks), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(pad_masks[i][:sig_idxs[i]], [1] * sig_idxs[i])
            self.assertEqual(pad_masks[i][sig_idxs[i]:],
                             [0] * expected_token_padding[i])
Example #21
 def forward(
     self,
     texts: Optional[List[str]] = None,
     # multi_texts is of shape [batch_size, num_columns]
     multi_texts: Optional[List[List[str]]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(texts, multi_texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     logits = self.model(input_tensors)
     return self.output_layer(logits)
Example #22
 def forward(
     self,
     dense_feat: List[List[float]],
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     dense_feat = self.normalizer.normalize(dense_feat)
     logits = self.model(input_tensors,
                         torch.tensor(dense_feat, dtype=torch.float))
     return self.output_layer(logits)
Example #23
    def test_roberta_tensorizer_sequence_batch_padding(self):
        roberta = self._mock_roberta_tensorizer()
        seq_padding_control = [0, 48, 256]
        batch_padding_control = [0, 3, 6]
        roberta.set_padding_control("batch_length", batch_padding_control)
        roberta.set_padding_control("sequence_length", seq_padding_control)

        rand_tokens = self.get_rand_tokens([25, 15, 5, 30])
        expected_batch_size = 6
        expected_token_size = 48

        tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
            tokens=squeeze_2d(rand_tokens))

        # the two extra batch-padding rows contribute zero significant tokens
        sig_idxs = [1 + len(t) + 1 for t in rand_tokens] + [0, 0]
        expected_token_padding = [
            expected_token_size - num for num in sig_idxs
        ]

        padding_key = {
            tokens: 200,
            pad_mask: 0,
            start_indices: 0,
            end_indices: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=expected_batch_size,
                expected_token_padding=expected_token_padding,
            )
Example #24
    def forward(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=squeeze_1d(texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        input_tensors = self.tensorizer(inputs)
        # call model
        dense_feat = self.normalizer.normalize(dense_feat)
        dense_tensor = torch.tensor(dense_feat, dtype=torch.float)

        encoder_embedding = self.model(input_tensors, dense_tensor)[self.index]
        return torch.cat([encoder_embedding, dense_tensor], 1)
Example #25
    def test_roberta_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
        roberta = self._mock_roberta_tensorizer(max_seq_len=20)
        seq_padding_control = [0, 32, 256]
        roberta.set_padding_control("sequence_length", seq_padding_control)

        rand_tokens = self.get_rand_tokens([30, 20, 10])

        tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
            tokens=squeeze_2d(rand_tokens))

        sig_idxs = [len(t) + 2 for t in rand_tokens]
        expected_batch_size = 3
        expected_token_size = min(max(max(sig_idxs), seq_padding_control[1]),
                                  roberta.max_seq_len)
        expected_token_padding = [
            max(0, expected_token_size - cnt) for cnt in sig_idxs
        ]
        sig_idxs = [
            expected_token_size - cnt for cnt in expected_token_padding
        ]

        padding_key = {
            tokens: 200,
            pad_mask: 0,
            start_indices: 0,
            end_indices: 0,
            positions: 0,
        }

        # verify padding
        for output_tensor, pad_val in padding_key.items():
            self.validate_padding(
                output_tensor,
                pad_val,
                significant_idxs=sig_idxs,
                expected_batch_size=expected_batch_size,
                expected_token_padding=expected_token_padding,
            )
Example #26
    def test_xlm_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=20,
            default_language="en",
        )

        seq_padding_control = [0, 32, 256]
        xlm.set_padding_control("sequence_length", seq_padding_control)

        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(30)],
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        token_count = [len(t) + 2 for t in rand_tokens]
        expected_batch_size = len(rand_tokens)
        expected_token_size = min(
            max(max(token_count), seq_padding_control[1]), xlm.max_seq_len)
        expected_padding_count = [
            max(0, expected_token_size - cnt) for cnt in token_count
        ]
        token_count = [
            expected_token_size - cnt for cnt in expected_padding_count
        ]

        # verify tensorized tokens padding
        tokens = tokens.tolist()
        self.assertEqual(len(tokens), expected_batch_size)
        # every row should be padded out to the same expected token size
        self.assertEqual(max(len(t) for t in tokens), expected_token_size)
        self.assertEqual(min(len(t) for t in tokens), expected_token_size)
        for i in range(expected_batch_size):
            self.assertEqual(tokens[i][token_count[i]:],
                             [200] * expected_padding_count[i])

        # verify tensorized languages
        languages = languages.tolist()
        self.assertEqual(len(languages), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(languages[i][:token_count[i]],
                             [2] * token_count[i])
            self.assertEqual(languages[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify tensorized positions
        positions = positions.tolist()
        self.assertEqual(len(positions), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(positions[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify pad_masks
        pad_masks = pad_masks.tolist()
        self.assertEqual(len(pad_masks), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(pad_masks[i][:token_count[i]],
                             [1] * token_count[i])
            self.assertEqual(pad_masks[i][token_count[i]:],
                             [0] * expected_padding_count[i])
Example #27
 def forward(self, tokens: List[List[str]]):
     input_tensors = self.tensorizer.tensorize(tokens=squeeze_2d(tokens))
     logits = self.model(input_tensors)
     return self.output_layer(logits)