Example #1
    def forward(
        self,
        right_texts: Optional[List[str]] = None,
        left_texts: Optional[List[str]] = None,
        right_tokens: Optional[List[List[str]]] = None,
        left_tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        right_dense_feat: Optional[List[List[float]]] = None,
        left_dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if right_dense_feat is None or left_dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        right_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(right_texts),
            tokens=squeeze_2d(right_tokens),
            languages=squeeze_1d(languages),
        )
        left_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(left_texts),
            tokens=squeeze_2d(left_tokens),
            languages=squeeze_1d(languages),
        )

        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)

        sentence_embedding = self._forward(right_inputs, left_inputs,
                                           right_dense_tensor,
                                           left_dense_tensor)
        return sentence_embedding
Example #2
 def forward(
     self,
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     logits = self.model(input_tensors)
     return self.output_layer(logits)
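For context, a scripted forward like the one above is typically called with either raw texts or pre-tokenized rows. A minimal usage sketch follows; `scripted_model` is a placeholder for the jit-scripted module and is not part of the original example:

# `scripted_model` stands in for the torch.jit-scripted module whose forward is shown above.
scores = scripted_model(texts=["where is the nearest coffee shop", "play some jazz"])

# Alternatively, pass pre-tokenized rows instead of raw texts:
scores = scripted_model(tokens=[["where", "is", "the", "nearest", "coffee", "shop"]])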
Example #3
 def forward(
     self,
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
     dense_feat: Optional[List[List[float]]] = None,
 ) -> torch.Tensor:
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     # call model
     return self.model(input_tensors)[self.index]
Example #4
    def test_xlm_token_tensorizer(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=256,
            default_language="en",
        )
        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
Example #5
def resolve_texts(
    texts: Optional[List[str]] = None,
    multi_texts: Optional[List[List[str]]] = None
) -> Optional[List[List[str]]]:
    if texts is not None:
        return squeeze_1d(texts)
    return multi_texts
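The squeeze_1d and squeeze_2d helpers used throughout these examples (and by resolve_texts above) are not shown here. Judging by their call sites and the Optional return type above, they most likely wrap each batch row in a singleton list, adding one level of nesting, and pass None through unchanged. A minimal sketch under that assumption (the real PyText implementation may differ):

from typing import List, Optional

def squeeze_1d(inputs: Optional[List[str]] = None) -> Optional[List[List[str]]]:
    # Wrap every element of the batch in a one-element list; keep None as None.
    if inputs is None:
        return None
    return [[row] for row in inputs]

def squeeze_2d(inputs: Optional[List[List[str]]] = None) -> Optional[List[List[List[str]]]]:
    # Same idea one level deeper: each row of tokens becomes a one-element list of rows.
    if inputs is None:
        return None
    return [[row] for row in inputs]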
Example #6
    def forward(
        self,
        texts: Optional[List[str]] = None,
        # multi_texts is of shape [batch_size, num_columns]
        multi_texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(texts, multi_texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        # call model
        dense_feat = self.normalizer.normalize(dense_feat)
        dense_tensor = torch.tensor(dense_feat, dtype=torch.float)

        sentence_embedding = self._forward(inputs, dense_tensor)
        if self.concat_dense:
            return torch.cat([sentence_embedding, dense_tensor], 1)
        else:
            return sentence_embedding
Example #7
 def forward(self,
             tokens: List[List[str]],
             languages: Optional[List[str]] = None):
     input_tensors = self.tensorizer(pre_tokenized=squeeze_2d(tokens),
                                     languages=squeeze_1d(languages))
     logits = self.model(input_tensors)
     return self.output_layer(logits)
Example #8
    def forward(
        self,
        right_dense_feat: List[List[float]],
        left_dense_feat: List[List[float]],
        texts: Optional[List[str]] = None,
        # multi_texts is of shape [batch_size, num_columns]
        multi_texts: Optional[List[List[str]]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(texts, multi_texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        input_tensors = self.tensorizer(inputs)
        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)

        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
        if self.tensorizer.device != "":
            right_dense_tensor = right_dense_tensor.to(self.tensorizer.device)
            left_dense_tensor = left_dense_tensor.to(self.tensorizer.device)
        logits = self.model(input_tensors, right_dense_tensor,
                            left_dense_tensor)
        return self.output_layer(logits)
Example #9
 def forward(
     self,
     dense_feat: List[List[float]],
     texts: Optional[List[str]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     dense_feat = self.normalizer.normalize(dense_feat)
     logits = self.model(input_tensors,
                         torch.tensor(dense_feat, dtype=torch.float))
     return self.output_layer(logits)
Example #10
 def forward(
     self,
     right_texts: Optional[List[str]] = None,
     left_texts: Optional[List[str]] = None,
     right_tokens: Optional[List[List[str]]] = None,
     left_tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ) -> torch.Tensor:
     right_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(right_texts),
         tokens=squeeze_2d(right_tokens),
         languages=squeeze_1d(languages),
     )
     left_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(left_texts),
         tokens=squeeze_2d(left_tokens),
         languages=squeeze_1d(languages),
     )
     return self._forward(right_inputs, left_inputs)
Example #11
 def forward(
     self,
     tokens: List[List[str]],
     dense_feat: List[List[float]],
     languages: Optional[List[str]] = None,
 ):
     input_tensors = self.tensorizer(pre_tokenized=squeeze_2d(tokens),
                                     languages=squeeze_1d(languages))
     logits = self.model(input_tensors, torch.tensor(dense_feat).float())
     return self.output_layer(logits)
Example #12
    def forward(
        self,
        right_dense_feat: List[List[float]],
        left_dense_feat: List[List[float]],
        right_texts: Optional[List[str]] = None,
        left_texts: Optional[List[str]] = None,
        right_tokens: Optional[List[List[str]]] = None,
        left_tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
    ):
        right_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(right_texts),
            tokens=squeeze_2d(right_tokens),
            languages=squeeze_1d(languages),
        )
        right_input_tensors = self.right_tensorizer(right_inputs)
        left_inputs: ScriptBatchInput = ScriptBatchInput(
            texts=resolve_texts(left_texts),
            tokens=squeeze_2d(left_tokens),
            languages=squeeze_1d(languages),
        )
        left_input_tensors = self.left_tensorizer(left_inputs)

        right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
        left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
        right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
        left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
        if self.right_tensorizer.device != "":
            right_dense_tensor = right_dense_tensor.to(
                self.right_tensorizer.device)
        if self.left_tensorizer.device != "":
            left_dense_tensor = left_dense_tensor.to(
                self.left_tensorizer.device)

        logits = self.model(
            right_input_tensors,
            left_input_tensors,
            right_dense_tensor,
            left_dense_tensor,
        )
        return self.output_layer(logits)
Example #13
 def forward(
     self,
     right_texts: Optional[List[str]] = None,
     left_texts: Optional[List[str]] = None,
     right_tokens: Optional[List[List[str]]] = None,
     left_tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     right_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(right_texts),
         tokens=squeeze_2d(right_tokens),
         languages=squeeze_1d(languages),
     )
     right_input_tensors = self.right_tensorizer(right_inputs)
     left_inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(left_texts),
         tokens=squeeze_2d(left_tokens),
         languages=squeeze_1d(languages),
     )
     left_input_tensors = self.left_tensorizer(left_inputs)
     logits = self.model(right_input_tensors, left_input_tensors)
     return self.output_layer(logits)
Example #14
    def forward(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
        languages: Optional[List[str]] = None,
        dense_feat: Optional[List[List[float]]] = None,
    ) -> torch.Tensor:
        if dense_feat is None:
            raise RuntimeError("Expect dense feature.")

        inputs: ScriptBatchInput = ScriptBatchInput(
            texts=squeeze_1d(texts),
            tokens=squeeze_2d(tokens),
            languages=squeeze_1d(languages),
        )
        input_tensors = self.tensorizer(inputs)
        # call model
        dense_feat = self.normalizer.normalize(dense_feat)
        dense_tensor = torch.tensor(dense_feat, dtype=torch.float)

        encoder_embedding = self.model(input_tensors, dense_tensor)[self.index]
        return torch.cat([encoder_embedding, dense_tensor], 1)
Example #15
 def forward(
     self,
     # first input
     texts1: Optional[List[str]] = None,
     tokens1: Optional[List[List[str]]] = None,
     # second input
     texts2: Optional[List[str]] = None,
     tokens2: Optional[List[List[str]]] = None,
 ):
     inputs1: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts1),
         tokens=squeeze_2d(tokens1),
         languages=None,
     )
     inputs2: ScriptBatchInput = ScriptBatchInput(
         texts=squeeze_1d(texts2),
         tokens=squeeze_2d(tokens2),
         languages=None,
     )
     input_tensors1 = self.tensorizer1(inputs1)
     input_tensors2 = self.tensorizer2(inputs2)
     return self.model(input_tensors1, input_tensors2)
Example #16
    def tensorize_1d(
        self,
        texts: Optional[List[str]] = None,
        tokens: Optional[List[List[str]]] = None,
    ):
        """
        Process raw inputs (single sentences) into model input tensors. It
        supports two input formats:
            1) multiple rows of single sentences
            2) multiple rows of pre-processed tokens for single sentences

        This function should handle the logic of calling numberize() and also
        padding the numberized result.
        """
        return self.tensorize(squeeze_1d(texts), squeeze_2d(tokens))
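As a usage illustration of the two input formats the docstring mentions, either raw sentences or pre-tokenized rows are passed, not both; the `tensorizer` instance below is hypothetical and only stands in for whatever object defines tensorize_1d:

# `tensorizer` is a placeholder for an object exposing tensorize_1d as defined above.
# 1) multiple rows, each a single raw sentence
tensors_from_texts = tensorizer.tensorize_1d(texts=["hello world", "good morning"])

# 2) multiple rows of pre-processed tokens for single sentences
tensors_from_tokens = tensorizer.tensorize_1d(
    tokens=[["hello", "world"], ["good", "morning"]]
)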
Example #17
 def forward(
     self,
     texts: Optional[List[str]] = None,
     # multi_texts is of shape [batch_size, num_columns]
     multi_texts: Optional[List[List[str]]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
     dense_feat: Optional[List[List[float]]] = None,
 ) -> List[torch.Tensor]:
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(texts, multi_texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     return self._forward(inputs)
Example #18
 def forward(
     self,
     texts: Optional[List[str]] = None,
     # multi_texts is of shape [batch_size, num_columns]
     multi_texts: Optional[List[List[str]]] = None,
     tokens: Optional[List[List[str]]] = None,
     languages: Optional[List[str]] = None,
 ):
     inputs: ScriptBatchInput = ScriptBatchInput(
         texts=resolve_texts(texts, multi_texts),
         tokens=squeeze_2d(tokens),
         languages=squeeze_1d(languages),
     )
     input_tensors = self.tensorizer(inputs)
     logits = self.model(input_tensors)
     return self.output_layer(logits)
Example #19
    def test_xlm_token_tensorizer(self):
        xlm = self._mock_xlm_tensorizer()
        rand_tokens = self.get_rand_tokens([20, 10])

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
Example #20
 def forward(self, texts: List[str]):
     input_tensors = self.tensorizer.tensorize(texts=squeeze_1d(texts))
     logits = self.model(input_tensors)
     return self.output_layer(logits)