Code Example #1
    def encode(self, sequence: str) -> torch.Tensor:
        """ Encodes a 'sequence'.
        :param sequence: String 'sequence' to encode.

        :return: torch.Tensor with Encoding of the `sequence`.
        """
        sequence = TextEncoder.encode(self, sequence)
        return self.tokenizer(sequence, return_tensors="pt")["input_ids"][0]
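
For reference, a minimal stand-alone sketch of the tokenizer call used in Example #1, assuming `self.tokenizer` is a Hugging Face tokenizer such as `AutoTokenizer.from_pretrained("roberta-base")` (the model name and the stand-alone setup are assumptions; the snippet only exposes the attribute):

from transformers import AutoTokenizer

# Assumed backing tokenizer; the example above only shows it as `self.tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

sequence = "Hello world"
# Same call as in Code Example #1: return_tensors="pt" produces a batch of
# shape (1, seq_len), and indexing with [0] drops the batch dimension.
ids = tokenizer(sequence, return_tensors="pt")["input_ids"][0]
print(ids.shape, ids.dtype)  # a 1-D LongTensor of subword ids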
Code Example #2
File: roberta_tokenizer.py  Project: Unbabel/caption
    def encode(self, sequence: str) -> torch.Tensor:
        """ Encodes a 'sequence'.
        :param sequence: String 'sequence' to encode.

        Returns:
            - torch.Tensor: Encoding of the 'sequence'.
        """
        sequence = TextEncoder.encode(self, sequence)
        return self.encode_func(sequence)
Code Example #3
    def encode(self, sequence: str) -> torch.Tensor:
        """Encodes a 'sequence'.
        :param sequence: String 'sequence' to encode.

        :return: torch.Tensor with Encoding of the `sequence`.
        """
        sequence = TextEncoder.encode(self, sequence)
        return self.tokenizer(sequence, truncation=True,
                              max_length=256)["input_ids"]
Code Example #4
File: bert_tokenizer.py  Project: heorhii-bolotov/QA
    def encode(self, sequence: str) -> torch.Tensor:
        """ Encodes a 'sequence'.
        :param sequence: String 'sequence' to encode.

        Returns:
            - torch.Tensor: Encoding of the 'sequence'.
        """
        sequence = TextEncoder.encode(self, sequence)
        vector = self.tokenizer.encode(sequence)
        return torch.tensor(vector)
Code Example #5
    def encode(self, sequence: str) -> torch.Tensor:
        """Encodes a 'sequence'.
        :param sequence: String 'sequence' to encode.

        Returns:
            - torch.Tensor: Encoding of the 'sequence'.
        """
        sequence = TextEncoder.encode(self, sequence)
        return torch.tensor(
            self.tokenizer(sequence, truncation=False)["input_ids"])
Code Example #6
    def encode_trackpos(self, sequence: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Encodes a 'sequence' and keeps the alignments with the respective tags.
        :param sequence: String 'sequence' to encode.

        Returns:
            - torch.Tensor: Encoding of the 'sequence'.
            - torch.Tensor: Alignment indexes
        """
        sequence = TextEncoder.encode(self, sequence)
        tag_index, vector = [], [
            self._bos_index,
        ]
        for index, token in enumerate(sequence.split()):
            tag_index.append(len(vector))
            vector = vector + self.tokenizer.encode(token,
                                                    add_special_tokens=False)
        vector.append(self._eos_index)
        return torch.tensor(vector), torch.tensor(tag_index)
Code Example #7
File: roberta_tokenizer.py  Project: Unbabel/caption
    def encode_trackpos(self, sequence: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Encodes a 'sequence' and keeps the alignments with the respective tags.
        :param sequence: String 'sequence' to encode.

        Returns:
            - torch.Tensor: Encoding of the 'sequence'.
            - torch.Tensor: Alignment indexes
        """
        sequence = TextEncoder.encode(self, sequence)
        tag_index, vector = [], [
            self._bos_index,
        ]
        tokens = sequence.split()
        # Add a leading space to each token after the first, so the BPE emits
        # the same Ġ-prefixed pieces it would produce inside the full sentence
        tokens = [tokens[0]] + [" " + token for token in tokens[1:]]
        for index, token in enumerate(tokens):
            tag_index.append(len(vector))
            vector = vector + self.encode_func(token)[1:-1].tolist()
        vector.append(self._eos_index)
        return torch.tensor(vector), torch.tensor(tag_index)
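
A hedged sketch of how the two tensors returned by encode_trackpos might be consumed downstream, assuming a model that yields one hidden vector per subword: `tag_index` stores, for each whitespace-delimited word, the position of its first subword in `vector`, so it can be used to gather one vector per word and align them with per-word tags. All names and values below are illustrative, not taken from the project:

import torch

# Illustrative stand-ins for the two tensors returned by encode_trackpos():
# subword ids (values made up) and the first-subword position of each word.
vector = torch.tensor([0, 100, 200, 201, 2])   # <bos> word1 word2a word2b <eos>
tag_index = torch.tensor([1, 2])               # word1 starts at 1, word2 at 2

# Fake per-subword encoder output, one 768-dim row per subword id.
hidden = torch.randn(vector.size(0), 768)

# Gather the first-subword representation of every word, giving word-level
# features that line up one-to-one with a per-word tag sequence.
word_features = hidden[tag_index]              # shape: (num_words, 768)
print(word_features.shape)                     # torch.Size([2, 768])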