Beispiel #1
0
 def encode(self, sequence: str) -> torch.Tensor:
     """Encode a string into the input ids produced by ``self.tokenizer``.

     The string is first passed through ``TextEncoder.encode``; the result
     is then given to ``self.tokenizer`` with ``return_tensors="pt"``.

     :param sequence: String to encode.

     :return: Element 0 of the tokenizer's ``"input_ids"`` output.
     """
     prepared = TextEncoder.encode(self, sequence)
     tokenized = self.tokenizer(prepared, return_tensors="pt")
     input_ids = tokenized["input_ids"]
     # Only element 0 is returned (presumably the single batch row --
     # confirm against the tokenizer in use).
     return input_ids[0]
Beispiel #2
0
 def encode(self, sequence: str) -> torch.Tensor:
     """Encode a string with the instance's ``encode_func``.

     The string is first passed through ``TextEncoder.encode`` and the
     result is forwarded unchanged to ``self.encode_func``.

     :param sequence: String to encode.

     Returns:
         - Whatever ``self.encode_func`` returns for the prepared string
           (annotated as ``torch.Tensor``).
     """
     prepared = TextEncoder.encode(self, sequence)
     encoded = self.encode_func(prepared)
     return encoded
Beispiel #3
0
    def encode(self, sequence: str) -> torch.Tensor:
        """Encode a string into truncated input ids.

        The string is first passed through ``TextEncoder.encode`` and then
        tokenized with ``truncation=True`` and ``max_length=256``.

        :param sequence: String to encode.

        :return: The ``"input_ids"`` entry of the tokenizer output.

        NOTE(review): no tensor conversion happens here; the value is
        returned exactly as the tokenizer provides it. Confirm that this
        matches the annotated ``torch.Tensor`` return type.
        """
        prepared = TextEncoder.encode(self, sequence)
        tokenizer_options = {"truncation": True, "max_length": 256}
        tokenized = self.tokenizer(prepared, **tokenizer_options)
        return tokenized["input_ids"]
Beispiel #4
0
    def encode(self, sequence: str) -> torch.Tensor:
        """Encode a string into a tensor of token ids.

        The string is first passed through ``TextEncoder.encode``; the
        output of ``self.tokenizer.encode`` is then wrapped with
        ``torch.tensor``.

        :param sequence: String to encode.

        Returns:
            - torch.Tensor: Tensor built from the tokenizer's encoding.
        """
        prepared = TextEncoder.encode(self, sequence)
        return torch.tensor(self.tokenizer.encode(prepared))
Beispiel #5
0
    def encode(self, sequence: str) -> torch.Tensor:
        """Encode a string into a tensor of input ids without truncation.

        The string is first passed through ``TextEncoder.encode`` and then
        tokenized with ``truncation=False``; the ``"input_ids"`` entry of
        the result is wrapped with ``torch.tensor``.

        :param sequence: String to encode.

        Returns:
            - torch.Tensor: Tensor built from the tokenizer's input ids.
        """
        prepared = TextEncoder.encode(self, sequence)
        tokenized = self.tokenizer(prepared, truncation=False)
        input_ids = tokenized["input_ids"]
        return torch.tensor(input_ids)
    def decode(self, embeddings):
        """Decode an encoded input back through ``self.tokenizer.decode``.

        Requires a space to start the input string -> the encoding methods
        should be called with the 'add_prefix_space' flag set to 'True'.
        Otherwise, this tokenizer's encode and decode will not conserve the
        absence of a space at the beginning of a string.

        :param embeddings: Encoded input handed to ``self.tokenizer.decode``.

        Returns:
            - The value returned by ``self.tokenizer.decode`` (presumably
              the decoded string -- confirm against the tokenizer in use).
        """
        # The base-class hook is still invoked, but its result was never
        # used by this method, so it is no longer bound to a local name.
        TextEncoder.decode(self, embeddings)
        return self.tokenizer.decode(embeddings)
Beispiel #7
0
    def encode_trackpos(self, sequence: str) -> torch.Tensor:
        """Encode a 'sequence' and keep the alignments with the respective tags.

        The string is passed through ``TextEncoder.encode`` and split on
        whitespace. Each resulting token is encoded on its own with
        ``add_special_tokens=False``, and the ids are collected between
        ``self._bos_index`` and ``self._eos_index``.

        :param sequence: String 'sequence' to encode.

        Returns (as a 2-tuple, despite the single-tensor annotation):
            - torch.Tensor: Encoding of the 'sequence'.
            - torch.Tensor: Alignment indexes, i.e. for every whitespace
              token the position of its first id in the encoding.
        """
        sequence = TextEncoder.encode(self, sequence)
        tag_index, vector = [], [self._bos_index]
        for token in sequence.split():
            # Record where this token's first id will land in `vector`.
            tag_index.append(len(vector))
            # Extend in place instead of rebuilding the list each iteration.
            vector.extend(
                self.tokenizer.encode(token, add_special_tokens=False))
        vector.append(self._eos_index)
        return torch.tensor(vector), torch.tensor(tag_index)
Beispiel #8
0
    def encode_trackpos(self, sequence: str) -> torch.Tensor:
        """Encode a 'sequence' and keep the alignments with the respective tags.

        The string is passed through ``TextEncoder.encode`` and split on
        whitespace. Each token is encoded on its own with
        ``self.encode_func``, and the ids are collected between
        ``self._bos_index`` and ``self._eos_index``. An empty or
        whitespace-only input yields just the two boundary ids and an empty
        alignment tensor.

        :param sequence: String 'sequence' to encode.

        Returns (as a 2-tuple, despite the single-tensor annotation):
            - torch.Tensor: Encoding of the 'sequence'.
            - torch.Tensor: Alignment indexes, i.e. for every whitespace
              token the position of its first id in the encoding.
        """
        sequence = TextEncoder.encode(self, sequence)
        tag_index, vector = [], [self._bos_index]
        tokens = sequence.split()
        # Re-attach a leading space to every token after the first (the
        # original note ties this to the tokenizer's "Ġ" marker handling).
        # Slicing with [:1] keeps this safe when there are no tokens.
        tokens = tokens[:1] + [" " + token for token in tokens[1:]]
        for token in tokens:
            # Record where this token's first id will land in `vector`.
            tag_index.append(len(vector))
            # Drop the first and last entries of the per-token encoding
            # (presumably the special tokens added by encode_func -- confirm).
            vector.extend(self.encode_func(token)[1:-1].tolist())
        vector.append(self._eos_index)
        return torch.tensor(vector), torch.tensor(tag_index)