def pad_token_sequence(self,
                       tokens: Dict[str, List[List[int]]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:
    # Pad the tokens; `tokens` has only one key.
    key = list(tokens.keys())[0]

    padded_tokens = pad_sequence_to_length(
        tokens[key],
        desired_num_tokens[key],
        default_value=self.get_padding_token)

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths['num_token_characters']
    longest_token: List[int] = max(tokens[key], key=len, default=[])
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation
        # of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)
    # Pad the list of lists out to the longest sublist, appending 0's.
    padded_tokens = list(
        zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
    if desired_token_length > len(longest_token):
        # Remove the "dummy token".
        padded_tokens.pop()
    # Truncate all the tokens to the desired length, and return the result.
    return {key: [list(token[:desired_token_length]) for token in padded_tokens]}
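Every example on this page delegates the padding and truncation itself to pad_sequence_to_length. The sketch below is not the library code, only a minimal approximation of its AllenNLP-style behavior (truncate to desired_length, then fill the missing positions with values produced by default_value); the real helper may differ in details.

from typing import Any, Callable, List

def pad_sequence_to_length(sequence: List[Any],
                           desired_length: int,
                           default_value: Callable[[], Any] = lambda: 0,
                           padding_on_right: bool = True) -> List[Any]:
    # Truncate to the desired length first, then fill the remainder with
    # freshly created padding values on the padding side.
    padded = sequence[:desired_length] if padding_on_right else sequence[-desired_length:]
    padding = [default_value() for _ in range(desired_length - len(padded))]
    return padded + padding if padding_on_right else padding + padded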
def pad_token_sequence(
        self, tokens: Dict[str, List[int]], desired_num_tokens: Dict[str, int],
        padding_lengths: Dict[str, int]
) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {
        key: pad_sequence_to_length(val, desired_num_tokens[key])
        for key, val in tokens.items()
    }
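A quick usage sketch with a made-up input; assuming 0 is the padding id, each list of token ids is right-padded up to the desired length:

tokens = {'tokens': [12, 7, 5]}
desired_num_tokens = {'tokens': 5}

padded = {key: pad_sequence_to_length(val, desired_num_tokens[key])
          for key, val in tokens.items()}
# padded == {'tokens': [12, 7, 5, 0, 0]}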
Example 3
def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray:
    padded_field_list = pad_sequence_to_length(
        self.field_list, padding_lengths['num_fields'],
        self.field_list[0].empty_field)
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = {
        key.replace('list_', '', 1): value
        for key, value in padding_lengths.items()
        if key.startswith('list_')
    }
    padded_fields = [
        field.as_tensor(child_padding_lengths)
        for field in padded_field_list
    ]
    return self.field_list[0].batch_tensors(padded_fields)
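To illustrate the key re-scoping step (the padding keys below are hypothetical), a padding_lengths dict for a list of fields might look like this; only the 'list_'-prefixed entries are passed down to the child fields, with the prefix stripped:

padding_lengths = {'num_fields': 4, 'list_num_tokens': 7}
child_padding_lengths = {key.replace('list_', '', 1): value
                         for key, value in padding_lengths.items()
                         if key.startswith('list_')}
# child_padding_lengths == {'num_tokens': 7}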
Example 4
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_num_tokens = padding_lengths['num_tokens']
    padded_tags = pad_sequence_to_length(self._indexed_labels,
                                         desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor
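A usage sketch with made-up label indices, assuming 0 is the padding index:

import torch

padded_tags = pad_sequence_to_length([2, 5, 1], 6)  # -> [2, 5, 1, 0, 0, 0]
tensor = torch.LongTensor(padded_tags)              # shape: torch.Size([6])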