Example #1
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tensor_dict = {}
        for key, val in tokens.items():
            if key == "type_ids":
                padding_value = 0
                mktensor = torch.LongTensor
            elif key == "mask" or key == "wordpiece_mask":
                padding_value = False
                mktensor = torch.BoolTensor
            elif len(val) > 0 and isinstance(val[0], bool):
                padding_value = False
                mktensor = torch.BoolTensor
            else:
                padding_value = self._tokenizer.pad_token_id
                if padding_value is None:
                    # Some tokenizers don't have padding tokens and rely on the mask only.
                    padding_value = 0
                mktensor = torch.LongTensor

            tensor = mktensor(
                pad_sequence_to_length(val,
                                       padding_lengths[key],
                                       default_value=lambda: padding_value))

            tensor_dict[key] = tensor
        return tensor_dict
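
For reference, every example in this listing relies on pad_sequence_to_length to truncate or right-pad a list with values produced by default_value. The snippet below is a minimal, simplified stand-in written for illustration (not the library's actual implementation); the token ids, mask values, and target length are made up.

from typing import Any, Callable, List, Sequence

import torch


def pad_sequence_to_length(sequence: Sequence,
                           desired_length: int,
                           default_value: Callable[[], Any] = lambda: 0,
                           padding_on_right: bool = True) -> List:
    # Truncate to the desired length, then pad with values from default_value().
    padded = list(sequence[:desired_length])
    padding = [default_value() for _ in range(desired_length - len(padded))]
    return padded + padding if padding_on_right else padding + padded


# Token ids are padded with the tokenizer's pad id (assumed to be 0 here),
# while boolean masks are padded with False and become BoolTensors.
print(torch.LongTensor(pad_sequence_to_length([101, 2023, 102], 5, lambda: 0)))
# tensor([ 101, 2023,  102,    0,    0])
print(torch.BoolTensor(pad_sequence_to_length([True, True, True], 5, lambda: False)))
# tensor([ True,  True,  True, False, False])
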
Example #2
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.

        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if "mask" in key or "type-ids" in
                    key else self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }

        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(offsets_tokens,
                                   offsets_padding_lengths,
                                   default_value=lambda: (0, 0)))

        return tensor_dict
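
A quick illustration of the offsets handling in Example #2: padding a list of (start, end) wordpiece offsets with (0, 0) pairs and wrapping the result in torch.LongTensor yields a tensor of shape (length, 2). The offsets and target length below are made up for the sketch.

import torch

offsets = [(1, 1), (2, 3), (4, 4)]  # illustrative (start, end) wordpiece spans
desired_length = 5
padded = offsets + [(0, 0)] * (desired_length - len(offsets))
print(torch.LongTensor(padded).shape)  # torch.Size([5, 2])
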
Example #3
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.
        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key == "attention-mask" else
                    (0 if "mask" in key or "type-ids" in key else
                     self._tokenizer.pad_token_id),
                ))
            for key, val in tokens.items()
        }
Example #4
    def as_padded_tensor_dict(
        self,
        tokens: IndexedTokenList,
        padding_lengths: Dict[str, int],
    ) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.
        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key in {
                        "mask", "type_ids", "wordpiece_mask",
                        "segment_concat_mask"
                    } else self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }
Example #5
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always False/0.
        tensor_dict = {}
        for key, val in tokens.items():
            if val and isinstance(val[0], bool):
                tensor = torch.BoolTensor(
                    pad_sequence_to_length(val,
                                           padding_lengths[key],
                                           default_value=lambda: False))
            else:
                tensor = torch.LongTensor(
                    pad_sequence_to_length(
                        val,
                        padding_lengths[key],
                        default_value=lambda: 0
                        if key == "type_ids" else self._tokenizer.pad_token_id,
                    ))
            tensor_dict[key] = tensor
        return tensor_dict
Example #6
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.

        # Rewrite (sketch of an equivalent loop form):
        # tensor= {}
        # for key, val in tokens.items():
        #     if key in {'mask','type_ids'}:
        #         pass
        #     else:
        #         self._tokenizer.pad_token_id

        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key in {"mask", "type_ids"} else
                    self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }
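
Finally, a hedged usage sketch of the key-dependent padding in Example #6, written as a plain loop. The key names, lengths, and the pad_token_id value are illustrative assumptions, not taken from any particular tokenizer.

import torch

pad_token_id = 0  # stand-in for self._tokenizer.pad_token_id (assumed value)
tokens = {"token_ids": [101, 7592, 102], "mask": [1, 1, 1], "type_ids": [0, 0, 0]}
padding_lengths = {"token_ids": 5, "mask": 5, "type_ids": 5}

tensor_dict = {}
for key, val in tokens.items():
    # "mask" and "type_ids" are padded with 0; everything else with the pad id.
    pad_value = 0 if key in {"mask", "type_ids"} else pad_token_id
    padded = val + [pad_value] * (padding_lengths[key] - len(val))
    tensor_dict[key] = torch.LongTensor(padded)

print(tensor_dict["token_ids"])  # tensor([ 101, 7592,  102,    0,    0])
print(tensor_dict["mask"])       # tensor([1, 1, 1, 0, 0])
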