def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    tensor_dict = {}
    for key, val in tokens.items():
        if key == "type_ids":
            padding_value = 0
            mktensor = torch.LongTensor
        elif key == "mask" or key == "wordpiece_mask":
            padding_value = False
            mktensor = torch.BoolTensor
        elif len(val) > 0 and isinstance(val[0], bool):
            padding_value = False
            mktensor = torch.BoolTensor
        else:
            padding_value = self._tokenizer.pad_token_id
            if padding_value is None:
                # Some tokenizers don't have padding tokens and rely on the mask only.
                padding_value = 0
            mktensor = torch.LongTensor
        tensor = mktensor(
            pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: padding_value)
        )
        tensor_dict[key] = tensor
    return tensor_dict

def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and type id,
    # the padding value is always 0.
    tokens = tokens.copy()
    padding_lengths = padding_lengths.copy()
    offsets_tokens = tokens.pop("offsets")
    offsets_padding_lengths = padding_lengths.pop("offsets")
    tensor_dict = {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if "mask" in key or "type-ids" in key
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }
    tensor_dict["offsets"] = torch.LongTensor(
        pad_sequence_to_length(
            offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
        )
    )
    return tensor_dict

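# The "offsets" entry in the variant above holds one (start, end) wordpiece span per original
# word. Padding those spans with (0, 0) and wrapping them in torch.LongTensor yields a 2-D
# tensor of shape (num_words, 2). A minimal, self-contained illustration; the toy spans and the
# manual right-padding below are assumptions for the example, not AllenNLP's
# pad_sequence_to_length:

import torch

offsets = [(1, 1), (2, 3), (4, 4)]  # wordpiece spans for three words
desired_length = 5
padded_offsets = offsets + [(0, 0)] * (desired_length - len(offsets))  # right-pad with (0, 0)

offsets_tensor = torch.LongTensor(padded_offsets)
print(offsets_tensor.shape)  # torch.Size([5, 2])
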
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and type id,
    # the padding value is always 0.
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if key == "attention-mask"
                else (0 if "mask" in key or "type-ids" in key else self._tokenizer.pad_token_id),
            )
        )
        for key, val in tokens.items()
    }

def as_padded_tensor_dict(
    self,
    tokens: IndexedTokenList,
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and type id,
    # the padding value is always 0.
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if key in {"mask", "type_ids", "wordpiece_mask", "segment_concat_mask"}
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }

def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and type id,
    # the padding value is always False/0.
    tensor_dict = {}
    for key, val in tokens.items():
        if val and isinstance(val[0], bool):
            tensor = torch.BoolTensor(
                pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: False)
            )
        else:
            tensor = torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key == "type_ids" else self._tokenizer.pad_token_id,
                )
            )
        tensor_dict[key] = tensor
    return tensor_dict

def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and type id,
    # the padding value is always 0.
    # Rewritten as an explicit loop, the comprehension below is equivalent to:
    # tensor_dict = {}
    # for key, val in tokens.items():
    #     if key in {"mask", "type_ids"}:
    #         ...  # pad with 0
    #     else:
    #         ...  # pad with self._tokenizer.pad_token_id
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if key in {"mask", "type_ids"}
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }

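# For reference, the sketch below demonstrates the padding behaviour shared by the variants
# above: masks are padded with False (BoolTensor), type_ids with 0, and token ids with the
# tokenizer's pad_token_id (falling back to 0 when a tokenizer defines none). pad_to_length is
# a simplified stand-in for AllenNLP's pad_sequence_to_length, and the ids and pad value are
# illustrative assumptions, not the output of a real tokenizer run.

from typing import Any, Callable, Dict, List

import torch


def pad_to_length(sequence: List[Any], length: int, default: Callable[[], Any]) -> List[Any]:
    # Truncate or right-pad `sequence` to exactly `length` entries.
    padded = list(sequence[:length])
    padded.extend(default() for _ in range(length - len(padded)))
    return padded


tokens = {
    "token_ids": [101, 7592, 2088, 102],  # e.g. [CLS] hello world [SEP]
    "mask": [True, True, True, True],     # which positions hold real tokens
    "type_ids": [0, 0, 0, 0],             # segment ids
}
padding_lengths = {key: 6 for key in tokens}  # pad everything to length 6
pad_token_id = 0                              # stands in for self._tokenizer.pad_token_id

tensor_dict: Dict[str, torch.Tensor] = {}
for key, val in tokens.items():
    if isinstance(val[0], bool):
        # Masks pad with False and become BoolTensors.
        tensor_dict[key] = torch.BoolTensor(
            pad_to_length(val, padding_lengths[key], lambda: False)
        )
    else:
        # type_ids pad with 0; token ids pad with the tokenizer's pad token id.
        padding_value = 0 if key == "type_ids" else pad_token_id
        tensor_dict[key] = torch.LongTensor(
            pad_to_length(val, padding_lengths[key], lambda: padding_value)
        )

for key, tensor in tensor_dict.items():
    print(key, tensor.dtype, tensor.tolist())
# token_ids torch.int64 [101, 7592, 2088, 102, 0, 0]
# mask torch.bool [True, True, True, True, False, False]
# type_ids torch.int64 [0, 0, 0, 0, 0, 0]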