Example #1
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.

        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if "mask" in key or "type_ids" in
                    key else self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }

        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(offsets_tokens,
                                   offsets_padding_lengths,
                                   default_value=lambda: (0, 0)))

        return tensor_dict
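
For reference, a minimal toy sketch of what this padding produces. It uses a simplified stand-in for pad_sequence_to_length and made-up ids and lengths, so it only illustrates the shapes, not the real indexer:

    import torch

    def pad_to_length(sequence, length, default_value):
        # Simplified stand-in for allennlp's pad_sequence_to_length:
        # right-pads (or truncates) the sequence to exactly `length` items.
        padded = list(sequence) + [default_value() for _ in range(length - len(sequence))]
        return padded[:length]

    PAD_ID = 0  # hypothetical tokenizer.pad_token_id

    token_ids = [101, 7592, 2088, 102]   # e.g. [CLS] hello world [SEP]
    offsets = [(1, 1), (2, 2)]           # wordpiece span per original word

    padded_ids = torch.LongTensor(pad_to_length(token_ids, 6, lambda: PAD_ID))
    padded_offsets = torch.LongTensor(pad_to_length(offsets, 4, lambda: (0, 0)))

    print(padded_ids)      # -> tensor of shape (6,), padded with PAD_ID
    print(padded_offsets)  # -> tensor of shape (4, 2), padded with (0, 0) pairs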
Example #2
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tensor_dict = {}
        for key, val in tokens.items():
            if key == "type_ids":
                padding_value = 0
                mktensor = torch.LongTensor
            elif key == "mask" or key == "wordpiece_mask":
                padding_value = False
                mktensor = torch.BoolTensor
            elif len(val) > 0 and isinstance(val[0], bool):
                padding_value = False
                mktensor = torch.BoolTensor
            else:
                padding_value = self._tokenizer.pad_token_id
                if padding_value is None:
                    padding_value = (
                        0  # Some tokenizers don't have padding tokens and rely on the mask only.
                    )
                mktensor = torch.LongTensor

            tensor = mktensor(
                pad_sequence_to_length(val,
                                       padding_lengths[key],
                                       default_value=lambda: padding_value))

            tensor_dict[key] = tensor
        return tensor_dict
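
A small, hedged illustration (toy values only, nothing from the real indexer) of why the mask-like keys become BoolTensors while id keys become LongTensors, and of the fallback to 0 when a tokenizer defines no pad token:

    import torch

    pad_token_id = None  # some tokenizers don't define a padding token
    if pad_token_id is None:
        pad_token_id = 0  # rely on the mask to ignore padded positions

    padded_mask = [True, True, True, False, False]   # padded with False
    padded_ids = [7592, 2088, 102, pad_token_id, pad_token_id]

    mask_tensor = torch.BoolTensor(padded_mask)  # dtype: torch.bool
    ids_tensor = torch.LongTensor(padded_ids)    # dtype: torch.int64

    assert mask_tensor.dtype == torch.bool and ids_tensor.dtype == torch.int64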
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(
            tokens, padding_lengths)
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(offsets_tokens,
                                   offsets_padding_lengths,
                                   default_value=lambda: (0, 0)))
        return tensor_dict
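
Schematically, the mismatched indexer above pops the keys only it understands, delegates the rest to the wrapped matched indexer, and re-attaches its own padded entry. A toy sketch of that pattern (hypothetical helper, not the real AllenNLP API):

    import torch

    def matched_as_padded(tokens, padding_lengths):
        # Hypothetical stand-in for the wrapped matched indexer.
        return {
            key: torch.LongTensor(val + [0] * (padding_lengths[key] - len(val)))
            for key, val in tokens.items()
        }

    tokens = {"token_ids": [101, 7592, 102], "offsets": [(1, 1)]}
    padding_lengths = {"token_ids": 5, "offsets": 3}

    tokens = dict(tokens)                      # copy so the caller's dicts stay intact
    padding_lengths = dict(padding_lengths)
    offsets = tokens.pop("offsets")            # keys only this indexer knows about
    offsets_length = padding_lengths.pop("offsets")

    tensor_dict = matched_as_padded(tokens, padding_lengths)
    tensor_dict["offsets"] = torch.LongTensor(
        offsets + [(0, 0)] * (offsets_length - len(offsets)))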
Example #4
    def indices_to_tokens(self, indexed_tokens: IndexedTokenList,
                          vocabulary: Vocabulary) -> List[Token]:
        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")

        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i],
                                                     self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            ) for i in range(len(token_ids))
        ]
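
As a rough usage sketch, with a hypothetical toy vocabulary instead of a real allennlp Vocabulary, the method just inverts the id lookup and re-attaches any type ids:

    class ToyVocabulary:
        # Hypothetical stand-in for allennlp.data.Vocabulary, lookup only.
        def __init__(self, index_to_token):
            self._index_to_token = index_to_token

        def get_token_from_index(self, index, namespace="tags"):
            return self._index_to_token[index]

    vocab = ToyVocabulary({101: "[CLS]", 7592: "hello", 102: "[SEP]"})
    indexed = {"token_ids": [101, 7592, 102], "type_ids": [0, 0, 0]}

    recovered = [
        (vocab.get_token_from_index(i, "tags"), i, t)
        for i, t in zip(indexed["token_ids"], indexed["type_ids"])
    ]
    # [('[CLS]', 101, 0), ('hello', 7592, 0), ('[SEP]', 102, 0)]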
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.
        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key == "attention_mask" else
                    (0 if "mask" in key or "type_ids" in key else
                     self._tokenizer.pad_token_id),
                ))
            for key, val in tokens.items()
        }
    def as_padded_tensor_dict(
        self,
        tokens: IndexedTokenList,
        padding_lengths: Dict[str, int],
    ) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.
        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key in {
                        "mask", "type_ids", "wordpiece_mask",
                        "segment_concat_mask"
                    } else self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }
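
A short worked example of the set-membership check above, with an invented pad id (RoBERTa-style tokenizers, for instance, use 1 as the pad id; the value here is purely illustrative):

    ZERO_PADDED_KEYS = {"mask", "type_ids", "wordpiece_mask", "segment_concat_mask"}
    pad_token_id = 1  # illustrative value for self._tokenizer.pad_token_id

    for key in ["token_ids", "mask", "type_ids", "segment_concat_mask"]:
        padding_value = 0 if key in ZERO_PADDED_KEYS else pad_token_id
        print(f"{key}: padded with {padding_value}")
    # token_ids: padded with 1
    # mask: padded with 0
    # type_ids: padded with 0
    # segment_concat_mask: padded with 0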
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always False/0.
        tensor_dict = {}
        for key, val in tokens.items():
            if val and isinstance(val[0], bool):
                tensor = torch.BoolTensor(
                    pad_sequence_to_length(val,
                                           padding_lengths[key],
                                           default_value=lambda: False))
            else:
                tensor = torch.LongTensor(
                    pad_sequence_to_length(
                        val,
                        padding_lengths[key],
                        default_value=lambda: 0
                        if key == "type_ids" else self._tokenizer.pad_token_id,
                    ))
            tensor_dict[key] = tensor
        return tensor_dict
Example #8
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        # Different transformers use different padding values for tokens, but for mask and type id, the padding
        # value is always 0.

        # Rewritten as an explicit loop (sketch):
        # tensor = {}
        # for key, val in tokens.items():
        #     if key in {'mask', 'type_ids'}:
        #         pass
        #     else:
        #         self._tokenizer.pad_token_id

        return {
            key: torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0 if key in {"mask", "type_ids"} else
                    self._tokenizer.pad_token_id,
                ))
            for key, val in tokens.items()
        }
    def _postprocess_output(self, output: IndexedTokenList) -> IndexedTokenList:
        """
        Takes an IndexedTokenList about to be returned by `tokens_to_indices()` and adds any
        necessary postprocessing, e.g. long sequence splitting.

        The input should have a `"token_ids"` key corresponding to the token indices. They should
        have special tokens already inserted.
        """
        if self._max_length is not None:
            # We prepare long indices by converting them to (assuming max_length == 5)
            # [CLS] A B C [SEP] [CLS] D E F [SEP] ...
            # The embedder is responsible for folding this 1-d sequence to 2-d and feeding
            # it to the transformer model.
            # TODO(zhaofengw): we aren't respecting word boundaries when segmenting wordpieces.

            indices = output["token_ids"]
            type_ids = output.get("type_ids", [0] * len(indices))

            # Strips original special tokens
            indices = indices[
                self._num_added_start_tokens : len(indices) - self._num_added_end_tokens
            ]
            type_ids = type_ids[
                self._num_added_start_tokens : len(type_ids) - self._num_added_end_tokens
            ]

            # Folds indices
            folded_indices = [
                indices[i : i + self._effective_max_length]
                for i in range(0, len(indices), self._effective_max_length)
            ]
            folded_type_ids = [
                type_ids[i : i + self._effective_max_length]
                for i in range(0, len(type_ids), self._effective_max_length)
            ]

            # Adds special tokens to each segment
            folded_indices = [
                self._tokenizer.build_inputs_with_special_tokens(segment)
                for segment in folded_indices
            ]
            single_sequence_start_type_ids = [
                t.type_id for t in self._allennlp_tokenizer.single_sequence_start_tokens
            ]
            single_sequence_end_type_ids = [
                t.type_id for t in self._allennlp_tokenizer.single_sequence_end_tokens
            ]
            folded_type_ids = [
                single_sequence_start_type_ids + segment + single_sequence_end_type_ids
                for segment in folded_type_ids
            ]
            assert all(
                len(segment_indices) == len(segment_type_ids)
                for segment_indices, segment_type_ids in zip(folded_indices, folded_type_ids)
            )

            # Flattens
            indices = [i for segment in folded_indices for i in segment]
            type_ids = [i for segment in folded_type_ids for i in segment]

            output["token_ids"] = indices
            output["type_ids"] = type_ids
            output["segment_concat_mask"] = [True] * len(indices)

        return output
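
To make the folding step concrete, here is a minimal sketch with plain integers instead of real wordpiece ids and hypothetical [CLS]/[SEP] ids. It mirrors the split-into-segments, re-add-special-tokens, then flatten sequence of steps (assuming max_length == 5, so 3 content positions per segment):

    CLS, SEP = 101, 102                      # hypothetical special-token ids
    max_length = 5
    effective_max_length = max_length - 2    # leave room for [CLS] and [SEP] per segment

    stripped = [11, 12, 13, 14, 15, 16, 17]  # ids with the original [CLS]/[SEP] removed

    folded = [
        stripped[i:i + effective_max_length]
        for i in range(0, len(stripped), effective_max_length)
    ]                                        # [[11, 12, 13], [14, 15, 16], [17]]

    with_specials = [[CLS] + segment + [SEP] for segment in folded]
    flattened = [i for segment in with_specials for i in segment]
    # [101, 11, 12, 13, 102, 101, 14, 15, 16, 102, 101, 17, 102]
    segment_concat_mask = [True] * len(flattened)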