def test_decode_best_spans(self):
    tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

    text_1 = tokenizer.encode("question sequence", add_special_tokens=False)
    text_2 = tokenizer.encode("title sequence", add_special_tokens=False)
    text_3 = tokenizer.encode("text sequence " * 4, add_special_tokens=False)
    input_ids = [[101] + text_1 + [102] + text_2 + [102] + text_3]
    reader_input = BatchEncoding({"input_ids": input_ids})

    start_logits = [[0] * len(input_ids[0])]
    end_logits = [[0] * len(input_ids[0])]
    relevance_logits = [0]
    reader_output = DPRReaderOutput(start_logits, end_logits, relevance_logits)

    start_index, end_index = 8, 9
    start_logits[0][start_index] = 10
    end_logits[0][end_index] = 10
    predicted_spans = tokenizer.decode_best_spans(reader_input, reader_output)

    self.assertEqual(predicted_spans[0].start_index, start_index)
    self.assertEqual(predicted_spans[0].end_index, end_index)
    self.assertEqual(predicted_spans[0].doc_id, 0)
def batch_encode_packets(
    self,
    flows: Union[pd.DataFrame, np.ndarray],
    target_class: Optional[str] = None,
    add_special_tokens: bool = True,
    return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    return_attention_mask: Optional[bool] = True,
) -> BatchEncoding:
    if isinstance(flows, pd.DataFrame):
        flows = flows.values

    if flows.shape[1] // 2 != self.max_model_input_sizes:
        logger.debug(
            f'input number of features ({flows.shape[1] // 2}) does not match '
            f'max_model_input_sizes ({self.max_model_input_sizes})'
        )

    clusters = self.packet_quantizer.transform(flows)

    if add_special_tokens:
        first_token = self.convert_tokens_to_ids(target_class) if target_class is not None else self.bos_token_id
        expander = partial(self._expand_with_special_tokens, first_token=first_token)
        clusters = np.apply_along_axis(expander, axis=1, arr=clusters)
    else:
        clusters = np.apply_along_axis(self._pad_flow, axis=1, arr=clusters)

    result = {'input_ids': clusters.astype(np.int64)}
    if return_attention_mask:
        token_mask = (clusters != self.pad_token_id).astype(np.int64)
        result.update({'attention_mask': token_mask})

    return BatchEncoding(result, tensor_type=TensorType(return_tensors), prepend_batch_axis=False)
def get_word_seg(text):
    token_batch = tokenizer(text, return_tensors="pt")
    token_ids = token_batch["input_ids"]
    token_ids = token_ids.numpy().tolist()[0]
    length = len(token_ids) - 2  # number of tokens without [CLS]/[SEP]

    # Build 2 * length - 1 masked copies of the sentence: row 2*i masks token i+1 alone,
    # row 2*i-1 masks tokens i and i+1 jointly (103 is BERT's [MASK] token id).
    batch_token_ids = np.array([token_ids] * (2 * length - 1))
    batch_segment_ids = np.zeros_like(batch_token_ids)
    for i in range(length):
        if i > 0:
            batch_token_ids[2 * i - 1, i] = 103
            batch_token_ids[2 * i - 1, i + 1] = 103
        batch_token_ids[2 * i, i + 1] = 103

    attention_mask = token_batch["attention_mask"].repeat(2 * length - 1, 1)
    input_ids = torch.from_numpy(batch_token_ids)
    token_type_ids = torch.from_numpy(batch_segment_ids)
    input_dict = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }
    inputs = BatchEncoding(input_dict)
    outputs = model(**inputs)
    vectors, _ = outputs[:2]
    vectors = vectors.detach().numpy()

    seg_list = []
    for threshold in range(length):
        # threshold = 8
        print(threshold)
        word_token_ids = [[token_ids[1]]]
        for i in range(1, length):
            # Compare each token's vector when masked alone vs. masked together with its
            # neighbour; a large difference suggests the two tokens belong to the same word.
            d1 = dist(vectors[2 * i, i + 1], vectors[2 * i - 1, i + 1])
            d2 = dist(vectors[2 * i - 2, i], vectors[2 * i - 1, i])
            d = (d1 + d2) / 2
            if d >= threshold:
                word_token_ids[-1].append(token_ids[i + 1])
            else:
                word_token_ids.append([token_ids[i + 1]])

        words = [
            tokenizer.decode(ids).replace(" ", "") for ids in word_token_ids
        ]
        print(words)
        seg_list.append(words)

    return seg_list
def __init__(
    self,
    descriptions: BatchEncoding,
    patients: BatchEncoding,
    doctors: BatchEncoding,
    neg_samples: int = 9,
    max_length: int = 256,
    evaluation: bool = False
):
    super().__init__()
    self.descriptions = descriptions.copy()
    self.patients = patients.copy()
    self.doctors = doctors.copy()
    self.evaluation = evaluation
    print(len(self))

    # Usually the description length might be shorter
    self.__trim_or_pad_input(self.descriptions, max_length)
    self.__trim_or_pad_input(self.patients, max_length)
    self.__trim_or_pad_input(self.doctors, max_length)

    self.neg_samples = neg_samples
def test_group_by_length_with_batch_encoding(self):
    # Get some inputs of random lengths
    data = []
    for _ in range(6):
        input_ids = torch.randint(0, 25, (100,)).tolist()
        data.append(BatchEncoding({"input_ids": input_ids}))
    # Put one bigger than the others to check it ends up in first position
    data[3]["input_ids"] = torch.randint(0, 25, (105,)).tolist()

    indices = list(LengthGroupedSampler(4, dataset=data))
    # The biggest element should be first
    self.assertEqual(len(data[indices[0]]["input_ids"]), 105)
    # The indices should be a permutation of range(6)
    self.assertEqual(sorted(indices), list(range(6)))
def __call__(
    self,
    text: Union[TextInput, List[TextInput]],
    text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    **kwargs,
) -> BatchEncoding:
    """
    Tokenize the text into a sequence of image blocks.

    Parameters
    ----------
    text : Union[TextInput, List[TextInput]]
        A single text or a list of texts
    text_pair : Optional[Union[TextInput, List[TextInput]]], optional
        A single text or a list of texts, by default None
    add_special_tokens : bool, optional
        Whether to add special tokens to the data, by default True
    padding : Union[bool, str, PaddingStrategy], optional
        The padding strategy, by default False
    truncation : Union[bool, str, TruncationStrategy], optional
        The truncation strategy, by default False
    max_length : Optional[int], optional
        Maximum sequence length, overriding the class variable, by default None
    pad_to_multiple_of : Optional[int], optional
        Pad the sequence length to a multiple of this value, by default None
    return_tensors : Optional[Union[str, TensorType]], optional
        Return tensors as `'pt'`, `'tf'` or `'np'`, by default None
    return_token_type_ids : Optional[bool], optional
        Whether to return token type ids, by default None
    return_attention_mask : Optional[bool], optional
        Whether to return the attention mask, by default None
    return_overflowing_tokens : bool, optional
        Whether to return overflowing tokens, by default False
    return_special_tokens_mask : bool, optional
        Whether to return the special tokens mask, by default False
    return_length : bool, optional
        Whether to return the sequence length, by default False

    Returns
    -------
    BatchEncoding
        A BatchEncoding object
    """
    if self.special_tokens is None:
        self.special_tokens = {
            "CLS": self.text2embeddings("[CLS]"),
            "SEP": self.text2embeddings("[SEP]"),
        }

    if add_special_tokens and text_pair:
        actual_max_length = (
            self.max_length
            - len(self.special_tokens["SEP"]) * 2
            - len(self.special_tokens["CLS"])
        )
    else:
        actual_max_length = self.max_length

    batch_outputs = {}
    text = text if isinstance(text, list) else [text]
    text_pair = text_pair if isinstance(text_pair, list) else [text_pair]

    if isinstance(padding, str):
        padding = PaddingStrategy(padding)
    if isinstance(truncation, str):
        truncation = TruncationStrategy(truncation)

    for first_text, second_text in zip_longest(text, text_pair, fillvalue=None):
        first_embeddings = self.text2embeddings(first_text)
        second_embeddings = self.text2embeddings(second_text)
        outputs = self.prepare_for_model(
            first_embeddings,
            second_embeddings,
            add_special_tokens=add_special_tokens,
            padding=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
            truncation=truncation,
            max_length=max_length or actual_max_length,
            pad_to_multiple_of=None,  # we pad in batch afterward
            return_attention_mask=False,  # we pad in batch afterward
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=None,  # we convert the whole batch to tensors at the end
            prepend_batch_axis=False,
        )

        for key, value in outputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    batch_outputs = self.pad(
        batch_outputs,
        padding=padding,
        max_length=max_length or actual_max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
    )
    batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
    return batch_outputs
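# Hedged usage sketch (not part of the original source). Assumes `embedding_tokenizer` is an
# instance of the class defining __call__ above; its class name is not shown here, and the call
# signature simply mirrors the Hugging Face tokenizer API documented in the docstring.
encoding = embedding_tokenizer(
    ["first question", "second question"],
    text_pair=["first passage", "second passage"],
    padding="longest",
    return_tensors="pt",
)
print(encoding["input_ids"].shape, encoding["attention_mask"].shape)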
def pad(
    self,
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchEncoding:
    # If we have a list of dicts, let's convert it to a dict of lists
    # We do this to allow using this method as a collate_fn function in a PyTorch DataLoader
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
        encoded_inputs = {
            key: [example[key] for example in encoded_inputs]
            for key in encoded_inputs[0].keys()
        }

    # The model's main input name, usually `input_ids`, has to be passed for padding
    if self.model_input_names[0] not in encoded_inputs:
        raise ValueError(
            "You should supply an encoding or a list of encodings to this method "
            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
        )

    required_input = encoded_inputs[self.model_input_names[0]]

    if required_input is None:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch
    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
        index = 0
        while len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]

    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
        if is_torch_available() and is_torch(first_element):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy or pytorch object."
            )

        for key, value in encoded_inputs.items():
            encoded_inputs[key] = to_py_obj(value)

    required_input = encoded_inputs[self.model_input_names[0]]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        encoded_inputs = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding == PaddingStrategy.LONGEST:
        max_length = max(len(inputs) for inputs in required_input)
        padding = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in outputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
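# Hedged usage sketch (not from the original source): the comments in `pad` above note that it can
# serve as a DataLoader collate_fn, turning a list of per-example BatchEncoding dicts into one
# padded batch. Assumes the `transformers` and `torch` packages are installed.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [tokenizer(t) for t in ["a short text", "a somewhat longer example text"]]

def collate_fn(features):
    # Pad every feature to the longest sequence in the batch and return PyTorch tensors.
    return tokenizer.pad(features, padding=True, return_tensors="pt")

loader = DataLoader(examples, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # (2, longest_sequence_length_in_batch)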
def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    prepend_batch_axis: bool = False,
    **kwargs,
):
    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    if return_token_type_ids and not add_special_tokens:
        raise ValueError(
            "Asking to return token_type_ids while setting add_special_tokens to False "
            "results in an undefined behavior. Please set add_special_tokens to True or "
            "set return_token_type_ids to None."
        )

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Compute the total size of the returned encodings
    total_len = len_ids + len_pair_ids + (
        self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
    )

    # Truncation: Handle max sequence length
    overflowing_tokens = []
    if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation,
            stride=stride,
        )

    if return_overflowing_tokens:
        encoded_inputs["overflowing_tokens"] = overflowing_tokens
        encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = np.concatenate([ids, pair_ids], axis=0) if pair else ids
        token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

    # Build output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Padding
    if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs
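# Hedged usage sketch (not from the original source), reusing the bert-base-uncased `tokenizer`
# created in the pad sketch above. `prepare_for_model` consumes ids obtained by chaining
# `tokenize` and `convert_tokens_to_ids`, as the docstring of `_prepare_for_model` below notes.
# Shown with a standard Hugging Face tokenizer; the custom override above operates on embeddings
# from text2embeddings and may behave differently.
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("question sequence"))
pair_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("title sequence"))
encoded = tokenizer.prepare_for_model(
    ids,
    pair_ids=pair_ids,
    add_special_tokens=True,
    return_tensors="pt",
    prepend_batch_axis=True,
)
print(encoded["input_ids"].shape)  # (1, sequence_length) because prepend_batch_axis=True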
def _infer(self, sents: List[List[str]], inst_directions: List[str], do_basic_tokenization=True):
    """
    Main function for inference.

    Args:
        sents: A list of inputs tokenized by a basic tokenizer.
        inst_directions: A list of str where each str indicates the direction of the corresponding
            instance (i.e., INST_BACKWARD for ITN or INST_FORWARD for TN).
        do_basic_tokenization: whether to do a pre-processing step to separate punctuation marks,
            recommended to set to True.

    Returns:
        all_tag_preds: A list of lists where each list contains the raw tag predictions for the
            corresponding input words in sents.
        nb_spans: A list of ints where each int indicates the number of semiotic spans in the input words.
        span_starts: A list of lists where each list contains the starting locations of semiotic spans
            in the input words.
        span_ends: A list of lists where each list contains the ending locations of semiotic spans
            in the input words.
    """
    self.eval()

    # Append prefix
    texts = []
    for ix, sent in enumerate(sents):
        if inst_directions[ix] == constants.INST_BACKWARD:
            prefix = constants.ITN_PREFIX
        elif inst_directions[ix] == constants.INST_FORWARD:
            prefix = constants.TN_PREFIX
        if do_basic_tokenization:
            texts.append([prefix] + sent)
        else:
            texts.append(prefix + " " + sent)

    # Apply the model
    if do_basic_tokenization:
        is_split_into_words = True
    else:
        is_split_into_words = False

    encodings = self._tokenizer(
        texts, is_split_into_words=is_split_into_words, padding=True, truncation=True, return_tensors='pt'
    )

    inputs = encodings
    encodings_reduced = None

    # Check that the length of 'input_ids' is at least the length of the original input.
    # If an input symbol is missing from the tokenizer's vocabulary (such as an emoji or a Chinese
    # character), it could be skipped.
    if do_basic_tokenization:
        len_texts = [len(x) for x in texts]
    else:
        len_texts = [len(x.split()) for x in texts]
    len_ids = [
        len(self._tokenizer.convert_ids_to_tokens(x, skip_special_tokens=True))
        for x in encodings['input_ids']
    ]
    idx_valid = [i for i, (t, enc) in enumerate(zip(len_texts, len_ids)) if enc >= t]

    if len(idx_valid) != len(texts):
        logging.warning(
            'Some of the examples have symbols that were skipped during the tokenization. Such examples will be skipped.'
        )
        for i in range(len(texts)):
            if i not in idx_valid:
                logging.warning(f'Invalid input: {texts[i]}')
        # skip these sentences and fall back to the input
        # exclude invalid examples from the encodings
        encodings_reduced = {k: tensor[idx_valid, :] for k, tensor in encodings.items()}
        for k, tensor in encodings_reduced.items():
            if tensor.ndim == 1:
                encodings_reduced[k] = tensor.unsqueeze(dim=0)
        inputs = BatchEncoding(data=encodings_reduced)

    # skip the batch if no valid inputs are present
    if encodings_reduced and encodings_reduced['input_ids'].numel() == 0:
        # -1 to exclude the tag for the prompt token
        all_tag_preds = [[constants.SAME_TAG] * (len(x) - 1) for x in texts]
        nb_spans = [0] * len(texts)
        span_starts = [[] for _ in texts]
        span_ends = [[] for _ in texts]
        return all_tag_preds, nb_spans, span_starts, span_ends

    logits = self.model(**inputs.to(self.device)).logits
    pred_indexes = torch.argmax(logits, dim=-1).tolist()

    # Extract all_tag_preds for words
    all_tag_preds = []
    batch_size, max_len = encodings['input_ids'].size()
    pred_idx = 0
    for ix in range(batch_size):
        if ix in idx_valid:
            # remove the first special token and the task prefix token
            raw_tag_preds = [constants.ALL_TAG_LABELS[p] for p in pred_indexes[pred_idx][2:]]
            tag_preds, previous_word_idx = [], None
            word_ids = encodings.word_ids(batch_index=ix)[2:]
            for jx, word_idx in enumerate(word_ids):
                if word_idx is None:
                    continue
                if word_idx != previous_word_idx:
                    tag_preds.append(raw_tag_preds[jx])  # without special token at index 0
                previous_word_idx = word_idx
            pred_idx += 1
        else:
            # for excluded examples, use SAME tags for all words
            tag_preds = [constants.SAME_TAG] * (len(texts[ix]) - 1)
        all_tag_preds.append(tag_preds)

    # Post-correction of simple tagger mistakes, i.e. an I- tag preceding the B- tag of a span
    all_tag_preds = [
        self._postprocess_tag_preds(words, inst_dir, ps)
        for words, inst_dir, ps in zip(sents, inst_directions, all_tag_preds)
    ]

    # Decoding
    nb_spans, span_starts, span_ends = self.decode_tag_preds(all_tag_preds)

    return all_tag_preds, nb_spans, span_starts, span_ends
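# Hedged usage sketch (not from the original source): how _infer's inputs and outputs relate,
# assuming `model` is an instance of the duplex tagger class above and `constants` is its
# constants module (INST_FORWARD marks the TN direction, per the docstring).
sents = [["it", "costs", "one", "hundred", "dollars"]]
all_tag_preds, nb_spans, span_starts, span_ends = model._infer(
    sents, inst_directions=[constants.INST_FORWARD], do_basic_tokenization=True
)
# all_tag_preds[0] holds one raw tag per input word; nb_spans[0] is the number of detected
# semiotic spans, with their word-level boundaries in span_starts[0] / span_ends[0].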
def _prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[str] = None,
    prepend_batch_axis: bool = False,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
) -> BatchEncoding:
    """
    Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by
    the model. It adds special tokens, truncates sequences if overflowing while taking into account
    the special tokens, and manages a moving window (with user-defined stride) for overflowing tokens.

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize`
            and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
    """
    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Truncation: Handle max sequence length
    total_len = len_ids + len_pair_ids + (
        self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
    )
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation_strategy,
            stride=stride,
        )
        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = ids + pair_ids if pair else ids
        token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

    # Build output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Check lengths
    if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
        logger.warning(
            "Token indices sequence length is longer than the specified maximum sequence length "
            "for this model ({} > {}). Running this sequence through the model will result in "
            "indexing errors".format(len(ids), self.model_max_length)
        )

    # Padding
    encoded_inputs = self.pad(
        encoded_inputs,
        max_length=max_length,
        padding=padding_strategy.value,
        return_attention_mask=return_attention_mask,
    )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs