def _ner_bert_tokenize(tokens: List[str],
                       tags: List[str],
                       tokenizer: AutoTokenizer,
                       max_subword_len: int = None,
                       mode: str = None,
                       subword_mask_mode: str = "first",
                       token_masking_prob: float = None) -> Tuple[List[str], List[int], List[str]]:
    # Optional word-level masking is only applied during training.
    do_masking = (mode == 'train') and (token_masking_prob is not None)
    do_cutting = (max_subword_len is not None)
    tokens_subword = ['[CLS]']
    startofword_markers = [0]
    tags_subword = ['X']
    for token, tag in zip(tokens, tags):
        token_marker = int(tag != 'X')
        subwords = tokenizer.tokenize(token)
        if not subwords or (do_cutting and (len(subwords) > max_subword_len)):
            # Untokenizable or overly long words are replaced by a single [UNK].
            tokens_subword.append('[UNK]')
            startofword_markers.append(token_marker)
            tags_subword.append(tag)
        else:
            if do_masking and (random.random() < token_masking_prob):
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            # Mark either the first or the last subword of the original token.
            if subword_mask_mode == "last":
                startofword_markers.extend([0] * (len(subwords) - 1) + [token_marker])
            else:
                startofword_markers.extend([token_marker] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

    tokens_subword.append('[SEP]')
    startofword_markers.append(0)
    tags_subword.append('X')
    return tokens_subword, startofword_markers, tags_subword
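# Hedged usage sketch (not part of the original source): how _ner_bert_tokenize might be
# called with a Hugging Face tokenizer. The model name "bert-base-cased" and the demo
# function name are illustrative assumptions, not taken from the original code.
def _demo_ner_bert_tokenize():
    from transformers import AutoTokenizer

    example_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    subtokens, markers, subtags = _ner_bert_tokenize(
        tokens=["John", "lives", "in", "Manchester"],
        tags=["B-PER", "O", "O", "B-LOC"],
        tokenizer=example_tokenizer,
    )
    # subtokens start with '[CLS]' and end with '[SEP]'; markers carry 1 only on the
    # first subword of each original word, so predictions can be re-aligned to words.
    return list(zip(subtokens, markers, subtags))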
def preprocess_text(x: str, tokenizer: AutoTokenizer, max_sequence_len: int):
    cur_x = x
    if isinstance(tokenizer, BertTokenizer):
        cur_x = "[CLS] " + cur_x
    cur_x = cur_x.replace("\n", "")
    cur_x = cur_x.replace(" cannot ", " can not ")
    cur_x = tokenizer.tokenize(cur_x)
    cur_x = tokenizer.convert_tokens_to_ids(cur_x)
    cur_x = cur_x[:max_sequence_len]
    cur_x = cur_x + [0] * (max_sequence_len - len(cur_x))
    return cur_x
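# Hedged usage sketch (not from the original source): preprocess_text maps one string to a
# fixed-length id sequence, truncated or zero-padded to max_sequence_len. The checkpoint name
# "bert-base-uncased" and the demo function name are illustrative assumptions.
def _demo_preprocess_text():
    from transformers import BertTokenizer

    tok = BertTokenizer.from_pretrained("bert-base-uncased")
    ids = preprocess_text("I cannot attend the meeting.\n", tok, max_sequence_len=16)
    assert len(ids) == 16  # always exactly max_sequence_len
    return ids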
def convert_example_to_feature(
        example,
        tokenizer: AutoTokenizer,
        chineseandpunctuationextractor: ChineseAndPunctuationExtractor,
        label_map,
        max_length: Optional[int] = 512,
        pad_to_max_length: Optional[bool] = None):
    spo_list = example['spo_list'] if "spo_list" in example.keys() else None
    text_raw = example['text']

    sub_text = []  # holds Chinese characters / punctuation, one element per character
    buff = ""  # accumulates runs of non-Chinese characters
    for char in text_raw:
        if chineseandpunctuationextractor.is_chinese_or_punct(char):
            if buff != "":
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff != "":
        sub_text.append(buff)

    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    orig_to_tok_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        orig_to_tok_index.append(len(tokens))
        sub_tokens = tokenizer.tokenize(token)
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_length - 2:
                break
        else:
            continue
        break  # stop once the token budget is reached

    seq_len = len(tokens)
    # 2 tags for each predicate + I tag + O tag
    num_labels = 2 * (len(label_map.keys()) - 2) + 2
    # initialize tags: every token gets a label vector used for prediction
    labels = [[0] * num_labels for i in range(seq_len)]
    if spo_list is not None:
        labels = parse_label(spo_list, label_map, tokens, tokenizer)

    # add [CLS] and [SEP] tokens; they are tagged "O" for outside
    if seq_len > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
        labels = labels[0:(max_length - 2)]
        tok_to_orig_start_index = tok_to_orig_start_index[0:(max_length - 2)]
        tok_to_orig_end_index = tok_to_orig_end_index[0:(max_length - 2)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # "O" tag for [PAD], [CLS], [SEP] tokens
    outside_label = [[1] + [0] * (num_labels - 1)]
    labels = outside_label + labels + outside_label
    tok_to_orig_start_index = [-1] + tok_to_orig_start_index + [-1]
    tok_to_orig_end_index = [-1] + tok_to_orig_end_index + [-1]
    if seq_len < max_length:
        tokens = tokens + ["[PAD]"] * (max_length - seq_len - 2)
        labels = labels + outside_label * (max_length - len(labels))
        tok_to_orig_start_index = tok_to_orig_start_index + [-1] * (
            max_length - len(tok_to_orig_start_index))
        tok_to_orig_end_index = tok_to_orig_end_index + [-1] * (
            max_length - len(tok_to_orig_end_index))

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return InputFeature(
        input_ids=np.array(token_ids),
        seq_len=np.array(seq_len),
        tok_to_orig_start_index=np.array(tok_to_orig_start_index),
        tok_to_orig_end_index=np.array(tok_to_orig_end_index),
        labels=np.array(labels),
    )
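# Hedged illustration (not from the original source) of the pre-splitting step above: each
# Chinese character or punctuation mark becomes its own unit, while runs of other characters
# (Latin letters, digits) stay together. _is_chinese_or_punct below is a simplified stand-in
# for ChineseAndPunctuationExtractor.is_chinese_or_punct, not the real implementation.
def _split_mixed_text(text_raw: str) -> List[str]:
    def _is_chinese_or_punct(ch: str) -> bool:
        return '\u4e00' <= ch <= '\u9fff' or ch in ',。!?、;:()'

    sub_text, buff = [], ""
    for char in text_raw:
        if _is_chinese_or_punct(char):
            if buff:
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff:
        sub_text.append(buff)
    return sub_text

# _split_mixed_text("BERT模型2018年发布") -> ['BERT', '模', '型', '2018', '年', '发', '布']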
class TorchTransformersSquadInfer(Component):
    """This model wraps BertSQuADModel to make predictions on sequences longer than 512 tokens.

    It splits the context into chunks of `max_seq_length - 3 - len(question)` length, preserving
    sentence boundaries, and reassembles batches from chunks instead of full contexts to optimize
    performance. For example:

        batch_size = 5
        number_of_contexts == 2
        number of first context chunks == 8
        number of second context chunks == 2

        we will create two batches with 5 chunks

    For each context the best answer is selected via logits or scores from BertSQuADModel.

    Args:
        squad_model_config: path to DeepPavlov BertSQuADModel config file
        vocab_file: path to Bert vocab file
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        batch_size: size of batch to use during inference
        lang: either `en` or `ru`, used to select the sentence tokenizer
    """

    def __init__(self, squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en', **kwargs) -> None:
        config = json.load(open(squad_model_config))
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # Note: AutoTokenizer cannot be constructed directly from a vocab file; a concrete
            # tokenizer class (e.g. BertTokenizer(vocab_file=...)) is likely intended here.
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('only en and ru languages are supported')

    def __call__(self, contexts: List[str], questions: List[str],
                 **kwargs) -> Tuple[List[str], List[int], List[float]]:
        """Get predictions for given contexts and questions.

        Args:
            contexts: batch of contexts
            questions: batch of questions

        Returns:
            predictions: answer, answer start position, logits or scores
        """
        batch_indices = []
        contexts_to_predict = []
        questions_to_predict = []
        predictions = {}
        for i, (context, question) in enumerate(zip(contexts, questions)):
            context_subtokens = self.tokenizer.tokenize(context)
            question_subtokens = self.tokenizer.tokenize(question)
            max_chunk_len = self.max_seq_length - len(question_subtokens) - 3
            if 0 < max_chunk_len < len(context_subtokens):
                # Split the context into sentence-aligned chunks and remember which
                # original context each chunk came from.
                number_of_chunks = math.ceil(len(context_subtokens) / max_chunk_len)
                sentences = self.sent_tokenizer(context)
                for chunk in np.array_split(sentences, number_of_chunks):
                    contexts_to_predict += [' '.join(chunk)]
                    questions_to_predict += [question]
                    batch_indices += [i]
            else:
                contexts_to_predict += [context]
                questions_to_predict += [question]
                batch_indices += [i]

        for j in range(0, len(contexts_to_predict), self.batch_size):
            c_batch = contexts_to_predict[j:j + self.batch_size]
            q_batch = questions_to_predict[j:j + self.batch_size]
            ind_batch = batch_indices[j:j + self.batch_size]
            a_batch, a_st_batch, logits_batch = self.model(c_batch, q_batch)
            for a, a_st, logits, ind in zip(a_batch, a_st_batch, logits_batch, ind_batch):
                if ind in predictions:
                    predictions[ind] += [(a, a_st, logits)]
                else:
                    predictions[ind] = [(a, a_st, logits)]

        # For every original context keep the chunk answer with the highest score.
        answers, answer_starts, logits = [], [], []
        for ind in sorted(predictions.keys()):
            prediction = predictions[ind]
            best_answer_ind = np.argmax([p[2] for p in prediction])
            answers += [prediction[best_answer_ind][0]]
            answer_starts += [prediction[best_answer_ind][1]]
            logits += [prediction[best_answer_ind][2]]
        return answers, answer_starts, logits
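# Hedged sketch (not from the original source) of the chunking arithmetic used above: the
# context is split into ceil(len(context_subtokens) / max_chunk_len) roughly equal groups of
# whole sentences, so no sentence is ever broken across chunks. The helper name is illustrative.
def _split_context_into_chunks(sentences: List[str], context_subtoken_count: int,
                               max_chunk_len: int) -> List[str]:
    import math
    import numpy as np

    number_of_chunks = math.ceil(context_subtoken_count / max_chunk_len)
    return [' '.join(chunk) for chunk in np.array_split(sentences, number_of_chunks)]

# e.g. 8 sentences and 1200 context subtokens with max_chunk_len=500 -> 3 chunks of 3, 3, 2 sentences.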
def convert_examples_to_features(
        examples: List[InputExample],
        label_list: List[str],
        max_seq_length: int,
        tokenizer: AutoTokenizer,
        cls_token="[CLS]",
        cls_token_segment_id=0,
        sep_token="[SEP]",
        pad_token=0,
        pad_token_segment_id=0,
        pad_token_label_id=-100,
        sequence_a_segment_id=0,
        sequence_b_segment_id=1,
        mask_padding_with_zero=True,
        verbose=False
) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures` """
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Real label id on the first subword, padding id on the rest.
                label_ids.extend([label_map[label]] +
                                 [pad_token_label_id] * (len(word_tokens) - 1))

        if len(tokens) > max_seq_length - 2:
            logger.warning("Sequence length exceeds {} (cut).".format(max_seq_length))
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: %s", " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: %s", " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                label_ids=label_ids,
                decoder_mask=decoder_mask
            )
        )
    return features
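# Hedged illustration (not from the original source) of the subword/label alignment shared by
# the converters in this file: only the first subword of each word keeps the real label id,
# and the remaining subwords get pad_token_label_id (-100), which a loss configured with
# ignore_index=-100 (e.g. torch.nn.CrossEntropyLoss) skips. The helper name is illustrative.
def _align_labels_to_subwords(words: List[str], labels: List[str], label_map: dict,
                              tokenizer, pad_token_label_id: int = -100):
    token_ids, label_ids = [], []
    for word, label in zip(words, labels):
        subwords = tokenizer.tokenize(word) or [tokenizer.unk_token]
        token_ids.extend(tokenizer.convert_tokens_to_ids(subwords))
        # Real label only on the first subword; the rest are ignored by the loss.
        label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(subwords) - 1))
    return token_ids, label_ids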
def convert_examples_to_features(examples: List[InputExample],
                                 label_list: List[str],
                                 max_seq_length: int,
                                 tokenizer: AutoTokenizer,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=0,
                                 sep_token="[SEP]",
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-100,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True,
                                 verbose=False) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures` """
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        prod_start_index = prod_end_index = -1
        for wid, (word, label) in enumerate(zip(example.words, example.labels)):
            # Insert marker tokens around the product ("arm_description") span.
            if label == "B-arm_description":
                prod_start_index = len(tokens)
                tokens.append(PROD_START_MARKER)
                label_ids.append(pad_token_label_id)
            elif prod_start_index >= 0 and prod_end_index < 0 and label != "I-arm_description":
                prod_end_index = len(tokens)
                tokens.append(PROD_END_MARKER)
                label_ids.append(pad_token_label_id)
            word_tokens = tokenizer.tokenize(word)
            word_tokens = word_tokens[:5]  # avoid long chemical names
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word and padding ids for
                # the remaining tokens. Unknown labels are skipped (used by semi-supervised
                # training with partial annotations).
                label_ids.extend([label_map.get(label, pad_token_label_id)] +
                                 [pad_token_label_id] * (len(word_tokens) - 1))

        # Product at the end of the sequence
        if prod_start_index >= 0 and prod_end_index < 0:
            prod_end_index = len(tokens)
            tokens.append(PROD_END_MARKER)
            label_ids.append(pad_token_label_id)
        assert prod_start_index >= 0
        assert prod_end_index >= 0

        # Account for [CLS] and [SEP] with "- 2" (and with "- 3" for RoBERTa).
        if len(tokens) > max_seq_length - 2:  # [CLS], [SEP]
            logger.info(
                "Sentence length exceeds max_seq_length: {} ({})".format(
                    " ".join(tokens), len(tokens)))
            # This will fail if the PROD span is cut off.
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids
        prod_start_index += 1  # cls_token added to the beginning
        prod_end_index += 1

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        prod_start_mask = [0 for i in range(len(input_ids))]
        prod_start_mask[prod_start_index] = 1
        prod_end_mask = [0 for i in range(len(input_ids))]
        prod_end_mask[prod_end_index] = 1
        prod_mask = [0 for i in range(len(input_ids))]
        prod_mask[prod_start_index:prod_end_index + 1] = [1] * (prod_end_index + 1 - prod_start_index)
        # Optionally set segment ids for the product span:
        # segment_ids[prod_start_index:prod_end_index + 1] = [1] * (prod_end_index + 1 - prod_start_index)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - seq_length
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_start_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_end_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(prod_start_mask) == max_seq_length
        assert len(prod_end_mask) == max_seq_length
        assert len(prod_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: " + " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: " + " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: " + " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: " + " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=input_mask,
                          prod_start_mask=prod_start_mask,
                          prod_end_mask=prod_end_mask,
                          prod_mask=prod_mask,
                          token_type_ids=segment_ids,
                          label_ids=label_ids,
                          decoder_mask=decoder_mask))
    return features
def convert_examples_to_features(
    examples: List[InputExample],
    max_seq_len: int,
    tokenizer: AutoTokenizer,
    pad_token_label_id: int = -100,
    cls_token_segment_id: int = 0,
    pad_token_segment_id: int = 0,
    sequence_a_segment_id: int = 0,
    mask_padding_with_zero: bool = True,
) -> List[InputFeatures]:
    # Settings based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logging.debug("Processing example %d of %d", ex_index, len(examples))

        # Tokenize word by word (for NER)
        tokens: List[str] = []
        slot_labels_ids = []
        pos_labels_ids = []
        np_labels_ids, vp_labels_ids, entity_labels_ids, acronym_labels_ids = [], [], [], []
        for (
            word,
            slot_label,
            pos_label,
            np_label,
            vp_label,
            entity_label,
            acronym_label,
        ) in zip(
            example.words,
            example.slot_labels,
            example.pos_labels,
            example.np_labels,
            example.vp_labels,
            example.entity_labels,
            example.acronym_labels,
        ):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # Handle a badly encoded word
                word_tokens = [unk_token]
            tokens.extend(word_tokens)
            # Use the real label ID for the first token of the word, and padding IDs for the
            # remaining tokens.
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            pos_labels_ids.extend([int(pos_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            np_labels_ids.extend([int(np_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            vp_labels_ids.extend([int(vp_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            entity_labels_ids.extend([int(entity_label)] + [pad_token_label_id] * (len(word_tokens) - 1))
            acronym_labels_ids.extend([int(acronym_label)] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP].
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            slot_labels_ids = slot_labels_ids[:(max_seq_len - special_tokens_count)]
            pos_labels_ids = pos_labels_ids[:(max_seq_len - special_tokens_count)]
            np_labels_ids = np_labels_ids[:(max_seq_len - special_tokens_count)]
            vp_labels_ids = vp_labels_ids[:(max_seq_len - special_tokens_count)]
            entity_labels_ids = entity_labels_ids[:(max_seq_len - special_tokens_count)]
            acronym_labels_ids = acronym_labels_ids[:(max_seq_len - special_tokens_count)]

        # Add [SEP] token.
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        pos_labels_ids += [pad_token_label_id]
        np_labels_ids += [pad_token_label_id]
        vp_labels_ids += [pad_token_label_id]
        entity_labels_ids += [pad_token_label_id]
        acronym_labels_ids += [pad_token_label_id]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token.
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        pos_labels_ids = [pad_token_label_id] + pos_labels_ids
        np_labels_ids = [pad_token_label_id] + np_labels_ids
        vp_labels_ids = [pad_token_label_id] + vp_labels_ids
        entity_labels_ids = [pad_token_label_id] + entity_labels_ids
        acronym_labels_ids = [pad_token_label_id] + acronym_labels_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)
        pos_labels_ids = pos_labels_ids + ([pad_token_label_id] * padding_length)
        np_labels_ids = np_labels_ids + ([pad_token_label_id] * padding_length)
        vp_labels_ids = vp_labels_ids + ([pad_token_label_id] * padding_length)
        entity_labels_ids = entity_labels_ids + ([pad_token_label_id] * padding_length)
        acronym_labels_ids = acronym_labels_ids + ([pad_token_label_id] * padding_length)

        assert len(input_ids) == max_seq_len, \
            "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, \
            "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, \
            "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, \
            "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)
        assert len(pos_labels_ids) == max_seq_len, \
            "Error with pos labels length {} vs {}".format(len(pos_labels_ids), max_seq_len)
        assert len(np_labels_ids) == max_seq_len, \
            "Error with np labels length {} vs {}".format(len(np_labels_ids), max_seq_len)
        assert len(vp_labels_ids) == max_seq_len, \
            "Error with vp labels length {} vs {}".format(len(vp_labels_ids), max_seq_len)
        assert len(entity_labels_ids) == max_seq_len, \
            "Error with entity labels length {} vs {}".format(len(entity_labels_ids), max_seq_len)
        assert len(acronym_labels_ids) == max_seq_len, \
            "Error with acronym labels length {} vs {}".format(len(acronym_labels_ids), max_seq_len)

        intent_label_id = int(example.intent_label)

        if ex_index < 3:
            logging.debug(  # pylint: disable=logging-not-lazy
                "Example created. guid: %s, tokens: %s, input_ids: %s, "
                "attention_mask: %s, token_type_ids: %s, intent_label: %s (id = %d), "
                "slot_labels: %s, POS_labels: %s, NP_labels: %s, "
                "VP_labels: %s, entity_labels: %s, acronym_labels: %s",
                example.guid,
                " ".join([str(x) for x in tokens]),
                " ".join([str(x) for x in input_ids]),
                " ".join([str(x) for x in attention_mask]),
                " ".join([str(x) for x in token_type_ids]),
                example.intent_label,
                intent_label_id,
                " ".join([str(x) for x in slot_labels_ids]),
                " ".join([str(x) for x in pos_labels_ids]),
                " ".join([str(x) for x in np_labels_ids]),
                " ".join([str(x) for x in vp_labels_ids]),
                " ".join([str(x) for x in entity_labels_ids]),
                " ".join([str(x) for x in acronym_labels_ids]),
            )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_id=intent_label_id,
                slot_labels_ids=slot_labels_ids,
                pos_labels_ids=pos_labels_ids,
                np_labels_ids=np_labels_ids,
                vp_labels_ids=vp_labels_ids,
                entity_labels_ids=entity_labels_ids,
                acronym_labels_ids=acronym_labels_ids,
            ))
    return features
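# Hedged follow-up sketch (not from the original source): features produced by converters like
# the one above are typically wrapped into a torch TensorDataset for batching. The helper name
# is illustrative; field names follow the InputFeatures variant directly above and would need
# adjusting for the other variants in this file.
def _features_to_dataset(features: List[InputFeatures]):
    import torch
    from torch.utils.data import TensorDataset

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_slot_labels_ids = torch.tensor([f.slot_labels_ids for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_slot_labels_ids)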