Example #1
    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataBundle(datasets=self.load(paths))
        _train_ds = ([info.datasets[name] for name in train_ds]
                     if train_ds else info.datasets.values())
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name,
                                new_field_name=input_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)
        info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)
        return info
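A minimal, hedged usage sketch of the process() method above. The loader class name (TextClassificationLoader) and the data paths are illustrative assumptions; VocabularyOption is the fastNLP options class named in the signature, though its import path may differ across fastNLP versions.

# Hedged sketch -- the class name and paths below are assumptions, not from the source.
from fastNLP.core.vocabulary import VocabularyOption  # import path may vary by fastNLP version

loader = TextClassificationLoader()        # hypothetical class that owns process()
bundle = loader.process(
    paths={'train': 'data/train.txt', 'dev': 'data/dev.txt', 'test': 'data/test.txt'},
    train_ds=['train'],                    # build both vocabularies from the train split only
    src_vocab_op=VocabularyOption(max_size=30000))
print(len(bundle.vocabs['words']), len(bundle.vocabs['target']))
for name, ds in bundle.datasets.items():   # every split is now indexed and flagged as input/target
    print(name, len(ds))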
Example #2
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars
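        # Illustration (not from the source): wordtochar(['New', 'York']) returns
        # ['n', 'e', 'w', '', 'y', 'o', 'r', 'k'] -- an empty string marks each word
        # boundary, and the trailing separator is removed by chars.pop().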

        input_name, target_name = 'words', 'target'
        info.vocabs = {}

        # split the words into characters
        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
Example #3
def _indexize(data_bundle,
              input_field_names=Const.INPUT,
              target_field_names=Const.TARGET,
              vocabulary=None):
    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]
    for input_field_name in input_field_names:
        if vocabulary is None:
            src_vocab = Vocabulary()
            src_vocab.from_dataset(
                *[
                    ds for name, ds in data_bundle.iter_datasets()
                    if 'train' in name
                ],
                field_name=input_field_name,
                no_create_entry_dataset=[
                    ds for name, ds in data_bundle.iter_datasets()
                    if ('train' not in name) and (
                        ds.has_field(input_field_name))
                ])

        else:
            src_vocab = vocabulary
        src_vocab.index_dataset(*data_bundle.datasets.values(),
                                field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)

    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(
            *[
                ds for name, ds in data_bundle.iter_datasets()
                if 'train' in name
            ],
            field_name=target_field_name,
            no_create_entry_dataset=[
                ds for name, ds in data_bundle.iter_datasets()
                if ('train' not in name) and (ds.has_field(target_field_name))
            ])
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                       f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                       f"data set but not in train data set!.\n" \
                       f"These label(s) are {tgt_vocab._no_create_word}"
            print(warn_msg)
        tgt_vocab.index_dataset(*[
            ds for ds in data_bundle.datasets.values()
            if ds.has_field(target_field_name)
        ],
                                field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)

    return data_bundle
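A hedged sketch of how _indexize might be called, assuming it is used inside the module that defines it; the toy DataSets are illustrative, and DataBundle.get_vocab / get_dataset are assumed to be available (fastNLP >= 0.5).

# Hedged sketch -- the toy data below is illustrative, not from the source.
from fastNLP import DataSet
from fastNLP.io import DataBundle

train = DataSet({'words': [['a', 'b'], ['a', 'c']], 'target': ['pos', 'neg']})
dev = DataSet({'words': [['a', 'd']], 'target': ['pos']})
bundle = DataBundle(datasets={'train': train, 'dev': dev})

bundle = _indexize(bundle)                    # defaults to Const.INPUT ('words') / Const.TARGET ('target')
print(bundle.get_vocab('words').word2idx)     # 'd' appears only in dev, so it is tracked as a no-create entry
print(bundle.get_dataset('dev')[0]['words'])  # fields are now lists of indices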
Example #4
    def setup(self, stage="train"):
        if stage == 'train':
            data = self.hparams.data
            # build dataset
            # indexes: which columns of the CoNLL file to read; this depends on your file format and may need modification.
            loader = ConllLoader([word, pos, head], indexes=[1, 3, 6])
            train_dataset = loader._load(data.train_file)
            val_dataset = loader._load(data.val_file)
            test_dataset = loader._load(data.test_file)

            def clean_word(words):
                def clean_number(word):
                    def is_number(s):
                        try:
                            float(s)
                            return True
                        except ValueError:
                            return False
                    if is_number(word):
                        return '0'
                    else:
                        return word
                # import re
                # def clean_number(w):
                #     new_w = re.sub('[0-9]{1,}([,.]?[0-9]*)*', '0', w)
                #     return new_w
                return [clean_number(word) for word in words]

            def numerize(heads):
                return [int(head) for head in heads]

            train_dataset.apply_field(clean_word, word, new_field_name=word)
            val_dataset.apply_field(clean_word, word, new_field_name=word)
            test_dataset.apply_field(clean_word, word, new_field_name=word)
            test_dataset.apply_field(numerize, head, new_field_name=head)
            train_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
            val_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
            test_dataset.add_seq_len(field_name=word, new_field_name=seq_len)


            pos_vocab = Vocabulary()
            pos_vocab.from_dataset(train_dataset, field_name=pos)

            if data.wordposastoken:
                '''
                Combine the POS tag and the word into a single token.
                Roughly speaking, we build the vocabulary based on the co-occurrence of (NT, 'word').
                Then, we replace every unknown word with its corresponding POS tag.
                Please refer to
                "Dependency Grammar Induction with Neural Lexicalization and Big Training Data"
                for details.
                '''
                def combine(x):
                    sent = list(zip(x[pos], x[word]))
                    return [x[0] + "_" + x[1] for x in sent]
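                # Illustration (not from the source): with x[pos] == ['NT', 'VBD'] and
                # x[word] == ['stocks', 'fell'], combine(x) returns ['NT_stocks', 'VBD_fell'].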

                train_dataset.apply(combine, new_field_name=word)
                val_dataset.apply(combine, new_field_name=word)
                test_dataset.apply(combine, new_field_name=word)
                word_vocab = Vocabulary(min_freq=data.min_freq)
                word_vocab.from_dataset(train_dataset, field_name=word)

                '''
                Replace unknown words with their POS tags.
                '''

                word_vocab.add_word_lst(pos_vocab.word2idx)
                word_vocab.index_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(val_dataset, field_name=word)
                word_vocab.index_dataset(test_dataset, field_name=word)
                unk = 1

                def replace(x):
                    poses = x[pos]
                    words = x[word]
                    for i in range(len(words)):
                        # 1 stands for unk; we replace the unknown word with its POS tag.
                        if words[i] == unk:
                            pos_tag_name = poses[i]
                            words[i] = word_vocab[pos_tag_name]
                    return words
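                # Illustration with made-up indices (not from the source): if word_vocab maps
                # 'NT_stocks' to 57 and the bare POS tag 'VBD' to 312, an instance indexed as
                # [57, 1] with POS tags ['NT', 'VBD'] becomes [57, 312]; the unknown token
                # (index 1) is replaced by the vocabulary index of its POS tag.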

                train_dataset.apply(replace, new_field_name=word)
                val_dataset.apply(replace, new_field_name=word)
                test_dataset.apply(replace, new_field_name=word)

                if data.use_emb:
                    if data.emb_type == 'fasttext':
                        model = FastText.load(data.embedding)
                    else:
                        raise NotImplementedError
                    word_vec = model.wv
                    emb = np.random.rand(len(word_vocab), data.word_emb_size)
                    for idx, w in word_vocab.idx2word.items():
                        if "_" in w:
                            w = w.split('_')[-1]
                            emb[idx] = word_vec[w]
                    emb = torch.from_numpy(emb)
                    self.pretrained_emb = emb.to(self.device).float()

                word2pos = np.zeros(shape=(len(word_vocab),))

                # map each token in the vocabulary to its corresponding POS tag.
                for idx, w in word_vocab.idx2word.items():
                    if idx == 0:
                        continue
                    if idx == 1:
                        word2pos[1] = 1
                        continue
                    if "_" in w:
                        pos_tag_name = w.split("_")[0]
                        word2pos[idx] = pos_vocab.word2idx[pos_tag_name]
                    else:
                        word2pos[idx] = pos_vocab.word2idx[w]
                self.word2pos = torch.from_numpy(word2pos).long().to(self.device)


            # if POS tag and word are not combined into a single token.
            else:
                # choose to build the vocabulary with a fixed size or based on word frequency.
                if data.vocab_type == 'max_size':
                    word_vocab = Vocabulary(max_size=data.vocab_size)
                else:
                    word_vocab = Vocabulary(min_freq=data.min_freq)
                word_vocab.from_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(val_dataset, field_name=word)
                word_vocab.index_dataset(test_dataset, field_name=word)

            train_dataset.set_input(pos, word, seq_len)
            val_dataset.set_input(pos, word, seq_len)
            test_dataset.set_input(pos, word, seq_len)
            test_dataset.set_target(head)

            pos_vocab.index_dataset(train_dataset, field_name=pos)
            pos_vocab.index_dataset(val_dataset, field_name=pos)
            pos_vocab.index_dataset(test_dataset, field_name=pos)

            train_dataset_init = None

            '''
            Use an external unsupervised parser's output as a "pseudo gold tree" to initialize our model.
            '''
            if self.hparams.train.initializer == 'external':
                # depends on your file format.
                conll_loader = ConllLoader([word, pos, head], indexes=[1, 4, 6])
                train_dataset_init = conll_loader._load(data.external_parser)
                train_dataset_init.add_seq_len(field_name=word, new_field_name=seq_len)
                train_dataset_init.apply_field(clean_word, word, new_field_name=word)
                train_dataset_init.apply_field(numerize, head, new_field_name=head)

                if not data.wordposastoken:
                    word_vocab.index_dataset(train_dataset_init, field_name=word)
                else:
                    train_dataset_init.apply(combine, new_field_name=word)
                    word_vocab.index_dataset(train_dataset_init, field_name=word)
                    train_dataset_init.apply(replace, new_field_name=word)

                pos_vocab.index_dataset(train_dataset_init, field_name=pos)

                if self.hparams.joint_training:
                    import copy
                    train_dataset_init_for_model2 = copy.deepcopy(train_dataset_init)

                # first-order model
                if (self.hparams.model.model_name == 'NeuralDMV') or (self.hparams.model.model_name == 'LexicalizedNDMV'):
                    rule_generator = RuleGenerator1o()

                # second-order model
                elif self.hparams.model.model_name == 'SiblingNDMV':
                    rule_generator = RuleGeneratorSib()

                elif self.hparams.model.model_name == 'JointFirstSecond':
                    rule_generator = RuleGenerator1o()
                    rule_generator_for_model2 = RuleGeneratorSib()

                else:
                    raise NameError

                self.setup_init_dataset(train_dataset_init, rule_generator)

                if self.hparams.joint_training:
                    self.setup_init_dataset(train_dataset_init_for_model2, rule_generator_for_model2)


            elif self.hparams.train.initializer == 'km':
                train_dataset_init = train_dataset

            self.pos_vocab = pos_vocab
            self.word_vocab = word_vocab
            self.train_dataset = train_dataset
            self.val_dataset = val_dataset
            self.test_dataset = test_dataset
            self.train_dataset_init = train_dataset_init
            if self.hparams.joint_training:
                self.train_dataset_init_for_model2 = train_dataset_init_for_model2

        else:
            raise NotImplementedError
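A minimal, hedged sketch of how this setup() might be driven. The DataModule class name and the hparams layout below are assumptions reconstructed from the attribute accesses in the method, not a confirmed configuration schema.

# Hedged sketch -- the class name and config fields are assumptions inferred from setup() above.
from types import SimpleNamespace

hparams = SimpleNamespace(
    data=SimpleNamespace(
        train_file='train.conll', val_file='dev.conll', test_file='test.conll',
        wordposastoken=False, vocab_type='min_freq', min_freq=2, vocab_size=10000,
        use_emb=False, external_parser=None),
    train=SimpleNamespace(initializer='km'),
    model=SimpleNamespace(model_name='NeuralDMV'),
    joint_training=False)

dm = DependencyDataModule(hparams)   # hypothetical class that owns setup()
dm.setup(stage='train')
print(len(dm.word_vocab), len(dm.pos_vocab), len(dm.train_dataset))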
Example #5
    def process(self,
                paths,
                vocab_size,
                vocab_path,
                sent_max_len,
                doc_max_timesteps,
                domain=False,
                tag=False,
                load_vocab=True):
        """
        :param paths: dict  path for each dataset
        :param vocab_size: int  max_size for vocab
        :param vocab_path: str  vocab path
        :param sent_max_len: int    max token number of the sentence
        :param doc_max_timesteps: int   max sentence number of the document
        :param domain: bool  build vocab for publication, use 'X' for unknown
        :param tag: bool  build vocab for tag, use 'X' for unknown
        :param load_vocab: bool  build vocab (False) or load vocab (True)
        :return: DataInfo
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """
        def _pad_sent(text_wd):
            pad_text_wd = []
            for sent_wd in text_wd:
                if len(sent_wd) < sent_max_len:
                    pad_num = sent_max_len - len(sent_wd)
                    sent_wd.extend([WORD_PAD] * pad_num)
                else:
                    sent_wd = sent_wd[:sent_max_len]
                pad_text_wd.append(sent_wd)
            return pad_text_wd

        def _token_mask(text_wd):
            token_mask_list = []
            for sent_wd in text_wd:
                token_num = len(sent_wd)
                if token_num < sent_max_len:
                    mask = [1] * token_num + [0] * (sent_max_len - token_num)
                else:
                    mask = [1] * sent_max_len
                token_mask_list.append(mask)
            return token_mask_list

        def _pad_label(label):
            text_len = len(label)
            if text_len < doc_max_timesteps:
                pad_label = label + [0] * (doc_max_timesteps - text_len)
            else:
                pad_label = label[:doc_max_timesteps]
            return pad_label

        def _pad_doc(text_wd):
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                padding = [WORD_PAD] * sent_max_len
                pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
            else:
                pad_text = text_wd[:doc_max_timesteps]
            return pad_text

        def _sent_mask(text_wd):
            text_len = len(text_wd)
            if text_len < doc_max_timesteps:
                sent_mask = [1] * text_len + [0] * (doc_max_timesteps -
                                                    text_len)
            else:
                sent_mask = [1] * doc_max_timesteps
            return sent_mask
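        # Illustration of the helpers above (not from the source), with sent_max_len=4 and
        # doc_max_timesteps=3:
        #   _pad_sent([['a', 'b']])   -> [['a', 'b', WORD_PAD, WORD_PAD]]
        #   _token_mask([['a', 'b']]) -> [[1, 1, 0, 0]]
        #   _sent_mask([s1, s2])      -> [1, 1, 0]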

        datasets = {}
        train_ds = None
        for key, value in paths.items():
            ds = self.load(value)
            # pad sent
            ds.apply(lambda x: _pad_sent(x["text_wd"]),
                     new_field_name="pad_text_wd")
            ds.apply(lambda x: _token_mask(x["text_wd"]),
                     new_field_name="pad_token_mask")
            # pad document
            ds.apply(lambda x: _pad_doc(x["pad_text_wd"]),
                     new_field_name="pad_text")
            ds.apply(lambda x: _sent_mask(x["pad_text_wd"]),
                     new_field_name="seq_len")
            ds.apply(lambda x: _pad_label(x["flatten_label"]),
                     new_field_name="pad_label")

            # rename field
            ds.rename_field("pad_text", Const.INPUT)
            ds.rename_field("seq_len", Const.INPUT_LEN)
            ds.rename_field("pad_label", Const.TARGET)

            # set input and target
            ds.set_input(Const.INPUT, Const.INPUT_LEN)
            ds.set_target(Const.TARGET, Const.INPUT_LEN)

            datasets[key] = ds
            if "train" in key:
                train_ds = datasets[key]

        vocab_dict = {}
        if not load_vocab:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Missing a train file to build the vocabulary!")

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=WORD_PAD,
                                unknown=WORD_UNK)
            vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
            vocab_dict["vocab"] = vocabs
        else:
            logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
            word_list = []
            with open(vocab_path, 'r', encoding='utf8') as vocab_f:
                cnt = 2  # pad and unk
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break
            vocabs = Vocabulary(max_size=vocab_size,
                                padding=WORD_PAD,
                                unknown=WORD_UNK)
            vocabs.add_word_lst(word_list)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        if domain:
            domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
            domaindict.from_dataset(train_ds, field_name="publication")
            vocab_dict["domain"] = domaindict
        if tag:
            tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
            tagdict.from_dataset(train_ds, field_name="tag")
            vocab_dict["tag"] = tagdict

        for ds in datasets.values():
            vocab_dict["vocab"].index_dataset(ds,
                                              field_name=Const.INPUT,
                                              new_field_name=Const.INPUT)

        return DataInfo(vocabs=vocab_dict, datasets=datasets)
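A hedged usage sketch of the process() method above. The loader class name, file paths, and vocab file are illustrative; WORD_PAD / WORD_UNK and DataInfo come from the surrounding module (an older fastNLP-style API) and are taken as given.

# Hedged sketch -- the class name and paths below are assumptions, not from the source.
loader = SummarizationLoader()             # hypothetical class that owns process()
data_info = loader.process(
    paths={'train': 'data/train.jsonl', 'val': 'data/val.jsonl'},
    vocab_size=50000,
    vocab_path='data/vocab.txt',
    sent_max_len=100,
    doc_max_timesteps=50,
    domain=True,                           # also build a publication vocabulary
    load_vocab=True)                       # read the existing vocab file instead of rebuilding it
train_ds = data_info.datasets['train']
print(train_ds[0]['words'])                # doc_max_timesteps sentences, each padded to sent_max_len
print(len(data_info.vocabs['vocab']), len(data_info.vocabs['domain']))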
Example #6
    def process(
        self,
        paths: Union[str, Dict[str, str]],
        dataset_name: str = None,
        to_lower=False,
        seq_len_type: str = None,
        bert_tokenizer: str = None,
        cut_text: int = None,
        get_index=True,
        auto_pad_length: int = None,
        auto_pad_token: str = '<pad>',
        set_input: Union[list, str, bool] = True,
        set_target: Union[list, str, bool] = True,
        concat: Union[str, list, bool] = None,
    ) -> DataBundle:
        """
        :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹,
            则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和
            对应的全路径文件名。
        :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义
            这个数据集的名字,如果不定义则默认为train。
        :param bool to_lower: 是否将文本自动转为小写。默认值为False。
        :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` :
            提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和
            attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len
        :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径
        :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。
        :param bool get_index: 是否需要根据词表将文本转为index
        :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad
        :param str auto_pad_token: 自动pad的内容
        :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False
            则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input,
            于此同时其他field不会被设置为input。默认值为True。
        :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。
        :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个<sep>。
            如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果
            传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]'].
        :return:
        """
        if isinstance(set_input, str):
            set_input = [set_input]
        if isinstance(set_target, str):
            set_target = [set_target]
        if isinstance(set_input, bool):
            auto_set_input = set_input
        else:
            auto_set_input = False
        if isinstance(set_target, bool):
            auto_set_target = set_target
        else:
            auto_set_target = False
        if isinstance(paths, str):
            if os.path.isdir(paths):
                path = {
                    n: os.path.join(paths, self.paths[n])
                    for n in self.paths.keys()
                }
            else:
                path = {
                    dataset_name if dataset_name is not None else 'train':
                    paths
                }
        else:
            path = paths

        data_info = DataBundle()
        for data_name in path.keys():
            data_info.datasets[data_name] = self._load(path[data_name])

        for data_name, data_set in data_info.datasets.items():
            if auto_set_input:
                data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
            if auto_set_target:
                if Const.TARGET in data_set.get_field_names():
                    data_set.set_target(Const.TARGET)

        if to_lower:
            for data_name, data_set in data_info.datasets.items():
                data_set.apply(
                    lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                    new_field_name=Const.INPUTS(0),
                    is_input=auto_set_input)
                data_set.apply(
                    lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                    new_field_name=Const.INPUTS(1),
                    is_input=auto_set_input)

        if bert_tokenizer is not None:
            if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
                PRETRAIN_URL = _get_base_url('bert')
                model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer.lower()]
                model_url = PRETRAIN_URL + model_name
                model_dir = cached_path(model_url)
                # check whether the model directory exists
            elif os.path.isdir(bert_tokenizer):
                model_dir = bert_tokenizer
            else:
                raise ValueError(
                    f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

            words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
            with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
                lines = f.readlines()
            lines = [line.strip() for line in lines]
            words_vocab.add_word_lst(lines)
            words_vocab.build_vocab()

            tokenizer = BertTokenizer.from_pretrained(model_dir)

            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x: tokenizer.tokenize(' '.join(x[fields])),
                            new_field_name=fields,
                            is_input=auto_set_input)

        if isinstance(concat, bool):
            concat = 'default' if concat else None
        if concat is not None:
            if isinstance(concat, str):
                CONCAT_MAP = {
                    'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                    'default': ['', '<sep>', '', '']
                }
                if concat.lower() in CONCAT_MAP:
                    concat = CONCAT_MAP[concat]
                else:
                    concat = 4 * [concat]
            assert len(concat) == 4, \
                f'Please pass a list of 4 symbols: the marker at the beginning of the first sentence, ' \
                f'the end of the first sentence, the beginning of the second sentence, and the end of ' \
                f'the second sentence. Your input is {concat}'

            for data_name, data_set in data_info.datasets.items():
                data_set.apply(
                    lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] +
                              [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                    new_field_name=Const.INPUT)
                data_set.apply(
                    lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                    new_field_name=Const.INPUT,
                    is_input=auto_set_input)

        if seq_len_type is not None:
            if seq_len_type == 'seq_len':
                for data_name, data_set in data_info.datasets.items():
                    for fields in data_set.get_field_names():
                        if Const.INPUT in fields:
                            data_set.apply(lambda x: len(x[fields]),
                                           new_field_name=fields.replace(
                                               Const.INPUT, Const.INPUT_LEN),
                                           is_input=auto_set_input)
            elif seq_len_type == 'mask':
                for data_name, data_set in data_info.datasets.items():
                    for fields in data_set.get_field_names():
                        if Const.INPUT in fields:
                            data_set.apply(lambda x: [1] * len(x[fields]),
                                           new_field_name=fields.replace(
                                               Const.INPUT, Const.INPUT_LEN),
                                           is_input=auto_set_input)
            elif seq_len_type == 'bert':
                for data_name, data_set in data_info.datasets.items():
                    if Const.INPUT not in data_set.get_field_names():
                        raise KeyError(
                            f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                            f'got {data_set.get_field_names()}')
                    data_set.apply(lambda x: [0] *
                                   (len(x[Const.INPUTS(0)]) + 2) + [1] *
                                   (len(x[Const.INPUTS(1)]) + 1),
                                   new_field_name=Const.INPUT_LENS(0),
                                   is_input=auto_set_input)
                    data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                                   new_field_name=Const.INPUT_LENS(1),
                                   is_input=auto_set_input)

        if auto_pad_length is not None:
            cut_text = min(
                auto_pad_length,
                cut_text if cut_text is not None else auto_pad_length)

        if cut_text is not None:
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if (Const.INPUT
                            in fields) or ((Const.INPUT_LEN in fields) and
                                           (seq_len_type != 'seq_len')):
                        data_set.apply(lambda x: x[fields][:cut_text],
                                       new_field_name=fields,
                                       is_input=auto_set_input)

        data_set_list = [d for n, d in data_info.datasets.items()]
        assert len(data_set_list) > 0, 'There are NO data sets in data info!'

        if bert_tokenizer is None:
            words_vocab = Vocabulary(padding=auto_pad_token)
            words_vocab = words_vocab.from_dataset(
                *[d for n, d in data_info.datasets.items() if 'train' in n],
                field_name=[
                    n for n in data_set_list[0].get_field_names()
                    if (Const.INPUT in n)
                ],
                no_create_entry_dataset=[
                    d for n, d in data_info.datasets.items()
                    if 'train' not in n
                ])
        target_vocab = Vocabulary(padding=None, unknown=None)
        target_vocab = target_vocab.from_dataset(
            *[d for n, d in data_info.datasets.items() if 'train' in n],
            field_name=Const.TARGET)
        data_info.vocabs = {
            Const.INPUT: words_vocab,
            Const.TARGET: target_vocab
        }

        if get_index:
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x:
                            [words_vocab.to_index(w) for w in x[fields]],
                            new_field_name=fields,
                            is_input=auto_set_input)

                if Const.TARGET in data_set.get_field_names():
                    data_set.apply(
                        lambda x: target_vocab.to_index(x[Const.TARGET]),
                        new_field_name=Const.TARGET,
                        is_input=auto_set_input,
                        is_target=auto_set_target)

        if auto_pad_length is not None:
            if seq_len_type == 'seq_len':
                raise RuntimeError(
                    f'the sequence will be padded with the length {auto_pad_length}, '
                    f'so the seq_len_type cannot be `{seq_len_type}`!')
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(
                            lambda x: x[fields] +
                            [words_vocab.to_index(words_vocab.padding)] *
                            (auto_pad_length - len(x[fields])),
                            new_field_name=fields,
                            is_input=auto_set_input)
                    elif (Const.INPUT_LEN
                          in fields) and (seq_len_type != 'seq_len'):
                        data_set.apply(lambda x: x[fields] + [0] *
                                       (auto_pad_length - len(x[fields])),
                                       new_field_name=fields,
                                       is_input=auto_set_input)

        for data_name, data_set in data_info.datasets.items():
            if isinstance(set_input, list):
                data_set.set_input(*[
                    inputs for inputs in set_input
                    if inputs in data_set.get_field_names()
                ])
            if isinstance(set_target, list):
                data_set.set_target(*[
                    target for target in set_target
                    if target in data_set.get_field_names()
                ])

        return data_info
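A hedged usage sketch of the matching process() method above. The class name and the data folder are assumptions; the keyword values only exercise a few of the options described in the docstring.

# Hedged sketch -- the class name and paths below are assumptions, not from the source.
loader = MatchingLoader()                  # hypothetical class that owns process() and self.paths
bundle = loader.process(
    paths='data/snli',                     # a folder; split file names come from self.paths
    to_lower=True,
    seq_len_type='seq_len',                # adds a length field for each input field
    concat=None,                           # keep the two sentences as separate fields
    get_index=True)
print(bundle.datasets['train'].get_field_names())
print(len(bundle.vocabs['words']), len(bundle.vocabs['target']))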