def process(self, paths, train_ds: Iterable[str] = None, src_vocab_op: VocabularyOption = None, tgt_vocab_op: VocabularyOption = None, src_embed_op: EmbeddingOption = None):
    """Load the datasets at ``paths``, build source/target vocabularies,
    index every split, and mark the input/target fields.

    :param paths: location(s) understood by ``self.load``.
    :param train_ds: names of the splits used to build the vocabularies;
        when not given, all loaded splits are used.
    :param src_vocab_op: options forwarded to the source ``Vocabulary``.
    :param tgt_vocab_op: options forwarded to the target ``Vocabulary``.
    :param src_embed_op: when given, pre-trained embeddings are loaded with
        the source vocabulary and stored under the input field name.
    :return: a ``DataBundle`` holding datasets, vocabs and optional embeddings.
    """
    input_name, target_name = 'words', 'target'

    # Honour user-supplied vocabulary options, if any.
    if src_vocab_op is None:
        src_vocab = Vocabulary()
    else:
        src_vocab = Vocabulary(**src_vocab_op)
    if tgt_vocab_op is None:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
    else:
        tgt_vocab = Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))

    # Vocabularies are built from the requested splits only (default: all).
    if train_ds:
        vocab_sources = [info.datasets[name] for name in train_ds]
    else:
        vocab_sources = info.datasets.values()
    src_vocab.from_dataset(*vocab_sources, field_name=input_name)
    tgt_vocab.from_dataset(*vocab_sources, field_name=target_name)

    # Index every split in place.
    all_splits = info.datasets.values()
    src_vocab.index_dataset(*all_splits, field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(*all_splits, field_name=target_name, new_field_name=target_name)

    info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

    if src_embed_op is not None:
        src_embed_op.vocab = src_vocab
        info.embeddings[input_name] = EmbedLoader.load_with_vocab(**src_embed_op)

    for dataset in info.datasets.values():
        dataset.set_input(input_name)
        dataset.set_target(target_name)
    return info
def process(self, paths: Union[str, Dict[str, str]], src_vocab_opt: VocabularyOption = None, tgt_vocab_opt: VocabularyOption = None, src_embed_opt: EmbeddingOption = None, char_level_op=False):
    """Load the classification splits, optionally add a character-level field,
    build/apply the word and target vocabularies, and optionally attach
    pre-trained embeddings.

    :param paths: a folder / single file path, or a {split_name: path} dict.
    :param src_vocab_opt: options forwarded to the source ``Vocabulary``.
    :param tgt_vocab_opt: options forwarded to the target ``Vocabulary``.
    :param src_embed_opt: when given, embeddings are loaded with the source
        vocabulary and stored under ``'words'``.
    :param char_level_op: when True, also emit a ``'chars'`` field with each
        sentence split into lower-cased characters.
    :return: a populated ``DataBundle``.
    """
    paths = check_dataloader_paths(paths)

    info = DataBundle()
    datasets = {name: self.load(path) for name, path in paths.items()}

    def wordtochar(words):
        # Flatten a word sequence into lower-cased characters, with an empty
        # string between words; the trailing separator is removed.
        chars = []
        for token in words:
            chars.extend(token.lower())
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}
    # Split each sentence into characters when requested.
    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    src_vocab = Vocabulary(**src_vocab_opt) if src_vocab_opt is not None else Vocabulary()
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    if tgt_vocab_opt is not None:
        tgt_vocab = Vocabulary(**tgt_vocab_opt)
    else:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        info.embeddings['words'] = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)

    for dataset in info.datasets.values():
        dataset.set_input("words")
        dataset.set_target("target")
    return info
def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Const.TARGET, vocabulary=None):
    """Build vocabularies for the given fields and index every dataset.

    Input vocabularies are built from the train split(s); non-train splits
    only contribute no-create entries. Target vocabularies carry no
    unknown/padding tokens, and labels that appear outside train are
    reported on stdout.

    :param data_bundle: the bundle whose datasets are indexed in place.
    :param input_field_names: one input field name, or a list of them.
    :param target_field_names: one target field name, or a list of them.
    :param vocabulary: optional pre-built vocabulary reused for all inputs.
    :return: the same ``data_bundle``.
    """
    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]

    def train_sets():
        # every dataset whose name contains 'train'
        return [ds for name, ds in data_bundle.iter_datasets() if 'train' in name]

    def non_train_sets(field):
        # non-train datasets that actually carry the field
        return [ds for name, ds in data_bundle.iter_datasets()
                if ('train' not in name) and ds.has_field(field)]

    for field in input_field_names:
        if vocabulary is None:
            src_vocab = Vocabulary()
            src_vocab.from_dataset(*train_sets(),
                                   field_name=field,
                                   no_create_entry_dataset=non_train_sets(field))
        else:
            src_vocab = vocabulary
        src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=field)
        data_bundle.set_vocab(src_vocab, field)

    for field in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(*train_sets(),
                               field_name=field,
                               no_create_entry_dataset=non_train_sets(field))
        if len(tgt_vocab._no_create_word) > 0:
            # Labels seen only outside the train split deserve a warning.
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{field}` labels" \
                       f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                       f"data set but not in train data set!.\n" \
                       f"These label(s) are {tgt_vocab._no_create_word}"
            print(warn_msg)
        tgt_vocab.index_dataset(*[ds for ds in data_bundle.datasets.values()
                                  if ds.has_field(field)],
                                field_name=field)
        data_bundle.set_vocab(tgt_vocab, field)
    return data_bundle
def setup(self, stage="train"):
    """Prepare datasets and vocabularies for grammar-induction training.

    Loads the train/val/test CoNLL files, normalizes numeric tokens,
    optionally fuses each (POS, word) pair into one token, builds and
    applies the word and POS vocabularies, and optionally prepares an
    initialization dataset parsed by an external unsupervised parser.
    Results are stored as attributes on ``self``. Only ``stage == 'train'``
    is implemented; anything else raises ``NotImplementedError``.
    """
    if stage == 'train':
        data = self.hparams.data
        # build dataset
        # indexes: the ith column of the conll file. it depends on your file and may need modification.
        loader = ConllLoader([word, pos, head], indexes=[1, 3, 6])
        train_dataset = loader._load(data.train_file)
        val_dataset = loader._load(data.val_file)
        test_dataset = loader._load(data.test_file)

        def clean_word(words):
            # Replace every token that parses as a float with the literal '0'.
            def clean_number(word):
                def is_number(s):
                    try:
                        float(s)
                        return True
                    except ValueError:
                        return False
                if is_number(word):
                    return '0'
                else:
                    return word
            # import re
            # def clean_number(w):
            #     new_w = re.sub('[0-9]{1,}([,.]?[0-9]*)*', '0', w)
            #     return new_w
            return [clean_number(word) for word in words]

        def numerize(heads):
            # Head indices are loaded as strings; convert them to ints.
            return [int(head) for head in heads]

        train_dataset.apply_field(clean_word, word, new_field_name=word)
        val_dataset.apply_field(clean_word, word, new_field_name=word)
        test_dataset.apply_field(clean_word, word, new_field_name=word)
        # Only the test split converts heads: it is the only split whose
        # gold heads are later set as the evaluation target.
        test_dataset.apply_field(numerize, head, new_field_name=head)
        train_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
        val_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
        test_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
        pos_vocab = Vocabulary()
        pos_vocab.from_dataset(train_dataset, field_name=pos)
        if data.wordposastoken:
            '''
            combining pos tag and word as a single token. Largely speaking, we build the vocabulary
            based on the co-occurance of (NT, 'word')
            Then, we replace all unknown word with their corresponding POS tag.
            Please refer "Dependency Grammar Induction with Neural Lexicalization and Big Training Data" for details.
            '''
            def combine(x):
                # Fuse each (pos, word) pair into a single "POS_word" token.
                sent = list(zip(x[pos], x[word]))
                return [x[0] + "_" + x[1] for x in sent]

            train_dataset.apply(combine, new_field_name=word)
            val_dataset.apply(combine, new_field_name=word)
            test_dataset.apply(combine, new_field_name=word)
            word_vocab = Vocabulary(min_freq=data.min_freq)
            word_vocab.from_dataset(train_dataset, field_name=word)
            '''
            Replace the unknown word with their POS tag.
            '''
            # Bare POS tags are added to the vocabulary so that an unknown
            # fused token can fall back to its POS tag in replace() below.
            word_vocab.add_word_lst(pos_vocab.word2idx)
            word_vocab.index_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(val_dataset, field_name=word)
            word_vocab.index_dataset(test_dataset, field_name=word)
            unk = 1

            def replace(x):
                poses = x[pos]
                words = x[word]
                for i in range(len(words)):
                    # 1 stands for unk. we replace the unknown word with its POS tags.
                    if words[i] == unk:
                        pos_tag_name = poses[i]
                        words[i] = word_vocab[pos_tag_name]
                return words

            train_dataset.apply(replace, new_field_name=word)
            val_dataset.apply(replace, new_field_name=word)
            test_dataset.apply(replace, new_field_name=word)
            if data.use_emb:
                if data.emb_type == 'fasttext':
                    model = FastText.load(data.embedding)
                else:
                    raise NotImplementedError
                word_vec = model.wv
                # Random init; every row is overwritten below.
                # NOTE(review): assumes word_vec returns a vector for any
                # word (FastText can synthesize OOV vectors) — confirm.
                emb = np.random.rand(len(word_vocab), data.word_emb_size)
                for idx, w in word_vocab.idx2word.items():
                    if "_" in w:
                        # embed only the word part of a fused "POS_word" token
                        w = w.split('_')[-1]
                    emb[idx] = word_vec[w]
                emb = torch.from_numpy(emb)
                self.pretrained_emb = emb.to(self.device).float()
            # word2pos maps each vocabulary index to its POS-tag index.
            word2pos = np.zeros(shape=(len(word_vocab),))
            # to match each token in vocabulary with its corresponding POS tag.
            for idx, w in word_vocab.idx2word.items():
                if idx == 0:
                    # index 0 (presumably padding) stays mapped to 0
                    continue
                if idx == 1:
                    # index 1 is unk (see `unk = 1` above); maps to 1
                    word2pos[1] = 1
                    continue
                if "_" in w:
                    pos_tag_name = w.split("_")[0]
                    word2pos[idx] = pos_vocab.word2idx[pos_tag_name]
                else:
                    # bare POS tag entries added via add_word_lst
                    word2pos[idx] = pos_vocab.word2idx[w]
            self.word2pos = torch.from_numpy(word2pos).long().to(self.device)
        # if not combine pos/word as a single token.
        else:
            # choose the create the vocabulary with fix size or based on the word frequency.
            if data.vocab_type == 'max_size':
                word_vocab = Vocabulary(max_size=data.vocab_size)
            else:
                word_vocab = Vocabulary(min_freq=data.min_freq)
            word_vocab.from_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(train_dataset, field_name=word)
            word_vocab.index_dataset(val_dataset, field_name=word)
            word_vocab.index_dataset(test_dataset, field_name=word)
        train_dataset.set_input(pos, word, seq_len)
        val_dataset.set_input(pos, word, seq_len)
        test_dataset.set_input(pos, word, seq_len)
        test_dataset.set_target(head)
        pos_vocab.index_dataset(train_dataset, field_name=pos)
        pos_vocab.index_dataset(val_dataset, field_name=pos)
        pos_vocab.index_dataset(test_dataset, field_name=pos)
        train_dataset_init = None
        '''
        Use external unsupervised parser's parse result as "psudo-gold-tree" to initialize our model.
        '''
        if self.hparams.train.initializer == 'external':
            # dependent on your file format.
            conll_loader = ConllLoader([word, pos, head], indexes=[1, 4, 6])
            train_dataset_init = conll_loader._load(data.external_parser)
            train_dataset_init.add_seq_len(field_name=word, new_field_name=seq_len)
            train_dataset_init.apply_field(clean_word, word, new_field_name=word)
            train_dataset_init.apply_field(numerize, head, new_field_name=head)
            # The init data must go through the same token pipeline as train.
            if not data.wordposastoken:
                word_vocab.index_dataset(train_dataset_init, field_name=word)
            else:
                train_dataset_init.apply(combine, new_field_name=word)
                word_vocab.index_dataset(train_dataset_init, field_name=word)
                train_dataset_init.apply(replace, new_field_name=word)
            pos_vocab.index_dataset(train_dataset_init, field_name=pos)
            if self.hparams.joint_training:
                import copy
                train_dataset_init_for_model2 = copy.deepcopy(train_dataset_init)
            # first-order model
            if (self.hparams.model.model_name == 'NeuralDMV') or (self.hparams.model.model_name == 'LexicalizedNDMV'):
                rule_generator = RuleGenerator1o()
            # second-order model
            elif self.hparams.model.model_name == 'SiblingNDMV':
                rule_generator = RuleGeneratorSib()
            elif self.hparams.model.model_name == 'JointFirstSecond':
                rule_generator = RuleGenerator1o()
                rule_generator_for_model2 = RuleGeneratorSib()
            else:
                raise NameError
            self.setup_init_dataset(train_dataset_init, rule_generator)
            if self.hparams.joint_training:
                self.setup_init_dataset(train_dataset_init_for_model2, rule_generator_for_model2)
        elif self.hparams.train.initializer == 'km':
            # k-means style initialization operates on the train data itself.
            train_dataset_init = train_dataset
        self.pos_vocab = pos_vocab
        self.word_vocab = word_vocab
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.train_dataset_init = train_dataset_init
        if self.hparams.joint_training:
            self.train_dataset_init_for_model2 = train_dataset_init_for_model2
    else:
        raise NotImplementedError
def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab=True):
    """Load, pad and index the summarization datasets.

    :param paths: dict  path for each dataset
    :param vocab_size: int  max_size for vocab
    :param vocab_path: str  vocab path
    :param sent_max_len: int  max token number of the sentence
    :param doc_max_timesteps: int  max sentence number of the document
    :param domain: bool  build vocab for publication, use 'X' for unknown
    :param tag: bool  build vocab for tag, use 'X' for unknown
    :param load_vocab: bool  build vocab (False) or load vocab (True)
    :return: DataInfo
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """
    def _pad_sent(text_wd):
        # Pad/truncate every sentence to exactly sent_max_len tokens.
        pad_text_wd = []
        for sent_wd in text_wd:
            if len(sent_wd) < sent_max_len:
                pad_num = sent_max_len - len(sent_wd)
                sent_wd.extend([WORD_PAD] * pad_num)
            else:
                sent_wd = sent_wd[:sent_max_len]
            pad_text_wd.append(sent_wd)
        return pad_text_wd

    def _token_mask(text_wd):
        # 1 for a real token, 0 for padding, per sentence.
        token_mask_list = []
        for sent_wd in text_wd:
            token_num = len(sent_wd)
            if token_num < sent_max_len:
                mask = [1] * token_num + [0] * (sent_max_len - token_num)
            else:
                mask = [1] * sent_max_len
            token_mask_list.append(mask)
        return token_mask_list

    def _pad_label(label):
        # Pad/truncate the per-sentence label sequence to doc_max_timesteps.
        text_len = len(label)
        if text_len < doc_max_timesteps:
            pad_label = label + [0] * (doc_max_timesteps - text_len)
        else:
            pad_label = label[:doc_max_timesteps]
        return pad_label

    def _pad_doc(text_wd):
        # Pad/truncate the document to doc_max_timesteps sentences.
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            padding = [WORD_PAD] * sent_max_len
            pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
        else:
            pad_text = text_wd[:doc_max_timesteps]
        return pad_text

    def _sent_mask(text_wd):
        # 1 for a real sentence, 0 for a padded one.
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len)
        else:
            sent_mask = [1] * doc_max_timesteps
        return sent_mask

    datasets = {}
    train_ds = None
    for key, value in paths.items():
        ds = self.load(value)
        # pad sentence
        ds.apply(lambda x: _pad_sent(x["text_wd"]), new_field_name="pad_text_wd")
        ds.apply(lambda x: _token_mask(x["text_wd"]), new_field_name="pad_token_mask")
        # pad document
        ds.apply(lambda x: _pad_doc(x["pad_text_wd"]), new_field_name="pad_text")
        ds.apply(lambda x: _sent_mask(x["pad_text_wd"]), new_field_name="seq_len")
        ds.apply(lambda x: _pad_label(x["flatten_label"]), new_field_name="pad_label")
        # rename field
        ds.rename_field("pad_text", Const.INPUT)
        ds.rename_field("seq_len", Const.INPUT_LEN)
        ds.rename_field("pad_label", Const.TARGET)
        # set input and target
        ds.set_input(Const.INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET, Const.INPUT_LEN)
        datasets[key] = ds
        if "train" in key:
            train_ds = datasets[key]

    vocab_dict = {}
    # Fix: use `not load_vocab` / `is None` instead of the non-idiomatic
    # `load_vocab == False` / `train_ds == None` comparisons (PEP 8).
    if not load_vocab:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
        vocab_dict["vocab"] = vocabs
    else:
        logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
        word_list = []
        with open(vocab_path, 'r', encoding='utf8') as vocab_f:
            cnt = 2  # pad and unk
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.add_word_lst(word_list)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    if domain:
        domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
        domaindict.from_dataset(train_ds, field_name="publication")
        vocab_dict["domain"] = domaindict
    if tag:
        tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
        tagdict.from_dataset(train_ds, field_name="tag")
        vocab_dict["tag"] = tagdict

    for ds in datasets.values():
        vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT)
    return DataInfo(vocabs=vocab_dict, datasets=datasets)
def process(
        self,
        paths: Union[str, Dict[str, str]],
        dataset_name: str = None,
        to_lower=False,
        seq_len_type: str = None,
        bert_tokenizer: str = None,
        cut_text: int = None,
        get_index=True,
        auto_pad_length: int = None,
        auto_pad_token: str = '<pad>',
        set_input: Union[list, str, bool] = True,
        set_target: Union[list, str, bool] = True,
        concat: Union[str, list, bool] = None,
) -> DataBundle:
    """Load a sentence-pair (matching) dataset and run the preprocessing
    pipeline: optional lower-casing, optional BERT tokenization, optional
    sentence concatenation, sequence-length fields, vocabulary building,
    indexing and optional fixed-length padding.

    :param paths: str or Dict[str, str]. If str, either the folder that
        contains the datasets (file names are then looked up in ``self.paths``)
        or the full path of a single data file. If Dict, it maps dataset
        names (e.g. train, dev, test) to full file paths.
    :param str dataset_name: when ``paths`` is the full path of a single
        file, this names the resulting dataset; defaults to train.
    :param bool to_lower: whether to lower-case the text. Default: False.
    :param str seq_len_type: which length representation to add. ``seq_len``:
        a single number per sentence; ``mask``: a 0/1 mask as long as the
        sentence; ``bert``: segment ids (0 for the first sentence, 1 for the
        second) plus a 0/1 attention mask. Default: None (no length field).
    :param str bert_tokenizer: directory holding the vocabulary used by the
        BERT tokenizer, or the name of a known pretrained model.
    :param int cut_text: truncate content longer than this. Default: None
        (no truncation).
    :param bool get_index: whether to map tokens to indices via the vocab.
    :param int auto_pad_length: pad every text to this length (longer texts
        are truncated). Default: no automatic padding.
    :param str auto_pad_token: the token used for automatic padding.
    :param set_input: if True, every field whose name contains
        ``Const.INPUT`` becomes an input field; if False, none does. A str
        or List[str] selects exactly those fields as input, and no others.
        Default: True.
    :param set_target: controls which fields may become target fields; same
        usage as ``set_input``. Default: True.
    :param concat: whether to join the two sentences. False: keep them
        separate. True: insert a ``<sep>`` between them. A list of four
        symbols gives the markers placed before/after the first sentence and
        before/after the second. The string ``bert`` uses BERT-style
        concatenation, equivalent to ['[CLS]', '[SEP]', '', '[SEP]'].
    :return:
    """
    if isinstance(set_input, str):
        set_input = [set_input]
    if isinstance(set_target, str):
        set_target = [set_target]
    # A bare bool means "automatic": apply to every matching field.
    if isinstance(set_input, bool):
        auto_set_input = set_input
    else:
        auto_set_input = False
    if isinstance(set_target, bool):
        auto_set_target = set_target
    else:
        auto_set_target = False
    # Normalize `paths` into a {dataset_name: file_path} dict.
    if isinstance(paths, str):
        if os.path.isdir(paths):
            path = {
                n: os.path.join(paths, self.paths[n])
                for n in self.paths.keys()
            }
        else:
            path = {
                dataset_name if dataset_name is not None else 'train': paths
            }
    else:
        path = paths

    data_info = DataBundle()
    for data_name in path.keys():
        data_info.datasets[data_name] = self._load(path[data_name])

    for data_name, data_set in data_info.datasets.items():
        if auto_set_input:
            data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
        if auto_set_target:
            if Const.TARGET in data_set.get_field_names():
                data_set.set_target(Const.TARGET)

    if to_lower:
        for data_name, data_set in data_info.datasets.items():
            data_set.apply(
                lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                new_field_name=Const.INPUTS(0),
                is_input=auto_set_input)
            data_set.apply(
                lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                new_field_name=Const.INPUTS(1),
                is_input=auto_set_input)

    if bert_tokenizer is not None:
        # Resolve the tokenizer: known pretrained model name (downloaded and
        # cached) or a local directory.
        if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
            # check whether the directory exists
        elif os.path.isdir(bert_tokenizer):
            model_dir = bert_tokenizer
        else:
            raise ValueError(
                f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

        # Build the word vocabulary directly from BERT's vocab.txt so that
        # indices line up with the pretrained model.
        words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
        with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        words_vocab.add_word_lst(lines)
        words_vocab.build_vocab()

        tokenizer = BertTokenizer.from_pretrained(model_dir)

        # Re-tokenize every input field with the BERT word-piece tokenizer.
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(
                        lambda x: tokenizer.tokenize(' '.join(x[fields])),
                        new_field_name=fields,
                        is_input=auto_set_input)

    if isinstance(concat, bool):
        concat = 'default' if concat else None
    if concat is not None:
        if isinstance(concat, str):
            CONCAT_MAP = {
                'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                'default': ['', '<sep>', '', '']
            }
            if concat.lower() in CONCAT_MAP:
                concat = CONCAT_MAP[concat]
            else:
                # A single custom symbol is used at all four positions.
                concat = 4 * [concat]
        assert len(concat) == 4, \
            f'Please choose a list with 4 symbols which at the beginning of first sentence ' \
            f'the end of first sentence, the begin of second sentence, and the end of second' \
            f'sentence. Your input is {concat}'

        for data_name, data_set in data_info.datasets.items():
            data_set.apply(
                lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[
                    1]] + [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                new_field_name=Const.INPUT)
            # Drop empty-string markers left over from the concat list.
            data_set.apply(
                lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                new_field_name=Const.INPUT,
                is_input=auto_set_input)

    if seq_len_type is not None:
        if seq_len_type == 'seq_len':  #
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: len(x[fields]),
                                       new_field_name=fields.replace(
                                           Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'mask':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: [1] * len(x[fields]),
                                       new_field_name=fields.replace(
                                           Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'bert':
            for data_name, data_set in data_info.datasets.items():
                if Const.INPUT not in data_set.get_field_names():
                    raise KeyError(
                        f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                        f'got {data_set.get_field_names()}')
                # Segment ids: [CLS] sent0 [SEP] -> 0s, sent1 [SEP] -> 1s.
                data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) +
                               [1] * (len(x[Const.INPUTS(1)]) + 1),
                               new_field_name=Const.INPUT_LENS(0),
                               is_input=auto_set_input)
                data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                               new_field_name=Const.INPUT_LENS(1),
                               is_input=auto_set_input)

    if auto_pad_length is not None:
        # Auto-padding bounds any requested truncation length.
        cut_text = min(
            auto_pad_length,
            cut_text if cut_text is not None else auto_pad_length)
    if cut_text is not None:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and
                                               (seq_len_type != 'seq_len')):
                    data_set.apply(lambda x: x[fields][:cut_text],
                                   new_field_name=fields,
                                   is_input=auto_set_input)

    data_set_list = [d for n, d in data_info.datasets.items()]
    assert len(data_set_list) > 0, f'There are NO data sets in data info!'

    if bert_tokenizer is None:
        # Build the word vocabulary from the train split(s); non-train
        # splits only contribute no-create entries.
        words_vocab = Vocabulary(padding=auto_pad_token)
        words_vocab = words_vocab.from_dataset(
            *[d for n, d in data_info.datasets.items() if 'train' in n],
            field_name=[
                n for n in data_set_list[0].get_field_names()
                if (Const.INPUT in n)
            ],
            no_create_entry_dataset=[
                d for n, d in data_info.datasets.items() if 'train' not in n
            ])
    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab = target_vocab.from_dataset(
        *[d for n, d in data_info.datasets.items() if 'train' in n],
        field_name=Const.TARGET)
    data_info.vocabs = {
        Const.INPUT: words_vocab,
        Const.TARGET: target_vocab
    }

    if get_index:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(
                        lambda x: [words_vocab.to_index(w) for w in x[fields]],
                        new_field_name=fields,
                        is_input=auto_set_input)

            if Const.TARGET in data_set.get_field_names():
                data_set.apply(
                    lambda x: target_vocab.to_index(x[Const.TARGET]),
                    new_field_name=Const.TARGET,
                    is_input=auto_set_input,
                    is_target=auto_set_target)

    if auto_pad_length is not None:
        if seq_len_type == 'seq_len':
            raise RuntimeError(
                f'the sequence will be padded with the length {auto_pad_length}, '
                f'so the seq_len_type cannot be `{seq_len_type}`!')
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(
                        lambda x: x[fields] + [words_vocab.to_index(
                            words_vocab.padding)] *
                        (auto_pad_length - len(x[fields])),
                        new_field_name=fields,
                        is_input=auto_set_input)
                elif (Const.INPUT_LEN in fields) and (seq_len_type !=
                                                      'seq_len'):
                    data_set.apply(lambda x: x[fields] + [0] *
                                   (auto_pad_length - len(x[fields])),
                                   new_field_name=fields,
                                   is_input=auto_set_input)

    # Explicit field lists override the automatic input/target selection.
    for data_name, data_set in data_info.datasets.items():
        if isinstance(set_input, list):
            data_set.set_input(*[
                inputs for inputs in set_input
                if inputs in data_set.get_field_names()
            ])
        if isinstance(set_target, list):
            data_set.set_target(*[
                target for target in set_target
                if target in data_set.get_field_names()
            ])
    return data_info