def process(self, paths, **kwargs):
    data_info = DataBundle()
    # load the three splits
    for name in ['train', 'test', 'dev']:
        data_info.datasets[name] = self.load(paths[name])

    config = Config()
    # build the word vocabulary over all splits
    vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
    vocab.build_vocab()
    word2id = vocab.word2idx
    char_dict = preprocess.get_char_dict(config.char_path)
    data_info.vocabs = vocab

    # genre prefix of doc_key -> genre id
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    for name, ds in data_info.datasets.items():
        # doc2numpy returns (doc_np, char_index, seq_len); each apply extracts one of them
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[0],
                 new_field_name='doc_np')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[1],
                 new_field_name='char_index')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[2],
                 new_field_name='seq_len')
        ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences,
                                                    is_train=name == 'train'),
                 new_field_name='speaker_ids_np')
        ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')

        ds.set_ignore_type('clusters')
        ds.set_padder('clusters', None)
        ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
        ds.set_target("clusters")

    # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
    # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)
    return data_info
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    datasets = {}
    info = DataBundle()
    paths = check_dataloader_paths(paths)
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {
        "words": src_vocab,
        "target": tgt_vocab
    }

    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
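
# The char_level_op path above relies on wordtochar, which lowercases every word,
# splits it into characters and keeps an empty string as the word separator.
# A minimal standalone sketch of that transform (the function name and the sample
# input are hypothetical, used only for illustration):

def _demo_wordtochar(words):
    chars = []
    for word in words:
        word = word.lower()
        chars.extend(list(word))
        chars.append('')  # empty string marks a word boundary
    if chars:
        chars.pop()  # drop the trailing separator
    return chars

assert _demo_wordtochar(['Great', 'food']) == ['g', 'r', 'e', 'a', 't', '', 'f', 'o', 'o', 'd']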
def process(self, paths):
    def get_seq_len(instance):
        return len(instance['article'])

    print('Start loading datasets !!!')
    start = time()

    # load datasets
    datasets = {}
    for name in paths:
        datasets[name] = self._load(paths[name])
        datasets[name].apply(get_seq_len, new_field_name='seq_len')

        # set input and target
        datasets[name].set_input('article', 'segment_id', 'cls_id')
        datasets[name].set_target(Const.TARGET)

        # set padding value
        datasets[name].set_pad_val('article', 0)
        datasets[name].set_pad_val('segment_id', 0)
        datasets[name].set_pad_val('cls_id', -1)
        datasets[name].set_pad_val(Const.TARGET, 0)

    print('Finished in {}'.format(timedelta(seconds=time() - start)))

    return DataBundle(datasets=datasets)
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {
        "words": src_vocab,
        "target": tgt_vocab
    }

    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def process(self, paths, train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            src_embed_op: EmbeddingOption = None):
    input_name, target_name = 'words', 'target'
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()
    src_vocab.from_dataset(*_train_ds, field_name=input_name)
    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    src_vocab.index_dataset(
        *info.datasets.values(),
        field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(
        *info.datasets.values(),
        field_name=target_name, new_field_name=target_name)
    info.vocabs = {
        input_name: src_vocab,
        target_name: tgt_vocab
    }

    if src_embed_op is not None:
        src_embed_op.vocab = src_vocab
        init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
        info.embeddings[input_name] = init_emb

    for name, dataset in info.datasets.items():
        dataset.set_input(input_name)
        dataset.set_target(target_name)

    return info
def process(self, paths: Union[str, Dict[str, str]],
            train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            embed_opt: EmbeddingOption = None,
            char_level_op=False,
            split_dev_op=True):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle(datasets=self.load(paths))
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}
    # split words into characters (char-level features)
    if char_level_op:
        for dataset in info.datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
    else:
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
        info.vocabs[input_name] = src_vocab

    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    tgt_vocab.index_dataset(
        *info.datasets.values(),
        field_name=target_name, new_field_name=target_name)
    info.vocabs[target_name] = tgt_vocab

    if split_dev_op:
        info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False)

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def process(self, paths):
    def truncate_articles(instance, max_nsents=self.max_nsents, max_ntokens=self.max_ntokens):
        article = [' '.join(sent.lower().split()[:max_ntokens]) for sent in instance['article']]
        return article[:max_nsents]

    def truncate_labels(instance):
        label = list(filter(lambda x: x < len(instance['article']), instance['label']))
        return label

    def bert_tokenize(instance, tokenizer, max_len, pad_value):
        article = instance['article']
        article = ' [SEP] [CLS] '.join(article)
        word_pieces = tokenizer.tokenize(article)[:(max_len - 2)]
        word_pieces = ['[CLS]'] + word_pieces + ['[SEP]']
        token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
        while len(token_ids) < max_len:
            token_ids.append(pad_value)
        assert len(token_ids) == max_len
        return token_ids

    def get_seg_id(instance, max_len, sep_id):
        _segs = [-1] + [i for i, idx in enumerate(instance['article']) if idx == sep_id]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segment_id = []
        for i, length in enumerate(segs):
            if i % 2 == 0:
                segment_id += length * [0]
            else:
                segment_id += length * [1]
        while len(segment_id) < max_len:
            segment_id.append(0)
        return segment_id

    def get_cls_id(instance, cls_id):
        classification_id = [i for i, idx in enumerate(instance['article']) if idx == cls_id]
        return classification_id

    def get_labels(instance):
        labels = [0] * len(instance['cls_id'])
        label_idx = list(filter(lambda x: x < len(instance['cls_id']), instance['label']))
        for idx in label_idx:
            labels[idx] = 1
        return labels

    datasets = {}
    for name in paths:
        datasets[name] = self._load(paths[name])

        # remove empty samples
        datasets[name].drop(lambda ins: len(ins['article']) == 0 or len(ins['label']) == 0)

        # truncate articles
        datasets[name].apply(lambda ins: truncate_articles(ins, self.max_nsents, self.max_ntokens),
                             new_field_name='article')

        # truncate labels
        datasets[name].apply(truncate_labels, new_field_name='label')

        # tokenize and convert tokens to id
        datasets[name].apply(lambda ins: bert_tokenize(ins, self.tokenizer, self.max_len, self.pad_id),
                             new_field_name='article')

        # get segment id
        datasets[name].apply(lambda ins: get_seg_id(ins, self.max_len, self.sep_id),
                             new_field_name='segment_id')

        # get classification id
        datasets[name].apply(lambda ins: get_cls_id(ins, self.cls_id), new_field_name='cls_id')

        # get label
        datasets[name].apply(get_labels, new_field_name='label')

        # rename fields
        datasets[name].rename_field('article', Const.INPUTS(0))
        datasets[name].rename_field('segment_id', Const.INPUTS(1))
        datasets[name].rename_field('cls_id', Const.INPUTS(2))
        datasets[name].rename_field('label', Const.TARGET)

        # set input and target
        datasets[name].set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2))
        datasets[name].set_target(Const.TARGET)

        # set padding value (the article field has been renamed to Const.INPUTS(0) above)
        datasets[name].set_pad_val(Const.INPUTS(0), 0)

    return DataBundle(datasets=datasets)
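
# get_seg_id assigns alternating 0/1 segment ids to the [CLS] ... [SEP] blocks
# produced by bert_tokenize, then pads with 0 up to max_len. A standalone sketch
# of that logic; the token ids 101/102 are assumed stand-ins for [CLS]/[SEP] and
# the function name is hypothetical:

def _demo_get_seg_id(token_ids, max_len, sep_id):
    _segs = [-1] + [i for i, idx in enumerate(token_ids) if idx == sep_id]
    segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
    segment_id = []
    for i, length in enumerate(segs):
        segment_id += length * [i % 2]  # even-numbered blocks -> 0, odd-numbered -> 1
    return segment_id + [0] * (max_len - len(segment_id))

# two sentences: [CLS] a b [SEP] [CLS] c [SEP], padded to length 10
assert _demo_get_seg_id([101, 7, 8, 102, 101, 9, 102], 10, sep_id=102) == \
    [0, 0, 0, 0, 1, 1, 1, 0, 0, 0]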
def process(self, paths: Union[str, Dict[str, str]],
            word_vocab_opt: VocabularyOption = None,
            lower: bool = False):
    """
    Load and process the data. Lines starting with '-DOCSTART-' are ignored.

    :param paths:
    :param word_vocab_opt: initialization options for the vocabulary
    :param lower: whether to lowercase all letters
    :return:
    """
    # load the data
    paths = check_dataloader_paths(paths)
    data = DataBundle()
    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]
    for name, path in paths.items():
        dataset = self.load(path)
        dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
        if lower:
            dataset.words.lower()
        data.datasets[name] = dataset

    # construct the word vocab
    word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
    word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
                            no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                     if name != 'train'])
    word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
    data.vocabs[Const.INPUT] = word_vocab

    # cap words: vocabulary over the raw (case-preserving) words
    cap_word_vocab = Vocabulary()
    cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
                                no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                         if name != 'train'])
    cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
    input_fields.append('cap_words')
    data.vocabs['cap_words'] = cap_word_vocab

    # build the target vocab
    target_vocab = Vocabulary(unknown=None, padding=None)
    target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data
def process(self, paths: Union[str, Dict[str, str]],
            word_vocab_opt: VocabularyOption = None,
            lower: bool = True) -> DataBundle:
    """
    Load and process the data. The returned DataBundle contains:

    vocabs:
        word: Vocabulary
        target: Vocabulary
    datasets:
        train: DataSet
            words: List[int], set as input
            target: int. label, set as both input and target
            seq_len: int. sentence length, set as both input and target
            raw_words: List[str]
        xxx (may vary depending on the paths passed in)

    :param paths:
    :param word_vocab_opt: initialization options for the vocabulary
    :param lower: whether to lowercase
    :return:
    """
    paths = check_dataloader_paths(paths)
    data = DataBundle()
    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]
    for name, path in paths.items():
        dataset = self.load(path)
        dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
        if lower:
            dataset.words.lower()
        data.datasets[name] = dataset

    # construct the word vocab
    word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
    word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
                            no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                     if name != 'train'])
    word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
    data.vocabs[Const.INPUT] = word_vocab

    # cap words: vocabulary over the raw (case-preserving) words
    cap_word_vocab = Vocabulary()
    cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
    cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
    input_fields.append('cap_words')
    data.vocabs['cap_words'] = cap_word_vocab

    # build the target vocab
    target_vocab = Vocabulary(unknown=None, padding=None)
    target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data
def process(self, paths): """ :param paths: :return: Dataset包含以下的field chars: bigrams: trigrams: pre_chars: pre_bigrams: pre_trigrams: seg_targets: seg_masks: seq_lens: char_labels: char_heads: gold_word_pairs: seg_targets: seg_masks: char_labels: char_heads: pun_masks: gold_label_word_pairs: """ paths = check_dataloader_paths(paths) data = DataBundle() for name, path in paths.items(): dataset = self.load(path) data.datasets[name] = dataset char_labels_vocab = Vocabulary(padding=None, unknown=None) def process(dataset, char_label_vocab): dataset.apply(add_word_lst, new_field_name='word_lst') dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars') dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams') dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams') dataset.apply(add_char_heads, new_field_name='char_heads') dataset.apply(add_char_labels, new_field_name='char_labels') dataset.apply(add_segs, new_field_name='seg_targets') dataset.apply(add_mask, new_field_name='seg_masks') dataset.add_seq_len('chars', new_field_name='seq_lens') dataset.apply(add_pun_masks, new_field_name='pun_masks') if len(char_label_vocab.word_count)==0: char_label_vocab.from_dataset(dataset, field_name='char_labels') char_label_vocab.index_dataset(dataset, field_name='char_labels') new_dataset = add_root(dataset) new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True) global add_label_word_pairs add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab) new_dataset.apply(add_label_word_pairs, new_field_name='gold_label_word_pairs', ignore_type=True) new_dataset.set_pad_val('char_labels', -1) new_dataset.set_pad_val('char_heads', -1) return new_dataset for name in list(paths.keys()): dataset = data.datasets[name] dataset = process(dataset, char_labels_vocab) data.datasets[name] = dataset data.vocabs['char_labels'] = char_labels_vocab char_vocab = Vocabulary(min_freq=2).from_dataset(data.datasets['train'], field_name='chars') bigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='bigrams') trigram_vocab = Vocabulary(min_freq=5).from_dataset(data.datasets['train'], field_name='trigrams') for name in ['chars', 'bigrams', 'trigrams']: vocab = Vocabulary().from_dataset(field_name=name, no_create_entry_dataset=list(data.datasets.values())) vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name='pre_' + name) data.vocabs['pre_{}'.format(name)] = vocab for name, vocab in zip(['chars', 'bigrams', 'trigrams'], [char_vocab, bigram_vocab, trigram_vocab]): vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name) data.vocabs[name] = vocab for name, dataset in data.datasets.items(): dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens', 'char_labels', 'char_heads', 'pre_chars', 'pre_bigrams', 'pre_trigrams') dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks', 'char_labels', 'char_heads', 'pun_masks', 'gold_label_word_pairs') return data
def process(self, paths: Union[str, Dict[str, str]],
            char_vocab_opt: VocabularyOption = None,
            char_embed_opt: EmbeddingOption = None,
            bigram_vocab_opt: VocabularyOption = None,
            bigram_embed_opt: EmbeddingOption = None,
            L: int = 4):
    """
    The supported data format is one sample per line, with words separated by spaces. Example::

        共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 ( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 )
        女士 们 , 先生 们 , 同志 们 , 朋友 们 :

    paths supports two formats: a str and a Dict[str, str]. Example::

        # 1. str
        # 1.1 a concrete file path
        data = SigHanLoader('bmes').process('/path/to/cws/data.txt')  # reads the content of data.txt
        # the result contains data.vocabs['chars']: a Vocabulary object,
        # data.vocabs['target']: a Vocabulary object (may be absent depending on encoding_type)
        # data.embeddings['chars']: an Embedding object, only present if a pretrained embedding path is given
        # data.datasets['train']: a DataSet object
        #   with the fields:
        #     raw_chars: list[str], each element is one Chinese character
        #     chars: list[int], each element is the index of the character
        #     target: list[int], depends on encoding_type
        # 1.2 a directory, which must contain a train.txt file
        data = SigHanLoader('bmes').process('path/to/cws/')  # tries to read train.txt, test.txt and dev.txt in that directory
        # the result contains data.vocabs['chars']: a Vocabulary object
        # data.vocabs['target']: a Vocabulary object
        # data.embeddings['chars']: an Embedding object, only present if a pretrained embedding path is given
        # data.datasets['train']: a DataSet object
        #   with the fields:
        #     raw_chars: list[str], each element is one Chinese character
        #     chars: list[int], each element is the index of the character
        #     target: list[int], depends on encoding_type
        # data.datasets['dev']: a DataSet object, present if the directory contains dev.txt; same content as data.datasets['train']

        # 2. dict, where the key is the dataset name and the value is the corresponding path. The key 'train' is required.
        paths = {'train': '/path/to/train/train.txt', 'test': '/path/to/test/test.txt', 'dev': '/path/to/dev/dev.txt'}
        data = SigHanLoader(paths).process(paths)
        # same result as passing a directory, but multiple datasets can be passed; the keys of data.datasets match the keys given here

    :param paths: a directory, a file path, or a dict.
    :param char_vocab_opt: options for building the chars vocabulary; defaults to min_freq=2
    :param char_embed_opt: options for loading the chars Embedding; by default no pretrained embedding is loaded
    :param bigram_vocab_opt: options for building the bigram vocabulary; bigrams are not used by default.
        Only when this is given do the datasets carry a 'bigrams' field, a List[int] with the same length as chars;
        the bigrams of "abcde" are "ab bc cd de e<eos>"
    :param bigram_embed_opt: options for loading pretrained bigram embeddings; only effective when bigram_vocab_opt is given
    :param L: the segment length passed in when target_type is shift_relay
    :return:
    """
    # it is recommended to validate paths with check_dataloader_paths
    paths = check_dataloader_paths(paths)
    datasets = {}
    data = DataBundle()
    bigram = bigram_vocab_opt is not None
    for name, path in paths.items():
        dataset = self.load(path, bigram=bigram)
        datasets[name] = dataset
    input_fields = []
    target_fields = []

    # build the char vocab
    char_vocab = Vocabulary(min_freq=2) if char_vocab_opt is None else Vocabulary(**char_vocab_opt)
    char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
    char_vocab.index_dataset(*datasets.values(), field_name='raw_chars', new_field_name='chars')
    data.vocabs[Const.CHAR_INPUT] = char_vocab
    input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
    target_fields.append(Const.TARGET)

    # build the target vocab
    if self.target_type == 'bmes':
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 + ['S'])
        target_vocab.index_dataset(*datasets.values(), field_name='target')
        data.vocabs[Const.TARGET] = target_vocab

    if char_embed_opt is not None:
        char_embed = EmbedLoader.load_with_vocab(**char_embed_opt, vocab=char_vocab)
        data.embeddings['chars'] = char_embed

    if bigram:
        bigram_vocab = Vocabulary(**bigram_vocab_opt)
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
        bigram_vocab.index_dataset(*datasets.values(), field_name='bigrams')
        data.vocabs['bigrams'] = bigram_vocab
        if bigram_embed_opt is not None:
            bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt, vocab=bigram_vocab)
            data.embeddings['bigrams'] = bigram_embed
        input_fields.append('bigrams')

    if self.target_type == 'shift_relay':
        func = partial(self._clip_target, L=L)
        for name, dataset in datasets.items():
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False)
            dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False)
        input_fields.extend(['end_seg_mask'])
        target_fields.append('start_seg_mask')

    # add the datasets to the DataBundle
    for name, dataset in datasets.items():
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)
        data.datasets[name] = dataset

    return data
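
# The 'bmes' target_type above indexes tags against a fixed B/M/E/S vocabulary.
# For reference, BMES segmentation tags are conventionally derived per word as
# below; this is a generic sketch, not the loader's own implementation (which
# lives inside self.load), and the function name is hypothetical:

def _demo_bmes_tags(words):
    tags = []
    for w in words:
        if len(w) == 1:
            tags.append('S')
        else:
            tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E'])
    return tags

assert _demo_bmes_tags(['共同', '创造', '的']) == ['B', 'E', 'B', 'E', 'S']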
def process(self, paths: Union[str, Dict[str, str]], dataset_name: str = None,
            to_lower=False, seq_len_type: str = None, bert_tokenizer: str = None,
            cut_text: int = None, get_index=True, auto_pad_length: int = None,
            auto_pad_token: str = '<pad>', set_input: Union[list, str, bool] = True,
            set_target: Union[list, str, bool] = True,
            concat: Union[str, list, bool] = None,
            ) -> DataBundle:
    """
    :param paths: a str or a Dict[str, str]. If a str, it is either the folder containing the datasets or the
        full path of a single file: for a folder, the dataset names and file names are looked up in self.paths;
        for a Dict, keys are dataset names (e.g. train, dev, test) and values are full file paths.
    :param str dataset_name: if paths is the full path of a single dataset file, dataset_name names that
        dataset; defaults to 'train' if not given.
    :param bool to_lower: whether to lowercase the text automatically. Defaults to False.
    :param str seq_len_type: type of sequence-length information to provide. ``seq_len``: a single number as
        the sentence length; ``mask``: a 0/1 mask matrix as the sentence length; ``bert``: segment_type_id
        (0 for the first sentence, 1 for the second) plus an attention mask (0/1 mask matrix). Defaults to
        None, i.e. no seq_len is provided.
    :param str bert_tokenizer: path of the folder containing the vocabulary used by the bert tokenizer.
    :param int cut_text: truncate content longer than cut_text. Defaults to None, i.e. no truncation.
    :param bool get_index: whether to convert the text to indices using the vocabulary.
    :param int auto_pad_length: pad the text to this fixed length (longer text is truncated); by default no
        automatic padding is done.
    :param str auto_pad_token: the token used for automatic padding.
    :param set_input: if True, fields whose names contain Const.INPUT are automatically set as input; if
        False, no field is set as input. If a str or List[str] is given, exactly the named fields are set as
        input and no others. Defaults to True.
    :param set_target: controls which fields may be set as target; same usage as set_input. Defaults to True.
    :param concat: whether to concatenate the two sentences. If False, no concatenation. If True, a <sep>
        token is inserted between the two sentences. If a list of length 4 is given, its elements are the
        markers inserted before the first sentence, after the first sentence, before the second sentence and
        after the second sentence, respectively. If the string ``bert`` is given, bert-style concatenation is
        used, equivalent to ['[CLS]', '[SEP]', '', '[SEP]'].
    :return:
    """
    if isinstance(set_input, str):
        set_input = [set_input]
    if isinstance(set_target, str):
        set_target = [set_target]
    if isinstance(set_input, bool):
        auto_set_input = set_input
    else:
        auto_set_input = False
    if isinstance(set_target, bool):
        auto_set_target = set_target
    else:
        auto_set_target = False

    if isinstance(paths, str):
        if os.path.isdir(paths):
            path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
        else:
            path = {dataset_name if dataset_name is not None else 'train': paths}
    else:
        path = paths

    data_info = DataBundle()
    for data_name in path.keys():
        data_info.datasets[data_name] = self._load(path[data_name])

    for data_name, data_set in data_info.datasets.items():
        if auto_set_input:
            data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
        if auto_set_target:
            if Const.TARGET in data_set.get_field_names():
                data_set.set_target(Const.TARGET)

    if to_lower:
        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                           new_field_name=Const.INPUTS(0),
                           is_input=auto_set_input)
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                           new_field_name=Const.INPUTS(1),
                           is_input=auto_set_input)

    if bert_tokenizer is not None:
        if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)  # check whether the model exists locally
        elif os.path.isdir(bert_tokenizer):
            model_dir = bert_tokenizer
        else:
            raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

        words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
        with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        words_vocab.add_word_lst(lines)
        words_vocab.build_vocab()

        tokenizer = BertTokenizer.from_pretrained(model_dir)

        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])),
                                   new_field_name=fields,
                                   is_input=auto_set_input)

    if isinstance(concat, bool):
        concat = 'default' if concat else None
    if concat is not None:
        if isinstance(concat, str):
            CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                          'default': ['', '<sep>', '', '']}
            if concat.lower() in CONCAT_MAP:
                concat = CONCAT_MAP[concat]
            else:
                concat = 4 * [concat]
        assert len(concat) == 4, \
            f'Please choose a list with 4 symbols which at the beginning of first sentence ' \
            f'the end of first sentence, the begin of second sentence, and the end of second ' \
            f'sentence. Your input is {concat}'

        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] +
                           [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                           new_field_name=Const.INPUT)
            data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                           new_field_name=Const.INPUT,
                           is_input=auto_set_input)

    if seq_len_type is not None:
        if seq_len_type == 'seq_len':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'mask':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: [1] * len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'bert':
            for data_name, data_set in data_info.datasets.items():
                if Const.INPUT not in data_set.get_field_names():
                    raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                                   f'got {data_set.get_field_names()}')
                data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) +
                               [1] * (len(x[Const.INPUTS(1)]) + 1),
                               new_field_name=Const.INPUT_LENS(0),
                               is_input=auto_set_input)
                data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                               new_field_name=Const.INPUT_LENS(1),
                               is_input=auto_set_input)

    if auto_pad_length is not None:
        cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length)

    if cut_text is not None:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
                    data_set.apply(lambda x: x[fields][:cut_text],
                                   new_field_name=fields,
                                   is_input=auto_set_input)

    data_set_list = [d for n, d in data_info.datasets.items()]
    assert len(data_set_list) > 0, f'There are NO data sets in data info!'

    if bert_tokenizer is None:
        words_vocab = Vocabulary(padding=auto_pad_token)
        words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                               field_name=[n for n in data_set_list[0].get_field_names()
                                                           if (Const.INPUT in n)],
                                               no_create_entry_dataset=[d for n, d in data_info.datasets.items()
                                                                        if 'train' not in n])
    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                             field_name=Const.TARGET)
    data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

    if get_index:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]],
                                   new_field_name=fields,
                                   is_input=auto_set_input)

            if Const.TARGET in data_set.get_field_names():
                data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]),
                               new_field_name=Const.TARGET,
                               is_input=auto_set_input,
                               is_target=auto_set_target)

    if auto_pad_length is not None:
        if seq_len_type == 'seq_len':
            raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, '
                               f'so the seq_len_type cannot be `{seq_len_type}`!')
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] *
                                   (auto_pad_length - len(x[fields])),
                                   new_field_name=fields,
                                   is_input=auto_set_input)
                elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'):
                    data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])),
                                   new_field_name=fields,
                                   is_input=auto_set_input)

    for data_name, data_set in data_info.datasets.items():
        if isinstance(set_input, list):
            data_set.set_input(*[inputs for inputs in set_input
                                 if inputs in data_set.get_field_names()])
        if isinstance(set_target, list):
            data_set.set_target(*[target for target in set_target
                                  if target in data_set.get_field_names()])

    return data_info
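
# With concat='bert' the two word sequences above are merged into a single
# Const.INPUT field as [CLS] sent1 [SEP] sent2 [SEP], and empty markers are
# filtered out afterwards. A standalone sketch with toy sentences (field names
# omitted; the function name is hypothetical):

def _demo_concat(words0, words1, markers=('[CLS]', '[SEP]', '', '[SEP]')):
    merged = [markers[0]] + words0 + [markers[1]] + [markers[2]] + words1 + [markers[3]]
    return [w for w in merged if len(w) > 0]

assert _demo_concat(['a', 'cat'], ['it', 'sleeps']) == \
    ['[CLS]', 'a', 'cat', '[SEP]', 'it', 'sleeps', '[SEP]']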
def process(self, paths, bigrams=False, trigrams=False):
    """
    :param paths:
    :param bool bigrams: whether to generate bigram features, [a, b, c, d] -> [ab, bc, cd, d<eos>]
    :param bool trigrams: whether to generate trigram features, [a, b, c, d] -> [abc, bcd, cd<eos>, d<eos><eos>]
    :return: DataBundle
        containing the following fields:
        raw_chars: List[str]
        chars: List[int]
        seq_len: int, the number of characters
        bigrams: List[int], optional
        trigrams: List[int], optional
        target: List[int]
    """
    paths = check_dataloader_paths(paths)
    data = DataBundle()
    input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, path in paths.items():
        dataset = self.load(path)
        if bigrams:
            dataset.apply_field(lambda raw_chars: [c1 + c2 for c1, c2 in
                                                   zip(raw_chars, raw_chars[1:] + ['<eos>'])],
                                field_name='raw_chars', new_field_name='bigrams')
        if trigrams:
            dataset.apply_field(lambda raw_chars: [c1 + c2 + c3 for c1, c2, c3 in
                                                   zip(raw_chars, raw_chars[1:] + ['<eos>'],
                                                       raw_chars[2:] + ['<eos>'] * 2)],
                                field_name='raw_chars', new_field_name='trigrams')
        data.datasets[name] = dataset

    char_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='raw_chars',
                                           no_create_entry_dataset=[dataset for name, dataset in
                                                                    data.datasets.items() if name != 'train'])
    char_vocab.index_dataset(*data.datasets.values(), field_name='raw_chars',
                             new_field_name=Const.CHAR_INPUT)
    data.vocabs[Const.CHAR_INPUT] = char_vocab

    target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(data.datasets['train'],
                                                                       field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    if bigrams:
        bigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='bigrams',
                                                 no_create_entry_dataset=[dataset for name, dataset in
                                                                          data.datasets.items()
                                                                          if name != 'train'])
        bigram_vocab.index_dataset(*data.datasets.values(), field_name='bigrams', new_field_name='bigrams')
        data.vocabs['bigrams'] = bigram_vocab
        input_fields.append('bigrams')

    if trigrams:
        trigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='trigrams',
                                                  no_create_entry_dataset=[dataset for name, dataset in
                                                                           data.datasets.items()
                                                                           if name != 'train'])
        trigram_vocab.index_dataset(*data.datasets.values(), field_name='trigrams',
                                    new_field_name='trigrams')
        data.vocabs['trigrams'] = trigram_vocab
        input_fields.append('trigrams')

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data
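
# The optional bigram/trigram features built above extend each character with its
# next one or two characters, padding the tail with <eos> markers. A minimal
# standalone sketch of both lambdas (hypothetical function names):

def _demo_char_bigrams(chars, eos='<eos>'):
    return [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [eos])]

def _demo_char_trigrams(chars, eos='<eos>'):
    return [c1 + c2 + c3 for c1, c2, c3 in
            zip(chars, chars[1:] + [eos], chars[2:] + [eos] * 2)]

assert _demo_char_bigrams(['a', 'b', 'c', 'd']) == ['ab', 'bc', 'cd', 'd<eos>']
assert _demo_char_trigrams(['a', 'b', 'c', 'd']) == ['abc', 'bcd', 'cd<eos>', 'd<eos><eos>']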
def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps,
            domain=False, tag=False, load_vocab_file=True):
    """
    :param paths: dict  path for each dataset
    :param vocab_size: int  max_size for vocab
    :param vocab_path: str  vocab path
    :param sent_max_len: int  max token number of the sentence
    :param doc_max_timesteps: int  max sentence number of the document
    :param domain: bool  build vocab for publication, use 'X' for unknown
    :param tag: bool  build vocab for tag, use 'X' for unknown
    :param load_vocab_file: bool  build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """

    def _pad_sent(text_wd):
        pad_text_wd = []
        for sent_wd in text_wd:
            if len(sent_wd) < sent_max_len:
                pad_num = sent_max_len - len(sent_wd)
                sent_wd.extend([WORD_PAD] * pad_num)
            else:
                sent_wd = sent_wd[:sent_max_len]
            pad_text_wd.append(sent_wd)
        return pad_text_wd

    def _token_mask(text_wd):
        token_mask_list = []
        for sent_wd in text_wd:
            token_num = len(sent_wd)
            if token_num < sent_max_len:
                mask = [1] * token_num + [0] * (sent_max_len - token_num)
            else:
                mask = [1] * sent_max_len
            token_mask_list.append(mask)
        return token_mask_list

    def _pad_label(label):
        text_len = len(label)
        if text_len < doc_max_timesteps:
            pad_label = label + [0] * (doc_max_timesteps - text_len)
        else:
            pad_label = label[:doc_max_timesteps]
        return pad_label

    def _pad_doc(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            padding = [WORD_PAD] * sent_max_len
            pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
        else:
            pad_text = text_wd[:doc_max_timesteps]
        return pad_text

    def _sent_mask(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len)
        else:
            sent_mask = [1] * doc_max_timesteps
        return sent_mask

    datasets = {}
    train_ds = None
    for key, value in paths.items():
        ds = self.load(value)
        # pad sentences
        ds.apply(lambda x: _pad_sent(x["text_wd"]), new_field_name="pad_text_wd")
        ds.apply(lambda x: _token_mask(x["text_wd"]), new_field_name="pad_token_mask")
        # pad documents
        ds.apply(lambda x: _pad_doc(x["pad_text_wd"]), new_field_name="pad_text")
        ds.apply(lambda x: _sent_mask(x["pad_text_wd"]), new_field_name="seq_len")
        ds.apply(lambda x: _pad_label(x["flatten_label"]), new_field_name="pad_label")
        # rename fields
        ds.rename_field("pad_text", Const.INPUT)
        ds.rename_field("seq_len", Const.INPUT_LEN)
        ds.rename_field("pad_label", Const.TARGET)
        # set input and target
        ds.set_input(Const.INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET, Const.INPUT_LEN)
        datasets[key] = ds
        if "train" in key:
            train_ds = datasets[key]

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")

        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
        vocab_dict["vocab"] = vocabs
    else:
        logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
        word_list = []
        with open(vocab_path, 'r', encoding='utf8') as vocab_f:
            cnt = 2  # pad and unk
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.add_word_lst(word_list)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    if domain:
        domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
        domaindict.from_dataset(train_ds, field_name="publication")
        vocab_dict["domain"] = domaindict
    if tag:
        tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
        tagdict.from_dataset(train_ds, field_name="tag")
        vocab_dict["tag"] = tagdict

    for ds in datasets.values():
        vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT)

    return DataBundle(vocabs=vocab_dict, datasets=datasets)
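
# _pad_doc and _sent_mask above fix every document to doc_max_timesteps sentences:
# shorter documents are padded with all-padding sentences and get 0 in the mask,
# longer ones are truncated. A standalone sketch with toy values; '[PAD]' stands
# in for the loader's WORD_PAD token and the function names are hypothetical:

def _demo_pad_doc(sents, doc_max_timesteps, sent_max_len, pad='[PAD]'):
    if len(sents) < doc_max_timesteps:
        return sents + [[pad] * sent_max_len] * (doc_max_timesteps - len(sents))
    return sents[:doc_max_timesteps]

def _demo_sent_mask(sents, doc_max_timesteps):
    n = min(len(sents), doc_max_timesteps)
    return [1] * n + [0] * (doc_max_timesteps - n)

doc = [['a', 'b'], ['c', 'd']]
assert _demo_pad_doc(doc, doc_max_timesteps=3, sent_max_len=2) == \
    [['a', 'b'], ['c', 'd'], ['[PAD]', '[PAD]']]
assert _demo_sent_mask(doc, doc_max_timesteps=3) == [1, 1, 0]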