def process(self, paths,
            train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            src_embed_op: EmbeddingOption = None):
    input_name, target_name = 'words', 'target'
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))
    _train_ds = [info.datasets[name] for name in train_ds] \
        if train_ds else info.datasets.values()

    src_vocab.from_dataset(*_train_ds, field_name=input_name)
    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    src_vocab.index_dataset(*info.datasets.values(),
                            field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(*info.datasets.values(),
                            field_name=target_name, new_field_name=target_name)
    info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

    if src_embed_op is not None:
        src_embed_op.vocab = src_vocab
        init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
        info.embeddings[input_name] = init_emb

    for name, dataset in info.datasets.items():
        dataset.set_input(input_name)
        dataset.set_target(target_name)

    return info
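# --- Usage sketch (not part of the loader above) ---
# A minimal, runnable illustration of the fastNLP pattern used in `process`:
# build a Vocabulary from the training split only, then index every split.
# The toy DataSets below are assumptions made for demonstration only.
from fastNLP import DataSet, Vocabulary

_train = DataSet({'words': [['a', 'b', 'a'], ['b', 'c']], 'target': ['pos', 'neg']})
_test = DataSet({'words': [['c', 'a']], 'target': ['pos']})

_vocab = Vocabulary()
_vocab.from_dataset(_train, field_name='words')           # vocab is built on train only
_vocab.index_dataset(_train, _test, field_name='words')   # every split is mapped to ids
print(_train[0]['words'])  # e.g. [2, 3, 2]; exact ids depend on word frequencies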
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    datasets = {}
    info = DataBundle()
    paths = check_dataloader_paths(paths)
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        # split each word into lower-cased characters, with an empty string
        # as separator between words (the trailing separator is popped)
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    # hold out 10% of train as the dev set
    datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
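# --- Usage sketch (not part of the loader above) ---
# What the nested `wordtochar` helper produces when `char_level_op=True`.
# The standalone copy and the input list are illustrative assumptions.
def _wordtochar(words):
    chars = []
    for word in words:
        word = word.lower()
        for char in word:
            chars.append(char)
        chars.append('')
    chars.pop()
    return chars

print(_wordtochar(['Good', 'film']))
# ['g', 'o', 'o', 'd', '', 'f', 'i', 'l', 'm']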
def process(self, paths):
    def get_seq_len(instance):
        return len(instance['article'])

    print('Start loading datasets !!!')
    start = time()

    # load datasets
    datasets = {}
    for name in paths:
        datasets[name] = self._load(paths[name])
        datasets[name].apply(get_seq_len, new_field_name='seq_len')

        # set input and target
        datasets[name].set_input('article', 'segment_id', 'cls_id')
        datasets[name].set_target(Const.TARGET)

        # set padding value
        datasets[name].set_pad_val('article', 0)
        datasets[name].set_pad_val('segment_id', 0)
        datasets[name].set_pad_val('cls_id', -1)
        datasets[name].set_pad_val(Const.TARGET, 0)

    print('Finished in {}'.format(timedelta(seconds=time() - start)))

    return DataBundle(datasets=datasets)
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def process(self, paths: Union[str, Dict[str, str]],
            train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            embed_opt: EmbeddingOption = None,
            char_level_op=False,
            split_dev_op=True):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle(datasets=self.load(paths))
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
    _train_ds = [info.datasets[name] for name in train_ds] \
        if train_ds else info.datasets.values()

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}
    # split the words into characters instead of building a word vocabulary
    if char_level_op:
        for dataset in info.datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
    else:
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name, new_field_name=input_name)
        info.vocabs[input_name] = src_vocab

    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    tgt_vocab.index_dataset(*info.datasets.values(),
                            field_name=target_name, new_field_name=target_name)
    info.vocabs[target_name] = tgt_vocab

    if split_dev_op:
        info.datasets['train'], info.datasets['dev'] = \
            info.datasets['train'].split(0.1, shuffle=False)

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def print_data_bundle(data_bundle: DataBundle, title: str = None):
    """
    Print a summary of a DataBundle.

    @params:
        data_bundle - the DataBundle to summarise.
        title       - an optional title logged before the summary.
    """
    if title:
        logger.warning(title)

    for name, dataset in data_bundle.iter_datasets():
        logger.info('dataset name : {}'.format(name))
        logger.info('dataset len : {}'.format(len(dataset)))
        logger.info('dataset example : ')
        logger.info('\n{}'.format(dataset[:5]))
        logger.info('input/target flags of each field : ')
        logger.info('\n{}'.format(dataset.print_field_meta()))
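# --- Usage sketch (not part of the helper above) ---
# Calling `print_data_bundle` on a toy bundle; the DataSet contents below are
# assumptions, and the output goes through the module-level `logger`.
from fastNLP import DataSet
from fastNLP.io import DataBundle

_ds = DataSet({'words': [['a', 'b'], ['c']], 'target': ['pos', 'neg']})
_bundle = DataBundle(datasets={'train': _ds})
print_data_bundle(_bundle, title='toy bundle')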
def load(self, paths):
    def get_seq_len(instance):
        return len(instance['text_id'])

    def sample(instance, candidate_num):
        candidate_id = instance['candidate_id'][:candidate_num]
        return candidate_id

    def truncate_candidate_id(instance, max_len):
        candidate_id = []
        for i in range(len(instance['candidate_id'])):
            if len(instance['candidate_id'][i]) > max_len:
                cur_id = instance['candidate_id'][i][:(max_len - 1)]
                cur_id += self.sep_id
            else:
                cur_id = instance['candidate_id'][i]
            candidate_id.append(cur_id)
        return candidate_id

    print('Start loading datasets !!!')
    start = time()

    # load datasets
    datasets = {}
    for name in paths:
        datasets[name] = self._load(paths[name])
        if name == 'train':
            datasets[name].apply(
                lambda ins: truncate_candidate_id(ins, self.max_len),
                new_field_name='candidate_id')

        # set input and target
        datasets[name].set_input('text_id', 'candidate_id', 'summary_id')

        # set padding value
        if self.encoder == 'bert':
            pad_id = 0
        else:
            pad_id = 1  # for RoBERTa
        datasets[name].set_pad_val('text_id', pad_id)
        datasets[name].set_pad_val('candidate_id', pad_id)
        datasets[name].set_pad_val('summary_id', pad_id)

    print('Finished in {}'.format(timedelta(seconds=time() - start)))

    return DataBundle(datasets=datasets)
def get_data_bundle_tags(data_bundle: DataBundle):
    """
    Collect the target tags from a DataBundle.

    @params:
        data_bundle - the DataBundle to read the tags from.
    @return:
        On success - a sorted list of the target tags.
    """
    try:
        dataset = data_bundle.get_dataset('train')
        target_names = dataset.get_field(Const.TARGET).content
    except Exception:
        traceback.print_exc()
        logger.error('the train dataset is missing')
        raise Exception('the train dataset is missing')

    target_names = list(set(target_names))
    target_names.sort()
    return target_names
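# --- Usage sketch (not part of the helper above) ---
# `get_data_bundle_tags` reads the 'train' DataSet and returns the sorted,
# de-duplicated target labels. The toy bundle below is an assumption.
from fastNLP import DataSet, Const
from fastNLP.io import DataBundle

_train = DataSet({Const.TARGET: ['sports', 'finance', 'sports']})
_bundle = DataBundle(datasets={'train': _train})
print(get_data_bundle_tags(_bundle))  # ['finance', 'sports']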
def process_from_file(self, paths):
    """
    :param paths:
    :return: the returned DataSets contain the following fields
        chars:
        bigrams:
        trigrams:
        pre_chars:
        pre_bigrams:
        pre_trigrams:
        seg_targets:
        seg_masks:
        seq_lens:
        char_labels:
        char_heads:
        gold_word_pairs:
        pun_masks:
        gold_label_word_pairs:
    """
    paths = check_loader_paths(paths)
    data = DataBundle()

    for name, path in paths.items():
        dataset = self.load(path)
        data.datasets[name] = dataset

    char_labels_vocab = Vocabulary(padding=None, unknown=None)

    def process(dataset, char_label_vocab):
        dataset.apply(add_word_lst, new_field_name='word_lst')
        dataset.apply(lambda x: list(chain(*x['word_lst'])), new_field_name='chars')
        dataset.apply(add_bigram, field_name='chars', new_field_name='bigrams')
        dataset.apply(add_trigram, field_name='chars', new_field_name='trigrams')
        dataset.apply(add_char_heads, new_field_name='char_heads')
        dataset.apply(add_char_labels, new_field_name='char_labels')
        dataset.apply(add_segs, new_field_name='seg_targets')
        dataset.apply(add_mask, new_field_name='seg_masks')
        dataset.add_seq_len('chars', new_field_name='seq_lens')
        dataset.apply(add_pun_masks, new_field_name='pun_masks')

        if len(char_label_vocab.word_count) == 0:
            char_label_vocab.from_dataset(dataset, field_name='char_labels')
        char_label_vocab.index_dataset(dataset, field_name='char_labels')

        new_dataset = add_root(dataset)
        new_dataset.apply(add_word_pairs, new_field_name='gold_word_pairs', ignore_type=True)
        # rebind the module-level helper so that it carries the label vocabulary
        global add_label_word_pairs
        add_label_word_pairs = partial(add_label_word_pairs, label_vocab=char_label_vocab)
        new_dataset.apply(add_label_word_pairs,
                          new_field_name='gold_label_word_pairs', ignore_type=True)

        new_dataset.set_pad_val('char_labels', -1)
        new_dataset.set_pad_val('char_heads', -1)

        return new_dataset

    for name in list(paths.keys()):
        dataset = data.datasets[name]
        dataset = process(dataset, char_labels_vocab)
        data.datasets[name] = dataset

    data.vocabs['char_labels'] = char_labels_vocab

    char_vocab = Vocabulary(min_freq=2).from_dataset(
        data.datasets['train'], field_name='chars',
        no_create_entry_dataset=[data.get_dataset('dev'), data.get_dataset('test')])
    bigram_vocab = Vocabulary(min_freq=3).from_dataset(
        data.datasets['train'], field_name='bigrams',
        no_create_entry_dataset=[data.get_dataset('dev'), data.get_dataset('test')])
    trigram_vocab = Vocabulary(min_freq=5).from_dataset(
        data.datasets['train'], field_name='trigrams',
        no_create_entry_dataset=[data.get_dataset('dev'), data.get_dataset('test')])

    for name in ['chars', 'bigrams', 'trigrams']:
        vocab = Vocabulary().from_dataset(
            field_name=name,
            no_create_entry_dataset=list(data.datasets.values()))
        vocab.index_dataset(*data.datasets.values(),
                            field_name=name, new_field_name='pre_' + name)
        data.vocabs['pre_{}'.format(name)] = vocab

    for name, vocab in zip(['chars', 'bigrams', 'trigrams'],
                           [char_vocab, bigram_vocab, trigram_vocab]):
        vocab.index_dataset(*data.datasets.values(), field_name=name, new_field_name=name)
        data.vocabs[name] = vocab

    for name, dataset in data.datasets.items():
        dataset.set_input('chars', 'bigrams', 'trigrams', 'seq_lens',
                          'char_labels', 'char_heads',
                          'pre_chars', 'pre_bigrams', 'pre_trigrams')
        dataset.set_target('gold_word_pairs', 'seq_lens', 'seg_targets', 'seg_masks',
                           'char_labels', 'char_heads', 'pun_masks',
                           'gold_label_word_pairs')

    return data
def process(self, data_bundle: DataBundle):
    """
    The DataSets to be processed should contain the following fields

    .. csv-table::
        :header: "raw_words", "target"

        "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
        "...", "..."

    :param data_bundle:
    :return:
    """
    # set tags according to the granularity
    # (changed from a fixed tag map to one derived from the datasets)
    targets_vocabs = get_data_bundle_tags(data_bundle)
    self.tag_map = {tag_name: tag_name for tag_name in targets_vocabs}
    data_bundle = self._granularize(data_bundle=data_bundle, tag_map=self.tag_map)

    # clean, lower

    # CWS (tokenize)
    data_bundle = self._tokenize(data_bundle=data_bundle,
                                 field_name='raw_chars',
                                 new_field_name='chars')

    input_field_names = [Const.CHAR_INPUT]

    # n-grams
    if self.bigrams:
        for name, dataset in data_bundle.iter_datasets():
            dataset.apply_field(
                lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                field_name=Const.CHAR_INPUT, new_field_name='bigrams')
        input_field_names.append('bigrams')
    if self.trigrams:
        for name, dataset in data_bundle.iter_datasets():
            dataset.apply_field(
                lambda chars: [c1 + c2 + c3 for c1, c2, c3 in
                               zip(chars, chars[1:] + ['<eos>'], chars[2:] + ['<eos>'] * 2)],
                field_name=Const.CHAR_INPUT, new_field_name='trigrams')
        input_field_names.append('trigrams')

    # index
    data_bundle = _indexize(data_bundle=data_bundle, input_field_names=Const.CHAR_INPUT)

    # add length
    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(field_name=Const.CHAR_INPUT, new_field_name=Const.INPUT_LEN)

    # field names included in input_fields
    # input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    input_fields = [Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET]

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle
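# --- Usage sketch (not part of the pipe above) ---
# The bigram/trigram lambdas pair each character with its right neighbour(s),
# padding the tail with '<eos>'. The character list is a made-up example.
chars = ['今', '天', '下', '雨']
bigrams = [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]
trigrams = [c1 + c2 + c3 for c1, c2, c3 in
            zip(chars, chars[1:] + ['<eos>'], chars[2:] + ['<eos>'] * 2)]
print(bigrams)   # ['今天', '天下', '下雨', '雨<eos>']
print(trigrams)  # ['今天下', '天下雨', '下雨<eos>', '雨<eos><eos>']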
def process(self, paths):
    def truncate_articles(instance, max_nsents=self.max_nsents, max_ntokens=self.max_ntokens):
        article = [' '.join(sent.lower().split()[:max_ntokens])
                   for sent in instance['article']]
        return article[:max_nsents]

    def truncate_labels(instance):
        label = list(filter(lambda x: x < len(instance['article']), instance['label']))
        return label

    def bert_tokenize(instance, tokenizer, max_len, pad_value):
        article = instance['article']
        article = ' [SEP] [CLS] '.join(article)
        word_pieces = tokenizer.tokenize(article)[:(max_len - 2)]
        word_pieces = ['[CLS]'] + word_pieces + ['[SEP]']
        token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
        while len(token_ids) < max_len:
            token_ids.append(pad_value)
        assert len(token_ids) == max_len
        return token_ids

    def get_seg_id(instance, max_len, sep_id):
        _segs = [-1] + [i for i, idx in enumerate(instance['article']) if idx == sep_id]
        segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
        segment_id = []
        for i, length in enumerate(segs):
            if i % 2 == 0:
                segment_id += length * [0]
            else:
                segment_id += length * [1]
        while len(segment_id) < max_len:
            segment_id.append(0)
        return segment_id

    def get_cls_id(instance, cls_id):
        classification_id = [i for i, idx in enumerate(instance['article']) if idx == cls_id]
        return classification_id

    def get_labels(instance):
        labels = [0] * len(instance['cls_id'])
        label_idx = list(filter(lambda x: x < len(instance['cls_id']), instance['label']))
        for idx in label_idx:
            labels[idx] = 1
        return labels

    datasets = {}
    for name in paths:
        datasets[name] = self._load(paths[name])

        # remove empty samples
        datasets[name].drop(lambda ins: len(ins['article']) == 0 or len(ins['label']) == 0)

        # truncate articles
        datasets[name].apply(
            lambda ins: truncate_articles(ins, self.max_nsents, self.max_ntokens),
            new_field_name='article')

        # truncate labels
        datasets[name].apply(truncate_labels, new_field_name='label')

        # tokenize and convert tokens to ids
        datasets[name].apply(
            lambda ins: bert_tokenize(ins, self.tokenizer, self.max_len, self.pad_id),
            new_field_name='article')

        # get segment ids
        datasets[name].apply(lambda ins: get_seg_id(ins, self.max_len, self.sep_id),
                             new_field_name='segment_id')

        # get classification ids
        datasets[name].apply(lambda ins: get_cls_id(ins, self.cls_id),
                             new_field_name='cls_id')

        # get labels
        datasets[name].apply(get_labels, new_field_name='label')

        # rename fields
        datasets[name].rename_field('article', Const.INPUTS(0))
        datasets[name].rename_field('segment_id', Const.INPUTS(1))
        datasets[name].rename_field('cls_id', Const.INPUTS(2))
        datasets[name].rename_field('label', Const.TARGET)

        # set input and target
        datasets[name].set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2))
        datasets[name].set_target(Const.TARGET)

        # set padding value (the 'article' field was renamed above)
        datasets[name].set_pad_val(Const.INPUTS(0), 0)

    return DataBundle(datasets=datasets)
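# --- Usage sketch (not part of the loader above) ---
# The interval logic inside `get_seg_id`: token positions between consecutive
# [SEP] ids get alternating 0/1 segment ids, BERT-style. The ids below
# (101 = [CLS], 102 = [SEP]) are standard BERT vocabulary assumptions.
sep_id = 102
article = [101, 11, 12, 102, 101, 13, 102]   # [CLS] w w [SEP] [CLS] w [SEP]
_segs = [-1] + [i for i, idx in enumerate(article) if idx == sep_id]
segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
segment_id = []
for i, length in enumerate(segs):
    segment_id += length * [0] if i % 2 == 0 else length * [1]
print(segment_id)  # [0, 0, 0, 0, 1, 1, 1]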