def input_with_span_attr(datasets, vocabs):
    datasets['train'].apply_field(lambda x: list(map(lambda y: y[0], x)),
                                  field_name='target', new_field_name='span_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(lambda x: list(map(lambda y: y[0], x)),
                                    field_name='target', new_field_name='span_label')
    datasets['test'].apply_field(lambda x: list(map(lambda y: y[0], x)),
                                 field_name='target', new_field_name='span_label')

    datasets['train'].apply_field(
        lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
        field_name='target', new_field_name='attr_start_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(
            lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
            field_name='target', new_field_name='attr_start_label')
    datasets['test'].apply_field(
        lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'B'] else ATTR_NULL_TAG, x)),
        field_name='target', new_field_name='attr_start_label')

    datasets['train'].apply_field(
        lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
        field_name='target', new_field_name='attr_end_label')
    if 'dev' in datasets:
        datasets['dev'].apply_field(
            lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
            field_name='target', new_field_name='attr_end_label')
    datasets['test'].apply_field(
        lambda x: list(map(lambda y: y[2:] if y[0] in ['S', 'E'] else ATTR_NULL_TAG, x)),
        field_name='target', new_field_name='attr_end_label')

    span_label_vocab = Vocabulary()
    attr_label_vocab = Vocabulary()
    span_label_vocab.from_dataset(datasets['train'], field_name='span_label')
    attr_label_vocab.from_dataset(datasets['train'], field_name=['attr_start_label', 'attr_end_label'])
    vocabs['span_label'] = span_label_vocab
    vocabs['attr_label'] = attr_label_vocab
    print(f"span label: {span_label_vocab.word2idx.keys()}")
    print(f"attr label: {attr_label_vocab.word2idx.keys()}")
    return datasets, vocabs
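# Editor's illustration (not part of the loader): a minimal sketch of how the
# lambdas above split a BMES-style NER tag into a span label and an attribute
# label. ATTR_NULL_TAG is assumed to be a project-level placeholder constant.
ATTR_NULL_TAG = 'null'  # assumed placeholder value
target = ['B-LOC', 'E-LOC', 'O', 'S-PER']
span_label = [t[0] for t in target]                                                  # ['B', 'E', 'O', 'S']
attr_start_label = [t[2:] if t[0] in ['S', 'B'] else ATTR_NULL_TAG for t in target]  # ['LOC', 'null', 'null', 'PER']
attr_end_label = [t[2:] if t[0] in ['S', 'E'] else ATTR_NULL_TAG for t in target]    # ['null', 'LOC', 'null', 'PER']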
def process(self, paths: Union[str, Dict[str, str]], train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None, tgt_vocab_op: VocabularyOption = None,
            embed_opt: EmbeddingOption = None, char_level_op=False):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataInfo(datasets=self.load(paths))
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}
    # split the input into characters
    if char_level_op:
        for dataset in info.datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
    else:
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
        info.vocabs[input_name] = src_vocab

    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    tgt_vocab.index_dataset(*info.datasets.values(), field_name=target_name, new_field_name=target_name)
    info.vocabs[target_name] = tgt_vocab

    info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False)

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def test_bert_embed_eq_bert_piece_encoder(self):
    ds = DataSet({'words': ["this is a texta model vocab".split(), 'this is'.split()]})
    encoder = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert')
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          pool_method='first', include_cls_sep=True, pooled_cls=False, min_freq=1)
    embed.eval()
    words_res = embed(words)

    # check that the word-piece based results match the embedding results
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
def test_roberta_embed_eq_roberta_piece_encoder(self):
    # mainly check that the embedding output is consistent with the word piece encoder output
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    ds = DataSet({'words': ["this is a texta a sentence".split(), 'this is'.split()]})
    encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             pool_method='first', include_cls_sep=True, pooled_cls=False)
    embed.eval()
    words_res = embed(words)

    # check that the word-piece based results match the embedding results
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    datasets = {}
    info = DataBundle()
    paths = check_dataloader_paths(paths)
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def load_sst2(dict_path, embedding_path=None):
    '''
    :param dict_path: /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: glove 300d txt
    :return:
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    train_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')
    dev_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')
    train_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2)
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')
    # pretrained_embedding = load_word_emb(embedding_path, 300, vocab)

    label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(train_data, field_name='target')
    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)
    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab, label_vocab), pretrained_embedding
    else:
        return (train_data, dev_data), (vocab, label_vocab)
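# Hedged usage sketch for load_sst2; the SST-2 directory below is a
# placeholder, not a path taken from the original project.
(train_data, dev_data), (vocab, label_vocab) = load_sst2('/path/to/SST-2/')
print(len(train_data), len(dev_data))
print('vocab size:', len(vocab), 'labels:', len(label_vocab))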
def get_label_vocab(data_type='default'):
    label = ['family', 'education', 'money', 'med_exam', 'ID',
             'contact', 'name', 'time', 'location', 'profession']
    total_label = []
    for prefix in tagging_method:
        total_label.extend([prefix + '-' + ele for ele in label])
    total_label.append('O')
    print(total_label)

    label_ds = DataSet({'target': total_label})
    label_vocab = Vocabulary(unknown=None, padding=None)
    label_vocab.from_dataset(label_ds, field_name='target')
    label_vocab.index_dataset(label_ds, field_name='target')
    # label_vocab.add_word_lst(total_label)
    return label_vocab
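# Editor's sketch of how the label set is expanded above; 'tagging_method' is a
# module-level iterable of scheme prefixes, assumed here to be BIOES-style.
tagging_method = ('B', 'I', 'E', 'S')  # assumed value for illustration
label = ['name', 'time']
total_label = [p + '-' + l for p in tagging_method for l in label] + ['O']
# ['B-name', 'B-time', 'I-name', 'I-time', 'E-name', 'E-time', 'S-name', 'S-time', 'O']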
def prepare_ptb(args):
    datas = {}
    datas["pos"] = ConllLoader(headers=["words", "pos"], indexes=[0, 1]).load(args.pos).datasets
    chunk_data = ConllLoader(headers=["words", "chunk"], indexes=[0, 2]).load(args.chunk).datasets
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = ConllLoader(headers=["words", "ner"], indexes=[0, 3]).load(args.ner).datasets

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()), field_name="words", new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()), field_name=task_name, new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(Task(idx, task_name, data["train"], data["dev"], data["test"]))

    vocabs["words"] = src_vocab
    return task_lst, vocabs
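# Editor's illustration of the tag conversions applied above; iob2 and
# iob2bioes are project helpers, assumed to behave like the usual converters.
tags = ['I-PER', 'I-PER', 'O', 'I-LOC']
# iob2(tags)            -> ['B-PER', 'I-PER', 'O', 'B-LOC']
# iob2bioes(iob2(tags)) -> ['B-PER', 'E-PER', 'O', 'S-LOC']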
def test_from_dataset_no_entry(self):
    # test that no_create_entry is set correctly
    dataset = DataSet()
    start_char = 65
    num_samples = 10
    test_dataset = DataSet()
    for i in range(num_samples):
        char = [chr(start_char + i)] * 6
        ins = Instance(char=char)
        dataset.append(ins)
        ins = Instance(char=[c + c for c in char])
        test_dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset)
    vocab.index_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i) + chr(start_char + i)))
def process(self, data_bundle):
    data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
    input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
    target_fields = [C.TARGET, C.INPUT_LEN]

    if self.bigram:
        for dataset in data_bundle.datasets.values():
            dataset.apply_field(
                lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                field_name=C.CHAR_INPUT, new_field_name='bigrams')
        bigram_vocab = Vocabulary()
        bigram_vocab.from_dataset(
            data_bundle.get_dataset('train'), field_name='bigrams',
            no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items() if name != 'train'])
        bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams')
        data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
        input_fields.append('bigrams')

    _add_chars_field(data_bundle, lower=False)

    # index
    _indexize(data_bundle, input_field_names=C.CHAR_INPUT, target_field_names=C.TARGET)

    for name, dataset in data_bundle.datasets.items():
        dataset.set_pad_val(C.TARGET, self.target_pad_val)
        dataset.add_seq_len(C.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    return data_bundle
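# Editor's illustration of the bigram construction used above: each character
# is paired with its right neighbour, and the last one with '<eos>'.
chars = ['今', '天', '好']
bigrams = [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]
# ['今天', '天好', '好<eos>']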
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info
def get_vocab(trainset, testset):
    # build the vocabularies and word-to-index mappings
    # tok
    tok_vocab = Vocabulary()
    tok_vocab.from_dataset(trainset, field_name="tok", no_create_entry_dataset=testset)
    tok_vocab.index_dataset(trainset, testset, field_name="tok", new_field_name="chars")
    tok_vocab.index_dataset(trainset, testset, field_name="asp", new_field_name="aspect")
    # deprel
    dep_vocab = Vocabulary()
    dep_vocab.from_dataset(trainset, field_name="deprel")
    dep_vocab.index_dataset(trainset, testset, field_name="deprel", new_field_name="depidx")
    # pol (target)
    pol_vocab = Vocabulary(padding=None, unknown=None)
    pol_vocab.from_dataset(trainset, field_name="pol")
    pol_vocab.index_dataset(trainset, testset, field_name="pol", new_field_name="target")
    # pos
    pos_vocab = Vocabulary()
    pos_vocab.from_dataset(trainset, field_name="pos")
    pos_vocab.index_dataset(trainset, testset, field_name="pos", new_field_name="posidx")
    # post
    max_len = max(max(trainset["seq_len"]), max(testset["seq_len"]))
    post_vocab = Vocabulary()
    post_vocab.add_word_lst(list(range(-max_len, max_len)))
    post_vocab.index_dataset(trainset, testset, field_name="post", new_field_name="postidx")
    return tok_vocab, pos_vocab, post_vocab, trainset, testset
def test_from_dataset(self):
    start_char = 65
    num_samples = 10

    # 0 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=chr(start_char + i))
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 1 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[chr(start_char + i)] * 6)
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 2 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[[chr(start_char + i) for _ in range(6)] for _ in range(6)])
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')
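# Editor's note on the `i + 2` assertions above: a default Vocabulary reserves
# index 0 for the padding token and index 1 for the unknown token, so the first
# word collected by from_dataset receives index 2. Minimal sketch:
v = Vocabulary()
v.add_word_lst(['A', 'B'])
assert v.to_index('A') == 2 and v.to_index('B') == 3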
                              kernel_sizes=kernel_sizes, padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def forward(self, words, seq_len=None):
        x = self.embed(words)      # [N,L] -> [N,L,C]
        x = self.conv_pool(x)      # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)             # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}

    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


# demo version
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change to index
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')

model = CNNText((len(vocab), 128), num_classes=20, padding=2, dropout=0.1)

train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None,
                    index_token=True,
                    normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path,
                                         word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path,
                                           word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
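# Hedged usage sketch for load_resume_ner; the dataset directory below is a
# placeholder and is assumed to contain the train/dev/test .char.bmes files
# expected above; embeddings are skipped by passing None.
datasets, vocabs, embeddings = load_resume_ner('/path/to/ResumeNER',
                                               char_embedding_path=None,
                                               bigram_embedding_path=None)
print(len(vocabs['char']), len(vocabs['bigram']), len(vocabs['label']))
print(datasets['train'][0])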
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None,
                   index_token=True,
                   normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the fields that actually exist on these datasets
        # (they only carry 'chars', 'bigrams' and 'target')
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner_old(path, unigram_embedding_path=None, bigram_embedding_path=None,
                       index_token=True,
                       normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance, DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                  encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}

    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the fields that actually exist on these datasets
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print('Warning: the NER tags read by load_conllized_ontonote are in the raw conll format, '
              'not BIOS, and are therefore wrong!')
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)
    bundle = loader.load(path)
    # print(bundle.datasets)
    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path, 'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))
    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')
    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}
    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1, padding=None, unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])
    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])
    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: since there is more than one task, the target has to be set manually each time!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:', len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
def process(self, paths: Union[str, Dict[str, str]],
            char_vocab_opt: VocabularyOption = None,
            char_embed_opt: EmbeddingOption = None,
            bigram_vocab_opt: VocabularyOption = None,
            bigram_embed_opt: EmbeddingOption = None,
            L: int = 4):
    """
    The supported data format is one sample per line, with words separated by spaces. For example

    Option::

        共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 ( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 )
        女士 们 , 先生 们 , 同志 们 , 朋友 们 :

    paths supports two formats: a plain str and a Dict[str, str].

    Option::

        # 1. str
        # 1.1 pass a concrete file path
        data = SigHanLoader('bmes').process('/path/to/cws/data.txt')  # reads the content of data.txt
        # the result contains data.vocabs['chars']: a Vocabulary object,
        # data.vocabs['target']: a Vocabulary object, which may be absent depending on encoding_type
        # data.embeddings['chars']: an Embedding object, only present if a pretrained embedding path was given
        # data.datasets['train']: a DataSet object
        #     with the fields:
        #         raw_chars: list[str], each element is one Chinese character
        #         chars: list[int], each element is the index of the character
        #         target: list[int], varies with encoding_type
        # 1.2 pass a directory, which must contain a train.txt file
        data = SigHanLoader('bmes').process('path/to/cws/')  # tries to read train.txt, test.txt and dev.txt in that directory
        # the result contains data.vocabs['chars']: a Vocabulary object
        # data.vocabs['target']: a Vocabulary object
        # data.embeddings['chars']: an Embedding object, only if a pretrained embedding path was given;
        # data.datasets['train']: a DataSet object
        #     with the fields:
        #         raw_chars: list[str], each element is one Chinese character
        #         chars: list[int], each element is the index of the character
        #         target: list[int], varies with encoding_type
        # data.datasets['dev']: a DataSet object, present if the directory contains dev.txt; same content as data.datasets['train']
        # 2. dict, where the key is the dataset name and the value is the path to read from. It must contain the key 'train'
        paths = {'train': '/path/to/train/train.txt', 'test': '/path/to/test/test.txt',
                 'dev': '/path/to/dev/dev.txt'}
        data = SigHanLoader(paths).process(paths)
        # the result is the same as when passing a directory, but several datasets can be passed in;
        # the keys of data.datasets will match the keys passed in here

    :param paths: a directory, a file path, or a dict are all supported.
    :param char_vocab_opt: options used to build the chars vocabulary, defaults to min_freq=2
    :param char_embed_opt: options used to read the pretrained char embedding; no pretrained embedding is read by default
    :param bigram_vocab_opt: options used to build the bigram vocabulary; bigrams are not used by default.
        Only when this is given will the 'bigrams' field be added: a List[int] with the same length as chars;
        the bigrams of "abcde" are "ab bc cd de e<eos>"
    :param bigram_embed_opt: options used to read the pretrained bigram embedding, only effective when bigram_vocab_opt is given
    :param L: the segment length passed in when target_type is shift_relay
    :return:
    """
    # it is recommended to validate paths with check_dataloader_paths
    paths = check_dataloader_paths(paths)
    datasets = {}
    data = DataBundle()
    bigram = bigram_vocab_opt is not None
    for name, path in paths.items():
        dataset = self.load(path, bigram=bigram)
        datasets[name] = dataset

    input_fields = []
    target_fields = []
    # build the char vocab
    char_vocab = Vocabulary(min_freq=2) if char_vocab_opt is None else Vocabulary(**char_vocab_opt)
    char_vocab.from_dataset(datasets['train'], field_name='raw_chars')
    char_vocab.index_dataset(*datasets.values(), field_name='raw_chars', new_field_name='chars')
    data.vocabs[Const.CHAR_INPUT] = char_vocab
    input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET])
    target_fields.append(Const.TARGET)

    # build the target vocab
    if self.target_type == 'bmes':
        target_vocab = Vocabulary(unknown=None, padding=None)
        target_vocab.add_word_lst(['B'] * 4 + ['M'] * 3 + ['E'] * 2 + ['S'])
        target_vocab.index_dataset(*datasets.values(), field_name='target')
        data.vocabs[Const.TARGET] = target_vocab

    if char_embed_opt is not None:
        char_embed = EmbedLoader.load_with_vocab(**char_embed_opt, vocab=char_vocab)
        data.embeddings['chars'] = char_embed

    if bigram:
        bigram_vocab = Vocabulary(**bigram_vocab_opt)
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams')
        bigram_vocab.index_dataset(*datasets.values(), field_name='bigrams')
        data.vocabs['bigrams'] = bigram_vocab
        if bigram_embed_opt is not None:
            bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt, vocab=bigram_vocab)
            data.embeddings['bigrams'] = bigram_embed
        input_fields.append('bigrams')

    if self.target_type == 'shift_relay':
        func = partial(self._clip_target, L=L)
        for name, dataset in datasets.items():
            res = dataset.apply_field(func, field_name='target')
            relay_target = [res_i[0] for res_i in res]
            relay_mask = [res_i[1] for res_i in res]
            dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False)
            dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False)
    if self.target_type == 'shift_relay':
        input_fields.extend(['end_seg_mask'])
        target_fields.append('start_seg_mask')

    # add the datasets to the DataBundle
    for name, dataset in datasets.items():
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)
        data.datasets[name] = dataset

    return data
def load_msra_ner_1(path, char_embedding_path=None, bigram_embedding_path=None,
                    index_token=True, train_clip=False,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None,
                   index_token=True, char_min_freq=1, bigram_min_freq=1,
                   only_train_min_freq=0, char_word_dropout=0.01, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
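# Hedged usage sketch for this span/attribute variant of load_weibo_ner; the
# data directory is a placeholder and is assumed to contain the .char.bmoes files.
datasets, vocabs, embeddings = load_weibo_ner('/path/to/WeiboNER',
                                              unigram_embedding_path=None,
                                              bigram_embedding_path=None)
# expected vocab keys: span_label, attr_label, char, label, bigram
print(vocabs.keys())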
def process(self, paths, config, load_vocab_file=True):
    """
    :param paths: dict  path for each dataset
    :param load_vocab_file: bool  build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """
    vocab_size = config.vocab_size

    def _merge_abstracts(abstracts):
        merged = []
        for abstract in abstracts:
            merged.extend(abstract[:self.max_concat_len] + [SEP])
        if len(abstracts) == 0:
            assert merged == []
        return merged[:-1]

    def _pad_graph_inputs(graph_inputs):
        pad_text_wd = []
        max_len = config.max_graph_enc_steps
        for graph_input in graph_inputs:
            if len(graph_input) < max_len:
                pad_num = max_len - len(graph_input)
                graph_input.extend([PAD_TOKEN] * pad_num)
            else:
                graph_input = graph_input[:max_len]
            pad_text_wd.append(graph_input)
        if len(pad_text_wd) == 0:
            pad_text_wd.append([PAD_TOKEN] * max_len)
        return pad_text_wd

    def _get_nbr_input_len(input_wd):
        enc_len = [min(len(text), config.max_graph_enc_steps) for text in input_wd]
        if len(enc_len) == 0:
            enc_len = [0]
        return enc_len

    def _pad_article(text_wd):
        token_num = len(text_wd)
        max_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_len += self.max_concat_len * self.max_concat_num
        if token_num < max_len:
            padding = [PAD_TOKEN] * (max_len - token_num)
            article = text_wd + padding
        else:
            article = text_wd[:max_len]
        return article

    def _split_list(input_list):
        return [text.split() for text in input_list]

    def sent_tokenize(abstract):
        abs_list = abstract.split(".")
        return [(abst + ".") for abst in abs_list[:-1]]

    def _article_token_mask(text_wd):
        max_enc_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_enc_len += self.max_concat_len * self.max_concat_num
        token_num = len(text_wd)
        if token_num < max_enc_len:
            mask = [1] * token_num + [0] * (max_enc_len - token_num)
        else:
            mask = [1] * max_enc_len
        return mask

    def generate_article_input(text, abstracts):
        if config.neighbor_process == "sep":
            text_wd = text.split()[:config.max_enc_steps]
            text_wd.append(SEP)
            abstracts_wd = _merge_abstracts(abstracts)
            return text_wd + abstracts_wd
        else:
            return text.split()

    def generate_graph_inputs(graph_struct):
        graph_inputs_ = [graph_strut_dict[pid][config.graph_input_type] for pid in graph_struct]
        return _split_list(graph_inputs_[1:])

    def generate_graph_structs(paper_id):
        sub_graph_dict = {}
        sub_graph_set = []

        n_hop = config.n_hop
        max_neighbor_num = config.max_neighbor_num
        k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
        for sub_g in k_nbrs:
            sub_graph_set += sub_g

        for node in sub_graph_set:
            sub_graph_dict[node] = []

        for sub_g in k_nbrs:
            for centre_node in sub_g:
                nbrs = graph_strut_dict[centre_node]['references']
                c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                sub_graph_dict[centre_node].extend(c_nbrs)
                for c_nbr in c_nbrs:
                    sub_graph_dict[c_nbr].append(centre_node)
        # in python 3.6, the first in subgraph dict is source paper
        return sub_graph_dict

    def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
        sub_graph = [[] for _ in range(n_hop + 1)]
        level = 0
        visited = set()
        q = deque()
        q.append([paper_id, level])
        curr_node_num = 0
        while len(q) != 0:
            paper_first = q.popleft()
            paper_id_first, level_first = paper_first
            if level_first > n_hop:
                return sub_graph
            sub_graph[level_first].append(paper_id_first)
            curr_node_num += 1
            if curr_node_num > max_neighbor:
                return sub_graph
            visited.add(paper_id_first)
            for pid in graph_strut_dict[paper_id_first]["references"]:
                if pid not in visited and pid in graph_strut_dict:
                    q.append([pid, level_first + 1])
                    visited.add(pid)
        return sub_graph

    def generate_dgl_graph(paper_id, graph_struct, nodes_num):
        g = dgl.DGLGraph()
        assert len(graph_struct) == nodes_num

        g.add_nodes(len(graph_struct))
        pid2idx = {}
        for index, key_node in enumerate(graph_struct):
            pid2idx[key_node] = index
        assert pid2idx[paper_id] == 0

        for index, key_node in enumerate(graph_struct):
            neighbor = [pid2idx[node] for node in graph_struct[key_node]]
            # add self loop
            neighbor.append(index)
            key_nodes = [index] * len(neighbor)
            g.add_edges(key_nodes, neighbor)
        return g

    train_ds = None
    dataInfo = self.load(paths)

    # pop nodes in train graph in inductive setting
    if config.mode == "test" and self.setting == "inductive":
        dataInfo.datasets.pop("train")

    graph_strut_dict = {}
    for key, ds in dataInfo.datasets.items():
        for ins in ds:
            graph_strut_dict[ins["paper_id"]] = ins

    logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

    for key, ds in dataInfo.datasets.items():
        # process summary
        ds.apply(lambda x: x['abstract'].split(), new_field_name='summary_wd')
        ds.apply(lambda x: sent_tokenize(x['abstract']), new_field_name='abstract_sentences')
        # generate graph
        ds.apply(lambda x: generate_graph_structs(x["paper_id"]), new_field_name="graph_struct")
        ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]), new_field_name='graph_inputs_wd')
        ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1, new_field_name="nodes_num")
        # pad input
        ds.apply(lambda x: generate_article_input(x['introduction'], x["graph_inputs_wd"]),
                 new_field_name='input_wd')
        ds.apply(lambda x: _article_token_mask(x["input_wd"]), new_field_name="enc_len_mask")
        ds.apply(lambda x: sum(x["enc_len_mask"]), new_field_name="enc_len")
        ds.apply(lambda x: _pad_article(x["input_wd"]), new_field_name="pad_input_wd")
        ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]), new_field_name="nbr_inputs_len")
        ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]), new_field_name="pad_graph_inputs_wd")
        if key == "train":
            train_ds = ds

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")

        vocabs = Vocabulary(max_size=config.vocab_size - 2, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.from_dataset(train_ds, field_name=["input_wd", "summary_wd"])
        vocabs.add_word(START_DECODING)
        vocabs.add_word(STOP_DECODING)
        vocab_dict["vocab"] = vocabs
        # save vocab
        with open(os.path.join(config.train_path, "vocab"), "w", encoding="utf8") as f:
            for w, idx in vocabs:
                f.write(str(w) + "\t" + str(idx) + "\n")
        logger.info("build new vocab ends.. please reRun the code with load_vocab = True")
        exit(0)
    else:
        logger.info("[INFO] Load existing vocab from %s!" % config.vocab_path)
        word_list = []
        cnt = 3  # pad and unk
        if config.neighbor_process == "sep":
            cnt += 1

        with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break

        vocabs = Vocabulary(max_size=vocab_size, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.add_word_lst(word_list)
        vocabs.add(START_DECODING)
        vocabs.add(STOP_DECODING)
        if config.neighbor_process == "sep":
            vocabs.add(SEP)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    logger.info(f"vocab size = {len(vocabs)}")
    assert len(vocabs) == config.vocab_size
    dataInfo.set_vocab(vocabs, "vocab")

    for key, dataset in dataInfo.datasets.items():
        # do not process the training set in test mode
        if config.mode == "test" and key == "train":
            continue
        data_dict = {
            "enc_input": [],
            "nbr_inputs": [],
            "graph": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": [],
        }
        logger.info(f"start construct the input of the model for {key} set, please wait...")
        for instance in dataset:
            graph_inputs = instance["pad_graph_inputs_wd"]
            abstract_sentences = instance["summary_wd"]
            enc_input = instance["pad_input_wd"]
            enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
            graph = generate_dgl_graph(instance["paper_id"], instance["graph_struct"], instance["nodes_num"])

            data_dict["graph"].append(graph)
            data_dict["enc_input"].append(enc_input)
            data_dict["nbr_inputs"].append(nbr_inputs)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab", data_dict["enc_input_extend_vocab"])
        dataset.add_field("graph", data_dict["graph"])

        dataset.set_ignore_type('graph')  # without this line, there may be some errors
        dataset.set_input("graph")

        dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len", "enc_input", "enc_len_mask",
                          "dec_input", "dec_len", "article_oovs", "nodes_num", "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

        dataset.delete_field('graph_inputs_wd')
        dataset.delete_field('pad_graph_inputs_wd')
        dataset.delete_field('input_wd')
        dataset.delete_field('pad_input_wd')

    logger.info("------load dataset over---------")
    return dataInfo, vocabs
                                         target=file2label[file]))
        else:
            train_dataset.append(Instance(raw_words=raw_words, words=words,
                                          seq_len=seq_len, target=file2label[file]))

train_dataset.set_input('words', 'seq_len', 'target')
test_dataset.set_input('words', 'seq_len', 'target')
train_dataset.set_target('target')
test_dataset.set_target('target')

'''build vocabulary'''
vocab = Vocabulary()
vocab.from_dataset(train_dataset, field_name='words', no_create_entry_dataset=[test_dataset])
vocab.index_dataset(train_dataset, test_dataset, field_name='words')
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_dataset, field_name='target', no_create_entry_dataset=[test_dataset])
target_vocab.index_dataset(train_dataset, test_dataset, field_name='target')

'''build bundle'''
data_dict = {"train": train_dataset, "test": test_dataset}
vocab_dict = {"words": vocab, "target": target_vocab}
data_bundle = DataBundle(vocab_dict, data_dict)
print(data_bundle)

'''build model'''
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))
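# Hedged continuation sketch (not from the original snippet): one plausible way
# to train the classifier built above with fastNLP's Trainer; the
# hyper-parameters below are placeholders.
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

trainer = Trainer(data_bundle.get_dataset('train'), model,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  dev_data=data_bundle.get_dataset('test'),
                  batch_size=8, n_epochs=2)
trainer.train()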
def process(self, paths: Union[str, Dict[str, str]],
            word_vocab_opt: VocabularyOption = None,
            lower: bool = False):
    """
    Read and process the data. Lines starting with '-DOCSTART-' are ignored.

    :param paths:
    :param word_vocab_opt: initialization options for the vocabulary
    :param lower: whether to lowercase all letters.
    :return:
    """
    # read the data
    paths = check_dataloader_paths(paths)
    data = DataInfo()
    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, path in paths.items():
        dataset = self.load(path)
        dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
        if lower:
            dataset.words.lower()
        data.datasets[name] = dataset

    # construct the word vocab
    word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
    word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
                            no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                     if name != 'train'])
    word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
    data.vocabs[Const.INPUT] = word_vocab

    # cap words
    cap_word_vocab = Vocabulary()
    cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
                                no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                         if name != 'train'])
    cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
    input_fields.append('cap_words')
    data.vocabs['cap_words'] = cap_word_vocab

    # build the vocab for target
    target_vocab = Vocabulary(unknown=None, padding=None)
    target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None,
                   index_token=True, char_min_freq=1, bigram_min_freq=1,
                   only_train_min_freq=0, char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_ip_step2(path, char_embedding_path=None):
    train_path = os.path.join(path, 'train1.txt')
    dev_path = os.path.join(path, 'dev1.txt')
    test_path = os.path.join(path, 'test1.txt')

    # 播放徐秉龙的故事\t徐秉龙\talbum_film\t[0 0 0 0 1]
    loader = myConllLoader()
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    char_vocab = Vocabulary()
    entity_vocab = Vocabulary()

    logging.info('dev instance:{}'.format(len(datasets['dev'])))
    logging.info('test instance:{}'.format(len(datasets['test'])))
    logging.info('train instance:{}'.format(len(datasets['train'])))

    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='left_context')
    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='right_context')
    char_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='raw_entity')
    entity_vocab.from_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='raw_entity_label')

    char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                             field_name='left_context', new_field_name='left_chars')
    char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                             field_name='right_context', new_field_name='right_chars')
    char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                             field_name='raw_entity', new_field_name='entity_chars')
    entity_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                               field_name='raw_entity_label', new_field_name='entity_label')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['entity'] = entity_vocab

    embeddings = {}
    if char_embedding_path is not None:
        bi_voc = dict()
        for k, v in char_vocab:
            bi_voc[k] = v
        embed_weight = build_pretrain_embedding(char_embedding_path, bi_voc, embedd_dim=200)
        embeddings['char'] = embed_weight

    return datasets, vocabs, embeddings
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None,
                 index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
data_set.rename_field('Sentiment', 'target')

def get_words(instance):
    ins = instance['raw_words'].split()
    if not ins:
        ins.append('nothing')
    return ins

data_set.apply(get_words, new_field_name='words')
# data_set.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
# data_set.drop(lambda ins: ins['raw_words'].strip() == '')

vocab = Vocabulary()
vocab.from_dataset(data_set, field_name='words')
vocab.index_dataset(data_set, field_name='words')

vocab_target = Vocabulary(unknown=None, padding=None)
vocab_target.from_dataset(data_set, field_name='target')
vocab_target.index_dataset(data_set, field_name='target')

data_set.set_input('words')
data_set.set_target('target')

train_data, dev_data = data_set.split(0.015)

# training
device = 0 if torch.cuda.is_available() else 'cpu'

'''
EMBED_DIM = 100
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None,
                    index_token=True, char_min_freq=1, bigram_min_freq=1,
                    only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')
    print(f"load train dataset: {train_path}")

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def process(self, paths: Union[str, Dict[str, str]],
            word_vocab_opt: VocabularyOption = None,
            lower: bool = True) -> DataBundle:
    """
    Read and process the data. The returned DataInfo contains the following content

    vocabs:
        word: Vocabulary
        target: Vocabulary
    datasets:
        train: DataSet
            words: List[int], set as input
            target: int. label, set as both input and target
            seq_len: int. sentence length, set as both input and target
            raw_words: List[str]
        xxx (may vary depending on the paths passed in)

    :param paths:
    :param word_vocab_opt: initialization options for the vocabulary
    :param lower: whether to lowercase
    :return:
    """
    paths = check_dataloader_paths(paths)
    data = DataBundle()
    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]

    for name, path in paths.items():
        dataset = self.load(path)
        dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
        if lower:
            dataset.words.lower()
        data.datasets[name] = dataset

    # construct the word vocab
    word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
    word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
                            no_create_entry_dataset=[dataset for name, dataset in data.datasets.items()
                                                     if name != 'train'])
    word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
    data.vocabs[Const.INPUT] = word_vocab

    # cap words
    cap_word_vocab = Vocabulary()
    cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
    cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
    input_fields.append('cap_words')
    data.vocabs['cap_words'] = cap_word_vocab

    # build the vocab for target
    target_vocab = Vocabulary(unknown=None, padding=None)
    target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data