def process(self, paths: Union[str, Dict[str, str]], train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None, tgt_vocab_op: VocabularyOption = None,
            embed_opt: EmbeddingOption = None, char_level_op=False):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataInfo(datasets=self.load(paths))
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
    _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values()

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}
    # split each word into characters
    if char_level_op:
        for dataset in info.datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
    else:
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name)
        info.vocabs[input_name] = src_vocab

    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    tgt_vocab.index_dataset(*info.datasets.values(), field_name=target_name, new_field_name=target_name)
    info.vocabs[target_name] = tgt_vocab

    info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False)

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info

def test_roberta_embed_eq_roberta_piece_encoder(self):
    # mainly check that the embedding output matches the word-piece encoder output
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    ds = DataSet({
        'words': ["this is a texta a sentence".split(), 'this is'.split()]
    })
    encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             pool_method='first', include_cls_sep=True, pooled_cls=False)
    embed.eval()
    words_res = embed(words)

    # check that the word-piece handling works correctly
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)

def test_bert_embed_eq_bert_piece_encoder(self):
    ds = DataSet({
        'words': ["this is a texta model vocab".split(), 'this is'.split()]
    })
    encoder = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert')
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          pool_method='first', include_cls_sep=True, pooled_cls=False, min_freq=1)
    embed.eval()
    words_res = embed(words)

    # check that the word-piece handling works correctly
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)

def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(Instance(sentence=raw_train.data[i], target=int(raw_train.target[i])))

    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(), new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(Instance(sentence=raw_test.data[i], target=int(raw_test.target[i])))

    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(), new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, vocab

def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
                  'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
                  'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
                  'talk.politics.misc', 'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

    dataset = DataSet()
    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i],
                                    target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i],
                                    target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)
    return vocab, train_data, dev_data, test_data

def process(self, paths: Union[str, Dict[str, str]], src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None, src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    datasets = {}
    info = DataBundle()
    paths = check_dataloader_paths(paths)
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()
        return chars

    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name="words", new_field_name='chars')

    datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False)

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {"words": src_vocab, "target": tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info

def get_train_dev_test_vocab():
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../../')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../../')
    # dataset_train, dataset_test = get_text_classification_datasets()
    train_data = dataset_train.data
    train_target = dataset_train.target
    test_data = dataset_test.data
    test_target = dataset_test.target
    print(f'train dataset: {len(train_data)}')
    print(f'test dataset: {len(test_data)}')

    train_dataset = to_dataset(train_data, train_target)
    test_dataset = to_dataset(test_data, test_target)

    vocab = Vocabulary(min_freq=10).from_dataset(train_dataset, field_name='words')
    print(f'Vocab size: {len(vocab)}')
    vocab.index_dataset(train_dataset, field_name='words', new_field_name='words')
    vocab.index_dataset(test_dataset, field_name='words', new_field_name='words')

    train_dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    # Rename fields to suit the built-in model in fastNLP
    train_dataset.rename_field('words', Const.INPUT)
    train_dataset.rename_field('seq_len', Const.INPUT_LEN)
    train_dataset.rename_field('target', Const.TARGET)
    train_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    train_dataset.set_target(Const.TARGET)

    test_dataset.rename_field('words', Const.INPUT)
    test_dataset.rename_field('seq_len', Const.INPUT_LEN)
    test_dataset.rename_field('target', Const.TARGET)
    test_dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    test_dataset.set_target(Const.TARGET)

    # Split off a development dataset
    train_dataset, dev_dataset = train_dataset.split(0.1)

    return train_dataset, dev_dataset, test_dataset, vocab

def load_sst2(dict_path, embedding_path=None):
    '''
    :param dict_path: /remote-home/xnli/data/corpus/text_classification/SST-2/
    :param embedding_path: glove 300d txt
    :return:
    '''
    train_path = os.path.join(dict_path, 'train.tsv')
    dev_path = os.path.join(dict_path, 'dev.tsv')

    loader = CSVLoader(headers=('words', 'target'), sep='\t')
    train_data = loader.load(train_path).datasets['train']
    dev_data = loader.load(dev_path).datasets['train']

    train_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')
    dev_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words')

    train_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dev_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2)
    vocab.from_dataset(train_data, field_name='words')
    vocab.from_dataset(dev_data, field_name='words')
    # pretrained_embedding = load_word_emb(embedding_path, 300, vocab)

    label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(train_data, field_name='target')
    label_vocab.index_dataset(train_data, field_name='target')
    label_vocab.index_dataset(dev_data, field_name='target')

    vocab.index_dataset(train_data, field_name='words', new_field_name='words')
    vocab.index_dataset(dev_data, field_name='words', new_field_name='words')

    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)
    dev_data.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_data.set_target(Const.TARGET)

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_data, dev_data), (vocab, label_vocab), pretrained_embedding
    else:
        return (train_data, dev_data), (vocab, label_vocab)

def get_label_vocab(data_type='default'):
    label = [
        'family', 'education', 'money', 'med_exam', 'ID', 'contact', 'name',
        'time', 'location', 'profession'
    ]
    total_label = []
    for prefix in tagging_method:
        total_label.extend([prefix + '-' + ele for ele in label])
    total_label.append('O')
    print(total_label)

    label_ds = DataSet({'target': total_label})
    label_vocab = Vocabulary(unknown=None, padding=None)
    label_vocab.from_dataset(label_ds, field_name='target')
    label_vocab.index_dataset(label_ds, field_name='target')
    # label_vocab.add_word_lst(total_label)
    return label_vocab

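# A minimal alternative sketch (not part of the snippet above, but suggested by its
# commented-out add_word_lst call): a fixed list of label strings can be registered on a
# fastNLP Vocabulary directly, without the temporary DataSet round-trip. The label list
# below is hypothetical and for illustration only.
from fastNLP import Vocabulary

_label_vocab = Vocabulary(unknown=None, padding=None)
_label_vocab.add_word_lst(['B-name', 'I-name', 'O'])  # hypothetical labels
_label_vocab.build_vocab()
assert 'O' in _label_vocab
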
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []
    for i in range(len(traindata.data)):
        raw_data1.append(Instance(sentence=traindata.data[i], label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset, testset, field_name='words')
    vocab.index_dataset(trainset, testset, field_name='words', new_field_name='words')

    trainset.set_input('words')
    testset.set_input('words')
    trainset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    testset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize

def Get_Data_Vocab():
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    train_data_raw = construct_dataset(dataset_train)
    test_data = construct_dataset(dataset_test)

    vocab = Vocabulary(min_freq=10).from_dataset(train_data_raw, field_name='input')
    vocab.index_dataset(train_data_raw, field_name='input', new_field_name='input')
    vocab.index_dataset(test_data, field_name='input', new_field_name='input')

    train_data_raw.set_input("input")
    train_data_raw.set_target("target")
    test_data.set_input("input")
    test_data.set_target("target")

    dev_data, train_data = train_data_raw.split(0.8)

    return vocab, train_data, dev_data, test_data

def prepare_ptb(args):
    datas = {}
    datas["pos"] = ConllLoader(headers=["words", "pos"], indexes=[0, 1]).load(args.pos).datasets
    chunk_data = ConllLoader(headers=["words", "chunk"], indexes=[0, 2]).load(args.chunk).datasets
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas["ner"] = ConllLoader(headers=["words", "ner"], indexes=[0, 3]).load(args.ner).datasets

    for ds in datas['chunk'].values():
        ds.apply_field(lambda x: iob2(x), 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    vocabs = {}
    src_vocab = Vocabulary()
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*list(data.values()), field_name=task_name)
        src_vocab.from_dataset(*list(data.values()), field_name="words")
        vocabs[task_name] = vocab

    task_lst = []
    for idx, task_name in enumerate(["pos", "chunk", "ner"]):
        data = datas[task_name]
        src_vocab.index_dataset(*list(data.values()), field_name="words", new_field_name="words")
        vocabs[task_name].index_dataset(*list(data.values()), field_name=task_name, new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(Task(idx, task_name, data["train"], data["dev"], data["test"]))

    vocabs["words"] = src_vocab
    return task_lst, vocabs

def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)
    dic_train = {
        "input": dataset_train.data,
        "target": dataset_train.target
    }
    dic_test = {
        "input": dataset_test.data,
        "target": dataset_test.target
    }
    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)
    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab, or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])
    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)
    vocab.index_dataset(train_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data

def test_from_dataset_no_entry(self):
    # test that no_create_entry is set correctly
    dataset = DataSet()
    start_char = 65
    num_samples = 10
    test_dataset = DataSet()
    for i in range(num_samples):
        char = [chr(start_char + i)] * 6
        ins = Instance(char=char)
        dataset.append(ins)
        ins = Instance(char=[c + c for c in char])
        test_dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset)
    vocab.index_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i) + chr(start_char + i)))

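# A hedged sketch of the no_create_entry behaviour the test above relies on: words that
# appear only in a no_create_entry_dataset are still indexed, but are flagged so that
# downstream embeddings can treat them differently from words seen in the training data.
# The toy datasets below are illustrative only.
from fastNLP import DataSet, Instance, Vocabulary

_train = DataSet([Instance(char=['a', 'b'])])
_extra = DataSet([Instance(char=['c'])])
_v = Vocabulary()
_v.from_dataset(_train, field_name='char', no_create_entry_dataset=_extra)
assert _v._is_word_no_create_entry('c')
assert not _v._is_word_no_create_entry('a')
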
def process(self, data_bundle):
    data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT)
    input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN]
    target_fields = [C.TARGET, C.INPUT_LEN]

    if self.bigram:
        for dataset in data_bundle.datasets.values():
            # pair each character with its successor; the final character is paired with '<eos>'
            dataset.apply_field(
                lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])],
                field_name=C.CHAR_INPUT, new_field_name='bigrams')
        bigram_vocab = Vocabulary()
        bigram_vocab.from_dataset(
            data_bundle.get_dataset('train'), field_name='bigrams',
            no_create_entry_dataset=[
                ds for name, ds in data_bundle.datasets.items() if name != 'train'
            ])
        bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams')
        data_bundle.set_vocab(bigram_vocab, field_name='bigrams')
        input_fields.append('bigrams')

    _add_chars_field(data_bundle, lower=False)

    # index
    _indexize(data_bundle, input_field_names=C.CHAR_INPUT, target_field_names=C.TARGET)

    for name, dataset in data_bundle.datasets.items():
        dataset.set_pad_val(C.TARGET, self.target_pad_val)
        dataset.add_seq_len(C.CHAR_INPUT)

    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)

    return data_bundle

def process(self, paths: Union[str, Dict[str, str]], src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None, src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {
        "words": src_vocab,
        "target": tgt_vocab
    }
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input("words")
        dataset.set_target("target")

    return info

def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train['data'])):
        di = transfer(raw_train['data'][i])
        train_set.append(Instance(sentence=di, target=int(raw_train['target'][i])))

    train_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test['data'])):
        di = transfer(raw_test['data'][i])
        test_set.append(Instance(sentence=di, target=int(raw_test['target'][i])))

    test_set.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    word_dict = Vocabulary(min_freq=2)
    train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
    word_dict.build_vocab()
    word_dict.index_dataset(train_set, field_name='words', new_field_name='words')
    word_dict.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, word_dict

def test_from_dataset(self):
    start_char = 65
    num_samples = 10

    # 0 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=chr(start_char + i))
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 1 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[chr(start_char + i)] * 6)
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 2 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[[chr(start_char + i) for _ in range(6)] for _ in range(6)])
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

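# A short sketch of why the expected index above is i + 2, assuming fastNLP's default
# special tokens: a fresh Vocabulary reserves index 0 for the padding token and index 1
# for the unknown token, so the first word collected from a dataset receives index 2.
from fastNLP import Vocabulary

_v = Vocabulary()        # defaults: padding '<pad>' -> 0, unknown '<unk>' -> 1
_v.add_word_lst(['A'])
_v.build_vocab()
assert _v.to_index('A') == 2
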
label_vocab['POS'] = Vocabulary().from_dataset(all_data['train']['POS-ctb9'], field_name='target')
label_vocab['CWS'] = Vocabulary().from_dataset(all_data['train']['CWS-pku'], field_name='target')
label_vocab['NER'] = Vocabulary().from_dataset(all_data['train']['NER-msra'], field_name='target')
label_vocab['Parsing'] = torch.load('vocab/parsing_vocab')
label_vocab['pos'] = Vocabulary().from_dataset(all_data['train']['Parsing-ctb9'], field_name='pos')

for target in target_list:
    for task in task_list:
        all_data[target][task].drop(lambda ins: len(ins['words']) > 256)
        chars_vocab.index_dataset(all_data[target][task], field_name='words', new_field_name='chars')
        task_class = task.split('-')[0]
        all_data[target][task].apply(lambda ins: task_class, new_field_name='task_class')
        if task == 'Parsing-ctb9':
            label_vocab['Parsing'].index_dataset(all_data[target]['Parsing-ctb9'], field_name='char_labels')
            label_vocab[task_class].index_dataset(all_data[target][task], field_name='dep_label')
            label_vocab['pos'].index_dataset(all_data[target]['Parsing-ctb9'], field_name='pos')
            label_vocab['POS'].index_dataset(all_data[target]['Parsing-ctb9'], field_name='target')
        all_data[target][task].set_input('seq_len_for_wordlist',

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt: VocabularyOption = None,
            lower: bool = False):
    """
    Read and process the data. Lines starting with '-DOCSTART-' are ignored.

    :param paths:
    :param word_vocab_opt: options used to initialise the word vocabulary
    :param lower: whether to lowercase all letters
    :return:
    """
    # read the data
    paths = check_dataloader_paths(paths)
    data = DataInfo()
    input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
    target_fields = [Const.TARGET, Const.INPUT_LEN]
    for name, path in paths.items():
        dataset = self.load(path)
        dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
        if lower:
            dataset.words.lower()
        data.datasets[name] = dataset

    # construct the word vocabulary
    word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
    word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
                            no_create_entry_dataset=[
                                dataset for name, dataset in data.datasets.items() if name != 'train'
                            ])
    word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
    data.vocabs[Const.INPUT] = word_vocab

    # cap words
    cap_word_vocab = Vocabulary()
    cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
                                no_create_entry_dataset=[
                                    dataset for name, dataset in data.datasets.items() if name != 'train'
                                ])
    cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
    input_fields.append('cap_words')
    data.vocabs['cap_words'] = cap_word_vocab

    # build the target vocabulary
    target_vocab = Vocabulary(unknown=None, padding=None)
    target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
    target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
    data.vocabs[Const.TARGET] = target_vocab

    for name, dataset in data.datasets.items():
        dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
        dataset.set_input(*input_fields)
        dataset.set_target(*target_fields)

    return data

                                  kernel_sizes=kernel_sizes, padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)

    def forward(self, words, seq_len=None):
        x = self.embed(words)     # [N,L] -> [N,L,C]
        x = self.conv_pool(x)     # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)            # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}

    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


# demo version
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change words to indices
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')
model = CNNText((len(vocab), 128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()

testset = DataSet()
for i in range(newsgroups_test.target.shape[0]):
    testset.append(Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '),
                            target=int(newsgroups_test.target[i])))
testset.apply(lambda x: x['raw_sentence'].lower().translate(table), new_field_name='sentence')
testset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
testset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

vocab = Vocabulary(min_freq=10).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words', new_field_name='words')
vocab.index_dataset(testset, field_name='words', new_field_name='words')

# model = CNNText((len(vocab), 50), num_classes=20, padding=2, dropout=0.1)
model = mycnn(len(vocab), 100, len(dataset.target))
# model = myrnn(len(vocab), 100, 20)
# model = LSTMText(len(vocab), 64, 20)
# used

dataset.rename_field('words', Const.INPUT)
dataset.rename_field('target', Const.TARGET)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

testset.rename_field('words', Const.INPUT)
testset.rename_field('target', Const.TARGET)

def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         normalize=normalize['char'])
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)

    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_weibo_ner_old(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                       normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])

    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance,DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna,
    #                                  encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}

    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01,
                   train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings

def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                 train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k,v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')
    print(f"load train dataset: {train_path}")

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings

def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.loader import ConllLoader
    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']

    if 'NER' in headers:
        print('Warning: the NER labels read by load_conllized_ontonote are plain CoNLL tags, '
              'not BIOES, and are therefore wrong!')
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)
    bundle = loader.load(path)
    # print(bundle.datasets)
    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))
    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')
    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')

    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}
    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1, padding=None, unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])
    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])
    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: with more than one task, the target must be set manually each time!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:', len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)