def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')
    print(f"load train dataset: {train_path}")

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
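
# A minimal usage sketch for load_resume_ner above. The directory layout and embedding file names are
# assumptions for illustration only; the function expects train/dev/test files in *.char.bmoes format and
# returns three dicts keyed by split/field: fastNLP DataSets, Vocabularies, and StaticEmbeddings.
def _demo_load_resume_ner():
    resume_dir = '/path/to/ResumeNER'                  # hypothetical dataset folder
    uni_vec = '/path/to/unigram_char_embeddings.txt'   # hypothetical pretrained character vectors
    bi_vec = '/path/to/bigram_embeddings.txt'          # hypothetical pretrained bigram vectors
    datasets, vocabs, embeddings = load_resume_ner(resume_dir, char_embedding_path=uni_vec,
                                                   bigram_embedding_path=bi_vec, index_token=True)
    print(datasets['train'][0])                        # one Instance: chars / bigrams / target / seq_len
    print(len(vocabs['char']), len(vocabs['bigram']), len(vocabs['label']))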
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01,
                   train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    bundle = loader.load(path)
    datasets = bundle.datasets
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the 'chars' / 'bigrams' / 'target' fields created above
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    normalize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, normalize=normalize['char'])
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01, normalize=normalize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_conllized_ontonote_POS(path, embedding_path=None):
    from fastNLP.io.loader import ConllLoader

    header2index = {'words': 3, 'POS': 4, 'NER': 10}
    headers = ['words', 'POS']
    if 'NER' in headers:
        print('Warning: the NER labels read by load_conllized_ontonote are not BIOES tags; '
              'they are in the raw CoNLL format and therefore wrong!')
    indexes = list(map(lambda x: header2index[x], headers))

    loader = ConllLoader(headers, indexes)
    bundle = loader.load(path)
    # print(bundle.datasets)
    train_set = bundle.datasets['train']
    dev_set = bundle.datasets['dev']
    test_set = bundle.datasets['test']

    # train_set = loader.load(os.path.join(path,'train.txt'))
    # dev_set = loader.load(os.path.join(path, 'dev.txt'))
    # test_set = loader.load(os.path.join(path, 'test.txt'))
    # print(len(train_set))

    train_set.add_seq_len('words', 'seq_len')
    dev_set.add_seq_len('words', 'seq_len')
    test_set.add_seq_len('words', 'seq_len')
    # print(dataset['POS'])

    vocab = Vocabulary(min_freq=1)
    vocab.from_dataset(train_set, field_name='words')
    vocab.from_dataset(dev_set, field_name='words')
    vocab.from_dataset(test_set, field_name='words')
    vocab.index_dataset(train_set, field_name='words')
    vocab.index_dataset(dev_set, field_name='words')
    vocab.index_dataset(test_set, field_name='words')

    label_vocab_dict = {}
    for i, h in enumerate(headers):
        if h == 'words':
            continue
        label_vocab_dict[h] = Vocabulary(min_freq=1, padding=None, unknown=None)
        label_vocab_dict[h].from_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(train_set, field_name=h)
        label_vocab_dict[h].index_dataset(dev_set, field_name=h)
        label_vocab_dict[h].index_dataset(test_set, field_name=h)

    train_set.set_input(Const.INPUT, Const.INPUT_LEN)
    train_set.set_target(headers[1])
    dev_set.set_input(Const.INPUT, Const.INPUT_LEN)
    dev_set.set_target(headers[1])
    test_set.set_input(Const.INPUT, Const.INPUT_LEN)
    test_set.set_target(headers[1])

    if len(headers) > 2:
        print('Warning: since there is more than one task, the target field has to be set manually each time!')

    print('train:', len(train_set), 'dev:', len(dev_set), 'test:', len(test_set))

    if embedding_path is not None:
        pretrained_embedding = load_word_emb(embedding_path, 300, vocab)
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict), pretrained_embedding
    else:
        return (train_set, dev_set, test_set), (vocab, label_vocab_dict)
def load_weibo_ner_old(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                       normlize={'char': True, 'bigram': True, 'word': False}):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # from fastNLP.io.file_reader import _read_conll
    # from fastNLP.core import Instance, DataSet
    # def _load(path):
    #     ds = DataSet()
    #     for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna, encoding='ISO-8859-1'):
    #         ins = {h: data[i] for i, h in enumerate(loader.headers)}
    #         ds.append(Instance(**ins))
    #     return ds
    # from fastNLP.io.utils import check_loader_paths
    # paths = check_loader_paths(path)
    # datasets = {name: _load(path) for name, path in paths.items()}

    datasets = {}
    train_path = os.path.join(path, 'train.all.bmes')
    dev_path = os.path.join(path, 'dev.all.bmes')
    test_path = os.path.join(path, 'test.all.bmes')
    datasets['train'] = loader.load(train_path).datasets['train']
    datasets['dev'] = loader.load(dev_path).datasets['train']
    datasets['test'] = loader.load(test_path).datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    vocabs = {}
    word_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    word_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = word_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        # index the 'chars' / 'bigrams' / 'target' fields created above
        word_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=0.01, normalize=normlize['char'])
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, normalize=normlize['bigram'])
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_msra_ner_1(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['test'], field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['test'], field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['test'], field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    """
    Load the Weibo corpus and cache the data.

    :param path: directory containing the Weibo dataset
    :param unigram_embedding_path: if given, use a unigram (character) embedding
    :param bigram_embedding_path: if given, use a bigram embedding; may be combined with the unigram embedding
    :param index_token:
    :param char_min_freq:
    :param bigram_min_freq:
    :param only_train_min_freq:
    :param char_word_dropout:
    :return:
    """
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    # the dataset headers are 'chars' and 'target': the first column read is chars, the second is target
    loader = ConllLoader(headers=['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_path = os.path.join(path, 'train.conll')
    dev_path = os.path.join(path, 'dev.conll')
    test_path = os.path.join(path, 'test.conll')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    print("number of samples per split:")
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars','bigrams','seq_len','target')
    #     v.set_target('target','seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
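
# The docstring above mentions caching. In fastNLP this is commonly done with the cache_results decorator,
# which pickles the returned (datasets, vocabs, embeddings) tuple on the first call and reloads it afterwards.
# This wrapper is only a sketch: the cache path is an assumption, and _refresh=True forces a rebuild.
from fastNLP import cache_results

@cache_results(_cache_fp='cache/weibo_ner', _refresh=False)
def load_weibo_ner_cached(path, unigram_embedding_path=None, bigram_embedding_path=None, **kwargs):
    # thin wrapper so the expensive loading/indexing above only runs once per cache file
    return load_weibo_ner(path, unigram_embedding_path, bigram_embedding_path, **kwargs)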
def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0):
    # print('*' * 40)
    # print(bigram_embedding_path)
    train_path = os.path.join(path, 'train.char.bmes')
    dev_path = os.path.join(path, 'dev.char.bmes')
    test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    # | ['爱', '财', '不', '分', '古', '今', ',', '除... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',... |
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'], field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding
    # embeddings["bigram"] = char_embedding

    return datasets, vocabs, embeddings
def load_tianchi_ner(path,
                     unigram_embedding_path=None,  # yangjie_rich_pretrain_unigram_path
                     bigram_embedding_path=None,   # yangjie_rich_pretrain_bigram_path
                     index_token=True, char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0,
                     char_word_dropout=0.01):
    # step0. ============================= prepare the data, e.g. file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'tianchi.train')
    dev_path = os.path.join(path, 'tianchi.dev')
    test_path = os.path.join(path, 'tianchi.test')

    paths = {}
    paths['dev'] = dev_path
    paths['train'] = train_path
    paths['test'] = test_path

    # step1. ============================= build the datasets
    # a plain dict, but note that every value in `datasets` is an instance of fastNLP's DataSet class
    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    # step2. ============================= build the vocabularies from the datasets
    # see the Vocabulary tutorial:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three entries: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag.
        # apply_field() is a fastNLP DataSet method; passing 'chars' for both field_name and
        # new_field_name transforms the column in place (no new column is added), while
        # (get_bigrams, 'chars', 'bigrams') builds a new 'bigrams' column from the 'chars' column,
        # i.e. it joins each pair of adjacent characters into a bigram (see the sketch below).
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    # datasets['train'] is a DataSet instance.
    # no_create_entry_dataset makes the vocabulary aware of dev/test tokens (without treating them as
    # trainable entries), which usually improves the final result.
    # build the vocabularies from the training data
    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    # char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=datasets['dev'])
    # bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=datasets['dev'])

    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # apply len() to every element of the 'chars' field and store the result in the new 'seq_len' field
        v.add_seq_len('chars', new_field_name='seq_len')

    # whether to replace every column in the datasets with vocabulary indices;
    # *list(datasets.values()) simply unpacks the three DataSets as positional arguments
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # vocabs is organized the same way as datasets: a dict whose keys map to the different kinds of data
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step3. ============================= build the embeddings
    '''A few open questions:
    01. Aren't pretrained static embeddings supposed to lose contextual information? Why use one here?
    02. How does this embedding differ from the BertEmbedding used later?
    03. Need to study what StaticEmbedding() actually does.
    '''
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        # unigram_embedding is a StaticEmbedding instance
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
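
# get_bigrams is imported from utils throughout this file. A typical implementation (shown here as an
# assumption, only to make the comments above concrete) pairs every character with its right neighbor and
# pads the last position with '<end>', so the bigram sequence keeps the same length as the character sequence.
def get_bigrams_sketch(chars):
    # ['复', '旦', '大', '学'] -> ['复旦', '旦大', '大学', '学<end>']
    return [c1 + c2 for c1, c2 in zip(chars, list(chars[1:]) + ['<end>'])]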
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    # step0. ============================= prepare the data, e.g. file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train_deseg')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    # step1. ============================= build the datasets
    # a plain dict, but every value in `datasets` is an instance of fastNLP's DataSet class
    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        # why always the fixed key 'train'? because the DataBundle puts every loaded file under 'train'
        datasets[k] = bundle.datasets['train']

    trainData = datasets['train']
    print(type(trainData))  # <class 'fastNLP.core.dataset.DataSet'>
    print(len(trainData))   # 1350
    print(trainData)
    """
    datasets['train'] looks like this:
    +-----------------------------------------------------------+-----------------------------------------------------------+
    | chars                                                      | target                                                      |
    +-----------------------------------------------------------+-----------------------------------------------------------+
    | ['科', '技', '全', '方', '位', '资', '讯', '智', '能',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['对', ',', '输', '给', '一', '个', '女', '人', ',',...   | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM...  |
    | ['今', '天', '下', '午', '起', '来', '看', '到', '外',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['今', '年', '拜', '年', '不', '短', '信', ',', '就',...   | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['浑', '身', '酸', '疼', ',', '两', '腿', '无', '力',...   | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['明', '显', '紧', '张', '状', '态', '没', '出', '来',...  | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['三', '十', '年', '前', ',', '老', '爹', '带', '我',...   | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O...  |
    | ['好', '活', '动', '呀', ',', '给', '力', '的', '商',...   | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ['人', '生', '如', '戏', ',', '导', '演', '是', '自',...   | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O...  |
    | ['听', '说', '小', '米', '开', '卖', '了', ',', '刚',...   | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'...  |
    | ...                                                         | ...                                                         |
    +-----------------------------------------------------------+-----------------------------------------------------------+
    This is the DataSet type from Fudan University's open-source toolkit fastNLP; see the documentation:
    https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html
    """

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    # step2. ============================= build the vocabularies from the datasets
    # see the Vocabulary tutorial:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three entries: train, dev and test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag.
        # apply_field() is a fastNLP DataSet method; passing 'chars' for both field_name and
        # new_field_name transforms the column in place, while (get_bigrams, 'chars', 'bigrams')
        # builds a new 'bigrams' column from the 'chars' column, i.e. joins adjacent characters.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    # datasets['train'] is a DataSet instance
    char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # vocabs is organized the same way as datasets: a dict whose keys map to the different kinds of data
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step3. ============================= build the embeddings
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
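
# A small, self-contained illustration of the fastNLP DataSet operations used above (apply_field,
# add_seq_len); the toy sentences are made up and serve only to show the resulting columns.
def _demo_dataset_fields():
    from fastNLP import DataSet
    from utils import get_bigrams
    ds = DataSet({'chars': [['科', '技', '资', '讯'], ['今', '天', '下', '午', '好']]})
    ds.apply_field(lambda x: [w[0] for w in x], field_name='chars', new_field_name='chars')  # keep the character only
    ds.apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')                # derive bigrams per sentence
    ds.add_seq_len('chars', new_field_name='seq_len')                                        # per-sentence length
    print(ds)  # columns: chars, bigrams, seq_len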
def load_aicup_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, char_word_dropout=0.01,
                   only_train_min_freq=0, bigram_min_freq=1, data_type='default', index_token=True,
                   char_min_freq=1, cv=False, fold=0):
    vocabs = {}
    embeddings = {}

    train_path = os.path.join(path, f'fold{fold}', f'train/{data_type}')
    dev_path = os.path.join(path, f'fold{fold}', f'dev/{data_type}')
    print('-----------------Dataset---------------------')
    print('loading data from', train_path, '\nand', dev_path)

    loader = ConllLoader(['chars', 'target'])
    train = loader.load(train_path)
    dev = loader.load(dev_path)
    ds = {
        'train': train.datasets['train'],
        'dev': dev.datasets['train'],
    }
    ds['aicup_dev'] = get_aicup_devds()

    # jieba.enable_paddle()
    for ds_name in ds.keys():
        ds[ds_name].apply_field(get_bigrams, 'chars', 'bigrams')
        ds[ds_name].add_seq_len('chars', new_field_name='seq_len')
        ds[ds_name].apply_field(get_pos_tag, 'chars', 'pos_tag')

    for k, v in ds.items():
        print('{}:{}'.format(k, len(v)))

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    pos_vocab = Vocabulary()
    label_vocab = get_label_vocab(data_type)

    pos_vocab.from_dataset(*list(ds.values()), field_name='pos_tag')
    if cv:
        no_create_entry_ds = [ds['dev'], ds['aicup_dev']]
    else:
        # NOTE: ds only holds 'train', 'dev' and 'aicup_dev' at this point, so this branch assumes a
        # 'test' split has been added upstream when cv is False
        no_create_entry_ds = [ds['dev'], ds['test'], ds['aicup_dev']]
    char_vocab.from_dataset(ds['train'], field_name='chars', no_create_entry_dataset=no_create_entry_ds)
    bigram_vocab.from_dataset(ds['train'], field_name='bigrams', no_create_entry_dataset=no_create_entry_ds)

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    vocabs['pos_tag'] = pos_vocab

    if index_token:
        char_vocab.index_dataset(*list(ds.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(ds.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list([ds['train'], ds['dev']]), field_name='target', new_field_name='target')
        pos_vocab.index_dataset(*list(ds.values()), field_name='pos_tag', new_field_name='pos_tag')

    unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
    bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
    embeddings['char'] = unigram_embedding
    embeddings['bigram'] = bigram_embedding

    print(ds['train'])
    print(set([ele[0].split('-')[1] if ele[0] != 'O' and ele[0][0] != '<' else ele[0] for ele in list(label_vocab)]))
    print('------------------------------------------')
    return ds, vocabs, embeddings
    print(sen)
    assert len(sen) == len(pos_tag)
    return pos_tag


if __name__ == "__main__":
    label = get_label_vocab(data_type='default')
    print(label)
    exit()

    dev_path = os.path.join(aicup_ner_path, f'fold{0}', f'dev/number')
    print('-----------------Dataset---------------------')
    loader = ConllLoader(['chars', 'target'])
    dev = loader.load(dev_path)
    print(dev.datasets['train'])

    # label_vocab = get_label_vocab('number')
    # print(label_vocab)
    # print(list(label_vocab))
    # pass
    # ds, vocabs, embeddings = load_aicup_ner(
    #     aicup_ner_path,
    #     yangjie_rich_pretrain_unigram_path,
    #     yangjie_rich_pretrain_bigram_path,
    #     index_token=False,
    #     char_min_freq=1,
    #     bigram_min_freq=1,
    #     only_train_min_freq=True,
    #     char_word_dropout=0.01,
    #     cv=True,
def load_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
             train_path=None, dev_path=None, char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0,
             char_word_dropout=0.01, test_path=None, logger=None, with_placeholder=True,
             placeholder_path=None, with_test_a=False, test_a_path=None, label_word2idx=None, **kwargs):
    loader = ConllLoader(['chars', 'target'])
    # train_path = os.path.join(path,'weiboNER_2nd_conll.train_deseg')
    # dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    # test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')
    if train_path is None:
        train_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label.train'
    if dev_path is None:
        dev_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label.test'
    # train_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label_all_all.train'
    # dev_path = '/ai/223/person/lichunyu/datasets/dataf/seq_label/seq_label_test_a_labeled.train'
    # train_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_A_text.seq'
    # dev_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_A_text.seq'
    # test_path = '/ai/223/person/lichunyu/datasets/tmp/test_one.txt'
    if test_path is None:
        test_path = '/ai/223/person/lichunyu/datasets/dataf/test/test_B_final_text.nonletter'
    if placeholder_path is None:
        placeholder_path = '/root/all_train.test'
    if test_a_path is None:
        test_a_path = '/ai/223/person/lichunyu/datasets/df-competition/df-511/test/test_A_text.seq'

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path
    paths['placeholder'] = placeholder_path
    paths['test_a'] = test_a_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        if logger is not None:
            logger.info('{}:{}'.format(k, len(v)))
        else:
            print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    # char_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    if with_placeholder is True and with_test_a is False:
        char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['placeholder']])
    elif with_placeholder is True and with_test_a is True:
        char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev'], datasets['placeholder'], datasets['test_a']])
    else:
        char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=[datasets['dev']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    if label_word2idx is not None:
        label_vocab.word2idx = label_word2idx
    if logger is not None:
        logger.info('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target',-100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    # bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']])
    if with_placeholder is True and with_test_a is False:
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['placeholder']])
    elif with_placeholder is True and with_test_a is True:
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev'], datasets['placeholder'], datasets['test_a']])
        print('dataset create with test_a')
    else:
        bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=[datasets['dev']])

    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path, word_dropout=char_word_dropout, min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, word_dropout=0.01, min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
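
# A usage sketch for load_ner above. The hard-coded default paths point at a specific machine, so in
# practice every path is passed explicitly; all file names below are placeholders for illustration only.
def _demo_load_ner():
    import logging
    logger = logging.getLogger(__name__)
    datasets, vocabs, embeddings = load_ner(
        path=None,                                        # unused when explicit paths are supplied
        unigram_embedding_path='/path/to/unigram.vec',    # hypothetical pretrained character vectors
        bigram_embedding_path='/path/to/bigram.vec',      # hypothetical pretrained bigram vectors
        train_path='/path/to/seq_label.train',            # hypothetical CoNLL-style files
        dev_path='/path/to/seq_label.dev',
        test_path='/path/to/seq_label.test',
        placeholder_path='/path/to/placeholder.conll',
        test_a_path='/path/to/test_a.conll',
        logger=logger)
    print({k: len(v) for k, v in datasets.items()})       # sizes of train / dev / test / placeholder / test_a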