def load_resume_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')
    print(f"load train dataset: {train_path}")

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_toy_ner(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    train_path = os.path.join(path, 'toy_train.bmoes')
    dev_path = os.path.join(path, 'toy_dev.bmoes')
    test_path = os.path.join(path, 'toy_test.bmoes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    dev_bundle = loader.load(dev_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['dev'] = dev_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['dev'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary(padding=None, unknown=None)
    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01,
                   train_dataset_rate=1.0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_dataset_rate_suffix = '' if train_dataset_rate == 1.0 else f'_{train_dataset_rate}'
    train_path = os.path.join(path, f'train.char.bmoes{train_dataset_rate_suffix}')
    print(f"load train dataset: {train_path}")
    dev_path = os.path.join(path, 'dev.char.bmoes')
    test_path = os.path.join(path, 'test.char.bmoes')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # TODO: add span_label, attr_start, attr_end
    datasets, vocabs = input_with_span_attr(datasets, vocabs)

    print(datasets.keys())
    print("dev:", len(datasets['dev']))
    print("test:", len(datasets['test']))
    print("train:", len(datasets['train']))

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    print(f"train: {len(datasets['train'])}; dev: {len(datasets['dev'])}; test: {len(datasets['test'])}")
    return datasets, vocabs, embeddings
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                   only_lexicon_in_train=False, word_char_mix_embedding_path=None,
                                   number_normalized=False, lattice_min_freq=1, only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only lexicon words that appear in the train set will be loaded.')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_l2r_word', new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path, word_dropout=0.01,
                                            min_freq=lattice_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')
    vocabs['span_label'].index_dataset(*(datasets.values()), field_name='span_label', new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()), field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()), field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
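# Illustration (not part of the pipeline): how the lattice fields above are derived from a sentence
# and its lexicon matches. The sentence and the [start, end, word] triples below are made up for
# demonstration; in the real code they come from w_trie.get_lexicon.
def _lattice_demo():
    chars = ['南', '京', '市', '长']
    seq_len = len(chars)
    lexicons = [[0, 1, '南京'], [0, 2, '南京市']]      # [lex_s, lex_e, word]
    lex_s = [lex[0] for lex in lexicons]               # [0, 0]
    lex_e = [lex[1] for lex in lexicons]               # [1, 2]
    lattice = chars + [lex[2] for lex in lexicons]     # characters followed by matched words
    pos_s = list(range(seq_len)) + lex_s               # [0, 1, 2, 3, 0, 0]
    pos_e = list(range(seq_len)) + lex_e               # [0, 1, 2, 3, 1, 2]
    return lattice, pos_s, pos_e

# _lattice_demo() -> (['南', '京', '市', '长', '南京', '南京市'], [0, 1, 2, 3, 0, 0], [0, 1, 2, 3, 1, 2])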
def equip_chinese_ner_with_skip(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                word_min_freq=1, only_train_min_freq=0):
    from utils_ import Trie, get_skip_path
    from functools import partial

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    # for k, v in datasets.items():
    #     v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'skips')

    def skips2skips_l2r(chars, w_trie):
        '''
        :param chars: list[str], the character sequence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_l2r, for each end position the list of [start, word] matches
        '''
        lexicons = get_skip_path(chars, w_trie=w_trie)
        # max_len = max(list(map(lambda x: max(x[:2]), lexicons))) + 1 if len(lexicons) != 0 else 0
        result = [[] for _ in range(len(chars))]
        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]
            result[e].append([s, w])
        return result

    def skips2skips_r2l(chars, w_trie):
        '''
        :param chars: list[str], the character sequence
        :param w_trie: Trie built from the lexicon word list
        :return: skips_r2l, for each start position the list of [end, word] matches
        '''
        lexicons = get_skip_path(chars, w_trie=w_trie)
        # max_len = max(list(map(lambda x: max(x[:2]), lexicons))) + 1 if len(lexicons) != 0 else 0
        result = [[] for _ in range(len(chars))]
        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]
            result[s].append([e, w])
        return result

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_l2r, w_trie=w_trie), 'chars', 'skips_l2r')

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_r2l, w_trie=w_trie), 'chars', 'skips_r2l')

    # print(v['skips_l2r'][0])
    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_l2r_word', new_field_name='skips_l2r_word')

        v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
        v.apply_field(lambda x:
                      list(map(lambda y:
                               list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    return datasets, vocabs, embeddings
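# Illustration (not part of the pipeline): how skips_l2r / skips_r2l group lexicon matches.
# get_skip_path is assumed to return [start, end, word] triples; the toy values are made up.
def _skips_demo():
    chars = ['南', '京', '市', '长']
    lexicons = [[0, 1, '南京'], [0, 2, '南京市'], [2, 3, '市长']]
    skips_l2r = [[] for _ in range(len(chars))]
    skips_r2l = [[] for _ in range(len(chars))]
    for s, e, w in lexicons:
        skips_l2r[e].append([s, w])   # matches grouped by their end position
        skips_r2l[s].append([e, w])   # matches grouped by their start position
    return skips_l2r, skips_r2l

# skips_l2r -> [[], [[0, '南京']], [[0, '南京市']], [[2, '市长']]]
# skips_r2l -> [[[1, '南京'], [2, '南京市']], [], [[3, '市长']], []]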
def load_msra_ner_1(path, char_embedding_path=None, bigram_embedding_path=None, index_token=True, train_clip=False,
                    char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    if train_clip:
        train_path = os.path.join(path, 'train_dev.char.bmes_clip1')
        test_path = os.path.join(path, 'test.char.bmes_clip1')
    else:
        train_path = os.path.join(path, 'train_dev.char.bmes')
        test_path = os.path.join(path, 'test.char.bmes')

    loader = ConllLoader(['chars', 'target'])
    train_bundle = loader.load(train_path)
    test_bundle = loader.load(test_path)

    datasets = dict()
    datasets['train'] = train_bundle.datasets['train']
    datasets['test'] = test_bundle.datasets['train']

    datasets['train'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')
    datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams')

    datasets['train'].add_seq_len('chars')
    datasets['test'].add_seq_len('chars')

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()
    print(datasets.keys())
    # print(len(datasets['dev']))
    print(len(datasets['test']))
    print(len(datasets['train']))

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')

    if index_token:
        char_vocab.index_dataset(datasets['train'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(datasets['train'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(datasets['train'], datasets['test'],
                                  field_name='target', new_field_name='target')

    vocabs = {}
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if char_embedding_path is not None:
        char_embedding = StaticEmbedding(char_vocab, char_embedding_path, word_dropout=0.01,
                                         min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = char_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, bigram_embedding_path, word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,  # embedding that mixes characters and words
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only lexicon words that appear in the train set will be loaded.')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    """
    1. What exactly is word_embedding_path used for? I set it to None. But if it is None,
       embeddings['word'] is never created -- is that still OK?
    2. StaticEmbedding: given the name or path of a pretrained embedding, it extracts the relevant
       vectors from that embedding according to the vocab (only words that appear in the vocab are
       extracted; a word that is not found gets a randomly initialized vector, but if the word is
       marked as no_create_entry, no separate vector is created and it is mapped to the unk index).
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path, word_dropout=0.01,
                                            min_freq=lattice_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
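# A minimal, self-contained sketch of how StaticEmbedding behaves (addressing the questions in the
# docstring above). It uses fastNLP's public StaticEmbedding rather than the local fastNLP_module
# wrapper, and passes model_dir_or_name=None so the vectors are randomly initialized instead of
# being loaded from a pretrained file.
def _static_embedding_demo():
    import torch
    from fastNLP import Vocabulary
    from fastNLP.embeddings import StaticEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst(['复', '旦', '大', '学'])
    # With a pretrained file, words marked no_create_entry (e.g. dev/test-only words) would be
    # mapped to <unk>; here every word simply gets a random 50-dim vector.
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50, word_dropout=0.01)

    chars = torch.LongTensor([[vocab.to_index('复'), vocab.to_index('旦')]])
    return embed(chars).shape  # torch.Size([1, 2, 50])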
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    """
    Load the Weibo corpus and cache the data.

    :param path: directory containing the Weibo dataset
    :param unigram_embedding_path: if given, use a unigram embedding
    :param bigram_embedding_path: if given, use a bigram embedding; can be combined with the unigram one
    :param index_token:
    :param char_min_freq:
    :param bigram_min_freq:
    :param only_train_min_freq:
    :param char_word_dropout:
    :return:
    """
    from fastNLP.io.loader import ConllLoader
    from utils import get_bigrams

    # The dataset headers are 'chars' and 'target': the first column read is chars, the second is target.
    loader = ConllLoader(headers=['chars', 'target'])
    # bundle = loader.load(path)
    # datasets = bundle.datasets
    # print(datasets['train'][:5])

    train_path = os.path.join(path, 'train.conll')
    dev_path = os.path.join(path, 'dev.conll')
    test_path = os.path.join(path, 'test.conll')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    datasets = {}
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    print("Number of samples in each split:")
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    vocabs = {}
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    for k, v in datasets.items():
        # ignore the word segmentation tag
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')

    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # for k, v in datasets.items():
    #     v.set_input('chars', 'bigrams', 'seq_len', 'target')
    #     v.set_target('target', 'seq_len')

    vocabs['bigram'] = bigram_vocab

    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
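# Hedged usage sketch for the loader above; the paths are placeholders, not files shipped with this repo.
# datasets, vocabs, embeddings = load_weibo_ner(
#     '/path/to/weibo',                                   # directory containing train.conll / dev.conll / test.conll
#     unigram_embedding_path='/path/to/unigram.vec',      # pretrained character embeddings (optional)
#     bigram_embedding_path='/path/to/bigram.vec',        # pretrained bigram embeddings (optional)
#     index_token=True)
# print(len(vocabs['char']), len(vocabs['label']))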
def load_tianchi_ner(path,
                     unigram_embedding_path=None,  # yangjie_rich_pretrain_unigram_path
                     bigram_embedding_path=None,   # yangjie_rich_pretrain_bigram_path
                     index_token=True,
                     char_min_freq=1,
                     bigram_min_freq=1,
                     only_train_min_freq=0,
                     char_word_dropout=0.01):
    # step0. ============================= prepare the data, e.g. file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'tianchi.train')
    dev_path = os.path.join(path, 'tianchi.dev')
    test_path = os.path.join(path, 'tianchi.test')

    paths = {}
    paths['dev'] = dev_path
    paths['train'] = train_path
    paths['test'] = test_path

    # step1. ============================= build the datasets
    datasets = {}  # a dict; note that every value in datasets is an instance of the DataSet class
    for k, v in paths.items():
        bundle = loader.load(v)
        datasets[k] = bundle.datasets['train']

    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))

    # step2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # See the Vocabulary tutorial:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three key-value pairs: train, dev, test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        # apply_field() is a fastNLP method for transforming a DataSet instance.
        # The two 'chars' arguments are field_name and new_field_name: the column is transformed
        # in place, no new column is added. Likewise, (get_bigrams, 'chars', 'bigrams') builds a
        # new 'bigrams' column from the 'chars' column.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')  # concatenates each pair of adjacent characters, i.e. bigrams

    # datasets['train'] is a DataSet instance.
    # no_create_entry_dataset lets the vocabulary also take dev and test into account when it is
    # built, which usually improves the final results.
    # Build the vocabularies from the training data.
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    # char_vocab.from_dataset(datasets['train'], field_name='chars', no_create_entry_dataset=datasets['dev'])
    # bigram_vocab.from_dataset(datasets['train'], field_name='bigrams', no_create_entry_dataset=datasets['dev'])
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # apply len() to each element of field_name and store the result in the new 'seq_len' field
        v.add_seq_len('chars', new_field_name='seq_len')

    # Whether to convert every column of the datasets to vocabulary indices.
    # (*list(datasets.values()) unpacks all splits as positional arguments.)
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # vocabs is organized the same way as datasets: a dict whose keys map to different pieces of data.
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step3. ============================= build the embeddings
    '''Open questions:
    01. Aren't pretrained (static) embeddings supposed to lose contextual semantics? Why are they still used here?
    02. What is the difference between this embedding and the BertEmbedding used later?
    03. Need to study what StaticEmbedding() does.
    '''
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        # unigram_embedding here is just a StaticEmbedding instance
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
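# A small, self-contained sketch of the fastNLP Vocabulary calls used above. The index values shown
# reflect the usual defaults (0 for <pad>, 1 for <unk>); the exact values depend on the fastNLP version.
def _vocabulary_demo():
    from fastNLP import Vocabulary

    vocab = Vocabulary()                     # has <pad> and <unk> entries by default
    vocab.add_word_lst(['医', '院', '医'])
    idx = vocab.to_index('医')               # known word -> its index
    unk = vocab.to_index('从未见过')          # unseen word -> index of <unk>
    return len(vocab), idx, unk, vocab.idx2word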
def load_weibo_ner(path, unigram_embedding_path=None, bigram_embedding_path=None, index_token=True,
                   char_min_freq=1, bigram_min_freq=1, only_train_min_freq=0, char_word_dropout=0.01):
    # step0. ============================= prepare the data, e.g. file paths
    loader = ConllLoader(['chars', 'target'])
    train_path = os.path.join(path, 'weiboNER_2nd_conll.train_deseg')
    dev_path = os.path.join(path, 'weiboNER_2nd_conll.dev_deseg')
    test_path = os.path.join(path, 'weiboNER_2nd_conll.test_deseg')

    paths = {}
    paths['train'] = train_path
    paths['dev'] = dev_path
    paths['test'] = test_path

    # step1. ============================= build the datasets
    datasets = {}  # a dict; note that each value in datasets is an instance of fastNLP's DataSet class
    for k, v in paths.items():
        bundle = loader.load(v)
        # Why is the key always 'train'? Because of how the bundle is set up: the loader puts all
        # the data it reads under the 'train' key.
        datasets[k] = bundle.datasets['train']

    trainData = datasets['train']
    print(type(trainData))  # <class 'fastNLP.core.dataset.DataSet'>
    print(len(trainData))   # 1350
    print(trainData)
    """
    The data in datasets['train'] looks like this:

    +-----------------------------------------------------------+-----------------------------------------------------------+
    | chars                                                     | target                                                    |
    +-----------------------------------------------------------+-----------------------------------------------------------+
    | ['科', '技', '全', '方', '位', '资', '讯', '智', '能',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['对', ',', '输', '给', '一', '个', '女', '人', ',',... | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM... |
    | ['今', '天', '下', '午', '起', '来', '看', '到', '外',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['今', '年', '拜', '年', '不', '短', '信', ',', '就',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['浑', '身', '酸', '疼', ',', '两', '腿', '无', '力',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['明', '显', '紧', '张', '状', '态', '没', '出', '来',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['三', '十', '年', '前', ',', '老', '爹', '带', '我',... | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
    | ['好', '活', '动', '呀', ',', '给', '力', '的', '商',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ['人', '生', '如', '戏', ',', '导', '演', '是', '自',... | ['O', 'O', 'O', 'O', 'O', 'B-PER.NOM', 'I-PER.NOM', 'O... |
    | ['听', '说', '小', '米', '开', '卖', '了', ',', '刚',... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'... |
    | ...                                                       | ...                                                       |
    +-----------------------------------------------------------+-----------------------------------------------------------+

    This is the DataSet type from the open-source fastNLP toolkit (Fudan University); detailed docs:
    https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html
    """
    for k, v in datasets.items():
        print('{}:{}'.format(k, len(v)))
    # print(*list(datasets.keys()))

    # step2. ============================= build the vocabularies from the datasets
    vocabs = {}
    # See the Vocabulary tutorial:
    # https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html
    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    label_vocab = Vocabulary()

    # datasets has exactly three key-value pairs: train, dev, test
    for item in datasets.items():
        print(item)

    for k, v in datasets.items():
        # ignore the word segmentation tag
        # apply_field() is a fastNLP method for transforming a DataSet instance.
        # The two 'chars' arguments are field_name and new_field_name: the column is transformed
        # in place, no new column is added. Likewise, (get_bigrams, 'chars', 'bigrams') builds a
        # new 'bigrams' column from the 'chars' column.
        v.apply_field(lambda x: [w[0] for w in x], 'chars', 'chars')
        v.apply_field(get_bigrams, 'chars', 'bigrams')  # concatenates each pair of adjacent characters, i.e. bigrams

    # datasets['train'] is a DataSet instance
    char_vocab.from_dataset(datasets['train'], field_name='chars',
                            no_create_entry_dataset=[datasets['dev'], datasets['test']])
    label_vocab.from_dataset(datasets['train'], field_name='target')
    print('label_vocab:{}\n{}'.format(len(label_vocab), label_vocab.idx2word))

    for k, v in datasets.items():
        # v.set_pad_val('target', -100)
        v.add_seq_len('chars', new_field_name='seq_len')

    bigram_vocab.from_dataset(datasets['train'], field_name='bigrams',
                              no_create_entry_dataset=[datasets['dev'], datasets['test']])
    if index_token:
        char_vocab.index_dataset(*list(datasets.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(datasets.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list(datasets.values()), field_name='target', new_field_name='target')

    # vocabs is organized the same way as datasets: a dict whose keys map to different pieces of data.
    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab

    # step3. ============================= build the embeddings
    embeddings = {}
    if unigram_embedding_path is not None:
        unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                            word_dropout=char_word_dropout,
                                            min_freq=char_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['char'] = unigram_embedding
    if bigram_embedding_path is not None:
        bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                           word_dropout=0.01,
                                           min_freq=bigram_min_freq,
                                           only_train_min_freq=only_train_min_freq)
        embeddings['bigram'] = bigram_embedding

    return datasets, vocabs, embeddings
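# The 'bigrams' column built above comes from utils.get_bigrams. A minimal sketch of what such a
# helper presumably does; the '<eos>' end marker is an assumption, not necessarily this repo's choice.
def _get_bigrams_sketch(chars):
    return [c1 + c2 for c1, c2 in zip(chars, chars[1:] + ['<eos>'])]

# _get_bigrams_sketch(['小', '米', '开', '卖']) -> ['小米', '米开', '开卖', '卖<eos>']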
def load_aicup_ner(path,
                   unigram_embedding_path=None,
                   bigram_embedding_path=None,
                   char_word_dropout=0.01,
                   only_train_min_freq=0,
                   bigram_min_freq=1,
                   data_type='default',
                   index_token=True,
                   char_min_freq=1,
                   cv=False,
                   fold=0):
    vocabs = {}
    embeddings = {}

    train_path = os.path.join(path, f'fold{fold}', f'train/{data_type}')
    dev_path = os.path.join(path, f'fold{fold}', f'dev/{data_type}')
    print('-----------------Dataset---------------------')
    print('loading data from', train_path, '\nand', dev_path)

    loader = ConllLoader(['chars', 'target'])
    train = loader.load(train_path)
    dev = loader.load(dev_path)

    ds = {
        'train': train.datasets['train'],
        'dev': dev.datasets['train'],
    }
    ds['aicup_dev'] = get_aicup_devds()

    # jieba.enable_paddle()
    for ds_name in ds.keys():
        ds[ds_name].apply_field(get_bigrams, 'chars', 'bigrams')
        ds[ds_name].add_seq_len('chars', new_field_name='seq_len')
        ds[ds_name].apply_field(get_pos_tag, 'chars', 'pos_tag')

    for k, v in ds.items():
        print('{}:{}'.format(k, len(v)))

    char_vocab = Vocabulary()
    bigram_vocab = Vocabulary()
    pos_vocab = Vocabulary()
    label_vocab = get_label_vocab(data_type)

    pos_vocab.from_dataset(*list(ds.values()), field_name='pos_tag')
    if cv:
        no_create_entry_ds = [ds['dev'], ds['aicup_dev']]
    else:
        # NOTE: this branch expects a 'test' split in ds, which is not loaded above, so calling
        # this function with cv=False raises a KeyError as written.
        no_create_entry_ds = [ds['dev'], ds['test'], ds['aicup_dev']]

    char_vocab.from_dataset(ds['train'], field_name='chars',
                            no_create_entry_dataset=no_create_entry_ds)
    bigram_vocab.from_dataset(ds['train'], field_name='bigrams',
                              no_create_entry_dataset=no_create_entry_ds)

    vocabs['char'] = char_vocab
    vocabs['label'] = label_vocab
    vocabs['bigram'] = bigram_vocab
    vocabs['pos_tag'] = pos_vocab

    if index_token:
        char_vocab.index_dataset(*list(ds.values()), field_name='chars', new_field_name='chars')
        bigram_vocab.index_dataset(*list(ds.values()), field_name='bigrams', new_field_name='bigrams')
        label_vocab.index_dataset(*list([ds['train'], ds['dev']]), field_name='target', new_field_name='target')
        pos_vocab.index_dataset(*list(ds.values()), field_name='pos_tag', new_field_name='pos_tag')

    unigram_embedding = StaticEmbedding(char_vocab, model_dir_or_name=unigram_embedding_path,
                                        word_dropout=char_word_dropout,
                                        min_freq=char_min_freq, only_train_min_freq=only_train_min_freq)
    bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path,
                                       word_dropout=0.01,
                                       min_freq=bigram_min_freq, only_train_min_freq=only_train_min_freq)
    embeddings['char'] = unigram_embedding
    embeddings['bigram'] = bigram_embedding

    print(ds['train'])
    print(set([ele[0].split('-')[1] if ele[0] != 'O' and ele[0][0] != '<' else ele[0]
               for ele in list(label_vocab)]))
    print('------------------------------------------')
    return ds, vocabs, embeddings