def test_bert_3(self):
    vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                          include_cls_sep=False)
    model = BertForTokenClassification(embed, 7)

    input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

    pred = model(input_ids)
    self.assertTrue(isinstance(pred, dict))
    self.assertTrue(Const.OUTPUT in pred)
    self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3, 7))
def test_bert_1_w(self):
    vocab = Vocabulary().add_word_lst("this is a test .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                          include_cls_sep=False)
    with self.assertWarns(Warning):
        model = BertForSequenceClassification(embed, 2)

    input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]])

    pred = model.predict(input_ids)
    self.assertTrue(isinstance(pred, dict))
    self.assertTrue(Const.OUTPUT in pred)
    self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,))
def test_bert_4(self):
    vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                          include_cls_sep=False)
    model = BertForQuestionAnswering(embed)

    input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

    pred = model(input_ids)
    self.assertTrue(isinstance(pred, dict))
    self.assertTrue('pred_start' in pred)
    self.assertTrue('pred_end' in pred)
    self.assertEqual(tuple(pred['pred_start'].shape), (2, 3))
    self.assertEqual(tuple(pred['pred_end'].shape), (2, 3))
def test_bert_2(self):
    vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
    embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert',
                          include_cls_sep=True)
    model = BertForMultipleChoice(embed, 2)

    input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]])
    print(input_ids.size())

    pred = model(input_ids)
    self.assertTrue(isinstance(pred, dict))
    self.assertTrue(Const.OUTPUT in pred)
    self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1, 2))
@classmethod
def load(cls, folder):
    """
    Read the data saved in ``folder`` and initialize a RobertaEmbedding from it.

    :param folder:
    :return:
    """
    for name in [VOCAB_NAME, ROBERTA_EMBED_HYPER, ROBERTA_EMBED_FOLDER]:
        assert os.path.exists(os.path.join(folder, name)), f"{name} not found in {folder}."

    vocab = Vocabulary.load(os.path.join(folder, VOCAB_NAME))
    with open(os.path.join(folder, ROBERTA_EMBED_HYPER), 'r', encoding='utf-8') as f:
        hyper = json.load(f)
    model_name_or_path = os.path.join(folder, ROBERTA_EMBED_FOLDER)

    roberta = cls(vocab=vocab, model_dir_or_name=model_name_or_path, **hyper)
    return roberta
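# A minimal usage sketch for the classmethod above; the folder path is a placeholder and is
# assumed to contain the vocabulary, hyper-parameter json and model sub-folder written by the
# matching save() method.
embed = RobertaEmbedding.load('caches/roberta_embed')
print(embed.embed_size)  # the restored embedding can be plugged into any fastNLP model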
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True,
             init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
    """
    :param vocab: Vocabulary. If None, every entry in the embedding file is loaded.
    :param model_dir_or_name: A pretrained static embedding can be specified in two ways: (1) an embedding
        folder (which should contain exactly one file with a .txt suffix) or a file path; (2) the name of a
        pretrained embedding, in which case the cache is checked first and the file is downloaded
        automatically if it is not found there. If None, the embedding is randomly initialized with
        dimension ``embedding_dim``.
    :param int embedding_dim: Dimension of the randomly initialized embedding. When this value is greater
        than 0, ``model_dir_or_name`` is ignored.
    :param bool requires_grad: Whether gradients are required. Defaults to True.
    :param callable init_method: How to initialize words that are not found in the pretrained file. Any
        method from torch.nn.init.* can be used; it should accept a tensor and modify it in place.
    :param bool lower: Whether to lowercase the words in vocab before matching them against the pretrained
        vocabulary. If your vocabulary contains uppercase words, or uppercase words need their own vectors,
        set lower to False.
    :param float dropout: Probability of applying dropout to the embedding representation; 0.1 means 10%
        of the values are randomly set to 0.
    :param float word_dropout: Probability of replacing a word with unk. This both trains the unk vector
        and acts as a form of regularization.
    :param bool normalize: Whether to normalize each vector so that its norm is 1.
    :param int min_freq: Words whose frequency in the Vocabulary is below this value are mapped to unk.
    :param dict kwargs: only_train_min_freq, apply the min_freq filter only to words from train;
        only_norm_found_vector, normalize only the vectors of words found in the pretrained file.
    """
    super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)
    if embedding_dim > 0:
        model_dir_or_name = None

    # resolve the cache path
    if model_dir_or_name is None:
        assert embedding_dim >= 1, "The dimension of embedding should be larger than 1."
        embedding_dim = int(embedding_dim)
        model_path = None
    elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
        model_url = _get_embedding_url('static', model_dir_or_name.lower())
        model_path = cached_path(model_url, name='embedding')
        # check whether the file exists
    elif os.path.isfile(os.path.abspath(os.path.expanduser(model_dir_or_name))):
        model_path = os.path.abspath(os.path.expanduser(model_dir_or_name))
    elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))):
        model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt')
    else:
        raise ValueError(f"Cannot recognize {model_dir_or_name}.")

    # shrink the vocab according to min_freq
    truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq)
    if truncate_vocab:
        truncated_vocab = deepcopy(vocab)
        truncated_vocab.min_freq = min_freq
        truncated_vocab.word2idx = None
        if lower:  # when lowering, the frequencies of cased variants have to be counted together
            lowered_word_count = defaultdict(int)
            for word, count in truncated_vocab.word_count.items():
                lowered_word_count[word.lower()] += count
            for word in truncated_vocab.word_count.keys():
                word_count = truncated_vocab.word_count[word]
                if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq:
                    truncated_vocab.add_word_lst([word] * (min_freq - word_count),
                                                 no_create_entry=truncated_vocab._is_word_no_create_entry(word))

        # apply the min_freq filter only to words that occur in train
        if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None:
            for word in truncated_vocab.word_count.keys():
                if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq:
                    truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]),
                                                 no_create_entry=True)
        truncated_vocab.build_vocab()
        truncated_words_to_words = torch.arange(len(vocab)).long()
        for word, index in vocab:
            truncated_words_to_words[index] = truncated_vocab.to_index(word)
        logger.info(
            f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.")
        vocab = truncated_vocab

    self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False)
    # read the pretrained embedding
    if lower:
        lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown)
        for word, index in vocab:
            if vocab._is_word_no_create_entry(word):
                lowered_vocab.add_word(word.lower(), no_create_entry=True)
            else:
                lowered_vocab.add_word(word.lower())  # add words that need their own entry first
        logger.info(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} "
                    f"unique lowered words.")
        if model_path:
            embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method)
        else:
            embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
        if lowered_vocab.unknown:
            unknown_idx = lowered_vocab.unknown_idx
        else:
            unknown_idx = embedding.size(0) - 1  # otherwise the last row serves as unknown
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
        words_to_words = torch.full((len(vocab),), fill_value=unknown_idx).long()
        for word, index in vocab:
            if word not in lowered_vocab:
                word = word.lower()
                if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word):
                    continue  # the word needs no entry of its own; it already defaults to unknown
            words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)]
        self.register_buffer('words_to_words', words_to_words)
        self._word_unk_index = lowered_vocab.unknown_idx  # replace the unknown index
    else:
        if model_path:
            embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method)
        else:
            embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method)
            self.register_buffer('words_to_words', torch.arange(len(vocab)).long())
    if not self.only_norm_found_vector and normalize:
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)

    if truncate_vocab:
        for i in range(len(truncated_words_to_words)):
            index_in_truncated_vocab = truncated_words_to_words[i]
            truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab]
        del self.words_to_words
        self.register_buffer('words_to_words', truncated_words_to_words)

    self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                  padding_idx=vocab.padding_idx,
                                  max_norm=None, norm_type=2, scale_grad_by_freq=False,
                                  sparse=False, _weight=embedding)
    self._embed_size = self.embedding.weight.size(1)
    self.requires_grad = requires_grad
    self.dropout = MyDropout(dropout)
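# A short usage sketch of the constructor above (toy vocabulary; the .txt path is a placeholder
# for a GloVe-style embedding file, not part of the original source).
from fastNLP import Vocabulary
import torch

vocab = Vocabulary()
vocab.add_word_lst("this is a Test sentence".split())

# lower=True merges "Test" with "test" before lookup; words rarer than min_freq map to unk.
embed = StaticEmbedding(vocab, model_dir_or_name='data/glove.6B.100d.txt',
                        lower=True, min_freq=1, requires_grad=False)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a test".split()]])
print(embed(words).shape)  # e.g. torch.Size([1, 4, 100]) if the file holds 100-dim vectors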
# preprocess the file
import sys

from fastNLP.io.loader import CSVLoader

bundle = CSVLoader(headers=['raw_words1', 'raw_words2', 'target'], sep='\t').load(sys.argv[1])

##### test
import jieba
from fastNLP.core import Vocabulary

bundle.apply(lambda line: jieba.lcut(line['raw_words1']) + ['[SEP]'] + jieba.lcut(line['raw_words2']),
             new_field_name='words')
bundle.apply(lambda line: len(line['words']), new_field_name='seq_len')
bundle.apply(lambda line: 1, new_field_name='target')

vocab = Vocabulary()
vocab.from_dataset(bundle.get_dataset("train"), field_name='words',
                   no_create_entry_dataset=[bundle.get_dataset("test"), bundle.get_dataset("dev")])
vocab.index_dataset(bundle.get_dataset("train"), field_name='words')
vocab.index_dataset(bundle.get_dataset("test"), field_name='words')
vocab.index_dataset(bundle.get_dataset("dev"), field_name='words')

# establish the model
from fastNLP import Const
import torch
from fastNLP.models import BertForSentenceMatching
from fastNLP.embeddings.bert_embedding import BertEmbedding

embed = BertEmbedding(vocab, model_dir_or_name='cn-base', requires_grad=False)
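# The script above stops right after building the embedding. A hedged continuation might wire it
# into the matching model as sketched below; num_labels=2 is an assumption for a binary matching
# task and is not part of the original script.
model = BertForSentenceMatching(embed, num_labels=2)

# Toy forward pass on a small batch of word indices, just to check the output shape.
words = torch.LongTensor([[2, 3, 4, 5, 1]])
pred = model(words)
print(pred[Const.OUTPUT].shape)  # expected: (batch_size, num_labels)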
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                   only_lexicon_in_train=False, word_char_mix_embedding_path=None,
                                   number_normalized=False, lattice_min_freq=1, only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only lexicon words that appear in train will be loaded.')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path, word_dropout=0.01,
                                            min_freq=lattice_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')
    vocabs['span_label'].index_dataset(*(datasets.values()), field_name='span_label', new_field_name='span_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()), field_name='attr_start_label',
                                       new_field_name='attr_start_label')
    vocabs['attr_label'].index_dataset(*(datasets.values()), field_name='attr_end_label',
                                       new_field_name='attr_end_label')

    return datasets, vocabs, embeddings
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,  # embedding that mixes characters and words
                                   number_normalized=False, lattice_min_freq=1, only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Only lexicon words that appear in train will be loaded.')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    """
    1. What exactly is word_embedding_path used for? I set it to None -- but if it is None,
       embeddings['word'] is never created; is that still fine?
    2. StaticEmbedding: given the name or path of a pretrained embedding, it extracts the matching vectors
       according to vocab (only words that appear in vocab are extracted; if a word is not found, a vector
       is randomly initialized for it, unless the word is marked as no_create_entry, in which case no
       separate vector is created and the word is pointed at the unk index).
    """
    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path, word_dropout=0.01,
                                            min_freq=lattice_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
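# A small sketch of the StaticEmbedding behaviour described in the note above (toy data; the
# embedding path is a placeholder, not a file from the original source). Words added with
# no_create_entry=True that are missing from the pretrained file are pointed at the unk vector
# instead of getting a fresh random row.
from fastNLP import Vocabulary

demo_vocab = Vocabulary()
demo_vocab.add_word_lst(['北京', '上海'])                    # words from train get their own entries
demo_vocab.add_word_lst(['乌鲁木齐'], no_create_entry=True)  # word seen only in dev/test
demo_embed = StaticEmbedding(demo_vocab, 'data/word_vectors.txt', word_dropout=0)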
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    from fastNLP.core import DataSet
    from utils import get_bigrams

    bigrams = get_bigrams(chars)
    seq_len = len(chars)
    target = ['O'] * seq_len
    dataset = DataSet({'chars': [chars],
                       'bigrams': [bigrams],
                       'seq_len': [seq_len],
                       'target': [target]})
    datasets = {'train': dataset}

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        print('not support exit!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP.embeddings import StaticEmbedding
    from fastNLP import DataSet

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        print('not support exit!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        print('not support exit!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs
class CustomizedNER(object):
    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        # CHAR_INPUT="chars"; the character column is later converted into word indices
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        self._word_counter, self._word_vocab, self._target_counter, \
            self._target_vocab, self._target = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
        ret = dict()
        sign = True
        lastIdx = len(word_token) - 1
        for num, token in zip(enumerate(word_token), cont):
            if num[1] in self._target:
                if sign:
                    number += str(num[1])
                    word += token
                    if num[0] < lastIdx and not word_token[num[0] + 1]:
                        sign = False
                else:
                    ret.setdefault(number, set())
                    ret[number].add(word)
                    number = ""
                    word = token
                    sign = True
        if number:
            ret.setdefault(number, set())
            ret[number].add(word)
        return ret

    def _extract_ner(self, tokenNum, token, weighted=False):
        if not weighted:
            cls = self._target.get(int(max(tokenNum, key=tokenNum.count)), "")
            if cls.endswith("LOC"):
                return {"LOC": [x for x in token]}
            elif cls.endswith("PER"):
                return {"PER": [x for x in token]}
            elif cls.endswith("ORG"):
                return {"ORG": [x for x in token]}
        return dict()  # unknown classes yield an empty mapping so callers can always iterate

    def _get_ner(self, tokenNumber, tokenWord):
        nerDict = self._target_token(tokenNumber, tokenWord)
        ret = dict()
        for num, token in nerDict.items():
            if len(num) == 1:
                continue
            for k, v in self._extract_ner(num, token).items():
                ret.setdefault(k, list())
                ret[k].extend(v)
        return ret

    def _read_vocab(self):
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            return eval(vocabIn.read())

    def _reverse_dict(self, dic):
        ret = dict()
        for key, value in dic.items():
            ret.setdefault(value, key)
        return ret

    def _target_label(self, dic):
        ret = self._reverse_dict(dic)
        del ret[0]
        return ret

    def _get_vocabs(self):
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._target_label(targetVocab)
        return Counter(word_count), wordsVocab, Counter(target_count), targetVocab, reverseTargetVocab

    def _update_word(self):
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        if not self._model:
            raise ValueError("The NER model has not been loaded.")
        return self._model

    def formatRowString(self, msg):
        msg = msg.strip()
        tokenized_char = [x for x in msg]
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        # if input as dict format:
        # data = DataSet({"raw_chars":[msg], "words":[[x for x in msg]], "seq_len":[len(word_list)]})
        # build the vocabulary from the chars column of this dataset
        self._vocab4word.from_dataset(self._dataset, field_name=self._CONST_CHAR)
        # use the vocabulary to convert the chars column into indices
        self._vocab4word.index_dataset(self._dataset, field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset, field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset, field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        # iterate over the prediction results for the dataset
        ret = self.model.predict(dataset)["pred"]
        for line, cont in zip(ret, self._content()):
            yield self._get_ner(line[0].tolist(), cont)
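# A minimal usage sketch for CustomizedNER; the model and vocab file paths are placeholders for
# artifacts produced by a previously trained fastNLP NER model.
ner = CustomizedNER("output/ner_model.pkl", "output/vocab.txt")
ner.formatRowString("小明昨天从北京去了上海")
for entities in ner.result(ner.dataset):
    print(entities)  # e.g. {'PER': ['小明'], 'LOC': ['北京', '上海']}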
def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1',
             pool_method: str = 'first', include_cls_sep: bool = False, pooled_cls: bool = False,
             auto_truncate: bool = True, min_freq=1, only_use_pretrain_bpe=False, truncate_embed=True):
    super().__init__()

    self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = CamembertModel.from_pretrained(model_dir_or_name)
    self.encoder.resize_token_embeddings(len(self.tokenizer))

    self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
    encoder_layer_number = len(self.encoder.encoder.layer)
    if isinstance(layers, list):
        self.layers = [int(l) for l in layers]
    elif isinstance(layers, str):
        self.layers = list(map(int, layers.split(',')))
    else:
        raise TypeError("`layers` only supports str or list[int]")
    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a bert model with {encoder_layer_number} layers."
        else:
            assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                  f"a bert model with {encoder_layer_number} layers."

    assert pool_method in ('avg', 'max', 'first', 'last')
    self.pool_method = pool_method
    self.include_cls_sep = include_cls_sep
    self.pooled_cls = pooled_cls
    self.auto_truncate = auto_truncate

    logger.info("Start to generate word pieces for word.")
    word_piece_dict = {'<s>': 1, '</s>': 1}
    found_count = 0
    new_add_to_bpe_vocab = 0
    unsegment_count = 0
    if "<s>" in vocab:
        warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
                      "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
                      " and end.")
    unique = []
    for word, index in vocab:
        word_pieces = []
        word_pieces.extend(self.tokenizer.tokenize(word))  # , add_prefix_space=True)
        word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        if 3 in word_token_ids:
            if word_pieces[word_token_ids.index(3)] not in unique:
                unique.append(word_pieces[word_token_ids.index(3)])
                unsegment_count += 1
        if not vocab._is_word_no_create_entry(word):
            if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
                if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(word) \
                        and not only_use_pretrain_bpe:
                    word_piece_dict[word] = 1
                    new_add_to_bpe_vocab += 1
                unsegment_count += 1
                continue
        found_count += 1
        for word_piece in word_pieces:
            word_piece_dict[word_piece] = 1
    if unsegment_count > 0:
        logger.info(f"{unsegment_count} words are unsegmented.")

    word_to_wordpieces = []
    word_pieces_lengths = []
    for word, index in vocab:
        if index == vocab.padding_idx:
            word = '<pad>'
        elif index == vocab.unknown_idx:
            word = '<unk>'
        word_pieces = self.tokenizer.tokenize(word)
        word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
        word_to_wordpieces.append(word_pieces)
        word_pieces_lengths.append(len(word_pieces))
    self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
    self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
    self._word_pad_index = vocab.padding_idx
    self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids('<pad>')
    self.word_to_wordpieces = np.array(word_to_wordpieces)
    self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    logger.debug("Successfully generate word pieces.")