def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), 'rb'))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            pred = model.predict(char, word, pos, spo, seq_len)
            metrics({'pred': pred['pred'].cuda(), 'seq_len': seq_len},
                    {'tag': batch_y['tag'].cuda()})
    eval_result = metrics.get_metric()
    metric_name = metrics.__class__.__name__
    eval_results[metric_name] = eval_result
    print("[tester] \n{}".format(_format_eval_results(eval_results)))
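Stripped to its core, the pattern this and the following snippets share is: build a tag Vocabulary, feed batches of index tensors to SpanFPreRecMetric.evaluate(), then read the aggregated scores from get_metric(). A minimal self-contained sketch, assuming fastNLP 0.5.x; the tag set, tensors, and printed scores are illustrative, not taken from any snippet here:

# Minimal SpanFPreRecMetric sketch (assuming fastNLP 0.5.x); the toy tags
# and tensors below are illustrative assumptions.
import torch
from fastNLP import Vocabulary, SpanFPreRecMetric

tag_vocab = Vocabulary(unknown=None, padding=None)
tag_vocab.add_word_lst(['O', 'B-PER', 'I-PER'])

metric = SpanFPreRecMetric(tag_vocab=tag_vocab, encoding_type='bio')
gold = torch.tensor([[tag_vocab.to_index(t) for t in ['B-PER', 'I-PER', 'O']]])
pred = gold.clone()                   # pretend the model got it right
seq_len = torch.tensor([3])

metric.evaluate(pred, gold, seq_len)  # accumulate one batch
print(metric.get_metric())            # {'f': 1.0, 'pre': 1.0, 'rec': 1.0}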
def train(args):
    data = get_data(args)
    train_data = data['train']
    dev_data = data['dev']
    model = get_model(args)
    optimizer = get_optim(args)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    callbacks = []
    trainer = Trainer(
        train_data=train_data,
        model=model,
        optimizer=optimizer,
        loss=None,
        batch_size=args.batch_size,
        n_epochs=args.epochs,
        num_workers=4,
        metrics=SpanFPreRecMetric(tag_vocab=data['tag_vocab'],
                                  encoding_type=data['encoding_type'],
                                  ignore_labels=data['ignore_labels']),
        metric_key='f',  # SpanFPreRecMetric reports its F-score under 'f', not 'f1'
        dev_data=dev_data,
        save_path=args.save_path,
        device=device,
        callbacks=callbacks,
        check_code_level=-1,
    )
    print(trainer.train())
def test_each(config, models):
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    # tag_vocab was referenced without being defined; load it like the other snippets do
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    for model_name, model in zip(config.ensemble_models, models):
        print(model_name)
        tester = Tester(dev_data, model, metrics=metrics,
                        device=config.device, batch_size=config.batch_size)
        tester.test()
def _get_trainer(self, models_folder):
    optimizer = optim.SGD(self.parameters(), lr=self.config['lr'], momentum=0.9)
    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(self.data_bundle.get_dataset('test'))
    if self.config['warmup_steps'] > 0:
        warmup_callback = WarmupCallback(self.config['warmup_steps'], schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, evaluate_callback])
    return Trainer(self.data_bundle.get_dataset('train'),
                   self,
                   optimizer,
                   batch_size=self.config['batch_size'],
                   sampler=BucketSampler(),
                   num_workers=2,
                   n_epochs=100,
                   dev_data=self.data_bundle.get_dataset('dev'),
                   metrics=SpanFPreRecMetric(
                       tag_vocab=self.data_bundle.get_vocab('target'),
                       encoding_type=self.config['encoding_type']),
                   dev_batch_size=self.config['batch_size'] * 5,
                   callbacks=callbacks,
                   device=self.config['device'],
                   test_use_tqdm=False,
                   use_tqdm=True,
                   print_every=300,
                   save_path=models_folder)
def __init__(self, masker, task_lst, vocabs, optimizer, args):
    """
    :param model: the model
    :param description: description of the model
    :param task_lst: list of tasks
    :param optimizer: the optimizer
    :param log_path: TensorboardX output folder
    :param save_path: where to save the model
    :param accumulation_steps: gradient accumulation steps
    :param print_every: evaluation interval
    """
    self.logger = fastNLP.logger
    self.masker = masker
    self.task_lst = task_lst
    self.save_path = args.save_path
    self.description = args.exp_name
    self.optim = optimizer
    self.vocabs = vocabs
    n_steps = (int(len(task_lst) * len(task_lst[0].train_set) * 100 / args.batch_size) + 1)
    args.n_steps = n_steps
    self.epoch_scheduler = get_scheduler(args, self.optim)
    self.scheduler = None
    self.logger.info('Using scheduler {}'.format(self.scheduler))
    self.accumulation_steps = args.accumulation_steps
    self.print_every = args.print_every
    self.batch_size = args.batch_size
    self.save_ep = args.save_ep
    include_tasks = args.tasks
    if include_tasks is None:
        self.empty_tasks = set()
    else:
        self.empty_tasks = set(range(len(self.task_lst))) - set(include_tasks)
    self.steps = 0
    self.best_acc = 0
    self.best_epoch = 0
    self.metrics = []
    for t in task_lst:
        if has_acc(t.task_name):
            self.metrics.append(AccuracyMetric())
        else:
            self.metrics.append(
                SpanFPreRecMetric(
                    self.vocabs[t.task_name],
                    encoding_type="bioes" if t.task_name == "ner" else "bio",
                ))
    tb_path = "eval" if args.evaluate else "train"
    self.summary_writer = SummaryWriter(os.path.join(args.tb_path, tb_path))
def trainer(data_folder, write2model, write2vocab):
    # read the data from data_folder into a DataBundle
    data_bundle = PeopleDailyNERLoader().load(data_folder)
    data_bundle = PeopleDailyPipe().process(data_bundle)
    data_bundle.rename_field('chars', 'words')
    # serialize the vocabs
    targetVocab = dict(data_bundle.vocabs["target"])
    wordsVocab = dict(data_bundle.vocabs["words"])
    targetWc = dict(data_bundle.vocabs['target'].word_count)
    wordsWc = dict(data_bundle.vocabs['words'].word_count)
    with open(write2vocab, "w", encoding="utf-8") as VocabOut:
        VocabOut.write(
            json.dumps(
                {
                    "targetVocab": targetVocab,
                    "wordsVocab": wordsVocab,
                    "targetWc": targetWc,
                    "wordsWc": wordsWc
                },
                ensure_ascii=False))
    embed = BertEmbedding(vocab=data_bundle.get_vocab('words'),
                          model_dir_or_name='cn',
                          requires_grad=False,
                          auto_truncate=True)
    model = BiLSTMCRF(embed=embed,
                      num_classes=len(data_bundle.get_vocab('target')),
                      num_layers=1,
                      hidden_size=100,
                      dropout=0.5,
                      target_vocab=data_bundle.get_vocab('target'))
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    optimizer = Adam(model.parameters(), lr=2e-5)
    loss = LossInForward()
    device = 0 if torch.cuda.is_available() else 'cpu'
    trainer = Trainer(data_bundle.get_dataset('train'),
                      model,
                      loss=loss,
                      optimizer=optimizer,
                      batch_size=8,
                      dev_data=data_bundle.get_dataset('dev'),
                      metrics=metric,
                      device=device,
                      n_epochs=1)
    trainer.train()
    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    tester.test()
    saver = ModelSaver(write2model)
    saver.save_pytorch(model, param_only=False)
def evaluate(args):
    data = get_data(args)
    test_data = data['test']
    model = load_model_from_path(args)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tester = Tester(
        data=test_data,
        model=model,
        batch_size=args.batch_size,
        num_workers=2,
        device=device,
        metrics=SpanFPreRecMetric(tag_vocab=data['tag_vocab'],
                                  encoding_type=data['encoding_type'],
                                  ignore_labels=data['ignore_labels']),
    )
    print(tester.test())
def evaluate(self, data_samples, prefix=''):
    r"""
    :param DataSet data_samples: DataSet with samples and vocabs
    :return: dict with the metric results
    """
    # `data_bundle` and `device` are assumed to be available in the enclosing
    # scope; note the original evaluates on the test split, not on `data_samples`.
    tester = Tester(data_bundle.get_dataset('test'),
                    self.model,
                    metrics=SpanFPreRecMetric(
                        tag_vocab=data_bundle.get_vocab('target'),
                        encoding_type='bio'),
                    batch_size=4,
                    device=device)
    return tester.test()['SpanFPreRecMetric']
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')  # constructed but unused here
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), 'rb'))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), 'rb'))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    schema = get_schemas(config.source_path)
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            pred = model.predict(char, word, pos, spo, seq_len)
            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)
    return result
def _eval_epoch(self, dev=True):
    self.logger.info("Evaluating...")
    dev_loss = 0
    e_steps = 0
    avg_acc = 0
    dev_acc = {}
    self.model = self.masker.model
    self.model.eval()
    metrics = []
    for task in self.task_lst:
        if has_acc(task.task_name):
            metrics.append(fastNLP.AccuracyMetric())
        else:
            metrics.append(
                SpanFPreRecMetric(
                    self.vocabs[task.task_name],
                    encoding_type="bioes" if task.task_name == "ner" else "bio",
                ))
    with torch.no_grad():
        for i in range(len(self.task_lst)):
            corrects, samples = 0, 0
            task = find_task(i, self.task_lst)
            if task.task_id in self.empty_tasks:
                continue
            if dev:
                data_loader = task.dev_data_loader
            else:
                data_loader = task.test_data_loader
            for batch in data_loader:
                x, y = batch
                batch_task_id = x["task_id"].cuda()
                batch_x = x["x"].cuda()
                batch_y = y["y"].cuda()
                if "seq_len" in x:
                    seq_len = x["seq_len"].cuda()
                else:
                    seq_len = None
                self.masker.before_forward(batch_task_id[0].item())
                if seq_len is not None:
                    out = self.model(batch_task_id, batch_x, batch_y, seq_len)
                else:
                    out = self.model(batch_task_id, batch_x, batch_y)
                loss, pred = out["loss"], out["pred"]
                self.masker.after_forward(batch_task_id[0].item())
                dev_loss += loss.item()
                e_steps += 1
                metrics[i].evaluate(pred, batch_y, seq_len)
                samples += batch_x.size(0)
    for i in range(len(self.task_lst)):
        task = find_task(i, self.task_lst)
        eval_res = metrics[i].get_metric()
        dev_acc[task.task_name] = eval_res
        avg_acc += eval_res["acc"] if "acc" in eval_res else eval_res["f"]
    avg_acc /= len(self.task_lst) - len(self.empty_tasks)
    dev_acc["avg"] = avg_acc
    dev_loss = dev_loss / e_steps
    return dev_loss, dev_acc
for ins in all_data[target][key]:
    CWS_dataset.append(ins)
del all_data[target][key]
CWS_dataset.set_input('chars', 'target', 'seq_len')
CWS_dataset.set_target('target', 'seq_len')
all_data[target]['CWS-all'] = CWS_dataset

model = torch.load('best_model')
device = 0 if torch.cuda.is_available() else 'cpu'
metric1 = SegAppCharParseF1Metric(label_vocab['Parsing']['APP'])
metric2 = CWSMetric(label_vocab['Parsing']['APP'])
metrics = [metric1, metric2]

for key in all_data['test']:
    dataset = all_data['test'][key]
    if key.startswith('CWS'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['CWS']),
                        device=device)
    elif key.startswith('POS'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['POS']),
                        device=device)
    elif key.startswith('NER'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['NER']),
                        device=device)
    else:
        tester = Tester(data=dataset, model=model, metrics=metrics, device=device)
    print(key)
    tester.test()
def eval_mtl_single(args):
    global logger
    args = torch.load(os.path.join(args.save_path, "args"))
    print(args)
    logger.info(args)
    task_lst, vocabs = utils.get_data(args.data_path)
    task_db = task_lst[args.task_id]
    train_data = task_db.train_set
    dev_data = task_db.dev_set
    test_data = task_db.test_set
    task_name = task_db.task_name

    # text classification
    for ds in [train_data, dev_data, test_data]:
        ds.rename_field("words_idx", "x")
        ds.rename_field("label", "y")
        ds.set_input("x", "y", "task_id")
        ds.set_target("y")
    # sequence labeling
    if task_name in SEQ_LABEL_TASK:
        for ds in [train_data, dev_data, test_data]:
            ds.set_input("seq_len")
            ds.set_target("seq_len")

    logger = utils.get_logger(__name__)
    logger.info("task name: {}, task id: {}".format(task_db.task_name, task_db.task_id))
    logger.info("train len {}, dev len {}, test len {}".format(
        len(train_data), len(dev_data), len(test_data)))

    # init model
    model = get_model(args, task_lst, vocabs)

    if task_name not in SEQ_LABEL_TASK or task_name == "pos":
        metrics = [
            AccuracyMetric(target="y"),
        ]
    else:
        metrics = [
            SpanFPreRecMetric(
                tag_vocab=vocabs[task_name],
                pred="pred",
                target="y",
                seq_len="seq_len",
                encoding_type="bioes" if task_name == "ner" else "chunk",
            ),
            AccuracyMetric(target="y"),
        ]

    cur_best = 0.0
    init_best = None
    eval_time = 0
    paths = [path for path in os.listdir(args.save_path) if path.startswith("best")]
    paths = sorted(paths, key=lambda x: int(x.split("_")[1]))
    for path in paths:
        path = os.path.join(args.save_path, path)
        state = torch.load(path, map_location="cpu")
        model.load_state_dict(state)
        tester = Tester(
            test_data,
            model,
            metrics=metrics,
            batch_size=args.batch_size,
            num_workers=4,
            device="cuda",
            use_tqdm=False,
        )
        res = tester.test()
        val = 0.0
        for metric_name, metric_dict in res.items():
            if task_name == "pos" and "acc" in metric_dict:
                val = metric_dict["acc"]
                break
            elif "f" in metric_dict:
                val = metric_dict["f"]
                break
        if init_best is None:
            init_best = val
        logger.info(
            "No #%d: best %f, %s, path: %s, is better: %s",
            eval_time,
            val,
            tester._format_eval_results(res),
            path,
            val > init_best,
        )
        eval_time += 1
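The loop above pulls either 'acc' or 'f' out of whatever Tester.test() returns. As a sketch of that shape (assuming fastNLP 0.5.x; the numbers are illustrative), the result is a dict keyed by metric class name:

# Sketch of Tester.test()'s return shape (fastNLP 0.5.x); the outer keys
# follow the metric class names, the values here are made up.
res = tester.test()
# res == {'SpanFPreRecMetric': {'f': 0.91, 'pre': 0.90, 'rec': 0.92},
#         'AccuracyMetric': {'acc': 0.95}}
for metric_name, metric_dict in res.items():
    score = metric_dict.get('acc', metric_dict.get('f'))
    print(metric_name, score)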
@cache_results('caches/weibo-lstm.pkl', _refresh=False)
def get_data():
    data_bundle = WeiboNERLoader().load()
    data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle)
    char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT),
                                 model_dir_or_name='cn-fasttext')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                                   embedding_dim=100, min_freq=3)
    return data_bundle, char_embed, bigram_embed

data_bundle, char_embed, bigram_embed = get_data()
print(data_bundle)

model = CNBiLSTMCRFNER(char_embed,
                       num_classes=len(data_bundle.vocabs['target']),
                       bigram_embed=bigram_embed)

Trainer(data_bundle.datasets['train'], model, batch_size=20,
        metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes'),
        num_workers=2, dev_data=data_bundle.datasets['dev'], device=0).train()
    embeddings['word'],
    hidden_size=args.hidden,
    label_size=len(vocabs['label']),
    device=args.device,
    bidirectional=args.bi,
    embed_dropout=args.embed_dropout,
    output_dropout=args.output_dropout,
    use_bigram=args.use_bigram)

loss = LossInForward()
encoding_type = 'bmeso'
if args.dataset == 'weibo':
    encoding_type = 'bio'
f1_metric = SpanFPreRecMetric(vocabs['label'],
                              pred='pred',
                              target='target',
                              seq_len='seq_len',
                              encoding_type=encoding_type)
acc_metric = AccuracyMetric(pred='pred', target='target', seq_len='seq_len')
metrics = [f1_metric, acc_metric]
if args.optim == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.optim == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
callbacks = [
    FitlogCallback({
        'test': datasets['test'],
    requires_grad=True,
    lower=True,
    word_dropout=0,
    dropout=0.5,
    only_norm_found_vector=normalize_embed)
if char_embed is not None:
    embed = StackEmbedding([word_embed, char_embed], dropout=0, word_dropout=0.02)
else:
    word_embed.word_drop = 0.02
    embed = word_embed
data__ = data.get_vocab('words')
data.rename_field('words', 'chars')
return data, embed, data__, word_embed

data_bundle, embed, data__, word_embed = load_data()

model = TENER(tag_vocab=data_bundle.get_vocab('target'),
              embed=embed,
              num_layers=num_layers,
              d_model=d_model,
              n_head=n_heads,
              feedforward_dim=dim_feedforward,
              dropout=dropout,
              after_norm=after_norm,
              attn_type=attn_type,
              bi_embed=None,
              fc_dropout=fc_dropout,
              pos_embed=pos_embed,
              scale=attn_type == 'transformer')

# set the model path before running
model_path = './w2v_n6'
states = torch.load(model_path).state_dict()
model.load_state_dict(states)

evaluation = Tester(data_bundle.get_dataset('test'),
                    model,
                    metrics=SpanFPreRecMetric(
                        tag_vocab=data_bundle.get_vocab('target'),
                        only_gross=False),
                    batch_size=batch_size)
evaluation.test()
def train(config, task_name):
    train_data = pickle.load(open(os.path.join(config.data_path, config.train_name), "rb"))
    if config.debug:
        train_data = train_data[0:100]
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    print(len(train_data), len(dev_data))

    word_vocab = pickle.load(open(os.path.join(config.data_path, config.word_vocab_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))
    pos_vocab = pickle.load(open(os.path.join(config.data_path, config.pos_vocab_name), "rb"))
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), "rb"))
    print('word vocab', len(word_vocab))
    print('char vocab', len(char_vocab))
    print('pos vocab', len(pos_vocab))
    print('tag vocab', len(tag_vocab))

    schema = get_schemas(config.source_path)

    if task_name == 'bilstm_crf':
        model = AdvSeqLabel(
            char_init_embed=(len(char_vocab), config.char_embed_dim),
            word_init_embed=(len(word_vocab), config.word_embed_dim),
            pos_init_embed=(len(pos_vocab), config.pos_embed_dim),
            spo_embed_dim=len(schema),
            sentence_length=config.sentence_length,
            hidden_size=config.hidden_dim,
            num_classes=len(tag_vocab),
            dropout=config.dropout,
            id2words=tag_vocab.idx2word,
            encoding_type=config.encoding_type)
    elif task_name == 'trans_crf':
        model = TransformerSeqLabel(
            char_init_embed=(len(char_vocab), config.char_embed_dim),
            word_init_embed=(len(word_vocab), config.word_embed_dim),
            pos_init_embed=(len(pos_vocab), config.pos_embed_dim),
            spo_embed_dim=len(schema),
            num_classes=len(tag_vocab),
            id2words=tag_vocab.idx2word,
            encoding_type=config.encoding_type,
            num_layers=config.num_layers,
            inner_size=config.inner_size,
            key_size=config.key_size,
            value_size=config.value_size,
            num_head=config.num_head,
            dropout=config.dropout)

    optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    logs = FitlogCallback(dev_data)
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')

    train_data.set_input('tag')
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')

    trainer = Trainer(
        train_data=train_data,
        model=model,
        metrics=metrics,
        metric_key='f',
        batch_size=config.batch_size,
        n_epochs=config.epoch,
        dev_data=dev_data,
        save_path=config.save_path,
        check_code_level=-1,
        print_every=config.print_every,
        validate_every=config.validate_every,
        optimizer=optimizer,
        use_tqdm=False,
        device=config.device,
        callbacks=[timing, early_stop, logs])
    trainer.train()

    # evaluate on the dev set
    tester = Tester(dev_data, model, metrics=metrics,
                    device=config.device, batch_size=config.batch_size)
    tester.test()
    block_loss=True,
    input_dropout=0.5,
    hidden_dropout=0.2,
    inner_dropout=0.2)
print(model)

callbacks = [
    GradientClipCallback(clip_value=ops.gradient_clip, clip_type='value'),
]

metrics = []
metrics.append(
    SpanFPreRecMetric(
        tag_vocab=data.vocabs[Const.TARGET],
        encoding_type=encoding_type,
        pred=Const.OUTPUT,
        target=Const.TARGET,
        seq_len=Const.INPUT_LEN,
    ))

class LossMetric(MetricBase):
    def __init__(self, loss=None):
        super(LossMetric, self).__init__()
        self._init_param_map(loss=loss)
        self.total_loss = 0.0
        self.steps = 0

    def evaluate(self, loss):
        self.total_loss += float(loss)
        self.steps += 1
model = CharModel(embed=embed,
                  label_vocab=label_vocab,
                  pos_idx=pos_idx,
                  Parsing_rnn_layers=rnn_layers,
                  Parsing_arc_mlp_size=arc_mlp_size,
                  Parsing_label_mlp_size=label_mlp_size,
                  encoding_type='bmeso')
optimizer = AdamW(model.parameters(), lr=2e-5)
device = 0 if torch.cuda.is_available() else 'cpu'
callbacks = [WarmupCallback(warmup=0.1, schedule='linear')]

metric1 = SegAppCharParseF1Metric(label_vocab['Parsing']['APP'])
metric2 = CWSMetric(label_vocab['Parsing']['APP'])
metric3 = SpanFPreRecMetric(tag_vocab=label_vocab['POS'])
metrics = [metric1, metric2, metric3]

for target in ['train', 'test', 'dev']:
    CWS_dataset = DataSet()
    for key in task_list:
        if key.startswith('CWS'):
            for ins in all_data[target][key]:
                CWS_dataset.append(ins)
            del all_data[target][key]
    CWS_dataset.set_input('chars', 'target', 'seq_len', 'task_class')
    CWS_dataset.set_target('target', 'seq_len')
    all_data[target]['CWS-all'] = CWS_dataset

train_data = dict()
train_data['POS-ctb9'] = all_data['train']['POS-ctb9']
def my_model_single_sentence(sentence):
    '''
    # Reproduce the Pipe's processing steps, in order to rebuild the
    # vocabularies constructed from the training set
    from fastNLP.io import WeiboNERLoader
    # load the raw data
    data_bundle = WeiboNERLoader().load()
    # we need the Vocabulary built over the raw data
    from fastNLP import Vocabulary
    from fastNLP.core.utils import iob2, iob2bioes
    from fastNLP.core.const import Const
    # encoding_type
    encoding_type: str = 'bio'
    if encoding_type == 'bio':
        convert_tag = iob2
    elif encoding_type == 'bioes':
        convert_tag = lambda words: iob2bioes(iob2(words))
    # convert the tags
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)
    # copy the raw chars into a chars column
    data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT,
                           ignore_miss_dataset=True)
    input_field_names = [Const.CHAR_INPUT]
    target_field_names = Const.TARGET
    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]
    # build the input vocab
    for input_field_name in input_field_names:
        src_vocab = Vocabulary()
        src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                               field_name=input_field_name,
                               no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(input_field_name))])
        src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)
    # build the target vocab
    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                               field_name=target_field_name,
                               no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(target_field_name))])
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                       f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                       f"data set but not in train data set!.\n" \
                       f"These label(s) are {tgt_vocab._no_create_word}"
            warnings.warn(warn_msg)
            logger.warning(warn_msg)
        tgt_vocab.index_dataset(*[ds for ds in data_bundle.datasets.values() if ds.has_field(target_field_name)],
                                field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)
    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]
    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)
    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    '''
    '''
    from fastNLP.io import WeiboNERPipe
    data_bundle = WeiboNERPipe().process_from_file()
    '''
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")
    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)
    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')
    # data processing is done and both vocabularies are built;
    # new instances must be appended to the fastNLP DataSet at
    # data_bundle.get_dataset('test'), whose fields are raw_chars, target, chars, seq_len
    from fastNLP import Instance
    my_raw_chars = []
    my_target = []
    my_words = []
    for i in range(0, len(sentence)):
        my_raw_chars.append(sentence[i])
        my_target.append(0)
        my_words.append(src_vocab.to_index(sentence[i]))
    my_seq_len = len(sentence)
    ins = Instance()
    ins.add_field('raw_chars', my_raw_chars)
    ins.add_field('target', my_target)
    ins.add_field('chars', my_words)
    ins.add_field('seq_len', my_seq_len)
    data_bundle.get_dataset('test').append(ins)
    # load the model
    from fastNLP.io import ModelLoader
    loader = ModelLoader()
    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")
    # BiLSTMCRF's forward takes `words` rather than `chars`, so rename the column
    data_bundle.rename_field('chars', 'words')
    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    # run the test; note get_pred() is not part of the stock fastNLP Tester
    # and appears to be a local modification
    from fastNLP import Tester
    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    final = tester.get_pred()
    my_label = []
    # we want the last row of the last batch in `final`
    for i in final[len(final) - 1][len(final[len(final) - 1]) - 1]:
        i = i.cpu().item()
        my_label.append(tgt_vocab.to_word(i))
    output = ''
    for j in range(0, my_seq_len):
        output = output + sentence[j] + ' ' + my_label[j] + '\n'
    return output
def my_model_passage(sentences):
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")
    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)
    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')
    for i in range(0, 27):
        data_bundle.get_dataset('test').delete_instance(0)
    # data processing is done and both vocabularies are built;
    # build the new dataset
    from fastNLP import Instance
    for i in range(0, len(sentences)):
        my_raw_chars = []
        my_target = []
        my_words = []
        for j in range(0, len(sentences[i])):
            my_raw_chars.append(sentences[i][j])
            my_target.append(0)
            my_words.append(src_vocab.to_index(sentences[i][j]))
        my_seq_len = len(sentences[i])
        ins = Instance()
        ins.add_field('raw_chars', my_raw_chars)
        ins.add_field('target', my_target)
        ins.add_field('chars', my_words)
        ins.add_field('seq_len', my_seq_len)
        data_bundle.get_dataset('test').append(ins)
    data_bundle.get_dataset('test').delete_instance(0)
    # load the model
    from fastNLP.io import ModelLoader
    loader = ModelLoader()
    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")
    # BiLSTMCRF's forward takes `words` rather than `chars`, so rename the column
    data_bundle.get_dataset('test').rename_field('chars', 'words')
    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    # run the test
    from fastNLP import Tester
    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    final = tester.get_pred()
    output = ''
    labels = []
    # we want everything in `final`; the original test set had two batches
    # of sizes 16 and 12
    for i in range(0, len(final)):
        for j in range(0, len(final[i])):
            my_label = []
            for item in final[i][j]:
                my_label.append(tgt_vocab.to_word(item.cpu().item()))
            labels.append(my_label)
    print(labels[0])
    print(final[0][0])
    for i in range(0, len(sentences)):
        for j in range(0, len(sentences[i])):
            output = output + sentences[i][j] + ' ' + labels[i][j] + '\n'
        output = output + '\n'
    return output
def main():
    if args.do_eval:
        torch.multiprocessing.set_start_method('spawn', force=True)
    if args.model == 'bert':
        model = BertCRF(embed, [data_bundle.get_vocab('target')], encoding_type='bioes')
    else:
        model = StackedTransformersCRF(
            tag_vocabs=[data_bundle.get_vocab('target')],
            embed=embed,
            num_layers=num_layers,
            d_model=d_model,
            n_head=n_heads,
            feedforward_dim=dim_feedforward,
            dropout=trans_dropout,
            after_norm=after_norm,
            attn_type=attn_type,
            bi_embed=None,
            fc_dropout=fc_dropout,
            pos_embed=pos_embed,
            scale=attn_type == 'transformer')
    model = torch.nn.DataParallel(model)

    if args.do_eval:
        if os.path.exists(os.path.expanduser(args.saved_model)):
            print("Load checkpoint from {}".format(os.path.expanduser(args.saved_model)))
            model = torch.load(args.saved_model)
            model.to('cuda')
            print('model to CUDA')

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
    checkpoint_callback = CheckPointCallback(os.path.join(directory, 'model.pth'),
                                             delete_when_train_finish=False,
                                             recovery_fitlog=True)
    if warmup_steps > 0:
        warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, checkpoint_callback, evaluate_callback])

    if not args.do_eval:
        trainer = Trainer(data_bundle.get_dataset('train'),
                          model,
                          optimizer,
                          batch_size=batch_size,
                          sampler=BucketSampler(),
                          num_workers=no_cpu,
                          n_epochs=args.n_epochs,
                          dev_data=data_bundle.get_dataset('dev'),
                          metrics=SpanFPreRecMetric(
                              tag_vocab=data_bundle.get_vocab('target'),
                              encoding_type=encoding_type),
                          dev_batch_size=batch_size,
                          callbacks=callbacks,
                          device=args.device,
                          test_use_tqdm=True,
                          use_tqdm=True,
                          print_every=300,
                          save_path=os.path.join(directory, 'best'))
        trainer.train(load_best_model=True)

        predictor = Predictor(model)
        predict(os.path.join(directory, 'predictions_dev.tsv'), data_bundle, predictor, 'dev')
        predict(os.path.join(directory, 'predictions_test.tsv'), data_bundle, predictor, 'test')
    else:
        print('Predicting')
        # predictions over multiple files
        torch.multiprocessing.freeze_support()
        model.share_memory()
        predictor = Predictor(model)
        if len(files) > multiprocessing.cpu_count():
            with torch.multiprocessing.Pool(processes=no_cpu) as p:
                with tqdm(total=len(files)) as pbar:
                    for i, _ in enumerate(
                            p.imap_unordered(
                                partial(predict,
                                        data_bundle=data_bundle,
                                        predictor=predictor,
                                        predict_on='train',
                                        do_eval=args.do_eval), files)):
                        pbar.update()
        else:
            for file in tqdm(files):
                predict(file, data_bundle, predictor, 'train', args.do_eval)
save_serialize_obj(char_vocab, char_vocab_pkl_file)
save_serialize_obj(target_vocab, target_vocab_pkl_file)
logger.info('Vocabs serialized: {}'.format(char_vocab_pkl_file))

logger.warn('Selecting pretrained embeddings')
# model_dir_or_name = 'cn-wwm'
model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch'
bert_embed = BertEmbedding(vocab=char_vocab,
                           model_dir_or_name=model_dir_or_name,
                           requires_grad=False)

logger.warn('Building the model')
model = BiLSTMCRF(embed=bert_embed,
                  num_classes=len(target_vocab),
                  num_layers=1,
                  hidden_size=200,
                  dropout=0.5,
                  target_vocab=target_vocab)
logger.info(model)

logger.warn('Setting the training hyperparameters')
loss = LossInForward()
optimizer = Adam([param for param in model.parameters() if param.requires_grad])
# with only_gross=False, per-label metric statistics are returned as well
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET),
                           only_gross=False)
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # training is much faster on a GPU
logger.info('device:{}'.format(device))
batch_size = 32
n_epochs = 10
early_stopping = 10
trainer = Trainer(
    save_path=model_path,
    train_data=data_bundle.get_dataset('train'),
    model=model,
    loss=loss,
    optimizer=optimizer,
    batch_size=batch_size,
    n_epochs=n_epochs,
    dev_data=data_bundle.get_dataset('dev'),
    metrics=metric,
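As the comment above notes, only_gross=False adds per-label statistics. A sketch of the resulting dict (the 'loc' label and the numbers are illustrative; the key format matches the lookups in the evaluator class further down):

# Sketch of get_metric() output with only_gross=False; 'loc' is an
# illustrative label name and the numbers are made up.
eval_dict = metric.get_metric()
# eval_dict == {'f': 0.88, 'pre': 0.87, 'rec': 0.89,
#               'f-loc': 0.91, 'pre-loc': 0.90, 'rec-loc': 0.92, ...}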
    hidden_size=1200,
    num_layers=1,
    tag_vocab=data.vocabs[Const.TARGET],
    encoding_type=encoding_type,
    dropout=dropout)

callbacks = [
    GradientClipCallback(clip_value=5, clip_type='value'),
    EvaluateCallback(data.datasets['test'])
]
optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
scheduler = LRScheduler(
    LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)

trainer = Trainer(train_data=data.get_dataset('train'),
                  model=model,
                  optimizer=optimizer,
                  sampler=BucketSampler(num_buckets=100),
                  device=0,
                  dev_data=data.get_dataset('dev'),
                  batch_size=batch_size,
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type),
                  callbacks=callbacks,
                  num_workers=1,
                  n_epochs=100,
                  dev_batch_size=256)
trainer.train()
class CommonSeqEvaluator(BaseSeqEvaluator):
    def __init__(self, tag_vocab, config):
        super(CommonSeqEvaluator, self).__init__()
        self._config = config
        self._vocab = Vocabulary()
        self._vocab.add_word_lst(tag_vocab.stoi.keys())
        self._evaluator = SpanFPreRecMetric(self._vocab, only_gross=False,
                                            f_type=config.evaluation.type)
        self._pad_index = tag_vocab.stoi['<pad>']

    def _change_type(self, pred, target):
        seq_len = torch.tensor([len(text) for text in pred])
        max_len = max(seq_len)
        for text in pred:
            if len(text) < max_len:
                text.extend([self._pad_index for i in range(max_len - len(text))])
        pred = torch.tensor(pred).to(self._config.device)
        return pred, target, seq_len

    def evaluate(self, pred, target):
        # feed in one batch of data
        pred, target, seq_len = self._change_type(pred, target)
        self._evaluator.evaluate(pred, target, seq_len)

    def _get_eval_result(self):
        # aggregate the results over all batches
        eval_dict = self._evaluator.get_metric()
        if self._config.data.chip_relation.use_chip_relation:
            names = list(set([label[2:] for label in self._vocab.word2idx.keys()][3:]))
            if '其他' in names:
                names.remove('其他')
        else:
            names = list(set([label[2:] for label in self._vocab.word2idx.keys()][3:]))
        head = ['label', ' precision', ' recall', ' F1_score']
        table = []
        table.append(head)
        for i in range(len(names)):
            ps = str(round(eval_dict['pre-' + names[i].lower()], 3))
            rs = str(round(eval_dict['rec-' + names[i].lower()], 3))
            f1s = str(round(eval_dict['f-' + names[i].lower()], 3))
            table.append([names[i], ps, rs, f1s])
        ps = str(round(eval_dict['pre'], 3))
        rs = str(round(eval_dict['rec'], 3))
        f1s = str(round(eval_dict['f'], 3))
        table.append(['{}_average'.format(self._config.evaluation.type), ps, rs, f1s])
        return eval_dict, table

    def get_eval_output(self):
        # external entry point: fetch the results, optionally print, and save
        # (for now the eval results are always saved)
        result, table = self._get_eval_result()
        if self._config.evaluation.is_display:
            self._print_table(table)
        self._write_csv(table)
        return result

    def _print_table(self, table):
        # pretty-print the result table
        rows = len(table)
        cols = len(table[0])
        for i in range(rows):
            for j in range(cols):
                print(table[i][j].rjust(14), end=' ')
            print()

    def _write_csv(self, table):
        wb = Workbook()
        ws = wb['Sheet']
        for line in range(1, len(table) + 1):
            for column in range(1, 5):
                ws.cell(line, column, table[line - 1][column - 1])
        save_path = self._config.learn.dir.saved + '/eval_result.xlsx'
        wb.save(save_path)
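A hypothetical driver for the evaluator above; `tag_vocab` (a torchtext-style vocab with a `.stoi` mapping), `config`, and `batches` are assumptions, not defined in the original code:

# Hypothetical usage of CommonSeqEvaluator; `tag_vocab`, `config`, and
# `batches` are assumed to exist and are not part of the original snippet.
evaluator = CommonSeqEvaluator(tag_vocab, config)
for pred_batch, target_batch in batches:
    # pred_batch: ragged list of tag-index lists; target_batch: gold tags
    evaluator.evaluate(pred_batch, target_batch)   # accumulates per batch
result = evaluator.get_eval_output()               # prints/saves the table, returns the dict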
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

callbacks = []
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
if warmup_steps > 0:
    warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
    callbacks.append(warmup_callback)
callbacks.extend([clip_callback, evaluate_callback])

trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer,
                  batch_size=batch_size,
                  sampler=BucketSampler(),
                  num_workers=2,
                  n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data_bundle.get_vocab('target'),
                      encoding_type=encoding_type),
                  dev_batch_size=batch_size,
                  callbacks=callbacks,
                  device=device,
                  test_use_tqdm=False,
                  use_tqdm=True,
                  print_every=300,
                  save_path=None)
trainer.train(load_best_model=False)
def train(config):
    train_data = pickle.load(open(os.path.join(config.data_path, config.train_name), "rb"))
    train_data = train_data[0:100]  # debug: truncate the training set
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    print(len(train_data), len(dev_data))

    word_vocab = pickle.load(open(os.path.join(config.data_path, config.word_vocab_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))
    pos_vocab = pickle.load(open(os.path.join(config.data_path, config.pos_vocab_name), "rb"))
    spo_vocab = pickle.load(open(os.path.join(config.data_path, config.spo_vocab_name), "rb"))
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), "rb"))
    print('word vocab', len(word_vocab))
    print('char vocab', len(char_vocab))
    print('pos vocab', len(pos_vocab))
    print('spo vocab', len(spo_vocab))
    print('tag vocab', len(tag_vocab))

    model = BiLSTM_CRF(config.batch_size,
                       len(word_vocab),
                       len(char_vocab),
                       len(pos_vocab),
                       len(spo_vocab),
                       config.embed_dim,
                       config.hidden_dim,
                       tag_vocab.idx2word,
                       dropout=0.5)
    optimizer = SGD(lr=config.lr, momentum=config.momentum)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    loss = NLLLoss()
    metrics = SpanFPreRecMetric(tag_vocab)

    trainer = Trainer(train_data=train_data,
                      model=model,
                      loss=loss,
                      metrics=metrics,
                      batch_size=config.batch_size,
                      n_epochs=config.epoch,
                      dev_data=dev_data,
                      save_path=config.save_path,
                      check_code_level=-1,
                      print_every=100,
                      validate_every=0,
                      optimizer=optimizer,
                      use_tqdm=False,
                      device=config.device,
                      callbacks=[timing, early_stop])
    trainer.train()