import json
import pickle
import time

import numpy as np
import torch
import torch.optim as optim
import yaml
from torch.utils.tensorboard import SummaryWriter

# NOTE: the module paths of these project-local imports are assumed; adjust them
# to the actual repo layout.
from data_manager import DataManager
from model import BiLSTMCRF
from utils import f1_score, get_tags, format_result

writer = SummaryWriter()


class ChineseNER(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            self.dev_manager = DataManager(batch_size=60, data_type="dev")  # validation set
            # self.dev_batch = self.dev_manager.iteration()
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        elif entry in ("predict", "evaluate"):  # python main.py predict
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()

    # load configuration, falling back to a default config if models/config.yml is missing
    def load_config(self):
        try:
            with open("models/config.yml") as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            with open("models/config.yml", "w") as fopen:
                config = {
                    "embedding_size": 300,
                    "hidden_size": 128,
                    "batch_size": 30,
                    "dropout": 0.5,
                    "model_path": "models/",
                    "tags": ["TREATMENT", "BODY", "SIGNS", "CHECK", "DISEASE"],
                }
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    # restore previously saved model weights
    def restore_model(self):
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params_6all.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed!\n{}".format(error))

    # save model hyper-parameters (vocab, tag_map, sizes)
    def save_params(self, data):
        with open("models/data_6all.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # load model hyper-parameters
    def load_params(self):
        with open("models/data_6all.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters(), weight_decay=0.002, lr=0.0000004)  # 0.000001
        # optimizer = optim.SGD(self.model.parameters(), lr=0.00000008, weight_decay=0.001, momentum=0.9)  # 4e-7
        scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                            mode='min',
                                                            factor=0.5,
                                                            patience=2,
                                                            cooldown=5,
                                                            verbose=True,
                                                            min_lr=1e-8,
                                                            eps=1e-8)
        best_loss = 240
        lossList = [0] * self.total_size
        for epoch in range(268, 401):  # resumes epoch numbering from a previous run
            losses = []
            index = 0
            startTime = time.process_time()
            for batch in self.train_manager.get_batch():
                start = time.process_time()
                index += 1
                self.model.zero_grad()
                # length holds the original (unpadded) length of each sentence
                sentences, tags, length = zip(*batch)
                # shape: (batch_size, longest sentence in this batch), e.g. (20, 332)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long).cuda()
                tags_tensor = torch.tensor(tags, dtype=torch.long).cuda()
                length_tensor = torch.tensor(length, dtype=torch.long).cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor)
                losses.append(loss.cpu().item())
                progress = ("█" * int(index * 60 / self.total_size)).ljust(60)
                loss.backward()
                optimizer.step()
                # torch.save(self.model.state_dict(), self.model_path + 'params_6all.pkl')
                end = time.process_time()
                dur = end - start
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.3f}\t\tlast_loss {:.3f}\t\ttime {}\t\tbest_avg_loss {:.3f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0], lossList[index - 1], str(dur), best_loss))
                lossList[index - 1] = loss.cpu().item()
                print("-" * 90)
            endTime = time.process_time()
            totalTime = endTime - startTime
            avg_loss = np.mean(losses)
            # keep only the checkpoint with the lowest average loss so far
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(self.model.state_dict(), self.model_path + 'params_6all.pkl')
            writer.add_scalar('BiLstm_CRF:avg_loss-epoch', avg_loss, epoch)
            print('epoch ', epoch, ' avg_loss ', avg_loss, ' total_time ', totalTime)
            if epoch % 5 == 0:
                self.evaluate(epoch // 5, manager=self.dev_manager)
                print("-" * 100)
            scheduler_lr.step(avg_loss)
        writer.close()

    # train: BODY 7507, SIGNS 6355, CHECK 6965, DISEASE 474, TREATMENT 805
    # test:

    # compute per-tag and overall f1 to evaluate the model
    def evaluate(self, epoch, manager, add_scalar=True):
        print('starting evaluation')
        all_origins = all_founds = all_rights = 0
        for tag in self.tags:
            origins = founds = rights = 0
            for batch in manager.get_batch():
                sentences, labels, length = zip(*batch)
                _, paths = self.model(sentences)
                origin, found, right = f1_score(labels, paths, tag, self.model.tag_map)
                origins += origin
                founds += found
                rights += right
            all_origins += origins
            all_founds += founds
            all_rights += rights
            recall = 0. if origins == 0 else (rights / origins)
            precision = 0. if founds == 0 else (rights / founds)
            f1 = 0. if recall + precision == 0 else (2 * precision * recall) / (precision + recall)
            print("\t{}\torigins:{}\t\t\tfounds:{}\t\t\trights:{}".format(tag, origins, founds, rights))
            print("\t\t\trecall:{}\tprecision:{}\tf1:{}".format(recall, precision, f1))
            if add_scalar:
                tag_epoch = tag + '-5epoch'
                writer.add_scalars(tag_epoch, {
                    'recall': recall,
                    'precision': precision,
                    'f1': f1
                }, epoch)
        all_recall = 0. if all_origins == 0 else (all_rights / all_origins)
        all_precision = 0. if all_founds == 0 else (all_rights / all_founds)
        all_f1 = 0. if all_recall + all_precision == 0 else (2 * all_precision * all_recall) / (all_precision + all_recall)
        print("\tall_origins:{}\t\t\tall_founds:{}\t\t\tall_rights:{}".format(
            all_origins, all_founds, all_rights))
        print("\tall_recall:{}\tall_precision:{}\tall_f1:{}".format(
            all_recall, all_precision, all_f1))
        if add_scalar:
            writer.add_scalars("ALL-5epoch", {
                'all_recall': all_recall,
                'all_precision': all_precision,
                'all_f1': all_f1
            }, epoch)
        print('evaluation finished')
        return all_recall, all_precision, all_f1

    # predict entities in a single input string
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("Please enter text: ")
        # map every character of the input to its vocab index (0 for out-of-vocabulary)
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        sentences = sentences.cuda()
        # paths holds the predicted tag indices, shape [1, len(input_str)]
        _, paths = self.model(sentences)
        entities = []
        # "tags": ["ORG", "PER"]
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
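# Usage sketch (illustrative only): how this class is typically driven from the
# command line. The argument wiring below is an assumption, not the repo's exact
# main.py; the "python main.py predict" comment above suggests this pattern.
if __name__ == "__main__":
    import sys

    entry = sys.argv[1] if len(sys.argv) > 1 else "train"
    ner = ChineseNER(entry=entry)
    if entry == "train":
        ner.train()
    elif entry == "predict":
        ner.predict()
    elif entry == "evaluate":
        # epoch index 0 is a placeholder; add_scalar=False skips TensorBoard logging
        ner.evaluate(0, manager=ner.test_manager, add_scalar=False)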
# A second variant of ChineseNER: different tag set, an explicit use_gpu switch,
# and file-based batch prediction. It relies on the same imports as the version above.
class ChineseNER(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()
            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()
        # elif entry == 'testXXX':
        #     self.dev_manager = DataManager(batch_size=30, data_type="test")
        #     # self.dev_batch = dev_manager.batch_data
        #     print('####batch_data###', len(dev_manager.batch_data))
        elif entry == 'test':
            self.dev_manager = DataManager(batch_size=30, data_type="test")
            # self.dev_batch = dev_manager.iteration()
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                self.model = self.model.cuda()
            self.restore_model()

    def load_config(self):
        try:
            with open("models/config.yml") as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            with open("models/config.yml", "w") as fopen:
                config = {
                    "embedding_size": 100,
                    "hidden_size": 128,
                    "batch_size": 20,
                    "dropout": 0.5,
                    "model_path": "models/",
                    "tags": ["component", "disease&symptom", "people"],  # edit the tag set here
                    "use_gpu": True
                }
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")
        self.use_gpu = config.get("use_gpu")

    def restore_model(self):
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed!\n{}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                print('batch', type(batch), len(batch), len(batch[0]), len(batch[10]))
                sentences, tags, length = zip(*batch)
                # print('zip batch sentences', type(sentences), sentences)
                # print('zip batch tags', type(tags), tags)
                # print('zip batch length', type(length), length)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)  # original length of each sentence in the batch
                if self.use_gpu:
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                # print('zip batch sentences', type(sentences_tensor), sentences_tensor.shape)
                # print('zip batch tags', type(tags_tensor), tags_tensor.shape)
                # print('zip batch length', type(length_tensor), length_tensor.shape, length)
                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.cpu().tolist()[0]))
                if index % 10 == 0:
                    self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(), self.model_path + 'params.pkl')

    def evaluate(self):
        with torch.no_grad():
            sentences, labels, length = zip(*self.dev_batch.__next__())
            _, paths = self.model(sentences)
            print("\teval")
            for tag in self.tags:
                f1_score(labels, paths, tag, self.model.tag_map)

    # tag every sentence of ./data/<path>.txt and write the result to ./result/tag_<path>.json
    def predict(self, path):  # , input_str=""):
        # if not input_str:
        #     input_str = input("Please enter text: ")
        sentences = []
        with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f:
            for i in f:
                sentences += i.strip().split('。')
        f = open('./result/tag_' + path + '.json', 'w')
        for input_str in sentences:
            if not input_str:  # skip empty fragments produced by the split
                continue
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # convert to tensor
            input_tensor = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(input_tensor)
            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            dic = {'sentence': input_str, 'entities': entities}
            json.dump(dic, f, ensure_ascii=False)
        f.close()
        # return entities

    # def testXXX(self):
    #     for batch in self.dev_manager.get_batch():
    #         print(_)
    #         print(_, len(items), len(items[0][0]), len(items[0][1]), items[0][2])
    #         break

    def test(self):
        with torch.no_grad():
            id2vocab = {self.vocab[i]: i for i in self.vocab}
            print(len(id2vocab))
            f = open('./result/test_tag.json', 'w')
            # rows: component, disease&symptom, people; columns: recall, precision, f1
            total_matrix = np.zeros([len(self.tags), 3])
            count = 0
            for batch in self.dev_manager.get_batch():
                count += 1
                print(count)
                sentences, labels, length = zip(*batch)
                # sentences, labels, length = zip(*self.dev_batch.__next__())
                strs = [[id2vocab[w] for w in s] for s in sentences]
                # print(strs)
                # print(len(sentences), len(sentences[0]), len(sentences[5]))
                _, paths = self.model(sentences)
                # print('path', len(paths), len(paths[0]), len(paths[1]))
                for i in range(len(self.tags)):
                    recall, precision, f1 = f1_score(labels, paths, self.tags[i], self.model.tag_map)
                    total_matrix[i][0] += recall
                    total_matrix[i][1] += precision
                    total_matrix[i][2] += f1
                entities = []
                for i in range(len(paths)):
                    tmp = []
                    for tag in self.tags:
                        tags = get_tags(paths[i], tag, self.tag_map)
                        tmp += format_result(tags, strs[i], tag)
                    entities.append(tmp)
                for i in range(len(entities)):
                    dic = {'sentence': ''.join(strs[i]), 'entities': entities[i]}
                    json.dump(dic, f, ensure_ascii=False)
                    # f.write(''.join(strs[i]) + '#####entities found#####' + '&'.join(entities[i]) + '\n')
            # average the accumulated metrics over the number of batches
            total_matrix /= count
            # print(total_matrix)
            for i in range(len(self.tags)):
                print("{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}".format(
                    self.tags[i], count, total_matrix[i][0], total_matrix[i][1], total_matrix[i][2]))
            f.close()
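# Usage sketch (illustrative; "demo" is a hypothetical file name): the three entry
# modes of this variant. Each line stands alone; pick the one you need.
if __name__ == "__main__":
    ChineseNER("train").train()              # train and checkpoint to models/params.pkl
    # ChineseNER("test").test()              # average per-tag recall/precision/f1 over test batches
    # ChineseNER("predict").predict("demo")  # tag ./data/demo.txt -> ./result/tag_demo.json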
# Fragment from a separate evaluation script: reload each stored model and keep
# the best result by macro F-score.
# sentence_zero_inithidden=parameters['sentence_zero_inithidden'], attention=None, num_layers=parameters['num_layers'], dropout=parameters['dropout'])
model = BiLSTMCRF(word_embedding_dimension,
                  number_class,
                  hidden_size=parameters['hidden_size'],
                  sentence_embedding_type=parameters['sentence_embedding_type'],
                  sentence_zero_inithidden=parameters['sentence_zero_inithidden'],
                  attention=None,
                  crf_decode_method=parameters['crf_decode_method'],
                  loss_function=parameters['loss_function'],
                  num_layers=parameters['num_layers'],
                  dropout=parameters['dropout'])
if use_cuda:
    model = model.cuda()
model.load_state_dict(stored_model_list[i])

print('Evaluate on all situation entity')
print('----------------------------------------------------')
best_macro_Fscore, best_result = evaluate(
    model, (test_X, test_X_eos_list, test_X_connective_position_list), test_Y)
each_iteration_result_list.append(best_result)
each_iteration_macro_Fscore_list.append(best_macro_Fscore)

if best_macro_Fscore > overall_best_macro:
    overall_best_macro = best_macro_Fscore
    overall_best_result = best_result
print('--------------------------------------------------------------------------')
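# Minimal sketch of the macro F-score that drives the model selection above
# (an assumed definition for illustration, not the evaluate() implementation):
# macro-F1 is the unweighted mean of per-class F1, so rare classes weigh as much
# as common ones.
import numpy as np

def macro_f1(per_class_precision, per_class_recall):
    p = np.asarray(per_class_precision, dtype=float)
    r = np.asarray(per_class_recall, dtype=float)
    denom = p + r
    safe = np.where(denom > 0, denom, 1.0)  # avoid divide-by-zero warnings
    # classes with precision + recall == 0 contribute an F1 of 0
    f1 = np.where(denom > 0, 2 * p * r / safe, 0.0)
    return f1.mean()

# e.g. macro_f1([0.8, 0.5], [0.6, 0.4]) ≈ (0.686 + 0.444) / 2 ≈ 0.565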