class ChineseNER(object):
    """BiLSTM-CRF named-entity recogniser with train / test / predict modes.

    The constructor loads ``models/config.yml`` and then prepares whatever
    ``entry`` needs:

    * ``"train"``   - training data, a dev-batch iterator and a new model;
    * ``"test"``    - the test data manager and a model built from the
      parameters saved during training;
    * ``"predict"`` - only a model built from the saved parameters.

    Any other ``entry`` value leaves the instance without a model.
    """

    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    @staticmethod
    def _default_config():
        """Return a fresh copy of the configuration used when none can be read."""
        return {
            "embedding_size": 100,
            "hidden_size": 128,
            "batch_size": 20,
            "dropout": 0.5,
            "model_path": "models/",
            # Edit the entity tags here.
            "tags": ["component", "disease&symptom", "people"],
            "use_gpu": True,
        }

    def __init_model(self, entry):
        """Create the data managers and the model required by ``entry``."""
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            # Persist what the test/predict entries need to rebuild the model.
            self.save_params({
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            })
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            self._place_model(announce=True)
            self.restore_model()
        elif entry == 'test':
            self.dev_manager = DataManager(batch_size=30, data_type="test")
            self._load_inference_model(announce=True)
        elif entry == "predict":
            self._load_inference_model(announce=False)

    def _place_model(self, announce):
        """Move the model to the GPU when configured; optionally print the flag."""
        if self.use_gpu:
            if announce:
                print('True')
            self.model = self.model.cuda()
        elif announce:
            print('False')

    def _load_inference_model(self, announce):
        """Rebuild the model from the saved parameters and restore its weights."""
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               use_gpu=self.use_gpu)
        self._place_model(announce)
        self.restore_model()

    def load_config(self):
        """Read ``models/config.yml`` into attributes, writing defaults on failure.

        If the file cannot be read or parsed, the defaults are written to
        ``models/config.yml`` and used for this run.
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load: plain yaml.load() without a Loader raises on
                # current PyYAML, which used to trigger the fallback below and
                # overwrite a perfectly valid config file.
                config = yaml.safe_load(fopen)
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = self._default_config()
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")
        self.use_gpu = config.get("use_gpu")

    def restore_model(self):
        """Load ``params.pkl`` from ``model_path`` into the model, best effort.

        A failure is reported on stdout and otherwise ignored, so a first
        training run can start without a checkpoint.
        """
        try:
            # Load onto the CPU first so a checkpoint written on a GPU machine
            # can still be restored on a CPU-only one; load_state_dict copies
            # the values into the model's existing parameters.
            state = torch.load(self.model_path + "params.pkl",
                               map_location="cpu")
            self.model.load_state_dict(state)
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` to ``models/data.pkl``."""
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the object pickled in ``models/data.pkl``.

        NOTE: pickle can execute arbitrary code; only load files this program
        wrote itself.
        """
        with open("models/data.pkl", "rb") as fopen:
            return pickle.load(fopen)

    def train(self):
        """Run 100 epochs of Adam training, saving the weights after every batch.

        The dev set is evaluated every tenth batch.
        """
        optimizer = optim.Adam(self.model.parameters())
        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                # Original length of every sentence in the batch.
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                if index % 10 == 0:
                    self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        """Score the next dev batch; ``f1_score`` is called once per tag."""
        was_training = self.model.training
        # Disable dropout while scoring, then restore the previous mode.
        self.model.eval()
        try:
            with torch.no_grad():
                sentences, labels, length = zip(*next(self.dev_batch))
                _, paths = self.model(sentences)
                print("\teval")
                for tag in self.tags:
                    f1_score(labels, paths, tag, self.model.tag_map)
        finally:
            self.model.train(was_training)

    def predict(self, path):
        """Tag every sentence of ``./data/<path>.txt``.

        Each input line is split on the full stop '。'. One JSON object per
        non-empty sentence is written to ``./result/tag_<path>.json``; the
        objects are written back to back without a separator.
        """
        sentences = []
        with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f:
            for line in f:
                sentences += line.strip().split('。')

        self.model.eval()
        with open('./result/tag_' + path + '.json', 'w',
                  encoding='utf-8') as out, torch.no_grad():
            for input_str in sentences:
                if not input_str:
                    # Splitting leaves empty segments (blank lines, trailing
                    # '。'); an empty tensor cannot be reshaped to (1, -1).
                    continue
                input_vec = [self.vocab.get(ch, 0) for ch in input_str]
                batch = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
                _, paths = self.model(batch)

                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, input_str, tag)
                dic = {'sentense': input_str, 'entities': entities}
                json.dump(dic, out, ensure_ascii=False)

    def test(self):
        """Tag the whole test set and print per-tag average scores.

        The tagged sentences are written to ``./result/test_tag.json``. The
        printed recall / precision / f1 are the means over batches of the
        values returned by ``f1_score``.
        """
        self.model.eval()
        id2vocab = {self.vocab[i]: i for i in self.vocab}
        print(len(id2vocab))
        # One row per tag; the columns are recall, precision, f1.
        total_matrix = np.zeros([len(self.tags), 3])
        count = 0
        with open('./result/test_tag.json', 'w',
                  encoding='utf-8') as out, torch.no_grad():
            for batch in self.dev_manager.get_batch():
                count += 1
                print(count)
                sentences, labels, length = zip(*batch)
                strs = [[id2vocab[w] for w in s] for s in sentences]
                _, paths = self.model(sentences)

                for row, tag in enumerate(self.tags):
                    recall, precision, f1 = f1_score(labels, paths, tag,
                                                     self.model.tag_map)
                    total_matrix[row][0] += recall
                    total_matrix[row][1] += precision
                    total_matrix[row][2] += f1

                for i in range(len(paths)):
                    found = []
                    for tag in self.tags:
                        tags = get_tags(paths[i], tag, self.tag_map)
                        found += format_result(tags, strs[i], tag)
                    dic = {'sentense': ''.join(strs[i]), 'entities': found}
                    json.dump(dic, out, ensure_ascii=False)

        if count:
            # Skip the division when there were no batches (avoids NaNs).
            total_matrix /= count
        for row, tag in enumerate(self.tags):
            print(
                "{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}"
                .format(count, tag, total_matrix[row][0],
                        total_matrix[row][1], total_matrix[row][2]))
class ChineseNER(object):
    """BiLSTM-CRF named-entity recogniser with train and predict modes.

    The constructor loads ``models/config.yml`` and then prepares whatever
    ``entry`` needs: ``"train"`` builds the data managers and a new model,
    ``"predict"`` rebuilds the model from the parameters saved during
    training. Any other value leaves the instance without a model.
    """

    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    @staticmethod
    def _default_config():
        """Return a fresh copy of the configuration used when none can be read."""
        return {
            "embedding_size": 100,
            "hidden_size": 128,
            "batch_size": 20,
            "dropout": 0.5,
            "model_path": "models/",
            # This key was previously misspelt "tasg", which left self.tags
            # as None whenever the defaults were used.
            "tags": ["ORG", "PER"],
        }

    def __init_model(self, entry):
        """Create the data managers and the model required by ``entry``."""
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            # Persist what the predict entry needs to rebuild the model.
            self.save_params({
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            })
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        """Read ``models/config.yml`` into attributes, writing defaults on failure.

        If the file cannot be read or parsed, the defaults are written to
        ``models/config.yml`` and used for this run.
        """
        try:
            with open("models/config.yml", encoding='UTF-8') as fopen:
                config = yaml.load(fopen, Loader=yaml.FullLoader)
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = self._default_config()
            with open("models/config.yml", "w", encoding='UTF-8') as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        # Fall back to the misspelt key so config files written by earlier
        # versions of this class keep working.
        self.tags = config.get("tags") or config.get("tasg")
        self.dropout = config.get("dropout")

    def restore_model(self):
        """Load ``params.pkl`` from ``model_path`` into the model, best effort.

        A failure is reported on stdout and otherwise ignored, so a first
        training run can start without a checkpoint.
        """
        try:
            # Load onto the CPU so a checkpoint written on a GPU machine can
            # be restored anywhere; load_state_dict copies the values into the
            # model's existing parameters.
            state = torch.load(self.model_path + "params.pkl",
                               map_location="cpu")
            self.model.load_state_dict(state)
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` to ``models/data.pkl``."""
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the object pickled in ``models/data.pkl``.

        NOTE: pickle can execute arbitrary code; only load files this program
        wrote itself.
        """
        with open("models/data.pkl", "rb") as fopen:
            return pickle.load(fopen)

    def train(self):
        """Run 100 epochs of Adam training.

        After every batch the dev set is scored and the weights are saved.
        """
        optimizer = optim.Adam(self.model.parameters())
        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        """Score the next dev batch; ``f1_score`` is called once per tag."""
        was_training = self.model.training
        # Score without dropout and without building an autograd graph, then
        # put the model back into whatever mode it was in.
        self.model.eval()
        try:
            with torch.no_grad():
                sentences, labels, length = zip(*next(self.dev_batch))
                _, paths = self.model(sentences)
                print("\teval")
                for tag in self.tags:
                    f1_score(labels, paths, tag, self.model.tag_map)
        finally:
            self.model.train(was_training)

    def predict(self, input_str=""):
        """Return the entities found in ``input_str``.

        When ``input_str`` is empty the text is read from stdin; if it is
        still empty an empty list is returned.
        """
        if not input_str:
            input_str = input("请输入文本: ")
        if not input_str:
            # An empty tensor cannot be reshaped to (1, -1).
            return []
        input_vec = [self.vocab.get(ch, 0) for ch in input_str]
        # Convert to a (1, sequence_length) tensor.
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        self.model.eval()
        with torch.no_grad():
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
class ChineseNER:
    """BiLSTM-CRF recogniser that feeds both character and word inputs.

    Hyper-parameters come from the module-level ``load_config()``. The
    constructor then prepares whatever ``entry`` needs: ``"train"`` builds the
    data managers and a new model, ``"predict"`` rebuilds the model from the
    saved parameters. Any other value leaves the instance without a model.
    """

    def __init__(self, entry="train"):
        # Load hyper-parameters.
        config = load_config()
        self.model_path = config.get("model_path")
        self.epochs = config.get("epochs")
        self.batch_size = config.get("batch_size")
        self.learning_rate = config.get("learning_rate")
        self.weight_decay = config.get("weight_decay")
        self.dropout = config.get("dropout")
        self.hidden_size = config.get("hidden_size")
        self.char_num = config.get("char_num")
        self.char_dim = config.get("char_dim")
        self.word_dim = config.get("word_dim")
        self.word_num = config.get("word_num")
        self.tags = config.get("tags")
        self.transfer_learning = config.get("transfer_learning")
        self.lr_decay_step = config.get("lr_decay_step")
        self.lr_decay_rate = config.get("lr_decay_rate")

        # Build the model for the requested entry point.
        self.main_model(entry)

    def main_model(self, entry):
        """Create the data managers and the model required by ``entry``."""
        if entry == "train":
            # Training data.
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             data_type='train',
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Write data.pkl only when an existing one cannot be loaded.
            try:
                load_params(path=self.model_path)
                print("Successfully load the data.pkl!!!")
            except Exception:
                print("There was no data.pkl!! Start to save........")
                # Character vocabulary and the other values the predict
                # entry needs to rebuild the model.
                saved_data = {
                    "batch_size": self.train_manager.batch_size,
                    "input_size": self.train_manager.input_size,
                    "char_vocab": self.train_manager.char_vocab,
                    "tag_map": self.train_manager.tag_map,
                }
                save_params(data=saved_data, path=self.model_path)

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.char_vocab),
                dropout=self.dropout,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )
            # Resume from an existing checkpoint if there is one.
            self.restore_model()

            # Dev data: the value returned by load_char_data() is used as the
            # batch size, so each dev batch has that many samples.
            self.dev_size = DataManager(batch_size=1,
                                        data_type="dev",
                                        tags=self.tags).load_char_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev")
            self.dev_batch = self.dev_manager.iteration()
        elif entry == "predict":
            data = load_params(path=self.model_path)
            input_size = data.get("input_size")
            self.tag_map = data.get("tag_map")
            self.vocab = data.get("char_vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                dropout=0.0,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()

    def train(self):
        """Train for ``self.epochs`` epochs, evaluating after every epoch.

        With ``transfer_learning`` enabled only the parameters named in
        ``keep_grad`` are trained; all others are frozen. The weights are
        saved after every batch.
        """
        keep_grad = {
            "transitions", "char_embedding.weight", "char_linear_lstm.weight",
            "char_linear_lstm.bias", "word_linear_lstm.weight",
            "word_linear_lstm.bias", "hidden2tag.weight", "hidden2tag.bias"
        }
        freeze_others = bool(self.transfer_learning)
        for name, value in self.model.named_parameters():
            value.requires_grad = (not freeze_others) or name in keep_grad

        # Adam over the trainable parameters only.
        optimizer = optim.Adam(
            params=[p for p in self.model.parameters() if p.requires_grad],
            lr=self.learning_rate,
            weight_decay=self.weight_decay)

        # Print the model architecture.
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        # Print every parameter with its size and trainable flag.
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in self.model.named_parameters():
            print("Name: \033[1;31m{0}\033[0m, "
                  "Parameter Size: \033[1;36m{1}\033[0m, "
                  "Gradient: \033[1;35m{2}\033[0m".format(
                      name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs + 1):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                # Clear gradients left over from the previous batch.
                self.model.zero_grad()

                chars, labels, words, len_word, len_char = zip(*batch)
                chars_tensor = torch.tensor(chars, dtype=torch.long,
                                            device=device)
                labels_tensor = torch.tensor(labels, dtype=torch.float,
                                             device=device)
                words_tensor = torch.tensor(words, dtype=torch.float,
                                            device=device)
                len_word_tensor = torch.tensor(len_word, dtype=torch.int64,
                                               device=device)
                len_char_tensor = torch.tensor(len_char, dtype=torch.int64,
                                               device=device)

                loss = self.model.neg_log_likelihood(
                    characters=chars_tensor,
                    tags=labels_tensor,
                    len_char=len_char_tensor,
                    words=words_tensor,
                    len_word=len_word_tensor)
                progress = ("█" * int(index * 40 / self.total_size)).ljust(40)
                print("epoch [{}] |{}| {}/{}\t Batch Loss {:.6f}".format(
                    epoch, progress, index, self.total_size,
                    loss.tolist()[0]))

                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')
            self.evaluate()

    def evaluate(self):
        """Print the loss and several F1 summaries for the next dev batch."""
        was_training = self.model.training
        # Score without dropout and without building an autograd graph, then
        # put the model back into whatever mode it was in.
        self.model.eval()
        try:
            with torch.no_grad():
                self._evaluate_dev_batch()
        finally:
            self.model.train(was_training)

    def _evaluate_dev_batch(self):
        """Run the forward pass on one dev batch and print every metric."""
        chars, labels, words, len_words, len_chars = zip(
            *next(self.dev_batch))
        chars_tensor = torch.tensor(chars, dtype=torch.long, device=device)
        words_tensor = torch.tensor(words, dtype=torch.float, device=device)
        len_word_tensor = torch.tensor(len_words, dtype=torch.int64,
                                       device=device)
        len_char_tensor = torch.tensor(len_chars, dtype=torch.int64,
                                       device=device)

        # Forward pass: ``pre`` holds the predicted tag paths.
        _, pre = self.model(characters=chars_tensor,
                            len_char=len_char_tensor,
                            words=words_tensor,
                            len_word=len_word_tensor)

        # Loss on the dev set, measured against the gold labels (built the
        # same way as in train()). It was previously computed against the
        # model's own predictions, which says nothing about the dev set.
        labels_tensor = torch.tensor(labels, dtype=torch.float, device=device)
        loss = self.model.neg_log_likelihood(characters=chars_tensor,
                                             tags=labels_tensor,
                                             len_char=len_char_tensor,
                                             words=words_tensor,
                                             len_word=len_word_tensor)
        print("\t Evaluation Loss on the dev set{:.6f}".format(
            loss.tolist()[0]))

        print('Start to evaluate on the dev set: ')
        tag_map = self.model.tag_map

        # Tag-level F1, macro-averaged over the tags.
        tag_f1_total = [
            tag_f1(tar_path=labels, pre_path=pre, tag=tag, tag_map=tag_map)[2]
            for tag in self.tags
        ]
        tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level F1, micro-averaged.
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        # F1 for every (tag, label prefix) pair, macro-averaged.
        f1_prefix_total = [
            entity_label_f1(tar_path=labels,
                            pre_path=pre,
                            length=len_chars,
                            tag=tag,
                            tag_map=tag_map,
                            prefix=prefix)[2]
            for tag in self.tags
            for prefix in ('B', 'I', 'E', 'S')
        ]
        f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        # F1 per label prefix, macro-averaged.
        f1_prefix_total = [
            label_f1(tar_path=labels,
                     pre_path=pre,
                     length=len_chars,
                     tags=self.tags,
                     tag_map=tag_map,
                     prefix=prefix)[2]
            for prefix in ('B', 'I', 'E', 'S', 'O')
        ]
        f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """Read sentences from stdin in a loop and print their entities.

        The loop ends when stdin is exhausted.
        """
        pre_trained = self.load_word_vector()
        self.model.eval()
        while True:
            try:
                input_str = input("Please input a sentence in Chinese: ")
            except EOFError:
                break
            input_str = stringQ2B(input_str)

            # Character ids, padded / truncated to char_num.
            char_ids = [self.vocab.get(ch, 0) for ch in input_str]
            len_char = torch.tensor([len(char_ids)], dtype=torch.int64,
                                    device=device)
            char_tensor = torch.tensor(self.pad_char_data(char_ids).tolist(),
                                       dtype=torch.long,
                                       device=device)

            # Word vectors: pre-trained when known, random otherwise.
            embed_words = []
            for word in jieba.lcut(input_str, HMM=True):
                vec = pre_trained.get(word)
                if vec is None:
                    vec = np.random.normal(size=self.word_dim).tolist()
                embed_words.append(vec)
            word_rows = np.array(self.pad_word_data(embed_words)).tolist()
            # NOTE(review): this is the length of the padded, flattened
            # array's outer dimension (always 1), not the number of words.
            # Kept as in the original; confirm what the model expects.
            len_word = torch.tensor([len(word_rows)], dtype=torch.int64,
                                    device=device)
            word_tensor = torch.tensor(word_rows, dtype=torch.float,
                                       device=device)

            with torch.no_grad():
                _, paths = self.model(characters=char_tensor,
                                      len_char=len_char,
                                      words=word_tensor,
                                      len_word=len_word)

            entities = []
            for tag in self.tags:
                tags = get_tags(path=paths[0], tag=tag, tag_map=self.tag_map)
                entities += format_result(result=tags, text=input_str, tag=tag)
            print(entities)

    def load_word_vector(self):
        """Return a ``{word: float32 vector}`` dict of pre-trained embeddings.

        A module-level ``pre_trained`` is returned if one exists. Otherwise
        ``<model_path>word_vectors.vec`` is parsed, keeping only lines that
        hold a word followed by exactly ``word_dim`` numbers.
        """
        if 'pre_trained' in globals():
            return globals().get("pre_trained")

        print("Start to load pre-trained word embeddings!!")
        pre_trained = {}
        with codecs.open(self.model_path + "word_vectors.vec", 'r',
                         encoding='utf-8') as vec_file:
            for line in vec_file:
                fields = line.rstrip().split()
                if len(fields) == self.word_dim + 1:
                    pre_trained[fields[0]] = np.array(
                        [float(x) for x in fields[1:]]).astype(np.float32)
        return pre_trained

    def pad_char_data(self, data: list):
        """Pad with zeros or truncate ``data`` to ``char_num`` ids.

        Returns an array of shape ``(1, char_num)``; ``data`` is not modified.
        """
        ids = copy.deepcopy(data)
        if len(ids) < self.char_num:
            ids = ids + (self.char_num - len(ids)) * [0]
        else:
            ids = ids[:self.char_num]
        return np.expand_dims(ids, axis=0)

    def pad_word_data(self, data: list):
        """Pad with zero vectors or truncate ``data`` to ``word_num`` rows.

        Returns the rows flattened into an array of shape
        ``(1, word_num * word_dim)``; ``data`` is not modified.
        """
        rows = copy.deepcopy(data)
        if len(rows) <= self.word_num:
            rows = rows + (self.word_num - len(rows)) * [[0] * self.word_dim]
        else:
            # ``data`` is a Python list, so it must be sliced along the first
            # axis only (``rows[:n, :]`` raises TypeError on a list).
            rows = rows[:self.word_num]
        flat = np.reshape(rows, [np.shape(rows)[0] * np.shape(rows)[1]])
        return np.expand_dims(flat, axis=0)

    def restore_model(self):
        """Load ``params.pkl`` from ``model_path`` into the model, best effort.

        A failure is reported on stdout together with its cause and is
        otherwise ignored.
        """
        try:
            # Load onto the CPU so a checkpoint written on another device can
            # be restored anywhere; load_state_dict copies the values into the
            # model's existing parameters.
            state = torch.load(self.model_path + "params.pkl",
                               map_location="cpu")
            self.model.load_state_dict(state)
            print("Model Successfully Restored!!")
        except Exception as error:
            print("Model Failed to restore!! {}".format(error))
class ChineseNER(object):
    """BiLSTM-CRF named-entity recogniser with train and predict modes.

    The constructor loads ``models/config.yml`` and then prepares whatever
    ``entry`` needs: ``"train"`` builds the data managers and a new model,
    ``"predict"`` rebuilds the model from the parameters saved during
    training. Any other value leaves the instance without a model.
    """

    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    @staticmethod
    def _default_config():
        """Return a fresh copy of the configuration used when none can be read."""
        return {
            "embedding_size": 100,
            "hidden_size": 128,
            "batch_size": 20,
            "dropout": 0.5,
            "model_path": "models/",
            # This key was previously misspelt "tasg", which left self.tags
            # as None whenever the defaults were used.
            "tags": ["ORG", "PER"],
        }

    def __init_model(self, entry):
        """Create the data managers and the model required by ``entry``."""
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            # Persist what the predict entry needs to rebuild the model.
            self.save_params({
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            })
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def parse_argument(self):
        """Parse the command line and return the resulting configuration.

        The parsed options are copied onto a ``configurable.Configurable``
        built from ``--config``. ``--test`` switches ``train`` off. An
        unsupported ``--t_data`` value prints the usage and exits.

        :return: the populated configuration object
        """
        parser = argparse.ArgumentParser(description="NER")
        parser.add_argument("-c", "--config", dest="config_file", type=str,
                            default="./Config/config.cfg", help="config path")
        parser.add_argument("-device", "--device", dest="device", type=str,
                            default="cuda:0",
                            help="device[‘cpu’,‘cuda:0’,‘cuda:1’,......]")
        # NOTE: store_true combined with default=True means --train and
        # --process cannot be switched off from the command line.
        parser.add_argument("--train", dest="train", action="store_true",
                            default=True, help="train model")
        parser.add_argument("-p", "--process", dest="process",
                            action="store_true", default=True,
                            help="data process")
        parser.add_argument("-t", "--test", dest="test", action="store_true",
                            default=False, help="test model")
        parser.add_argument("--t_model", dest="t_model", type=str,
                            default=None, help="model for test")
        parser.add_argument("--t_data", dest="t_data", type=str, default=None,
                            help="data[train, dev, test, None] for test model")
        parser.add_argument("--predict", dest="predict", action="store_true",
                            default=False, help="predict model")
        args = parser.parse_args()

        config = configurable.Configurable(config_file=args.config_file)
        config.device = args.device
        config.train = args.train
        config.process = args.process
        config.test = args.test
        config.t_model = args.t_model
        config.t_data = args.t_data
        config.predict = args.predict

        if config.test is True:
            config.train = False
        if config.t_data not in [None, "train", "dev", "test"]:
            print("\nUsage")
            parser.print_help()
            print("t_data : {}, not in [None, 'train', 'dev', 'test']".format(
                config.t_data))
            # Same effect as exit(), without relying on the site module.
            raise SystemExit

        print("***************************************")
        print("Device : {}".format(config.device))
        print("Data Process : {}".format(config.process))
        print("Train model : {}".format(config.train))
        print("Test model : {}".format(config.test))
        print("t_model : {}".format(config.t_model))
        print("t_data : {}".format(config.t_data))
        print("predict : {}".format(config.predict))
        print("***************************************")
        return config

    def load_config(self):
        """Read ``models/config.yml`` into attributes, writing defaults on failure.

        If the file cannot be read or parsed, the defaults are written to
        ``models/config.yml`` and used for this run.
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load: plain yaml.load() without a Loader raises on
                # current PyYAML, which used to trigger the fallback below and
                # overwrite a perfectly valid config file.
                config = yaml.safe_load(fopen)
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = self._default_config()
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        # Fall back to the misspelt key so config files written by earlier
        # versions of this class keep working.
        self.tags = config.get("tags") or config.get("tasg")
        self.dropout = config.get("dropout")

    def restore_model(self):
        """Load ``params.pkl`` from ``model_path`` into the model, best effort.

        A failure is reported on stdout and otherwise ignored, so a first
        training run can start without a checkpoint.
        """
        try:
            # Load onto the CPU so a checkpoint written on a GPU machine can
            # be restored anywhere; load_state_dict copies the values into the
            # model's existing parameters.
            state = torch.load(self.model_path + "params.pkl",
                               map_location="cpu")
            self.model.load_state_dict(state)
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` to ``models/data.pkl``."""
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the object pickled in ``models/data.pkl``.

        NOTE: pickle can execute arbitrary code; only load files this program
        wrote itself.
        """
        with open("models/data.pkl", "rb") as fopen:
            return pickle.load(fopen)

    def train(self):
        """Run 100 epochs of Adam training.

        After every batch the dev set is scored and the weights are saved.
        """
        optimizer = optim.Adam(self.model.parameters())
        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        """Score the next dev batch; ``f1_score`` is called once per tag."""
        was_training = self.model.training
        # Score without dropout and without building an autograd graph, then
        # put the model back into whatever mode it was in.
        self.model.eval()
        try:
            with torch.no_grad():
                sentences, labels, length = zip(*next(self.dev_batch))
                _, paths = self.model(sentences)
                print("\teval")
                for tag in self.tags:
                    f1_score(labels, paths, tag, self.model.tag_map)
        finally:
            self.model.train(was_training)

    def predict(self, input_str=""):
        """Return the entities found in ``input_str``.

        When ``input_str`` is empty the text is read from stdin; if it is
        still empty an empty list is returned.
        """
        if not input_str:
            input_str = input("请输入文本: ")
        if not input_str:
            # An empty tensor cannot be reshaped to (1, -1).
            return []
        input_vec = [self.vocab.get(ch, 0) for ch in input_str]
        # Convert to a (1, sequence_length) tensor.
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        self.model.eval()
        with torch.no_grad():
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
class ChineseNER(object):
    """Driver for training, evaluating and running a BiLSTM-CRF tagger.

    The constructor reads ``models/config.yml`` and then prepares the data
    managers and the model for the requested ``entry`` mode. This variant
    always moves the model and the input tensors to CUDA.
    """

    def __init__(self, entry="train"):
        """Load the configuration and build the model for ``entry``.

        Args:
            entry: ``"train"``, ``"predict"`` or ``"evaluate"``.
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """Create the data managers and the model for the given mode.

        Args:
            entry: ``"train"`` builds the training/dev managers, stores the
                vocabulary and tag map, and creates a fresh model;
                ``"predict"`` / ``"evaluate"`` rebuild the model from the
                stored vocabulary and tag map.

        Raises:
            ValueError: if ``entry`` is not one of the supported modes.
        """
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            # Validation set.
            self.dev_manager = DataManager(batch_size=60, data_type="dev")
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        elif entry in ("predict", "evaluate"):
            # The previous condition (`entry == "predict" or "evaluate"`)
            # was always true because a non-empty string is truthy.
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()
        else:
            raise ValueError(
                "unknown entry {!r}; expected 'train', 'predict' or "
                "'evaluate'".format(entry))

    def load_config(self):
        """Read hyper-parameters from ``models/config.yml``.

        If the file cannot be read or does not contain a mapping, a default
        configuration is written to that path and used instead (best-effort
        fallback kept from the original implementation).
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load: `yaml.load` without a Loader raises on
                # PyYAML >= 6, which used to trigger the fallback below and
                # silently overwrite a valid user configuration.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 300,
                "hidden_size": 128,
                "batch_size": 30,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["TREATMENT", "BODY", "SIGNS", "CHECK", "DISEASE"]
            }
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        """Load weights from ``<model_path>params_6all.pkl`` if possible.

        Any failure (for example no checkpoint yet) is reported on stdout
        and otherwise ignored, so the model keeps its current weights.
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params_6all.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` (vocabulary, tag map, sizes) to disk."""
        with open("models/data_6all.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the dictionary previously stored by ``save_params``.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file;
        only use it on files produced by this program.
        """
        with open("models/data_6all.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Run the training loop and keep the checkpoint with the best loss.

        Epoch range, starting ``best_loss`` and optimizer settings are the
        hard-coded values of the original code (they look like a resumed
        run — TODO confirm). Metrics are sent to the module-level
        ``writer``.
        """
        optimizer = optim.Adam(self.model.parameters(),
                               weight_decay=0.002,
                               lr=0.0000004)
        scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                            mode='min',
                                                            factor=0.5,
                                                            patience=2,
                                                            cooldown=5,
                                                            verbose=True,
                                                            min_lr=1e-8,
                                                            eps=1e-8)
        best_loss = 240
        # Loss seen at each batch position during the previous epoch; it is
        # printed next to the current loss for comparison.
        lossList = [0] * self.total_size
        for epoch in range(268, 401):
            losses = []
            index = 0
            startTime = time.process_time()
            for batch in self.train_manager.get_batch():
                start = time.process_time()
                index += 1
                self.model.zero_grad()
                # `length` holds the original (unpadded) sentence lengths.
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long).cuda()
                tags_tensor = torch.tensor(tags, dtype=torch.long).cuda()
                length_tensor = torch.tensor(length, dtype=torch.long).cuda()

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                # Read the scalar once; `.item()` works for 0-d and
                # one-element tensors, unlike `.tolist()[0]`.
                loss_value = loss.item()
                losses.append(loss_value)
                progress = ("█" * int(index * 60 / self.total_size)).ljust(60)
                loss.backward()
                optimizer.step()
                dur = time.process_time() - start
                print(
                    """epoch [{}] |{}| {}/{}\n\tloss {:.3f}\t\tlast_loss {:.3f}\t\ttime {}\t\tbest_avg_loss {:.3f}"""
                    .format(epoch, progress, index, self.total_size,
                            loss_value, lossList[index - 1], str(dur),
                            best_loss))
                lossList[index - 1] = loss_value
                print("-" * 90)
            totalTime = time.process_time() - startTime
            avg_loss = np.mean(losses)
            # Keep only the best model seen so far.
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(self.model.state_dict(),
                           self.model_path + 'params_6all.pkl')
            writer.add_scalar('BiLstm_CRF:avg_loss-epoch', avg_loss, epoch)
            print('epoch ', epoch, ' avg_loss ', avg_loss, ' total_time ',
                  totalTime)
            if epoch % 5 == 0:
                # Exact division: epoch is a multiple of 5 here.
                self.evaluate(epoch // 5, manager=self.dev_manager)
                print("-" * 100)
            scheduler_lr.step(avg_loss)
        writer.close()

    @staticmethod
    def _prf(origins, founds, rights):
        """Return (recall, precision, f1) from gold/predicted/correct counts."""
        recall = 0. if origins == 0 else (rights / origins)
        precision = 0. if founds == 0 else (rights / founds)
        if recall + precision == 0:
            f1 = 0.
        else:
            f1 = (2 * precision * recall) / (precision + recall)
        return recall, precision, f1

    def evaluate(self, epoch, manager, add_scalar=True):
        """Compute per-tag and overall recall / precision / F1.

        Args:
            epoch: step value passed to the module-level ``writer``.
            manager: data manager whose ``get_batch()`` yields the batches
                to score.
            add_scalar: when true, the metrics are also sent to ``writer``.

        Returns:
            Tuple ``(all_recall, all_precision, all_f1)`` over all tags.
        """
        print('正在开始评估')
        # Per-tag [origins, founds, rights] counters.
        counts = {tag: [0, 0, 0] for tag in self.tags}
        was_training = self.model.training
        # Score with dropout disabled and without building autograd graphs;
        # the previous training mode is restored afterwards.
        self.model.eval()
        try:
            with torch.no_grad():
                for batch in manager.get_batch():
                    sentences, labels, length = zip(*batch)
                    # One forward pass per batch, shared by every tag
                    # (previously the model was re-run once per tag).
                    _, paths = self.model(sentences)
                    for tag in self.tags:
                        origin, found, right = f1_score(
                            labels, paths, tag, self.model.tag_map)
                        tag_counts = counts[tag]
                        tag_counts[0] += origin
                        tag_counts[1] += found
                        tag_counts[2] += right
        finally:
            self.model.train(was_training)

        all_origins = all_founds = all_rights = 0
        for tag in self.tags:
            origins, founds, rights = counts[tag]
            all_origins += origins
            all_founds += founds
            all_rights += rights
            recall, precision, f1 = self._prf(origins, founds, rights)
            print("\t{}\torigins:{}\t\t\tfounds:{}\t\t\trights:{}".format(
                tag, origins, founds, rights))
            print("\t\t\trecall:{}\tprecision:{}\tf1:{}".format(
                recall, precision, f1))
            if add_scalar:
                tag_epoch = tag + '-5epoch'
                writer.add_scalars(tag_epoch, {
                    'recall': recall,
                    'precision': precision,
                    'f1': f1
                }, epoch)
        all_recall, all_precision, all_f1 = self._prf(all_origins, all_founds,
                                                      all_rights)
        print("\tall_origins:{}\t\t\tall_founds:{}\t\t\tall_rights:{}".format(
            all_origins, all_founds, all_rights))
        print("\tall_recall:{}\tall_precision:{}\tall_f1:{}".format(
            all_recall, all_precision, all_f1))
        if add_scalar:
            writer.add_scalars(
                "ALL-5epoch", {
                    'all_recall': all_recall,
                    'all_precision': all_precision,
                    'all_f1': all_f1
                }, epoch)
        print('评估结束')
        return all_recall, all_precision, all_f1

    def predict(self, input_str=""):
        """Extract the entities of one sentence.

        Args:
            input_str: text to tag; when empty the user is prompted.

        Returns:
            List of entities produced by ``format_result`` for every
            configured tag; an empty list when no text is given at all.
        """
        if not input_str:
            input_str = input("请输入文本: ")
        if not input_str:
            # Nothing to tag; avoid feeding an empty tensor to the model.
            return []
        # Vocabulary index of every character; unknown characters map to 0.
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        sentences = sentences.cuda()
        self.model.eval()
        with torch.no_grad():
            # `paths` holds the predicted tag indices, one path per sentence.
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
class BiLSTMCRFEnter(object):
    """Entry point for training a BiLSTM-CRF tagger and predicting with it.

    The constructor reads ``models/config.yml`` and prepares the data
    managers and model for the requested ``entry`` mode.
    """

    # Number of training epochs used when the configuration has no
    # "epoch" entry (the value the earlier hard-coded loop used).
    DEFAULT_EPOCHS = 100

    def __init__(self, entry="train"):
        """Load the configuration, then build the model for ``entry``.

        Args:
            entry: ``"train"`` or ``"predict"``.
        """
        # Initialise the hyper-parameters from the configuration file.
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        """Prepare data managers and the model for the given mode."""
        if entry == "train":
            # Manager object for the training data set.
            print(self.tags)
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            print(self.train_manager.batch_data)
            print(len(self.train_manager.batch_data))
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            # Persist vocabulary / tag map so prediction can rebuild the
            # model later.
            self.save_params(data)
            # Manager object for the validation data set; `dev_batch` is the
            # iterator consumed by `evaluate`.
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            # BiLSTM encodes the sentence, the CRF layer constrains the tag
            # sequence.
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            # Resume from an existing checkpoint when there is one.
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size
            )
            self.restore_model()

    def load_config(self):
        """Read hyper-parameters from ``models/config.yml``.

        If the file cannot be read or does not contain a mapping, a default
        configuration is written to that path and used instead.
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load: `yaml.load` without a Loader raises on
                # PyYAML >= 6, which used to trigger the fallback below and
                # overwrite a valid configuration file.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            # Default values, also written back to config.yml.
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 50,
                "dropout": 0.5,
                "model_path": "models/",
                "epoch": self.DEFAULT_EPOCHS,
                "tags": ["Medicinal_Name", "Medicinal_Other_Name",
                         "Medicinal_Function", "Medicinal_Taste",
                         "Medicinal_Use_Num"]
            }
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        # Dimension of the word embedding.
        self.embedding_size = config.get("embedding_size")
        # Dimension of the hidden layer.
        self.hidden_size = config.get("hidden_size")
        # Number of samples per batch.
        self.batch_size = config.get("batch_size")
        # Directory prefix where the model weights are stored.
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")
        # Total number of training epochs. A missing entry used to yield
        # None and made `range(self.epoch)` fail in `train`.
        self.epoch = config.get("epoch", self.DEFAULT_EPOCHS)

    def restore_model(self):
        """Load weights from ``<model_path>params.pkl`` if possible.

        Failures (for example no checkpoint yet) are printed and ignored.
        """
        try:
            # `load_state_dict` is not defined in this file; presumably it is
            # inherited from torch.nn.Module by BiLSTMCRF — TODO confirm.
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` to ``models/data.pkl``."""
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the dictionary stored by ``save_params``.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file;
        only use it on files produced by this program.
        """
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train for ``self.epoch`` epochs, checkpointing after every batch."""
        optimizer = optim.Adam(self.model.parameters(), lr=0.05)
        for epoch in range(self.epoch):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                # Progress bar.
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                # `.item()` works for 0-d and one-element tensors alike.
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.item()))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        """Score the next validation batch with ``f1_score`` for every tag."""
        sentences, labels, length = zip(*next(self.dev_batch))
        # No gradients are needed for scoring; the pending training graph of
        # the caller is not affected.
        with torch.no_grad():
            _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        """Extract the entities of one sentence.

        Args:
            input_str: text to tag; when empty the user is prompted.

        Returns:
            List of entities produced by ``format_result`` for every
            configured tag.
        """
        if not input_str:
            input_str = input("请输入文本: ")
        # Unknown characters map to index 0.
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        sentences = torch.tensor(input_vec).view(1, -1)
        with torch.no_grad():
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            # `get_tags` (utility module) locates the spans of `tag` in the
            # predicted path.
            tags = get_tags(paths[0], tag, self.tag_map)
            print(tag)
            print(self.tag_map)
            print(paths[0])
            print(tags)
            entities += format_result(tags, input_str, tag)
        return entities

    def predict_file(self, f_r_path, f_w_path):
        """Predict the entities of every line of a TSV file.

        The sentence is taken from the third-from-last tab-separated field
        of each line. Every entity text is written once (first occurrence)
        as ``"<type> : <word>"``; an empty line is written whenever the next
        entity of the same sentence has a different type.

        Args:
            f_r_path: path of the UTF-8 input file.
            f_w_path: path of the output file (opened in append mode).
        """
        # Entity texts already written, to avoid duplicates.
        seen = set()
        with open(f_r_path, encoding='utf-8') as f_r, \
                open(f_w_path, 'ab') as f_w:
            for line in f_r:
                fields = line.split('\t')
                if len(fields) < 3:
                    # Malformed line: there is no third-from-last field.
                    continue
                sent = fields[-3].strip()
                res = self.predict(sent)
                # Visit every entity; the previous `range(len(res) - 1)`
                # never wrote the last entity of a sentence.
                for i, item in enumerate(res):
                    entity = item['word']
                    tag = item["type"]
                    if entity not in seen:
                        seen.add(entity)
                        f_w.write((tag + " : " + entity + '\n').encode())
                    if i + 1 < len(res) and tag != res[i + 1]["type"]:
                        f_w.write('\n'.encode())
class ChineseNER(object):
    """BiLSTM-CRF tagger driver with optional GPU use and CSV prediction.

    The constructor reads ``models/config.yml`` and prepares the data
    managers and model for the requested ``entry`` mode.
    """

    # Set to True to move the model and the input tensors to CUDA.
    use_gpu = False

    def __init__(self, entry="train"):
        """Load the configuration and build the model for ``entry``.

        Args:
            entry: ``"train"`` or ``"predict"``.
        """
        self.load_config()
        self.__init_model(entry)
        print(self.use_gpu)
        if self.use_gpu:
            self.model = self.model.cuda()

    def __init_model(self, entry):
        """Prepare data managers and the model for the given mode."""
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            # Iterator consumed one batch at a time by `evaluate`.
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        """Read hyper-parameters from ``models/config.yml``.

        If the file cannot be read or does not contain a mapping, a default
        configuration is written to that path and used instead.
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load: `yaml.load` without a Loader raises on
                # PyYAML >= 6, which used to trigger the fallback below and
                # overwrite a valid configuration file.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # The key was misspelled "tasg", so `self.tags` ended up
                # None and every loop over the tags crashed.
                "tags": ["ORG", "PER"]
            }
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        """Load weights from ``<model_path>params.pkl`` if possible.

        Failures (for example no checkpoint yet) are printed and ignored.
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        """Pickle ``data`` to ``models/data.pkl``."""
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        """Return the dictionary stored by ``save_params``.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file;
        only use it on files produced by this program.
        """
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train for 100 epochs, checkpointing after every batch."""
        optimizer = optim.Adam(self.model.parameters())
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                if self.use_gpu:
                    loss = loss.cuda()
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                # `.item()` works for 0-d and one-element tensors alike.
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.item()))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def get_string(self, x):
        """Return the second whitespace-separated token of the second line.

        Kept for callers that parse the text form of a one-row, one-column
        DataFrame (header line, then ``"<index>  <value>"``).

        Args:
            x: multi-line string.

        Returns:
            The token following the first one on the second line of ``x``.
        """
        second_line = x.split('\n')[1]
        tokens = [piece for piece in second_line.split(' ') if piece != '']
        return tokens[1]

    def evaluate(self):
        """Score the next validation batch with ``f1_score`` for every tag."""
        sentences, labels, length = zip(*next(self.dev_batch))
        if self.use_gpu:
            sentences = torch.tensor(sentences, dtype=torch.long).cuda()
        # No gradients are needed for scoring; the pending training graph of
        # the caller is not affected.
        with torch.no_grad():
            _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def _extract_entities(self, input_str):
        """Run the model on ``input_str`` and collect entities of all tags."""
        if not input_str:
            # Nothing to tag; avoid feeding an empty tensor to the model.
            return []
        # Unknown characters map to index 0.
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        sentences = torch.tensor(input_vec).view(1, -1)
        if self.use_gpu:
            sentences = sentences.cuda()
        with torch.no_grad():
            _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities

    def predict(self, input_str="", input_path=None):
        """Extract entities from a sentence or from every row of a CSV file.

        Args:
            input_str: text to tag; when empty (and no ``input_path`` is
                given) the user is prompted.
            input_path: optional CSV file with ``id`` and ``Review``
                columns. Every row is tagged and written to ``output.txt``
                as ``"<id>,<type>,<word>"`` lines, ordered by entity start.

        Returns:
            The entity list of ``input_str``; in CSV mode the (sorted)
            entities of the last processed row, or an empty list when the
            file has no rows.
        """
        if input_path is None:
            if not input_str:
                input_str = input("请输入文本: ")
            return self._extract_entities(input_str)

        tests = pd.read_csv(input_path)
        entities = []
        with open('output.txt', 'w', encoding='utf-8') as o:
            # Read the cells directly. The previous code parsed
            # `str(DataFrame)`, which truncates long cells and cut the text
            # at the first space, and it assumed exactly 2234 rows.
            for review, row_id in zip(tests['Review'], tests['id']):
                input_str = str(review)
                index = int(row_id)
                entities = sorted(self._extract_entities(input_str),
                                  key=lambda x: x['start'])
                for entity in entities:
                    o.write(
                        str(index) + ',' + entity['type'] + ',' +
                        entity['word'] + '\n')
        return entities