def run(schema_path, name, sample_size, batch_size, epochs):
    dataset = Dataset(schema_path, name)
    labels, data = dataset.get_data()
    X = [x['words'] for x in data]
    y = [x['labels'] for x in data]

    word_vocab = Vocabulary()
    word_vocab.build_vocab([w for command in X for w in command])

    # char embedding
    char_vocab = Vocabulary()
    char_vocab.build_vocab([ch for w in word_vocab for ch in w])

    labels2idx = dict((label, idx) for idx, label in enumerate(labels))
    idx2label = dict((idx, label) for idx, label in enumerate(labels))

    preprocessor = Preprocessor(word_vocab, labels2idx, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))

    trainer = Trainer(model, X, y, preprocessor.transform, split=[0.75, 0.95])
    trainer.train(batch_size, epochs)
    trainer.evaluate(idx2label)

    model.save_weights(name)
    dataset.save(X[:sample_size], labels)
    word_vocab.save("%s_word_vocab.json" % name)
    char_vocab.save("%s_char_vocab.json" % name)
def predict(name, command):
    command = command.lower()
    label_path = path.join(path.dirname(path.realpath(__file__)),
                           "intents", "config", "labels", "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as f:
        labels = json.load(f)

    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)

    # char embedding
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)

    idx2label = dict((idx, label) for idx, label in enumerate(labels))
    preprocessor = Preprocessor(word_vocab, None, char_vocab)

    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)

    sentence = tokenize(command)
    features = preprocessor.transform([sentence])
    p = model.predict(features)

    predicted_labels = []
    for pred in p:
        predicted_labels.append(idx2label[pred])

    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
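# A hedged usage sketch (not taken from the source): the schema path, intent name and
# hyper-parameter values below are illustrative assumptions, chosen only to show how
# run() and predict() above might be driven end to end.
if __name__ == "__main__":
    run("intents/config/schemas/set_alarm.json",  # hypothetical schema file
        "set_alarm",                              # hypothetical intent name
        sample_size=100, batch_size=32, epochs=10)
    predict("set_alarm", "wake me up at seven tomorrow")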
def __init_model(self, entry):
    if entry == "train":
        self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        self.total_size = len(self.train_manager.batch_data)
        data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.tag_map,
        }
        self.save_params(data)

        dev_manager = DataManager(batch_size=30, data_type="dev")
        self.dev_batch = dev_manager.iteration()

        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(self.train_manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()
    elif entry == "predict":
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size)
        self.restore_model()
def train(config, params):
    """Model training."""
    # Build the vocabularies if they do not exist yet
    if not (os.path.exists(config["vocab_file"]) and os.path.exists(config["tag_file"])):
        build_vocab(config["train_path"], config["vocab_file"], config["tag_file"])

    # Read the vocabularies
    vocab2id, id2vocab = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Preprocess the data
    train_text, train_label = tokenize(config["train_path"], vocab2id, tag2id, params["maxlen"])
    dev_text, dev_label = tokenize(config["dev_path"], vocab2id, tag2id, params["maxlen"])

    # Wrap the data in tf.data.Dataset objects
    train_dataset = data.Dataset.from_tensor_slices((train_text, train_label))
    train_dataset = train_dataset.shuffle(len(train_text)).batch(params["batch_size"],
                                                                 drop_remainder=True)
    dev_dataset = data.Dataset.from_tensor_slices((dev_text, dev_label))
    dev_dataset = dev_dataset.batch(params["batch_size"], drop_remainder=True)

    print(f"hidden_num:{params['hidden_num']}, vocab_size:{len(vocab2id)}, "
          f"label_size:{len(tag2id)}")

    # Build the model
    model = BiLSTMCRF(hidden_num=params["hidden_num"], vocab_size=len(vocab2id),
                      label_size=len(tag2id), embedding_size=params["embedding_size"])

    # Compile the model
    model.compile(loss=CRFLoss(model.crf, model.dtype),
                  optimizer=tf.keras.optimizers.Adam(params["lr"]),
                  metrics=[model.crf.viterbi_accuracy, IOBESF1(id2tag)],
                  run_eagerly=True)
    model.build((None, train_text.shape[-1]))
    model.summary()

    # Set up callbacks
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=config["ckpt_path"],
                                           save_weights_only=True,
                                           save_best_only=True,
                                           monitor="val_f1",
                                           mode="max"),
    ]

    # Fit the model
    model.fit(train_dataset, epochs=params["epochs"], callbacks=callbacks,
              validation_data=dev_dataset)
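# A hedged sketch of the config/params dictionaries train() expects. Only the keys that
# train() actually reads are listed, and every path and value here is a placeholder, not
# the project's real configuration.
config = {
    "train_path": "data/train.txt",         # hypothetical
    "dev_path": "data/dev.txt",             # hypothetical
    "vocab_file": "data/vocab.txt",         # hypothetical
    "tag_file": "data/tags.txt",            # hypothetical
    "ckpt_path": "checkpoints/bilstm_crf",  # hypothetical
}
params = {
    "maxlen": 128,
    "batch_size": 64,
    "hidden_num": 256,
    "embedding_size": 128,
    "lr": 1e-3,
    "epochs": 20,
}
train(config, params)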
def main_model(self, entry):
    """ Model Initialization """
    # The Training Process
    if entry == "train":
        # Training Process: read Training Data from DataManager
        self.train_manager = DataManager(batch_size=self.batch_size,
                                         data_type='train',
                                         tags=self.tags)
        self.total_size = len(self.train_manager.batch_data)

        # Read the corresponding character index (vocab) and other hyper-parameters
        data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.tag_map,
        }
        save_params(data=data, path=self.model_path)

        # Build BiLSTM-CRF Model
        self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                               batch_size=self.batch_size,
                               vocab_size=len(self.train_manager.vocab),
                               dropout=self.dropout,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               max_length=self.max_length)

        # Evaluation Process: read Dev Data from DataManager
        self.dev_size = DataManager(batch_size=1, data_type="dev",
                                    tags=self.tags).load_data()
        self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                       data_type="dev",
                                       tags=self.tags)
        self.dev_batch = self.dev_manager.iteration()

        # Restore model if it exists
        self.restore_model()

    # The Testing & Inference Process
    elif entry == "predict":
        data_map = load_params(path=self.model_path)
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               dropout=0.0,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               max_length=self.max_length)
        self.restore_model()
def main_model(self, entry):
    # The Training Process
    if entry == "train":
        # Training Process: read Training Data from DataManager
        self.train_manager = DataManager(batch_size=self.batch_size,
                                         data_type='train',
                                         tags=self.tags)
        self.total_size = len(self.train_manager.batch_data)

        # Read the corresponding character index (vocab) and other hyper-parameters
        saved_data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "char_vocab": self.train_manager.char_vocab,
            "tag_map": self.train_manager.tag_map,
        }
        save_params(data=saved_data, path=self.model_path)

        # Evaluation Process: read Dev Data from DataManager
        self.dev_size = DataManager(batch_size=1, data_type="dev",
                                    tags=self.tags).load_char_data()
        self.dev_manager = DataManager(batch_size=int(self.dev_size), data_type="dev")
        self.dev_batch = self.dev_manager.iteration()

        # Build BiLSTM-CRF Model
        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(self.train_manager.char_vocab),
            dropout=self.dropout,
            word_num=self.word_num,
            word_dim=self.word_dim,
            char_num=self.char_num,
            char_dim=self.char_dim,
            hidden_dim=self.hidden_size,
        )
        # Restore model if it exists
        self.restore_model()

    # The Inference Process
    elif entry == "predict":
        data = load_params(path=self.model_path)
        input_size = data.get("input_size")
        self.tag_map = data.get("tag_map")
        self.vocab = data.get("char_vocab")
        self.model = BiLSTMCRF(
            tag_map=self.tag_map,
            vocab_size=input_size,
            dropout=1.0,
            word_num=self.word_num,
            word_dim=self.word_dim,
            char_num=self.char_num,
            char_dim=self.char_dim,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()
def __init_model(self, entry):
    # Prepare the parameters for model training
    if entry == "train":
        # Create the manager object for the training dataset
        print(self.tags)
        self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        print(self.train_manager.batch_data)
        print(len(self.train_manager.batch_data))
        self.total_size = len(self.train_manager.batch_data)
        # print(self.train_manager.batch_data)
        data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.tag_map,
        }
        # Save the parameters
        self.save_params(data)

        # Prepare the validation dataset:
        # create the manager object for the validation dataset
        dev_manager = DataManager(batch_size=30, data_type="dev")
        # The data manager's iterator keeps feeding validation batches into dev_batch,
        # which is used below when computing the evaluation loss
        self.dev_batch = dev_manager.iteration()

        # The model body uses a BiLSTM for semantic encoding and a CRF to
        # constrain the predicted tag sequence
        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(self.train_manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        # Load / restore the model parameters
        self.restore_model()

    # Prepare the parameters for prediction
    elif entry == "predict":
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        # Create the model object here
        self.model = BiLSTMCRF(
            tag_map=self.tag_map,
            vocab_size=input_size,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size
        )
        self.restore_model()
def main_model(self, entry):
    """ Model Initialization """
    # The Testing & Inference Process
    if entry == "predict":
        data_map = load_params(path=self.model_path)
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               dropout=0.0,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               max_length=self.max_length)
        self.restore_model()
def __init_model(self, entry):
    if entry == "train":
        self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        self.total_size = len(self.train_manager.batch_data)
        data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.tag_map,
        }
        self.save_params(data)

        self.dev_manager = DataManager(batch_size=60, data_type="dev")  # dev set
        # self.dev_batch = self.dev_manager.iteration()

        self.model = BiLSTMCRF(
            tag_map=self.train_manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(self.train_manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.model = self.model.cuda()
        self.restore_model()
    # Accept both "predict" and "evaluate", e.g. `python main.py predict`
    elif entry in ("predict", "evaluate"):
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        print('input_size', input_size)
        print('tag_map', self.tag_map)
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size)
        self.model = self.model.cuda()
        self.test_manager = DataManager(batch_size=60, data_type="dev")
        self.restore_model()
def workflow():
    train_data, valid_data, test_data, vocab, speech_vocab = prepare_data()

    # Set the corresponding tags for each dataset, which will be used in the Trainer
    train_data.set_input("token_index_list", "origin_len", "speech_index_list")
    test_data.set_input("token_index_list", "origin_len", "speech_index_list")
    valid_data.set_input("token_index_list", "origin_len", "speech_index_list")
    train_data.set_target("speech_index_list")
    test_data.set_target("speech_index_list")
    valid_data.set_target("speech_index_list")

    # Build the model
    config = {
        "vocab_size": len(vocab),
        "word_emb_dim": args.word_emb,
        "rnn_hidden_units": args.rnn_hidden,
        "num_classes": len(speech_vocab),
        "bi_direction": args.bilstm
    }

    # Load the model from scratch or from a saved model
    if args.cont:
        model = torch.load(args.cont)
    else:
        model = BiLSTMCRF(config)

    if args.mode == "train":
        # Choose the optimizer
        optimizer = Adam(lr=args.lr) if args.op else SGD(lr=args.lr)

        # Train the model
        trainer = Trainer(model=model,
                          train_data=train_data,
                          dev_data=valid_data,
                          use_cuda=args.cuda,
                          metrics=PosMetric(pred='pred', target='speech_index_list'),
                          optimizer=optimizer,
                          n_epochs=args.epoch,
                          batch_size=args.batch_size,
                          save_path="./save")
        trainer.train()

    # Test the model
    tester = Tester(
        data=test_data,
        model=model,
        metrics=PosMetric(pred='pred', target='speech_index_list'),
        use_cuda=args.cuda,
    )
    tester.test()
def single_predict():
    vocab_size = 4688
    embed_size = 128
    units = 64
    num_tags = 4

    _, _, char_index_dict, index_char_dict = open_file("./data/data.txt")

    optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
    my_model = BiLSTMCRF(vocab_size, embed_size, units, num_tags)
    ckpt = tf.train.Checkpoint(optimizer=optimizer, my_model=my_model)
    ckpt.restore(tf.train.latest_checkpoint("./save_checkpoint/"))

    text = input_text()
    char_index_list = [char_index_dict.get(char, 0) for char in text]
    text_list = [char for char in text]
    tag_list = ['b', 'm', 'e', 's']

    inputs = tf.keras.preprocessing.sequence.pad_sequences([char_index_list], padding='post')

    # predict returns a numpy matrix of logits
    logits, inputs_length = my_model.predict(inputs)

    # viterbi_decode finds the optimal tag path
    path, _ = tfa_crf.viterbi_decode(logits[0], my_model.transition_params)
    path_list = [tag_list[index] for index in path]
    new_path_list = tag_finetune(path_list)

    # Measure how much the tag path was changed by the fine-tuning step
    print("Tag agreement rate %.2f%%" %
          (100 * sum([i1 == i2 for i1, i2 in zip(path_list, new_path_list)]) / len(path_list)))

    seg_text(text, new_path_list)
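# seg_text() is defined elsewhere; as an assumed illustration of what BMES decoding
# usually does (a sketch, not the project's actual implementation), the b/m/e/s tag path
# can be collapsed back into segmented words like this:
def bmes_to_words(text, tags):
    """Join characters into words according to b/m/e/s tags (assumed semantics)."""
    words, buf = [], ""
    for char, tag in zip(text, tags):
        if tag == "s":            # single-character word
            if buf:
                words.append(buf)
                buf = ""
            words.append(char)
        elif tag == "b":          # a new word begins
            if buf:
                words.append(buf)
            buf = char
        else:                     # "m" continues the word, "e" closes it
            buf += char
            if tag == "e":
                words.append(buf)
                buf = ""
    if buf:
        words.append(buf)
    return words

# Example: bmes_to_words("今天天气好", ["b", "e", "b", "e", "s"]) -> ["今天", "天气", "好"]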
def predict(text, config, params, is_export=False):
    """Model prediction."""
    # Read the vocabularies
    vocab2id, id2vocab = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Build the model
    model = BiLSTMCRF(hidden_num=params["hidden_num"], vocab_size=len(vocab2id),
                      label_size=len(tag2id), embedding_size=params["embedding_size"])
    model.load_weights(config["ckpt_path"])

    # Preprocess the input text
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]],
        padding='post', maxlen=params["maxlen"])

    # Run the model
    result = model.predict(dataset)[0]
    result = np.argmax(result, axis=-1)
    result = [id2tag[i] for i in result]
    print(result)

    # Post-process the result
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))

    if is_export:
        # Export the model
        tf.keras.models.save_model(model, config["export_dir"], overwrite=True,
                                   include_optimizer=True, save_format=None, options=None)
def create_model(bert_config, is_training, input_ids, input_mask, sequence_lens,
                 segment_ids, label_ids, num_labels, use_one_hot_embeddings):
    """
    Create the model.
    :param bert_config: configuration of the BERT model
    :param is_training: whether we are in training mode
    :param input_ids: index representation of the input data
    :param input_mask: mask list
    :param sequence_lens: true lengths of the input sequences
    :param segment_ids: sentence (segment) indices
    :param label_ids: label sequence
    :param num_labels: number of labels
    :param use_one_hot_embeddings:
    :return:
    """
    # Initialize the BERT model
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
    )

    # Take BERT's final output, shaped [batch_size, seq_length, embedding_size],
    # and use it as our input, i.e. as the word embedding
    embedding = model.get_sequence_output()
    tf.logging.info("bert embedding size: {}".format(embedding.get_shape()))
    max_seq_length = embedding.shape[1].value

    blstm_crf = BiLSTMCRF(embedded_chars=embedding,
                          hidden_sizes=FLAGS.hidden_sizes,
                          layers=FLAGS.layers,
                          dropout_rate=FLAGS.dropout_rate,
                          num_labels=num_labels,
                          max_len=max_seq_length,
                          labels=label_ids,
                          sequence_lens=sequence_lens,
                          is_training=is_training)
    result = blstm_crf.construct_graph()
    return result
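# A hedged sketch (assumed wiring, not shown in the source) of how create_model() might
# be called inside a TensorFlow 1.x graph. The config path, sequence length and label
# count are placeholders, and the return value is kept opaque because the structure of
# construct_graph()'s output is not shown here.
import tensorflow as tf  # TF 1.x style API assumed

max_seq_length = 128   # hypothetical
num_labels = 10        # hypothetical
bert_config = modeling.BertConfig.from_json_file("bert_config.json")  # hypothetical path

input_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="input_ids")
input_mask = tf.placeholder(tf.int32, [None, max_seq_length], name="input_mask")
segment_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="segment_ids")
label_ids = tf.placeholder(tf.int32, [None, max_seq_length], name="label_ids")
sequence_lens = tf.placeholder(tf.int32, [None], name="sequence_lens")

result = create_model(bert_config, is_training=True,
                      input_ids=input_ids, input_mask=input_mask,
                      sequence_lens=sequence_lens, segment_ids=segment_ids,
                      label_ids=label_ids, num_labels=num_labels,
                      use_one_hot_embeddings=False)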
def predict(text, config, params):
    """Model prediction."""
    # Read the vocabularies
    vocab2id, id2vocab = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Build the model
    model = BiLSTMCRF(hidden_num=params["hidden_num"],
                      vocab_size=len(vocab2id),
                      label_size=len(tag2id),
                      embedding_size=params["embedding_size"])
    model.load_weights(config["ckpt_path"])

    # Preprocess the input text
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]], padding='post')

    # Run the model
    result = model.predict(dataset)[0]
    result = np.argmax(result, axis=-1)
    result = [id2tag[i] for i in result]
    print(result)

    # Post-process the result
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))
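# format_result() is defined elsewhere; as an assumed illustration of the underlying
# idea, a BIO-style tag sequence (e.g. B-PER / I-PER / O) can be collapsed into entity
# spans as sketched below. The real project may use a different tag scheme or output
# format, so treat this only as a reference implementation of the technique.
def bio_to_entities(chars, tags):
    """Collapse BIO tags into entity dicts with type, span and surface form."""
    entities, current = [], None
    for i, (char, tag) in enumerate(zip(chars, tags)):
        if tag.startswith("B-"):
            if current:
                entities.append(current)
            current = {"type": tag[2:], "start": i, "end": i + 1, "word": char}
        elif tag.startswith("I-") and current and tag[2:] == current["type"]:
            current["word"] += char
            current["end"] = i + 1
        else:
            if current:
                entities.append(current)
            current = None
    if current:
        entities.append(current)
    return entities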
print('Evaluation: #test_samples= ' + str(len(test_Y)))

for i in range(len(stored_model_list)):
    # model = BaseSequenceLabeling(word_embedding_dimension, number_class, hidden_size=parameters['hidden_size'],
    #                              sentence_embedding_type=parameters['sentence_embedding_type'],
    #                              sentence_zero_inithidden=parameters['sentence_zero_inithidden'],
    #                              attention=None, num_layers=parameters['num_layers'], dropout=parameters['dropout'])
    # model = BaseSequenceLabeling_LSTMEncoder(word_embedding_dimension, number_class, hidden_size=parameters['hidden_size'],
    #                              sentence_embedding_type=parameters['sentence_embedding_type'],
    #                              sentence_zero_inithidden=parameters['sentence_zero_inithidden'],
    #                              attention=None, num_layers=parameters['num_layers'], dropout=parameters['dropout'])
    model = BiLSTMCRF(word_embedding_dimension, number_class,
                      hidden_size=parameters['hidden_size'],
                      sentence_embedding_type=parameters['sentence_embedding_type'],
                      sentence_zero_inithidden=parameters['sentence_zero_inithidden'],
                      attention=None,
                      crf_decode_method=parameters['crf_decode_method'],
                      loss_function=parameters['loss_function'],
                      num_layers=parameters['num_layers'],
                      dropout=parameters['dropout'])
    if use_cuda:
        model = model.cuda()
    model.load_state_dict(stored_model_list[i])

    print('Evaluate on all situation entity')
    print('----------------------------------------------------')
    best_macro_Fscore, best_result = evaluate(
        model, (test_X, test_X_eos_list, test_X_connective_position_list), test_Y)
def main():
    # load vocab
    vocab = Vocab(stopwords_file=args.vocab_dir + 'stopwords.txt', vocab_dir=args.vocab_dir)
    vocab.load_vocab_label()
    vocab_size = vocab.get_vocab_size()
    label_size = vocab.get_label_size()

    # load pre-trained word embedding
    # embeddings_index = {}
    # embedding_matrix = {}
    # if args.w2v_file is not None:
    #     with open(args.w2v_file, 'r', encoding='utf-8') as f:
    #         for line in f:
    #             arrs = line.split()
    #             if len(arrs) == 2:
    #                 continue
    #             w = arrs[0]
    #             vec = np.asarray(arrs[1:], dtype='float32')
    #             embeddings_index[w] = vec
    #     print('{} INFO: Use pre-train word embedding , Found {} word vectors'.format(
    #         get_timestamp(), len(embeddings_index)))
    #
    #     # convert embedding to weights
    #     embedding_matrix = np.zeros((vocab_size, args.embedding_dim))
    #     for word, idx in vocab.vocab.items():
    #         if word in embeddings_index:
    #             embedding_matrix[idx] = embeddings_index[word]
    #
    # # define tf train summary writer
    # summary_writer = tf.summary.create_file_writer(args.summary_dir)

    # load data
    train_dataset, num_train_samples = create_dataset_with_tf(
        args.train_data, vocab, args.epochs, args.batch_size, args.max_seq_len, args.mode)
    dev_dataset, num_dev_samples = create_dataset_with_tf(
        args.test_data, vocab, 1, args.batch_size, args.max_seq_len, "evaluate")

    model = BiLSTMCRF(args.hidden_num, vocab_size, label_size, args.embedding_dim,
                      args.max_seq_len, weights=None, weights_trainable=False)
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate, decay=0.0)

    steps_per_epoch = num_train_samples // args.batch_size
    train_loss = tf.metrics.Mean()

    for epoch in range(args.epochs):
        train_loss.reset_states()
        # train_accuracy.reset_states()
        for (batch_idx, (inputs, labels)) in enumerate(train_dataset.take(steps_per_epoch)):
            time_s = time.time()
            train_loss, pred, seq_len = train_step(inputs, labels, model, train_loss,
                                                   optimizer, True)
            time_e = time.time()
            train_accuracy = cal_acc_one_step(model, pred, seq_len, labels)

            # write to summary file
            # tf.summary.scalar("train_loss", train_loss.result().numpy(), step=batch_idx)
            # tf.summary.scalar("train_accuracy", train_accuracy.result().numpy(), step=batch_idx)
            # tf.summary.scalar("learning_rate", params.learning_rate, step=batch_idx)
            # summary_writer.flush()

            print("{} INFO: Train batch:{}/{}\tloss:{:.4f}\tacc:{:.4f} time:{:.4f}s".format(
                get_timestamp(), batch_idx + epoch * steps_per_epoch,
                args.epochs * steps_per_epoch, train_loss, train_accuracy,
                (time_e - time_s)))
class ChineseNER: def __init__(self, entry="train"): # Load some Hyper-parameters config = load_config() self.embedding_size = config.get("embedding_size") self.hidden_size = config.get("hidden_size") self.batch_size = config.get("batch_size") self.model_path = config.get("model_path") self.dropout = config.get("dropout") self.tags = config.get("tags") self.learning_rate = config.get("learning_rate") self.epochs = config.get("epochs") self.weight_decay = config.get("weight_decay") self.transfer_learning = config.get("transfer_learning") self.lr_decay_step = config.get("lr_decay_step") self.lr_decay_rate = config.get("lr_decay_rate") self.max_length = config.get("max_length") # Model Initialization self.main_model(entry) def main_model(self, entry): """ Model Initialization """ # The Training Process if entry == "train": # Training Process: read Training Data from DataManager self.train_manager = DataManager(batch_size=self.batch_size, data_type='train', tags=self.tags) self.total_size = len(self.train_manager.batch_data) # Read the corresponding character index (vocab) and other hyper-parameters data = { "batch_size": self.train_manager.batch_size, "input_size": self.train_manager.input_size, "vocab": self.train_manager.vocab, "tag_map": self.train_manager.tag_map, } save_params(data=data, path=self.model_path) # Build BiLSTM-CRF Model self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map, batch_size=self.batch_size, vocab_size=len(self.train_manager.vocab), dropout=self.dropout, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, max_length=self.max_length) # Evaluation Process: read Dev Data from DataManager self.dev_size = DataManager(batch_size=1, data_type="dev", tags=self.tags).load_data() self.dev_manager = DataManager(batch_size=int(self.dev_size), data_type="dev", tags=self.tags) self.dev_batch = self.dev_manager.iteration() # Restore model if it exists self.restore_model() # The Testing & Inference Process elif entry == "predict": data_map = load_params(path=self.model_path) input_size = data_map.get("input_size") self.tag_map = data_map.get("tag_map") self.vocab = data_map.get("vocab") self.model = BiLSTMCRF(tag_map=self.tag_map, vocab_size=input_size, dropout=0.0, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, max_length=self.max_length) self.restore_model() def restore_model(self): """ Restore the model if there is one """ try: self.model.load_state_dict( torch.load(self.model_path + "params.pkl")) print("Model Successfully Restored!") except Exception as error: print("Model Failed to restore! 
{}".format(error)) def train(self): """ Training stage """ model = self.model.to(device=device) # Transfer Learning Module if self.transfer_learning == True: keep_grad = [ "transitions", "word_embeddings.weight", "hidden2tag.weight", "hidden2tag.bias", "linear1.weight", "linear1.bias", "linear2.weight", "linear2.bias" ] for name, value in model.named_parameters(): if name in keep_grad: value.requires_grad = True else: value.requires_grad = False else: for name, value in model.named_parameters(): value.requires_grad = True # Use Adam Optimizer optimizer = optim.AdamW(params=filter(lambda p: p.requires_grad, model.parameters()), lr=self.learning_rate, weight_decay=self.weight_decay, amsgrad=True) # Learning Rate Decay # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate) # Print model architecture print('\033[1;31mThe model architecture is shown below:\033[0m') print(model) print('\n') # Print model parameters print('\033[1;31mThe model\'s parameters are shown below:\033[0m') for name, value in model.named_parameters(): print("Name: \033[1;31m{0}\033[0m, " "Parameter Size: \033[1;36m{1}\033[0m, " "Gradient: \033[1;35m{2}\033[0m".format( name, value.size(), value.requires_grad)) print('\n') for epoch in range(1, self.epochs + 1): index = 0 for batch in self.train_manager.get_batch(): index += 1 # Clear gradients before training self.model.zero_grad() # Read sentences and tags from the batch data sentences, tags, length = zip(*batch) sentences_tensor = torch.tensor(sentences, dtype=torch.long, device=device) tags_tensor = torch.tensor(tags, dtype=torch.float, device=device) length_tensor = torch.tensor(length, dtype=torch.int64, device=device) # Use Negative Log-Likelihood (NLL) as Loss Function, Run the forward pass batch_loss = self.model.neg_log_likelihood( sentences_tensor, tags_tensor, length_tensor) loss = batch_loss.mean() progress = ("█" * int(index * 40 / self.total_size)).ljust(40) print("epoch [{}] |{}| {}/{}\n\t Training Loss {:.6f}".format( epoch, progress, index, self.total_size, loss)) loss.backward() optimizer.step() # Save the model during training torch.save(self.model.state_dict(), self.model_path + 'params.pkl') self.evaluate() # scheduler.step() def evaluate(self): """ Evaluation of the performance using the dev batch - dev dataset """ sentences, labels, length = zip(*self.dev_batch.__next__()) _, pre = self.model(sentences=sentences, real_length=length, lengths=None) sentences_tensor = torch.tensor(sentences, dtype=torch.long, device=device) tags_tensor = torch.tensor(pre, dtype=torch.float, device=device) length_tensor = torch.tensor(length, dtype=torch.int64, device=device) loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor) print("\t Evaluation Loss {:.6f}".format(loss.tolist()[0])) #################################################################################################################################### print('Start to evaluate on the dev set: ') # Tag-level F1 score summary (w.r.t. 
each tag) tag_f1_total = [] for tag in self.tags: _, _, f1_tag = tag_f1(tar_path=labels, pre_path=pre, tag=tag, tag_map=self.model.tag_map) tag_f1_total.append(f1_tag) tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total) print( 'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % tag_macro_f1) # Tag-level Micro-averaged F1 Score _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels, pre_path=pre, tags=self.tags, tag_map=self.model.tag_map) print( 'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m' % f1_Micro_tag) #################################################################################################################################### # Tag-level with Label-level F1 score summary f1_prefix_total = [] prefixes = ['B', 'I', 'E', 'S'] for tag in self.tags: for prefix in prefixes: _, _, f1_prefix = entity_label_f1(tar_path=labels, pre_path=pre, length=length, tag=tag, tag_map=self.model.tag_map, prefix=prefix) f1_prefix_total.append(f1_prefix) f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total) print( 'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_tag_prefix) #################################################################################################################################### # Label-level F1 score summary f1_prefix_total = [] prefixes = ['B', 'I', 'E', 'S', 'O'] for prefix in prefixes: _, _, f1_prefix = label_f1(tar_path=labels, pre_path=pre, length=length, tags=self.tags, tag_map=self.model.tag_map, prefix=prefix) f1_prefix_total.append(f1_prefix) f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total) print( 'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_prefix) def predict(self): """ Prediction & Inference Stage :param input_str: Input Chinese sentence :return entities: Predicted entities """ # Print model architecture print('\033[1;31mThe model architecture is shown below:\033[0m') print(self.model) print('\n') # Input one Chinese Sentence while True: input_str = input("Please input a sentence in Chinese: ") if len(input_str) != 0: # Full-width to half-width input_str = strQ2B(input_str) input_str = re.sub(pattern='。', repl='.', string=input_str) text = cut_text(text=input_str, length=self.max_length) cut_out = [] for cuttext in text: # Get the embedding vector (Input Vector) from vocab input_vec = [self.vocab.get(i, 0) for i in cuttext] # convert it to tensor and run the model sentences = torch.tensor(input_vec).view(1, -1) length = np.expand_dims(np.shape(sentences)[1], axis=0) length = torch.tensor(length, dtype=torch.int64, device=device) _, paths = self.model(sentences=sentences, real_length=length, lengths=None) # Get the entities from the model entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, cuttext, tag) # Get all the entities all_start = [] for entity in entities: start = entity.get('start') all_start.append([start, entity]) # Sort the results by the "start" index sort_d = [ value for index, value in sorted( enumerate(all_start), key=lambda all_start: all_start[1]) ] if len(sort_d) == 0: return print("There was no entity in this sentence!!") else: sort_d = np.reshape( np.array(sort_d)[:, 1], [np.shape(sort_d)[0], 1]) cut_out.append(sort_d) # return cut_out print(cut_out) else: return print('Invalid input! Please re-input!!\n')
def __init_model(self, entry):
    if entry == "train":
        self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        self.total_size = len(self.train_manager.batch_data)
        data = {
            "batch_size": self.train_manager.batch_size,
            "input_size": self.train_manager.input_size,
            "vocab": self.train_manager.vocab,
            "tag_map": self.train_manager.tag_map,
        }
        self.save_params(data)

        dev_manager = DataManager(batch_size=30, data_type="dev")
        self.dev_batch = dev_manager.iteration()

        self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                               batch_size=self.batch_size,
                               vocab_size=len(self.train_manager.vocab),
                               dropout=self.dropout,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               use_gpu=self.use_gpu)
        if self.use_gpu:
            print('True')
            self.model = self.model.cuda()
        else:
            print('False')
        self.restore_model()

    # elif entry == 'testXXX':
    #     self.dev_manager = DataManager(batch_size=30, data_type="test")
    #     # self.dev_batch = dev_manager.batch_data
    #     print('####batch_data###', len(dev_manager.batch_data))

    elif entry == 'test':
        self.dev_manager = DataManager(batch_size=30, data_type="test")
        # self.dev_batch = dev_manager.iteration()
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               use_gpu=self.use_gpu)
        if self.use_gpu:
            print('True')
            self.model = self.model.cuda()
        else:
            print('False')
        self.restore_model()

    elif entry == "predict":
        data_map = self.load_params()
        input_size = data_map.get("input_size")
        self.tag_map = data_map.get("tag_map")
        self.vocab = data_map.get("vocab")
        self.model = BiLSTMCRF(tag_map=self.tag_map,
                               vocab_size=input_size,
                               embedding_dim=self.embedding_size,
                               hidden_dim=self.hidden_size,
                               use_gpu=self.use_gpu)
        if self.use_gpu:
            self.model = self.model.cuda()
        self.restore_model()
class ChineseNER(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)

            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        try:
            fopen = open("models/config.yml")
            config = yaml.load(fopen, Loader=yaml.FullLoader)
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            fopen = open("models/config.yml", "w", encoding='UTF-8')
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["ORG", "PER"]
            }
            yaml.dump(config, fopen)
            fopen.close()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)

                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)

                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(), self.model_path + 'params.pkl')

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        if not input_str:
            input_str = input("Please input text: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
exit()

# Define the training set
train_dataset = MyDataset(batch_size=32, tags=["ORG", "PER"])

# Define the test set
word2id, tag2id = train_dataset.word2id, train_dataset.tag2id
test_dataset = MyDataset(batch_size=32, data_type="test", word2id=word2id, tag2id=tag2id)

if sys.argv[1] == "train_model":
    # Define the model
    model = BiLSTMCRF(tag2id=tag2id, word2id_size=len(word2id), batch_size=32,
                      embedding_dim=100, hidden_dim=128)
    # Train the model
    train_model(train_dataset=train_dataset, test_dataset=test_dataset, model=model,
                tag2id=tag2id)
elif sys.argv[1] == "predict_model":
    # Define the model
    model = BiLSTMCRF(tag2id=tag2id, word2id_size=len(word2id), batch_size=1,
                      embedding_dim=100, hidden_dim=128)
class ChineseNER(object): def __init__(self, entry="train"): self.load_config() self.__init_model(entry) def __init_model(self, entry): if entry == "train": self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags) self.total_size = len(self.train_manager.batch_data) data = { "batch_size": self.train_manager.batch_size, "input_size": self.train_manager.input_size, "vocab": self.train_manager.vocab, "tag_map": self.train_manager.tag_map, } self.save_params(data) dev_manager = DataManager(batch_size=30, data_type="dev") self.dev_batch = dev_manager.iteration() self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map, batch_size=self.batch_size, vocab_size=len(self.train_manager.vocab), dropout=self.dropout, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, use_gpu=self.use_gpu) if self.use_gpu: print('True') self.model = self.model.cuda() else: print('False') self.restore_model() # elif entry=='testXXX': # self.dev_manager= DataManager(batch_size=30, data_type="test") # # self.dev_batch = dev_manager.batch_data # print('####batch_data###',len(dev_manager.batch_data)) elif entry == 'test': self.dev_manager = DataManager(batch_size=30, data_type="test") # self.dev_batch = dev_manager.iteration() data_map = self.load_params() input_size = data_map.get("input_size") self.tag_map = data_map.get("tag_map") self.vocab = data_map.get("vocab") self.model = BiLSTMCRF(tag_map=self.tag_map, vocab_size=input_size, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, use_gpu=self.use_gpu) if self.use_gpu: print('True') self.model = self.model.cuda() else: print('False') self.restore_model() elif entry == "predict": data_map = self.load_params() input_size = data_map.get("input_size") self.tag_map = data_map.get("tag_map") self.vocab = data_map.get("vocab") self.model = BiLSTMCRF(tag_map=self.tag_map, vocab_size=input_size, embedding_dim=self.embedding_size, hidden_dim=self.hidden_size, use_gpu=self.use_gpu) if self.use_gpu: self.model = self.model.cuda() self.restore_model() def load_config(self): try: fopen = open("models/config.yml") config = yaml.load(fopen) fopen.close() except Exception as error: print("Load config failed, using default config {}".format(error)) fopen = open("models/config.yml", "w") config = { "embedding_size": 100, "hidden_size": 128, "batch_size": 20, "dropout": 0.5, "model_path": "models/", "tags": ["component", "disease&symptom", "people"], #在这里修改tag "use_gpu": True } yaml.dump(config, fopen) fopen.close() self.embedding_size = config.get("embedding_size") self.hidden_size = config.get("hidden_size") self.batch_size = config.get("batch_size") self.model_path = config.get("model_path") self.tags = config.get("tags") self.dropout = config.get("dropout") self.use_gpu = config.get("use_gpu") def restore_model(self): try: self.model.load_state_dict( torch.load(self.model_path + "params.pkl")) print("model restore success!") except Exception as error: print("model restore faild! 
{}".format(error)) def save_params(self, data): with open("models/data.pkl", "wb") as fopen: pickle.dump(data, fopen) def load_params(self): with open("models/data.pkl", "rb") as fopen: data_map = pickle.load(fopen) return data_map def train(self): optimizer = optim.Adam(self.model.parameters()) # optimizer = optim.SGD(ner_model.parameters(), lr=0.01) for epoch in range(100): index = 0 for batch in self.train_manager.get_batch(): index += 1 self.model.zero_grad() print('batch', type(batch), len(batch), len(batch[0]), len(batch[10])) sentences, tags, length = zip(*batch) # print('zip batch sentences', type(sentences), sentences) # print('zip batch tags', type(tags), tags) # print('zip batch length', type(length), length) sentences_tensor = torch.tensor(sentences, dtype=torch.long) tags_tensor = torch.tensor(tags, dtype=torch.long) length_tensor = torch.tensor( length, dtype=torch.long) #在一个batch中,每个句子的原长度 if self.use_gpu: sentences_tensor = sentences_tensor.cuda() tags_tensor = tags_tensor.cuda() length_tensor = length_tensor.cuda() # print('zip batch sentences', type(sentences_tensor), sentences_tensor.shape) # print('zip batch tags', type(tags_tensor), tags_tensor.shape) # print('zip batch length', type(length_tensor), length_tensor.shape,length) loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor) progress = ("█" * int(index * 25 / self.total_size)).ljust(25) print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format( epoch, progress, index, self.total_size, loss.cpu().tolist()[0])) if index % 10 == 0: self.evaluate() print("-" * 50) loss.backward() optimizer.step() torch.save(self.model.state_dict(), self.model_path + 'params.pkl') def evaluate(self): with torch.no_grad(): sentences, labels, length = zip(*self.dev_batch.__next__()) _, paths = self.model(sentences) print("\teval") for tag in self.tags: f1_score(labels, paths, tag, self.model.tag_map) def predict(self, path): #, input_str=""): # if not input_str: # input_str = input("请输入文本: ") sentences = [] with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f: for i in f: sentences += i.strip().split('。') f = open('./result/tag_' + path + '.json', 'w') for input_str in sentences: input_vec = [self.vocab.get(i, 0) for i in input_str] # convert to tensor sentences = torch.tensor(input_vec).view(1, -1) _, paths = self.model(sentences) entities = [] for tag in self.tags: tags = get_tags(paths[0], tag, self.tag_map) entities += format_result(tags, input_str, tag) dic = {'sentense': input_str, 'entities': entities} json.dump(dic, f, ensure_ascii=False) f.close() # return entities # def testXXX(self): # for batch in self.dev_manager.get_batch(): # print(_) # print(_,len(items),len(items[0][0]),len(items[0][1]),items[0][2]) # break def test(self): with torch.no_grad(): id2vocab = {self.vocab[i]: i for i in self.vocab} print(len(id2vocab)) f = open('./result/test_tag.json', 'w') total_matrix = np.zeros( [len(self.tags), 3] ) #横坐标分别表示component,disease&symptom,people;纵坐标分别表示recall, precision, f1 count = 0 for batch in self.dev_manager.get_batch(): count += 1 print(count) # print(type(items)) sentences, labels, length = zip(*batch) # sentences, labels, length = zip(*self.dev_batch.__next__()) # print('I am in') strs = [[id2vocab[w] for w in s] for s in sentences] # print(strs) # print(len(sentences),len(sentences[0]),len(sentences[5])) _, paths = self.model(sentences) # print("\teval") # print('path',len(paths),len(paths[0]),len(paths[1])) for i in range(len(self.tags)): recall, precision, f1 = f1_score(labels, 
paths, self.tags[i], self.model.tag_map) total_matrix[i][0] += recall total_matrix[i][1] += precision total_matrix[i][2] += f1 entities = [] for i in range(len(paths)): tmp = [] for tag in self.tags: tags = get_tags(paths[i], tag, self.tag_map) tmp += format_result(tags, strs[i], tag) entities.append(tmp) # print(entities) for i in range(len(entities)): dic = { 'sentense': ''.join(strs[i]), 'entities': entities[i] } json.dump(dic, f, ensure_ascii=False) # f.write(''.join(strs[i])+'#####找到的实体为#####'+'&'.join(entities[i])+'\n') total_matrix /= count # print(total_matrix) for i in range(len(self.tags)): print( "{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}" .format(count, self.tags[i], total_matrix[i][0], total_matrix[i][1], total_matrix[i][2])) f.close()
class ChineseNER: def __init__(self, entry="train"): # Load Hyper-parameters config = load_config() self.model_path = config.get("model_path") self.epochs = config.get("epochs") self.batch_size = config.get("batch_size") self.learning_rate = config.get("learning_rate") self.weight_decay = config.get("weight_decay") self.dropout = config.get("dropout") self.hidden_size = config.get("hidden_size") self.char_num = config.get("char_num") self.char_dim = config.get("char_dim") self.word_dim = config.get("word_dim") self.word_num = config.get("word_num") self.tags = config.get("tags") self.transfer_learning = config.get("transfer_learning") self.lr_decay_step = config.get("lr_decay_step") self.lr_decay_rate = config.get("lr_decay_rate") # Load main model self.main_model(entry) def main_model(self, entry): # The Training Process if entry == "train": # Training Process: read Training Data from DataManager self.train_manager = DataManager(batch_size=self.batch_size, data_type='train', tags=self.tags) self.total_size = len(self.train_manager.batch_data) # Load some model parameters try: load_params(path=self.model_path) print("Successfully load the data.pkl!!!") except Exception as error: print("There was no data.pkl!! Start to save........") # Read the corresponding character index (vocab) and other hyper-parameters saved_data = { "batch_size": self.train_manager.batch_size, "input_size": self.train_manager.input_size, "char_vocab": self.train_manager.char_vocab, "tag_map": self.train_manager.tag_map, } save_params(data=saved_data, path=self.model_path) # Build BiLSTM-CRF Model self.model = BiLSTMCRF( tag_map=self.train_manager.tag_map, batch_size=self.batch_size, vocab_size=len(self.train_manager.char_vocab), dropout=self.dropout, word_num=self.word_num, word_dim=self.word_dim, char_num=self.char_num, char_dim=self.char_dim, hidden_dim=self.hidden_size, ) # Restore model if it exists self.restore_model() # Evaluation Process: read Dev Data from DataManager self.dev_size = DataManager(batch_size=1, data_type="dev", tags=self.tags).load_char_data() self.dev_manager = DataManager(batch_size=int(self.dev_size), data_type="dev") self.dev_batch = self.dev_manager.iteration() # The Inference Process elif entry == "predict": data = load_params(path=self.model_path) input_size = data.get("input_size") self.tag_map = data.get("tag_map") self.vocab = data.get("char_vocab") self.model = BiLSTMCRF( tag_map=self.tag_map, vocab_size=input_size, dropout=0.0, word_num=self.word_num, word_dim=self.word_dim, char_num=self.char_num, char_dim=self.char_dim, hidden_dim=self.hidden_size, ) self.restore_model() def train(self): # Transfer Learning Module if self.transfer_learning == True: keep_grad = [ "transitions", "char_embedding.weight", "char_linear_lstm.weight", "char_linear_lstm.bias", "word_linear_lstm.weight", "word_linear_lstm.bias", "hidden2tag.weight", "hidden2tag.bias" ] for name, value in self.model.named_parameters(): if name in keep_grad: value.requires_grad = True else: value.requires_grad = False else: for name, value in self.model.named_parameters(): value.requires_grad = True # Use Adam Optimizer optimizer = optim.Adam(params=filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.learning_rate, weight_decay=self.weight_decay) # Learning Rate Decay # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate) # Print model architecture print('\033[1;31mThe model architecture is shown below:\033[0m') print(self.model) print('\n') # 
Print model parameters print('\033[1;31mThe model\'s parameters are shown below:\033[0m') for name, value in self.model.named_parameters(): print("Name: \033[1;31m{0}\033[0m, " "Parameter Size: \033[1;36m{1}\033[0m, " "Gradient: \033[1;35m{2}\033[0m".format( name, value.size(), value.requires_grad)) print('\n') for epoch in range(1, self.epochs + 1): index = 0 for batch in self.train_manager.get_batch(): index += 1 # Clear gradients before training self.model.zero_grad() #################################################################################################################################### # Read sentences and labels from the batch data chars, labels, words, len_word, len_char = zip(*batch) chars_tensor = torch.tensor(chars, dtype=torch.long, device=device) labels_tensor = torch.tensor(labels, dtype=torch.float, device=device) words_tensor = torch.tensor(words, dtype=torch.float, device=device) len_word_tensor = torch.tensor(len_word, dtype=torch.int64, device=device) len_char_tensor = torch.tensor(len_char, dtype=torch.int64, device=device) #################################################################################################################################### loss = self.model.neg_log_likelihood(characters=chars_tensor, tags=labels_tensor, len_char=len_char_tensor, words=words_tensor, len_word=len_word_tensor) progress = ("█" * int(index * 40 / self.total_size)).ljust(40) print("epoch [{}] |{}| {}/{}\t Batch Loss {:.6f}".format( epoch, progress, index, self.total_size, loss.tolist()[0])) #################################################################################################################################### loss.backward() optimizer.step() torch.save(self.model.state_dict(), self.model_path + 'params.pkl') self.evaluate() # scheduler.step() def evaluate(self): """ Evaluation of the performance using the development set """ chars, labels, words, len_words, len_chars = zip( *self.dev_batch.__next__()) chars_tensor = torch.tensor(chars, dtype=torch.long, device=device) words_tensor = torch.tensor(words, dtype=torch.float, device=device) len_word_tensor = torch.tensor(len_words, dtype=torch.int64, device=device) len_char_tensor = torch.tensor(len_chars, dtype=torch.int64, device=device) # Run the Forward pass of the model _, pre = self.model(characters=chars_tensor, len_char=len_char_tensor, words=words_tensor, len_word=len_word_tensor) tags_tensor = torch.tensor(pre, dtype=torch.int, device=device) #################################################################################################################################### # Loss on the dev set loss = self.model.neg_log_likelihood(characters=chars_tensor, tags=tags_tensor, len_char=len_char_tensor, words=words_tensor, len_word=len_word_tensor) print("\t Evaluation Loss on the dev set{:.6f}".format( loss.tolist()[0])) #################################################################################################################################### print('Start to evaluate on the dev set: ') #################################################################################################################################### # Tag-level F1 score summary (w.r.t. 
each tag) tag_f1_total = [] for tag in self.tags: _, _, f1_tag = tag_f1(tar_path=labels, pre_path=pre, tag=tag, tag_map=self.model.tag_map) tag_f1_total.append(f1_tag) tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total) print( 'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % tag_macro_f1) # Tag-level Micro-averaged F1 Score _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels, pre_path=pre, tags=self.tags, tag_map=self.model.tag_map) print( 'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m' % f1_Micro_tag) #################################################################################################################################### # Tag-level with Label-level F1 score summary f1_prefix_total = [] prefixes = ['B', 'I', 'E', 'S'] for tag in self.tags: for prefix in prefixes: _, _, f1_prefix = entity_label_f1(tar_path=labels, pre_path=pre, length=len_chars, tag=tag, tag_map=self.model.tag_map, prefix=prefix) f1_prefix_total.append(f1_prefix) f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total) print( 'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_tag_prefix) #################################################################################################################################### # Label-level F1 score summary f1_prefix_total = [] prefixes = ['B', 'I', 'E', 'S', 'O'] for prefix in prefixes: _, _, f1_prefix = label_f1(tar_path=labels, pre_path=pre, length=len_chars, tags=self.tags, tag_map=self.model.tag_map, prefix=prefix) f1_prefix_total.append(f1_prefix) f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total) print( 'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m' % f1_macro_prefix) def predict(self): """ Prediction & Inference Stage """ # Load word vectors pre_trained = self.load_word_vector() while True: input_str = input("Please input a sentence in Chinese: ") input_str = stringQ2B(input_str) # Get character embedding char_vec = [self.vocab.get(i, 0) for i in input_str] char_tensor = np.reshape(char_vec, [-1]).tolist() len_char = np.expand_dims(len(char_tensor), axis=0) len_char = torch.tensor(len_char, dtype=torch.int64, device=device) char_tensor = np.array(self.pad_char_data(char_tensor)).tolist() char_tensor = torch.tensor(char_tensor, dtype=torch.long, device=device) # Get word embedding embed_words = [] words = jieba.lcut(input_str, HMM=True) for i in words: vec = pre_trained.get(i) if str(type(vec)) != "<class 'NoneType'>": embed_words.append(vec) else: gen_vec = np.random.normal(size=self.word_dim).tolist() embed_words.append(gen_vec) word_tensor = np.array(self.pad_word_data(embed_words)).tolist() len_word = np.expand_dims(len(word_tensor), axis=0) len_word = torch.tensor(len_word, dtype=torch.int64, device=device) word_tensor = torch.tensor(word_tensor, dtype=torch.float, device=device) # Run the model and get all the predicted entities _, paths = self.model(characters=char_tensor, len_char=len_char, words=word_tensor, len_word=len_word) # Format the results entities = [] for tag in self.tags: tags = get_tags(path=paths[0], tag=tag, tag_map=self.tag_map) entities += format_result(result=tags, text=input_str, tag=tag) print(entities) def load_word_vector(self): """ Load pre-trained word vectors """ if 'pre_trained' not in globals().keys(): print("Start to load pre-trained word embeddings!!") pre_trained = {} for i, line in enumerate( codecs.open(self.model_path + "word_vectors.vec", 'r', encoding='utf-8')): line = 
line.rstrip().split() if len(line) == self.word_dim + 1: pre_trained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: pre_trained = globals().get("pre_trained") return pre_trained def pad_char_data(self, data: list): """ Pad character data """ c_data = copy.deepcopy(data) if np.shape(c_data)[0] < self.char_num: c_data = c_data + (self.char_num - np.shape(c_data)[0]) * [0] else: c_data = c_data[:self.char_num] c_data = np.expand_dims(c_data, axis=0) return c_data def pad_word_data(self, data: list): """ Pad word data """ c_data = copy.deepcopy(data) if len(c_data) <= self.word_num: c_data = c_data + (self.word_num - len(c_data)) * [[0] * self.word_dim] else: c_data = c_data[:self.word_num, :] c_data = np.reshape(c_data, [np.shape(c_data)[0] * np.shape(c_data)[1]]) c_data = np.expand_dims(c_data, axis=0) return c_data def restore_model(self): """ Restore and load the model """ try: self.model.load_state_dict( torch.load(self.model_path + "params.pkl")) print("Model Successfully Restored!!") except Exception as error: print("Model Failed to restore!!")
class ChineseNER:
    def __init__(self, entry="train"):
        # Load some Hyper-parameters
        config = load_config()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.dropout = config.get("dropout")
        self.tags = config.get("tags")
        self.learning_rate = config.get("learning_rate")
        self.epochs = config.get("epochs")
        self.weight_decay = config.get("weight_decay")
        self.transfer_learning = config.get("transfer_learning")
        self.lr_decay_step = config.get("lr_decay_step")
        self.lr_decay_rate = config.get("lr_decay_rate")
        self.max_length = config.get("max_length")

        # Model Initialization
        self.main_model(entry)

    def main_model(self, entry):
        """ Model Initialization """
        # The Testing & Inference Process
        if entry == "predict":
            data_map = load_params(path=self.model_path)
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   dropout=0.0,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)
            self.restore_model()

    def restore_model(self):
        """ Restore the model if there is one """
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def predict(self, input_str):
        """
        Prediction & Inference Stage
        :param input_str: Input Chinese sentence
        :return entities: Predicted entities
        """
        if len(input_str) != 0:
            # Full-width to half-width
            input_str = strQ2B(input_str)
            input_str = re.sub(pattern='。', repl='.', string=input_str)
            text = cut_text(text=input_str, length=self.max_length)

            cut_out = []
            for cuttext in text:
                # Get the embedding vector (Input Vector) from vocab
                input_vec = [self.vocab.get(i, 0) for i in cuttext]

                # Convert it to a tensor and run the model
                sentences = torch.tensor(input_vec).view(1, -1)
                length = np.expand_dims(np.shape(sentences)[1], axis=0)
                length = torch.tensor(length, dtype=torch.int64, device=device)
                _, paths = self.model(sentences=sentences, real_length=length, lengths=None)

                # Get the entities from the model
                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, cuttext, tag)

                # Collect all the entities with their start positions
                all_start = []
                for entity in entities:
                    start = entity.get('start')
                    all_start.append([start, entity])

                # Sort the results by the "start" index
                sort_d = [value for index, value in
                          sorted(enumerate(all_start), key=lambda all_start: all_start[1])]

                if len(sort_d) == 0:
                    return print("There was no entity in this sentence!!")
                else:
                    sort_d = np.reshape(np.array(sort_d)[:, 1], [np.shape(sort_d)[0], 1])
                    cut_out.append(sort_d)
            return cut_out
        else:
            return print('Invalid input! Please re-input!!\n')
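# A hedged usage sketch for the inference path above; the sentence is arbitrary and the
# call assumes a trained model plus the saved parameter files already exist under
# model_path, as produced by the training variants of this class.
if __name__ == "__main__":
    ner = ChineseNER(entry="predict")
    entities = ner.predict("今天下午三点在北京朝阳医院复诊。")  # hypothetical input
    print(entities)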
class ChineseNER(object):
    use_gpu = False

    def __init__(self, entry="train"):
        self.load_config()
        # self.use_gpu = torch.cuda.is_available()
        self.__init_model(entry)
        print(self.use_gpu)
        if self.use_gpu:
            # GPU acceleration
            self.model = self.model.cuda()

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        try:
            fopen = open("models/config.yml")
            config = yaml.load(fopen)
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            fopen = open("models/config.yml", "w")
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["ORG", "PER"]
            }
            yaml.dump(config, fopen)
            fopen.close()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    # @torchsnooper.snoop()
    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:
                    # GPU acceleration
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                if self.use_gpu:
                    loss = loss.cuda()
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def get_string(self, x):
        now = x.split('\n')
        o = now[1].split(' ')
        while '' in o:
            o.remove('')
        return o[1]

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        if self.use_gpu:
            sentences = torch.tensor(sentences, dtype=torch.long).cuda()
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str="", input_path=None):
        if input_path is not None:
            tests = pd.read_csv(input_path)
            with open('output.txt', 'w', encoding='utf-8') as o:
                # o.write('id,aspect,opinion\n')
                for ids in range(1, 2235):
                    input_str = self.get_string(
                        str(tests.loc[ids - 1:ids - 1, ['Review']]))
                    index = int(
                        self.get_string(str(tests.loc[ids - 1:ids - 1, ['id']])))
                    input_vec = [self.vocab.get(i, 0) for i in input_str]
                    # Convert to a tensor (on the GPU if enabled)
                    if self.use_gpu:
                        sentences = torch.tensor(input_vec).view(1, -1).cuda()
                    else:
                        sentences = torch.tensor(input_vec).view(1, -1)
                    _, paths = self.model(sentences)
                    entities = []
                    for tag in self.tags:
                        tags = get_tags(paths[0], tag, self.tag_map)
                        entities += format_result(tags, input_str, tag)
                    entities = sorted(entities, key=lambda x: x['start'])
                    # print(str(index) + " " + input_str + " " + str(len(entities)))
                    for entity in entities:
                        # print(entity)
                        o.write(
                            str(index) + ',' + entity['type'] + ',' +
                            entity['word'] + '\n')
        else:
            if not input_str:
                input_str = input("请输入文本: ")
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # Convert to a tensor (on the GPU if enabled)
            if self.use_gpu:
                sentences = torch.tensor(input_vec).view(1, -1).cuda()
            else:
                sentences = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(sentences)
            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            return entities
class ChineseNER(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def parse_argument(self):
        """ Parse command-line arguments into the config object. """
        parser = argparse.ArgumentParser(description="NER")
        parser.add_argument("-c", "--config", dest="config_file", type=str,
                            default="./Config/config.cfg", help="config path")
        parser.add_argument("-device", "--device", dest="device", type=str,
                            default="cuda:0",
                            help="device['cpu', 'cuda:0', 'cuda:1', ...]")
        parser.add_argument("--train", dest="train", action="store_true",
                            default=True, help="train model")
        parser.add_argument("-p", "--process", dest="process", action="store_true",
                            default=True, help="data process")
        parser.add_argument("-t", "--test", dest="test", action="store_true",
                            default=False, help="test model")
        parser.add_argument("--t_model", dest="t_model", type=str, default=None,
                            help="model for test")
        parser.add_argument("--t_data", dest="t_data", type=str, default=None,
                            help="data[train, dev, test, None] for test model")
        parser.add_argument("--predict", dest="predict", action="store_true",
                            default=False, help="predict model")
        args = parser.parse_args()
        # print(vars(args))
        config = configurable.Configurable(config_file=args.config_file)
        config.device = args.device
        config.train = args.train
        config.process = args.process
        config.test = args.test
        config.t_model = args.t_model
        config.t_data = args.t_data
        config.predict = args.predict
        # config
        if config.test is True:
            config.train = False
        if config.t_data not in [None, "train", "dev", "test"]:
            print("\nUsage")
            parser.print_help()
            print("t_data : {}, not in [None, 'train', 'dev', 'test']".format(config.t_data))
            exit()
        print("***************************************")
        print("Device : {}".format(config.device))
        print("Data Process : {}".format(config.process))
        print("Train model : {}".format(config.train))
        print("Test model : {}".format(config.test))
        print("t_model : {}".format(config.t_model))
        print("t_data : {}".format(config.t_data))
        print("predict : {}".format(config.predict))
        print("***************************************")
        return config

    def load_config(self):
        try:
            fopen = open("models/config.yml")
            config = yaml.load(fopen)
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            fopen = open("models/config.yml", "w")
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["ORG", "PER"]
            }
            yaml.dump(config, fopen)
            fopen.close()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Convert to a tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)
        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
class BiLSTMCRFEnter(object):
    def __init__(self, entry="train"):
        # Load the training parameters:
        # the config file initialises every variable the main routine needs
        self.load_config()
        # entry defaults to "train", i.e. the model is initialised from the training set
        self.__init_model(entry)

    def __init_model(self, entry):
        # Prepare the parameters for training
        if entry == "train":
            # Create the manager object for the training data
            print(self.tags)
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            print(self.train_manager.batch_data)
            print(len(self.train_manager.batch_data))
            self.total_size = len(self.train_manager.batch_data)
            # print(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            # Save the parameters
            self.save_params(data)
            # Prepare the validation data: create its manager object
            dev_manager = DataManager(batch_size=30, data_type="dev")
            # The iterator in data_manager feeds batches into dev_batch,
            # which is used below when computing the validation metrics
            self.dev_batch = dev_manager.iteration()
            # The model body uses a BiLSTM for semantic encoding and a CRF to constrain the tags
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            # Load / restore the model parameters
            self.restore_model()
        # Prepare the parameters for prediction
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            # Create the model object
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size
            )
            self.restore_model()

    def load_config(self):
        try:
            fopen = open("models/config.yml")
            # Read the yml file
            config = yaml.load(fopen)
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            # Rewrite config.yml with the default values below
            fopen = open("models/config.yml", "w")
            config = {
                # Default initialisation values
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 50,
                "dropout": 0.5,
                "model_path": "models/",
                # The original file misspelled "tags" as "tasg"; it is corrected here
                "tags": ["Medicinal_Name", "Medicinal_Other_Name", "Medicinal_Function",
                         "Medicinal_Taste", "Medicinal_Use_Num"]
            }
            yaml.dump(config, fopen)
            fopen.close()
        # Reading right after rewriting looks redundant; the point is to get "tags" into the config file
        # Dimension of the word embeddings
        self.embedding_size = config.get("embedding_size")
        # Dimension of the hidden layer
        self.hidden_size = config.get("hidden_size")
        # Number of samples per batch
        self.batch_size = config.get("batch_size")
        # Where the model is saved
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        # Dropout rate
        self.dropout = config.get("dropout")
        # Total number of training epochs (expected to be present in config.yml)
        self.epoch = config.get("epoch")

    # Load the saved parameters when testing
    def restore_model(self):
        try:
            # load_state_dict comes from torch.nn.Module, which BiLSTMCRF inherits from
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    # Save the model parameters during training
    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # Read the updated model parameters during training
    def load_params(self):
        # Read the pkl file
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        # print("*" * 50 + data_map + "*" * 50)
        return data_map

    def train(self):
        # Use the Adam optimizer for gradient descent
        # (parameters() also comes from torch.nn.Module)
        optimizer = optim.Adam(self.model.parameters(), lr=0.05)
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        # Train for self.epoch epochs
        for epoch in range(self.epoch):
            index = 0
            # Fetch one batch at a time
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                # Compute the training loss
                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor)
                # Progress bar
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                # Back-propagate the loss
                loss.backward()
                # Optimizer step
                optimizer.step()
                # Save the model
                torch.save(self.model.state_dict(), self.model_path + 'params.pkl')
                # torch.save(self.model)

    # Validation metrics during training
    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    # Prediction once the model has been trained
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Convert to a tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)
        entities = []
        for tag in self.tags:
            # get_tags from the utility module extracts the tagged spans (B-FUNC and so on)
            tags = get_tags(paths[0], tag, self.tag_map)
            print(tag)
            print(self.tag_map)
            print(paths[0])
            print(tags)
            entities += format_result(tags, input_str, tag)
        return entities

    # Predict entities for every sentence in a file
    def predict_file(self, f_r_path, f_w_path):
        # Avoid writing the same entity twice
        duplication = set()
        with open(f_r_path, encoding='utf-8') as f_r:
            with open(f_w_path, 'ab') as f_w:
                for line in f_r.readlines():
                    sent = line.split('\t')[-3].strip()
                    res = self.predict(sent)
                    for i in range(len(res) - 1):
                        entity = res[i]['word']
                        tag = res[i]["type"]
                        if entity not in duplication:
                            # print(entity)
                            duplication.add(tag)
                            duplication.add(entity)
                            f_w.write((tag + " : " + entity + '\n').encode())
                        if res[i]["type"] != res[i + 1]["type"]:
                            f_w.write('\n'.encode())
class ChineseNER(object):
    def __init__(self, entry="train"):
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            self.dev_manager = DataManager(batch_size=60, data_type="dev")  # validation set
            # self.dev_batch = self.dev_manager.iteration()
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        elif entry in ("predict", "evaluate"):  # python main.py predict
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()

    # Load the configuration
    def load_config(self):
        try:
            fopen = open("models/config.yml")
            config = yaml.load(fopen)
            fopen.close()
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            fopen = open("models/config.yml", "w")
            config = {
                "embedding_size": 300,
                "hidden_size": 128,
                "batch_size": 30,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["TREATMENT", "BODY", "SIGNS", "CHECK", "DISEASE"]
            }
            yaml.dump(config, fopen)
            fopen.close()
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    # Restore the saved model weights
    def restore_model(self):
        try:
            self.model.load_state_dict(torch.load(self.model_path + "params_6all.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore failed! {}".format(error))

    # Save the model hyper-parameters
    def save_params(self, data):
        with open("models/data_6all.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # Load the model hyper-parameters
    def load_params(self):
        with open("models/data_6all.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters(), weight_decay=0.002, lr=0.0000004)  # 0.000001
        # optimizer = optim.SGD(self.model.parameters(), lr=0.00000008, weight_decay=0.001, momentum=0.9)  # 4e-7
        scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5,
                                                            patience=2, cooldown=5, verbose=True,
                                                            min_lr=1e-8, eps=1e-8)
        best_loss = 240
        lossList = [0] * self.total_size
        for epoch in range(268, 401):
            losses = []
            index = 0
            startTime = time.process_time()
            for batch in self.train_manager.get_batch():
                start = time.process_time()
                index += 1
                self.model.zero_grad()
                # length holds the original length of each sentence
                sentences, tags, length = zip(*batch)
                # shape (batch_size, longest sentence in the batch), e.g. (20, 332)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long).cuda()
                tags_tensor = torch.tensor(tags, dtype=torch.long).cuda()
                length_tensor = torch.tensor(length, dtype=torch.long).cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor, tags_tensor, length_tensor)
                losses.append(loss.cpu().item())
                progress = ("█" * int(index * 60 / self.total_size)).ljust(60)
                loss.backward()
                optimizer.step()
                # torch.save(self.model.state_dict(), self.model_path + 'params_6all.pkl')
                end = time.process_time()
                dur = end - start
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.3f}\t\tlast_loss {:.3f}\t\ttime {}\t\tbest_avg_loss {:.3f}"""
                      .format(epoch, progress, index, self.total_size,
                              loss.cpu().tolist()[0], lossList[index - 1], str(dur), best_loss))
                lossList[index - 1] = loss.cpu().item()
                print("-" * 90)
            endTime = time.process_time()
            totalTime = endTime - startTime
            avg_loss = np.mean(losses)
            # Save the best model
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(self.model.state_dict(), self.model_path + 'params_6all.pkl')
            writer.add_scalar('BiLstm_CRF:avg_loss-epoch', avg_loss, epoch)
            print('epoch ', epoch, ' avg_loss ', avg_loss, ' total_time ', totalTime)
            if epoch % 5 == 0:
                self.evaluate(epoch / 5, manager=self.dev_manager)
                print("-" * 100)
            scheduler_lr.step(avg_loss)
        writer.close()

    # train: BODY 7507, SIGNS 6355, CHECK 6965, DISEASE 474, TREATMENT 805
    # test:
    # Compute F1 and evaluate the model
    def evaluate(self, epoch, manager, add_scalar=True):
        print('正在开始评估')
        all_origins = all_founds = all_rights = 0
        for tag in self.tags:
            origins = founds = rights = 0
            for batch in manager.get_batch():
                sentences, labels, length = zip(*batch)
                _, paths = self.model(sentences)
                origin, found, right = f1_score(labels, paths, tag, self.model.tag_map)
                origins += origin
                founds += found
                rights += right
            all_origins += origins
            all_founds += founds
            all_rights += rights
            recall = 0. if origins == 0 else (rights / origins)
            precision = 0. if founds == 0 else (rights / founds)
            f1 = 0. if recall + precision == 0 else (2 * precision * recall) / (precision + recall)
            print("\t{}\torigins:{}\t\t\tfounds:{}\t\t\trights:{}".format(tag, origins, founds, rights))
            print("\t\t\trecall:{}\tprecision:{}\tf1:{}".format(recall, precision, f1))
            if add_scalar:
                tag_epoch = tag + '-5epoch'
                writer.add_scalars(tag_epoch, {
                    'recall': recall,
                    'precision': precision,
                    'f1': f1
                }, epoch)
        all_recall = 0. if all_origins == 0 else (all_rights / all_origins)
        all_precision = 0. if all_founds == 0 else (all_rights / all_founds)
        all_f1 = 0. if all_recall + all_precision == 0 else (2 * all_precision * all_recall) / (all_precision + all_recall)
        print("\tall_origins:{}\t\t\tall_founds:{}\t\t\tall_rights:{}".format(all_origins, all_founds, all_rights))
        print("\tall_recall:{}\tall_precision:{}\tall_f1:{}".format(all_recall, all_precision, all_f1))
        if add_scalar:
            writer.add_scalars("ALL-5epoch", {
                'all_recall': all_recall,
                'all_precision': all_precision,
                'all_f1': all_f1
            }, epoch)
        print('评估结束')
        return all_recall, all_precision, all_f1

    # Prediction method
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        # Look up the vocab index of every character in the input sentence
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # Convert to a tensor
        sentences = torch.tensor(input_vec, dtype=torch.long).view(1, -1)
        sentences = sentences.cuda()
        # paths holds the predicted tag indices
        _, paths = self.model(sentences)
        entities = []
        # e.g. "tags": ["ORG", "PER"]
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
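A minimal driver sketch for the CUDA variant above, assuming a CUDA device is available and that writer is a tensorboard SummaryWriter created at module level, as the calls above imply; the command-line handling is illustrative only:

if __name__ == "__main__":
    import sys
    entry = sys.argv[1] if len(sys.argv) > 1 else "train"
    cn = ChineseNER(entry=entry)
    if entry == "train":
        cn.train()
    elif entry == "evaluate":
        cn.evaluate(epoch=0, manager=cn.test_manager, add_scalar=False)
    else:
        cn.predict()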
# Number of distinct characters appearing anywhere in the dataset
vocab_size = 4688
# Character-embedding dimension
embed_size = 128
# Hidden dimension of the LSTM
units = 64
# Tag categories
tag_list = ['b', 'm', 'e', 's']
num_tags = len(tag_list)
datafile = "./data/data.txt"

my_model = BiLSTMCRF(vocab_size, embed_size, units, num_tags)
train_loss = tf.keras.metrics.Mean(name='train_loss')
lstm_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
# optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
# Rough ranking of optimizer results: Adagrad > Adam > RMSprop > SGD

# Set up checkpointing, keeping only the latest 3 checkpoints
ckpt = tf.train.Checkpoint(my_model=my_model, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, './save_checkpoint/', max_to_keep=3)
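The training loop itself is not shown above; below is a minimal sketch, assuming a placeholder crf_loss helper that stands in for whatever loss this BiLSTMCRF implementation actually exposes, of how the optimizer, the train_loss metric and ckpt_manager are typically wired together:

def crf_loss(model, x, y):
    # Placeholder: replace with the model's CRF negative log-likelihood for a batch
    raise NotImplementedError

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        loss = crf_loss(my_model, x, y)
    grads = tape.gradient(loss, my_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, my_model.trainable_variables))
    train_loss(loss)  # update the running-mean metric defined above
    return loss

# Typical epoch loop:
# for epoch in range(num_epochs):
#     for batch_x, batch_y in dataset:
#         train_step(batch_x, batch_y)
#     ckpt_manager.save()  # keeps only the 3 most recent checkpoints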