def train():
    train_data, train_label, word2id, word_embedding, max_sentence_len = load_all(
        settings.TRAIN_PATH, settings.VOCAB_PATH, settings.VOCAB_EMBEDDING_PATH)
    # test no embedding
    # word_embedding = np.random.uniform(-0.25, 0.25, word_embedding.shape)
    ner_model = NerModel(word2id, word_embedding, settings.TAGS, max_sentence_len, settings.EMBEDDING_SIZE)
    ner_model.train(train_data, train_label, save_path=settings.MODEL_PATH)
# Preprocess the data
data_train = load_data()
getter = modifying(data_train)
getter.get_next()
tag2id, n_tags, word2id, n_words = getter.indexing()
# pad all sentences to the same length
text_sequences, label_sequences = getter.padding(args.max_len, word2id, tag2id)

# convert to a TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, label_sequences))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)

print("hidden_num:{}, vocab_size:{}, label_size:{}".format(args.hidden_num, len(word2id), len(tag2id)))

#######################################################################################################
model = NerModel(hidden_num=args.hidden_num, vocab_size=len(word2id) + 1,
                 label_size=len(tag2id), embedding_size=args.embedding_size)
optimizer = tf.keras.optimizers.Adam(args.lr)

ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt, args.output_dir, checkpoint_name='model.ckpt', max_to_keep=3)
#########################################################################################################


# @tf.function
def train_one_step(text_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits, text_lens, log_likelihood = model(text_batch, labels_batch, training=True)
        loss = -tf.reduce_mean(log_likelihood)
    gradients = tape.gradient(loss, model.trainable_variables)
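    # --- Illustrative completion (not shown in the original): apply the gradients and return
    # the values the caller needs. The driver loop below is a minimal sketch; args.epoch and
    # the report interval of 20 steps are assumptions.
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss, logits, text_lens


for epoch in range(args.epoch):
    for step, (text_batch, labels_batch) in enumerate(train_dataset):
        loss, logits, text_lens = train_one_step(text_batch, labels_batch)
        if step % 20 == 0:
            print("epoch {}, step {}, loss {:.4f}".format(epoch, step, float(loss)))
    ckpt_manager.save()  # keeps at most 3 checkpoints, as configured above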
import json

import tensorflow as tf
import tensorflow_addons as tf_ad

from model import NerModel
from utils import tokenize, read_vocab, format_result
from args_help import args

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.test_path, vocab2id, tag2id)

optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num, vocab_size=len(vocab2id),
                 label_size=len(tag2id), embedding_size=args.embedding_size)

# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))

while True:
    text = input("input:")
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [[vocab2id.get(char, 0) for char in text]], padding='post')
    print(dataset)
    logits, text_lens = model.predict(dataset)
    paths = []
    for logit, text_len in zip(logits, text_lens):
        viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params)
        paths.append(viterbi_path)
    print(paths[0])
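    # --- Illustrative addition: map the predicted tag ids back to readable tags and pair them
    # with the input characters (assumes id2tag is keyed by int ids). format_result, imported
    # above, presumably does something similar, but its signature is not shown here.
    pred_tags = [id2tag[tag_id] for tag_id in paths[0]]
    print(list(zip(text, pred_tags)))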
parser.add_argument("--embedding_size", type=int, default=32,help="embedding dim") parser.add_argument("--output_dir", type=str, default='./checkpoint',help="output dir") parser.add_argument("--lr", type=float, default=1e-3,help="lr") parser.add_argument("--batch_size", type=int, default=64,help="lr") args = parser.parse_args() gpus=tf.config.experimental.list_physical_devices(device_type='GPU') tf.config.experimental.set_visible_devices(devices=gpus[2], device_type='GPU') vocab2id, id2vocab = read_vocab(args.vocab_file) tag2id, id2tag = read_vocab(args.tag_file) text_sequences, text_lens ,label_sequences= tokenize_pred(args.test_file,vocab2id,tag2id) train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, text_lens, label_sequences)) train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True) optimizer = tf.keras.optimizers.Adam(args.lr) model = NerModel(hidden_num = args.hidden_num, vocab_size =len(vocab2id), label_size = len(tag2id), embedding_size = args.embedding_size) # restore model ckpt = tf.train.Checkpoint(optimizer=optimizer,model=model) ckpt.restore(tf.train.latest_checkpoint(args.output_dir)) for text_batch, text_lens,labels_batch in train_dataset: logits, _ = model.predict(text_batch) paths = [] for logit, text_len, labels in zip(logits, text_lens, labels_batch): viterbi_path, _ = tf_ad.text.viterbi_decode(logit[:text_len], model.transition_params) paths.append(viterbi_path) for i in range(len(text_batch)): res = {'text':[],'pred':[],'label':[]} for j,t in enumerate(paths[i]): res['text'].append(id2vocab.get(text_batch[i][j].numpy(),'<UKN>'))
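            # --- Illustrative completion: fill in the predicted and gold tags the same way
            # the characters are collected above (assumes id2tag is keyed by int ids).
            res['pred'].append(id2tag[t])
            res['label'].append(id2tag[labels[j].numpy()])
        print(res)  # one record per sentence: characters, predicted tags, gold tags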
tag2id, id2tag = read_vocab(args.tag_file)
print(id2tag)
text_sequences, label_sequences, text_origin, label_origin = tokenize(
    args.test_path, vocab2id, tag2id)  # text_sequences has shape (159, 110)
embedded_matrix = build_embedding_matrix(args.pretrain_embedding_vec, vocab2id)
# print('inspect the values and shape of text_sequences:')
# print(text_sequences.shape)
# print(type(text_sequences))

# load the model
optimizer = tf.keras.optimizers.Adam(args.lr)
model = NerModel(hidden_num=args.hidden_num,
                 vocab_size=len(vocab2id),
                 label_size=len(tag2id),
                 embedding_size=args.embedding_size,
                 embedding_matrix=embedded_matrix)
# restore model
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))


def evaluationMetrics(id2tag, logits_batch, labels_batch):
    """
    (To be wired into the model) add precision and recall as evaluation metrics for the test set.
    logits_batch holds the predicted values (per batch);
    labels_batch holds the ground-truth values (per batch).
    """
    entity = []      # gold entities
    pre_entity = []  # predicted entities
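    # --- Illustrative completion, assuming BIO-style tags ("B-XXX"/"I-XXX"/"O") and that both
    # logits_batch and labels_batch are batches of tag-id sequences. Entities are collected as
    # (sentence, start, end, type) tuples so precision/recall reduce to set intersections.
    # This is a sketch, not the author's original implementation.
    def extract_entities(batch, sent_idx):
        spans, start, ent_type = [], None, None
        for pos, tag_id in enumerate(batch[sent_idx]):
            tag = id2tag.get(int(tag_id), 'O')
            if tag.startswith('B-'):
                if start is not None:
                    spans.append((sent_idx, start, pos, ent_type))
                start, ent_type = pos, tag[2:]
            elif tag.startswith('I-') and start is not None and tag[2:] == ent_type:
                continue
            else:
                if start is not None:
                    spans.append((sent_idx, start, pos, ent_type))
                start, ent_type = None, None
        if start is not None:
            spans.append((sent_idx, start, len(batch[sent_idx]), ent_type))
        return spans

    for k in range(len(labels_batch)):
        entity.extend(extract_entities(labels_batch, k))
        pre_entity.extend(extract_entities(logits_batch, k))

    correct = len(set(entity) & set(pre_entity))
    precision = correct / len(pre_entity) if pre_entity else 0.0
    recall = correct / len(entity) if entity else 0.0
    return precision, recall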
# Build the training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, label_sequences))
# drop the trailing examples that do not fill a whole batch
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)

# # debug code, delete after use
# for _, (text_batch, labels_batch) in enumerate(train_dataset):
#     print(type(text_batch))
#     print(text_batch.shape)
#     print(text_batch)
#     break

# Build the model
logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format(args.hidden_num, len(vocab2id), len(tag2id)))
model = NerModel(hidden_num=args.hidden_num, vocab_size=len(vocab2id), label_size=len(tag2id),
                 embedding_size=args.embedding_size, embedding_matrix=embedded_matrix)
optimizer = tf.keras.optimizers.Adam(args.lr)

ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt.restore(tf.train.latest_checkpoint(args.output_dir))
ckpt_manager = tf.train.CheckpointManager(ckpt, args.output_dir, checkpoint_name='model.ckpt', max_to_keep=3)


# @tf.function
def train_one_step(text_batch, labels_batch):
    with tf.GradientTape() as tape:
        # this invokes the model's call method
        logits, text_lens, log_likelihood = model(text_batch, labels_batch, training=True)
        # log_likelihood is what the loss is computed from
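
# --- Separate illustrative sketch: what a call like model(text_batch, labels_batch, training=True)
# typically computes for a BiLSTM-CRF tagger with this constructor signature. This is NOT the
# project's model.py; the layer choices and the use of tf_ad.text.crf_log_likelihood are assumptions.
import tensorflow as tf
import tensorflow_addons as tf_ad


class BiLstmCrfSketch(tf.keras.Model):
    def __init__(self, hidden_num, vocab_size, label_size, embedding_size, embedding_matrix=None):
        super().__init__()
        if embedding_matrix is not None:
            # start from pretrained vectors when an embedding matrix is supplied
            self.embedding = tf.keras.layers.Embedding(
                vocab_size, embedding_size,
                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix))
        else:
            self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.bilstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_num, return_sequences=True))
        self.dense = tf.keras.layers.Dense(label_size)
        self.transition_params = None

    def call(self, text, labels=None, training=None):
        # sequence lengths from the non-zero (non-padding) positions
        text_lens = tf.math.reduce_sum(tf.cast(tf.math.not_equal(text, 0), tf.int32), axis=-1)
        logits = self.dense(self.bilstm(self.embedding(text), training=training))
        if labels is None:
            return logits, text_lens
        label_sequences = tf.convert_to_tensor(labels, dtype=tf.int32)
        log_likelihood, self.transition_params = tf_ad.text.crf_log_likelihood(
            logits, label_sequences, text_lens)
        return logits, text_lens, log_likelihood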
def get_model():
    global __model
    if not __model:
        __model = NerModel()
    return __model
#!/usr/bin/env python
# encoding: utf-8
'''
@author: Ben
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: [email protected]
@file: keras_run.py
@time: 2019/8/15 09:42
@desc:
'''
from model import NerModel
from utils import *

if __name__ == '__main__':
    log.i('Start main function.')
    model = NerModel()
    model.train() if is_train() else model.predict()
    log.i('Process finish')
logger.info("vocab file exits!!") vocab2id, id2vocab = read_vocab(args.vocab_file) tag2id, id2tag = read_vocab(args.tag_file) text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id) train_dataset = tf.data.Dataset.from_tensor_slices( (text_sequences, label_sequences)) train_dataset = train_dataset.shuffle(len(text_sequences)).batch( args.batch_size, drop_remainder=True) logger.info("hidden_num:{}, vocab_size:{}, label_size:{}".format( args.hidden_num, len(vocab2id), len(tag2id))) model = NerModel(hidden_num=args.hidden_num, vocab_size=len(vocab2id), label_size=len(tag2id), embedding_size=args.embedding_size) model.compile(optimizer=tf.keras.optimizers.Adam(args.lr), # loss= ) ckpt = tf.train.Checkpoint(optimizer=model.optimizer, model=model) ckpt.restore(tf.train.latest_checkpoint(args.output_dir)) ckpt_manager = tf.train.CheckpointManager(ckpt, args.output_dir, checkpoint_name='model.ckpt', max_to_keep=3) model.fit(train_dataset, batch_size=args.batch_size, epoch=args.epoch) # @tf.function