def __init__(self,
             pb_path='./ckpt_3/ner.pb',
             char_dict="train_data_4/char2id.json",
             ner_dict="train_data_4/ner2id.json",
             pos_dict="train_data_4/pos2id.json"):
    """Load the frozen NER graph and cache the tensors used at inference time.

    Generalized: the model and vocab paths are now parameters with the
    original hard-coded values as defaults, so callers are unaffected.

    Args:
        pb_path: path to the frozen GraphDef (.pb) exported after training.
        char_dict: path to the char -> id JSON vocabulary.
        ner_dict: path to the NER tag -> id JSON vocabulary.
        pos_dict: path to the POS tag -> id JSON vocabulary.
    """
    # Deserialize the frozen graph and merge it into the default graph.
    output_graph_def = tf.GraphDef()
    with open(pb_path, "rb") as f:
        output_graph_def.ParseFromString(f.read())
    _ = tf.import_graph_def(output_graph_def, name="")
    self.sess = tf.Session()
    # NOTE(review): a properly frozen .pb usually holds constants only, so
    # this initializer run is presumably a no-op — kept for safety; confirm
    # against the export pipeline before removing.
    init = tf.global_variables_initializer()
    self.sess.run(init)
    # Load the vocabulary mappings.
    self.char2id, self.ner2id, self.pos2id = load_dict(
        char_dict=char_dict, ner_dict=ner_dict, pos_dict=pos_dict)
    # Reverse mapping: tag id -> tag name, plus the list of known tag ids.
    self.id2type = {value: key for key, value in self.ner2id.items()}
    self.ids = list(self.id2type.keys())
    # Look up the input placeholders and the CRF decode output by name.
    self.input_ids = self.sess.graph.get_tensor_by_name(
        "placeholder/input_chars:0")
    self.input_pos = self.sess.graph.get_tensor_by_name(
        "placeholder/input_pos:0")
    # is_training placeholder exists in the graph but is not fed here:
    # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0")
    self.viterbi_sequence = self.sess.graph.get_tensor_by_name(
        "doc_segment/ReverseSequence_1:0")
def predict(text):
    """Run NER inference on *text* and extract (subject, object) relation triples.

    Args:
        text: raw input string; each character is mapped through char2id
            (unknown characters -> id 1) and the sequence is padded to 384.

    Returns:
        A list of schema dicts (copies of entries of the global ``schemas``
        table) with "subject" and "object" filled in; empty if no subject
        span (tag id 2) was found.
    """

    def load_model():
        """Import the frozen graph and return a live Session."""
        output_graph_def = tf.GraphDef()
        with open('./ckpt_3/ner.pb', "rb") as f:
            output_graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(output_graph_def, name="")
        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        return sess

    # BUGFIX: the original re-imported the graph and opened a brand-new
    # tf.Session on every call and never closed it (resource leak, duplicate
    # graph nodes).  Load once and cache on the function object instead.
    if not hasattr(predict, "_cache"):
        char2id, ner2id = load_dict(char_dict="train_data_3/char2id.json",
                                    ner_dict="train_data_3/ner2id.json")
        sess = load_model()
        predict._cache = {
            "char2id": char2id,
            "id2type": {value: key for key, value in ner2id.items()},
            "sess": sess,
            "input_ids": sess.graph.get_tensor_by_name(
                "placeholder/input_chars:0"),
            # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0")  # is_training
            "viterbi_sequence": sess.graph.get_tensor_by_name(
                "doc_segment/ReverseSequence_1:0"),
        }
    cache = predict._cache
    sess = cache["sess"]

    x = sequence_padding([cache["char2id"].get(c, 1) for c in text],
                         max_len=384)
    feed_dict = {
        cache["input_ids"]: [x],
        # is_training: True,
    }
    predicts_d = sess.run([cache["viterbi_sequence"]], feed_dict)[0]
    p = predicts_d.tolist()[0]

    # Decode the per-character tag sequence into (start, end) spans.
    # Observed tag convention: 0 = padding (stop), 1 = outside, >=2 = entity.
    IOS = []
    index = 0
    start = None
    for i in p:
        if i == 0:
            # Padding reached: close any open span and stop scanning.
            if start is not None:
                IOS.append((start, index))
            break
        elif i == 1:
            # Outside tag: close the open span, if any.
            if start is not None:
                if index > 0:
                    IOS.append((start, index))
                start = None
        else:
            # Entity tag: open a span, or split when the tag changes
            # between two adjacent entities.
            if start is None:
                start = index
            elif i != p[index - 1]:
                IOS.append((start, index))
                start = index
        index += 1
    print(IOS)

    extract_dict = []
    # The subject span is the one tagged with id 2.
    subject = ""
    for span in IOS:
        if p[span[0]] == 2:
            subject = text[span[0]:span[1]]
            break
    if subject != "":
        for span in IOS:
            if p[span[0]] > 2:
                # BUGFIX: copy the schema template.  The original mutated the
                # shared dict inside the global ``schemas`` list, so two spans
                # with the same predicate aliased one object and earlier
                # results were silently overwritten.
                # {"predicate": "连载网站", "object_type": "网站", "subject_type": "网络小说", "object": "晋江文学城", "subject": "猫喵"}
                schema = dict(schemas[p[span[0]] - 3])
                schema["subject"] = subject
                schema["object"] = text[span[0]:span[1]]
                extract_dict.append(schema)
    # The original printed ``text`` in both the empty and non-empty branches;
    # collapsed to a single debug print.
    print(text)
    return extract_dict
def train():
    """Train the NER model.

    Builds the graph from the ``NER`` model class, restores the latest
    checkpoint from ./ckpt_3/ when one exists (otherwise initializes fresh
    variables), then alternates training batches with periodic dev-set
    evaluation, saving a checkpoint whenever total dev loss improves.

    :return: None
    """
    # Vocabulary sizes drive the embedding / projection shapes below.
    char2id, ner2id, pos2id = load_dict(char_dict="train_data_4/char2id.json",
                                        ner_dict="train_data_4/ner2id.json",
                                        pos_dict="train_data_4/pos2id.json")
    # tf.flags.DEFINE_string("data_dir", "data/data.dat", "data directory")
    tf.flags.DEFINE_integer("vocab_size_c", len(char2id), "vocabulary size")
    tf.flags.DEFINE_integer("vocab_size_p", len(pos2id), "vocabulary size")
    tf.flags.DEFINE_integer("num_classes", len(ner2id), "number of classes")
    tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num")
    tf.flags.DEFINE_integer(
        "embedding_size_c", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "embedding_size_p", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "hidden_size", 128,
        "Dimensionality of GRU hidden layer (default: 50)")
    tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 10,
                            "Number of training epochs (default: 50)")
    tf.flags.DEFINE_integer("checkpoint_every", 100,
                            "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 3,
                            "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 300,
                            "evaluate every this many batches")
    tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
    tf.flags.DEFINE_float("grad_clip", 5,
                          "grad clip to prevent gradient explode")
    FLAGS = tf.flags.FLAGS
    # NOTE(review): ``config`` is not defined in this function — presumably a
    # module-level tf.ConfigProto; confirm it exists at import time.
    with tf.Session(config=config) as sess:
        ner = NER(vocab_size_c=FLAGS.vocab_size_c,
                  vocab_size_p=FLAGS.vocab_size_p,
                  num_classes=FLAGS.num_classes,
                  embedding_size_c=FLAGS.embedding_size_c,
                  embedding_size_p=FLAGS.embedding_size_p,
                  hidden_size=FLAGS.hidden_size,
                  max_num=FLAGS.max_num)
        # Optimizer defined outside the model class.
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # Gradient clipping, as commonly used for RNNs to prevent
        # exploding gradients.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars),
                                          FLAGS.grad_clip)
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.num_checkpoints)
        if not os.path.exists('./ckpt_3/'):
            os.makedirs("./ckpt_3/")
        # Restore an existing model / or initialize parameters from scratch.
        # model_file = tf.train.latest_checkpoint('./ckpt/')
        ckpt = tf.train.get_checkpoint_state('./ckpt_3/')
        if ckpt:
            print("load saved model:\t", ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("init model...")
            sess.run(tf.global_variables_initializer())

        def extract(p):
            # Decode a tag-id sequence into (start, end) entity spans.
            # Convention observed below: 0 = padding (stop), 1 = outside,
            # >=2 = entity tags; adjacent entities with different tags split.
            IOS = []
            index = 0
            start = None
            for i in p:
                if i == 0:
                    # Padding reached: close any open span and stop.
                    if start is None:
                        pass
                    else:
                        IOS.append((start, index))
                    break
                elif i == 1:
                    # Outside tag: close the open span, if any.
                    if start is None:
                        pass
                    else:
                        if index > 0:
                            IOS.append((start, index))
                        start = None
                else:
                    # Inside an entity.
                    if start is None:
                        start = index
                    else:
                        if i == p[index - 1]:
                            pass
                        else:
                            # Tag changed between adjacent entities: split.
                            IOS.append((start, index))
                            start = index
                index += 1
            return IOS

        def evaluate(viterbi_sequence, Y):
            '''
            Compute span-level precision/recall counts over variable-length
            sequences.

            :param viterbi_sequence: batch of predicted tag-id sequences.
            :param Y: batch of gold tag-id sequences.
            :return: (TP, P_, R_) — true-positive span count, predicted span
                count, and gold span count, summed over the batch.
            '''
            TP = 0
            P_ = 0
            R_ = 0
            for p, y in zip(viterbi_sequence, Y):
                # Spans for the current sentence.
                pre_ = extract(p)
                tru_ = extract(y)
                # Count exact span matches.
                comm = [i for i in pre_ if i in tru_]
                TP += len(comm)
                P_ += len(pre_)
                R_ += len(tru_)
                # l = len(np.nonzero(y))
                # # 通过两个序列,计算准确率
                # t_all += l
                # t_true += np.sum(np.equal(p[:l], y[:l]))
            return TP, P_, R_

        def train_step(x, pos, y):
            # One optimization step; prints span-level P/R/F for the batch
            # and returns the (incremented) global step.
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: True,
            }
            _, step, predicts_t, cost, accuracy = sess.run([
                train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc
            ], feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_t), y)
            time_str = str(int(time.time()))
            # Guard against zero denominators when no spans were found.
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # train_summary_writer.add_summary(summaries, step)
            return step

        def dev_step(x, pos, y, writer=None):
            # Forward-only evaluation step on one dev batch; returns the
            # loss plus span counts so the caller can aggregate P/R/F.
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: False,
            }
            step, predicts_d, cost, accuracy = sess.run(
                [global_step, ner.viterbi_sequence, ner.loss, ner.acc],
                feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_d), y)
            time_str = str(int(time.time()))
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("+dev+{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # time_str = str(int(time.time()))
            # print("+dev+{}: step {}, loss {}, f_acc {}, t_acc {}".format(time_str, step, cost, accuracy, acc_d))
            return cost, tp, p_, r_

        best_accuracy, best_at_step = 0, 0
        # NOTE(review): example counts are hard-coded — presumably the sizes
        # of the two tf_record files below; verify when data changes.
        train_example_len = 173109
        dev_example_len = 21639
        num_train_steps = int(train_example_len / FLAGS.batch_size *
                              FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)
        min_loss = 99999
        input_ids_train, input_pos_train, output_types_train = get_input_data(
            "./train_data_4/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, input_pos_dev, output_types_dev = get_input_data(
            "./train_data_4/dev_ner.tf_record", FLAGS.batch_size)
        for i in range(num_train_steps):
            # Pull one training batch from the input pipeline.
            input_ids_train_, input_pos_train_, output_types_train_ = sess.run(
                [input_ids_train, input_pos_train, output_types_train])
            step = train_step(input_ids_train_, input_pos_train_,
                              output_types_train_)
            if step % FLAGS.evaluate_every == 0:
                # The dev set is too large for one pass, so it is also
                # evaluated in batches.
                TP = 0
                P_ = 0
                R_ = 0
                total_loss = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, input_pos_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, input_pos_dev, output_types_dev])
                    loss, tp, p_, r_ = dev_step(input_ids_dev_,
                                                input_pos_dev_,
                                                output_types_dev_)
                    TP += tp
                    P_ += p_
                    R_ += r_
                    total_loss += loss
                    # total_dev_correct += count
                    # total_devs += total
                # Aggregate dev precision / recall / F1.
                p = float(TP) / P_ if P_ else 0
                r = float(TP) / R_ if R_ else 0
                f = 2 * p * r / (p + r) if p + r else 0
                print("tp:p", TP, p)
                print("p_:r", P_, r)
                print("r_:f", R_, f)
                # Checkpoint only when total dev loss improves.
                if total_loss < min_loss:
                    print("save model:\t%f\t>%f\t%f\t>%f" %
                          (total_loss, p, r, f))
                    min_loss = total_loss
                    saver.save(sess, './ckpt_3/ner.ckpt', global_step=step)
        sess.close()