Code Example #1
    def __init__(self):
        # Load the frozen inference graph (ner.pb) into the default graph.
        output_graph_def = tf.GraphDef()

        with open('./ckpt_3/ner.pb', "rb") as f:
            output_graph_def.ParseFromString(f.read())
            _ = tf.import_graph_def(output_graph_def, name="")
        self.sess = tf.Session()
        init = tf.global_variables_initializer()
        self.sess.run(init)
        # Load the vocabulary dictionaries.
        self.char2id, self.ner2id, self.pos2id = load_dict(
            char_dict="train_data_4/char2id.json",
            ner_dict="train_data_4/ner2id.json",
            pos_dict="train_data_4/pos2id.json")
        self.id2type = {value: key for key, value in self.ner2id.items()}
        self.ids = list(self.id2type.keys())
        # Look up the graph's input and output tensors by name.
        self.input_ids = self.sess.graph.get_tensor_by_name(
            "placeholder/input_chars:0")
        self.input_pos = self.sess.graph.get_tensor_by_name(
            "placeholder/input_pos:0")
        # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0")  # is_training
        self.viterbi_sequence = self.sess.graph.get_tensor_by_name(
            "doc_segment/ReverseSequence_1:0")
Code Example #2
def predict(text):
    """
        Run NER over `text` with the frozen graph and assemble SPO triples.
    :return: a list of schema dicts with "subject" and "object" filled in
    """
    def load_model():
        output_graph_def = tf.GraphDef()

        with open('./ckpt_3/ner.pb', "rb") as f:
            output_graph_def.ParseFromString(f.read())
            _ = tf.import_graph_def(output_graph_def, name="")

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)
        return sess

    # Load the vocabulary dictionaries.
    char2id, ner2id = load_dict(char_dict="train_data_3/char2id.json",
                                ner_dict="train_data_3/ner2id.json")
    id2type = {value: key for key, value in ner2id.items()}
    ids = list(id2type.keys())
    sess = load_model()

    input_ids = sess.graph.get_tensor_by_name("placeholder/input_chars:0")
    # is_training = sess.graph.get_tensor_by_name("placeholder/Placeholder:0")  # is_training
    viterbi_sequence = sess.graph.get_tensor_by_name(
        "doc_segment/ReverseSequence_1:0")

    t1 = time.time()
    # Map characters to ids (1 is presumably the unknown-char id) and pad.
    x = sequence_padding([char2id.get(c, 1) for c in text], max_len=384)
    feed_dict = {
        input_ids: [x],
        # is_training: True,
    }
    predicts_d = sess.run([viterbi_sequence], feed_dict)[0]
    p = predicts_d.tolist()[0]
    # Collect (start, end) entity spans from the predicted tag sequence.
    IOS = []
    index = 0
    start = None
    for i in p:
        if i == 0:  # tag 0: padding, i.e. the end of the sentence
            if start is not None:
                IOS.append((start, index))
            break
        elif i == 1:  # tag 1: outside any entity
            if start is not None:
                IOS.append((start, index))
                start = None
        else:  # inside an entity
            if start is None:
                start = index
            elif i != p[index - 1]:  # a new entity type starts here
                IOS.append((start, index))
                start = index
        index += 1
    # print(p)
    print(IOS)
    extract_dict = []
    # First find the subject, i.e. the span tagged 2.
    subject = ""
    for i in IOS:
        if p[i[0]] == 2:
            subject = text[i[0]:i[1]]
            break
    if subject != "":
        for i in IOS:
            if p[i[0]] > 2:
                schema = schemas[p[i[0]] - 3]
                object_ = text[i[0]:i[1]]
                # {"predicate": "连载网站", "object_type": "网站", "subject_type": "网络小说", "object": "晋江文学城", "subject": "猫喵"}
                schema["subject"] = subject
                schema["object"] = object_
                # extract_id = p[i[0]]
                # tag = id2type.get(extract_id)
                # value = text[i[0]:i[1]]
                extract_dict.append(schema)
        if len(extract_dict) < 1:
            print(text)
    else:
        print(text)
    return extract_dict
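
A hedged usage sketch for predict(); the sample sentence is a placeholder built from the schema comment above, and the extraction shown is not guaranteed.

# Hypothetical call; the sentence echoes the schema example in the comments.
triples = predict("《猫喵》是连载于晋江文学城的网络小说")
for spo in triples:
    print(spo["subject"], spo["predicate"], spo["object"])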
Code Example #3
def train():
    """
        Train the NER model.
    :return:
    """
    char2id, ner2id, pos2id = load_dict(char_dict="train_data_4/char2id.json",
                                        ner_dict="train_data_4/ner2id.json",
                                        pos_dict="train_data_4/pos2id.json")
    # tf.flags.DEFINE_string("data_dir", "data/data.dat", "data directory")
    tf.flags.DEFINE_integer("vocab_size_c", len(char2id), "vocabulary size")
    tf.flags.DEFINE_integer("vocab_size_p", len(pos2id), "vocabulary size")
    tf.flags.DEFINE_integer("num_classes", len(ner2id), "number of classes")
    tf.flags.DEFINE_integer("max_num", 384, "max_sentence_num")
    tf.flags.DEFINE_integer(
        "embedding_size_c", 256,
        "Dimensionality of character embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "embedding_size_p", 256,
        "Dimensionality of POS embedding (default: 200)")
    tf.flags.DEFINE_integer(
        "hidden_size", 128, "Dimensionality of GRU hidden layer (default: 50)")
    tf.flags.DEFINE_integer("batch_size", 256, "Batch Size (default: 64)")
    tf.flags.DEFINE_integer("num_epochs", 10,
                            "Number of training epochs (default: 50)")
    tf.flags.DEFINE_integer("checkpoint_every", 100,
                            "Save model after this many steps (default: 100)")
    tf.flags.DEFINE_integer("num_checkpoints", 3,
                            "Number of checkpoints to store (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 300,
                            "evaluate every this many batches")
    tf.flags.DEFINE_float("learning_rate", 0.01, "learning rate")
    tf.flags.DEFINE_float("grad_clip", 5,
                          "grad clip to prevent gradient explode")
    FLAGS = tf.flags.FLAGS
    # `config` is assumed to be a module-level tf.ConfigProto (not shown).
    with tf.Session(config=config) as sess:
        ner = NER(vocab_size_c=FLAGS.vocab_size_c,
                  vocab_size_p=FLAGS.vocab_size_p,
                  num_classes=FLAGS.num_classes,
                  embedding_size_c=FLAGS.embedding_size_c,
                  embedding_size_p=FLAGS.embedding_size_p,
                  hidden_size=FLAGS.hidden_size,
                  max_num=FLAGS.max_num)

        # Define the optimizer outside the model graph.
        global_step = tf.Variable(0, trainable=False)
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # Gradient clipping, commonly used with RNNs, to prevent exploding gradients.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(ner.loss, tvars),
                                          FLAGS.grad_clip)
        grads_and_vars = tuple(zip(grads, tvars))
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.num_checkpoints)
        if not os.path.exists('./ckpt_3/'):
            os.makedirs("./ckpt_3/")

        # Restore a saved model, or initialize parameters from scratch.
        # model_file = tf.train.latest_checkpoint('./ckpt/')
        ckpt = tf.train.get_checkpoint_state('./ckpt_3/')
        if ckpt:
            print("load saved model:\t", ckpt.model_checkpoint_path)
            # Reuse the saver created above so max_to_keep is preserved.
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("init model...")
            sess.run(tf.global_variables_initializer())

        def extract(p):
            # Collect (start, end) entity spans from a tag sequence.
            IOS = []
            index = 0
            start = None
            for i in p:
                if i == 0:  # tag 0: padding, i.e. the end of the sentence
                    if start is not None:
                        IOS.append((start, index))
                    break
                elif i == 1:  # tag 1: outside any entity
                    if start is not None:
                        IOS.append((start, index))
                        start = None
                else:  # inside an entity
                    if start is None:
                        start = index
                    elif i != p[index - 1]:  # a new entity type starts here
                        IOS.append((start, index))
                        start = index
                index += 1

            return IOS
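
        # Worked example (hypothetical input): for p = [2, 2, 1, 3, 3, 0],
        # extract(p) returns [(0, 2), (3, 5)] -- positions 0-1 form the
        # subject span (tag 2) and positions 3-4 an object span (tag 3).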

        def evaluate(viterbi_sequence, Y):
            '''
                Count span-level true positives (TP), predicted spans (P_),
                and gold spans (R_) over variable-length sequences.
            :return:
            '''
            TP = 0
            P_ = 0
            R_ = 0
            for p, y in zip(viterbi_sequence, Y):
                # Extract spans from the predicted and the gold sequence.
                pre_ = extract(p)
                tru_ = extract(y)
                # Count exact span matches between prediction and gold.
                comm = [i for i in pre_ if i in tru_]
                TP += len(comm)
                P_ += len(pre_)
                R_ += len(tru_)
                # l = len(np.nonzero(y))
                # # 通过两个序列,计算准确率
                # t_all += l
                # t_true += np.sum(np.equal(p[:l], y[:l]))

            return TP, P_, R_
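
        # From these counts, train_step and dev_step below report precision
        # p = TP / P_, recall r = TP / R_, and F1 f = 2*p*r / (p + r),
        # guarding each division against zero.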

        def train_step(x, pos, y):
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: True,
            }
            _, step, predicts_t, cost, accuracy = sess.run([
                train_op, global_step, ner.viterbi_sequence, ner.loss, ner.acc
            ], feed_dict)
            tp, p_, r_ = evaluate(np.array(predicts_t), y)
            time_str = str(int(time.time()))
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("{}: step {}, loss {},  p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # train_summary_writer.add_summary(summaries, step)
            return step

        def dev_step(x, pos, y, writer=None):
            feed_dict = {
                ner.input_chars: x,
                ner.input_pos: pos,
                ner.output: y,
                ner.is_training: False,
            }
            step, predicts_d, cost, accuracy = sess.run(
                [global_step, ner.viterbi_sequence, ner.loss, ner.acc],
                feed_dict)

            tp, p_, r_ = evaluate(np.array(predicts_d), y)

            time_str = str(int(time.time()))
            p = float(tp) / p_ if p_ else 0
            r = float(tp) / r_ if r_ else 0
            if p + r:
                f = 2 * p * r / (p + r)
            else:
                f = 0
            print("+dev+{}: step {}, loss {}, p {}, r {}, f {}".format(
                time_str, step, cost, p, r, f))
            # time_str = str(int(time.time()))
            # print("+dev+{}: step {}, loss {}, f_acc {}, t_acc {}".format(time_str, step, cost, accuracy, acc_d))
            return cost, tp, p_, r_

        best_accuracy, best_at_step = 0, 0

        # Hard-coded dataset sizes, matching the generated tf_record files.
        train_example_len = 173109
        dev_example_len = 21639
        num_train_steps = int(train_example_len / FLAGS.batch_size *
                              FLAGS.num_epochs)
        num_dev_steps = int(dev_example_len / FLAGS.batch_size)

        min_loss = 99999

        input_ids_train, input_pos_train, output_types_train = get_input_data(
            "./train_data_4/train_ner.tf_record", FLAGS.batch_size)
        input_ids_dev, input_pos_dev, output_types_dev = get_input_data(
            "./train_data_4/dev_ner.tf_record", FLAGS.batch_size)
        for i in range(num_train_steps):
            # Fetch a batch of training data.
            input_ids_train_, input_pos_train_, output_types_train_ = sess.run(
                [input_ids_train, input_pos_train, output_types_train])
            step = train_step(input_ids_train_, input_pos_train_,
                              output_types_train_)
            if step % FLAGS.evaluate_every == 0:
                # The dev set is too large for one pass; evaluate in batches.
                TP = 0
                P_ = 0
                R_ = 0
                total_loss = 0
                for j in range(num_dev_steps):
                    input_ids_dev_, input_pos_dev_, output_types_dev_ = sess.run(
                        [input_ids_dev, input_pos_dev, output_types_dev])
                    loss, tp, p_, r_ = dev_step(input_ids_dev_, input_pos_dev_,
                                                output_types_dev_)
                    TP += tp
                    P_ += p_
                    R_ += r_
                    total_loss += loss
                    # total_dev_correct += count
                    # total_devs += total
                p = float(TP) / P_ if P_ else 0
                r = float(TP) / R_ if R_ else 0
                f = 2 * p * r / (p + r) if p + r else 0
                print("tp:p", TP, p)
                print("p_:r", P_, r)
                print("r_:f", R_, f)
                if total_loss < min_loss:
                    print("save model:\t%f\t>%f\t%f\t>%f" %
                          (total_loss, p, r, f))
                    min_loss = total_loss
                    saver.save(sess, './ckpt_3/ner.ckpt', global_step=step)

        sess.close()
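
A minimal entry point for training, assuming this function lives in a module alongside the helpers it references (load_dict, get_input_data, NER, config); the __main__ guard is an assumption, not part of the original.

# Hypothetical entry point (not shown in the original source):
if __name__ == "__main__":
    train()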