Code example #1
    def prodeuce_embedding_vec_file(
            filename, path="../data/simple_questions/fb_0_2m_files"):
        dh = data_helper.DataClass("sq")
        model = models.Word2Vec.load(filename)
        # Iterate over every word, look up its word2vec vector, then write it out

        v_base = model['end']
        ct.print(v_base)

        for word in dh.converter.vocab:
            try:
                v = model[word]
            except Exception as e1:
                msg1 = "%s : %s " % (word, e1)
                ct.print(msg1)
                ct.just_log(path + "/wiki.vector.log", msg1)
                v = model['end']
            m_v = ' '.join([str(x) for x in list(v)])
            msg = "%s %s" % (word, str(m_v))
            ct.just_log(path + "/wiki.vector", msg)
        # Record one extra word
        word = 'end'
        v = model[word]
        m_v = ' '.join([str(x) for x in list(v)])
        msg = "%s %s" % (word, str(m_v))
        ct.just_log(path + "/wiki.vector", msg)
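Note: the example above writes one line per word to `wiki.vector` in the form `word v1 v2 ... vn`. A minimal sketch of reading that file back into a vocabulary list and a NumPy matrix could look like the following (`load_vector_file` is a hypothetical helper, not part of the project, and assumes no word contains a space):

import numpy as np

def load_vector_file(path):
    # Hypothetical helper: parse lines of the form "word v1 v2 ... vn"
    # as written by prodeuce_embedding_vec_file above.
    vocab, vectors = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            if len(parts) < 2:
                continue
            vocab.append(parts[0])
            vectors.append([float(x) for x in parts[1:]])
    return vocab, np.array(vectors, dtype=np.float32)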
Code example #2
File: train.py Project: yikedouer/deeplearning
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings
    # with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
    #     text = f.read()
    # converter = TextConverter(text, FLAGS.max_vocab)
    # converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    #
    # arr = converter.text_to_arr(text)
    # g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)

    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; all candidates are generated from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs  # number of sentences in one batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps  # length of one sentence
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=False,
                    dh=dh
                    )

    model.train(
        FLAGS.max_steps,
        model_path,
        FLAGS.save_every_n,
        FLAGS.log_every_n,
    )
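This example assumes a set of command-line flags defined elsewhere in train.py. A rough sketch of how those flags might be declared with the TF 1.x flags API is below; the flag names are taken from the example, while the default values are illustrative placeholders, not the project's actual settings:

import tensorflow as tf

# Assumed flag definitions; defaults are illustrative only.
tf.flags.DEFINE_string('name', 'default', 'model name (sub-directory under model/)')
tf.flags.DEFINE_integer('lstm_size', 128, 'size of the LSTM hidden state')
tf.flags.DEFINE_integer('num_layers', 2, 'number of LSTM layers')
tf.flags.DEFINE_float('learning_rate', 0.001, 'learning rate')
tf.flags.DEFINE_float('train_keep_prob', 0.5, 'dropout keep probability during training')
tf.flags.DEFINE_boolean('use_embedding', True, 'use a pre-trained embedding matrix')
tf.flags.DEFINE_integer('embedding_size', 128, 'embedding dimension')
tf.flags.DEFINE_integer('max_steps', 100000, 'maximum number of training steps')
tf.flags.DEFINE_integer('save_every_n', 1000, 'save a checkpoint every n steps')
tf.flags.DEFINE_integer('log_every_n', 10, 'log every n steps')
FLAGS = tf.flags.FLAGS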
Code example #3
def main():
    with tf.device("/gpu"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        now = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        # Important: whether to restore the model, the loss part, and the number of relations
        model = FLAGS.mode
        test_style = True
        ct.print("tf:%s should be 1.2.1 model:%s " %
                 (str(tf.__version__), model))  # 1.2.1
        ct.print("mark:%s " % config.cc_par('mark'), 'mark')  # 1.2.1
        ct.just_log2("info", now)
        ct.just_log2("result", now)
        ct.just_log2("info", get_config_msg())
        ct.print(get_config_msg(), "mark")
        ct.just_log3(
            "test_check",
            "mode\tid\tglobal_id\tglobal_id_in_origin\tquestion\tentity\tpos\tanswer\tr1\tr2\tr3\n"
        )
        ct.log3(now)
        msg1 = "t_relation_num:%d  train_part:%s loss_part:%s" % \
               (config.cc_par('t_relation_num'),config.cc_par('train_part'), config.cc_par('loss_part'))
        ct.print(msg1)
        msg1 = 'restore:%s use_alias_dict:%s' % (
            config.cc_par('restore_model'), config.cc_par('use_alias_dict'))
        ct.print(msg1)
        if config.cc_par('restore_model'):
            ct.print(config.cc_par('restore_path'))

        embedding_weight = None
        error_test_dict = dict()
        valid_test_dict = dict()
        # 1 Read in all the data and return batches of labeled data {data.x, data.label}
        dh = data_helper.DataClass(model, "test")
        if FLAGS.word_model == "word2vec_train":
            embedding_weight = dh.embeddings

        # 3 Build the LSTM model class
        # loss_type = "pair"
        discriminator = Discriminator(
            max_document_length=dh.max_document_length,  # timesteps
            word_dimension=FLAGS.word_dimension,  # dimension of a single word
            vocab_size=dh.converter.vocab_size,  # embedding_size: size of the embedding matrix W
            rnn_size=FLAGS.rnn_size,  # hidden layer size
            model=model,
            need_cal_attention=config.cc_par('d_need_cal_attention'),
            need_max_pooling=FLAGS.need_max_pooling,
            word_model=FLAGS.word_model,
            embedding_weight=embedding_weight,
            need_gan=True,
            first=True)

        # generator = Generator(
        #     max_document_length=dh.max_document_length,  # timesteps
        #     word_dimension=FLAGS.word_dimension,  # dimension of a single word
        #     vocab_size=dh.converter.vocab_size,  # embedding_size: size of the embedding matrix W
        #     rnn_size=FLAGS.rnn_size,  # hidden layer size
        #     model=model,
        #     need_cal_attention=config.cc_par('g_need_cal_attention'),  # run without attention
        #     need_max_pooling=FLAGS.need_max_pooling,
        #     word_model=FLAGS.word_model,
        #     embedding_weight=embedding_weight,
        #     need_gan=True, first=False)

        ct.print("max_document_length=%s,vocab_size=%s " %
                 (str(dh.max_document_length), str(dh.converter.vocab_size)))
        # Initialization
        init = tf.global_variables_initializer()
        merged = tf.summary.merge_all()
        with sess.as_default():
            writer = tf.summary.FileWriter(ct.log_path() + "\\log\\",
                                           sess.graph)
            sess.run(init)
            loss_dict = dict()
            loss_dict['loss'] = 0
            loss_dict['pos'] = 0
            loss_dict['neg'] = 0

            # Restore the model if required
            if config.cc_par('restore_model'):
                saver = tf.train.Saver(tf.global_variables(),
                                       max_to_keep=FLAGS.num_checkpoints)
                save_path = config.cc_par('restore_path')
                ct.print('restore:%s' % save_path, 'model')
                saver.restore(sess, config.cc_par('restore_path'))

            # 1 NER part 1
            print('Loading alias dictionary:')
            dh.bh.stat_dict('../data/nlpcc2016/4-ner/extract_entitys_all.txt')
            dh.bh.init_ner(f_in2='../data/nlpcc2016/4-ner/extract_e/e1.tj.txt')

            print('input:')
            line = '红楼梦的作者是谁?'  # input()  # i.e. "Who is the author of Dream of the Red Chamber?"
            _best_p, _best_s = ner_rel_analyisis(dh, discriminator, line,
                                                 sess)  # 2 NER LSTM 识别
            hh_dh = dh
            hh_discriminator = discriminator
            hh_sess = sess
            print(_best_s)
            print(_best_p)
            return hh_dh, hh_discriminator, hh_sess
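Since main() returns the DataClass instance, the Discriminator and the live session, a caller can reuse them to score further questions without rebuilding the graph. A minimal usage sketch (assumed, not from the project):

if __name__ == '__main__':
    hh_dh, hh_discriminator, hh_sess = main()
    # Reuse the already-initialized session for another interactive question.
    line = input('input:')
    _best_p, _best_s = ner_rel_analyisis(hh_dh, hh_discriminator, line, hh_sess)
    print(_best_s)
    print(_best_p)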
Code example #4
def main():
    time.sleep(0.5)  # sleep 0.5 s to let the previous process exit
    now = "\n\n\n" + str(datetime.datetime.now().isoformat())
    # test is the full run; small is a small subset; debug is a single pass
    model = FLAGS.mode
    ct.print("tf:%s should be 1.2.1 model:%s " %
             (str(tf.__version__), model))  # 1.2.1
    ct.just_log2("info", now)
    ct.just_log2("valid", now)
    ct.just_log2("test", now)
    ct.just_log2("info", get_config_msg())
    ct.log3(now)

    embedding_weight = None
    error_test_dict = dict()
    valid_test_dict = dict()
    # 1 Read in all the data and return batches of labeled data {data.x, data.label}
    dh = data_helper.DataClass(model)
    if FLAGS.word_model == "word2vec_train":
        embedding_weight = dh.embeddings

    # 3 Build the LSTM model class
    ct.print("max_document_length=%s,vocab_size=%s " %
             (str(dh.max_document_length), str(dh.converter.vocab_size)))
    lstm = mynn.CustomNetwork(
        max_document_length=dh.max_document_length,  # timesteps
        word_dimension=FLAGS.word_dimension,  # dimension of a single word
        vocab_size=dh.converter.vocab_size,  # embedding_size: size of the embedding matrix W
        rnn_size=FLAGS.rnn_size,  # hidden layer size
        model=model,
        need_cal_attention=FLAGS.need_cal_attention,
        need_max_pooling=FLAGS.need_max_pooling,
        word_model=FLAGS.word_model,
        embedding_weight=embedding_weight,
        need_gan=False)

    # 4 ----------------------------------- set up the loss -----------------------------------
    global_step = tf.Variable(0, name="global_step", trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(lstm.loss, tvars),
                                      FLAGS.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(1e-1)
    # apply_gradients only needs to be called once; the returned op is the train op
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Initialization
    init = tf.global_variables_initializer()
    merged = tf.summary.merge_all()

    with tf.Session().as_default() as sess:
        writer = tf.summary.FileWriter("log/", sess.graph)
        sess.run(init)

        embeddings = []
        use_error = False
        error_test_q_list = []
        error_test_pos_r_list = []
        error_test_neg_r_list = []

        # For testing: output all the training questions and test questions
        # dh.build_train_test_q()
        #
        train_step = 0
        max_acc = 0
        for step in range(FLAGS.epoches):

            toogle_line = ">>>>>>>>>>>>>>>>>>>>>>>>>step=%d,total_train_step=%d " % (
                step, len(dh.q_neg_r_tuple))
            ct.log3(toogle_line)
            ct.just_log2("info", toogle_line)

            # Prepare the data
            my_generator = ''
            if FLAGS.fix_model and len(error_test_q_list) != 0:
                my_generator = dh.batch_iter_wq_debug_fix_model(
                    error_test_q_list, error_test_pos_r_list,
                    error_test_neg_r_list, FLAGS.batch_size)
                use_error = True
                toogle_line = "\n\n\n\n\n------------------use_error to train"
                ct.log3(toogle_line)
                ct.just_log2("info", toogle_line)
                ct.just_log2("valid", 'use_error to train')
                ct.just_log2("test", 'use_error to train')
            elif ct.is_debug_few():
                toogle_line = "\n------------------is_debug_few to train"
                ct.log3(toogle_line)
                ct.just_log2("info", toogle_line)
                train_part = config.cc_par('train_part')
                model = 'train'
                # for the relation part candidates are generated, for questions they are read
                shuffle_indices = get_shuffle_indices_train(
                    len(dh.q_neg_r_tuple_train), step, train_part, model,
                    train_step)

                if train_part == 'relation':
                    my_generator = dh.batch_iter_wq_debug(
                        dh.train_question_list_index,
                        dh.train_relation_list_index, shuffle_indices,
                        FLAGS.batch_size, train_part)
                else:
                    my_generator = dh.batch_iter_wq_debug(
                        dh.train_question_list_index,
                        dh.train_answer_list_index, shuffle_indices,
                        FLAGS.batch_size, train_part)
            else:
                # not used
                train_q, train_cand, train_neg = \
                    dh.batch_iter_wq(dh.train_question_list_index, dh.train_relation_list_index,
                                     FLAGS.batch_size)

            toogle_line = "\n==============================train_step=%d\n" % train_step
            ct.just_log2("info", toogle_line)
            ct.log3(toogle_line)

            # Train on the batches
            for gen in my_generator:
                toogle_line = "\n==============================train_step=%d\n" % train_step
                ct.just_log2("info", toogle_line)
                ct.log3(toogle_line)

                if not use_error:
                    train_step += 1

                train_q = gen[0]
                train_cand = gen[1]
                train_neg = gen[2]
                run_step2(sess, lstm, step, train_step, train_op, train_q,
                          train_cand, train_neg, merged, writer, dh, use_error)

                if use_error:
                    continue
                    # -------------------------test
                    # 1 source data: training data OR validation data OR test data

            # Validation
            valid_test_dict, error_test_dict, max_acc, all_right,\
                error_test_q_list, error_test_pos_r_list, error_test_neg_r_list \
                = valid_test_checkpoint(train_step, dh, step, sess, lstm, merged, writer,
                                        train_op,
                                        valid_test_dict, error_test_dict, max_acc)

            if config.cc_par('keep_run') and all_right and step > 2:
                del lstm  # free resources
                del sess
                return True

            if use_error:
                error_test_q_list.clear()
                error_test_pos_r_list.clear()
                error_test_neg_r_list.clear()
                use_error = False
            toogle_line = "<<<<<<<<<<<<<<<<<<<<<<<<<<<<step=%d\n" % step
            # ct.just_log2("test", toogle_line)
            ct.just_log2("info", toogle_line)

            ct.log3(toogle_line)
Code example #5
def build_all_q_r_tuple():
    dh = data_helper.DataClass("sq")
    dh.build_all_q_r_tuple(99999, 9999999, is_record=True)
Code example #6
def main(_):
    # prepare_data()
    # FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    # converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings

    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; all candidates are generated from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs  # number of sentences in one batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps  # length of one sentence
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=True,
                    dh=dh
                    )

    model.load(FLAGS.checkpoint_path)
    # cs = []
    # cs.append('♠是什么类型的产品')
    # cs.append('♠是谁')
    # cs.append('♠是哪个公司的长度')
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.v1.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.v1.txt'
    f1s = ct.file_read_all_lines_strip(f1)
    f3s = ct.file_read_all_lines_strip(f3)
    f1s_new = []
    f3s_new = []
    for i in range(len(f1s)):
        # if str(f1s[i]).__contains__('NULL'):
        #     continue
        f1s_new.append(f1s[i])
        f3s_new.append(f3s[i])

    # Filter out NULL
    # Take each candidate entity, substitute it into the question and judge it

    # cs.append('立建候时么什是♠')
    # Read all candidate entities, score them, keep the top 3 and check accuracy

    f4s = []
    _index = -1
    for l1 in f1s_new:  # iterate over every question
        _index += 1
        replace_qs = []
        for l3 in f3s_new[_index].split('\t'):
            q_1 = str(l1).split('\t')[0].replace(l3, '♠')
            replace_qs.append((q_1, l3))
        entitys = []
        for content, l3 in replace_qs:
            # content = input("input:")
            start = dh.convert_str_to_indexlist_2(content, False)

            # arr = model.sample(FLAGS.max_length, start, dh.converter.vocab_size,dh.get_padding_num())
            # #converter.vocab_size
            r1, score_list = model.judge(start, dh.converter.vocab_size)
            entitys.append((l3, r1))
            # print(content)
            # print(r1)
            # print(score_list)
            ct.print("%s\t%s\t%s" % (content, l3, r1), 'debug_process')
        entitys.sort(key=lambda x: x[1])
        entitys_new = [x[0] for x in entitys]
        ct.print('\t'.join(entitys_new))
        f4s.append('\t'.join(entitys_new))
    ct.file_wirte_list(f4, f4s)