Example #1
def train():
    # ----------------------------------- Data preparation -------------------------------------
    train_manager = BatchManager(batch_size=20, name='train')
    test_manager = BatchManager(batch_size=100, name='test')

    # ----------------------------------- Load the dictionary -------------------------------------
    mapping_dict = get_dict(dict_file)

    # ----------------------------------- Build the model -------------------------------------
    model = Model(mapping_dict)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(5):
            j = 1
            for batch in train_manager.iter_batch(shuffle=True):
                start = time.time()
                loss = model.run_step(sess, batch)
                end = time.time()
                if j % 5 == 0:
                    print('epoch:{},step:{}/{},loss:{},elapsed:{},estimated:{}'.
                          format(i + 1, j, train_manager.len_data, loss,
                                 end - start,
                                 (end - start) * (train_manager.len_data - j)))
                j += 1
            for batch in test_manager.iter_batch(shuffle=True):
                test_result = model.predict(sess,
                                            batch,
                                            istrain=False,
                                            istest=True)
                print('precision rate: {} %'.format(test_result[1]))
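
BatchManager, get_dict, and Model are project helpers that the listing does not show. As a minimal sketch of the batching interface the loop above assumes (hypothetical names; the real class presumably loads the dataset itself), len_data is treated as the number of batches, which matches its use as the step total in the progress log:

import random

class SimpleBatchManager:
    """Hypothetical stand-in for the project's BatchManager (assumption)."""

    def __init__(self, data, batch_size):
        # Pre-slice the dataset into fixed-size batches
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)

    def iter_batch(self, shuffle=False):
        # Optionally reshuffle between epochs, then yield one batch at a time
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch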
Example #2
def predict_line(param):
    # Initialize the logger
    logger = get_logger(param.test_log_file)
    tf_config = tf.ConfigProto()
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model (weights are restored from the saved checkpoint below)
    model = Model(param, mapping_dict)
    # Start inference
    with tf.Session(config=tf_config) as sess:
        # First check whether the model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # Check whether a trained checkpoint is available
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # If it exists, restore the weights
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        while True:
            # Repeatedly read sentences from stdin and predict
            line = input("Enter a test sentence: ")
            raw_inputs, model_inputs = input_from_line_with_feature(line)
            tag = model.evaluate_line(sess, model_inputs)
            result = result_to_json(raw_inputs, tag)
            result = js.dumps(result,
                              ensure_ascii=False,
                              indent=4,
                              separators=(',', ': '))
            with open('./result/result.json', 'w', encoding='utf-8') as f:
                f.write(result)
            print("预测结果为:{}".format(result))
Example #3
def test(param):
    # Validate hyper-parameters
    assert param.clip < 5.1, "gradient clip should not be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Build the batch manager
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total number of test data is {}".format(number_dataset))
    # Configure logging
    logger = get_logger(param.test_log_file)
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model
    model = Model(param, mapping_dict)
    # Configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # First check whether the model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # Check whether a trained checkpoint is available
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # If it exists, restore the weights
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # Run evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(
            model.best_test_f1.eval()))
        logger.info('Test time for batch size {} is {:.2f} sec\n'.format(
            param.test_batch_size,
            time.time() - start))
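
get_logger is assumed to return a standard logging.Logger that writes to the given file; a minimal sketch of such a helper (the project's version may differ):

import logging

def get_logger(log_file):
    """Sketch: file + console logger keyed by the log file path."""
    logger = logging.getLogger(log_file)
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:  # avoid duplicate handlers on repeated calls
        fmt = logging.Formatter("%(asctime)s - %(message)s")
        fh = logging.FileHandler(log_file)
        fh.setFormatter(fmt)
        ch = logging.StreamHandler()
        ch.setFormatter(fmt)
        logger.addHandler(fh)
        logger.addHandler(ch)
    return logger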
Example #4
def annotation():
    #train()
    # Read the whole CSV file
    try:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='UTF-8')
    except UnicodeDecodeError:
        # Fall back to GBK for files written with a Chinese locale encoding
        whole_data = pd.read_csv(raw_data, sep=',', encoding='GBK')
    except Exception as e:
        print(e)

    row_num = whole_data.shape[0]
    print(row_num)

    # ----------------------------------- Load the dictionary -------------------------------------
    mapping_dict = get_dict(dict_file)

    # ----------------------------------- Build the model -------------------------------------
    model = Model(mapping_dict)

    feature_list = ['Per', 'Com', 'Time', 'Job', 'Nat', 'Bir', 'Age', 'Gdr', 'Uni', 'Edu', 'Sch', 'Col', 'Maj', 'Zhi', 'Hon']
    feature_dataframe = pd.DataFrame(columns=feature_list)

    # Recreate the Test folder, then iterate over every row for prediction
    for i in range(row_num):
        # shutil.rmtree removes the whole directory tree; os.makedirs can create nested directories
        if os.path.exists('data/Test'):
            shutil.rmtree('data/Test')
        if not os.path.exists('data/Test'):
            os.makedirs('data/Test')

        cur_data = whole_data['ManagerResume'][i]
        print(cur_data)

        filename = 'data/Test/need_annotation.txt'
        with open(filename, 'w') as f:  # created if missing; 'w' truncates any existing content before writing
            f.write(cur_data)

        task_process(split_text)
        get_data('task')

        # ----------------------------------- Data preparation -------------------------------------
        task_manager = BatchManager(batch_size=1, name='task')

        # ----------------------------------- Build the model -------------------------------------

        item_T = pd.DataFrame()

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for batch in task_manager.iter_batch(shuffle=True):
                task_result, item = model.predict(sess, batch, istrain=False, istest=False)
                # Keep the predicted entities for this resume
                item_T = pd.DataFrame(item['entities'])
                print(item_T)

        # ------------------------------- Store the annotated data ----------------------------------
        f_Key = {}

        for feature in feature_list:
            l_type = []
            for j in range(item_T.shape[0]):
                if item_T['type'].iloc[j] == feature:
                    l_type.append(item_T['word'].iloc[j])
            f_Key.update({feature: l_type})

        feature_dataframe = feature_dataframe.append(f_Key, ignore_index=True)

    FinalResult = pd.concat([whole_data, feature_dataframe], axis=1)
    fpath = 'FinalResult.csv'
    FinalResult.to_csv(fpath)
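
The per-resume aggregation at the end of annotation() can be exercised on its own; a toy run with made-up entities (all values hypothetical) shows how one row of feature_dataframe is built:

import pandas as pd

entities = [{"word": "张三", "type": "Per"}, {"word": "北京大学", "type": "Uni"},
            {"word": "工程师", "type": "Job"}, {"word": "李四", "type": "Per"}]
item_T = pd.DataFrame(entities)

feature_list = ["Per", "Com", "Job", "Uni"]
# One column per entity type, holding the list of matching words
row = {feature: item_T.loc[item_T["type"] == feature, "word"].tolist()
       for feature in feature_list}
print(row)  # {'Per': ['张三', '李四'], 'Com': [], 'Job': ['工程师'], 'Uni': ['北京大学']}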
Example #5
def train(param):
    # Validate hyper-parameters
    assert param.clip < 5.1, "gradient clip should not be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Data preparation
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total number of train data is {}".format(number_dataset))
    # Create the required folders
    make_path(param)
    # Configure logging
    logger = get_logger(param.train_log_file)
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Read senc_tag to prepare for loading the word vectors
    senc_tag = get_sent_tag(param.sent_tag_file)
    # Load the pretrained embeddings
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(), param.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in senc_tag])))
    # Get the total number of training batches (steps per epoch)
    steps_per_epoch = train_manager.len_data
    # Configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # Initialize (or restore) the model
        model = creat_model(sess,
                            Model,
                            param.ckpt_path,
                            load_word2vec,
                            param,
                            id_to_char,
                            logger,
                            map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # Start the epoch timer
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # Track the per-batch loss for a running average
                loss.append(batch_loss)
                # Accumulate the total loss for the epoch-level average
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info(
                        "epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                            i + 1, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
            # Save the model for this epoch
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, total loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch: {:.4f} min; estimated {:.2f} h for the remaining epochs\n'
                .format((time.time() - start) / 60,
                        ((param.max_epoch - i - 1) *
                         (time.time() - start)) / 3600))
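
load_word2vec and augment_with_pretrained are project helpers as well. A hedged sketch of what an embedding loader of this kind typically does, building a matrix aligned with id_to_char from a plain-text vector file (the function name, signature, and initialization scheme are assumptions, not the project's actual code):

import numpy as np

def load_pretrained_embeddings(emb_file, id_to_char, emb_dim):
    """Sketch: read 'token v1 v2 ... vn' lines and fill an embedding matrix."""
    pretrained = {}
    with open(emb_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) == emb_dim + 1:
                pretrained[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    # Characters without a pretrained vector keep a small uniform random init
    scale = np.sqrt(3.0 / emb_dim)
    weights = np.random.uniform(-scale, scale, (len(id_to_char), emb_dim))
    for idx, char in id_to_char.items():
        if char in pretrained:
            weights[idx] = pretrained[char]
    return weights.astype(np.float32)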