コード例 #1
0
 def sta_test_x(self):
     """Return cached statistical features for the test set (lazy-loaded).

     First tries to read the pre-computed CSV; on any failure it falls
     back to recomputing the statistics from the raw test texts, which
     also rewrites the CSV for the next run.
     """
     if self.__sta_test_x is None:
         try:
             # Fast path: load the pre-computed statistics from file.
             df = pd.read_csv(PATH_CONFIG.get('file_sta_test'))
             self.__sta_test_x = df[self.__sta_ltdfCol].values
         except Exception:
             # File missing/unreadable (or expected columns absent):
             # recompute from the original texts. Was a bare `except:`,
             # which also swallowed KeyboardInterrupt/SystemExit.
             print('重新计算统计数据 - test')
             texts = self.test_org_texts()
             self.__sta_test_x = self.create_sta_data(
                 texts, PATH_CONFIG.get('file_sta_test'))
     return self.__sta_test_x
コード例 #2
0
def cut_test_text():
    """Return the test texts word-segmented with jieba.

    Each element is one text as a single space-joined string. Loads the
    cached cut file when present; otherwise segments the raw test data,
    normalizes newlines to spaces, and writes the cache for next time.
    """
    try:
        lt_cut_text = read_cut_file(PATH_CONFIG.get('file_cut_test'))
    except Exception:
        # Cache missing or unreadable — rebuild it from the original data.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        print('测试数据首次加载...')
        lt_org_data = src2.d_org_data.org_test_data()
        # Segment and strip newlines in one pass; embedded newlines would
        # break the one-text-per-line cache file format.
        lt_cut_text = [
            ' '.join(jieba.cut(text, cut_all=False))
            .replace('\n', ' ').replace('\r', ' ')
            for text in lt_org_data
        ]
        # Save so the next call can take the fast path.
        save_cut_file(PATH_CONFIG.get('file_cut_test'), lt_cut_text)

    return lt_cut_text
コード例 #3
0
    def __init__(self, f_mgr):
        """Build the CNN predictor and restore the trained weights.

        Args:
            f_mgr: feature manager supplying the w2v embedding matrix,
                the id->label map, vocabulary sizes, and the f2 feature
                vectors used to size the model config.
        """
        self.f_mgr = f_mgr
        self.w2v_matrix = self.f_mgr.w2v_matrix()
        self.map_id_to_label = self.f_mgr.map_id_to_label()

        # Size the config from the actual data before building the graph.
        self.config = src2.ProConfig.ProConfig()
        self.config.w2v_vocab_size = len(self.w2v_matrix)
        self.config.cix_vocab_size = self.f_mgr.cix_vocab_size
        # f2 dimension is inferred from the first training feature vector.
        self.config.f2_dimension = len((self.f_mgr.f2_train_x())[0])
        self.model = src2.ModelCnn.ModleCnn_3(self.config)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=PATH_CONFIG.get('file_cnn_best'))  # restore the saved best-checkpoint weights
コード例 #4
0
def process():
    """Run the trained CNN over the test set in batches and write the submission file.

    Output: one "<test-id>,<label>" line per test example, written to
    PATH_CONFIG['file_sub'] as UTF-8.
    """
    # Feature / data manager.
    f_mgr = src2.features.Features(src2.ProConfig.ProConfig())

    # Predictor wrapping the restored model.
    run = CnnPredict(f_mgr)

    print('加载测试数据...')
    test_voc_x = f_mgr.voc_test_x()
    test_w2v_x = f_mgr.w2v_test_x()
    test_f2_x = f_mgr.f2_test_x()
    test_cix_x = f_mgr.cix_test_x()

    print('加载测试数据id号 (从原始数据)...')
    test_id = src2.d_org_data.org_test_index()

    # Batched prediction.
    print('开始生成结果...')
    data_len = len(test_w2v_x)
    batch_size = 20
    num_batch = int((data_len - 1) / batch_size) + 1
    lt_ids = []
    for i in range(num_batch):
        # Coarse progress indicator (roughly every 10%).
        if int((i * 1.0 / num_batch) * 100) % 10 == 0:
            print("%d / %d" % (i, num_batch))
        bng_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)

        d1 = test_voc_x[bng_id:end_id]
        d2 = test_w2v_x[bng_id:end_id]
        d3 = test_f2_x[bng_id:end_id]
        d4 = test_cix_x[bng_id:end_id]

        lt_ids += run.predict_data(d1, d2, d3, d4)

    print('结果存入文件...')
    # Loop variable renamed from `id` — it shadowed the builtin.
    lines = '\n'.join(
        str(tid) + ',' + run.map_id2label(label_id)
        for tid, label_id in zip(test_id, lt_ids))

    save_path = PATH_CONFIG.get('file_sub')
    with codecs.open(save_path, "w", "utf-8") as f:
        f.write(lines)
        f.write('\n')
    print('完成: ' + save_path)
コード例 #5
0
def org_test_index():
    """Return the 'id' values of the original test records, in file order."""
    path = PATH_CONFIG.get('file_org_test')
    frame = pd.read_json(path, lines=True).set_index('id')
    return list(frame.index.values)
コード例 #6
0
def org_train_data():
    """Load the original training set.

    Returns:
        (texts, labels): the '内容' (content) column and the '标签'
        (label) column as two parallel lists.
    """
    path = PATH_CONFIG.get('file_org_train')
    frame = pd.read_json(path, lines=True).set_index('id')
    texts = list(frame['内容'].values)
    labels = list(frame['标签'].values)
    return texts, labels
コード例 #7
0
 def save_path(self):
     """Return the path of the word2vec model file for the current vector size."""
     w2v_dir = PATH_CONFIG.get('dir_w2v')
     return '%s/w2v_%d.mod' % (w2v_dir, self.__vec_size)
コード例 #8
0
def process():
    """Train the CNN text classifier.

    Loads every feature representation, sizes the shared config from the
    data, splits off a validation set, then trains with periodic
    evaluation, TensorBoard logging, best-model checkpointing,
    learning-rate decay on plateaus, and early stopping.
    """

    print('载入数据...')
    start_time = time.time()

    # Global configuration (mutated below with data-derived sizes).
    config = src2.ProConfig.ProConfig()

    # Data / feature manager.
    f_mgr = src2.features.Features(config)

    # Feature vectors for training.
    train_voc_x = f_mgr.voc_train_2_x()

    train_cix_x = f_mgr.cix_train_2_x()
    config.cix_vocab_size = f_mgr.cix_vocab_size

    train_w2v_x = f_mgr.w2v_train_2_x()

    w2v_matrix = f_mgr.w2v_matrix()
    config.w2v_vocab_size = len(w2v_matrix)

    train_f2_x = f_mgr.f2_train_2_x()
    # f2 dimension inferred from the first feature vector.
    config.f2_dimension = len(train_f2_x[0])

    # Training labels.
    train_y = f_mgr.train_2_y()

    # Split off a validation set (120 presumably is the validation size
    # or a split seed — confirm against get_train_validation).
    train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, valid_voc_x, valid_w2v_x, valid_f2_x, valid_cix_x, valid_y = \
        get_train_validation(train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, 120)

    time_dif = get_time_dif(start_time)
    print("数据载入完成:", time_dif)

    # Build the model (ModleCnn_3 variant).
    # model = src2.ModelCnn.ModleCnn_1(config)
    model = src2.ModelCnn.ModleCnn_3(config)

    # TensorBoard setup.
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(PATH_CONFIG.get('dir_cnn_tb'))

    # Saver for checkpointing the best model.
    saver = tf.train.Saver()

    # Create the session.
    # session = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('开始执行训练...')
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy seen so far
    best_acc_tra = 0.0  # train accuracy at the best checkpoint
    best_los_tra = 0.0  # train loss at the best checkpoint
    best_los_val = 0.0  # validation loss at the best checkpoint
    learning_rate = config.learning_rate
    last_improved = 0  # batch index of the last improvement
    last_learn_rate_dec = 0  # batch index of the last learning-rate decay
    require_improvement = 8000  # stop early after this many batches without improvement
    learnrate_dec_rounds = 2000  # decay the learning rate after this many batches without improvement

    flag = False
    for epoch in range(config.epochs_num):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, config.batch_size)
        for x1_batch, x2_batch, x3_batch, x4_batch, y_batch in batch_train:
            feed_dict = feed_data(model, x1_batch, x2_batch, x3_batch, x4_batch, y_batch, config.dropout_keep_prob, w2v_matrix,
                                  learning_rate)
            if total_batch % config.save_per_batch == 0:
                # Periodically write training metrics to TensorBoard.
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on train and validation sets.
                # Disable dropout for evaluation.
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, model, valid_voc_x, valid_w2v_x, valid_f2_x, valid_cix_x, valid_y, w2v_matrix,
                                             learning_rate)  # todo

                if acc_val > best_acc_val:
                    # New best validation accuracy — checkpoint the model.
                    best_acc_val = acc_val
                    best_acc_tra = acc_train
                    best_los_tra = loss_train
                    best_los_val = loss_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=PATH_CONFIG.get('file_cnn_best'))
                    improved_str = '*'
                elif acc_val == best_acc_val:
                    # Tie on validation accuracy: accept only if it strictly
                    # improves at least one of train acc / train loss /
                    # validation loss without worsening the others.
                    if (acc_train > best_acc_tra and loss_train <= best_los_tra and loss_val <= best_los_val) or \
                            (acc_train >= best_acc_tra and loss_train < best_los_tra and loss_val <= best_los_val) or \
                            (acc_train >= best_acc_tra and loss_train <= best_los_tra and loss_val < best_los_val):
                        # Checkpoint the improved tie-break result.
                        best_acc_val = acc_val
                        best_acc_tra = acc_train
                        best_los_tra = loss_train
                        best_los_val = loss_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=PATH_CONFIG.get('file_cnn_best'))
                        improved_str = '*'
                    else:
                        improved_str = ''
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # No improvement for too long — stop training early.
                print("验证集长时间没有提升,强制训练结束...")
                flag = True
                break  # exit the inner (batch) loop

            if total_batch - last_improved > learnrate_dec_rounds and total_batch - last_learn_rate_dec > learnrate_dec_rounds:
                # Plateau detected — decay the learning rate (with a floor).
                learning_rate = max(learning_rate * config.learning_rate_dec_rate, config.learning_rate_min)
                last_learn_rate_dec = total_batch
                print("%d轮无结果,降低学习率%f" % (total_batch - last_improved, learning_rate))

        if flag:  # early-stop flag set in the inner loop
            break

    session.close()
コード例 #9
0
 def save_path(self):
     """Return the path of the TF-IDF model file for the current vector size."""
     tfidf_dir = PATH_CONFIG.get('dir_tfidt')
     return '%s/tfidf_%d.mod' % (tfidf_dir, self.__vec_size)