Example #1
def test_step(x_batch, y_batch, writer=None):
    """
    Evaluates the model on a test batch.
    """
    # `cnn`, `sess`, and `calAUC` come from the enclosing training script.
    feed_dict = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 1.0  # disable dropout during evaluation
    }
    loss, accuracy, predict = sess.run(
        [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
    auc = calAUC(predict, y_batch)
    time_str = datetime.datetime.now().isoformat()
    print("{}: loss {:g}, acc {:g}, auc {:g}".format(
        time_str, loss, accuracy, auc))
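Every example on this page calls a helper calAUC(scores, labels) that is not shown here. A minimal sketch of what such a helper could look like, assuming it returns the standard ROC AUC of predicted scores against binary labels; the implementation in the source repository may differ:

import numpy as np
from scipy.stats import rankdata


def calAUC(prob, labels):
    # Mann-Whitney formulation of ROC AUC; average ranks handle ties.
    prob = np.asarray(prob, dtype=float).ravel()
    labels = np.asarray(labels, dtype=int).ravel()
    n_pos = int(labels.sum())
    n_neg = len(labels) - n_pos
    if n_pos == 0 or n_neg == 0:
        return 0.5  # AUC is undefined when only one class is present
    ranks = rankdata(prob)  # 1-based ranks, ties get the average rank
    return (ranks[labels == 1].sum() - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)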
Example #2
def modelStack(models):
    # Average the per-model test predictions written by `testing` (Example #5)
    # into a single ensemble score per sample; requires numpy as np and the
    # calAUC helper from the same module.
    S = []  # scores, one list per model
    L = []  # labels (identical across the per-model files)
    T = []  # texts (identical across the per-model files)
    for model in models:
        with open('data/test_predict_' + model + '.txt', 'r') as f:
            s = f.read().strip().split('\n')[1:]  # skip the header line
            s = [ss.split('\t') for ss in s]
            scores = [float(ss[0]) for ss in s]
            labels = [int(ss[1]) for ss in s]
            texts = [ss[2] for ss in s]
        S.append(scores)
        L = labels
        T = texts
    S = np.array(S)
    S = np.mean(S, axis=0)  # ensemble score = mean over models
    A = ['预测值\t实际值\t文本']  # header: predicted / actual / text
    for i in range(len(S)):
        A.append('\t'.join(['%0.4f' % S[i], str(L[i]), T[i]]))
    with open('data/test_predict_' + 'all' + '.txt', 'w') as f:
        f.write('\n'.join(A))
    y_p = S
    yTst = L
    auc = calAUC(y_p, yTst)
    thr0 = [0.1 * i for i in range(10)]  # thresholds 0.0 .. 0.9
    R = ['\t'.join(['阈值', '准确率', '精度', '召回率'])]  # header: threshold / accuracy / precision / recall
    for thr in thr0:
        yp = [int(t > thr) for t in y_p]
        TP = sum([yp[i] == yTst[i] for i in range(len(yp)) if yp[i] == 1])
        TN = sum([yp[i] == yTst[i] for i in range(len(yp)) if yp[i] == 0])
        FP = sum([yp[i] != yTst[i] for i in range(len(yp)) if yp[i] == 1])
        FN = sum([yp[i] != yTst[i] for i in range(len(yp)) if yp[i] == 0])
        acc = float(TP + TN) / len(yp)
        # guard against dividing by zero when a threshold leaves no predicted positives
        pre = float(TP) / (TP + FP) if (TP + FP) > 0 else 0.0
        rec = float(TP) / (TP + FN) if (TP + FN) > 0 else 0.0
        R.append('\t'.join(
            ['%0.2f' % thr,
             '%0.4f' % acc,
             '%0.4f' % pre,
             '%0.4f' % rec]))
        print([thr, acc, pre, rec])
    R.append('%0.4f' % auc)
    with open('data/test_result-' + 'all' + '.txt', 'w') as f:
        f.write('\n'.join(R))
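The hand-rolled threshold sweep above (and the one in Example #5) recomputes the confusion matrix with list comprehensions and can divide by zero when a threshold leaves no predicted positives. A hedged, vectorized equivalent in numpy; the function name threshold_report is hypothetical and not part of the original code:

import numpy as np


def threshold_report(scores, labels, thresholds):
    # Accuracy / precision / recall at each threshold, with zero-division guards.
    scores = np.asarray(scores, dtype=float).ravel()
    labels = np.asarray(labels, dtype=int).ravel()
    rows = []
    for thr in thresholds:
        pred = (scores > thr).astype(int)
        tp = int(np.sum((pred == 1) & (labels == 1)))
        tn = int(np.sum((pred == 0) & (labels == 0)))
        fp = int(np.sum((pred == 1) & (labels == 0)))
        fn = int(np.sum((pred == 0) & (labels == 1)))
        acc = float(tp + tn) / len(labels)
        pre = float(tp) / (tp + fp) if (tp + fp) else 0.0
        rec = float(tp) / (tp + fn) if (tp + fn) else 0.0
        rows.append((thr, acc, pre, rec))
    return rows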
Example #3
def dev_step(x_batch, y_batch, writer=None):
    """
    Evaluates the model on a dev set.
    """
    feed_dict = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 1.0  # disable dropout during evaluation
    }
    step, summaries, loss, accuracy, predict = sess.run([
        global_step, dev_summary_op, cnn.loss, cnn.accuracy,
        cnn.predictions
    ], feed_dict)
    auc = calAUC(predict, y_batch)
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}, auc {:g}".format(
        time_str, step, loss, accuracy, auc))
    if writer:
        writer.add_summary(summaries, step)
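dev_step relies on a dev_summary_op and an optional summary writer created elsewhere in the training script. One way such wiring might look in TensorFlow 1.x; the original script presumably attaches the summaries directly to the model's loss and accuracy tensors, so the placeholder stand-ins below are an assumption for the sake of a self-contained sketch:

import tensorflow as tf

# Stand-ins for cnn.loss / cnn.accuracy; in the real script these would be
# the model's own tensors rather than placeholders.
loss_t = tf.placeholder(tf.float32, name='loss_value')
acc_t = tf.placeholder(tf.float32, name='accuracy_value')

loss_summary = tf.summary.scalar('loss', loss_t)
acc_summary = tf.summary.scalar('accuracy', acc_t)
dev_summary_op = tf.summary.merge([loss_summary, acc_summary])

# Writer that would be passed to dev_step as `writer`; it appends events
# that TensorBoard can display.
dev_writer = tf.summary.FileWriter('runs/dev')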
Example #4
def training(path_train,
             path_test,
             config_feature,
             path_ckpt,
             config_train,
             mode='lr'):
    # dataSplit, simple_lr, simple_lr_dense, iterData, iterData_test, and
    # calAUC are helpers defined elsewhere in the same module.
    XTrn, XTst, yTrn, yTst = dataSplit(path_train, path_test, config_feature)
    feature_dim = len(XTrn[0])
    config_train.feature_dim = feature_dim
    if mode == 'lr' or mode == 'word':
        X_holder, y_holder, learning_rate, predict_y, loss, optimizer, train_op, grads, accuracy = simple_lr(
            feature_dim)
    if mode == 'lr-dense' or mode == 'lr-w2v-word-dense':
        X_holder, y_holder, learning_rate, predict_y, loss, optimizer, train_op, grads, accuracy = simple_lr_dense(
            config_train)
    if mode == 'lr-w2v' or mode == 'lr-w2v-word' or mode == 'lr-word':
        # warm-start from a previously trained plain-LR model and pad the
        # weight vector with zeros for the newly added features
        W_lr = np.load('lr-ckpt/W.npy')
        b = np.load('lr-ckpt/b.npy')
        W_w2v = np.zeros((feature_dim - W_lr.shape[0], 1))
        W = np.concatenate((W_lr, W_w2v), axis=0)
        X_holder, y_holder, learning_rate, predict_y, loss, optimizer, train_op, grads, accuracy = simple_lr(
            feature_dim, W=W, b=b)
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.group(train_op, [tf.assign_add(global_step, 1)])
    saver = tf.train.Saver(max_to_keep=10)
    session = tf.Session()
    if not os.path.exists(path_ckpt):
        os.mkdir(path_ckpt)
    ckpt_file = tf.train.latest_checkpoint(path_ckpt)
    if ckpt_file:
        saver.restore(session, ckpt_file)
        print('restore model from %s' % ckpt_file)
    else:
        init = tf.global_variables_initializer()
        session.run(init)
    learning_rate_ = config_train.learning_rate
    # renamed from `iter` to avoid shadowing the builtin iter()
    iter_train = iterData(XTrn,
                          yTrn,
                          batch_size=config_train.train_batch_size,
                          epoch=config_train.epochs)
    iter_test = iterData_test(XTst,
                              yTst,
                              batch_size=config_train.test_batch_size)
    data = next(iter_train)
    step = 0
    epoch = 0
    print('training begin')
    auc0 = 0
    while data != '__RETURN__':        # '__RETURN__': all epochs consumed
        if data == '__STOP__':         # '__STOP__': end of the current epoch
            data = next(iter_train)
            epoch += 1
            continue
        x0, y0 = data
        y0 = np.array(y0)
        y0 = np.reshape(y0, (len(y0), 1))
        if step % config_train.step_saveckpt == 0:
            # periodically checkpoint and evaluate on one test batch
            saver.save(session,
                       os.path.join(path_ckpt, 'model.ckpt'),
                       global_step=global_step)
            loss_ = session.run(loss,
                                feed_dict={
                                    X_holder: x0,
                                    y_holder: y0,
                                    learning_rate: learning_rate_
                                })
            datatest = next(iter_test)
            x0_test, y0_test = datatest
            y0_test = np.array(y0_test)
            y0_test = np.reshape(y0_test, (len(y0_test), 1))
            y_p0 = session.run(predict_y,
                               feed_dict={
                                   X_holder: x0_test,
                                   y_holder: y0_test,
                                   learning_rate: learning_rate_
                               })
            y_p = [tmp[0] for tmp in y_p0]
            auc = calAUC(y_p, y0_test)
            if auc > auc0:
                # best AUC so far: copy the latest checkpoint files to a backup
                # directory (assumed to exist already)
                auc0 = auc
                path_backup = path_ckpt + '-backup/'
                tmpfile = tf.train.latest_checkpoint(path_ckpt)
                cfile = tmpfile + ".*"
                cmdstr = "cp " + cfile + " " + path_backup
                os.system(cmdstr)
                print('****ckpt updated****')
            print('epoch:{}-step:{}-auc_test:{}-loss_trn:{}'.format(
                epoch, step, '%0.3f' % auc, '%0.4f' % loss_))
        session.run(train_op,
                    feed_dict={
                        X_holder: x0,
                        y_holder: y0,
                        learning_rate: learning_rate_
                    })
        step += 1
        data = next(iter_train)
    print('training over!')
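Both training and testing build their graph through simple_lr (or simple_lr_dense), which this page does not show. A sketch of a simple_lr consistent with how it is called here, i.e. a single-layer logistic regression over a fixed-length feature vector with optional warm-start weights; the real implementation in the source repository may differ:

import numpy as np
import tensorflow as tf


def simple_lr(feature_dim, W=None, b=None):
    # Logistic regression: predict_y = sigmoid(X @ W + b).
    X_holder = tf.placeholder(tf.float32, [None, feature_dim], name='X')
    y_holder = tf.placeholder(tf.float32, [None, 1], name='y')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    W_init = W.astype(np.float32) if W is not None else np.zeros((feature_dim, 1), np.float32)
    b_init = b.astype(np.float32) if b is not None else np.zeros((1,), np.float32)
    W_var = tf.Variable(W_init, name='W')
    b_var = tf.Variable(b_init, name='b')
    logits = tf.matmul(X_holder, W_var) + b_var
    predict_y = tf.nn.sigmoid(logits)
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y_holder, logits=logits))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads)
    correct = tf.equal(tf.cast(predict_y > 0.5, tf.float32), y_holder)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    return (X_holder, y_holder, learning_rate, predict_y, loss,
            optimizer, train_op, grads, accuracy)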
Example #5
def testing(path_test,
            config_feature,
            path_ckpt,
            config_train,
            ckpt_file='',
            mode='lr',
            name=''):
    # getFeature, simple_lr, simple_lr_dense, and calAUC are helpers defined
    # elsewhere in the same module.
    with open(path_test, 'r') as f:
        S = f.read().strip().split('\n')
    S = [s.split('\t') for s in S]  # each line: text \t label
    XTst = [getFeature(s[0], config_feature) for s in S]
    yTst = [int(s[1]) for s in S]
    print('number of positive/negative samples of testSet is {}/{}'.format(
        sum(yTst),
        len(yTst) - sum(yTst)))
    feature_dim = len(XTst[0])
    print('feature dim is %d' % feature_dim)
    config_train.feature_dim = feature_dim
    if 'dense' in mode:
        X_holder, y_holder, learning_rate, predict_y, loss, optimizer, train_op, grads, accuracy = simple_lr_dense(
            config_train)
    else:
        X_holder, y_holder, learning_rate, predict_y, loss, optimizer, train_op, grads, accuracy = simple_lr(
            feature_dim)
    global_step = tf.train.get_or_create_global_step()
    train_op = tf.group(train_op, [tf.assign_add(global_step, 1)])
    saver = tf.train.Saver(max_to_keep=10)
    session = tf.Session()
    if len(ckpt_file) == 0:
        ckpt_file = tf.train.latest_checkpoint(path_ckpt)
    saver.restore(session, ckpt_file)
    print('restore model from %s' % ckpt_file)
    learning_rate_ = config_train.learning_rate
    x0_test, y0_test = XTst, yTst
    y0_test = np.array(y0_test)
    y0_test = np.reshape(y0_test, (len(y0_test), 1))
    y_p0 = session.run(predict_y,
                       feed_dict={
                           X_holder: x0_test,
                           y_holder: y0_test,
                           learning_rate: learning_rate_
                       })
    y_p = [tmp[0] for tmp in y_p0]
    auc = calAUC(y_p, y0_test)
    X = ['\t'.join(['预测值', '实际值', '文本'])]  # header: predicted / actual / text
    for i in range(len(S)):
        X.append('%0.4f' % y_p[i] + '\t' + S[i][1] + '\t' + S[i][0])
    with open('data/test_predict_' + mode + name + '.txt', 'w') as f:
        f.write('\n'.join(X))
    thr0 = [0.1 * i for i in range(10)]            # thresholds 0.0 .. 0.9
    thr0 += [0.91 + 0.01 * i for i in range(9)]    # plus a finer sweep 0.91 .. 0.99
    R = ['\t'.join(['阈值', '准确率', '召回率'])]  # header: threshold / accuracy / precision / recall
    for thr in thr0:
        yp = [int(t > thr) for t in y_p]
        TP = sum([yp[i] == yTst[i] for i in range(len(yp)) if yp[i] == 1])
        TN = sum([yp[i] == yTst[i] for i in range(len(yp)) if yp[i] == 0])
        FP = sum([yp[i] != yTst[i] for i in range(len(yp)) if yp[i] == 1])
        FN = sum([yp[i] != yTst[i] for i in range(len(yp)) if yp[i] == 0])
        acc = float(TP + TN) / len(yp)
        # guard against dividing by zero when a threshold leaves no predicted positives
        pre = float(TP) / (TP + FP) if (TP + FP) > 0 else 0.0
        rec = float(TP) / (TP + FN) if (TP + FN) > 0 else 0.0
        R.append('\t'.join(
            ['%0.2f' % thr,
             '%0.4f' % acc,
             '%0.4f' % pre,
             '%0.4f' % rec]))
        print(R[-1])  # print only the row computed for this threshold
    print('auc=%0.4f' % auc)
    R.append('auc=%0.4f' % auc)
    with open('data/test_result-' + mode + name + '.txt', 'w') as f:
        f.write('\n'.join(R))
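A hedged example of how these pieces could be driven end to end. The paths and the config_feature / config_train objects are stand-ins for whatever the source script actually defines, and tf.reset_default_graph() keeps the separate graph constructions from piling up in a single process:

import tensorflow as tf

if __name__ == '__main__':
    # Hypothetical driver: train a plain LR model, score the test set with two
    # model variants, then average their prediction files with modelStack.
    training('data/train.txt', 'data/test.txt', config_feature,
             'lr-ckpt', config_train, mode='lr')
    tf.reset_default_graph()
    testing('data/test.txt', config_feature, 'lr-ckpt', config_train, mode='lr')
    tf.reset_default_graph()
    testing('data/test.txt', config_feature, 'lr-word-ckpt', config_train, mode='lr-word')
    modelStack(['lr', 'lr-word'])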