def test_step(x_batch, y_batch, writer=None):
    """Run the model on one batch with dropout disabled and print its metrics.

    Fetches ``cnn.loss``, ``cnn.accuracy`` and ``cnn.predictions`` through the
    module-level ``sess``, passes the predictions and ``y_batch`` to
    ``calAUC``, and prints a timestamped summary line.

    Args:
        x_batch: values fed to ``cnn.input_x``.
        y_batch: values fed to ``cnn.input_y``; also handed to ``calAUC``.
        writer: accepted but not used by this function.
    """
    inputs = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        # A keep probability of 1.0 turns dropout off for evaluation.
        cnn.dropout_keep_prob: 1.0,
    }
    fetches = [cnn.loss, cnn.accuracy, cnn.predictions]
    batch_loss, batch_acc, batch_pred = sess.run(fetches, inputs)
    batch_auc = calAUC(batch_pred, y_batch)
    stamp = datetime.datetime.now().isoformat()
    report = "{}: loss {:g}, acc {:g}, auc {:g}".format(
        stamp, batch_loss, batch_acc, batch_auc)
    print(report)
def modelStack(models):
    """Average several models' prediction files and score the ensemble.

    For every name in ``models`` the file ``data/test_predict_<name>.txt`` is
    read. Its first line is skipped as a header; every remaining line is
    split on tabs into score, label and text. Scores are averaged per row
    across models; labels and texts are taken from the last file read.

    Two files are written:

    * ``data/test_predict_all.txt``: averaged score, label and text per row.
    * ``data/test_result-all.txt``: accuracy, precision and recall for the
      thresholds 0.0, 0.1, ..., 0.9, followed by the value returned by
      ``calAUC`` for the averaged scores.

    Precision is reported as 0.0 when a threshold produces no positive
    predictions, and recall as 0.0 when there are no positive labels,
    instead of failing with a division by zero.

    Args:
        models: iterable of model-name suffixes identifying prediction files.

    Raises:
        ValueError: if ``models`` yields no names.
    """
    per_model_scores = []
    labels = []
    texts = []
    for model in models:
        with open('data/test_predict_' + model + '.txt', 'r') as f:
            rows = f.read().strip().split('\n')[1:]
        rows = [row.split('\t') for row in rows]
        per_model_scores.append([float(row[0]) for row in rows])
        # Every file is expected to describe the same samples, so the labels
        # and texts of the last file stand in for all of them.
        labels = [int(row[1]) for row in rows]
        texts = [row[2] for row in rows]

    if not per_model_scores:
        raise ValueError('modelStack needs at least one model name')

    mean_scores = np.mean(np.array(per_model_scores), axis=0)

    merged = ['预测值\t实际值\t文本']
    for score, label, text in zip(mean_scores, labels, texts):
        merged.append('\t'.join(['%0.4f' % score, str(label), text]))
    with open('data/test_predict_' + 'all' + '.txt', 'w') as f:
        f.write('\n'.join(merged))

    auc = calAUC(mean_scores, labels)

    report = ['\t'.join(['阈值', '准确率', '精度', '召回率'])]
    for thr in [0.1 * i for i in range(10)]:
        predicted = [int(score > thr) for score in mean_scores]
        pairs = list(zip(predicted, labels))
        TP = sum(1 for p, t in pairs if p == 1 and p == t)
        TN = sum(1 for p, t in pairs if p == 0 and p == t)
        FP = sum(1 for p, t in pairs if p == 1 and p != t)
        FN = sum(1 for p, t in pairs if p == 0 and p != t)
        acc = float(TP + TN) / len(predicted)
        # Guard the denominators: high thresholds routinely leave no positive
        # predictions, and a label set may contain no positives at all.
        pre = float(TP) / (TP + FP) if (TP + FP) else 0.0
        rec = float(TP) / (TP + FN) if (TP + FN) else 0.0
        report.append('\t'.join(
            ['%0.2f' % thr, '%0.4f' % acc, '%0.4f' % pre, '%0.4f' % rec]))
        print([thr, acc, pre, rec])
    report.append('%0.4f' % auc)
    with open('data/test_result-' + 'all' + '.txt', 'w') as f:
        f.write('\n'.join(report))
def dev_step(x_batch, y_batch, writer=None):
    """Evaluate one dev batch with dropout disabled and report the metrics.

    Fetches the global step, the dev summary op, loss, accuracy and
    predictions through the module-level ``sess``, passes the predictions and
    ``y_batch`` to ``calAUC``, and prints a timestamped summary line.

    Args:
        x_batch: values fed to ``cnn.input_x``.
        y_batch: values fed to ``cnn.input_y``; also handed to ``calAUC``.
        writer: optional summary writer; when truthy, the fetched summaries
            are added to it at the fetched step.
    """
    inputs = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        # A keep probability of 1.0 turns dropout off for evaluation.
        cnn.dropout_keep_prob: 1.0,
    }
    fetches = [
        global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions
    ]
    current_step, dev_summaries, batch_loss, batch_acc, batch_pred = sess.run(
        fetches, inputs)
    batch_auc = calAUC(batch_pred, y_batch)
    stamp = datetime.datetime.now().isoformat()
    report = "{}: step {}, loss {:g}, acc {:g}, auc {:g}".format(
        stamp, current_step, batch_loss, batch_acc, batch_auc)
    print(report)
    if not writer:
        return
    writer.add_summary(dev_summaries, current_step)
def training(path_train, path_test, config_feature, path_ckpt, config_train,
             mode='lr'):
    """Train a logistic-regression style model, checkpointing as it goes.

    Data comes from ``dataSplit(path_train, path_test, config_feature)``. The
    graph is built by ``simple_lr`` or ``simple_lr_dense`` depending on
    ``mode``. Training resumes from the latest checkpoint in ``path_ckpt``
    when one exists; otherwise variables are freshly initialised.

    Every ``config_train.step_saveckpt`` steps the model is saved, the
    training loss of the current batch is computed, one test batch is scored
    with ``calAUC`` and a progress line is printed. Whenever that score beats
    the best seen so far, the files of the latest checkpoint are copied into
    ``path_ckpt + '-backup/'``.

    Args:
        path_train: training data location, forwarded to ``dataSplit``.
        path_test: test data location, forwarded to ``dataSplit``.
        config_feature: feature configuration, forwarded to ``dataSplit``.
        path_ckpt: checkpoint directory; created if missing.
        config_train: object providing ``learning_rate``,
            ``train_batch_size``, ``test_batch_size``, ``epochs`` and
            ``step_saveckpt``. Its ``feature_dim`` attribute is overwritten
            with the feature width of the training data.
        mode: one of ``'lr'``, ``'word'``, ``'lr-dense'``,
            ``'lr-w2v-word-dense'``, ``'lr-w2v'``, ``'lr-w2v-word'`` or
            ``'lr-word'``. The last three warm-start from
            ``lr-ckpt/W.npy`` and ``lr-ckpt/b.npy``.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    # Local imports keep this block self-contained; they are only needed for
    # the checkpoint backup below.
    import glob
    import shutil

    XTrn, XTst, yTrn, yTst = dataSplit(path_train, path_test, config_feature)
    feature_dim = len(XTrn[0])
    config_train.feature_dim = feature_dim

    if mode in ('lr', 'word'):
        model = simple_lr(feature_dim)
    elif mode in ('lr-dense', 'lr-w2v-word-dense'):
        model = simple_lr_dense(config_train)
    elif mode in ('lr-w2v', 'lr-w2v-word', 'lr-word'):
        # Warm start: reuse the saved weights and zero-pad them for the
        # feature columns the saved model did not have.
        W_lr = np.load('lr-ckpt/W.npy')
        b = np.load('lr-ckpt/b.npy')
        W_w2v = np.zeros((feature_dim - W_lr.shape[0], 1))
        W = np.concatenate((W_lr, W_w2v), axis=0)
        model = simple_lr(feature_dim, W=W, b=b)
    else:
        raise ValueError('unsupported training mode: %r' % (mode,))
    (X_holder, y_holder, learning_rate, predict_y, loss, _optimizer,
     train_op, _grads, _accuracy) = model

    global_step = tf.train.get_or_create_global_step()
    # Bump the global step together with every optimisation step.
    train_op = tf.group(train_op, [tf.assign_add(global_step, 1)])
    saver = tf.train.Saver(max_to_keep=10)

    if not os.path.exists(path_ckpt):
        os.makedirs(path_ckpt)
    path_backup = path_ckpt + '-backup/'
    if not os.path.isdir(path_backup):
        # The backup copy needs an existing target directory.
        os.makedirs(path_backup)

    with tf.Session() as session:
        ckpt_file = tf.train.latest_checkpoint(path_ckpt)
        if ckpt_file:
            saver.restore(session, ckpt_file)
            print('restore model from %s' % ckpt_file)
        else:
            session.run(tf.global_variables_initializer())

        learning_rate_ = config_train.learning_rate
        train_batches = iterData(XTrn,
                                 yTrn,
                                 batch_size=config_train.train_batch_size,
                                 epoch=config_train.epochs)
        test_batches = iterData_test(XTst,
                                     yTst,
                                     batch_size=config_train.test_batch_size)
        data = next(train_batches)
        step = 0
        epoch = 0
        print('training begin')
        auc0 = 0
        # The batch iterator yields the string '__STOP__' between epochs and
        # '__RETURN__' once all epochs are exhausted.
        while data != '__RETURN__':
            if data == '__STOP__':
                data = next(train_batches)
                epoch += 1
                continue
            x0, y0 = data
            y0 = np.reshape(np.array(y0), (len(y0), 1))
            train_feed = {
                X_holder: x0,
                y_holder: y0,
                learning_rate: learning_rate_
            }
            if step % config_train.step_saveckpt == 0:
                saver.save(session,
                           os.path.join(path_ckpt, 'model.ckpt'),
                           global_step=global_step)
                loss_ = session.run(loss, feed_dict=train_feed)
                x0_test, y0_test = next(test_batches)
                y0_test = np.reshape(np.array(y0_test), (len(y0_test), 1))
                y_p0 = session.run(predict_y,
                                   feed_dict={
                                       X_holder: x0_test,
                                       y_holder: y0_test,
                                       learning_rate: learning_rate_
                                   })
                y_p = [tmp[0] for tmp in y_p0]
                auc = calAUC(y_p, y0_test)
                if auc > auc0:
                    auc0 = auc
                    latest = tf.train.latest_checkpoint(path_ckpt)
                    if latest:
                        # Copy every file of the checkpoint (index, meta,
                        # data shards) without going through a shell.
                        for src in glob.glob(latest + '.*'):
                            shutil.copy(src, path_backup)
                        print('****ckpt updated****')
                print('epoch:{}-step:{}-auc_test:{}-loss_trn:{}'.format(
                    epoch, step, '%0.3f' % auc, '%0.4f' % loss_))
            session.run(train_op, feed_dict=train_feed)
            step += 1
            data = next(train_batches)
    print('training over!')
def testing(path_test,
            config_feature,
            path_ckpt,
            config_train,
            ckpt_file='',
            mode='lr',
            name=''):
    """Score a labelled test file with a restored checkpoint and write reports.

    Each line of ``path_test`` is split on tabs into text and integer label.
    Features are built with ``getFeature(text, config_feature)``. The graph
    is built with ``simple_lr_dense`` when ``mode`` contains ``'dense'`` and
    with ``simple_lr`` otherwise.

    Two files are written:

    * ``data/test_predict_<mode><name>.txt``: prediction, label and text per
      sample.
    * ``data/test_result-<mode><name>.txt``: accuracy, precision and recall
      for thresholds 0.0-0.9 (step 0.1) and 0.91-0.99 (step 0.01), followed
      by the value returned by ``calAUC``.

    Precision is reported as 0.0 when a threshold produces no positive
    predictions, and recall as 0.0 when there are no positive labels,
    instead of failing with a division by zero.

    Args:
        path_test: path of the tab-separated test file.
        config_feature: feature configuration, forwarded to ``getFeature``.
        path_ckpt: checkpoint directory, used when ``ckpt_file`` is empty.
        config_train: object providing ``learning_rate``; its
            ``feature_dim`` attribute is overwritten with the feature width
            of the test data.
        ckpt_file: explicit checkpoint to restore; when empty or ``None``
            the latest checkpoint in ``path_ckpt`` is used.
        mode: model variant; also part of the output file names.
        name: extra suffix for the output file names.
    """
    with open(path_test, 'r') as f:
        S = f.read().strip().split('\n')
    S = [s.split('\t') for s in S]
    XTst = [getFeature(s[0], config_feature) for s in S]
    yTst = [int(s[1]) for s in S]
    n_pos = sum(yTst)
    print('number of positive/negative samples of testSet is {}/{}'.format(
        n_pos, len(yTst) - n_pos))
    feature_dim = len(XTst[0])
    print('feature dim is %d' % feature_dim)
    config_train.feature_dim = feature_dim

    if 'dense' in mode:
        model = simple_lr_dense(config_train)
    else:
        model = simple_lr(feature_dim)
    (X_holder, y_holder, learning_rate, predict_y, _loss, _optimizer,
     _train_op, _grads, _accuracy) = model

    # Create the global step before the Saver is constructed, matching the
    # order used by training().
    tf.train.get_or_create_global_step()
    saver = tf.train.Saver(max_to_keep=10)

    if not ckpt_file:
        ckpt_file = tf.train.latest_checkpoint(path_ckpt)

    y0_test = np.reshape(np.array(yTst), (len(yTst), 1))
    with tf.Session() as session:
        saver.restore(session, ckpt_file)
        print('restore model from %s' % ckpt_file)
        y_p0 = session.run(predict_y,
                           feed_dict={
                               X_holder: XTst,
                               y_holder: y0_test,
                               learning_rate: config_train.learning_rate
                           })
    y_p = [tmp[0] for tmp in y_p0]
    auc = calAUC(y_p, y0_test)

    predictions = ['\t'.join(['预测值', '实际值', '文本'])]
    for score, row in zip(y_p, S):
        predictions.append('%0.4f' % score + '\t' + row[1] + '\t' + row[0])
    with open('data/test_predict_' + mode + name + '.txt', 'w') as f:
        f.write('\n'.join(predictions))

    thr0 = [0.1 * i for i in range(10)]
    thr0 += [0.91 + 0.01 * i for i in range(9)]
    R = ['\t'.join(['阈值', '准确率', '精度', '召回率'])]
    for thr in thr0:
        predicted = [int(score > thr) for score in y_p]
        pairs = list(zip(predicted, yTst))
        TP = sum(1 for p, t in pairs if p == 1 and p == t)
        TN = sum(1 for p, t in pairs if p == 0 and p == t)
        FP = sum(1 for p, t in pairs if p == 1 and p != t)
        FN = sum(1 for p, t in pairs if p == 0 and p != t)
        acc = float(TP + TN) / len(predicted)
        # Guard the denominators: thresholds close to 1 routinely leave no
        # positive predictions, and a test set may contain no positives.
        pre = float(TP) / (TP + FP) if (TP + FP) else 0.0
        rec = float(TP) / (TP + FN) if (TP + FN) else 0.0
        R.append('\t'.join(
            ['%0.2f' % thr, '%0.4f' % acc, '%0.4f' % pre, '%0.4f' % rec]))
    print('\n'.join(R))
    print('auc=%0.4f' % auc)
    R.append('auc=%0.4f' % auc)
    with open('data/test_result-' + mode + name + '.txt', 'w') as f:
        f.write('\n'.join(R))