def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    global last_f1
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in tqdm(range(n_tr_batches)):
        global_step = sess.run(model.global_step)
        if (global_step + 1) % FLAGS.valid_step == 0:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step + 1)
                print('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        # every 500 steps, write summaries and evaluate one random validation batch
        if (global_step + 1) % 500 == 0:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # pick a random validation batch
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)
def valid_epoch(data_path, sess, model):
    """Test on the valid data."""
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()  # all predicted labels
    marked_labels_list = list()   # ground-truth labels
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
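# These snippets call a score_eval() helper that is not shown here. The function
# below is only a minimal sketch of what such a helper could look like: it treats
# each (predicted label indices, marked labels) pair as sets and reports
# micro-averaged precision, recall and F1. The real metric may differ (it may,
# for example, weight predictions by rank), so this is an assumption rather than
# the original implementation.
def score_eval(predict_label_and_marked_label_list):
    """Micro-averaged precision/recall/F1 over (predicted, marked) label pairs."""
    right_num = 0    # correctly predicted labels
    predict_num = 0  # total predicted labels
    marked_num = 0   # total ground-truth labels
    for predict_labels, marked_labels in predict_label_and_marked_label_list:
        predict_set = set(predict_labels)
        marked_set = set(marked_labels)
        right_num += len(predict_set & marked_set)
        predict_num += len(predict_set)
        marked_num += len(marked_set)
    precision = right_num / predict_num if predict_num else 0.0
    recall = right_num / marked_num if marked_num else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1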
def valid_epoch(data_path, sess, model):
    # ?? The following two lines are probably redundant; they were already written earlier.
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()
    marked_labels_list = list()
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        # convert the labels to one-hot form
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred]
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        _cost, predict_labels = sess.run(fetches, feed_dict)
        # accumulate the loss over the epoch
        _costs += _cost
        # note: predict_labels has shape [batch_size, n_classes]
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
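# get_batch() is another helper these snippets assume but do not define. Based on
# how it is used (os.listdir(data_path) gives the number of batches, and
# get_batch(data_path, batch_id) returns [X1_batch, X2_batch, y_batch]), one
# plausible reading is that every batch is stored as one .npz file named after
# its index. The file-name pattern and array keys below are assumptions for
# illustration only, not the original loader.
import os
import numpy as np

def get_batch(data_path, batch_id):
    """Load one pre-built batch (X1, X2, y) from a per-batch .npz file."""
    batch_file = os.path.join(data_path, '%d.npz' % batch_id)
    data = np.load(batch_file, allow_pickle=True)
    X1_batch = data['X1']
    X2_batch = data['X2']
    y_batch = list(data['y'])  # each element is the list of label ids for one sample
    return [X1_batch, X2_batch, y_batch]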
def valid_epoch():
    """Testing or valid."""
    data_valid._index_in_epoch = 0  # reset the index to the first sample
    _batch_size = te_batch_size
    fetches = [cost, y_pred]
    batch_num = int(valid_data_size / _batch_size)
    start_time = time.time()
    _costs = 0.0
    predict_labels_list = list()  # all predicted labels
    for i in range(batch_num):
        X_batch, y_batch = data_valid.next_batch(_batch_size)
        X1_batch = X_batch[:, :n_step1]
        X2_batch = X_batch[:, n_step1:]
        y_batch = to_categorical(y_batch)
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch, lr: 1e-5,
                     batch_size: _batch_size, keep_prob: 1.0, fc_keep_prob: 1.0, tst: True,
                     n_updates: global_step}
        _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / batch_num
    return mean_cost, precision, recall, f1
def valid_epoch(data_path=data_valid_path):
    """Test on the valid data."""
    _costs = 0.0
    predict_labels_list = list()  # all predicted labels
    marked_labels_list = list()   # ground-truth labels
    _global_step = sess.run(global_step)
    for i in range(n_va_batches):
        [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch)
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        fetches = [merged, cost, y_pred]
        feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch,
                     batch_size: _batch_size, keep_prob: 1.0, tst: True, n_updates: _global_step}
        summary, _cost, predict_labels = sess.run(fetches, feed_dict)
        _costs += _cost
        predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        predict_labels_list.extend(predict_labels)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
    mean_cost = _costs / n_va_batches
    return mean_cost, precision, recall, f1
def predict_test(sess, model):
    """Predict on the test data."""
    time0 = time.time()
    predict_labels_list = list()   # all predicted labels
    predict_score20_list = list()  # top-20 predicted scores
    predict_labels_list2 = list()  # top-5 results
    marked_labels_list = list()
    topic_num = list()
    predict_scores = list()
    for i in tqdm(range(n_te_batches)):
        [X1_batch, X2_batch, y_batch] = get_batch_t(i)
        marked_labels_list.extend(y_batch)  # ground-truth labels (not shifted by -1)
        y_batch = to_categorical(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]  # score of every class
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_labels = softmax(predict_labels)  # shape [batch_size, n_classes]
        predict_scores.append(predict_labels)     # keep the per-class scores
        predict_top5score = list(map(lambda label: np.sort(label, axis=-1)[-1:-6:-1],
                                     predict_labels))    # top-5 scores per sample
        index = list(map(findindex, predict_top5score))  # cut-off position per sample
        predict_toplabels = list()
        for j in range(len(index)):
            if index[j] is None:
                toplabel = predict_labels[j].argsort()[-1:-6:-1]
            elif index[j] == 0:
                toplabel = predict_labels[j].argsort()[-1:-2:-1]
            else:
                toplabel = predict_labels[j].argsort()[-1:-1 * index[j] - 1:-1]
            predict_toplabels.append(toplabel)
        predict_labels_list.extend(predict_toplabels)
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))  # both are shifted by -1, not sure why
    print(predict_label_and_marked_label_list[0:2])
    # e.g. (array([ 15, 327, 307, 478, 10]), [8, 15, 307, 0]); the raw labels were [9, 16, 308, 1]
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)  # compute the score
    print('Local test p=%g, r=%g, f1=%g' % (precision, recall, f1))
    predict_scores = np.vstack(np.asarray(predict_scores))
    print('predict_scores:', predict_scores.shape)
    local_scores_name = local_scores_path + model_name + '_test.npy'
    np.save(local_scores_name, predict_scores)  # save the per-class scores
    print('local_scores.shape=', predict_scores.shape)
    print('Wrote the test scores into %s, time %g s' % (local_scores_name, time.time() - time0))
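# predict_test() and predict_dev() rely on softmax() and findindex() helpers that
# are not part of these snippets. The sketch below is one plausible reading of
# them and is an assumption, not the original code: softmax() normalizes the
# per-class scores row-wise, and findindex() inspects the sorted top-5 scores and
# returns the first position whose score falls below a threshold (None if none
# does). The callers then keep the top-1, top-k or top-5 labels accordingly. The
# `threshold` parameter is hypothetical.
import numpy as np

def softmax(scores):
    """Row-wise softmax over a [batch_size, n_classes] score matrix."""
    scores = scores - np.max(scores, axis=-1, keepdims=True)  # subtract the row max for numerical stability
    exp_scores = np.exp(scores)
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

def findindex(top5_scores, threshold=0.2):
    """Return the first position whose score is below `threshold`, or None if all pass."""
    for pos, score in enumerate(top5_scores):
        if score < threshold:
            return pos
    return None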
def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    global last_f1
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)
    # go through the training batches, with tqdm showing progress
    for batch in tqdm(range(n_tr_batches)):
        # check global_step; run a full validation every FLAGS.valid_step steps
        global_step = sess.run(model.global_step)
        if (global_step + 1) % FLAGS.valid_step == 0:
            valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
            print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
                global_step, valid_cost, precision, recall, f1, time.time() - time0))
            time0 = time.time()
            if f1 > last_f1:
                last_f1 = f1
                saving_path = model.saver.save(sess, model_path, global_step + 1)
                print('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
        # convert the labels to one-hot form
        y_batch = to_categorical(y_batch)
        _batch_size = len(y_batch)
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                     model.batch_size: _batch_size, model.tst: True, model.keep_prob: 0.5}
        summary, _cost, _, _ = sess.run(train_fetches, feed_dict)
        # valid per 500 steps
        if (global_step + 1) % 500 == 0:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)
            [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)
            _batch_size = len(y_batch)
            feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
                         model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
            summary, _cost = sess.run(valid_fetches, feed_dict)
            test_writer.add_summary(summary, global_step)
def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
    global last_score
    global lr
    time0 = time.time()
    batch_indexs = np.random.permutation(n_tr_batches)  # shuffle the training data
    for batch in range(n_tr_batches):
        global_step = sess.run(model.global_step)
        if (global_step + 1) % FLAGS.valid_step == 0:
            valid_cost, score = valid_epoch(data_valid_path, sess, model)
            print('\n')
            print('Global_step=%d: valid cost=%g; score=%g, time=%g s' %
                  (global_step, valid_cost, score, time.time() - time0))
            logging.info('Global_step=%d: valid cost=%g; score=%g, time=%g s' %
                         (global_step, valid_cost, score, time.time() - time0))
            time0 = time.time()
            if score > last_score:
                last_score = score
                saving_path = model.saver.save(sess, model_path, global_step + 1)
                print('\n')
                print('saved new model to %s ' % saving_path)
                logging.info('saved new model to %s ' % saving_path)
        # training
        batch_id = batch_indexs[batch]
        [X_batch, y_batch] = get_batch(data_train_path, batch_id)
        y_batch = to_categorical(y_batch)  # convert the labels to one-hot form
        _batch_size = len(y_batch)
        feed_dict = {model.X_inputs: X_batch, model.y_inputs: y_batch, model.batch_size: _batch_size,
                     model.tst: False, model.keep_prob: FLAGS.keep_prob}
        summary, _cost, _accuracy, _, _ = sess.run(train_fetches, feed_dict)  # the cost is the mean cost of one batch
        time_str = datetime.datetime.now().isoformat()
        # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
        logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
        # valid per 500 steps
        if (global_step + 1) % 500 == 0:
            train_writer.add_summary(summary, global_step)
            batch_id = np.random.randint(0, n_va_batches)  # pick a random validation batch
            [X_batch, y_batch] = get_batch(data_valid_path, batch_id)
            y_batch = to_categorical(y_batch)  # convert the statute labels to one-hot form
            _batch_size = len(y_batch)
            feed_dict = {model.X_inputs: X_batch, model.y_inputs: y_batch, model.batch_size: _batch_size,
                         model.tst: True, model.keep_prob: 1.0}
            summary, _cost, _accuracy = sess.run(valid_fetches, feed_dict)
            time_str = datetime.datetime.now().isoformat()
            # print("valid: {}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
            logging.info("valid: {}: step {}, loss {:g}, acc {:g}".format(time_str, global_step, _cost, _accuracy))
            test_writer.add_summary(summary, global_step)
def valid_epoch(data_path, sess, model):
    """Test on the valid data."""
    va_batches = os.listdir(data_path)
    n_va_batches = len(va_batches)
    _costs = 0.0
    predict_labels_list = list()  # all predicted labels
    marked_labels_list = list()
    # n_va_batches = 10
    for i in range(n_va_batches):
        [X_batch, y_batch1] = get_batch(data_path, i)
        marked_labels_list.extend(y_batch1)
        y_batch = to_categorical(y_batch1)  # convert the statute or charge labels to one-hot form
        _batch_size = len(y_batch)
        fetches = [model.loss, model.y_pred, model.accuracy]
        feed_dict = {model.X_inputs: X_batch, model.y_inputs: y_batch, model.batch_size: _batch_size,
                     model.tst: True, model.keep_prob: 1.0}
        _cost, predict_labels, _accuracy = sess.run(fetches, feed_dict)
        train_batch_predict(predict_labels, y_batch1, batch_path + config.MISSION + "/", i,
                            batch_size=config.BATCH_SIZE)
        # train_batch_predict(predict_labels, y_batch1, batch_path + 'accu/', i, batch_size=config.BATCH_SIZE)
        predict_labelsnew = []
        if config.LAST_LAYER == "sigmoid":
            # keep every class whose sigmoid score exceeds the threshold;
            # fall back to the single best class if none does
            for label in predict_labels:
                xitem = np.argwhere(label > config.SIGMOID_THRESHOLD).flatten()
                if len(xitem) > 0:
                    predict_labelsnew.append(xitem)
                else:
                    predict_labelsnew.append(label.argsort()[-1:-2:-1])
        elif config.LAST_LAYER == "softmax":
            # decide how many of the top classes to keep based on config.SOFTMAX_THRESHHOLD
            for label in predict_labels:
                prob = np.argsort(label)
                pred = [prob[-1]]
                total_prob = config.SOFTMAX_THRESHHOLD - prob[-1]
                for j in range(2, len(prob)):
                    total_prob -= prob[-j]
                    if total_prob > 0:
                        pred.append(prob[j - 1])
                    else:
                        break
                xitem = np.array(pred)
                predict_labelsnew.append(xitem)
        _costs += _cost
        if i == 0:
            logging.info(predict_labelsnew)
        # predict_labels = map(lambda label: label.argsort()[-1:-2:-1], predict_labels)  # index of the single best class
        predict_labels_list.extend(predict_labelsnew)
    predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
    score = get_task_score(predict_label_and_marked_label_list)  # score of the statute prediction task
    mean_cost = _costs / n_va_batches
    return mean_cost, score
valid_cost, precision, recall, f1 = valid_epoch()
print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' %
      (_global_step, valid_cost, precision, recall, f1, time.time() - time0))
time0 = time.time()
if f1 > last_f1:
    last_f1 = f1
    model_num += 1
    save_path = saver.save(sess, model_path, global_step=model_num)
    print('the save path is ', save_path)
batch_id = batch_indexs[batch]
[X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id, n_step1)
y_batch = to_categorical(y_batch)
_batch_size = len(y_batch)
fetches = [merged, cost, train_op, update_op]
feed_dict = {X1_inputs: X1_batch, X2_inputs: X2_batch, y_inputs: y_batch,
             batch_size: _batch_size, keep_prob: 0.5, tst: False, n_updates: _global_step}
summary, _cost, _, _ = sess.run(fetches, feed_dict)  # the cost is the mean cost of one batch
if _global_step % 100 == 0:
    train_writer.add_summary(summary, _global_step)
def predict_dev(sess, model):
    """Predict on the valid data."""
    time0 = time.time()
    predict_labels_list = list()   # all predicted labels
    predict_score20_list = list()  # top-20 predicted scores
    predict_labels_list2 = list()  # top-5 results
    marked_labels_list = list()
    topic_num = list()
    predict_scores = list()
    for i in tqdm(range(n_va_batches)):  # validation set
        [X1_batch, X2_batch, y_batch] = get_batch(i)
        X1_length, X2_length = get_sequence_length(X1_batch, X2_batch)
        marked_labels_list.extend(y_batch)  # ground-truth labels (not shifted by -1)
        y_batch = to_categorical(y_batch)
        _batch_size = len(X1_batch)
        fetches = [model.y_pred]  # score of every class
        feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.batch_size: _batch_size,
                     model.X1_length: X1_length, model.X2_length: X2_length,
                     model.tst: True, model.keep_prob: 1.0}
        predict_labels = sess.run(fetches, feed_dict)[0]
        predict_labels = softmax(predict_labels)  # shape [batch_size, n_classes]
        predict_scores.append(predict_labels)     # keep the per-class scores
        predict_top5score = list(map(lambda label: np.sort(label, axis=-1)[-1:-6:-1],
                                     predict_labels))  # top-5 scores per sample
        # predict_top20score = list(map(lambda label: np.sort(label, axis=-1)[-1:-21:-1], predict_labels))  # top-20 scores per sample
        # predict_score20_list.extend(predict_top20score)
        # e.g. predict_top5score[0] = [0.63514245 0.09193601 0.0417341 0.02742104 0.02721145]
        index = list(map(findindex, predict_top5score))  # cut-off position per sample
        predict_toplabels = list()
        for j in range(len(index)):
            if index[j] is None:
                toplabel = predict_labels[j].argsort()[-1:-6:-1]
            elif index[j] == 0:
                toplabel = predict_labels[j].argsort()[-1:-2:-1]
            else:
                toplabel = predict_labels[j].argsort()[-1:-1 * index[j] - 1:-1]
            predict_toplabels.append(toplabel)
        predict_labels_list.extend(predict_toplabels)
        # predict_top5labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the top-5 scores
        # predict_labels_list2.extend(predict_top5labels)
    # predict_score20_list = DataFrame(predict_score20_list)
    # predict_labels_list2 = DataFrame(predict_labels_list2)
    # predict_score20_list.to_csv('score20list.csv')
    # predict_labels_list2.to_csv('predict_labels_list2.csv')
    # topic_num = map(tolen, marked_labels_list)
    # topic_num = DataFrame(topic_num)
    # topic_num.to_csv('topic_num.csv')
    predict_label_and_marked_label_list = list(zip(predict_labels_list, marked_labels_list))  # both are shifted by -1, not sure why
    print(predict_label_and_marked_label_list[0:2])
    # e.g. (array([ 15, 327, 307, 478, 10]), [8, 15, 307, 0]); the raw labels were [9, 16, 308, 1]
    precision, recall, f1 = score_eval(predict_label_and_marked_label_list)  # compute the score
    print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
    predict_scores = np.vstack(np.asarray(predict_scores))
    print('predict_scores:', predict_scores.shape)
    local_scores_name = local_scores_path + model_name + '_dev.npy'
    np.save(local_scores_name, predict_scores)  # save the per-class scores
    print('local_scores.shape=', predict_scores.shape)
    print('Wrote the dev scores into %s, time %g s' % (local_scores_name, time.time() - time0))
max_features = np.max(list(indice_token.keys())) + 1

# Augmenting x_train, x_dev and x_test with n-gram features
x_train = add_ngram(list(x_train), token_indice, ngram_range)
x_dev = add_ngram(list(x_dev), token_indice, ngram_range)
x_test = add_ngram(list(x_test), token_indice, ngram_range)
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average dev sequence length: {}'.format(np.mean(list(map(len, x_dev)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_dev = sequence.pad_sequences(x_dev, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
testy = y_test
y_train = np.array([to_categorical(s) for s in y_train])
y_dev = np.array([to_categorical(s) for s in y_dev])
y_test = np.array([to_categorical(s) for s in y_test])
print('x_train shape:', x_train.shape)
print('x_dev shape:', x_dev.shape)
print('x_test shape:', x_test.shape)
print(x_train[1])

print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
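# The add_ngram() helper called above is not included in this snippet. The sketch
# below is modeled on the add_ngram() function from the Keras fastText example
# (imdb_fasttext.py) and is an assumption about what the code above expects: for
# every sequence it looks up each n-gram (lengths 2..ngram_range) in token_indice
# and appends the n-gram's index to the end of the sequence. The exact version
# used above may differ in small details.
def add_ngram(sequences, token_indice, ngram_range=2):
    """Augment each integer sequence with the indices of its known n-grams."""
    new_sequences = []
    for input_list in sequences:
        new_list = list(input_list)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences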
# ==================================================
# Load data
print("Loading data...")
(X_train, y_train), (X_test, y_test), WordEm = data_helpers.loadData(path='../corpus/wordseq/mr_new.p')
# Randomly shuffle data
np.random.seed(1933)
max_features = WordEm.shape[0]
embedding_size = WordEm.shape[1]
sequence_length = X_train.shape[1]
print("Vocabulary Size: {:d}".format(max_features))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_test)))
print("Sequence Length: {:d}".format(sequence_length))
train_label = data_helpers.to_categorical(y_train, 2)
test_label = data_helpers.to_categorical(y_test, 2)

# Training
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(sequence_length=sequence_length,
                      num_classes=2,
                      vocab_size=max_features,
                      embedding_size=FLAGS.embedding_dim,
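# data_helpers.to_categorical() is not shown in this snippet. The function below
# is only a sketch of the simplest behaviour consistent with the call above
# (to_categorical(y, 2)): it turns a vector of integer class ids into a one-hot
# matrix, equivalent to keras.utils.to_categorical. This is an assumption; the
# multi-label snippets earlier on this page presumably use a different, multi-hot
# variant that accepts a list of label ids per sample.
import numpy as np

def to_categorical(y, num_classes=None):
    """Convert integer class labels into a one-hot encoded matrix."""
    y = np.asarray(y, dtype=int)
    if num_classes is None:
        num_classes = int(np.max(y)) + 1
    one_hot = np.zeros((len(y), num_classes), dtype=np.float32)
    one_hot[np.arange(len(y)), y] = 1.0
    return one_hot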