def overlap_visualize():
    """Plot how question/answer word overlap relates to the ``flag`` label.

    Loads the "nlpcc" data through ``load``, shuffles the test split, adds
    length and word-overlap feature columns to it, draws a violin plot and two
    distribution plots of ``word_share`` split by ``flag``, and prints what
    ``evaluation.evaluationBypandas`` returns when ``word_share`` itself is
    used as the ranking score.
    """
    train, test, dev = load("nlpcc", filter=True)
    test = test.reindex(np.random.permutation(test.index))
    df = test  # alias, not a copy: the new columns also land on ``test``

    df['qlen'] = df['question'].str.len()
    df['alen'] = df['answer'].str.len()
    df['q_n_words'] = df['question'].apply(lambda text: len(text.split(' ')))
    df['a_n_words'] = df['answer'].apply(lambda text: len(text.split(' ')))

    def normalized_word_share(row):
        """Return shared-token count divided by the two token-set sizes."""
        q_tokens = {w.lower().strip() for w in row['question'].split(" ")}
        a_tokens = {w.lower().strip() for w in row['answer'].split(" ")}
        # str.split(" ") never returns an empty list, so the sum is >= 2.
        return 1.0 * len(q_tokens & a_tokens) / (len(q_tokens) + len(a_tokens))

    df['word_share'] = df.apply(normalized_word_share, axis=1)

    plt.figure(figsize=(12, 8))
    plt.subplot(1, 2, 1)
    sns.violinplot(x='flag', y='word_share', data=df[0:50000])
    plt.subplot(1, 2, 2)
    sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color='green')
    sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color='red')

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(evaluation.evaluationBypandas(test, df['word_share']))
    # The original passed a truthy positional value ('hold'); newer matplotlib
    # only accepts ``block`` by keyword, so spell the same intent explicitly.
    plt.show(block=True)
def random_result():
    """Print the evaluation of purely random scores on the "wiki" test split.

    One standard-normal score is drawn per test row and passed to
    ``evaluation.evaluationBypandas``, which gives a chance-level baseline.
    """
    train, test, dev = load("wiki", filter=True)
    pred = np.random.randn(len(test))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(evaluation.evaluationBypandas(test, pred))
def theano_verion():
    """Train ``Model1`` point-wise on "wiki" and evaluate after every epoch.

    Runs 20 epochs over the batches from ``batch_gen_with_point_wise``,
    printing loss and accuracy per batch. After each epoch the test split is
    scored with ``predict_on_batch`` and the evaluation result is printed.
    The last epoch's predictions are passed to
    ``evaluation.briany_test_file``.

    NOTE(review): the source file had lost its indentation; the nesting
    below (evaluation once per epoch, file written once at the end) is the
    most plausible reading and should be confirmed.
    """
    # Kept function-local as in the original so that importing this module
    # does not require ``model`` to be importable.
    from model import Model1

    train, test = load("wiki", filter=True)
    alphabet, embeddings = prepare([train, test])
    test_input = getQAIndiceofTest(test, alphabet)
    model = Model1(50, 50, 50, len(alphabet.keys()), embeddings)

    for epoch in range(20):
        batches = batch_gen_with_point_wise(train, alphabet, FLAGS.batch_size)
        for x_trainq, x_traina, y_train1 in batches:
            loss, acc = model.train_on_batch([x_trainq, x_traina], y_train1)
            print("loss is %f with acc %f" % (loss, acc))

        predicted = model.predict_on_batch(test_input)
        print(evaluation.evaluationBypandas(test, predicted))

    evaluation.briany_test_file(test, predicted)
    print("\n\n\n\n\n")
def predict():
    """Score the test set with the saved QA CNN estimator and dump it to CSV.

    Steps, as visible in this function:

    * default ``FLAGS.dt_dir`` to yesterday's date (``%Y%m%d``) when empty and
      append it to ``FLAGS.model_dir`` (both flags are mutated in place);
    * load the pickled vocabulary from ``FLAGS.vocab_dir`` and copy its
      embeddings and their count into ``model_params``;
    * build a ``tf.estimator.Estimator`` around ``cnn_model_fn`` with GPUs
      disabled and run ``predict`` over ``FLAGS.test_tf_records``;
    * print the evaluation of random scores followed by the evaluation of
      the model's ``score`` output;
    * write ``data_set.test_set`` to ``pred.txt`` (tab separated, no header,
      no index).

    NOTE(review): ``FLAGS.model_dir`` grows on every call because the date is
    appended to the flag itself; call this once per process.
    """
    logger = logging.getLogger('QA')
    logger.info('load vocab')

    test_file = os.path.join(FLAGS.data_path, 'test.txt')

    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
    FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # vocabulary files produced by a trusted training run.
    with open(os.path.join(FLAGS.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    model_params["vocab_size"] = len(vocab['embeddings'])
    model_params["embeddings"] = vocab["embeddings"]

    data_set = QA_dataset(None, None, test_file, FLAGS)

    session_config = tf.ConfigProto(
        device_count={'GPU': 0, 'CPU': FLAGS.num_threads})
    config = tf.estimator.RunConfig().replace(
        session_config=session_config,
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps)
    QA_CNN = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                    model_dir=FLAGS.model_dir,
                                    params=model_params,
                                    config=config)

    preds = QA_CNN.predict(
        input_fn=lambda: data_set.input_fn(FLAGS.test_tf_records,
                                           num_epochs=1,
                                           batch_size=FLAGS.batch_size),
        predict_keys=["prob", 'score'])
    # Only the 'score' output is evaluated; 'prob' is still requested so the
    # estimator call itself is unchanged.
    score = tuple(item['score'] for item in preds)

    random_pred = np.random.rand(len(data_set.test_set))
    print('random:{}\n'.format(
        evaluation.evaluationBypandas(data_set.test_set, random_pred)))
    print(evaluation.evaluationBypandas(data_set.test_set, score))

    print(data_set.test_set.head())
    # Explicit booleans, matching the other to_csv call in this file.
    data_set.test_set.to_csv('pred.txt', sep='\t', index=False, header=False)
def test_point_wise(ckpt_dir="runs/20181022"):
    """Restore a trained ``IR_quantum`` model and score the test split.

    Rebuilds the graph from ``FLAGS``, restores the latest checkpoint found
    in ``ckpt_dir``, predicts on the test split, stores the last prediction
    column as ``test["score"]``, writes the frame to ``result.txt`` (tab
    separated, no header, no index) and prints the result of
    ``evaluation_test.evaluationBypandas``.

    Args:
        ckpt_dir: directory searched by ``tf.train.get_checkpoint_state``.
            Defaults to the path that used to be hard-coded.

    Raises:
        FileNotFoundError: if ``ckpt_dir`` holds no usable checkpoint.
    """
    train, test = load()
    q_max_sent_length = FLAGS.max_len_query
    d_max_sent_length = FLAGS.max_len_document

    alphabet, embeddings = get_wordDic_Embedding(50)
    print("alphabet", len(alphabet))

    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True

            # Using the session as a context manager releases its resources
            # on exit; the original never closed it.
            with tf.Session(config=session_conf) as sess:
                cnn = IR_quantum(
                    max_input_query=q_max_sent_length,
                    max_input_docu=d_max_sent_length,
                    vocab_size=len(alphabet),
                    embedding_size=FLAGS.embedding_dim,
                    batch_size=FLAGS.batch_size,
                    embeddings=embeddings,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    trainable=FLAGS.trainable,
                    overlap_needed=FLAGS.overlap_needed,
                    pooling=FLAGS.pooling,
                    extend_feature_dim=FLAGS.extend_feature_dim)
                cnn.build_graph()

                saver = tf.train.Saver()
                ckpt = tf.train.get_checkpoint_state(ckpt_dir)
                if not (ckpt and ckpt.model_checkpoint_path):
                    raise FileNotFoundError(
                        "no checkpoint found in {!r}".format(ckpt_dir))
                print(ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)

                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         d_max_sent_length)
                test["score"] = predicted_test[:, -1]
                test.to_csv("result.txt", sep='\t', header=False, index=False)

                map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
                    test, predicted_test[:, -1])
                print("test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                    map_NDCG0_NDCG1_ERR_p_test))
def model_mixed():
    """Evaluate the scores stored in ``Bot_submit`` against ``dbqa.txt``.

    ``dbqa.txt`` is read as a tab-separated file with the columns
    ``flag, question, answer``; ``Bot_submit`` is read as a single ``score``
    column. Both frames are passed to ``evaluation.evaluationBypandas`` and
    the result is printed.

    NOTE(review): this file defines ``model_mixed`` more than once; at import
    time the last definition wins.
    """
    test_file = 'dbqa.txt'
    test = pd.read_csv(test_file,
                       header=None,
                       sep="\t",
                       names=["flag", "question", "answer"],
                       quoting=3)  # 3 == csv.QUOTE_NONE: keep quotes literal
    predicted = pd.read_csv('Bot_submit', names=['score'])
    map_mrr_test = evaluation.evaluationBypandas(test, predicted)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(map_mrr_test)
def model_mixed():
    """Evaluate an external score file against the nlpcc test split.

    ``data/nlpcc/test.txt`` is read as a tab-separated file with the columns
    ``question, answer, flag``; the score file is read as a single ``score``
    column. Both frames are passed to ``evaluation.evaluationBypandas`` and
    the result is printed.

    NOTE(review): this file defines ``model_mixed`` more than once; at import
    time the last definition wins.
    """
    data_dir = "data/" + 'nlpcc'
    test_file = os.path.join(data_dir, "test.txt")
    test = pd.read_csv(test_file,
                       header=None,
                       sep="\t",
                       names=["question", "answer", "flag"],
                       quoting=3)  # 3 == csv.QUOTE_NONE: keep quotes literal
    predicted = pd.read_csv('../QA/train.QApair.TJU_IR_QA.score',
                            names=['score'])
    map_mrr_test = evaluation.evaluationBypandas(test, predicted)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(map_mrr_test)
def englishTest():
    """Fit logistic regression on hand-crafted features of the "wiki" data.

    ``get_features`` is called on the train and test frames; the feature
    names present in both results are used as model inputs (missing values
    filled with 0). The positive-class probability is used as the ranking
    score handed to ``evaluation.evaluationBypandas``.

    NOTE(review): ``get_features`` is presumably expected to add the feature
    columns to the frame it receives, since the columns are read back from
    ``train`` / ``test`` afterwards - confirm against its definition.
    """
    train, test, dev = load("wiki", filter=True)

    # Every print below passes one pre-formatted argument so the output is
    # identical on Python 2 and Python 3.
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    train = train.reset_index()
    print('load Data finished')

    columns1 = get_features(train)
    columns2 = get_features(test)
    common = [item for item in columns2 if item in columns1]
    print(common)
    print('save the small idf_dict')

    x = train[common].fillna(0)
    y = train["flag"]
    test_x = test[common].fillna(0)

    clf = linear_model.LogisticRegression()
    print(x.head())
    clf.fit(x, y)
    print(clf.coef_)

    # Column 1 of predict_proba is the probability of the second class.
    predicted = clf.predict_proba(test_x)[:, 1]
    print(len(predicted))
    print(len(test))
    print(evaluation.evaluationBypandas(test, predicted))
def test_point_wise():
    """Train ``IR_quantum`` with list-wise batches and log metrics per epoch.

    For every epoch the model is trained on the batches produced by
    ``batch_gen_with_list_wise`` (plain gradient descent with
    ``FLAGS.learning_rate``), then evaluated on the train and test splits
    with ``evaluation_test.evaluationBypandas``. Whenever the first test
    metric beats the best value so far (initially 0.020) a checkpoint is
    written under ``runs/<timeDay>/``. Both metric tuples are printed and
    appended to the log file opened at ``precision``.

    NOTE(review): ``precision``, ``timeDay`` and ``tensorboard_log_dir`` are
    not defined in this function and are presumably module-level names.
    NOTE(review): the source had lost its indentation; evaluation is assumed
    to run once per epoch, after the batch loop.
    """
    train, test = load()
    q_max_sent_length = FLAGS.max_len_query
    d_max_sent_length = FLAGS.max_len_document

    alphabet, embeddings = get_wordDic_Embedding(300)
    print("alphabet", len(alphabet))

    with tf.Graph().as_default():
        with tf.device("/gpu:1"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True

            # Both context managers clean up on exit: the session is closed
            # (the original leaked it) and so is the log file, which makes a
            # trailing log.close() unnecessary.
            with tf.Session(config=session_conf) as sess, \
                    open(precision, "w") as log:
                log.write(str(FLAGS.__flags) + '\n')
                cnn = IR_quantum(
                    max_input_query=q_max_sent_length,
                    max_input_docu=d_max_sent_length,
                    vocab_size=len(alphabet),
                    embedding_size=FLAGS.embedding_dim,
                    batch_size=FLAGS.batch_size,
                    embeddings=embeddings,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    trainable=FLAGS.trainable,
                    overlap_needed=FLAGS.overlap_needed,
                    pooling=FLAGS.pooling,
                    extend_feature_dim=FLAGS.extend_feature_dim)
                cnn.build_graph()

                global_step = tf.Variable(0, name='global_step',
                                          trainable=False)
                optimizer = tf.train.GradientDescentOptimizer(
                    FLAGS.learning_rate)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)

                # The writer is only used to dump the graph for TensorBoard,
                # so close it straight away instead of leaking the handle.
                train_writer = tf.summary.FileWriter(
                    tensorboard_log_dir + "/train", sess.graph)
                train_writer.close()

                sess.run(tf.global_variables_initializer())

                map_max = 0.020
                for i in range(FLAGS.num_epochs):
                    print("\nepoch " + str(i) + "\n")
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         d_len=d_max_sent_length)
                    datas = batch_gen_with_list_wise(train,
                                                     alphabet,
                                                     FLAGS.batch_size,
                                                     q_len=q_max_sent_length,
                                                     d_len=d_max_sent_length,
                                                     overlap_dict=d)
                    for j, data in enumerate(datas, 1):
                        feed_dict = {
                            cnn.query: data[0],
                            cnn.document: data[1],
                            cnn.input_label: data[2],
                            cnn.q_overlap: data[3],
                            cnn.d_overlap: data[4]
                        }
                        # Only the loss is reported; train_op still advances
                        # global_step on its own.
                        _, loss = sess.run([train_op, cnn.loss], feed_dict)
                        print("{} loss: {}".format(j, loss))

                    predicted = predict(sess, cnn, train, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        d_max_sent_length)
                    map_NDCG0_NDCG1_ERR_p_train = \
                        evaluation_test.evaluationBypandas(
                            train, predicted[:, -1])

                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             d_max_sent_length)
                    map_NDCG0_NDCG1_ERR_p_test = \
                        evaluation_test.evaluationBypandas(
                            test, predicted_test[:, -1])

                    if map_NDCG0_NDCG1_ERR_p_test[0] > map_max:
                        map_max = map_NDCG0_NDCG1_ERR_p_test[0]
                        timeStamp = time.strftime(
                            "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                        folder = 'runs/' + timeDay
                        out_dir = folder + '/' + timeStamp + '__' + \
                            FLAGS.data + str(map_NDCG0_NDCG1_ERR_p_test[0])
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        save_path = saver.save(sess, out_dir)
                        print("Model saved in file: ", save_path)

                    print("{}:train epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                        i, map_NDCG0_NDCG1_ERR_p_train))
                    print("{}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                        i, map_NDCG0_NDCG1_ERR_p_test))

                    line1 = " {}:epoch: map_train{}".format(
                        i, map_NDCG0_NDCG1_ERR_p_train)
                    log.write(line1 + "\n")
                    line = " {}:epoch: map_test{}".format(
                        i, map_NDCG0_NDCG1_ERR_p_test)
                    log.write(line + "\n")
                    log.write("\n")
                    log.flush()
def test_point_wise():
    """Train ``QA_quantum`` point-wise and log train/test metrics per epoch.

    The longest question and answer (in whitespace tokens) of the training
    split fix the model's input lengths. Each epoch trains with Adam at the
    constant ``FLAGS.learning_rate`` on the batches from
    ``batch_gen_with_point_wise``, dumps the fetched
    ``cnn.embedding_W_complex`` values to
    ``position_embedding_xiangwei_uniform.txt``, evaluates both splits with
    ``evaluation.evaluationBypandas`` and writes the results to the log file
    opened at ``precision``.

    NOTE(review): ``precision`` is not defined here and is presumably a
    module-level name.
    NOTE(review): checkpoint saving was already disabled in the original
    (the ``saver.save`` call was commented out); only the ``runs/<day>``
    folder is created when the test metric improves.
    NOTE(review): the source had lost its indentation; the per-epoch steps
    are assumed to follow the batch loop.
    """
    # Split order used for the wiki data; the original author noted that the
    # trec data uses (train, test, dev) instead.
    train, dev, test = load(FLAGS.data, filter=FLAGS.clean)

    q_max_sent_length = max(len(q) for q in train['question'].str.split())
    a_max_sent_length = max(len(a) for a in train['answer'].str.split())
    print(q_max_sent_length)
    print(a_max_sent_length)
    print(len(train))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))

    alphabet, embeddings, embeddings_complex = prepare(
        [train, test, dev],
        max_sent_length=a_max_sent_length,
        dim=FLAGS.embedding_dim,
        is_embedding_needed=True,
        fresh=True)
    print(embeddings_complex)
    print('alphabet:', len(alphabet))

    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True

            # The context managers close the session (previously leaked) and
            # the log file, so no explicit log.close() is needed.
            with tf.Session(config=session_conf) as sess, \
                    open(precision, "w") as log:
                log.write(str(FLAGS.__flags) + '\n')
                cnn = QA_quantum(
                    max_input_left=q_max_sent_length,
                    max_input_right=a_max_sent_length,
                    vocab_size=len(alphabet),
                    embedding_size=FLAGS.embedding_dim,
                    batch_size=FLAGS.batch_size,
                    embeddings=embeddings,
                    embeddings_complex=embeddings_complex,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    is_Embedding_Needed=True,
                    trainable=FLAGS.trainable,
                    overlap_needed=FLAGS.overlap_needed,
                    position_needed=FLAGS.position_needed,
                    pooling=FLAGS.pooling,
                    hidden_num=FLAGS.hidden_num,
                    extend_feature_dim=FLAGS.extend_feature_dim)
                cnn.build_graph()

                global_step = tf.Variable(0, name="global_step",
                                          trainable=False)
                # The original also built an exponential-decay schedule but
                # never passed it to the optimizer; Adam has always run with
                # the constant rate, so the unused op is dropped.
                optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)

                # Only used to dump the graph; close it right away.
                writer = tf.summary.FileWriter(
                    "logs_NNQLM2_embedding_xiangwei_uniform/", sess.graph)
                writer.close()

                sess.run(tf.global_variables_initializer())

                map_max = 0.65
                timeArray = time.localtime(int(time.time()))
                timeDay = time.strftime("%Y%m%d", timeArray)
                print(time.strftime("%Y%m%d%H%M%S", timeArray))

                for i in range(FLAGS.num_epochs):
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_point_wise(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        overlap_dict=d,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length)
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.input_y: data[2],
                            cnn.q_position: data[3],
                            cnn.a_position: data[4],
                            cnn.overlap: data[5],
                            cnn.q_overlap: data[6],
                            cnn.a_overlap: data[7]
                        }
                        _, step, loss, accuracy, position = sess.run([
                            train_op, global_step, cnn.loss, cnn.accuracy,
                            cnn.embedding_W_complex
                        ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        print("{}: step {}, loss {:g}, acc {:g} ".format(
                            time_str, step, loss, accuracy))

                    timeArray = time.localtime(int(time.time()))
                    timeDay = time.strftime("%Y%m%d", timeArray)
                    print(time.strftime("%Y%m%d%H%M%S", timeArray))

                    # NOTE(review): the (236, 50) shape is hard-coded and must
                    # match the size of cnn.embedding_W_complex - confirm.
                    position_embedding = position.reshape([236, 50])
                    print(position_embedding)
                    np.savetxt("position_embedding_xiangwei_uniform.txt",
                               position_embedding)

                    predicted = predict(sess, cnn, train, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        a_max_sent_length)
                    map_mrr_train = evaluation.evaluationBypandas(
                        train, predicted[:, -1])

                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             a_max_sent_length)
                    print(predicted_test[-2])
                    print(predicted_test[-1])
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted_test[:, -1])

                    if map_mrr_test[0] > map_max:
                        map_max = map_mrr_test[0]
                        folder = 'runs/' + timeDay
                        if not os.path.exists(folder):
                            os.makedirs(folder)

                    print("{}:train epoch:map mrr {}".format(i, map_mrr_train))
                    print("{}:test epoch:map mrr {}".format(i, map_mrr_test))
                    line1 = " {}:epoch: map_train{}".format(i, map_mrr_train)
                    line2 = " {}:epoch: map_test{}".format(i, map_mrr_test)
                    log.write(line1 + '\n' + line2 + '\n')
                    log.flush()
# Build the vocabulary over all three splits and load matching embeddings.
alphabet = data_helper.get_alphabet([train, test, dev])
logger.info('the number of words :%d ' % len(alphabet))
embedding = data_helper.get_embedding(alphabet,
                                      opts['embedding_file'],
                                      embedding_size=opts['embedding_size'])
# Pass the data-dependent settings to the model through ``opts``.
opts["embeddings"] = embedding
opts["vocab_size"] = len(alphabet)
opts["max_input_right"] = a_max_sent_length
opts["max_input_left"] = q_max_sent_length
# ``args.filter_sizes`` is a comma-separated string such as "3,4,5".
opts["filter_sizes"] = list(map(int, args.filter_sizes.split(",")))
# NOTE(review): the source had lost its indentation; evaluation and logging
# are assumed to run once per epoch inside the loop - confirm.
with tf.Graph().as_default():
    model = models.setup(opts)
    model._model_stats()
    for i in range(args.num_epoches):
        # Train for one epoch on freshly generated mini-batches.
        data_gen = data_helper.get_mini_batch(train, alphabet, args.batch_size)
        model.train(data_gen, i)
        # Score the test split and report the three returned metrics.
        test_datas = data_helper.get_mini_batch_test(test, alphabet,
                                                     args.batch_size)
        predicted_test = model.predict(test_datas)
        map_, mrr_, p_1 = evaluation.evaluationBypandas(test, predicted_test)
        logger.info('map:{}--mrr:{}--p@1--{}'.format(map_, mrr_, p_1))
        print('map:{}--mrr:{}--p@1--{}'.format(map_, mrr_, p_1))
def test():
    """Train ``QA_overlap`` on "trec" for 25 epochs with TensorBoard summaries.

    Builds the model with a single filter size of 5, trains it with Adam
    (learning rate 1e-3) on the batches from ``batch_gen_with_point_wise``
    and writes loss, accuracy and gradient summaries under
    ``runs/<timestamp>/summaries``. After each epoch the dev and test splits
    are scored with ``prediction`` and evaluated; the test result is
    appended to ``precision.txt``.

    NOTE(review): ``timestamp`` is not defined in this function and is
    presumably a module-level name.
    NOTE(review): the source had lost its indentation; evaluation is assumed
    to run once per epoch, after the batch loop.
    """
    train, test, dev = load("trec", filter=False)
    q_max_sent_length = max(len(q) for q in train['question'].str.split())
    a_max_sent_length = max(len(a) for a in train['answer'].str.split())

    # Every print passes one pre-formatted argument so the output is
    # identical on Python 2 and Python 3.
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   is_embedding_needed=True)
    print('alphabet: {}'.format(len(alphabet)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)

        # The context managers close the session (previously leaked) and the
        # log file on exit.
        with tf.Session(config=session_conf) as sess, \
                open("precision.txt", "w") as log:
            cnn = QA_overlap(
                max_len_left=q_max_sent_length,
                max_len_right=a_max_sent_length,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                embeddings=embeddings,
                filter_sizes=[5],
                num_filters=FLAGS.num_filters,
                num_hidden=10,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                is_Embedding_Needed=True)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            sess.run(tf.global_variables_initializer())

            # Summaries use the tf.summary.* API, like the rest of this file,
            # instead of the removed tf.*_summary / tf.merge_summary names.
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_summaries.append(
                        tf.summary.histogram(
                            "{}/grad/hist".format(v.name), g))
                    grad_summaries.append(
                        tf.summary.scalar(
                            "{}/grad/sparsity".format(v.name),
                            tf.nn.zero_fraction(g)))
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])

            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(
                dev_summary_dir, sess.graph)

            # Kept from the original: the checkpoint directory is created
            # even though this function never saves a checkpoint.
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            try:
                for i in range(25):
                    batches = batch_gen_with_point_wise(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        overlap=True,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length)
                    for x_left_batch, x_right_batch, y_batch, overlap in batches:
                        feed_dict = {
                            cnn.input_left: x_left_batch,
                            cnn.input_right: x_right_batch,
                            cnn.input_y: y_batch,
                            cnn.overlap: overlap,
                            cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                        }
                        _, step, summaries, loss, accuracy = sess.run([
                            train_op, global_step, train_summary_op,
                            cnn.loss, cnn.accuracy
                        ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        print("{}: step {}, loss {:g}, acc {:g} ".format(
                            time_str, step, loss, accuracy))
                        train_summary_writer.add_summary(summaries, step)

                    predicted = prediction(sess, cnn, test, alphabet,
                                           q_max_sent_length,
                                           a_max_sent_length)
                    predicted_dev = prediction(sess, cnn, dev, alphabet,
                                               q_max_sent_length,
                                               a_max_sent_length)
                    print(np.array(predicted).shape)
                    print(len(predicted))
                    print(len(test))

                    map_mrr_dev = evaluation.evaluationBypandas(
                        dev, predicted_dev[:, -1])
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted[:, -1])
                    print(map_mrr_dev)
                    print(map_mrr_test)

                    line = " {}: epoch: precision {}".format(i, map_mrr_test)
                    log.write(line + '\n')
            finally:
                # Flush and release the event files even if training fails.
                train_summary_writer.close()
                dev_summary_writer.close()
def test_pair_wise(dns=FLAGS.dns):
    """Train ``QA_attentive`` pair-wise and checkpoint on test improvements.

    Args:
        dns: when truthy, a hard-coded checkpoint is restored first and each
            epoch trains on the samples from ``dns_sample``; otherwise the
            batches come from ``batch_gen_with_pair``. Defaults to
            ``FLAGS.dns`` as evaluated when the function is defined.

    Runs 1000 epochs with Adam at ``FLAGS.learning_rate``. After every epoch
    both splits are evaluated with ``evaluation.evaluationBypandas``, the
    first metric of each is appended to the log file opened at
    ``precision``, and a checkpoint is written under ``runs/<timeDay>/``
    whenever the first test metric beats the best so far (initially 0.65).

    NOTE(review): ``precision`` and ``timeDay`` are not defined here and are
    presumably module-level names.
    NOTE(review): the source had lost its indentation; evaluation is assumed
    to run once per epoch, after the batch loop.
    """
    train, test, dev = load(FLAGS.data, filter=True)
    q_max_sent_length = max(len(q) for q in train['question'].str.split())
    a_max_sent_length = max(len(a) for a in train['answer'].str.split())

    # Every print passes one pre-formatted argument so the output is
    # identical on Python 2 and Python 3.
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    print('alphabet: {}'.format(len(alphabet)))

    with tf.Graph().as_default(), tf.device("/gpu:0"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)

        # The context managers close the session (previously leaked) and the
        # log file on exit.
        with tf.Session(config=session_conf) as sess, \
                open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = QA_attentive(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                is_Embedding_Needed=True,
                trainable=FLAGS.trainable)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            sess.run(tf.global_variables_initializer())

            if dns:
                # Warm start from a previously trained model and report where
                # it stands before the DNS epochs begin.
                loadfile = "tmp/20170502223124__0.678083232207.ckpt"
                saver.restore(sess, loadfile)
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)
                print(map_mrr_train)
                print(map_mrr_test)

            map_max = 0.65
            for i in range(1000):
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                else:
                    datas = batch_gen_with_pair(train,
                                                alphabet,
                                                FLAGS.batch_size,
                                                q_len=q_max_sent_length,
                                                a_len=a_max_sent_length)

                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2]
                    }
                    _, step, loss, accuracy, score12, score13 = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.score12, cnn.score13
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(score12), np.mean(score13)))

                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)

                print("{}:epoch:train map mrr {}".format(i, map_mrr_train))
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_train{}----map_test{}".format(
                    i, map_mrr_train[0], map_mrr_test[0])
                log.write(line + '\n')
                log.flush()

                if map_mrr_test[0] > map_max:
                    timeStamp = time.strftime(
                        "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + \
                        FLAGS.data + str(map_mrr_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    # Two spaces reproduce the Python 2 "a ", b print output.
                    print("Model saved in file:  {}".format(save_path))
                    map_max = map_mrr_test[0]
def test_point_wise():
    """Train ``IR_quantum`` list-wise with clipped Adam and log metrics.

    Data and embeddings are selected by ``FLAGS.data`` / ``FLAGS.file_name``.
    Gradients are clipped to a global norm of 5.0 before Adam applies them.
    Each epoch trains on the batches from ``batch_gen_with_list_wise`` (with
    a dropout keep probability of 0.5), evaluates the train and test splits
    with ``evaluation_test.evaluationBypandas``, checkpoints under
    ``runs/<data>/<file_name>/<timeDay>/`` whenever the first test metric
    beats the best so far (initially 0.020), and appends both metric tuples
    to the log file opened at ``precision``.

    NOTE(review): ``precision`` and ``timeDay`` are not defined here and are
    presumably module-level names.
    NOTE(review): the source had lost its indentation; evaluation is assumed
    to run once per epoch, after the batch loop.
    """
    train, test = load(FLAGS.data, FLAGS.file_name)
    q_max_sent_length = FLAGS.max_len_query
    d_max_sent_length = FLAGS.max_len_document
    alphabet, embeddings = get_wordDic_Embedding(FLAGS.data, FLAGS.file_name,
                                                 50)

    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True

            # The context managers close the session (previously leaked) and
            # the log file, so no explicit log.close() is needed.
            with tf.Session(config=session_conf) as sess, \
                    open(precision, "w") as log:
                log.write(str(FLAGS.__flags) + '\n')
                cnn = IR_quantum(
                    max_input_query=q_max_sent_length,
                    max_input_docu=d_max_sent_length,
                    vocab_size=len(alphabet),
                    embedding_size=FLAGS.embedding_dim,
                    batch_size=FLAGS.batch_size,
                    embeddings=embeddings,
                    filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    trainable=FLAGS.trainable,
                    overlap_needed=FLAGS.overlap_needed,
                    pooling=FLAGS.pooling,
                    extend_feature_dim=FLAGS.extend_feature_dim)
                cnn.build_graph()

                global_step = tf.Variable(0, name='global_step',
                                          trainable=False)
                optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate,
                                                   epsilon=1e-08)
                # Clip by global norm to keep single batches from producing
                # oversized updates.
                grads, v = zip(*optimizer.compute_gradients(cnn.loss))
                grads, _ = tf.clip_by_global_norm(grads, 5.0)
                train_op = optimizer.apply_gradients(zip(grads, v),
                                                     global_step=global_step)
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)
                sess.run(tf.global_variables_initializer())

                map_max = 0.020
                for i in range(FLAGS.num_epochs):
                    print("\nepoch " + str(i) + "\n")
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         d_len=d_max_sent_length)
                    datas = batch_gen_with_list_wise(train,
                                                     alphabet,
                                                     FLAGS.batch_size,
                                                     q_len=q_max_sent_length,
                                                     d_len=d_max_sent_length,
                                                     overlap_dict=d)
                    for j, data in enumerate(datas, 1):
                        feed_dict = {
                            cnn.query: data[0],
                            cnn.document: data[1],
                            cnn.input_label: data[2],
                            cnn.q_overlap: data[3],
                            cnn.d_overlap: data[4],
                            cnn.tfidf_value: data[5],
                            cnn.dropout_keep_prob: 0.5
                        }
                        _, l2_loss, loss = sess.run(
                            [train_op, cnn.l2_loss, cnn.loss], feed_dict)
                        print("{} loss: {},l2_loss : {}".format(
                            j, loss, l2_loss))

                    predicted = predict(sess, cnn, train, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        d_max_sent_length)
                    map_NDCG0_NDCG1_ERR_p_train = \
                        evaluation_test.evaluationBypandas(
                            train, predicted[:, -1])
                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             d_max_sent_length)
                    map_NDCG0_NDCG1_ERR_p_test = \
                        evaluation_test.evaluationBypandas(
                            test, predicted_test[:, -1])

                    if map_NDCG0_NDCG1_ERR_p_test[0] > map_max:
                        map_max = map_NDCG0_NDCG1_ERR_p_test[0]
                        timeStamp = time.strftime(
                            "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                        folder = 'runs/' + FLAGS.data + "/" + \
                            FLAGS.file_name + "/" + timeDay
                        out_dir = folder + '/' + timeStamp + '_' + str(
                            map_NDCG0_NDCG1_ERR_p_test[0])
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        save_path = saver.save(sess, out_dir)
                        print("Model saved in file: ", save_path)

                    print("{}:train epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                        i, map_NDCG0_NDCG1_ERR_p_train))
                    print("{}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                        i, map_NDCG0_NDCG1_ERR_p_test))

                    line1 = " {}:epoch: map_train{}".format(
                        i, map_NDCG0_NDCG1_ERR_p_train)
                    log.write(line1 + "\n")
                    line = " {}:epoch: map_test{}".format(
                        i, map_NDCG0_NDCG1_ERR_p_test)
                    log.write(line + "\n")
                    log.write("\n")
                    log.flush()
def main(args):
    """Train ``QA_CNN_extend`` on pair-wise batches, scoring the test set each epoch.

    ``args`` is the parsed flag object. The function prints every flag,
    loads the train/test/dev splits through ``data_helper``, builds the
    model on ``/gpu:<args.gpu>`` and, for ``args.num_epoches`` epochs, runs
    one training pass followed by a test evaluation whose result is printed.
    """
    args._parse_flags()
    print("\nParameters:")
    for flag_name, flag_value in sorted(args.__flags.items()):
        print("{}={}".format(flag_name.upper(), flag_value))

    log_dir = 'log/' + timeDay
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    data_file = log_dir + '/test_' + args.data + timeStamp
    precision = data_file + 'precise'

    print('load data ...........')
    train, test, dev = data_helper.load(args.data, filter=args.clean)

    # Longest question / answer (in whitespace tokens) of the training split.
    q_max_sent_length = max(
        len(tokens) for tokens in train['question'].str.split())
    a_max_sent_length = max(
        len(tokens) for tokens in train['answer'].str.split())

    alphabet = data_helper.get_alphabet([train, test, dev])
    print('the number of words', len(alphabet))

    print('get embedding')
    embedding = (data_helper.get_embedding(alphabet, language="cn")
                 if args.data == "quora" else
                 data_helper.get_embedding(alphabet))

    with tf.Graph().as_default(), tf.device("/gpu:" + str(args.gpu)):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = args.allow_soft_placement
        session_conf.log_device_placement = args.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        filter_sizes = [int(size) for size in args.filter_sizes.split(",")]
        model = QA_CNN_extend(max_input_left=q_max_sent_length,
                              max_input_right=a_max_sent_length,
                              batch_size=args.batch_size,
                              vocab_size=len(alphabet),
                              embedding_size=args.embedding_dim,
                              filter_sizes=filter_sizes,
                              num_filters=args.num_filters,
                              hidden_size=args.hidden_size,
                              dropout_keep_prob=args.dropout_keep_prob,
                              embeddings=embedding,
                              l2_reg_lambda=args.l2_reg_lambda,
                              trainable=args.trainable,
                              pooling=args.pooling,
                              conv=args.conv)
        model.build_graph()
        sess.run(tf.global_variables_initializer())

        def train_step(model, sess, batch):
            """Run one optimizer step per mini-batch and print its metrics."""
            fetches = [
                model.train_op, model.merged, model.global_step, model.loss,
                model.accuracy, model.score12, model.score13, model.see
            ]
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.answer_negative: data[2],
                    model.q_mask: data[3],
                    model.a_mask: data[4],
                    model.a_neg_mask: data[5]
                }
                (_, summary, step, loss, accuracy, score12, score13,
                 see) = sess.run(fetches, feed_dict)
                time_str = datetime.datetime.now().isoformat()
                message = "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}".format(
                    time_str, step, loss, accuracy, np.mean(score12),
                    np.mean(score13))
                print(message)

        def predict(model, sess, batch, test):
            """Score every batch and return one score per row of ``test``."""
            scores = []
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.q_mask: data[2],
                    model.a_mask: data[3]
                }
                scores.extend(sess.run(model.score12, feed_dict))
            # Keep at most len(test) scores; any surplus is discarded.
            return np.array(scores[:len(test)])

        for i in range(args.num_epoches):
            train_batches = data_helper.get_mini_batch(train, alphabet,
                                                       args.batch_size)
            train_step(model, sess, train_batches)

            test_batches = data_helper.get_mini_batch_test(
                test, alphabet, args.batch_size)
            predicted_test = predict(model, sess, test_batches, test)
            print(len(predicted_test))
            print(len(test))

            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)
            print('map_mrr test', map_mrr_test)
def test_pair_wise(dns=FLAGS.dns):
    """Train ``QA_CNN_quantum_extend`` pair-wise and evaluate the best model.

    After every epoch the model is scored on dev and test; whenever the
    first dev metric beats the best value so far (initially 0.65) the
    weights are saved to ``runs/<timeDay>/<timeStamp>/<FLAGS.data>``. When
    training ends, the best checkpoint is restored (if one was written) and
    train/dev/test metrics are printed and logged; per-question AP values
    for the test split are written to ``ap_score_qlm_wiki``.

    Args:
        dns: accepted for interface compatibility; not used by this variant.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    # Randomly permute the test rows before evaluation.
    test = test.reindex(np.random.permutation(test.index))

    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    print('alphabet:', len(alphabet))

    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)

            print("start build model")
            cnn = QA_CNN_quantum_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                pooling=FLAGS.pooling,
                position_needed=FLAGS.position_needed,
                conv=FLAGS.conv,
                margin=FLAGS.margin)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')

            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            map_max = 0.65
            # Tracks whether `out_dir` holds a checkpoint from THIS run, so
            # the final restore cannot fail on a missing file.
            checkpoint_saved = False
            for i in range(FLAGS.num_epochs):
                datas = batch_gen_with_pair(train,
                                            alphabet,
                                            FLAGS.batch_size,
                                            q_len=q_max_sent_length,
                                            a_len=a_max_sent_length,
                                            fresh=FLAGS.fresh,
                                            overlap_dict=None)
                print("load data")
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2],
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, summary, step, loss, accuracy, score12, score13, _ = sess.run(
                        [
                            cnn.train_op, cnn.merged, cnn.global_step,
                            cnn.loss, cnn.accuracy, cnn.score12, cnn.score13,
                            cnn.see
                        ], feed_dict)
                    # NOTE(review): summaries are indexed by epoch `i`, not
                    # by `step`; kept as in the original -- confirm intent.
                    train_writer.add_summary(summary, i)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(score12), np.mean(score13)))

                # Evaluate after every epoch.
                predicted_dev = predict(sess, cnn, dev, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted_dev)
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test)
                precise_test = evaluation.precision(test, predicted_test)
                print("test precise : {}".format(precise_test))
                print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: precise: {}--- map_dev{}-------map_mrr_test{}".format(
                    i, precise_test, map_mrr_dev[0], map_mrr_test)
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    save_path = saver.save(sess, out_dir)
                    checkpoint_saved = True
                    print("Model saved in file: ", save_path)
                log.write(line + '\n')
                log.flush()

            print('train over')
            if checkpoint_saved:
                saver.restore(sess, out_dir)
            else:
                # No epoch beat the initial threshold, so nothing was saved;
                # fall back to the weights from the last epoch.
                print('no checkpoint was saved; evaluating final weights')

            predicted = predict(sess, cnn, train, alphabet, FLAGS.batch_size,
                                q_max_sent_length, a_max_sent_length)
            train['predicted'] = predicted
            map_mrr_train = evaluation.evaluationBypandas(train, predicted)
            predicted_dev = predict(sess, cnn, dev, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
            dev['predicted'] = predicted_dev
            map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)
            predicted_test = predict(sess, cnn, test, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     a_max_sent_length)
            test['predicted'] = predicted_test
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            ap = evaluation.get_ap(test, predicted_test)
            ap.to_csv('ap_score_qlm_wiki', header=None, sep='\t')

            print('map_mrr train', map_mrr_train)
            print('map_mrr dev', map_mrr_dev)
            print('map_mrr test', map_mrr_test)
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')

            # Flush buffered summary events to disk.
            train_writer.close()
            test_writer.close()
def test_pair_wise(dns=FLAGS.dns):
    """Train ``QA_CNN_extend`` pair-wise with overlap features.

    Runs 1000 epochs. After each epoch the model is scored on the test
    split; when the first test metric beats the best value so far (initially
    0.65) a checkpoint is written to
    ``runs/<timeDay>/<timestamp>__<FLAGS.data><map>``. One summary line per
    epoch is written to the file named by the module-level ``precision``.

    Args:
        dns: when true, a hard-coded checkpoint is restored and evaluated
            first, and each epoch trains on negatives drawn by
            ``dns_sample`` instead of the regular overlap batches.

    All output uses ``print(...)`` with one pre-formatted string so the
    text is identical on Python 2 and Python 3.
    """
    train, test, dev = load(FLAGS.data, filter=False)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')

    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    print('alphabet: {}'.format(len(alphabet)))

    with tf.Graph().as_default(), tf.device("/gpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            print("start build model")
            cnn = QA_CNN_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                model_type=FLAGS.CNN_type)

            # Define the training procedure.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)

            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            if dns:
                # Warm-start from a previously trained model and report its
                # quality before dynamic negative sampling begins.
                loadfile = "runs/20170604/20170604183633__nlpcc0.833940715393"
                saver.restore(sess, loadfile)
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(
                    train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)
                print(map_mrr_train)
                print(map_mrr_test)

            map_max = 0.65
            for i in range(1000):
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                else:
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length,
                        fresh=FLAGS.fresh,
                        overlap_dict=d)
                print("load data")

                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2],
                        cnn.q_pos_overlap: data[3],
                        cnn.q_neg_overlap: data[4],
                        cnn.a_pos_overlap: data[5],
                        cnn.a_neg_overlap: data[6]
                    }
                    _, step, loss, accuracy, score12, score13 = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.score12, cnn.score13
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(score12), np.mean(score13)))

                # Evaluate on the test split after every epoch.
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_test{}".format(i, map_mrr_test[0])

                if map_mrr_test[0] > map_max:
                    map_max = map_mrr_test[0]
                    timeStamp = time.strftime(
                        "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_mrr_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    # Two spaces keep the text identical to the original
                    # comma-separated print.
                    print("Model saved in file:  {}".format(save_path))

                log.write(line + '\n')
                log.flush()
# Fragment: the enclosing definition (which provides `opts`, `args`, `train`,
# `test`, `alphabet`, `logger` and the max sentence lengths) is not visible.
# Trains an Attentive_CNN for `args.num_epoches` epochs, scoring the test
# split after every epoch.
with tf.Graph().as_default():
    model = Attentive_CNN(opts)
    model._model_stats()
    for i in range(args.num_epoches):
        # Training batches are shuffled; test batches below are not.
        data_gen = helper.batch_iter(train, args.batch_size, alphabet, shuffle=True, q_len=q_max_sent_length, a_len=a_max_sent_length)
        model.train(data_gen, i)
        test_datas = helper.batch_iter(test, args.batch_size, alphabet, q_len=q_max_sent_length, a_len=a_max_sent_length)
        # Scores are stored on the frame so they can be exported below.
        test['score'] = model.predict(test_datas)
        map_, mrr_ = evaluation.evaluationBypandas(test, test['score'].to_list())
        # Keep only the question groups selected by
        # `evaluation.mrr_metric_filter` and dump them to the tab-separated
        # file 'badcase' (overwritten every epoch).
        df_group = test.groupby('question').filter(
            evaluation.mrr_metric_filter)
        df_group[['question', 'answer', 'flag', 'score']].to_csv('badcase', sep='\t', index=None)
        logger.info('map:{}--mrr:{}'.format(map_, mrr_))
        print('map:{}--mrr:{}'.format(map_, mrr_))
def get_feature():
    """Score the TREC test split with the ``overlap_jiabing`` feature.

    Loads the unfiltered "trec" data, randomly permutes the test rows,
    applies ``overlap_jiabing`` to every row as the prediction and prints
    the result of ``evaluation.evaluationBypandas``.
    """
    # Only the test split is needed here.
    _, test, _ = load("trec", filter=False)
    test = test.reindex(np.random.permutation(test.index))
    test['pred'] = test.apply(overlap_jiabing, axis=1)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(evaluation.evaluationBypandas(test, test['pred']))
# test_datas = data_helper.get_mini_batch_test(dev,alphabet,args.batch_size) # predicted_test = predict(model,sess,test_datas,dev) # map_mrr_test = evaluation.evaluationBypandas(dev,predicted_test) # # logger.info('map_mrr dev' +str(map_mrr_test)) # print('map_mrr dev' +str(map_mrr_test)) # map,mrr,p1 = map_mrr_test # if p1>best_p1: # best_p1=p1 # filename= "checkpoint/"+args.data+"_"+str(p1)+".model" # save_path = saver.save(sess, filename) # # load_path = saver.restore(sess, model_path) # # import shutil # shutil.rmtree("model") # builder = tf.saved_model.builder.SavedModelBuilder("./model") # builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING]) # builder.save(True) # # #======= test_datas = data_helper.get_mini_batch_test(test, alphabet, args.batch_size) predicted_test = predict(model, sess, test_datas, test) map_mrr_test = evaluation.evaluationBypandas(test, predicted_test) logger.info('map_mrr test' + str(map_mrr_test)) print('epoch ' + str(i) + 'map_mrr test' + str(map_mrr_test))
def test_point_wise():
    """Evaluate a saved ``IR_quantum`` checkpoint on the length-5 test set.

    Loads the test data via ``load_test_apply(5)``, rebuilds the model
    graph, restores the latest checkpoint found in ``runs/20181022`` and
    prints the metrics from ``evaluation_test.evaluationBypandas``.

    Variables are initialized *before* the checkpoint is restored. The
    original ran the initializer after ``saver.restore``, which would reset
    the restored weights to their initial values. The optimizer, train op
    and second saver of the original were never run and are omitted.

    Raises:
        FileNotFoundError: if no checkpoint is found in ``runs/20181022``.
    """
    length = 5
    test_length = load_test_apply(length)
    q_max_sent_length = FLAGS.max_len_query
    d_max_sent_length = FLAGS.max_len_document
    alphabet, embeddings = get_wordDic_Embedding(50)
    print("alphabet", len(alphabet))

    with tf.Graph().as_default(), tf.device("/gpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = IR_quantum(
                max_input_query=q_max_sent_length,
                max_input_docu=d_max_sent_length,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                batch_size=FLAGS.batch_size,
                embeddings=embeddings,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                trainable=FLAGS.trainable,
                overlap_needed=FLAGS.overlap_needed,
                pooling=FLAGS.pooling,
                extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()

            ckpt_dir = "runs/20181022"
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                raise FileNotFoundError("no fund saver!!")

            # Initialize first so that the restore below has the last word
            # on every variable stored in the checkpoint.
            sess.run(tf.global_variables_initializer())
            print(ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

            print("start predict test!!")
            predicted_test = predict(sess, cnn, test_length, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     d_max_sent_length)
            print("start evaluation test!!")
            # Only the last column of the prediction matrix is scored.
            map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
                test_length, predicted_test[:, -1])
            print("query length {}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                length, map_NDCG0_NDCG1_ERR_p_test))
def test_point_wise():
    """Train the point-wise ``QA`` model for 30 epochs, checkpointing on dev MAP.

    After every epoch the model is scored on dev and test. When the first
    dev metric beats the best value so far (initially 0.65) a checkpoint is
    saved to ``runs/<timeDay>/<timestamp>__<FLAGS.data><map>``. One summary
    line per epoch goes to the file named by the module-level ``precision``.

    All output uses ``print(...)`` with one pre-formatted string so the
    text is identical on Python 2 and Python 3.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')

    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    print('alphabet: {}'.format(len(alphabet)))

    with tf.Graph().as_default(), tf.device("/gpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        # The ``with`` statement closes ``log``; no explicit close is needed.
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = QA(max_input_left=q_max_sent_length,
                     max_input_right=a_max_sent_length,
                     vocab_size=len(alphabet),
                     embedding_size=FLAGS.embedding_dim,
                     batch_size=FLAGS.batch_size,
                     embeddings=embeddings,
                     dropout_keep_prob=FLAGS.dropout_keep_prob,
                     filter_sizes=list(map(int,
                                           FLAGS.filter_sizes.split(","))),
                     num_filters=FLAGS.num_filters,
                     l2_reg_lambda=FLAGS.l2_reg_lambda,
                     is_Embedding_Needed=True,
                     trainable=FLAGS.trainable,
                     overlap_needed=FLAGS.overlap_needed,
                     position_needed=FLAGS.position_needed,
                     pooling=FLAGS.pooling,
                     extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()

            # Adam with a learning rate decayed by 0.96 every 100 steps.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step, 100, 0.96)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            sess.run(tf.global_variables_initializer())

            map_max = 0.65
            for i in range(30):
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     a_len=a_max_sent_length)
                datas = batch_gen_with_point_wise(train,
                                                  alphabet,
                                                  FLAGS.batch_size,
                                                  overlap_dict=d,
                                                  q_len=q_max_sent_length,
                                                  a_len=a_max_sent_length)
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.input_y: data[2],
                        cnn.q_overlap: data[3],
                        cnn.a_overlap: data[4],
                        cnn.q_position: data[5],
                        cnn.a_position: data[6]
                    }
                    # The fetch list is unchanged; unused results carry a
                    # leading underscore.
                    _, step, loss, accuracy, _pred, _scores, _see = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.predictions, cnn.scores, cnn.see
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}  ".format(
                        time_str, step, loss, accuracy))

                # Only the last column of the prediction matrix is scored.
                predicted = predict(sess, cnn, dev, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted[:, -1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test[:, -1])

                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    timeStamp = time.strftime(
                        "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_mrr_dev[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    # Two spaces keep the text identical to the original
                    # comma-separated print.
                    print("Model saved in file:  {}".format(save_path))

                print("{}:dev epoch:map mrr {}".format(i, map_mrr_dev))
                print("{}:test epoch:map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_dev{}----map_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test[0])
                log.write(line + '\n')
                log.flush()
def test_pair_wise(dns=FLAGS.dns):
    """Train ``QA_RNN_extend`` pair-wise and export scores of the best model.

    After every epoch the model is scored on dev and test; when the first
    dev metric beats the best value so far (initially 0.65) the weights are
    saved to ``runs/<timeDay>/<timeStamp>/<FLAGS.data>``. After training the
    best checkpoint is restored (if one was written), train/dev/test scores
    are written to ``train.QApair.TJU_IR_QA2017*.score`` files, and the
    metrics are printed and logged.

    Args:
        dns: when true, each epoch trains on negatives drawn by
            ``dns_sample``; otherwise on overlap/position batches. The
            original body consulted ``FLAGS.dns`` and silently ignored this
            argument; the default is still ``FLAGS.dns``.

    All output uses ``print(...)`` with one pre-formatted string so the
    text is identical on Python 2 and Python 3.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)

    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    print('alphabet: {}'.format(len(alphabet)))

    def report_step(step, loss, accuracy, score12, score13):
        """Print the per-batch training metrics."""
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
              .format(time_str, step, loss, accuracy, np.mean(score12),
                      np.mean(score13)))

    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)

            print("start build model")
            cnn = QA_RNN_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                pooling=FLAGS.pooling,
                position_needed=FLAGS.position_needed,
                conv=FLAGS.conv)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')

            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            map_max = 0.65
            # Tracks whether `out_dir` holds a checkpoint from THIS run, so
            # the final restore cannot fail on a missing file.
            checkpoint_saved = False
            for i in range(FLAGS.num_epochs):
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                    print('load dns datas')
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2]
                        }
                        _, step, loss, accuracy, score12, score13 = sess.run([
                            cnn.train_op, cnn.global_step, cnn.loss,
                            cnn.accuracy, cnn.score12, cnn.score13
                        ], feed_dict)
                        report_step(step, loss, accuracy, score12, score13)
                else:
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length,
                        fresh=FLAGS.fresh,
                        overlap_dict=d)
                    print("load data")
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2],
                            cnn.q_pos_overlap: data[3],
                            cnn.q_neg_overlap: data[4],
                            cnn.a_pos_overlap: data[5],
                            cnn.a_neg_overlap: data[6],
                            cnn.q_position: data[7],
                            cnn.a_pos_position: data[8],
                            cnn.a_neg_position: data[9]
                        }
                        _, summary, step, loss, accuracy, score12, score13 = sess.run(
                            [
                                cnn.train_op, cnn.merged, cnn.global_step,
                                cnn.loss, cnn.accuracy, cnn.score12,
                                cnn.score13
                            ], feed_dict)
                        # NOTE(review): summaries are indexed by epoch `i`,
                        # not by `step`; kept as in the original.
                        train_writer.add_summary(summary, i)
                        report_step(step, loss, accuracy, score12, score13)

                # Evaluate after every epoch.
                predicted_dev = predict(sess, cnn, dev, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted_dev)
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test)
                print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test)
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    save_path = saver.save(sess, out_dir)
                    checkpoint_saved = True
                    # Two spaces keep the text identical to the original
                    # comma-separated print.
                    print("Model saved in file:  {}".format(save_path))
                log.write(line + '\n')
                log.flush()

            print('train over')
            if checkpoint_saved:
                saver.restore(sess, out_dir)
            else:
                # No epoch beat the initial threshold, so nothing was saved;
                # fall back to the weights from the last epoch.
                print('no checkpoint was saved; evaluating final weights')

            predicted = predict(sess, cnn, train, alphabet, FLAGS.batch_size,
                                q_max_sent_length, a_max_sent_length)
            train['predicted'] = predicted
            train['predicted'].to_csv(
                'train.QApair.TJU_IR_QA2017_train.score',
                index=False,
                sep='\t')
            map_mrr_train = evaluation.evaluationBypandas(train, predicted)

            predicted_dev = predict(sess, cnn, dev, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
            dev['predicted'] = predicted_dev
            dev['predicted'].to_csv('train.QApair.TJU_IR_QA2017_dev.score',
                                    index=False,
                                    sep='\t')
            map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)

            predicted_test = predict(sess, cnn, test, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     a_max_sent_length)
            test['predicted'] = predicted_test
            test['predicted'].to_csv('train.QApair.TJU_IR_QA2017.score',
                                     index=False,
                                     sep='\t')
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            print('map_mrr train {}'.format(map_mrr_train))
            print('map_mrr dev {}'.format(map_mrr_dev))
            print('map_mrr test {}'.format(map_mrr_test))
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')

            # NOTE(review): the result of this call is discarded; kept
            # because `predict` may have side effects -- confirm and remove.
            predict(sess, cnn, train[:100], alphabet, 20, q_max_sent_length,
                    a_max_sent_length)

            # Flush buffered summary events to disk.
            train_writer.close()
            test_writer.close()
def main():
    """Train the attentive QA CNN on the "wiki" data using pairwise batches.

    Loads the train/test/dev splits, builds the alphabet and embeddings from
    all three, constructs ``QA_CNN_Attentive`` and minimizes ``cnn.loss`` with
    Adam for 1000 epochs.  Every training step prints the loss, accuracy and
    the means of the positive score, negative score and question attention.
    The model is scored on the train and test splits with
    ``evaluation.evaluationBypandas`` and the results are printed.

    Takes no arguments and returns nothing; all configuration comes from
    ``FLAGS``.
    """
    train, test, dev = load("wiki", filter=True)

    # Sentence lengths are fixed at 40 tokens.  Earlier code scanned the
    # training set for the longest question/answer and then immediately
    # overwrote the result with 40, so that dead scan has been removed.
    q_max_sent_length = 40
    a_max_sent_length = 40

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print('train length %s' % len(train))
    print('test length %s' % len(test))
    print('dev length %s' % len(dev))

    alphabet, embeddings = prepare([train, test, dev],
                                   is_embedding_needed=True)
    print('alphabet: %s' % len(alphabet))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)

        # Entering the Session installs it as the default session AND closes
        # it on exit; `sess.as_default()` alone would leave it open.
        # "precision.txt" is still opened (and truncated) as before, although
        # nothing in this function writes to it.
        with tf.Session(config=session_conf) as sess, \
                open("precision.txt", "w") as log:
            cnn = QA_CNN_Attentive(max_input_left=q_max_sent_length,
                                   max_input_right=a_max_sent_length,
                                   batch_size=FLAGS.batch_size,
                                   vocab_size=len(alphabet),
                                   embeddings=embeddings,
                                   embedding_size=FLAGS.embedding_dim,
                                   num_filters=FLAGS.num_filters,
                                   dropout_keep_prob=1.0,
                                   l2_reg_lambda=FLAGS.l2_reg_lambda,
                                   is_Embedding_Needed=True,
                                   trainable=True)

            # Define the training procedure.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            # tf.global_variables() replaces the deprecated
            # tf.all_variables().  The saver is not used further below.
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)

            # Initialize all variables.
            sess.run(tf.global_variables_initializer())

            for epoch in range(1000):
                batches = batch_gen_with_pair_whole(train, alphabet,
                                                    FLAGS.batch_size)
                for x_batch_1, x_batch_2, x_batch_3 in batches:
                    feed_dict = {
                        cnn.question: x_batch_1,
                        cnn.answer: x_batch_2,
                        cnn.answer_negative: x_batch_3,
                    }
                    # Only tensors that are actually reported are fetched;
                    # cnn.attention_a and cnn.U were fetched before but
                    # never used.
                    _, step, loss, accuracy, scores1, scores2, a1 = sess.run(
                        [
                            train_op, global_step, cnn.loss, cnn.accuracy,
                            cnn.score12, cnn.score13, cnn.attention_q
                        ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g} positive {:g} negative {:g} mean_pooling {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(scores1), np.mean(scores2),
                                np.mean(a1)))

                # NOTE(review): evaluation is assumed to run once per epoch,
                # after the batch loop -- confirm the intended schedule.
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                print(evaluation.evaluationBypandas(train, predicted))
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                print(evaluation.evaluationBypandas(test, predicted))