def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len):
    """Score ``test`` with ``cnn.score12`` and dump ``attention`` to disk.

    Batches are produced by ``batch_gen_with_single``. For every batch the
    answer, overlap and answer-position arrays are fed to both the positive
    and the negative placeholders of ``cnn`` and ``cnn.score12`` is evaluated.

    NOTE(review): later definitions of ``predict`` in this file rebind the
    name, so this version is only reachable if those are removed.

    Args:
        sess: session used to run ``cnn.score12``.
        cnn: model object exposing the placeholders fed below and ``score12``.
        test: data set to score; only ``len(test)`` scores are returned.
        alphabet: passed through to the overlap/batch helpers.
        batch_size: passed through to ``batch_gen_with_single``.
        q_len: question length passed through to the helpers.
        a_len: answer length passed through to the helpers.

    Returns:
        ``np.array`` with the first ``len(test)`` collected scores (extra
        scores are dropped; presumably padding of the last batch - TODO
        confirm against ``batch_gen_with_single``).
    """
    scores = []
    d = get_overlap_dict(test, alphabet, q_len, a_len)
    for data in batch_gen_with_single(test, alphabet, batch_size, q_len,
                                      a_len, overlap_dict=d):
        feed_dict = {
            cnn.question: data[0],
            cnn.answer: data[1],
            cnn.answer_negative: data[1],
            cnn.q_pos_overlap: data[2],
            cnn.q_neg_overlap: data[2],
            cnn.a_pos_overlap: data[3],
            cnn.a_neg_overlap: data[3],
            cnn.q_position: data[4],
            cnn.a_pos_position: data[5],
            cnn.a_neg_position: data[5]
        }
        scores.extend(sess.run(cnn.score12, feed_dict))

    # NOTE(review): `attention` is never assigned inside this function (the
    # code that would have filled it is gone), so this raises NameError unless
    # a module-level `attention` exists - confirm before relying on the dump.
    # The name is resolved first so a failure does not truncate the file.
    payload = attention
    # Use a context manager so the handle is flushed and closed deterministically.
    with open('attention.file', 'w') as attention_file:
        pickle.dump(payload, attention_file)
    return np.array(scores[:len(test)])
def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len, step, type):
    """Score ``test`` with ``cnn.score12`` and write all scores to a text file.

    Every collected score (one per line) is written to
    ``data_file + '_' + type + '_score_<step>.txt'``; the returned array only
    keeps the first ``len(test)`` of them.

    Args:
        sess: session used to run ``cnn.score12``.
        cnn: model object exposing the placeholders fed below and ``score12``.
        test: data set to score.
        alphabet: passed through to the overlap/batch helpers.
        batch_size: passed through to ``batch_gen_with_single``.
        q_len: question length passed through to the helpers.
        a_len: answer length passed through to the helpers.
        step: integer formatted into the output file name.
        type: string inserted into the output file name.

    Returns:
        ``np.array`` with the first ``len(test)`` collected scores.
    """

    def build_feed(batch):
        # The same answer-side arrays feed both the positive and the
        # negative placeholders.
        answer, q_overlap, a_overlap, a_position = (
            batch[1], batch[2], batch[3], batch[5])
        return {
            cnn.question: batch[0],
            cnn.answer: answer,
            cnn.answer_negative: answer,
            cnn.q_pos_overlap: q_overlap,
            cnn.q_neg_overlap: q_overlap,
            cnn.a_pos_overlap: a_overlap,
            cnn.a_neg_overlap: a_overlap,
            cnn.q_position: batch[4],
            cnn.a_pos_position: a_position,
            cnn.a_neg_position: a_position
        }

    overlap_lookup = get_overlap_dict(test, alphabet, q_len, a_len)
    batches = batch_gen_with_single(test, alphabet, batch_size, q_len, a_len,
                                    overlap_dict=overlap_lookup)
    collected = []
    for batch in batches:
        collected.extend(sess.run(cnn.score12, build_feed(batch)))

    out_path = data_file + '_' + type + ("_score_%d.txt" % step)
    with open(out_path, 'w') as ff:
        ff.write('\n'.join(map(str, collected)))
    return np.array(collected[:len(test)])
def predict(sess, cnn, test, alphabet, batch_size, q_len, a_len):
    """Score ``test`` by evaluating ``cnn.scores`` batch by batch.

    Args:
        sess: session used to run ``cnn.scores``.
        cnn: model object exposing ``question``, ``answer``, ``q_overlap``,
            ``a_overlap``, ``q_position``, ``a_position`` and ``scores``.
        test: data set to score.
        alphabet: passed through to the overlap/batch helpers.
        batch_size: passed through to ``batch_gen_with_single``.
        q_len: question length passed through to the helpers.
        a_len: answer length passed through to the helpers.

    Returns:
        ``np.array`` with the first ``len(test)`` collected scores.
    """
    overlap_lookup = get_overlap_dict(test, alphabet, q_len, a_len)
    batches = batch_gen_with_single(test, alphabet, batch_size, q_len, a_len,
                                    overlap_dict=overlap_lookup)
    collected = []
    for batch in batches:
        feed = {
            cnn.question: batch[0],
            cnn.answer: batch[1],
            cnn.q_overlap: batch[2],
            cnn.a_overlap: batch[3],
            cnn.q_position: batch[4],
            cnn.a_position: batch[5]
        }
        batch_scores = sess.run(cnn.scores, feed)
        collected.extend(batch_scores)
    kept = collected[:len(test)]
    return np.array(kept)
def test_pair_wise(dns=FLAGS.dns):
    """Train the pair-wise ``QA_RNN_extend`` model and evaluate it.

    Loads the ``FLAGS.data`` splits, builds the alphabet/embeddings, trains
    for ``FLAGS.num_epochs`` epochs, evaluates dev and test after every epoch
    with ``evaluation.evaluationBypandas``, and checkpoints to ``out_dir``
    whenever the first dev metric beats the best value so far (initial
    threshold 0.65). After training, the best checkpoint (if any) is restored,
    train/dev/test are scored, the scores are written to
    ``train.QApair.TJU_IR_QA2017*.score`` files and the metrics are appended
    to the ``precision`` log file.

    Args:
        dns: when truthy, each epoch's batches come from ``dns_sample`` and
            ``batch_gen_with_pair_dns``; otherwise from
            ``batch_gen_with_pair_overlap``. Defaults to ``FLAGS.dns``.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)

    # Longest question / answer of the training split, counted in
    # whitespace-separated tokens.
    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    print('alphabet: {}'.format(len(alphabet)))

    step_report = ("{}: step {}, loss {:g}, acc {:g} ,"
                   "positive {:g},negative {:g}")

    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)

            print("start build model")
            cnn = QA_RNN_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                pooling=FLAGS.pooling,
                position_needed=FLAGS.position_needed,
                conv=FLAGS.conv)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')
            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            def run_predict(frame):
                # Score one split with the current weights.
                return predict(sess, cnn, frame, alphabet, FLAGS.batch_size,
                               q_max_sent_length, a_max_sent_length)

            def score_and_dump(frame, path):
                # Score a split, store the scores as a tab-separated file
                # and return its evaluation metrics.
                predicted = run_predict(frame)
                frame['predicted'] = predicted
                frame['predicted'].to_csv(path, index=False, sep='\t')
                return evaluation.evaluationBypandas(frame, predicted)

            map_max = 0.65
            # Tracks whether out_dir actually holds a checkpoint from this
            # run, so the restore below cannot fail on a missing file.
            checkpoint_saved = False
            for i in range(FLAGS.num_epochs):
                # Honour the `dns` argument (the flag is only its default).
                if dns:
                    samples = dns_sample(train, alphabet, q_max_sent_length,
                                         a_max_sent_length, sess, cnn,
                                         FLAGS.batch_size, neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                    print('load dns datas')
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2]
                        }
                        _, step, loss, accuracy, score12, score13 = sess.run(
                            [cnn.train_op, cnn.global_step, cnn.loss,
                             cnn.accuracy, cnn.score12, cnn.score13],
                            feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        print(step_report.format(
                            time_str, step, loss, accuracy,
                            np.mean(score12), np.mean(score13)))
                else:
                    d = get_overlap_dict(train, alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train, alphabet, FLAGS.batch_size,
                        q_len=q_max_sent_length, a_len=a_max_sent_length,
                        fresh=FLAGS.fresh, overlap_dict=d)
                    print("load data")
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2],
                            cnn.q_pos_overlap: data[3],
                            cnn.q_neg_overlap: data[4],
                            cnn.a_pos_overlap: data[5],
                            cnn.a_neg_overlap: data[6],
                            cnn.q_position: data[7],
                            cnn.a_pos_position: data[8],
                            cnn.a_neg_position: data[9]
                        }
                        (_, summary, step, loss, accuracy,
                         score12, score13) = sess.run(
                            [cnn.train_op, cnn.merged, cnn.global_step,
                             cnn.loss, cnn.accuracy, cnn.score12,
                             cnn.score13],
                            feed_dict)
                        # Index summaries by the global step; using the epoch
                        # index would stack every batch of an epoch on the
                        # same x position.
                        train_writer.add_summary(summary, step)
                        time_str = datetime.datetime.now().isoformat()
                        print(step_report.format(
                            time_str, step, loss, accuracy,
                            np.mean(score12), np.mean(score13)))

                # Evaluate on dev and test after every epoch.
                predicted_dev = run_predict(dev)
                map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)
                predicted_test = run_predict(test)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test)
                print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test)
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    save_path = saver.save(sess, out_dir)
                    checkpoint_saved = True
                    print("Model saved in file:  {}".format(save_path))
                log.write(line + '\n')
                log.flush()

            # Flush pending summary events to disk.
            train_writer.close()
            test_writer.close()

            print('train over')
            if checkpoint_saved:
                saver.restore(sess, out_dir)
            else:
                print('no checkpoint was saved (dev score never exceeded '
                      '{}); evaluating the final weights'.format(map_max))

            map_mrr_train = score_and_dump(
                train, 'train.QApair.TJU_IR_QA2017_train.score')
            map_mrr_dev = score_and_dump(
                dev, 'train.QApair.TJU_IR_QA2017_dev.score')
            map_mrr_test = score_and_dump(
                test, 'train.QApair.TJU_IR_QA2017.score')

            print('map_mrr train {}'.format(map_mrr_train))
            print('map_mrr dev {}'.format(map_mrr_dev))
            print('map_mrr test {}'.format(map_mrr_test))
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')
            # NOTE(review): the result of this call is discarded; it is kept
            # only in case `predict` has side effects callers rely on.
            predict(sess, cnn, train[:100], alphabet, 20, q_max_sent_length,
                    a_max_sent_length)
def test_point_wise():
    """Train the point-wise ``QA`` model and evaluate it after every epoch.

    Loads the ``FLAGS.data`` splits (missing values replaced by ``''``),
    builds the alphabet/embeddings, and trains for 30 epochs with Adam on an
    exponentially decayed learning rate (start 0.001, factor 0.96 with a
    decay-steps argument of 100). After each epoch dev and test are scored
    with ``predict`` and evaluated on the last column of the predictions; a
    checkpoint is written under ``runs/<timeDay>/`` whenever the first dev
    metric beats the best value so far (initial threshold 0.65). One summary
    line per epoch is appended to the ``precision`` log file.
    """
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')

    # Longest question / answer of the training split, counted in
    # whitespace-separated tokens.
    q_max_sent_length = max(map(len, train['question'].str.split()))
    a_max_sent_length = max(map(len, train['answer'].str.split()))

    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length {}'.format(len(train)))
    print('test length {}'.format(len(test)))
    print('dev length {}'.format(len(dev)))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    print('alphabet: {}'.format(len(alphabet)))

    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            # The `with` statement closes `log`; no explicit close is needed.
            with sess.as_default(), open(precision, "w") as log:
                log.write(str(FLAGS.__flags) + '\n')
                cnn = QA(
                    max_input_left=q_max_sent_length,
                    max_input_right=a_max_sent_length,
                    vocab_size=len(alphabet),
                    embedding_size=FLAGS.embedding_dim,
                    batch_size=FLAGS.batch_size,
                    embeddings=embeddings,
                    dropout_keep_prob=FLAGS.dropout_keep_prob,
                    filter_sizes=list(
                        map(int, FLAGS.filter_sizes.split(","))),
                    num_filters=FLAGS.num_filters,
                    l2_reg_lambda=FLAGS.l2_reg_lambda,
                    is_Embedding_Needed=True,
                    trainable=FLAGS.trainable,
                    overlap_needed=FLAGS.overlap_needed,
                    position_needed=FLAGS.position_needed,
                    pooling=FLAGS.pooling,
                    extend_feature_dim=FLAGS.extend_feature_dim)
                cnn.build_graph()

                # Training procedure: Adam on a decayed learning rate.
                global_step = tf.Variable(0, name="global_step",
                                          trainable=False)
                starter_learning_rate = 0.001
                learning_rate = tf.train.exponential_decay(
                    starter_learning_rate, global_step, 100, 0.96)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(
                    grads_and_vars, global_step=global_step)
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
                sess.run(tf.global_variables_initializer())

                def run_predict(frame):
                    # Score one split with the current weights.
                    return predict(sess, cnn, frame, alphabet,
                                   FLAGS.batch_size, q_max_sent_length,
                                   a_max_sent_length)

                map_max = 0.65
                for i in range(30):
                    d = get_overlap_dict(train, alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_point_wise(
                        train, alphabet, FLAGS.batch_size, overlap_dict=d,
                        q_len=q_max_sent_length, a_len=a_max_sent_length)
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.input_y: data[2],
                            cnn.q_overlap: data[3],
                            cnn.a_overlap: data[4],
                            cnn.q_position: data[5],
                            cnn.a_position: data[6]
                        }
                        # The fetch list is unchanged; only step, loss and
                        # accuracy are used afterwards.
                        fetched = sess.run([
                            train_op, global_step, cnn.loss, cnn.accuracy,
                            cnn.predictions, cnn.scores, cnn.see
                        ], feed_dict)
                        step, loss, accuracy = fetched[1:4]
                        time_str = datetime.datetime.now().isoformat()
                        print("{}: step {}, loss {:g}, acc {:g} ".format(
                            time_str, step, loss, accuracy))

                    # Evaluate on the last column of the predictions.
                    predicted = run_predict(dev)
                    map_mrr_dev = evaluation.evaluationBypandas(
                        dev, predicted[:, -1])
                    predicted_test = run_predict(test)
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted_test[:, -1])

                    if map_mrr_dev[0] > map_max:
                        map_max = map_mrr_dev[0]
                        timeStamp = time.strftime(
                            "%Y%m%d%H%M%S",
                            time.localtime(int(time.time())))
                        folder = 'runs/' + timeDay
                        out_dir = (folder + '/' + timeStamp + '__' +
                                   FLAGS.data + str(map_mrr_dev[0]))
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        save_path = saver.save(sess, out_dir)
                        print("Model saved in file:  {}".format(save_path))

                    print("{}:dev epoch:map mrr {}".format(i, map_mrr_dev))
                    print("{}:test epoch:map mrr {}".format(i, map_mrr_test))
                    line = " {}:epoch: map_dev{}----map_test{}".format(
                        i, map_mrr_dev[0], map_mrr_test[0])
                    log.write(line + '\n')
                    log.flush()