Example #1
def overlap_visualize():
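    """Inspect word-overlap statistics on the nlpcc test split: compute length,
    word-count, and normalized word-share features, plot their distributions per
    flag, and score the word_share feature with evaluation.evaluationBypandas."""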
    train, test, dev = load("nlpcc", filter=True)
    test = test.reindex(np.random.permutation(test.index))
    df = test
    df['qlen'] = df['question'].str.len()
    df['alen'] = df['answer'].str.len()

    df['q_n_words'] = df['question'].apply(lambda row: len(row.split(' ')))
    df['a_n_words'] = df['answer'].apply(lambda row: len(row.split(' ')))

    def normalized_word_share(row):
        w1 = set(
            map(lambda word: word.lower().strip(), row['question'].split(" ")))
        w2 = set(
            map(lambda word: word.lower().strip(), row['answer'].split(" ")))
        return 1.0 * len(w1 & w2) / (len(w1) + len(w2))

    df['word_share'] = df.apply(normalized_word_share, axis=1)

    plt.figure(figsize=(12, 8))
    plt.subplot(1, 2, 1)
    sns.violinplot(x='flag', y='word_share', data=df[0:50000])
    plt.subplot(1, 2, 2)
    sns.distplot(df[df['flag'] == 1.0]['word_share'][0:10000], color='green')
    sns.distplot(df[df['flag'] == 0.0]['word_share'][0:10000], color='red')

    print(evaluation.evaluationBypandas(test, df['word_share']))
    plt.show()
Example #2
def random_result():
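    """Random baseline: score Gaussian noise as predictions on the wiki test split."""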
    train, test, dev = load("wiki", filter=True)
    # test = test.reindex(np.random.permutation(test.index))

    # test['pred'] = test.apply(idf_word_overlap,axis = 1)
    pred = np.random.randn(len(test))

    print(evaluation.evaluationBypandas(test, pred))
Example #3
def theano_verion():
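    """Train Model1 point-wise for 20 epochs, evaluating on the test split after each epoch."""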

    train, test = load("wiki", filter=True)
    alphabet, embeddings = prepare([train, test])
    test_input = getQAIndiceofTest(test, alphabet)
    from model import Model1
    model = Model1(50, 50, 50, len(alphabet.keys()), embeddings)
    #pdb.set_trace()
    #print((model.predict([q_train, a_train])))
    # start training
    for epoch in range(20):

        for x_trainq, x_traina, y_train1 in batch_gen_with_point_wise(
                train, alphabet, FLAGS.batch_size):
            loss, acc = model.train_on_batch([x_trainq, x_traina], y_train1)
            perf = str(loss) + " " + str(acc)
            print("loss is %f with acc %f" % (loss, acc))

        #y_train1 = y_train1.reshape(y_train1.shape[0],1)
        #x = model.predict([x_trainq, x_traina])
        predicted = model.predict_on_batch(test_input)
        print(evaluation.evaluationBypandas(test, predicted))

        evaluation.briany_test_file(test, predicted)
        print("\n\n\n\n\n")
Example #4
def predict():
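    """Restore a CNN ranking model via tf.estimator, compare its test-set scores
    against a random baseline, and write the predictions to pred.txt."""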

    logger = logging.getLogger('QA')
    logger.info('load vocab')

    data_path = FLAGS.data_path
    train_file = os.path.join(data_path, 'train.txt')
    test_file = os.path.join(data_path, 'test.txt')

    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
        FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir
    with open(os.path.join(FLAGS.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    model_params["vocab_size"] = len(vocab['embeddings'])
    model_params["embeddings"] = vocab["embeddings"]
    data_set = QA_dataset(None, None, test_file, FLAGS)

    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={
            'GPU': 0,
            'CPU': FLAGS.num_threads
        }),
        log_step_count_steps=FLAGS.log_steps,
        save_summary_steps=FLAGS.log_steps)
    QA_CNN = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                    model_dir=FLAGS.model_dir,
                                    params=model_params,
                                    config=config)

    preds = QA_CNN.predict(input_fn=lambda: data_set.input_fn(
        FLAGS.test_tf_records, num_epochs=1, batch_size=FLAGS.batch_size),
                           predict_keys=["prob", 'score'])
    # list_pred = list(map(lambda x:x['prob'],preds))
    a = list(map(lambda x: (x['prob'], x['score']), preds))
    list_pred, score = zip(*a)

    random_pred = np.random.rand(len(data_set.test_set))
    print('random:{}\n'.format(
        evaluation.evaluationBypandas(data_set.test_set, random_pred)))

    print(evaluation.evaluationBypandas(data_set.test_set, score))

    # data_set.test_set['pred'] = list_pred
    print(data_set.test_set.head())
    data_set.test_set.to_csv('pred.txt', sep='\t', index=False, header=False)
Example #5
def test_point_wise():
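    """Restore an IR_quantum model from the runs/20181022 checkpoint and evaluate it
    on the test split (MAP, NDCG@0, NDCG@1, ERR, P), writing scores to result.txt."""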
    # creat_train_test("2")
    train, test = load()
    # train,test,dev = load(FLAGS.data,filter = FLAGS.clean)
    # print ()
    # q_max_sent_length = 4
    q_max_sent_length = FLAGS.max_len_query
    # d_max_sent_length = 21
    d_max_sent_length = FLAGS.max_len_document

    # alphabet,embeddings = prepare([train,test,dev],dim = FLAGS.embedding_dim,is_embedding_needed = True,fresh = True)
    # alphabet,embeddings = get_wordDic_Embedding(300)
    alphabet, embeddings = get_wordDic_Embedding(50)
    print("alphabet", len(alphabet))
    # exit()
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        # with sess.as_default(),open(precision,"w") as log:
        # log.write(str(FLAGS.__flags)+'\n')
        cnn = IR_quantum(max_input_query=q_max_sent_length,
                         max_input_docu=d_max_sent_length,
                         vocab_size=len(alphabet),
                         embedding_size=FLAGS.embedding_dim,
                         batch_size=FLAGS.batch_size,
                         embeddings=embeddings,
                         filter_sizes=list(
                             map(int, FLAGS.filter_sizes.split(","))),
                         num_filters=FLAGS.num_filters,
                         l2_reg_lambda=FLAGS.l2_reg_lambda,
                         trainable=FLAGS.trainable,
                         overlap_needed=FLAGS.overlap_needed,
                         pooling=FLAGS.pooling,
                         extend_feature_dim=FLAGS.extend_feature_dim)
        cnn.build_graph()
        ckpt_dir = "runs/20181022"
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print(ckpt.model_checkpoint_path)
            saver.restore(sess,
                          ckpt.model_checkpoint_path)  # restore all variables
        else:
            raise FileNotFoundError("no saved checkpoint found!")

        predicted_test = predict(sess, cnn, test, alphabet, FLAGS.batch_size,
                                 q_max_sent_length, d_max_sent_length)
        test["score"] = predicted_test[:, -1]
        # test_score["query_ID","document","flag","score"] = test["query_ID","document","flag","score"]
        # f = "result.txt"
        test.to_csv("result.txt", sep='\t', header=False, index=False)
        map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
            test, predicted_test[:, -1])
        print("test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
            map_NDCG0_NDCG1_ERR_p_test))
Example #6
def model_mixed():
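    """Evaluate precomputed scores from the 'Bot_submit' file against the dbqa.txt test set."""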
    test_file = 'dbqa.txt'
    test = pd.read_csv(test_file,
                       header=None,
                       sep="\t",
                       names=["flag", "question", "answer"],
                       quoting=3)
    predicted = pd.read_csv('Bot_submit', names=['score'])
    map_mrr_test = evaluation.evaluationBypandas(test, predicted)
    print(map_mrr_test)
Example #7
def model_mixed():
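    """Evaluate a precomputed score file against the nlpcc test set."""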
    data_dir = "data/" + 'nlpcc'
    test_file = os.path.join(data_dir, "test.txt")
    test = pd.read_csv(test_file,
                       header=None,
                       sep="\t",
                       names=["question", "answer", "flag"],
                       quoting=3)
    predicted = pd.read_csv('../QA/train.QApair.TJU_IR_QA.score',
                            names=['score'])
    map_mrr_test = evaluation.evaluationBypandas(test, predicted)
    print(map_mrr_test)
Example #8
def englishTest():
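    """Feature-based baseline: fit a logistic regression on the handcrafted features
    shared by train and test, then evaluate its probabilities on the test split."""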
    train, test, dev = load("wiki", filter=True)
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    # test = test.reindex(np.random.permutation(test.index))
    train = train.reset_index()
    # test = test.reset_index()
    print('load Data finished')
    columns1 = get_features(train)
    columns2 = get_features(test)
    common = [item for item in columns2 if item in columns1]
    print(common)
    # common = ['align', 'align_reverse', 'features_similarity', 'features_similarity_reverse']
    print('save the small idf_dict')
    # pickle.dump(small_idf_dict,open('data/small_idf_dict','w'))
    x = train[common].fillna(0)
    y = train["flag"]
    test_x = test[common].fillna(0)
    # clf = linear_model.LinearRegression()
    clf = linear_model.LogisticRegression()
    # clf = svm.SVR()
    print(x.head())
    # clf = GradientBoostingRegressor()
    # clf = tree.DecisionTreeRegressor()
    # clf = svm.SVR()
    clf.fit(x, y)
    print(clf.coef_)
    # predicted = clf.predict(test_x)
    predicted = clf.predict_proba(test_x)
    predicted = predicted[:, 1]
    print(len(predicted))
    print(len(test))
    print(evaluation.evaluationBypandas(test, predicted))
Example #9
def test_point_wise():
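    """Train IR_quantum list-wise with plain gradient descent, evaluating train and
    test each epoch and checkpointing whenever test MAP improves."""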
    # creat_train_test("2")
    train, test = load()
    # train,test,dev = load(FLAGS.data,filter = FLAGS.clean)
    # print ()
    # q_max_sent_length = 4
    q_max_sent_length = FLAGS.max_len_query
    # d_max_sent_length = 21
    d_max_sent_length = FLAGS.max_len_document

    # alphabet,embeddings = prepare([train,test,dev],dim = FLAGS.embedding_dim,is_embedding_needed = True,fresh = True)
    # alphabet,embeddings = get_wordDic_Embedding(300)
    alphabet, embeddings = get_wordDic_Embedding(300)
    print("alphabet", len(alphabet))
    # exit()
    with tf.Graph().as_default():
        with tf.device("/gpu:1"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = IR_quantum(max_input_query=q_max_sent_length,
                             max_input_docu=d_max_sent_length,
                             vocab_size=len(alphabet),
                             embedding_size=FLAGS.embedding_dim,
                             batch_size=FLAGS.batch_size,
                             embeddings=embeddings,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             trainable=FLAGS.trainable,
                             overlap_needed=FLAGS.overlap_needed,
                             pooling=FLAGS.pooling,
                             extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()

            global_step = tf.Variable(0, name='global_step', trainable=False)
            starter_learning_rate = FLAGS.learning_rate
            # learning_rate = tf.train.exponential_decay(starter_learning_rate,global_step,100,0.96)
            # optimizer = tf.train.GradientDescentOptimizer(starter_learning_rate,global_step = global_step)
            optimizer = tf.train.GradientDescentOptimizer(
                starter_learning_rate)
            # optimizer = tf.train.AdamOptimizer(starter_learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)

            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(
                tensorboard_log_dir + "/train", sess.graph)

            sess.run(tf.global_variables_initializer())

            map_max = 0.020
            # loss_max = 0.3
            for i in range(FLAGS.num_epochs):
                print("\nepoch " + str(i) + "\n")
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     d_len=d_max_sent_length)

                # datas = batch_gen_with_point_wise(train,alphabet,FLAGS.batch_size,overlap_dict = d,
                # 	q_len = q_max_sent_length,d_len = d_max_sent_length)
                datas = batch_gen_with_list_wise(train,
                                                 alphabet,
                                                 FLAGS.batch_size,
                                                 q_len=q_max_sent_length,
                                                 d_len=d_max_sent_length,
                                                 overlap_dict=d)
                # if i <2:
                # 	continue
                j = 1
                for data in datas:
                    feed_dict = {
                        cnn.query: data[0],
                        cnn.document: data[1],
                        cnn.input_label: data[2],
                        cnn.q_overlap: data[3],
                        cnn.d_overlap: data[4]
                    }
                    _, step, logits, loss, scores, input_label = sess.run([
                        train_op, global_step, cnn.logits, cnn.loss,
                        cnn.scores, cnn.input_label
                    ], feed_dict)
                    # train_writer.add_summary(rs,i)
                    # print ("density_trace")
                    # print (density_trace)
                    # print ("input_label")
                    # print (input_label)
                    # print ("label")
                    # print (data[2])
                    # print ("logits")
                    # print (logits)
                    # print ("p_label")
                    # print (p_label)
                    # print ("scores:")
                    # print (scores)
                    print("{} loss: {}".format(j, loss))
                    j += 1
                    # # print ("para")
                    # # print (para)
                    # print ("score"+str(scores))
                    time_str = datetime.datetime.now().isoformat()

                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    d_max_sent_length)
                # print ("train predict")
                # print (predicted[:,-1])
                map_NDCG0_NDCG1_ERR_p_train = evaluation_test.evaluationBypandas(
                    train, predicted[:, -1])
                # precision_train = evaluation.precision(train,predicted[:,-1])
                # predicted = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,d_max_sent_length)
                # map_mrr_dev = evaluation.evaluationBypandas(dev,predicted[:,-1])
                # precision_dev = evaluation.precision(dev,predicted[:,-1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         d_max_sent_length)

                # print ("test predict")
                # print (predicted_test[:,-1])

                map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
                    test, predicted_test[:, -1])
                # precision_test = evaluation.precision(test,predicted_test[:,-1])

                if map_NDCG0_NDCG1_ERR_p_test[0] > map_max:
                    map_max = map_NDCG0_NDCG1_ERR_p_test[0]
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_NDCG0_NDCG1_ERR_p_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    print("Model saved in file: ", save_path)

                print("{}:train epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                    i, map_NDCG0_NDCG1_ERR_p_train))
                # print('precision_train',precision_train)
                # print ("{}:dev epoch:map mrr {}".format(i,map_mrr_dev))
                # print('precision_dev',precision_dev)
                # f = open()
                print("{}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                    i, map_NDCG0_NDCG1_ERR_p_test))
                # file = "result/listwise_"+timeDay+"_learnrate_"+str(FLAGS.learning_rate)+".txt"
                # f = open(file,"a")
                # f.write("{}:train epoch:map,NDCG0,NDCG1,ERR,p {}".format(i,map_NDCG0_NDCG1_ERR_p_train))
                # f.write("{}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(i,map_NDCG0_NDCG1_ERR_p_test))
                # f.write("\n")
                # f.close()
                # print('precision_test',precision_test)
                # line = " {}:epoch: map_test{},precision_test: {}".format(i,map_mrr_test,precision_test)

                line1 = " {}:epoch: map_train{}".format(
                    i, map_NDCG0_NDCG1_ERR_p_train)
                log.write(line1 + "\n")
                line = " {}:epoch: map_test{}".format(
                    i, map_NDCG0_NDCG1_ERR_p_test)
                log.write(line + "\n")
                log.write("\n")
                log.flush()
            log.close()
Example #10
def test_point_wise():
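    """Train QA_quantum point-wise with complex-valued embeddings, dumping the learned
    position embedding each epoch and tracking the best test MAP/MRR."""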
    train, dev, test = load(FLAGS.data, filter=FLAGS.clean)  #wiki
    # train, test, dev = load(FLAGS.data, filter=FLAGS.clean) #trec
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print(q_max_sent_length)
    print(a_max_sent_length)
    print(len(train))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))

    alphabet, embeddings, embeddings_complex = prepare(
        [train, test, dev],
        max_sent_length=a_max_sent_length,
        dim=FLAGS.embedding_dim,
        is_embedding_needed=True,
        fresh=True)
    print(embeddings_complex)
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            # session_conf = tf.ConfigProto(
            #     allow_soft_placement=FLAGS.allow_soft_placement,
            #     log_device_placement=FLAGS.log_device_placement)

            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            cnn = QA_quantum(max_input_left=q_max_sent_length,
                             max_input_right=a_max_sent_length,
                             vocab_size=len(alphabet),
                             embedding_size=FLAGS.embedding_dim,
                             batch_size=FLAGS.batch_size,
                             embeddings=embeddings,
                             embeddings_complex=embeddings_complex,
                             dropout_keep_prob=FLAGS.dropout_keep_prob,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             is_Embedding_Needed=True,
                             trainable=FLAGS.trainable,
                             overlap_needed=FLAGS.overlap_needed,
                             position_needed=FLAGS.position_needed,
                             pooling=FLAGS.pooling,
                             hidden_num=FLAGS.hidden_num,
                             extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            starter_learning_rate = FLAGS.learning_rate
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step, 100, 0.96)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            # optimizer =  tf.train.GradientDescentOptimizer(learning_rate)

            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            merged = tf.summary.merge_all()
            writer = tf.summary.FileWriter(
                "logs_NNQLM2_embedding_xiangwei_uniform/", sess.graph)
            sess.run(tf.global_variables_initializer())
            map_max = 0.65
            now = int(time.time())
            timeArray = time.localtime(now)
            timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
            timeDay = time.strftime("%Y%m%d", timeArray)
            print(timeStamp)
            for i in range(FLAGS.num_epochs):
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     a_len=a_max_sent_length)
                datas = batch_gen_with_point_wise(train,
                                                  alphabet,
                                                  FLAGS.batch_size,
                                                  overlap_dict=d,
                                                  q_len=q_max_sent_length,
                                                  a_len=a_max_sent_length)
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.input_y: data[2],
                        cnn.q_position: data[3],
                        cnn.a_position: data[4],
                        cnn.overlap: data[5],
                        cnn.q_overlap: data[6],
                        cnn.a_overlap: data[7]
                    }
                    _, step, loss, accuracy, pred, scores, input_y, position = sess.run(
                        [
                            train_op, global_step, cnn.loss, cnn.accuracy,
                            cnn.predictions, cnn.scores, cnn.input_y,
                            cnn.embedding_W_complex
                        ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}  ".format(
                        time_str, step, loss, accuracy))
                now = int(time.time())
                timeArray = time.localtime(now)
                timeStamp = time.strftime("%Y%m%d%H%M%S", timeArray)
                timeDay = time.strftime("%Y%m%d", timeArray)
                print(timeStamp)
                position_embedding = position.reshape([236, 50])
                print(position_embedding)
                np.savetxt("position_embedding_xiangwei_uniform.txt",
                           position_embedding)
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                predicted_label = np.argmax(predicted, 1)
                map_mrr_train = evaluation.evaluationBypandas(
                    train, predicted[:, -1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                print(predicted_test[-2])
                print(predicted_test[-1])
                predicted_label = np.argmax(predicted_test, 1)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test[:, -1])
                if map_mrr_test[0] > map_max:
                    map_max = map_mrr_test[0]
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + \
                        '__' + FLAGS.data + str(map_mrr_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    #save_path = saver.save(sess, out_dir)
                print("{}:train epoch:map mrr {}".format(i, map_mrr_train))
                print("{}:test epoch:map mrr {}".format(i, map_mrr_test))
                line1 = " {}:epoch: map_train{}".format(i, map_mrr_train)
                line2 = " {}:epoch: map_test{}".format(i, map_mrr_test)
                log.write(line1 + '\n' + line2 + '\n')
                log.flush()
            log.close()
Example #11
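# Snippet: build the vocabulary and pretrained embeddings, construct a model via
# models.setup(opts), then train and report MAP/MRR/P@1 on the test split each epoch.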
alphabet = data_helper.get_alphabet([train, test, dev])
logger.info('the number of words :%d ' % len(alphabet))

embedding = data_helper.get_embedding(alphabet,
                                      opts['embedding_file'],
                                      embedding_size=opts['embedding_size'])

opts["embeddings"] = embedding
opts["vocab_size"] = len(alphabet)
opts["max_input_right"] = a_max_sent_length
opts["max_input_left"] = q_max_sent_length
opts["filter_sizes"] = list(map(int, args.filter_sizes.split(",")))

with tf.Graph().as_default():

    model = models.setup(opts)
    model._model_stats()
    for i in range(args.num_epoches):
        data_gen = data_helper.get_mini_batch(train, alphabet, args.batch_size)
        model.train(data_gen, i)

        test_datas = data_helper.get_mini_batch_test(test, alphabet,
                                                     args.batch_size)

        predicted_test = model.predict(test_datas)
        map_, mrr_, p_1 = evaluation.evaluationBypandas(test, predicted_test)

        logger.info('map:{}--mrr:{}--p@1--{}'.format(map_, mrr_, p_1))
        print('map:{}--mrr:{}--p@1--{}'.format(map_, mrr_, p_1))
Example #12
def test():
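    """Train QA_overlap point-wise with TensorBoard summaries, evaluating MAP/MRR on
    dev and test after each of 25 epochs."""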
    train, test, dev = load("trec", filter=False)
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))

    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   is_embedding_needed=True)
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default():
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open("precision.txt", "w") as log:

            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            cnn = QA_overlap(
                max_len_left=q_max_sent_length,
                max_len_right=a_max_sent_length,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                embeddings=embeddings,
                # filter_sizes = list(map(int, FLAGS.filter_sizes.split(","))),
                filter_sizes=[5],
                num_filters=FLAGS.num_filters,
                num_hidden=10,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                is_Embedding_Needed=True)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.all_variables(), max_to_keep=20)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            #summary
            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.histogram_summary(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.scalar_summary(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.merge_summary(grad_summaries)
            # Output directory for models and summaries

            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.scalar_summary("loss", cnn.loss)
            acc_summary = tf.scalar_summary("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.merge_summary(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.train.SummaryWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.train.SummaryWriter(
                dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # seq_process(train, alphabet)
            # seq_process(test, alphabet)
            for i in range(25):
                for x_left_batch, x_right_batch, y_batch, overlap in batch_gen_with_point_wise(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        overlap=True,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length):
                    feed_dict = {
                        cnn.input_left: x_left_batch,
                        cnn.input_right: x_right_batch,
                        cnn.input_y: y_batch,
                        cnn.overlap: overlap,
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }

                    _, step, summaries, loss, accuracy, pred, scores = sess.run(
                        [
                            train_op, global_step, train_summary_op, cnn.loss,
                            cnn.accuracy, cnn.predictions, cnn.scores
                        ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()

                    print("{}: step {}, loss {:g}, acc {:g}  ".format(
                        time_str, step, loss, accuracy))
                    train_summary_writer.add_summary(summaries, step)
                    # print loss

                predicted = prediction(sess, cnn, test, alphabet,
                                       q_max_sent_length, a_max_sent_length)
                predicted_dev = prediction(sess, cnn, dev, alphabet,
                                           q_max_sent_length,
                                           a_max_sent_length)
                # predicted_train = prediction(sess,cnn,train,alphabet)
                print(np.array(predicted).shape)
                print(len(predicted))
                print(len(test))
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted_dev[:, -1])
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted[:, -1])
                # print evaluation.evaluationBypandas(train,predicted_train[:,-1])
                print(map_mrr_dev)
                print(map_mrr_test)
                line = " {}: epoch: precision {}".format(i, map_mrr_test)
                log.write(line + '\n')
Example #13
def test_pair_wise(dns=FLAGS.dns):
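    """Train QA_attentive pair-wise, optionally with dynamic negative sampling (dns),
    checkpointing whenever test MAP improves."""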
    train, test, dev = load(FLAGS.data, filter=True)
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    # alphabet,embeddings = prepare_300([train,test,dev])
    print('alphabet:', len(alphabet))

    with tf.Graph().as_default(), tf.device("/gpu:0"):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            cnn = QA_attentive(max_input_left=q_max_sent_length,
                               max_input_right=a_max_sent_length,
                               batch_size=FLAGS.batch_size,
                               vocab_size=len(alphabet),
                               embedding_size=FLAGS.embedding_dim,
                               filter_sizes=list(
                                   map(int, FLAGS.filter_sizes.split(","))),
                               num_filters=FLAGS.num_filters,
                               dropout_keep_prob=FLAGS.dropout_keep_prob,
                               embeddings=embeddings,
                               l2_reg_lambda=FLAGS.l2_reg_lambda,
                               is_Embedding_Needed=True,
                               trainable=FLAGS.trainable)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            if dns:
                loadfile = "tmp/20170502223124__0.678083232207.ckpt"
                saver.restore(sess, loadfile)
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)
                print(map_mrr_train)
                print(map_mrr_test)
            # seq_process(train, alphabet)
            # seq_process(test, alphabet)
            map_max = 0.65
            for i in range(1000):
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                else:
                    datas = batch_gen_with_pair(train,
                                                alphabet,
                                                FLAGS.batch_size,
                                                q_len=q_max_sent_length,
                                                a_len=a_max_sent_length)

                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2]
                    }
                    _, step, loss, accuracy, score12, score13 = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.score12, cnn.score13
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()

                    print(
                        "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
                        .format(time_str, step, loss, accuracy,
                                np.mean(score12), np.mean(score13)))
                    # print loss
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)

                # # predicted_train = prediction(sess,cnn,train,alphabet,q_max_sent_length,a_max_sent_length)
                # map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev[:,-1])
                # map_mrr_test = evaluation.evaluationBypandas(test,predicted[:,-1])
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # # print evaluation.evaluationBypandas(train,predicted_train[:,-1])
                print "{}:epoch:train map mrr {}".format(i, map_mrr_train)
                print "{}:epoch:test map mrr {}".format(i, map_mrr_test)
                # print "{}:epoch:map mrr {}".format(i,map_mrr_dev)
                line = " {}:epoch: map_train{}----map_test{}".format(
                    i, map_mrr_train[0], map_mrr_test[0])
                log.write(line + '\n')
                log.flush()
                if map_mrr_test[0] > map_max:
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_mrr_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    print "Model saved in file: ", save_path
                    map_max = map_mrr_test[0]
Example #14
def test_point_wise():
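    """Train IR_quantum list-wise with Adam and global-norm gradient clipping,
    checkpointing whenever test MAP improves."""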
    train, test = load(FLAGS.data, FLAGS.file_name)
    q_max_sent_length = FLAGS.max_len_query
    d_max_sent_length = FLAGS.max_len_document

    alphabet, embeddings = get_wordDic_Embedding(FLAGS.data, FLAGS.file_name,
                                                 50)
    # print ("alphabet",len(alphabet))
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = IR_quantum(max_input_query=q_max_sent_length,
                             max_input_docu=d_max_sent_length,
                             vocab_size=len(alphabet),
                             embedding_size=FLAGS.embedding_dim,
                             batch_size=FLAGS.batch_size,
                             embeddings=embeddings,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             trainable=FLAGS.trainable,
                             overlap_needed=FLAGS.overlap_needed,
                             pooling=FLAGS.pooling,
                             extend_feature_dim=FLAGS.extend_feature_dim)

            cnn.build_graph()

            global_step = tf.Variable(0, name='global_step', trainable=False)
            learning_rate = FLAGS.learning_rate

            # optimizer = tf.train.AdamOptimizer(learning_rate,epsilon=1e-08)

            # grads_and_vars = optimizer.compute_gradients(cnn.loss)
            # train_op = optimizer.apply_gradients(grads_and_vars,global_step = global_step)

            optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=1e-08)
            grads, v = zip(*optimizer.compute_gradients(cnn.loss))
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
            train_op = optimizer.apply_gradients(zip(grads, v),
                                                 global_step=global_step)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)

            sess.run(tf.global_variables_initializer())

            map_max = 0.020
            # loss_max = 0.3
            for i in range(FLAGS.num_epochs):
                print("\nepoch " + str(i) + "\n")
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     d_len=d_max_sent_length)
                datas = batch_gen_with_list_wise(train,
                                                 alphabet,
                                                 FLAGS.batch_size,
                                                 q_len=q_max_sent_length,
                                                 d_len=d_max_sent_length,
                                                 overlap_dict=d)
                j = 1
                for data in datas:
                    feed_dict = {
                        cnn.query: data[0],
                        cnn.document: data[1],
                        cnn.input_label: data[2],
                        cnn.q_overlap: data[3],
                        cnn.d_overlap: data[4],
                        cnn.tfidf_value: data[5],
                        cnn.dropout_keep_prob: 0.5
                    }
                    _, step, l2_loss, loss = sess.run(
                        [train_op, global_step, cnn.l2_loss, cnn.loss],
                        feed_dict)
                    print("{} loss: {},l2_loss : {}".format(j, loss, l2_loss))
                    # exit()
                    j += 1
                    time_str = datetime.datetime.now().isoformat()

                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    d_max_sent_length)

                map_NDCG0_NDCG1_ERR_p_train = evaluation_test.evaluationBypandas(
                    train, predicted[:, -1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         d_max_sent_length)

                map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
                    test, predicted_test[:, -1])

                if map_NDCG0_NDCG1_ERR_p_test[0] > map_max:
                    map_max = map_NDCG0_NDCG1_ERR_p_test[0]
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + FLAGS.data + "/" + FLAGS.file_name + "/" + timeDay
                    out_dir = folder + '/' + timeStamp + '_' + str(
                        map_NDCG0_NDCG1_ERR_p_test[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    print("Model saved in file: ", save_path)

                print("{}:train epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                    i, map_NDCG0_NDCG1_ERR_p_train))
                print("{}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                    i, map_NDCG0_NDCG1_ERR_p_test))

                line1 = " {}:epoch: map_train{}".format(
                    i, map_NDCG0_NDCG1_ERR_p_train)
                log.write(line1 + "\n")
                line = " {}:epoch: map_test{}".format(
                    i, map_NDCG0_NDCG1_ERR_p_test)
                log.write(line + "\n")
                log.write("\n")
                log.flush()
            log.close()
Example #15
def main(args):
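    """Train QA_CNN_extend pair-wise end to end: load data, build the graph on the
    chosen GPU, then train and report test MAP/MRR each epoch."""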
    args._parse_flags()
    print("\nParameters:")
    for attr, value in sorted(args.__flags.items()):
        print(("{}={}".format(attr.upper(), value)))
    log_dir = 'log/' + timeDay
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    data_file = log_dir + '/test_' + args.data + timeStamp
    precision = data_file + 'precise'
    print('load data ...........')
    train, test, dev = data_helper.load(args.data, filter=args.clean)

    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))

    alphabet = data_helper.get_alphabet([train, test, dev])
    print('the number of words', len(alphabet))

    print('get embedding')
    if args.data == "quora":
        embedding = data_helper.get_embedding(alphabet, language="cn")
    else:
        embedding = data_helper.get_embedding(alphabet)

    with tf.Graph().as_default(), tf.device("/gpu:" + str(args.gpu)):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = args.allow_soft_placement
        session_conf.log_device_placement = args.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)

        model = QA_CNN_extend(max_input_left=q_max_sent_length,
                              max_input_right=a_max_sent_length,
                              batch_size=args.batch_size,
                              vocab_size=len(alphabet),
                              embedding_size=args.embedding_dim,
                              filter_sizes=list(
                                  map(int, args.filter_sizes.split(","))),
                              num_filters=args.num_filters,
                              hidden_size=args.hidden_size,
                              dropout_keep_prob=args.dropout_keep_prob,
                              embeddings=embedding,
                              l2_reg_lambda=args.l2_reg_lambda,
                              trainable=args.trainable,
                              pooling=args.pooling,
                              conv=args.conv)

        model.build_graph()

        sess.run(tf.global_variables_initializer())

        def train_step(model, sess, batch):
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.answer_negative: data[2],
                    model.q_mask: data[3],
                    model.a_mask: data[4],
                    model.a_neg_mask: data[5]
                }
                _, summary, step, loss, accuracy, score12, score13, see = sess.run(
                    [
                        model.train_op, model.merged, model.global_step,
                        model.loss, model.accuracy, model.score12,
                        model.score13, model.see
                    ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print(
                    "{}: step {}, loss {:g}, acc {:g} ,positive {:g},negative {:g}"
                    .format(time_str, step, loss, accuracy, np.mean(score12),
                            np.mean(score13)))

        def predict(model, sess, batch, test):
            scores = []
            for data in batch:
                feed_dict = {
                    model.question: data[0],
                    model.answer: data[1],
                    model.q_mask: data[2],
                    model.a_mask: data[3]
                }
                score = sess.run(model.score12, feed_dict)
                scores.extend(score)

            return np.array(scores[:len(test)])

        for i in range(args.num_epoches):
            datas = data_helper.get_mini_batch(train, alphabet,
                                               args.batch_size)
            train_step(model, sess, datas)
            test_datas = data_helper.get_mini_batch_test(
                test, alphabet, args.batch_size)

            predicted_test = predict(model, sess, test_datas, test)
            print(len(predicted_test))
            print(len(test))
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            print('map_mrr test', map_mrr_test)
Example #16
def test_pair_wise(dns=FLAGS.dns):
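    """Train QA_CNN_quantum_extend pair-wise, checkpointing on the best dev MAP, then
    restore the best model and report train/dev/test MAP/MRR."""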
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    test = test.reindex(np.random.permutation(test.index))

    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    # alphabet,embeddings = prepare_300([train,test,dev])
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            print("start build model")
            cnn = QA_CNN_quantum_extend(
                max_input_left=q_max_sent_length,
                max_input_right=a_max_sent_length,
                batch_size=FLAGS.batch_size,
                vocab_size=len(alphabet),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                dropout_keep_prob=FLAGS.dropout_keep_prob,
                embeddings=embeddings,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                overlap_needed=FLAGS.overlap_needed,
                learning_rate=FLAGS.learning_rate,
                trainable=FLAGS.trainable,
                extend_feature_dim=FLAGS.extend_feature_dim,
                pooling=FLAGS.pooling,
                position_needed=FLAGS.position_needed,
                conv=FLAGS.conv,
                margin=FLAGS.margin)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')
            # Initialize all variables
            print("build over")
            sess.run(tf.global_variables_initializer())
            print("variables_initializer")

            # saver.restore(sess, 'runs/20170910/20170910154937/wiki')
            map_max = 0.65
            for i in range(FLAGS.num_epochs):

                datas = batch_gen_with_pair(train,
                                            alphabet,
                                            FLAGS.batch_size,
                                            q_len=q_max_sent_length,
                                            a_len=a_max_sent_length,
                                            fresh=FLAGS.fresh,
                                            overlap_dict=None)
                print("load data")
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2],
                        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                    }
                    _, summary, step, loss, accuracy, score12, score13, see = sess.run(
                        [
                            cnn.train_op, cnn.merged, cnn.global_step,
                            cnn.loss, cnn.accuracy, cnn.score12, cnn.score13,
                            cnn.see
                        ], feed_dict)

                    train_writer.add_summary(summary, i)
                    time_str = datetime.datetime.now().isoformat()
                    line = "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}".format(
                        time_str, step, loss, accuracy, np.mean(score12),
                        np.mean(score13))
                    print(line)
                    # print loss
                if i % 1 == 0:
                    predicted_dev = predict(sess, cnn, dev, alphabet,
                                            FLAGS.batch_size,
                                            q_max_sent_length,
                                            a_max_sent_length)
                    map_mrr_dev = evaluation.evaluationBypandas(
                        dev, predicted_dev)
                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             a_max_sent_length)
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted_test)

                    precise_test = evaluation.precision(test, predicted_test)

                    print("test precise : {}".format(precise_test))
                    print("{}:epoch:dev map mrr {}".format(i, map_mrr_dev))
                    print("{}:epoch:test map mrr {}".format(i, map_mrr_test))
                    line = " {}:epoch: precise: {}--- map_dev{}-------map_mrr_test{}".format(
                        i, precise_test, map_mrr_dev[0], map_mrr_test)
                    if map_mrr_dev[0] > map_max:
                        map_max = map_mrr_dev[0]

                        save_path = saver.save(sess, out_dir)
                        print("Model saved in file: ", save_path)

                log.write(line + '\n')
                log.flush()
            print('train over')
            saver.restore(sess, out_dir)
            predicted = predict(sess, cnn, train, alphabet, FLAGS.batch_size,
                                q_max_sent_length, a_max_sent_length)
            train['predicted'] = predicted
            map_mrr_train = evaluation.evaluationBypandas(train, predicted)
            predicted_dev = predict(sess, cnn, dev, alphabet, FLAGS.batch_size,
                                    q_max_sent_length, a_max_sent_length)
            dev['predicted'] = predicted_dev
            map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)
            predicted_test = predict(sess, cnn, test, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     a_max_sent_length)
            test['predicted'] = predicted_test
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            ap = evaluation.get_ap(test, predicted_test)
            ap.to_csv('ap_score_qlm_wiki', header=None, sep='\t')
            print('map_mrr train', map_mrr_train)
            print('map_mrr dev', map_mrr_dev)
            print('map_mrr test', map_mrr_test)
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')
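
Note: the pairwise loops above log score12 (the positive pair) and score13 (the sampled negative pair), and the model takes a margin hyper-parameter, which points to a margin-based hinge objective inside cnn.loss. The graph code itself is not included on this page, so the following is only a minimal sketch of that objective, assuming score12/score13 are per-example similarity scores:

import tensorflow as tf

def pairwise_hinge_loss(score_pos, score_neg, margin=0.05):
    # zero loss once the positive answer outscores the sampled
    # negative by at least `margin`, linear penalty otherwise
    return tf.reduce_mean(tf.maximum(0.0, margin - score_pos + score_neg))

def pairwise_accuracy(score_pos, score_neg):
    # fraction of triples already ranked correctly; this matches the
    # "acc" value printed in the training loops above
    return tf.reduce_mean(tf.cast(score_pos > score_neg, tf.float32))
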
Exemple #17
0
def test_pair_wise(dns=FLAGS.dns):
    train, test, dev = load(FLAGS.data, filter=False)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')
    # train = train[:1000]
    # test = test[:1000]
    # dev = dev[:1000]
    # submit = submit[:1000]
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    # alphabet,embeddings = prepare_300([train,test,dev])
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default(), tf.device("/gpu:0"):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            print "start build model"
            cnn = QA_CNN_extend(max_input_left=q_max_sent_length,
                                max_input_right=a_max_sent_length,
                                batch_size=FLAGS.batch_size,
                                vocab_size=len(alphabet),
                                embedding_size=FLAGS.embedding_dim,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                dropout_keep_prob=FLAGS.dropout_keep_prob,
                                embeddings=embeddings,
                                l2_reg_lambda=FLAGS.l2_reg_lambda,
                                overlap_needed=FLAGS.overlap_needed,
                                learning_rate=FLAGS.learning_rate,
                                trainable=FLAGS.trainable,
                                extend_feature_dim=FLAGS.extend_feature_dim,
                                model_type=FLAGS.CNN_type)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            # Initialize all variables
            print "build over"
            sess.run(tf.global_variables_initializer())
            print "variables_initializer"
            if dns:
                loadfile = "runs/20170604/20170604183633__nlpcc0.833940715393"
                saver.restore(sess, loadfile)
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train, predicted)
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(test, predicted)
                print(map_mrr_train)
                print(map_mrr_test)
            # seq_process(train, alphabet)
            # seq_process(test, alphabet)
            '''
            print 'get my submit result'
            loadfile="runs/20170604/20170604183633__nlpcc0.833940715393"
            saver.restore(sess, loadfile)
            predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
            train['predicted'] = predicted
            train['predicted'].to_csv('train.QApair.TJU_IR_QA2017_train.score',index = False,sep = '\t')
            map_mrr_train = evaluation.evaluationBypandas(train,predicted)
            predicted_test = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
            test['predicted'] = predicted_test
            test['predicted'].to_csv('train.QApair.TJU_IR_QA2017.score',index = False,sep = '\t')
            map_mrr_test = evaluation.evaluationBypandas(test,predicted_test)
            print 'map_mrr train',map_mrr_train
            print 'map_mrr test',map_mrr_test

            predict_submit = predict(sess,cnn,submit,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
            submit['predicted'] = predict_submit
            submit['predicted'].to_csv('train.QApair.TJU_IR_QA2017_submit.score',index = False,sep = '\t')
            print 'predict over'

            '''
            map_max = 0.65
            for i in range(1000):
                if dns:
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                else:
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length,
                        fresh=FLAGS.fresh,
                        overlap_dict=d)
                print "load data"
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.answer_negative: data[2],
                        cnn.q_pos_overlap: data[3],
                        cnn.q_neg_overlap: data[4],
                        cnn.a_pos_overlap: data[5],
                        cnn.a_neg_overlap: data[6]
                    }
                    _, step, loss, accuracy, score12, score13 = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.score12, cnn.score13
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()

                    line = "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}".format(
                        time_str, step, loss, accuracy, np.mean(score12),
                        np.mean(score13))
                    print(line)
                    # print loss
                if i % 1 == 0:
                    predicted = predict(sess, cnn, test, alphabet,
                                        FLAGS.batch_size, q_max_sent_length,
                                        a_max_sent_length)
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted)
                    print "{}:epoch:test map mrr {}".format(i, map_mrr_test)
                    line = " {}:epoch: map_test{}".format(i, map_mrr_test[0])
                    if map_mrr_test[0] > map_max:
                        map_max = map_mrr_test[0]
                        timeStamp = time.strftime(
                            "%Y%m%d%H%M%S", time.localtime(int(time.time())))
                        folder = 'runs/' + timeDay
                        out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                            map_mrr_test[0])
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        save_path = saver.save(sess, out_dir)
                        print "Model saved in file: ", save_path
                '''
                predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                map_mrr_train = evaluation.evaluationBypandas(train,predicted)
                predicted = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(dev,predicted)
                print "{}:epoch:train map mrr {}".format(i,map_mrr_train)
                # print "{}:epoch:test map mrr {}".format(i,map_mrr_test)
                print "{}:epoch:dev map mrr {}".format(i,map_mrr_dev)
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    predicted = predict(sess,cnn,test,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                    map_mrr_test = evaluation.evaluationBypandas(test,predicted)
                    print "{}:epoch:test map mrr {}".format(i,map_mrr_test)
                    line = " {}:epoch: map_test{}".format(i,map_mrr_test[0])
                    if map_mrr_test[0] > map_max:
                        timeStamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))
                        folder = 'runs/' + timeDay
                        out_dir = folder +'/'+timeStamp+'__'+FLAGS.data+str(map_mrr_test[0])
                        if not os.path.exists(folder):
                            os.makedirs(folder)
                        save_path = saver.save(sess, out_dir)
                        print "Model saved in file: ", save_path
                        
                '''
                # # predicted_train = prediction(sess,cnn,train,alphabet,q_max_sent_length,a_max_sent_length)
                # map_mrr_dev = evaluation.evaluationBypandas(dev,predicted_dev[:,-1])
                # map_mrr_test = evaluation.evaluationBypandas(test,predicted[:,-1])
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # # print evaluation.evaluationBypandas(train,predicted_train[:,-1])

                # line = " {}:epoch: map_train{}----map_test{}----map_dev{}".format(i,map_mrr_train[0],map_mrr_test[0],map_mrr_dev[0])
                # line = " {}:epoch: map_train{}----map_dev{}".format(i,map_mrr_train[0],map_mrr_dev[0])
                log.write(line + '\n')
                log.flush()
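
Note: every example on this page hands a score list to evaluation.evaluationBypandas, whose implementation is not shown. Judging from the unpacking map_, mrr_ = ... in the next example and the binary 'flag' relevance column, it plausibly computes MAP and MRR grouped by question; a minimal sketch under those assumptions:

def evaluation_by_pandas(df, scores):
    # rank each question's candidate answers by model score and
    # average AP / reciprocal rank over questions; assumes the frame
    # has 'question' and binary 'flag' (relevance) columns
    df = df.copy()
    df['score'] = list(scores)

    def average_precision(group):
        flags = group.sort_values('score', ascending=False)['flag'].values
        hits, precisions = 0, []
        for rank, flag in enumerate(flags, start=1):
            if flag == 1:
                hits += 1
                precisions.append(hits / float(rank))
        return sum(precisions) / max(1.0, float(flags.sum()))

    def reciprocal_rank(group):
        flags = group.sort_values('score', ascending=False)['flag'].values
        for rank, flag in enumerate(flags, start=1):
            if flag == 1:
                return 1.0 / rank
        return 0.0

    grouped = df.groupby('question')
    return (grouped.apply(average_precision).mean(),
            grouped.apply(reciprocal_rank).mean())
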
Exemple #18
0
with tf.Graph().as_default():

    model = Attentive_CNN(opts)
    model._model_stats()
    for i in range(args.num_epoches):
        data_gen = helper.batch_iter(train,
                                     args.batch_size,
                                     alphabet,
                                     shuffle=True,
                                     q_len=q_max_sent_length,
                                     a_len=a_max_sent_length)
        model.train(data_gen, i)

        test_datas = helper.batch_iter(test,
                                       args.batch_size,
                                       alphabet,
                                       q_len=q_max_sent_length,
                                       a_len=a_max_sent_length)

        test['score'] = model.predict(test_datas)
        map_, mrr_ = evaluation.evaluationBypandas(test,
                                                   test['score'].to_list())
        df_group = test.groupby('question').filter(
            evaluation.mrr_metric_filter)
        df_group[['question', 'answer', 'flag', 'score']].to_csv('badcase',
                                                                 sep='\t',
                                                                 index=False)
        logger.info('map:{}--mrr:{}'.format(map_, mrr_))
        print('map:{}--mrr:{}'.format(map_, mrr_))
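
Note: evaluation.mrr_metric_filter is not shown above. Given its use with groupby('question').filter to dump a 'badcase' file, a plausible sketch keeps only the questions whose top-scored candidate is not a correct answer:

def mrr_metric_filter(group):
    # keep this question's group only when the best-scored candidate
    # is not relevant, i.e. the ranking produced a bad case
    best = group.sort_values('score', ascending=False).iloc[0]
    return best['flag'] != 1
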
Exemple #19
0
def get_feature():
    train, test, dev = load("trec", filter=False)
    test = test.reindex(np.random.permutation(test.index))

    test['pred'] = test.apply(overlap_jiabing, axis=1)
    print(evaluation.evaluationBypandas(test, test['pred']))
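
Note: overlap_jiabing is not defined in this snippet; it is presumably a hand-crafted word-overlap scorer applied row by row. A hypothetical stand-in with the same row-wise interface, here using Jaccard similarity over token sets:

def jaccard_overlap(row):
    # hypothetical replacement for overlap_jiabing: Jaccard
    # similarity between question and answer token sets
    q = set(row['question'].lower().split())
    a = set(row['answer'].lower().split())
    return len(q & a) / float(len(q | a)) if q | a else 0.0

# usage mirrors the example:
# test['pred'] = test.apply(jaccard_overlap, axis=1)
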
Exemple #20
0
#            test_datas = data_helper.get_mini_batch_test(dev,alphabet,args.batch_size)
#            predicted_test = predict(model,sess,test_datas,dev)
#            map_mrr_test = evaluation.evaluationBypandas(dev,predicted_test)
#
#            logger.info('map_mrr dev' +str(map_mrr_test))
#            print('map_mrr dev' +str(map_mrr_test))
#            map,mrr,p1 = map_mrr_test
#            if p1>best_p1:
#                best_p1=p1
#                filename= "checkpoint/"+args.data+"_"+str(p1)+".model"
#                save_path = saver.save(sess, filename)
#        #            load_path = saver.restore(sess, model_path)
#
#                import shutil
#                shutil.rmtree("model")
#                builder = tf.saved_model.builder.SavedModelBuilder("./model")
#                builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING])
#                builder.save(True)
#
#

        test_datas = data_helper.get_mini_batch_test(test, alphabet,
                                                     args.batch_size)

        predicted_test = predict(model, sess, test_datas, test)
        map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

        logger.info('map_mrr test' + str(map_mrr_test))
        print('epoch {} map_mrr test {}'.format(i, map_mrr_test))
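
Note: data_helper.get_mini_batch_test is assumed to turn the test frame into fixed-length index batches that preserve row order, so predictions can be written back onto the DataFrame. A minimal sketch under that assumption, with alphabet as a word-to-index dict and 0 as the padding index:

def pad_to_length(tokens, alphabet, max_len, unk=1):
    # map tokens to indices, truncate, and right-pad with 0
    ids = [alphabet.get(w, unk) for w in tokens[:max_len]]
    return ids + [0] * (max_len - len(ids))

def mini_batch_test(df, alphabet, batch_size, q_len, a_len):
    # yield (question, answer) index batches in DataFrame order
    q = [pad_to_length(s.split(), alphabet, q_len) for s in df['question']]
    a = [pad_to_length(s.split(), alphabet, a_len) for s in df['answer']]
    for i in range(0, len(q), batch_size):
        yield q[i:i + batch_size], a[i:i + batch_size]
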
Exemple #21
0
def test_point_wise():
    # creat_train_test("2")
    length = 5
    # train,test = load()
    test_length = load_test_apply(length)
    # train,test,dev = load(FLAGS.data,filter = FLAGS.clean)
    # print ()
    # q_max_sent_length = 4
    q_max_sent_length = FLAGS.max_len_query
    # d_max_sent_length = 21
    d_max_sent_length = FLAGS.max_len_document

    # alphabet,embeddings = prepare([train,test,dev],dim = FLAGS.embedding_dim,is_embedding_needed = True,fresh = True)
    # alphabet,embeddings = get_wordDic_Embedding(300)
    alphabet, embeddings = get_wordDic_Embedding(50)
    print("alphabet", len(alphabet))
    # exit()
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            cnn = IR_quantum(max_input_query=q_max_sent_length,
                             max_input_docu=d_max_sent_length,
                             vocab_size=len(alphabet),
                             embedding_size=FLAGS.embedding_dim,
                             batch_size=FLAGS.batch_size,
                             embeddings=embeddings,
                             filter_sizes=list(
                                 map(int, FLAGS.filter_sizes.split(","))),
                             num_filters=FLAGS.num_filters,
                             l2_reg_lambda=FLAGS.l2_reg_lambda,
                             trainable=FLAGS.trainable,
                             overlap_needed=FLAGS.overlap_needed,
                             pooling=FLAGS.pooling,
                             extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()
            ckpt_dir = "runs/20181022"
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print(ckpt.model_checkpoint_path)
                saver.restore(
                    sess, ckpt.model_checkpoint_path)  # restore all variables
            else:
                raise FileNotFoundError("no fund saver!!")
            sess.run(tf.global_variables_initializer())

            global_step = tf.Variable(0, name='global_step', trainable=False)
            starter_learning_rate = FLAGS.learning_rate
            # learning_rate = tf.train.exponential_decay(starter_learning_rate,global_step,100,0.96)
            # optimizer = tf.train.GradientDescentOptimizer(starter_learning_rate,global_step = global_step)
            # optimizer = tf.train.GradientDescentOptimizer(starter_learning_rate)
            optimizer = tf.train.AdamOptimizer(starter_learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=4)

            # merged = tf.summary.merge_all()
            # train_writer = tf.summary.FileWriter(tensorboard_log_dir+"/train",sess.graph)
            print("start predict test!!")
            predicted_test = predict(sess, cnn, test_length, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     d_max_sent_length)
            print("start evaluation test!!")
            map_NDCG0_NDCG1_ERR_p_test = evaluation_test.evaluationBypandas(
                test_length, predicted_test[:, -1])
            print("query length {}:test epoch:map,NDCG0,NDCG1,ERR,p {}".format(
                length, map_NDCG0_NDCG1_ERR_p_test))
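
Note: evaluation_test.evaluationBypandas reports MAP, two NDCG cut-offs, ERR and precision for graded relevance, but only the call is shown. For reference, a standard NDCG@k over a score-sorted list of graded labels (a sketch, not the module's code):

import numpy as np

def ndcg_at_k(relevances, k):
    # relevances: graded labels already sorted by predicted score
    rel = np.asarray(relevances, dtype=float)[:k]
    if rel.size == 0:
        return 0.0
    dcg = np.sum((2 ** rel - 1) / np.log2(np.arange(2, rel.size + 2)))
    ideal = np.sort(np.asarray(relevances, dtype=float))[::-1][:k]
    idcg = np.sum((2 ** ideal - 1) / np.log2(np.arange(2, ideal.size + 2)))
    return dcg / idcg if idcg > 0 else 0.0
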
Exemple #22
0
def test_point_wise():
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    train = train.fillna('')
    test = test.fillna('')
    dev = dev.fillna('')
    # submit = submit.fillna('')
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    # train = train[:1000]
    # test = test[:1000]
    # dev = dev[:1000]
    # submit = dev[:100]
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))

    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=True)
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default():
        with tf.device("/gpu:0"):
            # session_conf = tf.ConfigProto(
            #     allow_soft_placement=FLAGS.allow_soft_placement,
            #     log_device_placement=FLAGS.log_device_placement)

            session_conf = tf.ConfigProto()
            session_conf.allow_soft_placement = FLAGS.allow_soft_placement
            session_conf.log_device_placement = FLAGS.log_device_placement
            session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            cnn = QA(max_input_left=q_max_sent_length,
                     max_input_right=a_max_sent_length,
                     vocab_size=len(alphabet),
                     embedding_size=FLAGS.embedding_dim,
                     batch_size=FLAGS.batch_size,
                     embeddings=embeddings,
                     dropout_keep_prob=FLAGS.dropout_keep_prob,
                     filter_sizes=list(map(int,
                                           FLAGS.filter_sizes.split(","))),
                     num_filters=FLAGS.num_filters,
                     l2_reg_lambda=FLAGS.l2_reg_lambda,
                     is_Embedding_Needed=True,
                     trainable=FLAGS.trainable,
                     overlap_needed=FLAGS.overlap_needed,
                     position_needed=FLAGS.position_needed,
                     pooling=FLAGS.pooling,
                     extend_feature_dim=FLAGS.extend_feature_dim)
            cnn.build_graph()
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(starter_learning_rate,
                                                       global_step, 100, 0.96)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # seq_process(train, alphabet)
            # seq_process(test, alphabet)
            map_max = 0.65
            for i in range(30):
                d = get_overlap_dict(train,
                                     alphabet,
                                     q_len=q_max_sent_length,
                                     a_len=a_max_sent_length)
                datas = batch_gen_with_point_wise(train,
                                                  alphabet,
                                                  FLAGS.batch_size,
                                                  overlap_dict=d,
                                                  q_len=q_max_sent_length,
                                                  a_len=a_max_sent_length)
                for data in datas:
                    feed_dict = {
                        cnn.question: data[0],
                        cnn.answer: data[1],
                        cnn.input_y: data[2],
                        cnn.q_overlap: data[3],
                        cnn.a_overlap: data[4],
                        cnn.q_position: data[5],
                        cnn.a_position: data[6]
                    }
                    _, step, loss, accuracy, pred, scores, see = sess.run([
                        train_op, global_step, cnn.loss, cnn.accuracy,
                        cnn.predictions, cnn.scores, cnn.see
                    ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}  ".format(
                        time_str, step, loss, accuracy))

                    # print loss

                # predicted = predict(sess,cnn,train,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted[:,-1])
                predicted = predict(sess, cnn, dev, alphabet, FLAGS.batch_size,
                                    q_max_sent_length, a_max_sent_length)
                map_mrr_dev = evaluation.evaluationBypandas(
                    dev, predicted[:, -1])
                predicted_test = predict(sess, cnn, test, alphabet,
                                         FLAGS.batch_size, q_max_sent_length,
                                         a_max_sent_length)
                map_mrr_test = evaluation.evaluationBypandas(
                    test, predicted_test[:, -1])
                if map_mrr_dev[0] > map_max:
                    map_max = map_mrr_dev[0]
                    timeStamp = time.strftime("%Y%m%d%H%M%S",
                                              time.localtime(int(time.time())))
                    folder = 'runs/' + timeDay
                    out_dir = folder + '/' + timeStamp + '__' + FLAGS.data + str(
                        map_mrr_dev[0])
                    if not os.path.exists(folder):
                        os.makedirs(folder)
                    save_path = saver.save(sess, out_dir)
                    print "Model saved in file: ", save_path
                # predicted = predict(sess,cnn,dev,alphabet,FLAGS.batch_size,q_max_sent_length,a_max_sent_length)
                # map_mrr_dev = evaluation.evaluationBypandas(dev,predicted[:,-1])
                # map_mrr_train = evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # print evaluation.evaluationBypandas(train,predicted_train[:,-1])
                # print "{}:train epoch:map mrr {}".format(i,map_mrr_train)
                print "{}:dev epoch:map mrr {}".format(i, map_mrr_dev)
                print "{}:test epoch:map mrr {}".format(i, map_mrr_test)
                # line = " {}:epoch: map_train{}----map_test{}----map_dev{}".format(i,map_mrr_train[0],map_mrr_test[0],map_mrr_dev[0])
                line = " {}:epoch: map_dev{}----map_test{}".format(
                    i, map_mrr_dev[0], map_mrr_test[0])
                log.write(line + '\n')
                log.flush()
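
Note: the pointwise feed_dict above also passes q_overlap / a_overlap tensors built by get_overlap_dict and batch_gen_with_point_wise, which are not shown. Word-overlap CNN features of this kind are typically small per-token indices looked up in an extra embedding; a sketch of one plausible encoding (0 = padding, 1 = no overlap, 2 = overlap; an assumption, not necessarily this repo's exact scheme):

def overlap_indices(text, other, max_len):
    # index per token: 2 if it also occurs in the other sentence,
    # 1 otherwise, 0 for padding
    other_set = set(other.split())
    feats = [2 if w in other_set else 1 for w in text.split()[:max_len]]
    return feats + [0] * (max_len - len(feats))

# q_overlap = overlap_indices(question, answer, q_max_sent_length)
# a_overlap = overlap_indices(answer, question, a_max_sent_length)
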
Exemple #23
0
def test_pair_wise(dns=FLAGS.dns):
    train, test, dev = load(FLAGS.data, filter=FLAGS.clean)
    # train = train[:10000]
    # test = test[:10000]
    # dev = dev[:10000]
    # submit = submit[:1000]
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    print('q_question_length:{} a_question_length:{}'.format(
        q_max_sent_length, a_max_sent_length))
    print('train question unique:{}'.format(len(train['question'].unique())))
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   dim=FLAGS.embedding_dim,
                                   is_embedding_needed=True,
                                   fresh=FLAGS.fresh)
    # alphabet,embeddings = prepare_300([train,test,dev])
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default(), tf.device("/gpu:" + str(FLAGS.gpu)):
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto()
        session_conf.allow_soft_placement = FLAGS.allow_soft_placement
        session_conf.log_device_placement = FLAGS.log_device_placement
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open(precision, "w") as log:
            log.write(str(FLAGS.__flags) + '\n')
            folder = 'runs/' + timeDay + '/' + timeStamp + '/'
            out_dir = folder + FLAGS.data
            if not os.path.exists(folder):
                os.makedirs(folder)
            # train,test,dev = load("trec",filter=True)
            # alphabet,embeddings = prepare([train,test,dev],is_embedding_needed = True)
            print "start build model"
            cnn = QA_RNN_extend(max_input_left=q_max_sent_length,
                                max_input_right=a_max_sent_length,
                                batch_size=FLAGS.batch_size,
                                vocab_size=len(alphabet),
                                embedding_size=FLAGS.embedding_dim,
                                filter_sizes=list(
                                    map(int, FLAGS.filter_sizes.split(","))),
                                num_filters=FLAGS.num_filters,
                                dropout_keep_prob=FLAGS.dropout_keep_prob,
                                embeddings=embeddings,
                                l2_reg_lambda=FLAGS.l2_reg_lambda,
                                overlap_needed=FLAGS.overlap_needed,
                                learning_rate=FLAGS.learning_rate,
                                trainable=FLAGS.trainable,
                                extend_feature_dim=FLAGS.extend_feature_dim,
                                pooling=FLAGS.pooling,
                                position_needed=FLAGS.position_needed,
                                conv=FLAGS.conv)
            cnn.build_graph()

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            train_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(log_dir + '/test')
            # Initialize all variables
            print "build over"
            sess.run(tf.global_variables_initializer())
            print "variables_initializer"

            map_max = 0.65
            for i in range(FLAGS.num_epochs):
                if dns:  # the parameter defaults to FLAGS.dns
                    samples = dns_sample(train,
                                         alphabet,
                                         q_max_sent_length,
                                         a_max_sent_length,
                                         sess,
                                         cnn,
                                         FLAGS.batch_size,
                                         neg_sample_num=10)
                    datas = batch_gen_with_pair_dns(samples, FLAGS.batch_size)
                    print('load dns data')
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2]
                        }
                        _, step, loss, accuracy, score12, score13 = sess.run([
                            cnn.train_op, cnn.global_step, cnn.loss,
                            cnn.accuracy, cnn.score12, cnn.score13
                        ], feed_dict)
                        time_str = datetime.datetime.now().isoformat()
                        line = "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}".format(
                            time_str, step, loss, accuracy, np.mean(score12),
                            np.mean(score13))
                        print(line)
                else:
                    d = get_overlap_dict(train,
                                         alphabet,
                                         q_len=q_max_sent_length,
                                         a_len=a_max_sent_length)
                    datas = batch_gen_with_pair_overlap(
                        train,
                        alphabet,
                        FLAGS.batch_size,
                        q_len=q_max_sent_length,
                        a_len=a_max_sent_length,
                        fresh=FLAGS.fresh,
                        overlap_dict=d)
                    print "load data"
                    for data in datas:
                        feed_dict = {
                            cnn.question: data[0],
                            cnn.answer: data[1],
                            cnn.answer_negative: data[2],
                            cnn.q_pos_overlap: data[3],
                            cnn.q_neg_overlap: data[4],
                            cnn.a_pos_overlap: data[5],
                            cnn.a_neg_overlap: data[6],
                            cnn.q_position: data[7],
                            cnn.a_pos_position: data[8],
                            cnn.a_neg_position: data[9]
                        }
                        _, summary, step, loss, accuracy, score12, score13 = sess.run(
                            [
                                cnn.train_op, cnn.merged, cnn.global_step,
                                cnn.loss, cnn.accuracy, cnn.score12,
                                cnn.score13
                            ], feed_dict)
                        train_writer.add_summary(summary, i)
                        time_str = datetime.datetime.now().isoformat()
                        line = "{}: step {}, loss {:g}, acc {:g}, positive {:g}, negative {:g}".format(
                            time_str, step, loss, accuracy, np.mean(score12),
                            np.mean(score13))
                        print(line)
                        # print loss
                if i % 1 == 0:
                    predicted_dev = predict(sess, cnn, dev, alphabet,
                                            FLAGS.batch_size,
                                            q_max_sent_length,
                                            a_max_sent_length)
                    map_mrr_dev = evaluation.evaluationBypandas(
                        dev, predicted_dev)
                    predicted_test = predict(sess, cnn, test, alphabet,
                                             FLAGS.batch_size,
                                             q_max_sent_length,
                                             a_max_sent_length)
                    map_mrr_test = evaluation.evaluationBypandas(
                        test, predicted_test)

                    print "{}:epoch:dev map mrr {}".format(i, map_mrr_dev)
                    print "{}:epoch:test map mrr {}".format(i, map_mrr_test)
                    line = " {}:epoch: map_dev{}-------map_mrr_test{}".format(
                        i, map_mrr_dev[0], map_mrr_test)
                    if map_mrr_dev[0] > map_max:
                        map_max = map_mrr_dev[0]
                        # timeStamp = time.strftime("%Y%m%d%H%M%S", time.localtime(int(time.time())))

                        save_path = saver.save(sess, out_dir)
                        print "Model saved in file: ", save_path

                log.write(line + '\n')
                log.flush()
            print('train over')
            saver.restore(sess, out_dir)
            predicted = predict(sess, cnn, train, alphabet, FLAGS.batch_size,
                                q_max_sent_length, a_max_sent_length)
            train['predicted'] = predicted
            train['predicted'].to_csv('train.QApair.TJU_IR_QA2017_train.score',
                                      index=False,
                                      sep='\t')
            map_mrr_train = evaluation.evaluationBypandas(train, predicted)

            predicted_dev = predict(sess, cnn, dev, alphabet, FLAGS.batch_size,
                                    q_max_sent_length, a_max_sent_length)
            dev['predicted'] = predicted_dev
            dev['predicted'].to_csv('train.QApair.TJU_IR_QA2017_dev.score',
                                    index=False,
                                    sep='\t')
            map_mrr_dev = evaluation.evaluationBypandas(dev, predicted_dev)

            predicted_test = predict(sess, cnn, test, alphabet,
                                     FLAGS.batch_size, q_max_sent_length,
                                     a_max_sent_length)

            test['predicted'] = predicted_test
            test['predicted'].to_csv('train.QApair.TJU_IR_QA2017.score',
                                     index=False,
                                     sep='\t')
            map_mrr_test = evaluation.evaluationBypandas(test, predicted_test)

            print('map_mrr train', map_mrr_train)
            print('map_mrr dev', map_mrr_dev)
            print('map_mrr test', map_mrr_test)
            log.write(str(map_mrr_train) + '\n')
            log.write(str(map_mrr_test) + '\n')
            log.write(str(map_mrr_dev) + '\n')
            predict(sess, cnn, train[:100], alphabet, 20, q_max_sent_length,
                    a_max_sent_length)
def main():
    train, test, dev = load("wiki", filter=True)
    q_max_sent_length = max(
        map(lambda x: len(x), train['question'].str.split()))
    a_max_sent_length = max(map(lambda x: len(x), train['answer'].str.split()))
    q_max_sent_length = 40
    a_max_sent_length = 40
    print('train length', len(train))
    print('test length', len(test))
    print('dev length', len(dev))
    alphabet, embeddings = prepare([train, test, dev],
                                   is_embedding_needed=True)
    print('alphabet:', len(alphabet))
    with tf.Graph().as_default():
        # with tf.device("/cpu:0"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default(), open("precision.txt", "w") as log:
            cnn = QA_CNN_Attentive(max_input_left=q_max_sent_length,
                                   max_input_right=a_max_sent_length,
                                   batch_size=FLAGS.batch_size,
                                   vocab_size=len(alphabet),
                                   embeddings=embeddings,
                                   embedding_size=FLAGS.embedding_dim,
                                   num_filters=FLAGS.num_filters,
                                   dropout_keep_prob=1.0,
                                   l2_reg_lambda=FLAGS.l2_reg_lambda,
                                   is_Embedding_Needed=True,
                                   trainable=True)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            for i in range(1000):
                for x_batch_1, x_batch_2, x_batch_3 in batch_gen_with_pair_whole(
                        train, alphabet, FLAGS.batch_size):

                    feed_dict = {
                        cnn.question: x_batch_1,
                        cnn.answer: x_batch_2,
                        cnn.answer_negative: x_batch_3,
                    }

                    _, step, loss, accuracy, scores1, scores2, a1, a2, U = sess.run(
                        [
                            train_op, global_step, cnn.loss, cnn.accuracy,
                            cnn.score12, cnn.score13, cnn.attention_q,
                            cnn.attention_a, cnn.U
                        ], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g} positive {:g} negative {:g} mean_pooling {:g}"
                        .format(time_str, step, loss,
                                accuracy, np.mean(scores1), np.mean(scores2),
                                np.mean(a1)))
                    # print a1
                predicted = predict(sess, cnn, train, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                print(evaluation.evaluationBypandas(train, predicted))
                predicted = predict(sess, cnn, test, alphabet,
                                    FLAGS.batch_size, q_max_sent_length,
                                    a_max_sent_length)
                print(evaluation.evaluationBypandas(test, predicted))
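
Note: QA_CNN_Attentive exposes attention_q, attention_a and a learned matrix U, which matches attentive pooling in the style of dos Santos et al. (2016). The class itself is not shown, so this is only a sketch of that mechanism with assumed [batch, len, dim] inputs:

import tensorflow as tf

def attentive_pooling(Q, A, U):
    # soft alignment G = tanh(Q U A^T), pooled by max over each axis
    # and normalised with softmax to weight question / answer tokens
    QU = tf.einsum('bqd,de->bqe', Q, U)
    G = tf.tanh(tf.matmul(QU, A, transpose_b=True))        # [b, q_len, a_len]
    attention_q = tf.nn.softmax(tf.reduce_max(G, axis=2))  # [b, q_len]
    attention_a = tf.nn.softmax(tf.reduce_max(G, axis=1))  # [b, a_len]
    r_q = tf.reduce_sum(Q * tf.expand_dims(attention_q, -1), axis=1)
    r_a = tf.reduce_sum(A * tf.expand_dims(attention_a, -1), axis=1)
    return r_q, r_a, attention_q, attention_a
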