Example #1
import codecs

import tensorflow as tf
from tensorflow.contrib import learn

# `m` (the model module) and `read_out_file` are project-local and assumed
# to be importable in the surrounding package.


def out(input_path, out_path):
    max_len = 20
    _index, t1, t2 = read_out_file(input_path)
    # Restore the vocabulary fitted during training (see the train() examples).
    vocab_process = learn.preprocessing.VocabularyProcessor.restore("./vocab")
    t_1 = list(vocab_process.transform(t1))
    t_2 = list(vocab_process.transform(t2))
    vocab = len(vocab_process.vocabulary_)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = m.model(emb_dim=100,
                            length=max_len,
                            vocab_size=vocab,
                            filter_size=2,
                            conv_out=64,
                            lstm_cell=32)
            model.build()
            # Saver.restore overwrites every variable, so no explicit
            # global_variables_initializer run is needed here.
            saver = tf.train.Saver()
            saver.restore(sess, "./log/model/model-4000")

            def final(out_t1, out_t2, index):
                feed_dict = {
                    model.x_1: out_t1,
                    model.x_2: out_t2,
                    model.keep_prob: 1.0
                }
                pre = sess.run([model.pre], feed_dict=feed_dict)
                with codecs.open(out_path, mode="a+", encoding="utf-8") as out_file:
                    # Iterate over the predictions directly; the original
                    # `enumerate` yielded (index, label) tuples, so the
                    # `label == 0` test could never fire. Note the written
                    # label is inverted: class 0 is emitted as 1, else 0.
                    for label in pre[0]:
                        if label == 0:
                            w_str = str(index) + "\t" + str(1) + "\n"
                        else:
                            w_str = str(index) + "\t" + str(0) + "\n"
                        out_file.write(w_str)
                        index += 1
                return index

            len_t = len(t_1)

            # Run inference in chunks of 3000 pairs to bound memory use.
            chunk = 3000
            index = 1
            for x in range(0, len_t, chunk):
                _t1, _t2 = t_1[x:x + chunk], t_2[x:x + chunk]
                index = final(_t1, _t2, index)
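
The inference snippet above depends on a project-local helper, read_out_file, that is not shown. A minimal sketch of what such a reader might look like, assuming a tab-separated test file with rows of the form index, sentence1, sentence2 (the column layout is an assumption, not taken from the source):

import codecs


def read_out_file(path):
    # Hypothetical reader: returns parallel lists of row indices and the two
    # sentences of each pair. The real project helper may differ.
    indices, t1, t2 = [], [], []
    with codecs.open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 3:
                continue  # skip malformed rows
            indices.append(parts[0])
            t1.append(parts[1])
            t2.append(parts[2])
    return indices, t1, t2
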
Example #2
import os
import time
from collections import Counter

import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score
from tensorflow.contrib import learn

# `m`, `read_data`, `balance_data` and `creat_batch` are project-local helpers.


def train():
    train_file = "./atec_nlp_sim_train.csv"
    train_file2 = "./atec_nlp_sim_train_add.csv"
    L2 = 0.02  # L2 regularisation weight (unused in this snippet)
    _index, t_1, t_2, max_len, _labels_1 = read_data(train_file)
    out_index, out_t1, out_t2, max_len, _labels_2 = read_data(train_file2)
    _labels = _labels_1 + _labels_2
    # Fix the padded sequence length instead of using the value from read_data.
    max_len = 65

    vocab_process = learn.preprocessing.VocabularyProcessor(max_len)
    vocab_process.fit(t_1 + t_2 + out_t1 + out_t2)
    t_1 = list(vocab_process.transform(t_1 + out_t1))
    t_2 = list(vocab_process.transform(t_2 + out_t2))
    # Debugging leftover: after transform, every concatenated row has length
    # 2 * max_len, so this Counter is trivially a single entry. It has no
    # effect on training.
    text = np.concatenate((t_1, t_2), axis=1)
    _length_counts = Counter(len(l) for l in text)
    t_1, t_2, labels = balance_data(t_1, t_2, _labels, _use=True)

    global vocab
    vocab = len(vocab_process.vocabulary_)
    vocab_process.save("vocab")

    # Hold out the last 10% of the balanced data as a dev split.
    dev_sample_index = -1 * int(0.1 * float(len(labels)))

    train_1, dev_1 = t_1[:dev_sample_index], t_1[dev_sample_index:]
    train_2, dev_2 = t_2[:dev_sample_index], t_2[dev_sample_index:]
    train_labels, dev_labels = np.array(labels)[:dev_sample_index], np.array(
        labels)[dev_sample_index:]
    # The original also sliced the unbalanced `_labels` for train_hot_labels,
    # whose length no longer matches the balanced data; only the dev-side
    # labels are used (for the F1 check below), so keep just those.
    dev_hot_labels = np.array(labels)[dev_sample_index:]
    epoches = 300

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = m.model(emb_dim=100,
                            length=max_len,
                            vocab_size=vocab,
                            filter_size=2,
                            conv_out=64,
                            lstm_cell=32)
            model.build()
            sess.run(tf.global_variables_initializer())
            # One log directory per run, keyed by launch timestamp.
            timenow = str(int(time.time()))
            if not os.path.exists("./log/" + timenow):
                os.mkdir("./log/" + timenow)
            if not os.path.exists("./log/" + timenow + "/train"):
                os.mkdir("./log/" + timenow + "/train")
            if not os.path.exists("./log/" + timenow + "/dev"):
                os.mkdir("./log/" + timenow + "/dev")
            trainWriter = tf.summary.FileWriter("./log/" + timenow + "/train",
                                                sess.graph)
            devWriter = tf.summary.FileWriter("./log/" + timenow + "/dev",
                                              sess.graph)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)

            def train(train_1, train_2, _labels, step):

                for b_1, b_2, labels in creat_batch(train_1,
                                                    train_2,
                                                    _labels,
                                                    batch_size=256):
                    feed_dict = {
                        model.x_1: b_1,
                        model.x_2: b_2,
                        model.labels: labels,
                        model.keep_prob: 0.5
                    }

                    _, f1score, summary = sess.run(
                        [model.Opt, model.f1_score, model.summary],
                        feed_dict=feed_dict)
                    trainWriter.add_summary(summary, global_step=step)
                    print("step:{}, f1:{}".format(step, f1score))
                    if step > 10000 and step % 1000 == 0:
                        dev_pre = []
                        for dev1, dev2, devlabels in creat_batch(
                                dev_1, dev_2, dev_labels, batch_size=256):
                            d_pre = dev(dev1, dev2, devlabels, step)
                            dev_pre += d_pre[0].tolist()

                        f1 = f1_score(dev_hot_labels, dev_pre)
                        # Log the dev F1 by building the summary protobuf
                        # directly: creating tf.constant/tf.summary ops here on
                        # every evaluation would grow the graph, and
                        # tf.summary.merge returns an op, not the serialized
                        # summary that add_summary expects.
                        f1_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="f1score", simple_value=float(f1))
                        ])
                        devWriter.add_summary(f1_summary, global_step=step)
                        print("step:{}, dev f1:{}".format(step, f1))
                        if not os.path.exists("./log/" + timenow + "/model"):
                            os.mkdir("./log/" + timenow + "/model")
                        saver.save(sess,
                                   "./log/" + timenow + "/model/model",
                                   global_step=step)
                    step += 1
                return step

            def dev(test_1, test_2, labels, step):
                feed_dict = {
                    model.x_1: test_1,
                    model.x_2: test_2,
                    model.labels: labels,
                    model.keep_prob: 1.0
                }
                pre = sess.run([model.pre], feed_dict=feed_dict)
                return pre

            step = 1
            for epoch in range(epoches):
                step = train(train_1, train_2, train_labels, step)
                print("\n\tepoch: {}".format(epoch + 1))
Example #3
import os
import time

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

# `m`, `read_data` and `creat_batch` are project-local helpers, as above.


def train():
    train_file = "./atec_nlp_sim_train.csv"
    train_file2 = "./atec_nlp_sim_train_add.csv"
    L2 = 0.02  # L2 regularisation weight (unused in this snippet)
    _index, t_1, t_2, labels, max_len = read_data(train_file)

    out_index, out_t1, out_t2, labels2, max_len = read_data(train_file2)
    labels = np.concatenate((labels, labels2), axis=0)
    # Fix the padded sequence length instead of using the value from read_data.
    max_len = 20
    vocab_process = learn.preprocessing.VocabularyProcessor(max_len)
    vocab_process.fit(t_1 + t_2 + out_t1 + out_t2)
    t_1 = list(vocab_process.transform(t_1 + out_t1))
    t_2 = list(vocab_process.transform(t_2 + out_t2))
    global vocab
    vocab = len(vocab_process.vocabulary_)
    vocab_process.save("vocab")

    # Hold out the last 10% of the data as a dev split.
    dev_sample_index = -1 * int(0.1 * float(len(labels)))

    train_1, dev_1 = t_1[:dev_sample_index], t_1[dev_sample_index:]
    train_2, dev_2 = t_2[:dev_sample_index], t_2[dev_sample_index:]
    train_labels, dev_labels = (np.array(labels)[:dev_sample_index],
                                np.array(labels)[dev_sample_index:])
    epoches = 20


    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = m.model(emb_dim=100,
                            length=max_len,
                            vocab_size=vocab,
                            filter_size=2,
                            conv_out=64,
                            lstm_cell=32)
            model.build()
            sess.run(tf.global_variables_initializer())
            timenow = str(int(time.time()))
            if not os.path.exists("./log/" + timenow):
                os.mkdir("./log/" + timenow)
            if not os.path.exists("./log/" + timenow + "/train"):
                os.mkdir("./log/" + timenow + "/train")
            if not os.path.exists("./log/" + timenow + "/dev"):
                os.mkdir("./log/" + timenow + "/dev")
            trainWriter = tf.summary.FileWriter("./log/" + timenow + "/train",
                                                sess.graph)
            devWriter = tf.summary.FileWriter("./log/" + timenow + "/dev",
                                              sess.graph)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
            def train(train_1, train_2, _labels, step):

                for b_1, b_2, labels in creat_batch(train_1,
                                                    train_2,
                                                    _labels,
                                                    batch_size=64):
                    feed_dict = {
                        model.x_1: b_1,
                        model.x_2: b_2,
                        model.labels: labels,
                        model.keep_prob: 0.5
                    }

                    _, acc, summary = sess.run(
                        [model.Opt, model.acc, model.summary],
                        feed_dict=feed_dict)
                    trainWriter.add_summary(summary, global_step=step)
                    print("step:{}, acc:{}".format(step, acc))
                    # Evaluate on the dev split (and checkpoint) every 100 steps.
                    if step % 100 == 0:
                        dev(dev_1, dev_2, dev_labels, step)
                    step += 1
                return step

            def dev(test_1, test_2, labels, step):
                feed_dict = {
                    model.x_1: test_1,
                    model.x_2: test_2,
                    model.labels: labels,
                    model.keep_prob: 1.0
                }
                acc, summary = sess.run([model.acc, model.summary],
                                        feed_dict=feed_dict)
                devWriter.add_summary(summary, global_step=step)
                if not os.path.exists("./log/" + timenow + "/model"):
                    os.mkdir("./log/" + timenow + "/model")
                saver.save(sess,
                           "./log/" + timenow + "/model/model",
                           global_step=step)
                print("step:{}, acc:{}".format(step, acc))

            step = 1
            for epoch in range(epoches):
                step = train(train_1, train_2, train_labels, step)
                print("\n\tepoch: {}".format(epoch + 1))