def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:",vocab_size)
    vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) #TODO
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) #TODO
    testX=[]
    question_id_list=[]
    for question_id, question_string_list in test:  # unpack directly; avoids shadowing the built-in `tuple`
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_test_data = len(testX2)
        print("number_of_test_data:", number_of_test_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        # the `+1` in the second range keeps the final example when batch_size is 1
        for start, end in zip(range(0, number_of_test_data, batch_size), range(batch_size, number_of_test_data + 1, batch_size)):
            logits = sess.run(fast_text.logits, feed_dict={fast_text.sentence: testX2[start:end]})  # shape of logits: (1, 1999)
            # 6. get labels using logits
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to the file system
            write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
            index += 1
        predict_target_file_f.close()
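# The prediction loop above depends on two helpers that are not shown in this
# snippet. A minimal sketch of what they might look like, assuming a top-k
# selection over a single logits row and a comma-separated output line (the
# `top_number` default and the output format are assumptions, not the
# original implementation):
import numpy as np

def get_label_using_logits(logits, vocabulary_index2word_label, top_number=5):
    # indices of the top_number largest logits, best first
    index_list = np.argsort(logits)[-top_number:][::-1]
    return [vocabulary_index2word_label[i] for i in index_list]

def write_question_id_with_labels(question_id, labels_list, f):
    # one line per question: id followed by its predicted labels
    f.write(question_id + "," + ",".join(labels_list) + "\n")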
def main(_):
    # 1. load data (X: list of word-index lists, y: int label).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the (vocabulary-indexed) data from it
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    # else:
    if 1 == 1:  # placeholder branch paired with the commented-out cache logic above
        vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab'
        # print("end padding & transform to one hot...")
        x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file)

        # vocab_processor = learn.preprocessing.VocabularyProcessor(2000,min_frequency=1)
        # x = np.array(list(vocab_processor.fit_transform(x_train)))
        # vocab_processor.save(vocab_processor_path)

        vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
            vocab_processor_path)
        x = np.array(list(vocab_processor.transform(x_train)))
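        # transform() maps each token to its id in the restored vocabulary;
        # words unseen at fit time map to index 0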

        trainX = x[:80000]
        testX = x[80000:]
        trainY = y[:80000]
        testY = y[80000:]
        vocab_size = len(vocab_processor.vocabulary_)
        print('vocab_size', vocab_size)

    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        # Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                # word_embedding = tf.constant(trainX, dtype=tf.float32)  # convert to tensor
                # t_assign_embedding = tf.assign(fast_text.Embedding,
                #                            word_embedding)  # assign this value to our embedding variables of our model
                # sess.run(t_assign_embedding)
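                # note: this variant passes trainX here, where the variant
                # below passes vocabulary_index2word instead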
                assign_pretrained_word_embedding(sess, trainX, vocab_size,
                                                 fast_text)

        curr_epoch = sess.run(fast_text.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch,
                           FLAGS.num_epochs):  # range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
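            # note: zip() stops at the shorter range, so a final batch that
            # would run past number_of_training_data is silently skipped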
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [
                        fast_text.loss_val, fast_text.accuracy,
                        fast_text.train_op
                    ],
                    feed_dict={
                        fast_text.sentence: trainX[start:end],
                        fast_text.labels: trainY[start:end]
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 10 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))

            # epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY,
                                              batch_size)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))

                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(
                    sess, save_path,
                    global_step=fast_text.epoch_step)  # fast_text.epoch_step

        # 5. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY,
                                      batch_size)
        print("Test Loss:%.3f\tTest Accuracy:%.3f" % (test_loss, test_acc))
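# do_eval is not shown in this snippet. A minimal sketch under the assumption
# that it mirrors the training loop minus the train op, returning the mean
# batch loss and accuracy (it inherits the same zip() truncation of the final
# partial batch):
def do_eval(sess, fast_text, evalX, evalY, batch_size):
    number_examples = len(evalX)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, number_examples, batch_size),
                          range(batch_size, number_examples, batch_size)):
        curr_loss, curr_acc = sess.run(
            [fast_text.loss_val, fast_text.accuracy],
            feed_dict={fast_text.sentence: evalX[start:end],
                       fast_text.labels: evalY[start:end]})
        eval_loss, eval_acc, eval_counter = (eval_loss + curr_loss,
                                             eval_acc + curr_acc,
                                             eval_counter + 1)
    return eval_loss / float(eval_counter), eval_acc / float(eval_counter)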
def main(_):
    #1. load data (X: list of word-index lists, y: int label).
    #if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the (vocabulary-indexed) data from it
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1 == 1:  # placeholder branch paired with the commented-out cache logic above
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index,
                                   vocabulary_word2index_label,
                                   data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500个list.每个list代表一句话
        print("testY.shape:", np.array(testY).shape)  # 2500个label
        print("testX[0]:",
              testX[0])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0

        # 2.Data preprocessing
        # Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                              value=0.)  # padding to max length
        ###############################################################################################
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        ###############################################################################################
    print("testX[0]:", testX[0])
    print("testX[1]:", testX[1])
    #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
    # Converting labels to binary vectors
    print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0
    print("end padding & transform to one hot...")
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        #Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, fast_text)

        curr_epoch = sess.run(fast_text.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch,
                           FLAGS.num_epochs):  #range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [
                        fast_text.loss_val, fast_text.accuracy,
                        fast_text.train_op
                    ],
                    feed_dict={
                        fast_text.sentence: trainX[start:end],
                        fast_text.labels: trainY[start:end]
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY,
                                              batch_size)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))

                #save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(
                    sess, save_path,
                    global_step=fast_text.epoch_step)  #fast_text.epoch_step

        # 5. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY,
                                      batch_size)
        print("Test Loss:%.3f\tTest Accuracy:%.3f" % (test_loss, test_acc))
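# assign_pretrained_word_embedding is not shown either. A minimal sketch,
# assuming pre-trained vectors in word2vec binary format loaded with gensim
# (the file path, the gensim dependency, and fast_text.embed_size are
# assumptions), following the commented-out tf.assign approach in the
# variant above:
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors

def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text,
                                     word2vec_path='word2vec.bin'):  # hypothetical path
    word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    embed_size = fast_text.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # scale for the random fallback vectors
    embedding = np.zeros((vocab_size, embed_size), dtype=np.float32)
    for i in range(1, vocab_size):  # index 0 is the padding token; keep it all-zero
        word = vocabulary_index2word[i]
        if word in word2vec:
            embedding[i] = word2vec[word]
        else:  # no pre-trained vector for this word: small random init
            embedding[i] = np.random.uniform(-bound, bound, embed_size)
    # copy the matrix into the model's embedding variable
    sess.run(tf.assign(fast_text.Embedding, tf.constant(embedding)))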