def main(_):
    # 1. load vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="transformer_classification")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="transformer_classification")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:", len(questionid_question_lists))
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    print("list of total questions2:", len(test))
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # pad to max length
    print("list of total questions3:", len(testX2))
    print("end padding...")
    # 3. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                            FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k,
                            FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training,
                            l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(model.logits,
                              feed_dict={model.input_x: testX2[start:end],
                                         model.dropout_keep_prob: 1.0})  # logits: [batch_size, num_classes]
            # 6. get labels using logits and 7. write question ids with labels to file
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits,
                                         vocabulary_index2word_label, predict_target_file_f)
        predict_target_file_f.close()
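# get_label_using_logits_batch is imported from the shared data-util module and is
# not shown in this file. A minimal sketch of a compatible implementation, assuming
# a top-5 label cutoff and a "question_id,label1,label2,..." output line (both are
# assumptions, not the repo's confirmed behavior):
import numpy as np

def get_label_using_logits_batch(question_id_sublist, logits_batch,
                                 vocabulary_index2word_label, f, top_number=5):
    for question_id, logits in zip(question_id_sublist, logits_batch):
        index_list = np.argsort(logits)[-top_number:][::-1]  # best label first
        label_list = [vocabulary_index2word_label[i] for i in index_list]
        f.write(str(question_id) + "," + ",".join(label_list) + "\n")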
def main(_):
    # 1. load vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="dynamic_memory_network")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="dynamic_memory_network")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # pad to max length
    print("end padding...")
    # 3. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                                     FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length, vocab_size, FLAGS.embed_size,
                                     FLAGS.hidden_size, FLAGS.is_training,
                                     num_pass=FLAGS.num_pass, use_gated_gru=FLAGS.use_gated_gru,
                                     decode_with_sequences=FLAGS.decode_with_sequences,
                                     multi_label_flag=FLAGS.multi_label_flag,
                                     l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint of DynamicMemoryNetwork.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            # model.story expects [batch_size, story_length, sequence_length]
            logits = sess.run(model.logits,
                              feed_dict={model.query: testX2[start:end],
                                         model.story: np.expand_dims(testX2[start:end], axis=1),
                                         model.dropout_keep_prob: 1.0})  # logits: [batch_size, num_classes]
            # 6. get labels using logits and 7. write question ids with labels to file
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits,
                                         vocabulary_index2word_label, predict_target_file_f)
        predict_target_file_f.close()
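# The dynamic memory network reads a story tensor of shape
# [batch_size, story_length, sequence_length]; each padded question is treated as
# a story containing a single fact, so np.expand_dims adds the story axis. A quick
# self-contained check (the concrete sizes are illustrative only):
import numpy as np

batch = np.zeros((64, 100), dtype=np.int32)   # [batch_size, sequence_length]
story = np.expand_dims(batch, axis=1)         # [batch_size, 1, sequence_length]
assert story.shape == (64, 1, 100)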
def main(_):
    # 1. load vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="seq2seq_attention")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("seq2seq_attention.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="seq2seq_attention", use_seq2seq=True)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # pad to max length
    print("end padding...")
    # 3. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        model = seq2seq_attention_model(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                                        FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                        vocab_size, FLAGS.embed_size, FLAGS.hidden_size,
                                        FLAGS.is_training,
                                        decoder_sent_length=FLAGS.decoder_sent_length,
                                        l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        # decoder input is _GO followed by _PAD tokens: shape [1, decoder_sent_length]
        decoder_input = np.reshape(
            np.array([vocabulary_word2index_label[_GO]] +
                     [vocabulary_word2index_label[_PAD]] * (FLAGS.decoder_sent_length - 1)),
            [-1, FLAGS.decoder_sent_length])
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            predictions, logits = sess.run([model.predictions, model.logits],
                                           feed_dict={model.input_x: testX2[start:end],
                                                      model.decoder_input: decoder_input,
                                                      model.dropout_keep_prob: 1})  # shape of logits: (1, 1999)
            # 6. get labels using logits; logits[0] and question_id_list[index]
            # mean this loop effectively assumes a batch size of 1
            predicted_labels = get_label_using_logits(logits[0], predictions,
                                                      vocabulary_index2word_label,
                                                      vocabulary_word2index_label)
            # 7. write question id and labels to file
            write_question_id_with_labels(question_id_list[index], predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
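# In the seq2seq variant the labels come from the decoder's predicted index
# sequence rather than from a top-k over logits. A minimal sketch of a compatible
# get_label_using_logits, assuming the label vocabulary contains an _END marker
# that terminates decoding (an assumption, not confirmed by this file):
import numpy as np

def get_label_using_logits(logits, predictions, vocabulary_index2word_label,
                           vocabulary_word2index_label):
    end_id = vocabulary_word2index_label[_END]  # assumed end-of-sequence label id
    label_list = []
    for index in np.asarray(predictions).flatten():
        if index == end_id:
            break
        label_list.append(vocabulary_index2word_label[index])
    return label_list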
def main(_):
    # 1. load vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # pad to max length
    print("end padding...")
    # 3. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data to get logits, one example at a time
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(fast_text.logits,
                              feed_dict={fast_text.sentence: testX2[start:end]})  # shape of logits: (1, 1999)
            # 6. get labels using logits
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to file
            write_question_id_with_labels(question_id_list[index], predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
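# Both helpers used above live in the shared data-util module. Minimal sketches
# consistent with how they are called here; the top-5 cutoff and the
# comma-separated output format are assumptions:
import numpy as np

def get_label_using_logits(logits, vocabulary_index2word_label, top_number=5):
    index_list = np.argsort(logits)[-top_number:][::-1]  # best label first
    return [vocabulary_index2word_label[i] for i in index_list]

def write_question_id_with_labels(question_id, label_list, f):
    f.write(str(question_id) + "," + ",".join(label_list) + "\n")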
tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") ############################################################################################################################################## filter_sizes=[1,2,3,4,5,6,7]#[1,2,3,4,5,6,7] #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) # 1.load data with vocabulary of words and labels vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn2") vocab_size = len(vocabulary_word2index) vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") questionid_question_lists = load_final_test_data(FLAGS.predict_source_file) test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists) testX = [] question_id_list = [] for tuple in test: question_id, question_string_list = tuple question_id_list.append(question_id) testX.append(question_string_list) # 2.Data preprocessing: Sequence padding print("start padding....") testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length print("end padding...") # 3.create session. config = tf.ConfigProto() config.gpu_options.allow_growth = True graph=tf.Graph().as_default() global sess
def main(_):
    # 1. load vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        simple='simple', word2vec_model_path=word2vec_model_path, name_scope='rnn')
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope='rnn')
    questionid_question_lists = load_final_test_data(predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print('start padding...')
    testX2 = pad_sequences(testX, maxlen=sequence_length, value=0.)
    print('end padding...')
    # 3. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        textRNN = TextRNN(num_classes, learning_rate, batch_size, decay_steps,
                          decay_rate, sequence_length, vocab_size, embed_size,
                          is_training)
        saver = tf.train.Saver()
        if os.path.exists(ckpt_dir + 'checkpoint'):
            print('Restoring Variables from Checkpoint for TextRNN.')
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data
        number_of_training_data = len(testX2)
        print('number_of_training_data:', number_of_training_data)
        predict_target_file_f = codecs.open(predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(textRNN.logits,
                              feed_dict={textRNN.input_x: testX2[start:end],
                                         textRNN.dropout_keep_prob: 1})  # shape of logits: (batch_size, 1999)
            print('start: {} ;end: {}'.format(start, end))
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits,
                                         vocabulary_index2word_label, predict_target_file_f)
        predict_target_file_f.close()
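# The paired-range loop above silently drops the final partial batch whenever the
# number of questions is not a multiple of batch_size. A sketch of one way to also
# score the remainder, to be placed just before predict_target_file_f.close()
# (an addition for illustration, not part of the original script):
if number_of_training_data % batch_size != 0:
    start = (number_of_training_data // batch_size) * batch_size
    logits = sess.run(textRNN.logits,
                      feed_dict={textRNN.input_x: testX2[start:],
                                 textRNN.dropout_keep_prob: 1})
    get_label_using_logits_batch(question_id_list[start:], logits,
                                 vocabulary_index2word_label, predict_target_file_f)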