def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="transformer_classification")
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="transformer_classification")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:", len(questionid_question_lists))
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    print("list of total questions2:", len(test))
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("list of total questions3:", len(testX2))
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(model.logits, feed_dict={model.input_x: testX2[start:end], model.dropout_keep_prob: 1})  # logits: [batch_size, num_classes]
            # 6. get labels using logits, and 7. write question ids and labels to the file system.
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="dynamic_memory_network")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length, vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training, num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru, decode_with_sequences=FLAGS.decode_with_sequences,
                                     multi_label_flag=FLAGS.multi_label_flag, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint of DynamicMemoryNetwork.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            logits = sess.run(model.logits, feed_dict={model.query: testX2[start:end],
                                                       model.story: np.expand_dims(testX2[start:end], axis=1),
                                                       model.dropout_keep_prob: 1.0})  # shape of logits: (batch_size, num_classes)
            # 6. get labels using logits, and 7. write question ids and labels to the file system.
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
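# Sketch only (not the repository's implementation): get_label_using_logits_batch, used by the prediction
# loops above, is assumed to take the top-k label indices per row of logits and write one line per question
# to the output file. The top_number parameter, the numpy argsort approach, and the comma-separated output
# format are assumptions for illustration.
import numpy as np

def get_label_using_logits_batch_sketch(question_id_sublist, logits, vocabulary_index2word_label, f, top_number=5):
    for question_id, logit in zip(question_id_sublist, logits):
        top_indices = np.argsort(logit)[-top_number:][::-1]             # highest-scoring label indices, best first
        labels = [vocabulary_index2word_label[i] for i in top_indices]  # map indices back to label strings
        f.write(question_id + "," + ",".join(labels) + "\n")            # one line per question: id followed by labels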
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)  # TODO
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)  # TODO
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,
                             FLAGS.num_sampled, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(fast_text.logits, feed_dict={fast_text.sentence: testX2[start:end]})  # shape of logits: (1, 1999)
            # 6. get labels using logits
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to the file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,
                             FLAGS.num_sampled, FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(fast_text.logits, feed_dict={fast_text.sentence: testX2[start:end]})  # shape of logits: (1, 1999)
            # 6. get labels using logits
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to the file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
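# Sketch only (not the repository's get_label_using_logits): for the single-example case above it is assumed
# to return the label strings for the top-scoring classes of one logit vector. top_number and the argsort
# approach are illustrative assumptions.
import numpy as np

def get_label_using_logits_sketch(logits, vocabulary_index2word_label, top_number=5):
    index_list = np.argsort(logits)[-top_number:][::-1]          # indices of the highest logits, best first
    return [vocabulary_index2word_label[i] for i in index_list]  # map indices to label ids/strings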
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="seq2seq_attention")
    vocab_size = len(vocabulary_word2index)
    print("seq2seq_attention.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="seq2seq_attention", use_seq2seq=True)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = seq2seq_attention_model(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                        vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,
                                        decoder_sent_length=FLAGS.decoder_sent_length, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        # decoder input for prediction: one _GO token followed by _PAD tokens, shaped [1, decoder_sent_length]
        decoder_input = np.reshape(np.array([vocabulary_word2index_label[_GO]] + [vocabulary_word2index_label[_PAD]] * (FLAGS.decoder_sent_length - 1)),
                                   [-1, FLAGS.decoder_sent_length])
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            predictions, logits = sess.run([model.predictions, model.logits],
                                           feed_dict={model.input_x: testX2[start:end], model.decoder_input: decoder_input, model.dropout_keep_prob: 1})
            # 6. get labels using logits
            predicted_labels = get_label_using_logits(logits[0], predictions, vocabulary_index2word_label, vocabulary_word2index_label)
            # 7. write question id and labels to the file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
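# Sketch only (not the repository's write_question_id_with_labels): the helper used by the predict scripts is
# assumed to serialize one prediction per line, e.g. "question_id,label_1,label_2,...". The separator and the
# label ordering are assumptions.
def write_question_id_with_labels_sketch(question_id, labels_list, f):
    labels_string = ",".join(labels_list)               # join the predicted label strings
    f.write(question_id + "," + labels_string + "\n")   # one line: question id followed by its labels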
def main(_):
    # 1.load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it.
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn2")
        vocab_size = len(vocabulary_word2index)
        print("cnn_model.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = 'training-data/train-zhihu6-title-desc.txt'  # test-zhihu5-only-title-multilabel.txt
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,
                                                  multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test
        # 2.Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file, so it can be reused quickly next time.
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("trainX[0]:", trainX[0])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training, multi_label_flag=FLAGS.multi_label_flag)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {textCNN.input_x: trainX[start:end], textCNN.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run([textCNN.loss_val, textCNN.accuracy, textCNN.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY, batch_size, vocabulary_index2word_label)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5.finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, textCNN, testX, testY, batch_size, vocabulary_index2word_label)
    pass
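# Sketch only (not the repository's do_eval): an evaluation loop of roughly this shape is assumed, feeding
# the held-out set batch by batch with dropout disabled and averaging loss and accuracy. The feeds mirror
# the training loop above (input_y vs. input_y_multilabel); the signature and everything else are assumptions.
def do_eval_sketch(sess, model, evalX, evalY, batch_size, multi_label_flag=True):
    number_examples = len(evalX)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, number_examples, batch_size), range(batch_size, number_examples, batch_size)):
        feed_dict = {model.input_x: evalX[start:end], model.dropout_keep_prob: 1.0}  # no dropout at eval time
        if multi_label_flag:
            feed_dict[model.input_y_multilabel] = evalY[start:end]
        else:
            feed_dict[model.input_y] = evalY[start:end]
        curr_loss, curr_acc = sess.run([model.loss_val, model.accuracy], feed_dict)
        eval_loss, eval_acc, eval_counter = eval_loss + curr_loss, eval_acc + curr_acc, eval_counter + 1
    return eval_loss / float(eval_counter), eval_acc / float(eval_counter)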
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="transformer")
    vocab_size = len(vocabulary_word2index)
    print("transformer.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="transformer", use_seq2seq=True)
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, vocab_size,
                            FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training,
                            decoder_sent_length=FLAGS.decoder_sent_length, l2_lambda=FLAGS.l2_lambda)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        # decoder input: _GO in the first column, _PAD everywhere else, shaped [batch_size, decoder_sent_length]
        # (an earlier single-row variant: np.reshape(np.array([_GO] + [_PAD]*(decoder_sent_length-1)), [-1, decoder_sent_length]))
        decoder_input = np.full((FLAGS.batch_size, FLAGS.decoder_sent_length), vocabulary_word2index_label[_PAD])
        decoder_input[:, 0:1] = vocabulary_word2index_label[_GO]  # set all values in the first column to _GO
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),
                              range(FLAGS.batch_size, number_of_training_data + 1, FLAGS.batch_size)):
            predictions, logits = sess.run([model.predictions, model.logits],
                                           feed_dict={model.input_x: testX2[start:end], model.decoder_input: decoder_input, model.dropout_keep_prob: 1})
            # Alternative (kept commented out): decode step by step, feeding each predicted token back into decoder_input.
            # for j in range(FLAGS.decoder_sent_length):
            #     predict = sess.run(model.predictions,
            #                        feed_dict={model.input_x: testX2[start:end], model.decoder_input: decoder_input, model.dropout_keep_prob: 1.0})
            #     decoder_input[:, j] = predict[:, j]
            # 6. get labels using logits
            for logit in logits:
                predicted_labels = get_label_using_logits(logit, predictions, vocabulary_index2word_label, vocabulary_word2index_label)
                print(index, " ;predicted_labels:", predicted_labels)
                # 7. write question id and labels to the file system.
                write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
                index = index + 1
        predict_target_file_f.close()
def main(_):
    # 1.load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it.
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="dynamic_memory_network")
        vocab_size = len(vocabulary_word2index)
        print("dynamic_memory_network.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = '../training-data/train-zhihu6-title-desc.txt'  # change this line to train on a small dataset, e.g. 'test-zhihu6-title-desc.txt'
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,
                                                  multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test
        print("trainY:", trainY[0:10])
        # 2.Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file, so it can be reused quickly next time.
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("trainX[0]:", trainX[0])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length, vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training, num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru, decode_with_sequences=FLAGS.decode_with_sequences,
                                     multi_label_flag=FLAGS.multi_label_flag, l2_lambda=FLAGS.l2_lambda)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {model.query: trainX[start:end], model.story: np.expand_dims(trainX[start:end], axis=1), model.dropout_keep_prob: 1.0}
                if not FLAGS.multi_label_flag:
                    feed_dict[model.answer_single] = trainY[start:end]
                else:
                    feed_dict[model.answer_multilabel] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run([model.loss_val, model.accuracy, model.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, math.exp(loss / float(counter)) if (loss / float(counter)) < 20 else 10000.000, acc / float(counter)))
                ## VALIDATION PART ##########################################################################################################
                if FLAGS.batch_size != 0 and (start % (FLAGS.validate_step * FLAGS.batch_size) == 0):  # alternative condition: epoch % FLAGS.validate_every == 0
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size, vocabulary_index2word_label)
                    print("dynamic_memory_network[use_gated_gru=False,num_pass=2].validation.part. previous_eval_loss:",
                          math.exp(previous_eval_loss) if previous_eval_loss < 20 else 10000.000,
                          ";current_eval_loss:", math.exp(eval_loss) if eval_loss < 20 else 10000.000)
                    if eval_loss > previous_eval_loss:  # loss is not decreasing: reduce the learning rate by a factor of 0.5
                        print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.going to reduce the learning rate.")
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.learning_rate1:", learning_rate1,
                              " ;learning_rate2:", learning_rate2)
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>going to save the model.eval_loss:",
                                  math.exp(eval_loss) if eval_loss < 20 else 10000.000,
                                  ";best_eval_loss:", math.exp(best_eval_loss) if best_eval_loss < 20 else 10000.000)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                ## VALIDATION PART ##########################################################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
        # 5.finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size, vocabulary_index2word_label)
    pass
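# Sketch only (not the repository's model code): the learning_rate_decay_half_op run above when validation
# loss stops improving is assumed to be a TF1 assign op that halves a non-trainable learning-rate variable,
# roughly as below. Variable names are illustrative.
import tensorflow as tf

learning_rate = tf.Variable(0.001, trainable=False, name="learning_rate")
learning_rate_decay_half_op = tf.assign(learning_rate, learning_rate * 0.5)  # running this op halves the current rate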
def main(_):
    # 1.load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it.
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="rcnn")
        vocab_size = len(vocabulary_word2index)
        print("rcnn_model.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="rcnn")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = 'training-data/train-zhihu6-title-desc.txt'  # test-zhihu5-only-title-multilabel.txt
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,
                                                  multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test
        # 2.Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file, so it can be reused quickly next time.
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("trainX[0]:", trainX[0])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textRCNN = TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.is_training, FLAGS.batch_size, multi_label_flag=FLAGS.multi_label_flag)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textRCNN, word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textRCNN.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {textRCNN.input_x: trainX[start:end], textRCNN.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textRCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textRCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run([textRCNN.loss_val, textRCNN.accuracy, textRCNN.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(textRCNN.epoch_increment)
            # 4.validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textRCNN, testX, testY, batch_size, vocabulary_index2word_label)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
        # 5.finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, textRCNN, testX, testY, batch_size, vocabulary_index2word_label)
    pass
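# Sketch only (not the repository's load_data_multilabel_new): for the multi-label branch above, each
# example's list of label indices is assumed to be converted into a k-hot target vector over num_classes,
# roughly as below. The function name and signature are illustrative.
import numpy as np

def labels_to_k_hot_sketch(label_index_list, num_classes):
    y = np.zeros(num_classes, dtype=np.float32)  # start from an all-zero target vector
    for label_index in label_index_list:
        y[label_index] = 1.0                     # set a 1 for every label assigned to this example
    return y

# e.g. labels_to_k_hot_sketch([3, 7], 10) -> [0,0,0,1,0,0,0,1,0,0]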
def main(_):
    # 1.load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data from it.
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path, name_scope="transformer_classification")
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="transformer_classification")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = 'training-data/test-zhihu6-title-desc.txt'  # one record looks like: 'w35620 w1097 w111 c278 c150 c150 c285 c278 c43 __label__7756633728210171144 3195914392210930723'
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,
                                                  multi_label_flag=FLAGS.multi_label_flag, traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test
        print("trainY:", trainY[0:10])
        # 2.Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file, so it can be reused quickly next time.
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("trainX[0]:", trainX[0])
        print("end padding & transform to one hot...")
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        model = Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                            vocab_size, FLAGS.embed_size, FLAGS.d_model, FLAGS.d_k, FLAGS.d_v, FLAGS.h, FLAGS.num_layer, FLAGS.is_training, l2_lambda=FLAGS.l2_lambda)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {model.input_x: trainX[start:end], model.dropout_keep_prob: 0.5}
                feed_dict[model.input_y_label] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run([model.loss_val, model.accuracy, model.train_op], feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print("transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))
                ## VALIDATION PART ##########################################################################################################
                if FLAGS.batch_size != 0 and (start % (FLAGS.validate_step * FLAGS.batch_size) == 0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size, vocabulary_index2word_label)
                    print("transformer.classification.validation.part. previous_eval_loss:", previous_eval_loss, ";current_eval_loss:", eval_loss)
                    if eval_loss > previous_eval_loss:  # loss is not decreasing: reduce the learning rate by a factor of 0.5
                        print("transformer.classification.==>validation.part.going to reduce the learning rate.")
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print("transformer.classification==>validation.part.learning_rate1:", learning_rate1, " ;learning_rate2:", learning_rate2)
                        # print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print("transformer.classification==>going to save the model.eval_loss:", eval_loss, ";best_eval_loss:", best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                ## VALIDATION PART ##########################################################################################################
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
        # 5.finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size, vocabulary_index2word_label)
    pass
tf.app.flags.DEFINE_string("predict_source_file", 'test-zhihu-forpredict-title-desc-v6.txt', "source file path for final prediction")  # test-zhihu-forpredict-v4only-title.txt
tf.app.flags.DEFINE_string("word2vec_model_path", "zhihu-word2vec-title-desc.bin-100", "word2vec's vocabulary and vectors")  # zhihu-word2vec.bin-100
tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters")  # 128
tf.app.flags.DEFINE_string("ckpt_dir2", "text_cnn_title_desc_checkpoint_exp/", "checkpoint location for the model")
# tf.app.flags.DEFINE_boolean("multi_label_flag", True, "use multi label or single label.")
##############################################################################################################################################
filter_sizes = [1, 2, 3, 4, 5, 6, 7]
# 1.load data (X: list of int, y: int). 2.create session. 3.feed data. 4.training (5.validation), (6.prediction)
# 1.load data with vocabulary of words and labels
vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=FLAGS.word2vec_model_path, name_scope="cnn2")
vocab_size = len(vocabulary_word2index)
vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2")
questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
testX = []
question_id_list = []
for question_id, question_string_list in test:
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
print("end padding...")
# 3.create session.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
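# Illustration only: pad_sequences (tflearn-style, as these scripts appear to use) turns variable-length index
# lists into a fixed-width matrix of shape [n, maxlen], padding short sequences with `value` and truncating
# long ones. Whether padding/truncation happens at the front or the back depends on the implementation's
# defaults, so the expected output below is an assumption.
from tflearn.data_utils import pad_sequences

example = pad_sequences([[5, 8, 2], [4, 9, 1, 7, 6, 3]], maxlen=5, value=0.)
# expected shape: (2, 5); e.g. [[5, 8, 2, 0, 0], [4, 9, 1, 7, 6]] with post-padding/post-truncation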
def main(_):
    # 1. load data
    if 1 == 1:
        # 1. get vocabulary of words and labels.
        # trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=word2vec_model_path, name_scope='rnn')
        vocab_size = len(vocabulary_word2index)
        print('rnn_model.vocab_size:', vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope='rnn', voabulary_label=traing_data_path)
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label, multi_label_flag=False, traning_data_path=traing_data_path)
        trainX, trainY = train
        testX, testY = test
        # 2. data preprocessing: sequence padding
        print('start padding & transform to one hot ...')
        trainX = pad_sequences(trainX, maxlen=sequence_length, value=0.0)  # padding to max length
        testX = pad_sequences(testX, maxlen=sequence_length, value=0.0)
        print('trainX[0]:', trainX[0])
        # convert labels to binary vectors
        print('end padding & transform to one hot ...')
    # 2. create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory dynamically, taking only as much as is needed
    with tf.Session(config=config) as sess:
        # instantiate model
        textRNN = TextRNN(num_classes, learning_rate, batch_size, decay_steps, decay_rate, sequence_length, vocab_size, embed_size, is_training)
        saver = tf.train.Saver()
        if os.path.exists(ckpt_dir + 'checkpoint'):
            print('Restoring Variables from Checkpoint for rnn model.')
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))  # TODO: how does it locate the most recently saved checkpoint file?
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textRNN, word2vec_model_path=word2vec_model_path)
        curr_epoch = sess.run(textRNN.epoch_step)
        # 3. feed data & training
        number_of_training_data = len(trainX)
        for epoch in range(curr_epoch, num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print('trainX[start:end]:', trainX[start:end])
                curr_loss, curr_acc, _ = sess.run([textRNN.loss_val, textRNN.accuracy, textRNN.train_op],
                                                  feed_dict={textRNN.input_x: trainX[start:end], textRNN.input_y: trainY[start:end],
                                                             textRNN.dropout_keep_prob: 1.0})
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 500 == 0:
                    print('Epoch {} \tBatch {}\tTrain Loss:{:.3}\tTrain Accuracy:{:.3}'.format(epoch, counter, loss / float(counter), acc / float(counter)))
            # epoch increment
            print('going to increment epoch counter ...')
            sess.run(textRNN.epoch_increament)
            # 4. validation
            print(epoch, validation_every, (epoch % validation_every == 0))
            if epoch % validation_every == 0:
                eval_loss, eval_acc = do_eval(sess, textRNN, testX, testY, batch_size, vocabulary_index2word_label)
                print('Epoch {} Validation Loss: {:.3} \tValidation Accuracy:{:.3}'.format(epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = ckpt_dir + 'model.ckpt'
                # saver.save(sess, save_path, global_step=epoch)
                saver.save(sess, save_path, global_step=textRNN.global_step)
        # 5. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, textRNN, testX, testY, batch_size, vocabulary_index2word_label)
    pass
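# Sketch only (not the repository's assign_pretrained_word_embedding): the helper used above is assumed to
# load word2vec vectors, build a [vocab_size, embed_size] matrix (random for out-of-vocabulary words), and
# assign it to the model's embedding variable. The `word2vec` package, model.Embedding, and embed_size=100
# are assumptions for illustration.
import numpy as np
import word2vec  # these scripts appear to rely on the `word2vec` package for the .bin files

def assign_pretrained_word_embedding_sketch(sess, vocabulary_index2word, vocab_size, model,
                                            word2vec_model_path, embed_size=100):
    w2v = word2vec.load(word2vec_model_path)                                # load pre-trained vectors
    word2vec_dict = {w: v for w, v in zip(w2v.vocab, w2v.vectors)}
    embedding = np.random.uniform(-0.1, 0.1, (vocab_size, embed_size)).astype(np.float32)
    for i in range(vocab_size):
        word = vocabulary_index2word[i]
        if word in word2vec_dict:
            embedding[i] = word2vec_dict[word]                              # use the pre-trained vector when available
    sess.run(model.Embedding.assign(embedding))                             # overwrite the randomly initialized embedding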
def main(_):
    # 1. load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', word2vec_model_path=word2vec_model_path, name_scope='rnn')
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope='rnn')
    questionid_question_lists = load_final_test_data(predict_source_file)
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. data preprocessing: sequence padding
    print('start padding...')
    testX2 = pad_sequences(testX, maxlen=sequence_length, value=0.)
    print('end padding...')
    # 3. create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. instantiate model
        textRNN = TextRNN(num_classes, learning_rate, batch_size, decay_steps, decay_rate, sequence_length, vocab_size, embed_size, is_training)
        saver = tf.train.Saver()
        if os.path.exists(ckpt_dir + 'checkpoint'):
            print('Restore Variables from Checkpoint for TextRNN')
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return
        # 5. feed data
        number_of_training_data = len(testX2)
        print('number_of_training_data:', number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),
                              range(batch_size, number_of_training_data + 1, batch_size)):
            logits = sess.run(textRNN.logits, feed_dict={textRNN.input_x: testX2[start:end], textRNN.dropout_keep_prob: 1})  # shape of logits: (1, 1999)
            print('start: {} ;end: {}'.format(start, end))
            question_id_sublist = question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()