# Assumed imports for the scripts collected below (each `def main(_)` comes from
# a separate script in this repo). The helpers (create_voabulary,
# create_voabulary_label, load_final_test_data, load_data_predict, load_data,
# assign_pretrained_word_embedding, do_eval), the data_helpers module, and the
# fastText model class live in companion modules of the repo; pad_sequences is
# assumed to be tflearn's (Keras's version has the same call signature).
import os
import codecs
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn  # VocabularyProcessor
from tflearn.data_utils import pad_sequences

# ---- prediction script ----
def main(_):
    # 1. load data together with the word and label vocabularies
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)  # TODO
    test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)  # TODO
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2. Data preprocessing: sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")
    # 3. create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint, stopping.")
            return
        # 5. feed data to get logits
        number_of_test_data = len(testX2)
        print("number_of_test_data:", number_of_test_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_test_data, batch_size),
                              range(batch_size, number_of_test_data + 1, batch_size)):
            logits = sess.run(fast_text.logits,
                              feed_dict={fast_text.sentence: testX2[start:end]})  # shape of logits: (1, 1999)
            # 6. get labels from logits
            predicted_labels = get_label_using_logits(logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to the file system.
            write_question_id_with_labels(question_id_list[index], predicted_labels, predict_target_file_f)
            index += 1
        predict_target_file_f.close()
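# NOTE: get_label_using_logits and write_question_id_with_labels are called by
# the prediction script above but not defined in this file. Minimal sketches of
# what they might look like, inferred from the call sites; the top_number
# parameter and the CSV-style output format are assumptions, not the repo's
# confirmed behavior.
def get_label_using_logits(logits, vocabulary_index2word_label, top_number=5):
    index_list = np.argsort(logits)[-top_number:]  # indices of the top_number largest logits
    index_list = index_list[::-1]  # highest-scoring label first
    return [vocabulary_index2word_label[i] for i in index_list]

def write_question_id_with_labels(question_id, labels_list, f):
    # one line per question: the id followed by its predicted labels
    f.write(question_id + "," + ",".join(labels_list) + "\n")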
# ---- training script, variant using tf.contrib.learn's VocabularyProcessor ----
def main(_):
    # 1. load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache exists on disk, load the (vocabulary-indexed) data
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #     vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab'
        x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file)
        # One-time build step for the vocabulary (restored below):
        # vocab_processor = learn.preprocessing.VocabularyProcessor(2000, min_frequency=1)
        # x = np.array(list(vocab_processor.fit_transform(x_train)))
        # vocab_processor.save(vocab_processor_path)
        vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path)
        x = np.array(list(vocab_processor.transform(x_train)))
        trainX = x[:80000]
        testX = x[80000:]
        trainY = y[:80000]
        testY = y[80000:]
        vocab_size = len(vocab_processor.vocabulary_)
        print('vocab_size', vocab_size)
    # 2. create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                # word_embedding = tf.constant(trainX, dtype=tf.float32)  # convert to tensor
                # t_assign_embedding = tf.assign(fast_text.Embedding, word_embedding)  # assign to the model's embedding variable
                # sess.run(t_assign_embedding)
                assign_pretrained_word_embedding(sess, trainX, vocab_size, fast_text)
        curr_epoch = sess.run(fast_text.epoch_step)
        # 3. feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                    feed_dict={fast_text.sentence: trainX[start:end],
                               fast_text.labels: trainY[start:end]})
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 10 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)
            # 4. validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)
        # 5. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size)
    pass
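# NOTE: the commented-out lines in the script above show the one-time build
# step for the vocabulary that it later restores. A minimal standalone sketch
# of that fit/save/restore round trip with tf.contrib.learn's
# VocabularyProcessor; the 2000/min_frequency=1 settings mirror the
# commented-out code, and the texts are purely illustrative.
def build_and_restore_vocab_demo(vocab_processor_path):
    texts = ["the quick brown fox", "jumped over the lazy dog"]
    # fit: build the vocabulary and map each text to a fixed-length id sequence
    vocab_processor = learn.preprocessing.VocabularyProcessor(2000, min_frequency=1)
    x = np.array(list(vocab_processor.fit_transform(texts)))
    vocab_processor.save(vocab_processor_path)
    # restore: reload the same vocabulary and transform text consistently
    restored = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path)
    x2 = np.array(list(restored.transform(texts)))
    assert (x == x2).all()  # same vocabulary, same id sequences
    return len(restored.vocabulary_)  # vocab_size, as used in main()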
# ---- training script, variant using the repo's own vocabulary helpers ----
def main(_):
    # 1. load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache exists on disk, load the (vocabulary-indexed) data
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #     vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label, data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500 lists; each list represents one sentence
        print("testY.shape:", np.array(testY).shape)  # 2500 labels
        print("testX[0]:", testX[0])  # e.g. [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # e.g. 0
        # 2. Data preprocessing: sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save data to a cache file so it can be reloaded quickly next time
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("testX[0]:", testX[0])
        print("testX[1]:", testX[1])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        # Converting labels to binary vectors
        print("testY[0]:", testY[0])  # e.g. 0
        print("end padding & transform to one hot...")
    # 3. create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size,
                             FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text)
        curr_epoch = sess.run(fast_text.epoch_step)
        # 4. feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),
                                  range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                    feed_dict={fast_text.sentence: trainX[start:end],
                               fast_text.labels: trainY[start:end]})
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))
            # epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)
            # 5. validation
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)
        # 6. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size)
    pass
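# NOTE: do_eval is called by both training scripts but not defined in this
# file. A minimal sketch of a batched evaluation loop, assuming the model
# exposes the same loss_val and accuracy tensors used in the training loop
# (a plausible reconstruction, not the repo's confirmed implementation):
def do_eval(sess, fast_text, evalX, evalY, batch_size):
    number_examples = len(evalX)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, number_examples, batch_size),
                          range(batch_size, number_examples, batch_size)):
        curr_eval_loss, curr_eval_acc = sess.run(
            [fast_text.loss_val, fast_text.accuracy],
            feed_dict={fast_text.sentence: evalX[start:end],
                       fast_text.labels: evalY[start:end]})
        eval_loss += curr_eval_loss
        eval_acc += curr_eval_acc
        eval_counter += 1
    # average loss/accuracy over the evaluation batches
    return eval_loss / float(eval_counter), eval_acc / float(eval_counter)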