def char_index(p_sentences, h_sentences):
    """Convert paired premise/hypothesis sentences into padded char-index sequences."""
    word2idx, idx2word = load_char_vocab()

    p_list, h_list = [], []
    for p_sentence, h_sentence in zip(p_sentences, h_sentences):
        # Keep only non-blank tokens that exist in the vocabulary.
        p = [word2idx[word.lower()] for word in p_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        h = [word2idx[word.lower()] for word in h_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        p_list.append(p)
        h_list.append(h)

    p_list = pad_sequences(p_list, maxlen=ESIMConfig().maxlen)
    h_list = pad_sequences(h_list, maxlen=ESIMConfig().maxlen)
    return p_list, h_list
def word_index(p_sentences, h_sentences):
    """Convert paired premise/hypothesis sentences into padded word-index sequences."""
    word2idx, idx2word = load_word_vocab()

    p_list, h_list = [], []
    for p_sentence, h_sentence in zip(p_sentences, h_sentences):
        # Keep only non-blank tokens that exist in the vocabulary.
        p = [word2idx[word.lower()] for word in p_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        h = [word2idx[word.lower()] for word in h_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        p_list.append(p)
        h_list.append(h)

    p_list = pad_sequences(p_list, maxlen=args.seq_length)
    h_list = pad_sequences(h_list, maxlen=args.seq_length)
    return p_list, h_list
def char_index_single(sentences):
    """Convert a single list of sentences into padded char-index sequences."""
    word2idx, idx2word = load_char_vocab()

    p_list = []
    for sentence in sentences:
        p = [word2idx[word.lower()] for word in sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        p_list.append(p)

    p_list = pad_sequences(p_list, maxlen=ESIMConfig().maxlen)
    return p_list
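# A minimal, self-contained usage sketch for the indexing helpers above. The
# toy vocabulary and maxlen stand in for load_char_vocab() / load_word_vocab()
# and the project's ESIMConfig().maxlen / args.seq_length, which are not shown
# here; pad_sequences is assumed to be the Keras helper (zero pre-padding).
from tensorflow.keras.preprocessing.sequence import pad_sequences

word2idx = {"i": 1, "like": 2, "cats": 3, "dogs": 4}  # hypothetical vocabulary

p_sentences = [["I", "like", "cats"]]
h_sentences = [["I", "like", "dogs", "too"]]  # "too" is out-of-vocabulary

p = [[word2idx[w.lower()] for w in s if w.strip() and w.lower() in word2idx]
     for s in p_sentences]
h = [[word2idx[w.lower()] for w in s if w.strip() and w.lower() in word2idx]
     for s in h_sentences]

print(pad_sequences(p, maxlen=6))  # [[0 0 0 1 2 3]] -- OOV words dropped, zero pre-padded
print(pad_sequences(h, maxlen=6))  # [[0 0 0 1 2 4]]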
def main(config, eval_folder):
    # Load the vocab files.
    text_words_vocab = load_vocab(config.text_words_path)
    text_chars_vocab = load_vocab(config.text_chars_path)
    inv_text_vocab = {v: k for k, v in text_words_vocab.items()}

    # Get the word-processing function.
    processing_word = get_processing_word(text_words_vocab,
                                          text_chars_vocab,
                                          lowercase=True,
                                          chars=True)

    # Load pre-trained word features.
    word_features = get_trimmed_features(config.word_embeddings_trimmed_path)

    examples = read_examples(eval_folder, processing_word)

    # Build WImpModel.
    model = WImpModel(config, word_features, None,
                      text_words_vocab["$UNK$"], inv_text_vocab, None)
    model.build_graph()

    words, word_feats, speech_interval_feats = [], [], []
    for sent_key in examples:
        words_, word_feats_, speech_feats_ = zip(*examples[sent_key])
        word_feats_ = list(zip(*word_feats_))
        word_feats.append(word_feats_)
        speech_interval_feats.append(speech_feats_)
        words.append(words_)

    # Pad interval features at two levels (intervals within words, words within sentences).
    speech_interval_feats_pad_, speech_lengths = pad_sequences(
        speech_interval_feats,
        pad_tok=[0] * config.speech_features_dim,
        nlevels=2)
    speech_feats = speech_interval_feats_pad_[:, :, :, config.speech_lexical_features_dim:]
    speech_lexical_feats = speech_interval_feats_pad_[:, :, 0, :config.speech_lexical_features_dim]

    feed, sequence_lengths = model.get_feed_dict(words=word_feats, dropout=1.0)
    feed[model.speech_features] = speech_feats
    feed[model.speech_lexical_features] = speech_lexical_feats
    feed[model.speech_lengths] = speech_lengths
    predictions = model.test(feed)

    print("\n")
    print("WORD IMPORTANCE PREDICTION OUTPUT")
    print("=================================")
    for sent_id in range(len(words)):
        # Scores for this sentence, truncated to its true (unpadded) length.
        scores = predictions[sent_id][:sequence_lengths[sent_id]]
        tokens = words[sent_id]
        result = ["%s (%f)" % (w, s) for w, s in zip(tokens, scores)]
        print("--> " + " ".join(result) + "\n")
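# Shape sketch for the feature slicing above, under the assumption that the
# two-level padding yields an array of shape (sentences, max_words,
# max_intervals, speech_features_dim) whose first speech_lexical_features_dim
# columns are per-word lexical features; the sizes below are made up.
import numpy as np

lex_dim = 3
padded = np.zeros((2, 5, 4, 10))  # 2 sentences, 5 words, 4 intervals, 10 features

speech_feats = padded[:, :, :, lex_dim:]          # acoustic part: (2, 5, 4, 7)
speech_lexical_feats = padded[:, :, 0, :lex_dim]  # lexical part, once per word (interval 0): (2, 5, 3)
print(speech_feats.shape, speech_lexical_feats.shape)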
def get_feed_dict(self, words, speech=None, labels=None, lr=None, dropout=None):
    feed = {}

    if speech is not None:
        speech_interval_feats = get_features(speech, self.config.speech_features)
        # Pad at two levels: intervals within a word, then words within a sentence.
        speech_interval_feats_pad_, speech_lengths = pad_sequences(
            speech_interval_feats,
            pad_tok=[0] * self.config.speech_features_dim,
            nlevels=2)
        speech_feats = speech_interval_feats_pad_[:, :, :, self.config.speech_lexical_features_dim:]
        speech_lexical_feats = speech_interval_feats_pad_[:, :, 0, :self.config.speech_lexical_features_dim]
        feed[self.speech_features] = speech_feats
        feed[self.speech_lexical_features] = speech_lexical_feats
        feed[self.speech_lengths] = speech_lengths

    char_ids, word_ids = list(zip(*words))
    word_ids, sequence_lengths = pad_sequences(word_ids, 0)
    char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)
    feed[self.word_ids] = word_ids
    feed[self.char_ids] = char_ids
    feed[self.sequence_lengths] = sequence_lengths
    feed[self.word_lengths] = word_lengths

    if labels is not None:
        labels, _ = pad_sequences(labels, 0)
        feed[self.labels] = labels

    if lr is not None:
        feed[self.lr] = lr

    if dropout is not None:
        feed[self.dropout] = dropout

    return feed, sequence_lengths
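# get_feed_dict relies on a pad_sequences(sequences, pad_tok, nlevels) helper
# that also returns the original lengths; this is a hedged, list-based
# reimplementation sketch of the assumed semantics (the project's own version
# is not shown and may differ, e.g. by returning numpy arrays).
def pad_sequences_sketch(sequences, pad_tok, nlevels=1):
    if nlevels == 1:
        max_len = max(len(seq) for seq in sequences)
        padded = [list(seq) + [pad_tok] * (max_len - len(seq)) for seq in sequences]
        lengths = [len(seq) for seq in sequences]
        return padded, lengths
    # nlevels == 2: pad the inner sequences to a common length first,
    # then pad the outer level with all-pad inner sequences.
    max_inner = max(len(inner) for seq in sequences for inner in seq)
    max_outer = max(len(seq) for seq in sequences)
    padded, lengths = [], []
    for seq in sequences:
        inner_padded = [list(inner) + [pad_tok] * (max_inner - len(inner)) for inner in seq]
        inner_padded += [[pad_tok] * max_inner] * (max_outer - len(seq))
        padded.append(inner_padded)
        lengths.append([len(inner) for inner in seq] + [0] * (max_outer - len(seq)))
    return padded, lengths

# Example: char ids for two sentences of unequal length.
chars, lens = pad_sequences_sketch([[[1, 2], [3]], [[4]]], pad_tok=0, nlevels=2)
print(chars)  # [[[1, 2], [3, 0]], [[4, 0], [0, 0]]]
print(lens)   # [[2, 1], [1, 0]]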
def seq_index(p_sentences):
    """Convert sentences into char-index sequences padded to a fixed length of 15."""
    word2idx, idx2word = load_char_vocab()

    p_list = []
    for p_sentence in p_sentences:
        p = [word2idx[word.lower()] for word in p_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        p_list.append(p)

    p_list = pad_sequences(p_list, maxlen=15)
    return p_list
def main(_):
    # 1. Load data (X: list of int, y: int).
    # if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the
    #                                       # vocabulary-indexed data from it
    #     with open(FLAGS.cache_path, 'r') as data_f:
    #         trainX, trainY, testX, testY, vocabulary_index2word = pickle.load(data_f)
    #         vocab_size = len(vocabulary_index2word)
    # else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index,
                                   vocabulary_word2index_label,
                                   data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500 lists, one per sentence
        print("testY.shape:", np.array(testY).shape)  # 2500 labels
        print("testX[0]:", testX[0])  # e.g. [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # e.g. 0

        # 2. Data preprocessing: sequence padding.
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # pad to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)    # pad to max length
        # with open(FLAGS.cache_path, 'w') as data_f:  # save to a cache file so the next run is fast
        #     pickle.dump((trainX, trainY, testX, testY, vocabulary_index2word), data_f)
        print("testX[0]:", testX[0])
        print("testX[1]:", testX[1])
        # Converting labels to binary vectors.
        print("testY[0]:", testY[0])  # e.g. 0
        print("end padding & transform to one hot...")

    # 2. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate the model.
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size,
                             FLAGS.embed_size, FLAGS.is_training)
        # Initialize or restore variables.
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embeddings
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, fast_text)
        curr_epoch = sess.run(fast_text.epoch_step)

        # 3. Feed data & train.
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                    feed_dict={fast_text.sentence: trainX[start:end],
                               fast_text.labels: trainY[start:end]})
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          % (epoch, counter, loss / float(counter), acc / float(counter)))

            # Increment the epoch counter.
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4. Validation.
            print(epoch, FLAGS.validate_every, (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                      % (epoch, eval_loss, eval_acc))
                # Save model to checkpoint.
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)

        # 5. Finally, evaluate on the test set and report test accuracy.
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size)
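# A quick self-contained check of the zip-of-ranges batching pattern used in
# the training loop above: when batch_size does not divide the data size, the
# final partial batch is silently skipped.
n_examples, bs = 10, 4
batches = list(zip(range(0, n_examples, bs), range(bs, n_examples, bs)))
print(batches)  # [(0, 4), (4, 8)] -- examples 8 and 9 are never trained on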
def main(_):
    # 1. Load data with vocabularies for words and labels.
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index,
                             vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2. Data preprocessing: sequence padding.
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # pad to max length
    print("end padding...")

    # 3. Create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4. Instantiate the model.
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size,
                             FLAGS.embed_size, FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint. Going to stop.")
            return

        # 5. Feed data to get logits.
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(
                range(0, number_of_training_data, batch_size),
                range(batch_size, number_of_training_data + 1, batch_size)):
            # logits has shape (1, label_size), e.g. (1, 1999).
            logits = sess.run(fast_text.logits,
                              feed_dict={fast_text.sentence: testX2[start:end]})
            # 6. Get labels from the logits.
            predicted_labels = get_label_using_logits(logits[0],
                                                      vocabulary_index2word_label)
            # 7. Write question id and labels to the file system.
            write_question_id_with_labels(question_id_list[index],
                                          predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
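# get_label_using_logits is not defined in this file; a plausible top-k sketch
# under the assumption that it ranks label indices by logit score and maps
# them back through vocabulary_index2word_label (the name suffix and the
# top_number parameter are illustrative, not the project's confirmed API).
import numpy as np

def get_label_using_logits_sketch(logits, vocabulary_index2word_label, top_number=5):
    top_indices = np.argsort(logits)[-top_number:][::-1]  # highest logits first
    return [vocabulary_index2word_label[i] for i in top_indices]

print(get_label_using_logits_sketch(
    np.array([0.1, 2.5, -0.3, 1.7]),
    {0: "tech", 1: "sports", 2: "finance", 3: "travel"},
    top_number=2))  # ['sports', 'travel']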