def scores(self, data_dir, fquery, freply, fqvocab, frvocab, init=False):
    if not init:
        self.init_model()
    queries = data_helpers.load_file(data_dir, fquery)
    replies = data_helpers.load_file(data_dir, freply)
    data_size = len(queries)
    qvocab = data_helpers.load_vocab(data_dir, fqvocab)
    rvocab = data_helpers.load_vocab(data_dir, frvocab)

    scores = []
    with self.session.as_default():
        for query, reply in zip(queries, replies):
            ql, qids = data_helpers.transform_to_id(qvocab, query, self.qmax_length)
            rl, rids = data_helpers.transform_to_id(rvocab, reply, self.rmax_length)
            feed_dict = self.make_input_feed([qids], [ql], [rids], [rl], training=False)
            score = self.session.run(self.pos_score, feed_dict)
            scores.append(score[0])
    # Debug:
    # for i, s in enumerate(scores):
    #     print(i, s)
    return scores
def get_scores(self, query_file, reply_file, query_vocab_file, reply_vocab_file, init=False):
    if not init:
        self.init_model()
    queries = data_helpers.load_file(query_file)
    replies = data_helpers.load_file(reply_file)
    query_vocab = data_helpers.load_vocab(query_vocab_file)
    reply_vocab = data_helpers.load_vocab(reply_vocab_file)

    scores = []
    logger.info('looping over query-reply pairs')
    with self.session.as_default():
        for query, reply in zip(queries, replies):
            q_len, q_ids = data_helpers.transform_to_id(query_vocab, query,
                                                        self.query_max_length)
            r_len, r_ids = data_helpers.transform_to_id(reply_vocab, reply,
                                                        self.reply_max_length)
            feed_dict = self.make_input_feed([q_ids], [q_len], [r_ids], [r_len],
                                             training=False)
            # When training=False there is no neg_score (and hence no pos_score),
            # so use self.score instead.
            score = self.session.run(self.score, feed_dict)
            scores.append(float(score[0]))
    return scores
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100        # Word embedding dimension
    epochs = 10
    batch_size = 64             # Batch data size
    rnn_size = 50               # Number of hidden layer neurons
    sequence_length = 300       # Sentence length
    learning_rate = 0.01        # Learning rate
    lrdownRate = 0.9            # Learning-rate decay factor per epoch
    margin = 0.1
    attention_matrix_size = 100
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"
    cpu_device = "/cpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, pos_answers, neg_answers = data_helpers.load_train_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.train.token_idx.label',
        all_answers, voc, word2idx, sequence_length)

    # Shuffle the training triples with one shared permutation.
    data_size = len(questions)
    permutation = np.random.permutation(data_size)
    questions = questions[permutation, :]
    pos_answers = pos_answers[permutation, :]
    neg_answers = neg_answers[permutation, :]

    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()
            print("Start training")
            sess.run(tf.global_variables_initializer())  # Initialize all variables
            for epoch in range(epochs):
                print("Training epoch %d is underway" % (epoch + 1))
                batch_number = 1
                for question, pos_answer, neg_answer in data_helpers.batch_iter(
                        questions, pos_answers, neg_answers, batch_size):
                    start_time = time.time()
                    feed_dict = {
                        model.q: question,
                        model.ap: pos_answer,
                        model.an: neg_answer,
                        model.lr: learning_rate
                    }
                    _, loss, acc = sess.run([model.train_op, model.loss, model.acc],
                                            feed_dict)
                    duration = time.time() - start_time
                    print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f\tAcc %2.3f'
                          % (epoch + 1, batch_number * batch_size, data_size,
                             duration, loss, acc))
                    batch_number += 1
                learning_rate *= lrdownRate  # Decay the learning rate each epoch
            saver.save(sess, trained_model)
            print("End of the training")
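# The QALSTM training loop above feeds (question, positive answer, negative
# answer) triples together with a `margin` hyperparameter. The sketch below is
# an assumption about the kind of max-margin objective such a setup typically
# optimizes, not the model's confirmed loss; `hinge_ranking_loss` and its
# arguments are illustrative names only.
import numpy as np

def hinge_ranking_loss(q_vec, pos_vec, neg_vec, margin=0.1):
    """max(0, margin - cos(q, a+) + cos(q, a-)) for a single triple."""
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
    return max(0.0, margin - cos(q_vec, pos_vec) + cos(q_vec, neg_vec))

# A well-separated triple yields zero loss:
q, ap, an = np.array([1.0, 0.0]), np.array([0.9, 0.1]), np.array([0.0, 1.0])
print(hinge_ranking_loss(q, ap, an))  # 0.0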
def main():
    trained_model = "checkpoints/model.ckpt"
    embedding_size = 100        # Word embedding dimension
    batch_size = 128            # Batch data size
    sequence_length = 300       # Sentence length
    rnn_size = 50               # Number of hidden layer neurons
    attention_matrix_size = 100
    margin = 0.1
    gpu_mem_usage = 0.75
    gpu_device = "/gpu:0"

    embeddings, word2idx = data_helpers.load_embedding('vectors.nobin')
    voc = data_helpers.load_vocab(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\vocabulary')
    all_answers = data_helpers.load_answers(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\answers.label.token_idx', voc)
    questions, answers, labels, qids, aids = data_helpers.load_test_data(
        'D:\\DataMining\\Datasets\\insuranceQA\\V1\\question.test1.label.token_idx.pool',
        all_answers, voc, word2idx, 300)

    with tf.Graph().as_default(), tf.device(gpu_device):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_mem_usage)
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      gpu_options=gpu_options)
        model = QALSTM(batch_size, sequence_length, embeddings, embedding_size,
                       rnn_size, margin, attention_matrix_size)
        with tf.Session(config=session_conf).as_default() as sess:
            saver = tf.train.Saver()
            print("Start loading the model")
            saver.restore(sess, trained_model)
            print("The model is loaded")

            scores = []
            for question, answer in data_helpers.test_batch_iter(questions, answers,
                                                                 batch_size):
                feed_dict = {model.qtest: question, model.atest: answer}
                score = sess.run([model.scores], feed_dict)
                scores.extend(score[0].tolist())
            MAP, MRR = eval_map_mrr(qids, aids, scores, labels)
            print('MAP %2.3f\tMRR %2.3f' % (MAP, MRR))
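# eval_map_mrr used above is not shown in this snippet. The sketch below is a
# hedged guess at its semantics (pooled answers are grouped by question id,
# ranked by score, and MAP / MRR are averaged over questions with at least one
# relevant answer); the argument order mirrors the call above, but the
# implementation is illustrative only.
from collections import defaultdict

def eval_map_mrr_sketch(qids, aids, scores, labels):
    per_question = defaultdict(list)
    for qid, aid, score, label in zip(qids, aids, scores, labels):
        per_question[qid].append((score, label))
    ap_sum, rr_sum, n = 0.0, 0.0, 0
    for items in per_question.values():
        ranked = sorted(items, key=lambda x: x[0], reverse=True)
        hits, ap, rr = 0, 0.0, 0.0
        for rank, (_, label) in enumerate(ranked, start=1):
            if label == 1:
                hits += 1
                ap += float(hits) / rank
                if rr == 0.0:
                    rr = 1.0 / rank
        if hits:
            ap_sum += ap / hits
            rr_sum += rr
            n += 1
    return ap_sum / n, rr_sum / n

# One question, three pooled answers, the relevant one ranked second:
print(eval_map_mrr_sketch([0, 0, 0], [1, 2, 3], [0.9, 0.5, 0.2], [0, 1, 0]))
# -> (0.5, 0.5)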
                mymodel.batchsize: batch['tokens'].shape[0]
            }
            [scores] = sess.run([mymodel.scores], feed_dict)
            # scores comes back as a scalar when the batch holds a single example.
            if isinstance(scores, numpy.float32):
                writer.write(str(scores) + '\n')
            else:
                for score in scores:
                    writer.write(str(score) + '\n')
        except tf.errors.OutOfRangeError:
            break
    print("Done. Wrote output to {}".format(outfile))
    writer.close()


if __name__ == '__main__':
    vocab_table, _, vocab_size = load_vocab(FLAGS.vocab_file)
    mode = tf.estimator.ModeKeys.PREDICT
    mymodel = model(vocab_size, l2_reg_lambda=FLAGS.l2_reg_lambda, mode=mode)
    # FLAGS.batch_size = 1  # for testing batch size must be
    init_ops = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
        tf.tables_initializer()
    ]
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run([init_ops])
        for i in range(1000, 1198):
            if i == 1163:
                continue
            test_file = glob('slide_generator_data/data/' + str(i) +
def get_word2id(self, filename):
    vocab, self.word2id = load_vocab(filename)
    print("train vocabulary word2id contains %d words" % len(vocab))
def get_vocab(self, word_vocab_path):
    word_vocab, self.word2id = load_vocab(word_vocab_path)
    print("word vocabulary word2id contains %d words" % len(word_vocab))
"checkpoint directory from training run") # Misc Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") word2id = data_helpers.load_vocab(FLAGS.vocab_file) print('vocabulary size: {}'.format(len(word2id))) response_data = [] with open(FLAGS.response_file, 'rt') as f: for line in f: response_data.append(line.strip()) ''' user: 货要 真的 system:正品 有 保障 的 哦 亲亲 放心 呢 user:好 的 system:谢谢您 对 我 和 我们 店铺 的 信赖 我们 时刻 等待 着 您 的 再次 光临 哦 祝您 生活 愉快 ''' test_dialogue_data = json.load( open(os.path.join(DATA_DIR, "all_test_dialogue.json"), "r",
def train():
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the vocab and embedding files.
            vocab_table, vocab, vocab_size = load_vocab(FLAGS.vocab_file)
            embeddings = load_embedding(FLAGS.embed_file, vocab)

            train_iterator, train_next_batch = get_iterator(
                FLAGS.train_data_file, vocab_table, FLAGS.batch_size,
                FLAGS.max_seq_len, padding=True)
            # Batch size large enough to cover the whole dev set at once.
            dev_iterator, dev_next_batch = get_iterator(
                FLAGS.dev_data_file, vocab_table, 10000000,
                FLAGS.max_seq_len, padding=True)

            mode = tf.estimator.ModeKeys.TRAIN
            mymodel = model(vocab_size, l2_reg_lambda=FLAGS.l2_reg_lambda,
                            mode=mode)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            learning_rate = 0.001
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            grads_and_vars = optimizer.compute_gradients(mymodel.loss)
            # Clip the gradient values to [-1, 1].
            clipped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                           for grad, var in grads_and_vars]
            train_op = optimizer.apply_gradients(clipped_gvs,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            # timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, tf.flags.FLAGS.model + "_runs"))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", mymodel.loss)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            def train_step():
                """A single training step."""
                [batch] = sess.run([train_next_batch])
                feed_dict = {
                    mymodel.tokens: batch['tokens'],
                    mymodel.surf_features: batch['features'],
                    mymodel.input_y: batch['scores'],
                    mymodel.batchsize: batch['tokens'].shape[0]
                }
                _, step, summaries, loss = sess.run(
                    [train_op, global_step, train_summary_op, mymodel.loss],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(step, writer=None):
                """Evaluates the model on the dev set."""
                sess.run(dev_iterator.initializer)
                while True:
                    try:
                        [batch] = sess.run([dev_next_batch])
                        feed_dict = {
                            mymodel.tokens: batch['tokens'],
                            mymodel.surf_features: batch['features'],
                            mymodel.input_y: batch['scores'],
                            mymodel.batchsize: batch['tokens'].shape[0]
                        }
                        summaries, loss = sess.run(
                            [dev_summary_op, mymodel.loss], feed_dict)
                        print('--- dev loss: ', loss)
                        if writer:
                            writer.add_summary(summaries, step)
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}".format(time_str, step, loss))
                if writer:
                    writer.add_summary(summaries, step)

            # Initialize all variables
            init_ops = [
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ]
            sess.run(init_ops)

            for epoch in range(FLAGS.num_epochs):
                # Re-initialize the iterator to go through the dataset again.
                sess.run(train_iterator.initializer)
                while True:
                    try:
                        train_step()
                        current_step = tf.train.global_step(sess, global_step)
                        # Evaluate on the dev set.
                        if current_step % FLAGS.evaluate_every == 0:
                            print("\nEvaluation:")
                            dev_step(current_step, writer=dev_summary_writer)
                            print("")
                        if current_step % FLAGS.checkpoint_every == 0:
                            path = saver.save(sess, checkpoint_prefix,
                                              global_step=current_step)
                            print("Saved model checkpoint to {}\n".format(path))
                    except tf.errors.OutOfRangeError:
                        print("End of dataset")
                        break
                print('-' * 100)
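# get_iterator used in train() above is not shown in this snippet. The sketch
# below is an assumed shape of such a helper (illustrative only): a tf.data
# pipeline whose initializable iterator is re-initialized once per epoch with
# sess.run(iterator.initializer) and whose get_next() op raises
# tf.errors.OutOfRangeError when the data is exhausted, which is exactly what
# the try/except loops above rely on.
import tensorflow as tf

def get_iterator_sketch(filename, batch_size):
    dataset = tf.data.TextLineDataset(filename)
    # Split each line into tokens; real code would also map tokens to ids.
    dataset = dataset.map(lambda line: tf.string_split([line]).values)
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=tf.constant('<pad>'))
    iterator = dataset.make_initializable_iterator()
    return iterator, iterator.get_next()

# Driver loop mirroring the pattern in train():
#   iterator, next_batch = get_iterator_sketch('train.txt', 32)
#   sess.run(iterator.initializer)
#   while True:
#       try:
#           tokens = sess.run(next_batch)
#       except tf.errors.OutOfRangeError:
#           break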
import numpy as np
import datetime
import os
import json
import time

import config

if __name__ == '__main__':
    # Load the training data.
    qid, que, pos_rel, pos_rel_word, neg_rel, neg_rel_word = load_data(
        config.TRAIN_PATH)
    qid_dev, que_dev, pos_rel_dev, pos_rel_word_dev, neg_rel_dev, neg_rel_word_dev = load_data(
        config.TEST_PATH)

    # Create the word2id dictionaries of questions and relations.
    que_word2id, rel_word2id = load_vocab(config.DICT_DIR)
    print('Size of question vocab : {}'.format(len(que_word2id)))
    print('Size of relation vocab : {}'.format(len(rel_word2id)))

    # Change to pytorch Variable.
    que = prepare_sequence(que, config.MAX_QUESTION_LENGTH, que_word2id)
    pos_rel = prepare_sequence(pos_rel, config.MAX_RELATION_LEVEL_LENGTH, rel_word2id)
    neg_rel = prepare_sequence(neg_rel, config.MAX_RELATION_LEVEL_LENGTH, rel_word2id)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_word2id)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_word2id)

    print('\nTrain set')
import datetime
import os
import json
import time

import config

if __name__ == '__main__':
    # Load the training data.
    qid, que_word, que_char, pos_rel_name, pos_rel_word, pos_rel_char, \
        neg_rel_name, neg_rel_word, neg_rel_char = load_data(config.TRAIN_PATH)
    qid_dev, que_word_dev, que_char_dev, pos_rel_name_dev, pos_rel_word_dev, pos_rel_char_dev, \
        neg_rel_name_dev, neg_rel_word_dev, neg_rel_char_dev = load_data(config.TEST_PATH)

    # Create the word2id dictionaries of questions and relations.
    que_vocab, rel_vocab = load_vocab(config.DICT_DIR)
    print('Size of question vocab : {}'.format(len(que_vocab)))
    print('Size of relation vocab : {}'.format(len(rel_vocab)))

    # Change to pytorch Variable.
    que_word = prepare_sequence(que_word, config.MAX_QUESTION_LENGTH, que_vocab)
    que_char = prepare_sequence(que_char, config.MAX_QUESTION_CHAR_LEVEL_LENGTH, que_vocab)
    pos_rel_name = prepare_sequence(pos_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    neg_rel_name = prepare_sequence(neg_rel_name, config.MAX_RELATION_LEVEL_LENGTH, rel_vocab)
    pos_rel_word = prepare_sequence(pos_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    neg_rel_word = prepare_sequence(neg_rel_word, config.MAX_WORD_LEVEL_LENGTH, rel_vocab)
    pos_rel_char = prepare_sequence(pos_rel_char, config.MAX_CHAR_LEVEL_LENGTH, rel_vocab)
    neg_rel_char = prepare_sequence(neg_rel_char, config.MAX_CHAR_LEVEL_LENGTH, rel_vocab)

    print('\nTrain set')
    print('question word-level tensor shape: {}'.format(que_word.shape))
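# prepare_sequence in the two scripts above is not shown here. The sketch below
# is a hedged guess at what such a helper might do (the function name, pad/unk
# ids, and the return type are assumptions): look tokens up in a word2id dict,
# truncate or pad every sequence to max_length, and return a LongTensor batch.
import torch

def prepare_sequence_sketch(texts, max_length, word2id, pad_id=0, unk_id=1):
    batch = []
    for text in texts:
        ids = [word2id.get(tok, unk_id) for tok in text.split()][:max_length]
        ids += [pad_id] * (max_length - len(ids))  # right-pad to max_length
        batch.append(ids)
    return torch.LongTensor(batch)

# Toy example:
demo = prepare_sequence_sketch(['what is insurance', 'premium'],
                               max_length=4,
                               word2id={'what': 2, 'is': 3, 'insurance': 4})
print(demo.shape)  # torch.Size([2, 4])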