def __init__(self):
  pbtxt = op.join(filenames.google_lm_dir, self.pbtxt)
  ckpt = op.join(filenames.google_lm_dir, self.ckpt)
  vocab_file = op.join(filenames.google_lm_dir, self.vocab_file)
  self.load_model(pbtxt, ckpt)
  self.vocab = data_utils.CharsVocabulary(vocab_file, MAX_WORD_LEN)
  self.graph = list(self.t.values())[0].graph

def __init__(self, metadata):
  self.BATCH_SIZE = 1
  self.NUM_TIMESTEPS = 1
  self.MAX_WORD_LEN = 50
  self.metadata = metadata
  self._LoadModel(self.metadata['modelParameters']['pbtxt_path'],
                  self.metadata['modelParameters']['ckpt_path'])
  self.vocab = data_utils.CharsVocabulary(
      self.metadata['modelParameters']['vocab_path'], self.MAX_WORD_LEN)

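# --- Hedged usage sketch (not part of the original source) ------------------
# Shape of the metadata dict the constructor above appears to expect. The
# concrete paths and the enclosing class name (GoogleLM) are placeholders,
# not taken from the original code.
metadata = {
    'modelParameters': {
        'pbtxt_path': '/path/to/graph.pbtxt',
        'ckpt_path': '/path/to/ckpt-*',
        'vocab_path': '/path/to/vocab.txt',
    },
}
model = GoogleLM(metadata)  # class name assumed; only __init__ is shown above
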
def main(unused_argv):
  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

  if FLAGS.mode == 'eval':
    dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
    _EvalModel(dataset)
  elif FLAGS.mode == 'sample':
    _SampleModel(FLAGS.prefix, vocab)
  elif FLAGS.mode == 'dump_emb':
    _DumpEmb(vocab)
  elif FLAGS.mode == 'dump_lstm_emb':
    _DumpSentenceEmbedding(FLAGS.sentence, vocab)
  else:
    raise Exception('Mode not supported.')

def main(unused_argv):
  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

  if FLAGS.mode == 'eval':
    dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
    _EvalModel(dataset)
  elif FLAGS.mode == 'sample':
    _SampleModel(FLAGS.prefix, vocab)
  elif FLAGS.mode == 'dump_emb':
    _DumpEmb(vocab)
  elif FLAGS.mode == 'dump_lstm_emb':
    _DumpSentenceEmbedding(FLAGS.sentence, vocab)
  elif FLAGS.mode == 'predict_perp':
    sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
    current_step = t['global_step'].eval(session=sess)

    # Each input line holds the tab-separated candidate sentences for one
    # test item.
    sentences = []
    with open(FLAGS.input_data, 'r') as f:
      for line in f:
        sentences.append(line[:-1].split("\t"))

    best_sentences = []
    for i in range(len(sentences)):
      print("Test sentence: " + str(i))
      perplexities = []
      for j in range(len(sentences[i])):
        if j % 5 == 0:
          print("Output sentence: " + str(j))
        # LM1BDataset reads from a file, so write each candidate to a temp file.
        with open("temp_sent.txt", 'w') as f:
          f.write(sentences[i][j])
        print("Loading data...")
        dataset = data_utils.LM1BDataset("temp_sent.txt", vocab)
        print("Calculating perplexity...")
        perplexities.append(_EvalModel(dataset, current_step))
      # Keep the candidate with the lowest perplexity.
      ind = perplexities.index(min(perplexities))
      best_sentences.append(sentences[i][ind])

    # Write the best candidate for each test item.
    with open(FLAGS.output_data, 'w') as f:
      for sent in best_sentences:
        f.write(sent + "\n")
  else:
    raise Exception('Mode not supported.')

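# --- Hedged usage sketch (not part of the original source) ------------------
# The 'predict_perp' branch above expects --input_data to contain one test item
# per line, with the candidate sentences separated by tabs. The sentences below
# are made-up placeholders, as is the file name.
with open("candidates.tsv", "w") as f:
  f.write("the dog barks .\tthe dog bark .\n")
  f.write("she reads a book .\tshe read a book .\n")
# After running with --mode predict_perp, --output_data holds the
# lowest-perplexity candidate from each line.
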
def main(unused_argv):
  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
  sess, model = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)

  with open(FLAGS.input_file) as inf:
    sentences = [line.strip().split(" ") for line in inf]

  if FLAGS.mode == "surprisal":
    outf = sys.stdout if FLAGS.output_file == "-" else open(FLAGS.output_file, "w")
    # Print TSV header.
    outf.write("sentence_id\ttoken_id\ttoken\tsurprisal\n")

    surprisals = get_surprisals(sentences, model, sess, vocab)
    for i, (sentence, sentence_surps) in enumerate(zip(sentences, surprisals)):
      for j, (word, word_surp) in enumerate(sentence_surps):
        outf.write("%i\t%i\t%s\t%f\n" % (i + 1, j + 1, word, word_surp))
    outf.close()
  elif FLAGS.mode == "predictions":
    outf = h5py.File(FLAGS.output_file, "w")
    predictions = get_predictions(sentences, model, sess, vocab)
    for i, (sentence, sentence_preds) in enumerate(zip(sentences, predictions)):
      token_ids = [vocab.word_to_id(word) for word in sentence]

      # The first word has no real predictions; substitute a uniform
      # log-probability distribution.
      sentence_preds = sentence_preds[1:]
      first_word_pred = np.ones_like(sentence_preds[0])
      first_word_pred /= first_word_pred.sum()
      first_word_pred = np.log(first_word_pred)
      sentence_preds = np.vstack([first_word_pred] + sentence_preds)

      group = outf.create_group("/sentence/%i" % i)
      group.create_dataset("predictions", data=sentence_preds)
      group.create_dataset("tokens", data=token_ids)

    vocab_encoded = np.array(vocab._id_to_word)
    vocab_encoded = np.char.encode(vocab_encoded, "utf-8")
    outf.create_dataset("/vocabulary", data=vocab_encoded)
    outf.close()
  else:
    raise ValueError("Unknown --mode %s" % FLAGS.mode)

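# --- Hedged usage sketch (not part of the original source) ------------------
# One way the HDF5 file written by the "predictions" mode above could be read
# back, assuming only h5py/numpy and the /sentence/<i> layout created there.
# The function name is a placeholder.
import h5py

def read_prediction_file(path):
  with h5py.File(path, "r") as f:
    vocabulary = [w.decode("utf-8") for w in f["/vocabulary"][()]]
    sentences = {}
    for key, group in f["/sentence"].items():
      sentences[int(key)] = (group["tokens"][()], group["predictions"][()])
  return vocabulary, sentences
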
def _LoadModel(gd_file, ckpt_file):
  """Load the model from GraphDef and Checkpoint.

  Args:
    gd_file: GraphDef proto text file.
    ckpt_file: TensorFlow Checkpoint file.

  Returns:
    TensorFlow session, tensors dict, and character-level vocabulary.
  """
  with tf.Graph().as_default():
    sys.stderr.write('Recovering graph.\n')
    with tf.gfile.FastGFile(gd_file, 'r') as f:
      s = f.read().decode()
    gd = tf.GraphDef()
    text_format.Merge(s, gd)

    tf.logging.info('Recovering Graph %s', gd_file)
    t = {}
    [t['states_init'], t['lstm/lstm_0/control_dependency'],
     t['lstm/lstm_1/control_dependency'], t['softmax_out'], t['class_ids_out'],
     t['class_weights_out'], t['log_perplexity_out'], t['inputs_in'],
     t['targets_in'], t['target_weights_in'], t['char_inputs_in'],
     t['all_embs'], t['softmax_weights'], t['global_step']
    ] = tf.import_graph_def(gd, {}, [
        'states_init', 'lstm/lstm_0/control_dependency:0',
        'lstm/lstm_1/control_dependency:0', 'softmax_out:0', 'class_ids_out:0',
        'class_weights_out:0', 'log_perplexity_out:0', 'inputs_in:0',
        'targets_in:0', 'target_weights_in:0', 'char_inputs_in:0',
        'all_embs_out:0', 'Reshape_3:0', 'global_step:0'], name='')

    sys.stderr.write('Recovering checkpoint %s\n' % ckpt_file)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run('save/restore_all', {'save/Const:0': ckpt_file})
    sess.run(t['states_init'])

  vocab = data_utils.CharsVocabulary('data/vocab-2016-09-10.txt', MAX_WORD_LEN)
  return sess, t, vocab

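# --- Hedged usage sketch (not part of the original source) ------------------
# A minimal example of how the session / tensor dict returned by _LoadModel
# might be used to score a single step, following the lm_1b convention of
# [BATCH_SIZE, NUM_TIMESTEPS] = [1, 1] feeds. The graph and checkpoint paths
# and the example words are placeholders.
import numpy as np

sess, t, vocab = _LoadModel('graph.pbtxt', 'ckpt-*')

inputs = np.zeros([1, 1], np.int32)
char_ids = np.zeros([1, 1, vocab.max_word_length], np.int32)
targets = np.zeros([1, 1], np.int32)
weights = np.ones([1, 1], np.float32)

# Score the target word "cat" following the sentence-start token.
inputs[0, 0] = vocab.word_to_id('<S>')
char_ids[0, 0, :] = vocab.word_to_char_ids('<S>')
targets[0, 0] = vocab.word_to_id('cat')

log_perp = sess.run(t['log_perplexity_out'],
                    feed_dict={t['inputs_in']: inputs,
                               t['char_inputs_in']: char_ids,
                               t['targets_in']: targets,
                               t['target_weights_in']: weights})
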
def main(unused_argv):
  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
  _EvalTestSents(FLAGS.input_file, vocab, FLAGS.output_file)