Exemple #1
0
    def __init__(self):
        """Resolve the model files, load the model and build the vocabulary.

        The graph-definition, checkpoint and vocabulary file names stored on
        the instance (``self.pbtxt``, ``self.ckpt``, ``self.vocab_file``) are
        joined onto ``filenames.google_lm_dir`` before use.
        """
        model_dir = filenames.google_lm_dir
        graph_def_path = op.join(model_dir, self.pbtxt)
        checkpoint_path = op.join(model_dir, self.ckpt)
        vocab_path = op.join(model_dir, self.vocab_file)

        self.load_model(graph_def_path, checkpoint_path)
        self.vocab = data_utils.CharsVocabulary(vocab_path, MAX_WORD_LEN)

        # NOTE(review): self.t is not assigned here; presumably load_model
        # populates it. The graph is taken from its first value.
        first_entry = list(self.t.values())[0]
        self.graph = first_entry.graph
Exemple #2
0
 def __init__(self, metadata):
     """Store the metadata, load the model and build the vocabulary.

     Args:
       metadata: mapping whose 'modelParameters' entry provides the
         'pbtxt_path', 'ckpt_path' and 'vocab_path' values read below.
     """
     self.BATCH_SIZE = 1
     self.NUM_TIMESTEPS = 1
     self.MAX_WORD_LEN = 50
     self.metadata = metadata

     model_params = self.metadata['modelParameters']
     graph_def_path = model_params['pbtxt_path']
     checkpoint_path = model_params['ckpt_path']
     self._LoadModel(graph_def_path, checkpoint_path)

     # The vocabulary path is looked up only after the model has been loaded.
     vocab_path = model_params['vocab_path']
     self.vocab = data_utils.CharsVocabulary(vocab_path, self.MAX_WORD_LEN)
Exemple #3
0
def main(unused_argv):
    """Build the character vocabulary and run the action chosen by FLAGS.mode.

    Supported modes are 'eval', 'sample', 'dump_emb' and 'dump_lstm_emb'.

    Args:
      unused_argv: ignored.

    Raises:
      Exception: if FLAGS.mode is none of the supported modes.
    """
    chars_vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
    mode = FLAGS.mode

    if mode == 'eval':
        _EvalModel(data_utils.LM1BDataset(FLAGS.input_data, chars_vocab))
        return

    if mode == 'sample':
        _SampleModel(FLAGS.prefix, chars_vocab)
        return

    if mode == 'dump_emb':
        _DumpEmb(chars_vocab)
        return

    if mode == 'dump_lstm_emb':
        _DumpSentenceEmbedding(FLAGS.sentence, chars_vocab)
        return

    raise Exception('Mode not supported.')
def main(unused_argv):
    """Build the character vocabulary and run the action chosen by FLAGS.mode.

    Modes 'eval', 'sample', 'dump_emb' and 'dump_lstm_emb' delegate to the
    matching helper. Mode 'predict_perp' reads FLAGS.input_data, where every
    line holds tab-separated candidate sentences, scores each candidate with
    _EvalModel, and writes the lowest-scoring candidate of every line to
    FLAGS.output_data (one sentence per line).

    Args:
      unused_argv: ignored.

    Raises:
      Exception: if FLAGS.mode is none of the supported modes.
    """
    vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

    if FLAGS.mode == 'eval':
        dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
        _EvalModel(dataset)
    elif FLAGS.mode == 'sample':
        _SampleModel(FLAGS.prefix, vocab)
    elif FLAGS.mode == 'dump_emb':
        _DumpEmb(vocab)
    elif FLAGS.mode == 'dump_lstm_emb':
        _DumpSentenceEmbedding(FLAGS.sentence, vocab)
    elif FLAGS.mode == 'predict_perp':
        sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
        current_step = t['global_step'].eval(session=sess)

        # Strip only the line terminator: slicing off the last character
        # would truncate a final line that has no trailing newline.
        with open(FLAGS.input_data, 'r') as infile:
            candidate_sets = [line.rstrip("\n").split("\t") for line in infile]

        # Each candidate is written to this scratch file because the dataset
        # is constructed from a file path rather than from a string.
        scratch_path = "temp_sent.txt"

        best_sentences = []
        for i, candidates in enumerate(candidate_sets):
            print("Test sentence: " + str(i))

            perplexities = []
            for j, candidate in enumerate(candidates):
                if j % 5 == 0:
                    print("Output sentence: " + str(j))

                with open(scratch_path, 'w') as scratch:
                    scratch.write(candidate)

                print("Loading data...")
                dataset = data_utils.LM1BDataset(scratch_path, vocab)
                print("Calculating perplexity...")
                perplexities.append(_EvalModel(dataset, current_step))

            # Keep the candidate with the smallest score (first one on ties).
            best_index = perplexities.index(min(perplexities))
            best_sentences.append(candidates[best_index])

        # Write the best sentence for every input line.
        with open(FLAGS.output_data, 'w') as outfile:
            for sent in best_sentences:
                outfile.write(sent + "\n")
    else:
        raise Exception('Mode not supported.')
Exemple #5
0
def main(unused_argv):
  """Load the model and write per-token surprisals or predictions.

  The input file (FLAGS.input_file) is read as one sentence per line, split
  on single spaces. Depending on FLAGS.mode:

  * "surprisal": writes a TSV (sentence_id, token_id, token, surprisal) to
    FLAGS.output_file, or to standard output when that flag is "-".
  * "predictions": writes an HDF5 file to FLAGS.output_file with one
    "/sentence/<i>" group per sentence (datasets "predictions" and "tokens")
    plus a "/vocabulary" dataset.

  Args:
    unused_argv: ignored.

  Raises:
    ValueError: if FLAGS.mode is neither "surprisal" nor "predictions".
  """
  vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
  sess, model = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)

  with open(FLAGS.input_file) as inf:
    sentences = [line.strip().split(" ") for line in inf]

  if FLAGS.mode == "surprisal":
    to_stdout = FLAGS.output_file == "-"
    outf = sys.stdout if to_stdout else open(FLAGS.output_file, "w")
    try:
      # Print TSV header
      outf.write("sentence_id\ttoken_id\ttoken\tsurprisal\n")

      surprisals = get_surprisals(sentences, model, sess, vocab)
      for i, (_, sentence_surps) in enumerate(zip(sentences, surprisals)):
        for j, (word, word_surp) in enumerate(sentence_surps):
          outf.write("%i\t%i\t%s\t%f\n" % (i + 1, j + 1, word, word_surp))
    finally:
      # Never close sys.stdout: later code may still need to write to it.
      if not to_stdout:
        outf.close()
  elif FLAGS.mode == "predictions":
    # The context manager closes the HDF5 file even if a step below raises.
    with h5py.File(FLAGS.output_file, "w") as outf:
      predictions = get_predictions(sentences, model, sess, vocab)
      for i, (sentence, sentence_preds) in enumerate(
          zip(sentences, predictions)):
        token_ids = [vocab.word_to_id(word) for word in sentence]

        # Skip the first word, which has null predictions, and substitute a
        # uniform log-distribution of the same shape as the remaining rows.
        # NOTE(review): a one-word sentence leaves nothing after the slice
        # and sentence_preds[0] then raises IndexError - confirm whether
        # such inputs can occur.
        sentence_preds = sentence_preds[1:]
        first_word_pred = np.ones_like(sentence_preds[0])
        first_word_pred /= first_word_pred.sum()
        first_word_pred = np.log(first_word_pred)
        # list(...) keeps this a concatenation whether sentence_preds is a
        # list of rows or an ndarray (list + ndarray would broadcast-add).
        sentence_preds = np.vstack([first_word_pred] + list(sentence_preds))

        group = outf.create_group("/sentence/%i" % i)
        group.create_dataset("predictions", data=sentence_preds)
        group.create_dataset("tokens", data=token_ids)

      vocab_encoded = np.array(vocab._id_to_word)
      vocab_encoded = np.char.encode(vocab_encoded, "utf-8")
      outf.create_dataset("/vocabulary", data=vocab_encoded)
  else:
    raise ValueError("Unknown --mode %s" % FLAGS.mode)
Exemple #6
0
def _LoadModel(gd_file, ckpt_file, vocab_file='data/vocab-2016-09-10.txt'):
    """Load the model from GraphDef and Checkpoint.

    Args:
      gd_file: GraphDef proto text file.
      ckpt_file: TensorFlow Checkpoint file.
      vocab_file: Vocabulary file handed to data_utils.CharsVocabulary.
        Defaults to the path this function previously hard-coded.

    Returns:
      A (session, tensors dict, vocabulary) tuple.
    """
    # (key in the returned dict, name of the graph element to import).
    graph_elements = [
        ('states_init', 'states_init'),
        ('lstm/lstm_0/control_dependency', 'lstm/lstm_0/control_dependency:0'),
        ('lstm/lstm_1/control_dependency', 'lstm/lstm_1/control_dependency:0'),
        ('softmax_out', 'softmax_out:0'),
        ('class_ids_out', 'class_ids_out:0'),
        ('class_weights_out', 'class_weights_out:0'),
        ('log_perplexity_out', 'log_perplexity_out:0'),
        ('inputs_in', 'inputs_in:0'),
        ('targets_in', 'targets_in:0'),
        ('target_weights_in', 'target_weights_in:0'),
        ('char_inputs_in', 'char_inputs_in:0'),
        ('all_embs', 'all_embs_out:0'),
        ('softmax_weights', 'Reshape_3:0'),
        ('global_step', 'global_step:0'),
    ]

    with tf.Graph().as_default():
        sys.stderr.write('Recovering graph.\n')
        with tf.gfile.FastGFile(gd_file, 'r') as f:
            s = f.read()
        # read() may hand back bytes or text depending on the Python /
        # TensorFlow version; decode only when there is something to decode
        # so that a text result does not fail with AttributeError.
        if isinstance(s, bytes):
            s = s.decode()
        gd = tf.GraphDef()
        text_format.Merge(s, gd)

        tf.logging.info('Recovering Graph %s', gd_file)
        imported = tf.import_graph_def(
            gd, {}, [name for _, name in graph_elements], name='')
        t = {key: element
             for (key, _), element in zip(graph_elements, imported)}

        sys.stderr.write('Recovering checkpoint %s\n' % ckpt_file)
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run('save/restore_all', {'save/Const:0': ckpt_file})
        sess.run(t['states_init'])

        vocab = data_utils.CharsVocabulary(vocab_file, MAX_WORD_LEN)

    return sess, t, vocab
def main(unused_argv):
    """Build the character vocabulary and evaluate the test sentences.

    Args:
      unused_argv: ignored.
    """
    chars_vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)
    _EvalTestSents(FLAGS.input_file, chars_vocab, FLAGS.output_file)