def main(unused_argv):
    """Run the action selected by ``FLAGS.mode``.

    Supported modes are ``eval``, ``sample``, ``dump_emb``, ``dump_lstm_emb``
    and ``predict_perp``.

    In ``predict_perp`` mode every line of ``FLAGS.input_data`` is split on
    tabs into candidate sentences.  Each candidate is scored with
    ``_EvalModel`` and, for every input line, the candidate with the lowest
    score is written on its own line to ``FLAGS.output_data``.

    Args:
      unused_argv: Ignored.

    Raises:
      Exception: If ``FLAGS.mode`` is not one of the supported modes.
    """
    vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

    if FLAGS.mode == 'eval':
        dataset = data_utils.LM1BDataset(FLAGS.input_data, vocab)
        _EvalModel(dataset)
    elif FLAGS.mode == 'sample':
        _SampleModel(FLAGS.prefix, vocab)
    elif FLAGS.mode == 'dump_emb':
        _DumpEmb(vocab)
    elif FLAGS.mode == 'dump_lstm_emb':
        _DumpSentenceEmbedding(FLAGS.sentence, vocab)
    elif FLAGS.mode == 'predict_perp':
        sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
        current_step = t['global_step'].eval(session=sess)

        with open(FLAGS.input_data, 'r') as f:
            # Strip only the line terminator.  Slicing with [:-1] would chop
            # a real character off a final line that has no trailing newline.
            sentences = [line.rstrip('\n').split("\t") for line in f]

        best_sentences = []
        for i, candidates in enumerate(sentences):
            print("Test sentence: " + str(i))
            perplexities = []
            for j, candidate in enumerate(candidates):
                if j % 5 == 0:
                    print("Output sentence: " + str(j))
                # LM1BDataset is handed a file path here, so each candidate
                # is round-tripped through a scratch file.
                with open("temp_sent.txt", 'w') as f:
                    f.write(candidate)
                print("Loading data...")
                dataset = data_utils.LM1BDataset("temp_sent.txt", vocab)
                print("Calculating perplexity...")
                perplexities.append(_EvalModel(dataset, current_step))
            # Keep the first candidate that attains the minimum score.
            ind = perplexities.index(min(perplexities))
            best_sentences.append(candidates[ind])

        ## Return best sentences
        with open(FLAGS.output_data, 'w') as f:
            for sent in best_sentences:
                f.write(sent + "\n")
    else:
        raise Exception('Mode not supported.')
def _PerplexityFromSums(sum_num, sum_den):
    """Return exp(sum_num / sum_den), or NaN when nothing was accumulated."""
    if not sum_den:
        return float('nan')
    return np.exp(sum_num / sum_den)


def _SentencePerplexity(dataset_file, vocab):
    """Print one perplexity line per sentence of ``dataset_file``.

    Batches are read from an ``LM1BDataset`` built from ``dataset_file``.
    Whenever the sentence id of the incoming batch differs from the current
    one, a tab-separated line ``perplexity, sentence_id, per-batch log
    perplexities joined by '_'`` is printed to stdout for the sentence that
    just ended.  Progress and a running average are written to stderr every
    time a sentence whose id is a multiple of 5 starts.

    Args:
      dataset_file: Path passed to ``data_utils.LM1BDataset``.
      vocab: Vocabulary object passed to ``data_utils.LM1BDataset``.
    """
    dataset = data_utils.LM1BDataset(dataset_file, vocab)
    sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)

    current_step = t['global_step'].eval(session=sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    data_gen = dataset.get_batch(BATCH_SIZE, NUM_TIMESTEPS, forever=False)
    sys.stderr.write('Loaded data gen\n')

    # sum_* accumulate over the current sentence, total_sum_* over the run.
    total_sum_num = 0.0
    sum_num = 0.0
    total_sum_den = 0.0
    sum_den = 0.0
    sentence_id = None
    t0 = time.time()
    perps = []

    for inputs, char_inputs, sentence_ids, targets, weights in data_gen:
        next_sentence_id = sentence_ids[0][0]
        if sentence_id is None:
            sentence_id = next_sentence_id

        # We hit a new sentence. Record this one and reset counters
        if next_sentence_id != sentence_id:
            perplexity = _PerplexityFromSums(sum_num, sum_den)
            print('{}\t{}\t{}'.format(perplexity, sentence_id,
                                      '_'.join(str(p) for p in perps)))
            sum_num = sum_den = 0.0
            perps = []
            sentence_id = next_sentence_id

            if (sentence_id % 5) == 0:
                t1 = time.time()
                sys.stderr.write('Starting sentence {} (t={:.1f}s)\n'.format(
                    sentence_id, t1 - t0))
                ppx = _PerplexityFromSums(total_sum_num, total_sum_den)
                sys.stderr.write(
                    'Running avg. perplexity: {:.3f}\n'.format(ppx))
                t0 = t1

        input_dict = {
            t['inputs_in']: inputs,
            t['targets_in']: targets,
            t['target_weights_in']: weights
        }
        if 'char_inputs_in' in t:
            input_dict[t['char_inputs_in']] = char_inputs

        log_perp = sess.run(t['log_perplexity_out'], feed_dict=input_dict)

        if np.isnan(log_perp):
            # File objects have no ``error`` method; the original call raised
            # AttributeError instead of reporting the NaN.
            sys.stderr.write('log_perplexity is Nan.\n')
        else:
            # Weight each batch by its mean target weight.
            den = weights.mean()
            num = log_perp * den
            sum_num += num
            total_sum_num += num
            sum_den += den
            total_sum_den += den
            perps.append(log_perp)

        if sentence_id > FLAGS.max_eval_steps:
            break

    # NOTE(review): the sentence still being accumulated when the loop ends
    # is never printed, and this overall value is not used in the visible
    # code -- confirm whether the function is meant to report or return them.
    ppx = _PerplexityFromSums(total_sum_num, total_sum_den)
def _RunN400Experiment(input_data, vocab):
    """Compute cross-entropy loss and probability for marked target words.

    The input is first handed to ``_CreateFiles``; the sentences and target
    indices are then read back from ``data/sentences.txt`` and
    ``data/targets.txt`` (presumably produced by ``_CreateFiles`` -- confirm).
    Each sentence is written to ``data/current.sentence.txt``, wrapped in an
    ``LM1BDataset`` and passed to ``_WordLossAndProbability`` together with
    its target indices.  ``FLAGS.output_file`` is recreated with a CSV header
    before any sentence is processed.

    Args:
      input_data: A file containing target sentences. Each sentence must be
        separated by a newline character and have:
          - Asterisks before and after target word (e.g. *target*).
          - Punctuation separated from words by a space.
      vocab: Vocabulary. Contains max word char id length and converts
        between words and ids.
    """
    _CreateFiles(input_data)

    # Start the results file afresh with only the header row.
    with tf.gfile.Open(FLAGS.output_file, mode='w') as results_file:
        results_file.write('Sentence;TargetWord;Probability;Surprisal\n')

    with tf.gfile.Open('data/sentences.txt', mode='r') as sentences_file:
        all_sentences = [
            re.sub(r'\n', '', raw_line)
            for raw_line in sentences_file.readlines()
        ]

    # One whitespace-separated row of integer indices per sentence.
    with tf.gfile.Open('data/targets.txt', mode='r') as target_file:
        all_targets = [
            [int(token) for token in raw_line.rsplit()]
            for raw_line in target_file.readlines()
        ]

    for position, current_sentence in enumerate(all_sentences):
        # LM1BDataset is given a path, so the sentence goes through a file.
        with tf.gfile.Open('data/current.sentence.txt',
                           mode='w') as scratch_file:
            scratch_file.write(current_sentence)

        target_indices = all_targets[position]
        dataset = data_utils.LM1BDataset('data/current.sentence.txt', vocab)
        _WordLossAndProbability(dataset, vocab, current_sentence,
                                target_indices)
def main(unused_argv):
    """Build the vocabulary and run the action named by ``FLAGS.mode``.

    Recognised modes: ``eval``, ``sample``, ``dump_emb``, ``dump_lstm_emb``.

    Args:
      unused_argv: Ignored.

    Raises:
      Exception: If ``FLAGS.mode`` is not a recognised mode.
    """
    vocab = data_utils.CharsVocabulary(FLAGS.vocab_file, MAX_WORD_LEN)

    # Each entry is deferred so that only the selected action touches its
    # flags and helpers.
    actions = {
        'eval': lambda: _EvalModel(
            data_utils.LM1BDataset(FLAGS.input_data, vocab)),
        'sample': lambda: _SampleModel(FLAGS.prefix, vocab),
        'dump_emb': lambda: _DumpEmb(vocab),
        'dump_lstm_emb': lambda: _DumpSentenceEmbedding(FLAGS.sentence, vocab),
    }

    action = actions.get(FLAGS.mode)
    if action is None:
        raise Exception('Mode not supported.')
    action()
def _EvalSentences(self, sentences):
    """Score every target word of the given sentences.

    For each sentence the dataset's ``sentence`` attribute is set and batches
    are drawn with ``method='list'``.  For every batch the first target word
    and its ``-log10`` probability (taken from the softmax output, normalised
    by the softmax sum) are recorded and echoed to stderr.

    Args:
      sentences: list of strings

    Returns:
      A pandas DataFrame with columns ``prob`` and ``word``: the
      concatenation of one frame per sentence.  Empty (but with both
      columns) when ``sentences`` is empty.
    """
    print('Evaluating sentences')
    start_time = time.time()
    current_step = self.t['global_step'].eval(session=self.sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    # instantiate a dataset generator
    dataset = data_utils.LM1BDataset(vocab=self.vocab)

    # Fetch both outputs in one call so the graph runs once per batch.
    # NOTE(review): if the graph keeps recurrent state between runs (not
    # visible here), two separate runs would also advance that state twice.
    fetches = [self.t['log_perplexity_out'], self.t['softmax_out']]

    result_dfs = []
    for sentence in sentences:
        # set the sentence first
        dataset.sentence = sentence
        # then the call to batch with method "list" converts the sentence
        # object
        data_gen = dataset.get_batch(self.BATCH_SIZE,
                                     self.NUM_TIMESTEPS,
                                     method='list',
                                     forever=False)
        word_probabilities = []
        words = []
        for inputs, char_inputs, _, targets, weights in data_gen:
            input_dict = {
                self.t['inputs_in']: inputs,
                self.t['targets_in']: targets,
                self.t['target_weights_in']: weights
            }
            if 'char_inputs_in' in self.t:
                input_dict[self.t['char_inputs_in']] = char_inputs

            unused_log_perp, softmax = self.sess.run(fetches,
                                                     feed_dict=input_dict)

            target_id = targets[0][0]
            log10_probability = -1 * np.log10(
                softmax[0, target_id] / np.sum(softmax))
            word = self.vocab.id_to_word(target_id)
            sys.stderr.write(word + ' ' + str(log10_probability) + '\n')
            words.append(word)
            word_probabilities.append(log10_probability)

        # The logged value is the sum of the per-word -log10 probabilities.
        sys.stderr.write('Sentence perplexity: %s\n' %
                         str(np.sum(word_probabilities)))
        sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))
        result_dfs.append(
            pd.DataFrame({'prob': word_probabilities, 'word': words}))

    if not result_dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame({'prob': [], 'word': []})

    #!!! different serialization here
    return pd.concat(result_dfs)
def _EvalSentencesDicts(self, input_dict_list, base):
    """Score every target word of the given utterances.

    Each input row supplies a token list; the ``<s>`` / ``</s>`` markers are
    dropped, the remaining tokens are joined with spaces and fed to the
    dataset.  For every batch the first target word and its negative log
    probability (converted to logarithm base ``base``) are recorded.  A dummy
    ``<S>`` row with ``token_id`` 0 and a NaN ``log_prob`` is placed at the
    start of every utterance.

    Args:
      input_dict_list: list of dictionaries, each with the keys
        ``utterance_list`` (list of tokens) and ``utterance_id``.
      base: Logarithm base used for the reported ``log_prob`` values.

    Returns:
      A pandas DataFrame with columns ``token_id``, ``word``, ``log_prob``
      and ``utterance_id``: the concatenation of one frame per utterance.
      Empty (but with those columns) when ``input_dict_list`` is empty.
    """
    print('Evaluating sentences')
    start_time = time.time()
    current_step = self.t['global_step'].eval(session=self.sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    # instantiate a dataset generator
    dataset = data_utils.LM1BDataset(vocab=self.vocab)

    # log_base(10 ** x) == x * log_base(10).  Using the identity avoids
    # exponentiating first, which overflows or loses precision for words
    # with a very small probability.
    log10_to_base = math.log(10., base)

    # Fetch both outputs in one call so the graph runs once per batch.
    # NOTE(review): if the graph keeps recurrent state between runs (not
    # visible here), two separate runs would also advance that state twice.
    fetches = [self.t['log_perplexity_out'], self.t['softmax_out']]

    utterance_dfs = []
    for input_row in input_dict_list:
        # set the sentence first
        # !!! drop the eos and bos
        eval_utterance = [
            x for x in input_row['utterance_list']
            if x not in ('<s>', '</s>')
        ]
        eval_utterance_string = ' '.join(eval_utterance)
        print('Evaluating:')
        print(eval_utterance_string)
        dataset.sentence = eval_utterance_string

        # then the call to batch with method "list" converts the sentence
        # object
        data_gen = dataset.get_batch(self.BATCH_SIZE,
                                     self.NUM_TIMESTEPS,
                                     method='list',
                                     forever=False)

        # append a dummy start of sentence
        word_by_word = [{'token_id': 0, 'word': '<S>', 'log_prob': np.nan}]

        for i, (inputs, char_inputs, _, targets,
                weights) in enumerate(data_gen):
            input_dict = {
                self.t['inputs_in']: inputs,
                self.t['targets_in']: targets,
                self.t['target_weights_in']: weights
            }
            if 'char_inputs_in' in self.t:
                input_dict[self.t['char_inputs_in']] = char_inputs

            unused_log_perp, softmax = self.sess.run(fetches,
                                                     feed_dict=input_dict)

            target_id = targets[0][0]
            log10_probability = -1 * np.log10(
                softmax[0, target_id] / np.sum(softmax))
            word = self.vocab.id_to_word(target_id)
            sys.stderr.write(word + ' ' + str(log10_probability) + '\n')
            word_by_word.append({
                'token_id': i + 1,
                'word': word,
                'log_prob': float(log10_probability) * log10_to_base
            })

        utterance_df = pd.DataFrame(word_by_word)
        utterance_df['utterance_id'] = input_row['utterance_id']
        # The logged value is the sum of the per-word log_prob entries.
        sys.stderr.write('Sentence perplexity: %s\n' %
                         str(np.sum(utterance_df.log_prob)))
        utterance_dfs.append(utterance_df)
        sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))

    if not utterance_dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(
            columns=['token_id', 'word', 'log_prob', 'utterance_id'])

    return pd.concat(utterance_dfs)
def _EvalSentences(vocab):
    """Score the target words of every ``*.txt`` file in ``FLAGS.eval_dir``.

    The model is loaded once.  For each file, batches are drawn from an
    ``LM1BDataset`` built from it; for every batch the first target word and
    its ``-log10`` probability (taken from the softmax output, normalised by
    the softmax sum) are recorded and echoed to stderr.  The per-file results
    are written as CSV, with columns ``prob`` and ``word``, next to the input
    file with its extension replaced by ``.out``.

    Args:
      vocab: Vocabulary object passed to ``data_utils.LM1BDataset`` and used
        to map target ids back to words.
    """
    sess, t = _LoadModel(FLAGS.pbtxt, FLAGS.ckpt)
    current_step = t['global_step'].eval(session=sess)
    sys.stderr.write('Loaded step %d.\n' % current_step)

    # Fetch both outputs in one call so the graph runs once per batch.
    # NOTE(review): if the graph keeps recurrent state between runs (not
    # visible here), two separate runs would also advance that state twice.
    fetches = [t['log_perplexity_out'], t['softmax_out']]

    # Sorted so files are processed in a reproducible order.
    eval_files = sorted(glob.glob(os.path.join(FLAGS.eval_dir, '*.txt')))
    for eval_file in eval_files:
        print('Evaluating ' + eval_file + '...')
        start_time = time.time()
        dataset = data_utils.LM1BDataset(eval_file, vocab)
        data_gen = dataset.get_batch(BATCH_SIZE, NUM_TIMESTEPS, forever=False)

        word_probabilities = []
        words = []
        for inputs, char_inputs, _, targets, weights in data_gen:
            input_dict = {
                t['inputs_in']: inputs,
                t['targets_in']: targets,
                t['target_weights_in']: weights
            }
            if 'char_inputs_in' in t:
                input_dict[t['char_inputs_in']] = char_inputs

            unused_log_perp, softmax = sess.run(fetches, feed_dict=input_dict)

            target_id = targets[0][0]
            log10_probability = -1 * np.log10(
                softmax[0, target_id] / np.sum(softmax))
            word = vocab.id_to_word(target_id)
            sys.stderr.write(word + ' ' + str(log10_probability) + '\n')
            words.append(word)
            word_probabilities.append(log10_probability)

        # The logged value is the sum of the per-word -log10 probabilities.
        sys.stderr.write('Sentence perplexity: %s\n' %
                         str(np.sum(word_probabilities)))
        sys.stderr.write('Elapsed: %s\n' % str(time.time() - start_time))

        rdf = pd.DataFrame({'prob': word_probabilities, 'word': words})
        # Swap only the final extension.  str.replace would also rewrite any
        # '.txt' inside directory names, and would leave a '.TXT' path
        # unchanged so that the output overwrote the input.
        rdf.to_csv(os.path.splitext(eval_file)[0] + '.out')