def create_merged_ngram_dictionaries(indices, n):
  """Generate a single dictionary for the full batch.

  Args:
    indices: List of lists of indices.
    n: Degree of n-grams.

  Returns:
    Dictionary of hashed(n-gram tuples) to counts in the batch of indices.
  """
  ngram_dicts = []

  # Count n-grams for each sequence in the batch.
  for ind in indices:
    ngrams = n_gram.find_all_ngrams(ind, n=n)
    ngram_counts = n_gram.construct_ngrams_dict(ngrams)
    ngram_dicts.append(ngram_counts)

  # Merge the per-sequence counts into a single Counter.
  merged_gen_dict = Counter()
  for ngram_dict in ngram_dicts:
    merged_gen_dict += Counter(ngram_dict)
  return merged_gen_dict
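
# Hedged usage sketch (illustration only; not called anywhere in this file).
# It merges per-sequence bigram counts for a tiny hypothetical batch. The key
# format of the returned dictionary is whatever n_gram.construct_ngrams_dict
# produces (hashed n-gram tuples per the docstring above).
def _example_merged_bigram_counts():
  batch = [[1, 2, 3, 2, 3], [2, 3, 4]]  # hypothetical index sequences
  merged = create_merged_ngram_dictionaries(batch, n=2)
  # The bigram (2, 3) occurs twice in the first sequence and once in the
  # second, so its merged count is 3.
  return merged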
def main(_):
  hparams = create_hparams()
  train_dir = FLAGS.base_directory + '/train'

  # Load data set.
  if FLAGS.data_set == 'ptb':
    raw_data = ptb_loader.ptb_raw_data(FLAGS.data_dir)
    train_data, valid_data, test_data, _ = raw_data
    valid_data_flat = valid_data
  elif FLAGS.data_set == 'imdb':
    raw_data = imdb_loader.imdb_raw_data(FLAGS.data_dir)
    # TODO(liamfedus): Get an IMDB test partition.
    train_data, valid_data = raw_data
    valid_data_flat = [word for review in valid_data for word in review]
  else:
    raise NotImplementedError

  if FLAGS.mode == MODE_TRAIN or FLAGS.mode == MODE_TRAIN_EVAL:
    data_set = train_data
  elif FLAGS.mode == MODE_VALIDATION:
    data_set = valid_data
  elif FLAGS.mode == MODE_TEST:
    data_set = test_data
  else:
    raise NotImplementedError

  # Dictionary and reverse dictionary.
  if FLAGS.data_set == 'ptb':
    word_to_id = ptb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'ptb.train.txt'))
  elif FLAGS.data_set == 'imdb':
    word_to_id = imdb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'vocab.txt'))
  id_to_word = {v: k for k, v in word_to_id.items()}

  # Dictionary of validation-set n-gram counts.
  bigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=2)
  trigram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=3)
  fourgram_tuples = n_gram.find_all_ngrams(valid_data_flat, n=4)

  bigram_counts = n_gram.construct_ngrams_dict(bigram_tuples)
  trigram_counts = n_gram.construct_ngrams_dict(trigram_tuples)
  fourgram_counts = n_gram.construct_ngrams_dict(fourgram_tuples)
  print('Unique %d-grams: %d' % (2, len(bigram_counts)))
  print('Unique %d-grams: %d' % (3, len(trigram_counts)))
  print('Unique %d-grams: %d' % (4, len(fourgram_counts)))

  data_ngram_counts = {
      '2': bigram_counts,
      '3': trigram_counts,
      '4': fourgram_counts
  }

  # TODO(liamfedus): This was necessary because there was a problem with our
  # originally trained IMDB models.  The EOS_INDEX was off by one, which means
  # two words were mapping to index 86933.  The presence of '</s>' would throw
  # an out-of-vocabulary error.
  FLAGS.vocab_size = len(id_to_word)
  print('Vocab size: %d' % FLAGS.vocab_size)

  tf.gfile.MakeDirs(FLAGS.base_directory)

  if FLAGS.mode == MODE_TRAIN:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train-log.txt'), mode='w')
  elif FLAGS.mode == MODE_VALIDATION:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'validation-log.txt'), mode='w')
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'train_eval-log.txt'), mode='w')
  else:
    log = tf.gfile.GFile(
        os.path.join(FLAGS.base_directory, 'test-log.txt'), mode='w')

  if FLAGS.mode == MODE_TRAIN:
    train_model(hparams, data_set, train_dir, log, id_to_word,
                data_ngram_counts)
  elif FLAGS.mode == MODE_VALIDATION:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TRAIN_EVAL:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  elif FLAGS.mode == MODE_TEST:
    evaluate_model(hparams, data_set, train_dir, log, id_to_word,
                   data_ngram_counts)
  else:
    raise NotImplementedError
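
# Example invocation (a hedged sketch only; the script name below is
# hypothetical, and the valid --mode values depend on how MODE_TRAIN,
# MODE_VALIDATION, MODE_TRAIN_EVAL and MODE_TEST are defined elsewhere in
# this file):
#
#   python train.py --data_set=ptb --data_dir=/path/to/ptb \
#       --base_directory=/tmp/run --mode=<MODE_TRAIN value>
#
# With data_set='ptb' the loader expects ptb.train.txt under data_dir; with
# data_set='imdb' it expects vocab.txt (see the build_vocab calls above).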