def input_fn():
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    corpus = input_pipeline.load_data(input_dataset).repeat()
    corpus = input_pipeline.padded_batch(corpus, 3)
    return corpus
def model_fn(features, labels, mode, params):
    vocab_copy = MockVocab()
    input_pipeline_copy = LmInputDataPipeline(vocab_copy)
    return get_autoregressor_model_fn(
        vocab_size,
        input_pipeline_copy.get_id_to_embedding_mapping())(features, labels,
                                                           mode, params)
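# A minimal sketch of how the input_fn and model_fn above could be wired into
# a TF 1.x Estimator. The model_dir path and step count are hypothetical, and
# vocab_size is assumed to come from the enclosing scope, as in model_fn.
def train_with_estimator():
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir="/tmp/lm_model")  # hypothetical checkpoint directory
    estimator.train(input_fn=input_fn, steps=100)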
def test_load_no_batching():
    def input_generator():
        yield ["a", "b", "c"]
        yield ["c", "b"]

    expected_output = [
        (
            {
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 1.5, 2.5, 3.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5]],
                    dtype=np.float32),
                "length": np.array(4, dtype=np.int32),
            },
            {"targets": np.array([4, 5, 6, 2], dtype=np.int32)},
        ),
        (
            {
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5]],
                    dtype=np.float32),
                "length": np.array(3, dtype=np.int32),
            },
            {"targets": np.array([6, 5, 2], dtype=np.int32)},
        ),
    ]
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_data = input_pipeline.load_data(input_dataset)
    it = input_data.make_initializable_iterator()
    example = it.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(it.initializer)
        for expected in expected_output:
            actual = sess.run(example)
            assert actual[0]["inputs"] == approx(expected[0]["inputs"])
            assert actual[0]["length"] == approx(expected[0]["length"])
            assert actual[1]["targets"] == approx(expected[1]["targets"])
def create_input():
    input_pipe = LmInputDataPipeline(glove, 5)
    embedding_size = LmInputDataPipeline(
        glove, None)._vocab_generalized.vector_size()
    train_data = read_dataset_from_dir(data_dir, DatasetType.TRAIN,
                                       embedding_size)
    train_data = train_data.repeat().shuffle(1000, seed=0)
    train_data = input_pipe.padded_batch(train_data, BATCH_SIZE)
    return train_data
def create_input():
    input_pipe = LmInputDataPipeline(glove, 5)
    embedding_size = LmInputDataPipeline(
        glove, None)._vocab_generalized.vector_size()
    train_data = read_dataset_from_dir(data_dir, subset, embedding_size)
    if take_first_n is not None:
        train_data = train_data.take(take_first_n)
    train_data = input_pipe.padded_batch(train_data, BATCH_SIZE)
    return train_data
def disambiguation_preprocessing(inputs):
    glove = Glove300()
    input_pipe = LmInputDataPipeline(glove)
    t_words = tf.placeholder(dtype=tf.string)
    t_vocab_ids = glove.word_to_id_op()(t_words)
    t_generalized_ids = input_pipe._vocab_generalized.vocab_id_to_generalized_id()(
        t_vocab_ids)
    # Collect the distinct candidate meanings; each token has the form
    # "meaning1^meaning2^...".
    meanings_all = set()
    for sentence in inputs:
        for word in sentence:
            for meaning in word.split("^"):
                meanings_all.add(meaning)
    meanings_all = list(meanings_all)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        glove.after_session_created_hook_fn(sess)
        ids_all = sess.run(t_generalized_ids,
                           feed_dict={t_words: meanings_all})
    mapping = {meaning: id for meaning, id in zip(meanings_all, ids_all)}
    sentences_as_ids = []
    for sentence in inputs:
        sentence_as_ids = []
        for word in sentence:
            allowables = [mapping[meaning] for meaning in word.split("^")]
            sentence_as_ids.append(allowables)
        sentences_as_ids.append(sentence_as_ids)
    return sentences_as_ids
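# A minimal usage sketch for disambiguation_preprocessing. The token format
# (candidate meanings joined by "^") follows the parsing above; the example
# sentence itself is hypothetical.
sentences = [["the", "bank^riverbank", "was", "steep"]]
sentences_as_ids = disambiguation_preprocessing(sentences)
# -> one list per sentence, one list of allowable generalized ids per token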
def model_function(features, labels, mode, params):
    input_pipe = LmInputDataPipeline(glove)
    vocab_size = glove.vocab_size()
    embedding_size = input_pipe._vocab_generalized.vector_size()
    # Embeddings are only looked up during prediction; training inputs already
    # carry embedding vectors, so ids map to zeros of shape
    # tf.shape(x) + [embedding_size].
    if mode == tf.estimator.ModeKeys.PREDICT:
        id_to_embedding_fn = input_pipe.get_id_to_embedding_mapping()
    else:
        def id_to_embedding_fn(x):
            return tf.zeros(
                tf.concat([tf.shape(x), [embedding_size]], axis=0),
                tf.float32)
    with tf.device("/device:CPU:0"):
        concrete_model_fn = get_autoregressor_model_fn(
            vocab_size,
            id_to_embedding_fn,
            time_major_optimization=True,
            predict_as_pure_lm=False,
            mask_allowables=input_sentence,
            hparams=hparams)
        estimator_spec = concrete_model_fn(features, labels, mode, params)
    training_hooks = []
    to_restore = tf.contrib.framework.get_variables_to_restore()
    predictions = estimator_spec.predictions
    if mode == tf.estimator.ModeKeys.PREDICT:
        training_hooks.append(InitializeVocabularyHook(glove))
        # Map predicted generalized ids back to vocabulary words.
        predicted_ids = tf.cast(predictions["paths"], dtype=tf.int64)
        words_shape = tf.shape(predicted_ids)
        to_vocab_id = input_pipe._vocab_generalized.generalized_id_to_vocab_id()
        to_word = glove.id_to_word_op()
        predicted_ids = tf.reshape(predicted_ids, shape=[-1])
        predicted_words = to_word(to_vocab_id(predicted_ids))
        predicted_words = tf.reshape(predicted_words, shape=words_shape)
        predictions["predicted_words"] = predicted_words
    if hparams.profiler:
        training_hooks.append(
            tf.train.ProfilerHook(output_dir=model_dir,
                                  save_secs=30,
                                  show_memory=True))
        training_hooks.append(FullLogHook())
    estimator_spec_with_hooks = tf.estimator.EstimatorSpec(
        mode=estimator_spec.mode,
        loss=estimator_spec.loss,
        train_op=estimator_spec.train_op,
        eval_metric_ops=estimator_spec.eval_metric_ops,
        predictions=estimator_spec.predictions,
        training_hooks=training_hooks)
    return estimator_spec_with_hooks
def model_function(features, labels, mode, params):
    input_pipe = LmInputDataPipeline(glove)
    vocab_size = glove.vocab_size()
    id_to_embedding_fn = input_pipe.get_id_to_embedding_mapping()
    with tf.device(device_assignment_function):
        concrete_model_fn = get_autoregressor_model_fn(
            vocab_size, id_to_embedding_fn)
        estimator_spec = concrete_model_fn(features, labels, mode, params)
    training_hooks = [InitializeVocabularyHook(glove)]
    estimator_spec_with_hooks = tf.estimator.EstimatorSpec(
        mode=estimator_spec.mode,
        loss=estimator_spec.loss,
        train_op=estimator_spec.train_op,
        eval_metric_ops=estimator_spec.eval_metric_ops,
        predictions=estimator_spec.predictions,
        training_hooks=training_hooks)
    return estimator_spec_with_hooks
def model_function(features, labels, mode, params):
    input_pipe = LmInputDataPipeline(glove)
    vocab_size = glove.vocab_size()
    embedding_size = input_pipe._vocab_generalized.vector_size()
    # Embeddings are only looked up during prediction; training inputs already
    # carry embedding vectors, so ids map to zeros of shape
    # tf.shape(x) + [embedding_size].
    if mode == tf.estimator.ModeKeys.PREDICT:
        id_to_embedding_fn = input_pipe.get_id_to_embedding_mapping()
    else:
        def id_to_embedding_fn(x):
            return tf.zeros(
                tf.concat([tf.shape(x), [embedding_size]], axis=0),
                tf.float32)
    # `without` is assumed to be a no-op context manager defined elsewhere in
    # the module.
    device_context = (tf.device(device_assignment_function)
                      if hparams.size_based_device_assignment else without)
    with device_context:
        concrete_model_fn = get_autoregressor_model_fn(
            vocab_size,
            id_to_embedding_fn,
            time_major_optimization=True,
            hparams=hparams)
        estimator_spec = concrete_model_fn(features, labels, mode, params)
    if hparams.write_target_text_to_summary:
        # Decode target ids back to words and log them to TensorBoard.
        words_shape = tf.shape(labels["targets"])
        to_vocab_id = input_pipe._vocab_generalized.generalized_id_to_vocab_id()
        to_word = glove.id_to_word_op()
        flat_targets = tf.reshape(labels["targets"], shape=[-1])
        flat_targets_words = to_word(to_vocab_id(flat_targets))
        targets_words = tf.reshape(flat_targets_words, shape=words_shape)
        tf.summary.text("targets_words", targets_words)
    training_hooks = []
    if mode == tf.estimator.ModeKeys.PREDICT:
        training_hooks.append(InitializeVocabularyHook(glove))
    if hparams.profiler:
        training_hooks.append(
            tf.train.ProfilerHook(output_dir=model_dir,
                                  save_secs=30,
                                  show_memory=True))
        training_hooks.append(FullLogHook())
    estimator_spec_with_hooks = tf.estimator.EstimatorSpec(
        mode=estimator_spec.mode,
        loss=estimator_spec.loss,
        train_op=estimator_spec.train_op,
        eval_metric_ops=estimator_spec.eval_metric_ops,
        predictions=estimator_spec.predictions,
        training_hooks=training_hooks)
    return estimator_spec_with_hooks
def data_gen():
    yield ({
        "inputs": np.array(
            [[LmInputDataPipeline(glove, None)._vocab_generalized
              .get_special_unit_id(SpecialUnit.START_OF_SEQUENCE)]],
            dtype=np.int32),
        "length": len(input_sentence)
    }, np.array([0]))
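# A minimal sketch of feeding data_gen to Estimator.predict (TF 1.x). Only
# data_gen itself comes from the snippet above; the output_types structure and
# the estimator variable are assumptions made for illustration.
def predict_input_fn():
    return tf.data.Dataset.from_generator(
        data_gen,
        output_types=({"inputs": tf.int32, "length": tf.int32}, tf.int32))

predictions = estimator.predict(input_fn=predict_input_fn)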
def input_fn():
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=3)
    input_dataset = tf.data.Dataset.from_generator(input_generator,
                                                   output_types=tf.string)
    return input_pipeline.load_data(input_dataset).repeat()
def create_input():
    simple_examples = SimpleExamplesCorpus()
    train_data = simple_examples.get_tokens_dataset(
        DatasetType.TRAIN).repeat().shuffle(1000, seed=0)
    input_pipe = LmInputDataPipeline(glove, 8)
    return input_pipe.load_data(train_data)
def create_input():
    simple_examples = SimpleExamplesCorpus()
    train_data = simple_examples.get_tokens_dataset(DatasetType.TRAIN)
    input_pipe = LmInputDataPipeline(glove, None)
    return input_pipe.load_data(train_data)
def prepare_training_dataset(output_path):
    """Transform the input corpus into language-model training examples with
    embedding vectors as inputs and save them to disk. Expect a HUGE dataset
    in terms of occupied space."""
    if TEST_SERIALIZATION:
        test_examples = []
    output_path = Path(output_path)
    glove = Glove300()

    def create_input():
        simple_examples = SimpleExamplesCorpus()
        train_data = simple_examples.get_tokens_dataset(DatasetType.TRAIN)
        input_pipe = LmInputDataPipeline(glove, None)
        return input_pipe.load_data(train_data)

    dataset = create_input()

    def make_tf_record_example(features, labels) -> tf.train.Example:
        feature_inputs = tf.train.Feature(float_list=tf.train.FloatList(
            value=features["inputs"].reshape(-1)))
        feature_length = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[features["length"]]))
        feature_targets = tf.train.Feature(int64_list=tf.train.Int64List(
            value=labels["targets"]))
        feature_dict = {
            "inputs": feature_inputs,
            "length": feature_length,
            "targets": feature_targets
        }
        features = tf.train.Features(feature=feature_dict)
        example = tf.train.Example(features=features)
        return example

    def max_length_condition(max_length):
        def check_length(features, labels):
            return tf.less_equal(features["length"], max_length)
        return check_length

    dataset = dataset.filter(max_length_condition(40))
    it = dataset.make_initializable_iterator()
    next_example = it.get_next()

    EXAMPLES_PER_FILE = 2000

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        glove.initialize_embeddings_in_graph(tf.get_default_graph(), sess)
        sess.run(it.initializer)
        for i in count(1):
            dataset_filename = str(output_path /
                                   "train.{:0=10}.tfrecords".format(i))
            writer = tf.python_io.TFRecordWriter(dataset_filename)
            try:
                for _ in range(EXAMPLES_PER_FILE):
                    features, labels = sess.run(next_example)
                    if TEST_SERIALIZATION:
                        test_examples.append((features, labels))
                    example = make_tf_record_example(features, labels)
                    writer.write(example.SerializeToString())
            except tf.errors.OutOfRangeError:
                break
            finally:
                writer.close()

    if TEST_SERIALIZATION:
        # Read the last shard back and check that examples round-trip.
        embedding_size = LmInputDataPipeline(
            glove, None)._vocab_generalized.vector_size()
        records_dataset = read_dataset_from_files(
            [dataset_filename], embedding_size=embedding_size)
        it = records_dataset.make_initializable_iterator()
        next_record = it.get_next()
        with tf.Session() as sess:
            sess.run(it.initializer)
            for expected_features, expected_labels in test_examples:
                actual_features, actual_labels = sess.run(next_record)
                assert (actual_features["inputs"] ==
                        expected_features["inputs"]).all()
                assert (actual_features["length"] ==
                        expected_features["length"]).all()
                assert actual_labels["targets"] == approx(
                    expected_labels["targets"])
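# A minimal end-to-end sketch tying the pieces together: write the TFRecord
# shards once, then build a padded training pipeline from them, as in the
# create_input variants above. The directory path and BATCH_SIZE value are
# hypothetical; read_dataset_from_dir is called as in the earlier snippets.
DATA_DIR = "/tmp/lm_tfrecords"
BATCH_SIZE = 20

prepare_training_dataset(DATA_DIR)

glove = Glove300()
embedding_size = LmInputDataPipeline(
    glove, None)._vocab_generalized.vector_size()
train_data = read_dataset_from_dir(DATA_DIR, DatasetType.TRAIN, embedding_size)
train_data = LmInputDataPipeline(glove, 5).padded_batch(train_data, BATCH_SIZE)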