import numpy as np
import tensorflow as tf
from pytest import approx

# MockVocab and LmInputDataPipeline are the project's test vocabulary and the
# input pipeline under test; input_generator is assumed to come from the
# enclosing test scope (see test_load_no_batching below for an example).


def input_fn():
    vocab = MockVocab()
    # batch_size=None: batching is applied explicitly via padded_batch below.
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_dataset = tf.data.Dataset.from_generator(input_generator, output_types=tf.string)
    corpus = input_pipeline.load_data(input_dataset).repeat()
    corpus = input_pipeline.padded_batch(corpus, 3)
    return corpus

def test_load_no_batching():
    def input_generator():
        yield ["a", "b", "c"]
        yield ["c", "b"]

    expected_output = [
        (
            {
                # Row 0 is the start-of-sequence marker (one-hot in the first
                # three columns); the remaining rows carry the mock embeddings
                # of "a", "b", "c" in the last three columns.
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 1.5, 2.5, 3.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5]],
                    dtype=np.float32),
                "length": np.array(4, dtype=np.int32),
            },
            # Targets are the token ids, closed with id 2 (end of sequence).
            {"targets": np.array([4, 5, 6, 2], dtype=np.int32)},
        ),
        (
            {
                "inputs": np.array(
                    [[0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                     [0.0, 0.0, 0.0, 7.5, 8.5, 9.5],
                     [0.0, 0.0, 0.0, 4.5, 5.5, 6.5]],
                    dtype=np.float32),
                "length": np.array(3, dtype=np.int32),
            },
            {"targets": np.array([6, 5, 2], dtype=np.int32)},
        ),
    ]

    input_dataset = tf.data.Dataset.from_generator(input_generator, output_types=tf.string)
    vocab = MockVocab()
    input_pipeline = LmInputDataPipeline(vocab, batch_size=None)
    input_data = input_pipeline.load_data(input_dataset)

    it = input_data.make_initializable_iterator()
    example = it.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(it.initializer)
        for expected in expected_output:
            actual = sess.run(example)
            assert actual[0]["inputs"] == approx(expected[0]["inputs"])
            assert actual[0]["length"] == approx(expected[0]["length"])
            assert actual[1]["targets"] == approx(expected[1]["targets"])

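# A minimal sketch of what MockVocab could provide, reverse-engineered from
# the expected values above; the real helper lives in the project's test
# code, and the names below (embeddings, token_to_id) are assumptions, not
# the actual interface. The first three input columns look like one-hot
# flags for special tokens (the start marker sets column 1), the last three
# carry the word embedding, and targets end with id 2, presumably <eos>;
# known words get ids from 4 upwards.
class MockVocabSketch:
    # 3-dimensional mock embeddings for "a", "b", "c" (ids 4, 5, 6).
    embeddings = np.array([[1.5, 2.5, 3.5],
                           [4.5, 5.5, 6.5],
                           [7.5, 8.5, 9.5]], dtype=np.float32)

    _ids = {"<pad>": 0, "<start>": 1, "<eos>": 2, "<unk>": 3,
            "a": 4, "b": 5, "c": 6}

    def token_to_id(self, token):
        # Unknown tokens fall back to <unk>; the real pipeline presumably
        # wraps this mapping in a tf.lookup table, which is why the test
        # above runs tf.tables_initializer().
        return self._ids.get(token, self._ids["<unk>"])
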
def create_input():
    # glove is assumed to be a GloVe-based vocabulary defined in the
    # enclosing scope; batching (size 8) is handled inside the pipeline.
    simple_examples = SimpleExamplesCorpus()
    train_data = simple_examples.get_tokens_dataset(DatasetType.TRAIN).repeat().shuffle(1000, seed=0)
    input_pipe = LmInputDataPipeline(glove, 8)
    return input_pipe.load_data(train_data)

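# A quick way to eyeball what create_input() produces, assuming the returned
# tf.data.Dataset yields (features, labels) tuples with the same keys as in
# test_load_no_batching; glove must already be defined in scope.
def inspect_one_batch():
    dataset = create_input()
    it = dataset.make_initializable_iterator()
    batch = it.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(it.initializer)
        features, labels = sess.run(batch)
        # With batch size 8, inputs should be [8, max_len, input_dim].
        print(features["inputs"].shape, features["length"], labels["targets"])
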
def input_fn():
    vocab = MockVocab()
    # Unlike the first input_fn, batching is delegated to the pipeline here
    # via batch_size=3 rather than an explicit padded_batch call.
    input_pipeline = LmInputDataPipeline(vocab, batch_size=3)
    input_dataset = tf.data.Dataset.from_generator(input_generator, output_types=tf.string)
    return input_pipeline.load_data(input_dataset).repeat()

def create_input():
    # Unbatched variant: batch_size=None leaves batching to the caller.
    simple_examples = SimpleExamplesCorpus()
    train_data = simple_examples.get_tokens_dataset(DatasetType.TRAIN)
    input_pipe = LmInputDataPipeline(glove, None)
    return input_pipe.load_data(train_data)
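

# For context, a sketch of how these input functions would typically be
# consumed in a TF 1.x tf.estimator training loop; model_fn and model_dir
# are hypothetical placeholders, not part of the project.
def train_sketch(model_fn):
    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="/tmp/lm_model")
    estimator.train(input_fn=input_fn, steps=1000)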