def test_ngram_delta_threshold_tf(dataset_0_3_8_10_11_20_23):
    """Verify delta-threshold handling of ngrams read through ``tf_tensors``.

    The threshold is checked both inside a single ngram and between consecutive ngrams of one
    partition: the timesteps of an ngram must not be more than ``delta_threshold`` apart in the
    field named by ``timestamp_field``.
    """
    dataset = dataset_0_3_8_10_11_20_23
    ngram = NGram(
        fields={
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        },
        delta_threshold=4,
        timestamp_field=TestSchema.id)

    with Reader(schema_fields=ngram,
                dataset_url=dataset.url,
                reader_pool=DummyPool(),
                shuffle_options=ShuffleOptions(False)) as reader:
        with tf.Session() as sess:
            # Ngrams expected: (0, 3), (8, 10), (10, 11). The reference ngram for each of them is
            # requested from _get_named_tuple_from_ngram with 0, 3 and 5 respectively.
            for reference_index in (0, 3, 5):
                ngram_tensors = tf_tensors(reader)

                # Static shape information must be present for every field of every timestep
                for offset in ngram_tensors:
                    for tensor in ngram_tensors[offset]:
                        assert tensor.get_shape().dims is not None

                actual = sess.run(ngram_tensors)
                expected = _get_named_tuple_from_ngram(ngram, dataset.data, reference_index)
                _assert_equal_ngram(actual, expected)

            # No further ngram satisfies the threshold, so the next read must fail
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))
def _test_continuous_ngram_tf(ngram_fields, dataset_num_files_1, reader_factory):
    """Read five ngrams through TF and compare each one with the reference data.

    "Continuous" means the reader is expected to return consecutive ngrams, because row-group
    shuffling is disabled and the dataset has a single partition.

    :param ngram_fields: Mapping passed as ``fields`` to ``NGram``.
    :param dataset_num_files_1: Dataset fixture exposing ``url`` and ``data``.
    :param reader_factory: Callable creating the reader under test.
    """
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)

    with reader_factory(dataset_num_files_1.url, schema_fields=ngram, shuffle_row_groups=False) as reader:
        ngram_tensors = tf_tensors(reader)

        # Every field of every timestep must carry static shape information
        for offset in ngram_tensors:
            for tensor in ngram_tensors[offset]:
                assert tensor.get_shape().dims is not None

        # Read a few ngrams and compare each with the reference ngram for ids 0, 1, 2, ...
        with tf.Session() as sess:
            for expected_id in range(5):
                actual = sess.run(ngram_tensors)
                expected = _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data, expected_id)
                _assert_equal_ngram(actual, expected)
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset, reader_factory):
    """Read five ngrams through TF when consecutive ordering is not guaranteed.

    "Non continuous" means the reader will not necessarily return consecutive ngrams, because
    there is more than one partition and shuffling is enabled. The reference ngram is therefore
    looked up by the id found in the earliest timestep of each result.

    :param ngram_fields: Mapping passed as ``fields`` to ``NGram``.
    :param synthetic_dataset: Dataset fixture exposing ``url`` and ``data``.
    :param reader_factory: Callable creating the reader under test.
    """
    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)

    # The context manager releases the reader even when one of the assertions below fails
    with reader_factory(synthetic_dataset.url, schema_fields=ngram) as reader:
        readout_examples = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for timestep in readout_examples:
            for field in readout_examples[timestep]:
                assert field.get_shape().dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            for _ in range(5):
                actual = sess.run(readout_examples)
                expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts, actual[min(actual.keys())].id)
                _assert_equal_ngram(actual, expected_ngram)
def _read_from_tf_tensors(synthetic_dataset, count, shuffling_queue_capacity, min_after_dequeue, ngram):
    """Read ``count`` rows through ``tf_tensors``; shared by several test cases.

    The reader is created with row-group shuffling disabled, so the only reordering of rows comes
    from the TF shuffling queue configured by ``shuffling_queue_capacity`` and
    ``min_after_dequeue``.

    :param synthetic_dataset: Dataset fixture exposing ``url``.
    :param count: Number of reads to perform.
    :param shuffling_queue_capacity: Forwarded to ``tf_tensors``.
    :param min_after_dequeue: Forwarded to ``tf_tensors``.
    :param ngram: Schema fields to read; ``NON_NULLABLE_FIELDS`` is used when ``None``.
    :return: Tuple of (list of results read from the dataset, the tensors returned by ``tf_tensors``).
    """
    if ngram is None:
        schema_fields = NON_NULLABLE_FIELDS
    else:
        schema_fields = ngram

    with make_reader(schema_fields=schema_fields,
                     dataset_url=synthetic_dataset.url,
                     reader_pool_type='dummy',
                     shuffle_row_groups=False) as reader:
        row_tensors = tf_tensors(reader,
                                 shuffling_queue_capacity=shuffling_queue_capacity,
                                 min_after_dequeue=min_after_dequeue)

        rows_data = []
        with _tf_session() as sess:
            for _ in range(count):
                rows_data.append(sess.run(row_tensors))

    return rows_data, row_tensors
def test_ngram_delta_small_threshold_tf():
    """Verify that a small delta threshold works with ngrams.

    The dataset ids are 5 apart while ``delta_threshold`` is 1, so the very first read is expected
    to raise ``OutOfRangeError``.
    """
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)

        # The context manager releases the reader even if the expected error is not raised
        with Reader(schema_fields=ngram, dataset_url=tmp_url, reader_pool=DummyPool()) as reader:
            with tf.Session() as sess:
                with pytest.raises(OutOfRangeError):
                    sess.run(tf_tensors(reader))
def test_ngram_mix(synthetic_dataset):
    """Mix two identically configured ngram readers and check each sample has timesteps -1 and 0."""
    ts_field = '^id$'
    ngram_fields = {
        -1: ['id'],
        0: ['id', 'image_png'],
    }

    # Two separate NGram instances built from the same field specification
    ngrams = [NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=ts_field) for _ in range(2)]
    readers = [make_reader(synthetic_dataset.url, schema_fields=ngram, workers_count=1) for ngram in ngrams]

    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        mixed_tensors = tf_tensors(mixer)

        with tf.Session() as sess:
            for _ in range(10):
                sample = sess.run(mixed_tensors)
                assert set(sample.keys()) == {-1, 0}
def test_with_tf_tensors(synthetic_dataset):
    """Check that one sample can be read through ``tf_tensors`` from a ``WeightedSamplingReader``."""
    fields_to_read = ['id.*', 'image_png']

    # Two readers over the same dataset, sampled with equal probability
    readers = [
        make_reader(synthetic_dataset.url, schema_fields=fields_to_read, workers_count=1)
        for _ in range(2)
    ]

    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        mixed_tensors = tf_tensors(mixer)

        with tf.Session() as sess:
            sess.run(mixed_tensors)
def test_simple_read_tensorflow(synthetic_dataset):
    """Read 30 rows through TF, then check static tensor shapes and compare rows with the reference data."""
    rows_data = []

    with make_reader(schema_fields=NON_NULLABLE_FIELDS, dataset_url=synthetic_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        with _tf_session() as sess:
            for _ in range(30):
                rows_data.append(sess.run(row_tensors))

    # Static shape information has to be available for all fields
    _assert_all_tensors_have_shape(row_tensors)
    _assert_expected_rows_data(synthetic_dataset.data, rows_data)
def test_ngram_delta_small_threshold_tf(reader_factory, dataset_range_0_99_5):
    """Verify that a small delta threshold works with ngrams.

    With ``delta_threshold=1`` the first read through TF is expected to raise
    ``tf.errors.OutOfRangeError``.
    """
    ngram = NGram(
        fields={
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        },
        delta_threshold=1,
        timestamp_field=TestSchema.id)

    with reader_factory(dataset_range_0_99_5.url, schema_fields=ngram) as reader, tf.Session() as sess:
        with pytest.raises(tf.errors.OutOfRangeError):
            sess.run(tf_tensors(reader))
def test_simple_read_tensorflow_with_non_petastorm_many_columns_dataset(many_columns_non_petastorm_dataset):
    """Read one batch through TF and check tensor shapes and the set of returned column names."""
    dataset = many_columns_non_petastorm_dataset

    with make_batch_reader(dataset_url_or_urls=dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Each column tensor must have a static shape of exactly [None]
        for column_tensor in row_tensors:
            assert column_tensor.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()
            actual_columns = set(batch.keys())
            expected_columns = set(dataset.data[0].keys())
            assert actual_columns == expected_columns
def test_ngram_length_1_tf(synthetic_dataset, reader_factory):
    """Verify that ngrams generalize to length 1.

    Ten single-timestep ngrams are read through TF with row-group shuffling enabled; each is
    compared with the reference ngram looked up by the id found in the result.
    """
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)

    # The context manager releases the reader even when an assertion below fails
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_row_groups=True, shuffle_row_drop_partitions=5) as reader:
        # Build the readout tensors once; re-creating them on every iteration would keep adding
        # operations to the TF graph
        readout = tf_tensors(reader)

        with tf.Session() as sess:
            for _ in range(10):
                actual = sess.run(readout)
                expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts, actual[min(actual.keys())].id)
                _assert_equal_ngram(actual, expected_ngram)
def tensorflow_hello_world(dataset_url='file:///tmp/hello_world_dataset'):
    """Print the ``id`` of one sample read via ``tf_tensors`` and of one read via the ``tf.data.Dataset`` API.

    :param dataset_url: URL of the dataset to read.
    """
    # Example: tf_tensors will return tensors with dataset data
    with make_reader(dataset_url) as reader:
        row_tensor = tf_tensors(reader)
        with tf.Session() as sess:
            row = sess.run(row_tensor)
            print(row.id)

    # Example: use tf.data.Dataset API
    with make_reader(dataset_url) as reader:
        row_tensor = make_petastorm_dataset(reader).make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            row = sess.run(row_tensor)
            print(row.id)
def test_simple_read_tensorflow_with_parquet_dataset(scalar_dataset):
    """Read two batches through TF; check tensor shapes and compare every value with the reference rows."""
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)

        # Each column tensor must have a static shape of exactly [None]
        for column_tensor in row_tensors:
            assert column_tensor.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()

                for row_index, row_id in enumerate(batch['id']):
                    # Locate the reference row with the same id as the row read from the batch
                    reference_row = next(row for row in scalar_dataset.data if row['id'] == row_id)
                    for field_name, expected_value in reference_row.items():
                        _assert_fields_eq(batch[field_name][row_index], expected_value)
def _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, shuffling_queue_size,
                             min_after_dequeue):
    """Run ``_time_warmup_and_work`` where each unit of work evaluates the reader's TF tensors.

    :param reader: Reader passed to ``tf_tensors`` and to ``_time_warmup_and_work``.
    :param warmup_cycles_count: Forwarded to ``_time_warmup_and_work``.
    :param measure_cycles_count: Forwarded to ``_time_warmup_and_work``.
    :param shuffling_queue_size: Second positional argument of ``tf_tensors``.
    :param min_after_dequeue: Third positional argument of ``tf_tensors``.
    :return: Whatever ``_time_warmup_and_work`` returns.
    """
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

        readout_tensors = tf_tensors(reader, shuffling_queue_size, min_after_dequeue)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, start=True, sess=sess)

        try:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count,
                                           lambda: sess.run(readout_tensors))
        finally:
            # Stop the queue-runner threads even when the timed work raises, so they do not
            # outlive the session
            coord.request_stop()
            coord.join(threads)

    return result
def _read_from_tf_tensors(self, count, shuffling_queue_capacity, min_after_dequeue, ngram):
    """Read ``count`` rows through ``tf_tensors``; used by several test cases.

    The reader is created with ``shuffle=False``, so the order of rows is deterministic up to the
    TF shuffling queue, which is controlled by ``shuffling_queue_capacity`` and
    ``min_after_dequeue``.

    :param count: Number of reads to perform.
    :param shuffling_queue_capacity: Forwarded to ``tf_tensors``.
    :param min_after_dequeue: Forwarded to ``tf_tensors``.
    :param ngram: Schema fields to read; when ``None`` all non-nullable ``TestSchema`` fields are read.
    :return: Tuple of (actual data read from the dataset, the TF tensors returned by ``tf_tensors``).
    """
    # Nullable fields can not be read by tensorflow (what would be the dimension of a tensor for null data?)
    fields = set(TestSchema.fields.values()) - {TestSchema.matrix_nullable, TestSchema.string_array_nullable}
    schema_fields = (fields if ngram is None else ngram)

    reader = Reader(schema_fields=schema_fields, dataset_url=self._dataset_url, reader_pool=DummyPool(),
                    shuffle=False)
    try:
        row_tensors = tf_tensors(reader, shuffling_queue_capacity=shuffling_queue_capacity,
                                 min_after_dequeue=min_after_dequeue)

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, start=True)
            try:
                # Collect all the data we need from 'count' number of reads
                rows_data = [sess.run(row_tensors) for _ in range(count)]
            finally:
                # Queue-runner threads must be stopped even when a read fails
                coord.request_stop()
                coord.join(threads)
    finally:
        # Release the reader on every exit path, not only after a successful read
        reader.stop()
        reader.join()

    return rows_data, row_tensors
def tensorflow_hello_world(dataset_url='file:///tmp/external_dataset'):
    """Print the ``id`` batch of one read via ``tf_tensors`` and of one read via the ``tf.data.Dataset`` API.

    :param dataset_url: URL of the dataset to read.
    """
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_reader(dataset_url) as reader:
        batch_tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_reader(), each read returns a batch of rows instead of a single row
            batch = sess.run(batch_tensor)
            print("id batch: {0}".format(batch.id))

    # Example: use tf.data.Dataset API
    with make_batch_reader(dataset_url) as reader:
        batch_tensor = make_petastorm_dataset(reader).make_one_shot_iterator().get_next()
        with tf.Session() as sess:
            batch = sess.run(batch_tensor)
            print("id batch: {0}".format(batch.id))
def test_transform_function_new_field(synthetic_dataset):
    """Check that a transform replacing ``matrix`` with a new ``double_matrix`` field is visible through TF."""

    def double_matrix(row):
        # Add the doubled field and drop the field it was derived from
        row['double_matrix'] = row['matrix'] * 2
        del row['matrix']
        return row

    # Declares the new field and lists 'matrix' as removed
    spec = TransformSpec(double_matrix, [('double_matrix', np.float32, (32, 16, 3), False)], ['matrix'])

    with make_reader(synthetic_dataset.url,
                     reader_pool_type='dummy',
                     schema_fields=[TestSchema.id, TestSchema.matrix],
                     transform_spec=spec) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

    # Compare with the doubled matrix of the reference row that has the same id
    reference_row = next(row for row in synthetic_dataset.data if row['id'] == actual.id)
    np.testing.assert_equal(reference_row['matrix'] * 2, actual.double_matrix)
def test_simple_read_tf(self):
    """Check dtype and rank of every reader tensor against ``TestSchema``, then run ten reads."""
    try:
        reader_tensors = tf_tensors(self.reader)._asdict()

        for schema_field in TestSchema.fields.values():
            tensor = reader_tensors[schema_field.name]
            self.assertEqual(tensor.dtype, _numpy_to_tf_dtypes(schema_field.numpy_dtype))
            self.assertEqual(len(tensor.shape), len(schema_field.shape))

        # Read a bunch of entries from the dataset; only successful evaluation is checked here
        with tf.Session() as sess:
            for _ in range(10):
                sess.run(reader_tensors)
    finally:
        # Release the reader even when an assertion or a read above fails
        self.reader.stop()
        self.reader.join()
def test_transform_function_new_field_batched(scalar_dataset):
    """Check that a batch transform replacing ``float64`` with ``new_float64`` is visible through TF."""

    def double_float64(batch):
        # Add the doubled field and drop the field it was derived from
        batch['new_float64'] = batch['float64'] * 2
        del batch['float64']
        return batch

    # Declares the new field and lists 'float64' as removed
    spec = TransformSpec(double_float64, [('new_float64', np.float64, (), False)], ['float64'])

    with make_batch_reader(scalar_dataset.url, reader_pool_type='dummy', transform_spec=spec) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

    # Each returned value must be twice the reference value of the row with the same id
    for row_id, doubled_value in zip(actual.id, actual.new_float64):
        reference_row = next(row for row in scalar_dataset.data if row['id'] == row_id)
        np.testing.assert_equal(reference_row['float64'] * 2, doubled_value)
def test_shuffling_queue_with_make_batch_reader(scalar_dataset):
    """Check that ``tf_tensors`` raises ``ValueError`` when given queue arguments (100, 90) for a batch reader."""
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader, pytest.raises(ValueError):
        tf_tensors(reader, 100, 90)
def train_and_test(dataset_url, training_iterations, batch_size, evaluation_interval):
    """
    Train a linear softmax classifier for ``training_iterations`` steps, printing its accuracy on a
    test batch every ``evaluation_interval`` steps and after the last step.

    :param dataset_url: The MNIST dataset url; the 'train' and 'test' datasets are read from beneath it.
    :param training_iterations: The training iterations to train for.
    :param batch_size: The batch size for training and for evaluation.
    :param evaluation_interval: The interval used to print the accuracy.
    :return: None
    """
    train_url = os.path.join(dataset_url, 'train')
    test_url = os.path.join(dataset_url, 'test')

    with Reader(train_url, num_epochs=None) as train_reader, Reader(test_url, num_epochs=None) as test_reader:
        # Training input pipeline: flatten each image to 784 floats and batch it with its label
        train_sample = tf_tensors(train_reader)
        flat_train_image = tf.cast(tf.reshape(train_sample.image, [784]), tf.float32)
        batch_image, batch_label = tf.train.batch([flat_train_image, train_sample.digit], batch_size=batch_size)

        # Model: a single affine layer producing 10 logits
        weights = tf.Variable(tf.zeros([784, 10]))
        bias = tf.Variable(tf.zeros([10]))
        logits = tf.matmul(batch_image, weights) + bias

        # The raw formulation of cross-entropy,
        #
        #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
        #                                 reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
        # outputs of the model, and then average across the batch.
        loss = tf.losses.sparse_softmax_cross_entropy(labels=batch_label, logits=logits)
        train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

        is_correct = tf.equal(tf.argmax(logits, 1), batch_label)
        accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

        # Evaluation input pipeline, built the same way as the training one
        test_sample = tf_tensors(test_reader)
        flat_test_image = tf.cast(tf.reshape(test_sample.image, [784]), tf.float32)
        test_batch_image, test_batch_label = tf.train.batch([flat_test_image, test_sample.digit],
                                                            batch_size=batch_size)

        # Train
        print('Training model for {0} training iterations with batch size {1} and evaluation interval {2}'
              .format(training_iterations, batch_size, evaluation_interval))

        with tf.Session() as sess:
            sess.run([
                tf.local_variables_initializer(),
                tf.global_variables_initializer(),
            ])
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                for step in range(training_iterations):
                    if coord.should_stop():
                        break

                    sess.run(train_step)

                    is_last_step = step == (training_iterations - 1)
                    if (step % evaluation_interval) != 0 and not is_last_step:
                        continue

                    # Evaluate by feeding a test batch in place of the training batch tensors
                    feed_batch_image, feed_batch_label = sess.run([test_batch_image, test_batch_label])
                    accuracy_value = sess.run(accuracy, feed_dict={
                        batch_image: feed_batch_image,
                        batch_label: feed_batch_label,
                    })
                    print('After {0} training iterations, the accuracy of the model is: {1:.2f}'
                          .format(step, accuracy_value))
            finally:
                coord.request_stop()
                coord.join(threads)