Example #1
def test_ngram_delta_threshold_tf(dataset_0_3_8_10_11_20_23):
    """Test to verify that delta threshold work as expected in one partition in the same ngram
    and between consecutive ngrams. delta threshold here refers that each ngram must not be
    more than delta threshold apart for the field specified by timestamp_field."""

    fields = {
        0: [
            TestSchema.id, TestSchema.id2, TestSchema.image_png,
            TestSchema.matrix
        ],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields,
                  delta_threshold=4,
                  timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=DummyPool(),
                shuffle_options=ShuffleOptions(False)) as reader:

        # Ngrams expected: (0, 3), (8, 10), (10, 11)

        with tf.Session() as sess:
            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            first_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 0)
            _assert_equal_ngram(first_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            second_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 3)
            _assert_equal_ngram(second_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            third_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 5)
            _assert_equal_ngram(third_item, expected_item)

            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))
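As an aside, the delta_threshold rule itself can be illustrated without petastorm. The helper below is a hypothetical sketch (not part of the test suite) that lists the length-2 windows over a sorted timestamp sequence whose adjacent gaps stay within the threshold:

def windows_within_threshold(timestamps, length, delta_threshold):
    """List consecutive windows whose adjacent gaps are all within delta_threshold."""
    windows = []
    for start in range(len(timestamps) - length + 1):
        window = timestamps[start:start + length]
        if all(b - a <= delta_threshold for a, b in zip(window, window[1:])):
            windows.append(tuple(window))
    return windows

# Gaps between 0, 3, 8, 10, 11 are 3, 5, 2, 1; only the pairs with a gap of at most 4 survive.
assert windows_within_threshold([0, 3, 8, 10, 11], length=2, delta_threshold=4) == [(0, 3), (8, 10), (10, 11)]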
Example #2
def _test_continuous_ngram_tf(ngram_fields, dataset_num_files_1,
                              reader_factory):
    """Tests continuous ngram in tf of a certain length. Continuous here refers to
    that this reader will always return consecutive ngrams due to shuffle being false
    and partition being 1.
    """

    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    with reader_factory(dataset_num_files_1.url,
                        schema_fields=ngram,
                        shuffle_row_groups=False) as reader:

        readout_examples = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for timestep in readout_examples:
            for field in readout_examples[timestep]:
                assert field.get_shape().dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        expected_id = 0
        with tf.Session() as sess:
            for _ in range(5):
                actual = sess.run(readout_examples)
                expected_ngram = _get_named_tuple_from_ngram(
                    ngram, dataset_num_files_1.data, expected_id)
                _assert_equal_ngram(actual, expected_ngram)
                expected_id = expected_id + 1
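The fields dict is supplied by the parametrized callers and is not visible here; a plausible (purely hypothetical) value for a length-2 ngram would mirror the dicts used elsewhere on this page:

# Hypothetical parametrization: a 2-step ngram over the fields used in Example #1.
ngram_fields = {
    0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
    1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
}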
Example #3
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset,
                                 reader_factory):
    """Test non continuous ngram in tf of a certain length. Non continuous here refers
    to that the reader will not necessarily return consecutive ngrams because partition is more
    than one and false is true."""

    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    reader = reader_factory(synthetic_dataset.url, schema_fields=ngram)

    readout_examples = tf_tensors(reader)

    # Make sure we have static shape info for all fields
    for timestep in readout_examples:
        for field in readout_examples[timestep]:
            assert field.get_shape().dims is not None

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        for _ in range(5):
            actual = sess.run(readout_examples)
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
Example #4
def _read_from_tf_tensors(synthetic_dataset, count, shuffling_queue_capacity,
                          min_after_dequeue, ngram):
    """Used by several test cases. Reads a 'count' rows using reader.

    The reader is configured without row-group shuffling and guarantees deterministic order of rows up to the
    results queue TF shuffling which is controlled by 'shuffling_queue_capacity', 'min_after_dequeue' arguments.

    The function returns a tuple with: (actual data read from the dataset, a TF tensor returned by the reader)
    """

    schema_fields = (NON_NULLABLE_FIELDS if ngram is None else ngram)

    with make_reader(schema_fields=schema_fields,
                     dataset_url=synthetic_dataset.url,
                     reader_pool_type='dummy',
                     shuffle_row_groups=False) as reader:
        row_tensors = tf_tensors(
            reader,
            shuffling_queue_capacity=shuffling_queue_capacity,
            min_after_dequeue=min_after_dequeue)

        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(count)]

    return rows_data, row_tensors
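A typical call into this helper might look as follows; the count and queue sizes are illustrative values, not taken from any specific test:

# Read 30 rows with TF-side shuffling; capacity and min_after_dequeue values are illustrative.
rows_data, row_tensors = _read_from_tf_tensors(
    synthetic_dataset, count=30, shuffling_queue_capacity=200, min_after_dequeue=100, ngram=None)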
Example #5
def test_ngram_delta_small_threshold_tf():
    """Test to verify that a small threshold work in ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
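The very first sess.run raises OutOfRangeError for a simple arithmetic reason: the ids are spaced 5 apart, so no adjacent pair can satisfy delta_threshold=1. A quick standalone check:

ids = list(range(0, 99, 5))  # 0, 5, 10, ..., 95
# Every adjacent gap is 5, which exceeds the delta threshold of 1, so no ngram can be assembled.
assert all(b - a > 1 for a, b in zip(ids, ids[1:]))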
Example #6
def test_ngram_mix(synthetic_dataset):
    ngram1_fields = {
        -1: [
            'id',
        ],
        0: ['id', 'image_png'],
    }

    ts_field = '^id$'

    ngram1 = NGram(fields=ngram1_fields,
                   delta_threshold=10,
                   timestamp_field=ts_field)
    ngram2 = NGram(fields=ngram1_fields,
                   delta_threshold=10,
                   timestamp_field=ts_field)

    readers = [
        make_reader(synthetic_dataset.url,
                    schema_fields=ngram1,
                    workers_count=1),
        make_reader(synthetic_dataset.url,
                    schema_fields=ngram2,
                    workers_count=1)
    ]

    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        mixed_tensors = tf_tensors(mixer)

        with tf.Session() as sess:
            for _ in range(10):
                actual = sess.run(mixed_tensors)
                assert set(actual.keys()) == {-1, 0}
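Each value returned by the mixer is a dict keyed by the ngram offsets (-1 and 0 here), just like the single-reader ngram examples above, so individual fields can be pulled out per offset. A sketch of a variant of the inner loop (it assumes the mixed_tensors tensor built in the with-block above; field access follows ngram1_fields):

with tf.Session() as sess:
    for _ in range(10):
        actual = sess.run(mixed_tensors)
        assert set(actual.keys()) == {-1, 0}
        previous_id = actual[-1].id              # offset -1 carries only 'id'
        current_image_png = actual[0].image_png  # offset 0 carries 'id' and 'image_png'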
Example #7
def test_with_tf_tensors(synthetic_dataset):
    fields_to_read = ['id.*', 'image_png']
    readers = [make_reader(synthetic_dataset.url, schema_fields=fields_to_read, workers_count=1),
               make_reader(synthetic_dataset.url, schema_fields=fields_to_read, workers_count=1)]

    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        mixed_tensors = tf_tensors(mixer)

        with tf.Session() as sess:
            sess.run(mixed_tensors)
Example #8
def test_simple_read_tensorflow(synthetic_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
    data"""
    with make_reader(schema_fields=NON_NULLABLE_FIELDS, dataset_url=synthetic_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(30)]

    # Make sure we have static shape info for all fields
    _assert_all_tensors_have_shape(row_tensors)
    _assert_expected_rows_data(synthetic_dataset.data, rows_data)
Example #9
def test_ngram_delta_small_threshold_tf(reader_factory, dataset_range_0_99_5):
    """Test to verify that a small threshold work in ngrams."""

    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
    with reader_factory(dataset_range_0_99_5.url, schema_fields=ngram) as reader:
        with tf.Session() as sess:
            with pytest.raises(tf.errors.OutOfRangeError):
                sess.run(tf_tensors(reader))
Example #10
def test_simple_read_tensorflow_with_non_petastorm_many_columns_dataset(many_columns_non_petastorm_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
    data"""
    with make_batch_reader(dataset_url_or_urls=many_columns_non_petastorm_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()
            assert set(batch.keys()) == set(many_columns_non_petastorm_dataset.data[0].keys())
Example #11
def test_ngram_length_1_tf(synthetic_dataset, reader_factory):
    """Test to verify that ngram generalize to support length 1"""
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)
    reader = reader_factory(synthetic_dataset.url, schema_fields=ngram,
                            shuffle_row_groups=True, shuffle_row_drop_partitions=5)
    with tf.Session() as sess:
        for _ in range(10):
            actual = sess.run(tf_tensors(reader))
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
Example #12
def tensorflow_hello_world(dataset_url='file:///tmp/hello_world_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    # Example: use tf.data.Dataset API
    with make_reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)
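Both snippets above read a single sample. To read until the dataset is exhausted, the reader can be created with num_epochs=1 and the loop terminated on tf.errors.OutOfRangeError, which tf_tensors raises once the reader runs out of data (see the ngram threshold tests above). A minimal sketch:

with make_reader(dataset_url, num_epochs=1) as reader:
    tensor = tf_tensors(reader)
    with tf.Session() as sess:
        try:
            while True:
                sample = sess.run(tensor)
                print(sample.id)
        except tf.errors.OutOfRangeError:
            pass  # one full pass over the dataset is done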
Example #13
def test_simple_read_tensorflow_with_parquet_dataset(scalar_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
    data"""
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()
                for i, id_value in enumerate(batch['id']):
                    expected_row = next(d for d in scalar_dataset.data if d['id'] == id_value)
                    for field_name in expected_row.keys():
                        _assert_fields_eq(batch[field_name][i], expected_row[field_name])
Example #14
def _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count, shuffling_queue_size,
                             min_after_dequeue):
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

        readout_tensors = tf_tensors(reader, shuffling_queue_size, min_after_dequeue)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, start=True, sess=sess)

        result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count,
                                       lambda: sess.run(readout_tensors))

        coord.request_stop()
        coord.join(threads)

    return result
Example #15
    def _read_from_tf_tensors(self, count, shuffling_queue_capacity,
                              min_after_dequeue, ngram):
        """Used by several test cases. Reads a 'count' rows using reader.

        The reader is configured without row-group shuffling and guarantees deterministic order of rows up to the
        results queue TF shuffling which is controlled by 'shuffling_queue_capacity', 'min_after_dequeue' arguments.

        The function returns a tuple with: (actual data read from the dataset, a TF tensor returned by the reader)
        """

        # Nullable fields can not be read by tensorflow (what would be the dimension of a tensor for null data?)
        fields = set(TestSchema.fields.values()) - {
            TestSchema.matrix_nullable, TestSchema.string_array_nullable
        }
        schema_fields = (fields if ngram is None else ngram)

        reader = Reader(schema_fields=schema_fields,
                        dataset_url=self._dataset_url,
                        reader_pool=DummyPool(),
                        shuffle=False)

        row_tensors = tf_tensors(
            reader,
            shuffling_queue_capacity=shuffling_queue_capacity,
            min_after_dequeue=min_after_dequeue)

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, start=True)

            # Collect all the data we need from 'count' number of reads
            rows_data = [sess.run(row_tensors) for _ in range(count)]

            coord.request_stop()
            coord.join(threads)

        reader.stop()
        reader.join()

        return rows_data, row_tensors
Example #16
def tensorflow_hello_world(dataset_url='file:///tmp/external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_reader(), each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))
Example #17
def test_transform_function_new_field(synthetic_dataset):
    def double_matrix(sample):
        sample['double_matrix'] = sample['matrix'] * 2
        del sample['matrix']
        return sample

    with make_reader(synthetic_dataset.url, reader_pool_type='dummy', schema_fields=[TestSchema.id, TestSchema.matrix],
                     transform_spec=TransformSpec(double_matrix,
                                                  [('double_matrix', np.float32, (32, 16, 3), False)],
                                                  ['matrix'])) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

        original_sample = next(d for d in synthetic_dataset.data if d['id'] == actual.id)
        expected_matrix = original_sample['matrix'] * 2
        np.testing.assert_equal(expected_matrix, actual.double_matrix)
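When the transform only modifies values and leaves the schema untouched, a TransformSpec can be built from the function alone; the edit and removed field lists above are only needed because the schema changes. A minimal sketch under that assumption (normalize_matrix is a hypothetical helper):

def normalize_matrix(sample):
    # Rescale in place; dtype and shape stay the same, so no edit_fields/removed_fields are declared.
    sample['matrix'] = sample['matrix'] / 255.0
    return sample

with make_reader(synthetic_dataset.url, reader_pool_type='dummy',
                 schema_fields=[TestSchema.id, TestSchema.matrix],
                 transform_spec=TransformSpec(normalize_matrix)) as reader:
    row_tensors = tf_tensors(reader)
    with _tf_session() as sess:
        sample = sess.run(row_tensors)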
Example #18
    def test_simple_read_tf(self):
        """Just a bunch of read and compares of all values to the expected values for their types
        and shapes"""
        reader_tensors = tf_tensors(self.reader)._asdict()

        for schema_field in TestSchema.fields.values():
            self.assertEqual(reader_tensors[schema_field.name].dtype,
                             _numpy_to_tf_dtypes(schema_field.numpy_dtype))
            self.assertEqual(len(reader_tensors[schema_field.name].shape),
                             len(schema_field.shape))

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            for _ in range(10):
                sess.run(reader_tensors)

        self.reader.stop()
        self.reader.join()
Example #19
def test_transform_function_new_field_batched(scalar_dataset):
    def double_float64(sample):
        sample['new_float64'] = sample['float64'] * 2
        del sample['float64']
        return sample

    with make_batch_reader(scalar_dataset.url, reader_pool_type='dummy',
                           transform_spec=TransformSpec(double_float64,
                                                        [('new_float64', np.float64, (), False)],
                                                        ['float64'])) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            actual = sess.run(row_tensors)

        for actual_id, actual_float64 in zip(actual.id, actual.new_float64):
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected = original_sample['float64'] * 2
            np.testing.assert_equal(expected, actual_float64)
Example #20
def test_shuffling_queue_with_make_batch_reader(scalar_dataset):
    with make_batch_reader(dataset_url=scalar_dataset.url) as reader:
        with pytest.raises(ValueError):
            tf_tensors(reader, 100, 90)
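The positional arguments here are shuffling_queue_capacity and min_after_dequeue; the TF-side shuffling queue is not supported for batched readers, hence the ValueError asserted above. With a plain make_reader the same call shape is accepted, as the earlier shuffling examples show. A brief sketch of the accepted variant (using the synthetic_dataset fixture for illustration):

with make_reader(synthetic_dataset.url) as reader:
    # shuffling_queue_capacity=100, min_after_dequeue=90; the resulting shuffle queue
    # typically needs tf.train queue runners started (as in the examples above) before
    # sess.run can return rows.
    row_tensors = tf_tensors(reader, 100, 90)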
Example #21
def train_and_test(dataset_url, training_iterations, batch_size,
                   evaluation_interval):
    """
    Train a model for 'training_iterations' iterations with a batch size of 'batch_size', printing the accuracy every 'evaluation_interval' iterations.
    :param dataset_url: The MNIST dataset url.
    :param training_iterations: The training iterations to train for.
    :param batch_size: The batch size for training.
    :param evaluation_interval: The interval used to print the accuracy.
    :return:
    """
    with Reader(os.path.join(dataset_url, 'train'),
                num_epochs=None) as train_reader:
        with Reader(os.path.join(dataset_url, 'test'),
                    num_epochs=None) as test_reader:
            train_readout = tf_tensors(train_reader)
            train_image = tf.cast(tf.reshape(train_readout.image, [784]),
                                  tf.float32)
            train_label = train_readout.digit
            batch_image, batch_label = tf.train.batch(
                [train_image, train_label], batch_size=batch_size)

            W = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(batch_image, W) + b

            # The raw formulation of cross-entropy,
            #
            #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
            #                                 reduction_indices=[1]))
            #
            # can be numerically unstable.
            #
            # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
            # outputs of 'y', and then average across the batch.
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                labels=batch_label, logits=y)
            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(
                cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), batch_label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            test_readout = tf_tensors(test_reader)
            test_image = tf.cast(tf.reshape(test_readout.image, [784]),
                                 tf.float32)
            test_label = test_readout.digit
            test_batch_image, test_batch_label = tf.train.batch(
                [test_image, test_label], batch_size=batch_size)

            # Train
            print(
                'Training model for {0} training iterations with batch size {1} and evaluation interval {2}'
                .format(training_iterations, batch_size, evaluation_interval))
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    for i in range(training_iterations):
                        if coord.should_stop():
                            break

                        sess.run(train_step)

                        if (i % evaluation_interval) == 0 or i == (
                                training_iterations - 1):
                            feed_batch_image, feed_batch_label = sess.run(
                                [test_batch_image, test_batch_label])
                            print(
                                'After {0} training iterations, the accuracy of the model is: {1:.2f}'
                                .format(
                                    i,
                                    sess.run(accuracy,
                                             feed_dict={
                                                 batch_image: feed_batch_image,
                                                 batch_label: feed_batch_label
                                             })))
                finally:
                    coord.request_stop()
                    coord.join(threads)
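A call into this training helper might look like the following; the argument values are purely illustrative, and the url must point at a petastorm MNIST dataset containing the 'train' and 'test' subdirectories expected above:

train_and_test(dataset_url='file:///tmp/mnist',
               training_iterations=100,
               batch_size=100,
               evaluation_interval=10)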