Example #1
def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)
    row_drop_partitions = Reader._normalize_shuffle_options(2, dataset)
    assert row_drop_partitions == 2

    row_drop_partitions = Reader._normalize_shuffle_options(1000, dataset)
    assert row_drop_partitions == 10
Example #2
def test_full_pytorch_example(large_mock_mnist_data, tmpdir):
    # First, generate mock dataset
    dataset_url = 'file://{}'.format(tmpdir)
    mnist_data_to_petastorm_dataset(tmpdir,
                                    dataset_url,
                                    mnist_data=large_mock_mnist_data,
                                    spark_master='local[1]',
                                    parquet_files_count=1)

    # Next, run a round of training using the PyTorch-adapting data loader
    from petastorm.pytorch import DataLoader

    torch.manual_seed(1)
    device = torch.device('cpu')
    model = pytorch_example.Net().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    with DataLoader(Reader('{}/train'.format(dataset_url),
                           reader_pool=DummyPool(),
                           num_epochs=1),
                    batch_size=32,
                    transform=pytorch_example._transform_row) as train_loader:
        pytorch_example.train(model, device, train_loader, 10, optimizer, 1)
    with DataLoader(Reader('{}/test'.format(dataset_url),
                           reader_pool=DummyPool(),
                           num_epochs=1),
                    batch_size=100,
                    transform=pytorch_example._transform_row) as test_loader:
        pytorch_example.test(model, device, test_loader)
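The _transform_row hook passed to DataLoader above comes from petastorm's pytorch_example module and is not shown here. As a rough, purely hypothetical sketch (the field names 'image' and 'digit' are assumptions, not taken from that module), such a transform usually turns a row dictionary into a (tensor, label) pair:

import torch

def _transform_row_sketch(mnist_row):
    # Hypothetical row layout: an 'image' ndarray and an integer 'digit' label.
    image = torch.as_tensor(mnist_row['image'], dtype=torch.float32) / 255.0
    label = torch.as_tensor(mnist_row['digit'], dtype=torch.long)
    return image, label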
Example #3
def compute_correlation_distribution(dataset_url,
                                     id_column,
                                     shuffle_options,
                                     num_corr_samples=100):
    """
    Compute the correlation distribution of a given shuffle_options value on an existing dataset.
    Use this to compare two different shuffle options.
    It is encouraged to use a dataset generated by generate_shuffle_analysis_dataset for this analysis.

    :param dataset_url: Dataset url to compute correlation distribution of
    :param id_column: Column where an integer or string id can be found
    :param shuffle_options: shuffle options to test correlation against
    :param num_corr_samples: How many samples of the correlation to take to compute distribution
    :return: (mean, standard deviation) of computed distribution
    """

    # Read the dataset in order, without any shuffling (a dummy pool is needed for this).
    with Reader(dataset_url,
                shuffle_options=ShuffleOptions(False),
                reader_pool=DummyPool()) as reader:
        unshuffled = [row[id_column] for row in reader]

    correlations = []
    for _ in range(num_corr_samples):
        with Reader(dataset_url, shuffle_options=shuffle_options) as reader:
            shuffled = [row[id_column] for row in reader]
            correlations.append(abs(np.corrcoef(unshuffled, shuffled)[0, 1]))

    mean = np.mean(correlations)
    std_dev = np.std(correlations)

    return mean, std_dev
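A hedged usage sketch of the function above; the dataset URL and the two ShuffleOptions configurations below are assumptions chosen for illustration, not values from the snippet:

# Hypothetical comparison of two shuffle configurations on the same analysis dataset.
analysis_url = 'file:///tmp/shuffle_analysis_dataset'
mild = ShuffleOptions(True, 1)
aggressive = ShuffleOptions(True, 10)
mild_mean, mild_std = compute_correlation_distribution(analysis_url, 'id', mild)
aggressive_mean, aggressive_std = compute_correlation_distribution(analysis_url, 'id', aggressive)
# A lower mean absolute correlation with the unshuffled order indicates a more thorough shuffle.
print(mild_mean, mild_std)
print(aggressive_mean, aggressive_std)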
Example #4
def test_dataset_url_must_be_string():
    with pytest.raises(ValueError):
        Reader(dataset_url=None)

    with pytest.raises(ValueError):
        Reader(dataset_url=123)

    with pytest.raises(ValueError):
        Reader(dataset_url=[])
Example #5
def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)
    shuffle_options = ShuffleOptions(True, 2)
    Reader._normalize_shuffle_options(shuffle_options, dataset)
    assert shuffle_options.shuffle_row_drop_partitions == 2

    shuffle_options = ShuffleOptions(True, 1000)
    Reader._normalize_shuffle_options(shuffle_options, dataset)
    assert shuffle_options.shuffle_row_drop_partitions == 10
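Both versions of this test suggest that _normalize_shuffle_options caps the row-drop partition count at the number of row groups in the dataset (10 in the synthetic dataset). A minimal standalone sketch of that clamping rule, not the private Reader method itself:

def clamp_row_drop_partitions(requested, num_row_groups):
    # Never split rows across more drop partitions than there are row groups.
    return min(requested, num_row_groups)

assert clamp_row_drop_partitions(2, 10) == 2
assert clamp_row_drop_partitions(1000, 10) == 10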
Example #6
def test_num_epochs_value_error(synthetic_dataset):
    """Tests that the reader raises value errors when appropriate"""

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), num_epochs=0)

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), num_epochs=-10)

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), num_epochs='abc')
Example #7
    def readout_all_ids(shuffle, drop_ratio):
        with Reader(dataset_url=synthetic_dataset.url,
                    reader_pool=DummyPool(),
                    shuffle_options=ShuffleOptions(shuffle, drop_ratio)) as reader:
            ids = [row.id for row in reader]
        return ids
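A plausible way the enclosing test uses this helper (an assumption, since the surrounding test body is not shown): verify that shuffling changes the order of rows but not the set of rows.

    unshuffled_ids = readout_all_ids(False, 1)
    shuffled_ids = readout_all_ids(True, 1)
    # Shuffling should return the same rows, just in a different order
    # (a shuffled read is extremely unlikely to reproduce the original order exactly).
    assert sorted(unshuffled_ids) == sorted(shuffled_ids)
    assert unshuffled_ids != shuffled_ids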
Example #8
def test_rowgroup_selector_wrong_index_name(synthetic_dataset):
    """ Attempt to select row groups to based on wrong dataset index,
        Reader should raise exception
    """
    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url,
               rowgroup_selector=SingleIndexSelector('WrongIndexName', ['some_value']), reader_pool=DummyPool())
Example #9
def test_pytorch_dataloader_batched(synthetic_dataset):
    batch_size = 10
    loader = DataLoader(Reader(synthetic_dataset.url, reader_pool=DummyPool()),
                        batch_size=batch_size,
                        collate_fn=_noop_collate)
    for item in loader:
        assert len(item) == batch_size
Example #10
def tensorflow_hello_world(dataset_url='file:///tmp/hello_world_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with Reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    # Example: use tf.data.Dataset API
    with Reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)
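Since make_petastorm_dataset returns an ordinary tf.data.Dataset, standard dataset transformations can be chained onto it. A small sketch along the lines of the example above (the batch size is chosen arbitrarily):

    # Example: batch the tf.data.Dataset before reading
    with Reader(dataset_url) as reader:
        dataset = make_petastorm_dataset(reader).batch(4)
        iterator = dataset.make_one_shot_iterator()
        batch = iterator.get_next()
        with tf.Session() as sess:
            print(sess.run(batch).id)  # ids of the first batch of samples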
Example #11
def test_partition_value_error(synthetic_dataset):
    """Tests that the reader raises value errors when appropriate"""

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), training_partition=0)

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), num_training_partitions=5)

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), training_partition='0',
               num_training_partitions=5)

    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, reader_pool=DummyPool(), training_partition=0,
               num_training_partitions='5')
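For contrast, the combination these errors guard against presumably looks like this when used correctly, with both arguments as plain integers (inferred from the test, not shown in it):

with Reader(synthetic_dataset.url, reader_pool=DummyPool(),
            training_partition=0, num_training_partitions=5) as reader:
    partition_ids = [row.id for row in reader]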
Example #12
def test_ngram_basic_longer_no_overlap(synthetic_dataset):
    """Tests basic ngram with no delta threshold with no overlaps of timestamps."""
    fields = {
        -5: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -4: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        -3: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        -2: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        -1: [TestSchema.id, TestSchema.id2]
    }

    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id,
                  timestamp_overlap=False)
    with Reader(schema_fields=ngram,
                dataset_url=synthetic_dataset.url,
                reader_pool=DummyPool(),
                shuffle_options=ShuffleOptions(False)) as reader:

        timestamps_seen = set()
        for actual in reader:
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            np.testing.assert_equal(actual, expected_ngram)
            for step in actual.values():
                timestamp = step.id
                assert timestamp not in timestamps_seen
                timestamps_seen.add(timestamp)
Example #13
    def test_no_metadata(self):
        self.vanish_metadata()
        with self.assertRaises(PetastormMetadataError) as e:
            Reader(self._dataset_url, reader_pool=DummyPool())
        self.assertTrue('Could not find _common_metadata file' in str(e.exception))
        self.restore_metadata()
Example #14
def _test_continuous_ngram_tf(ngram_fields, dataset_num_files_1):
    """Tests continuous ngram in tf of a certain length. Continuous here refers to
    that this reader will always return consecutive ngrams due to shuffle being false
    and partition being 1.
    """

    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_num_files_1.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:

        readout_examples = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for timestep in readout_examples:
            for field in readout_examples[timestep]:
                assert field.get_shape().dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        expected_id = 0
        with tf.Session() as sess:
            for _ in range(5):
                actual = sess.run(readout_examples)
                expected_ngram = _get_named_tuple_from_ngram(
                    ngram, dataset_num_files_1.data, expected_id)
                _assert_equal_ngram(actual, expected_ngram)
                expected_id = expected_id + 1
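A plausible call site for this helper; the concrete fields dictionary below is an assumption chosen for illustration, and dataset_num_files_1 is the pytest fixture the helper expects:

def test_continuous_ngram_tf_sketch(dataset_num_files_1):
    # Two consecutive timesteps, each carrying a couple of scalar fields.
    fields = {
        0: [TestSchema.id, TestSchema.id2],
        1: [TestSchema.id, TestSchema.id2],
    }
    _test_continuous_ngram_tf(fields, dataset_num_files_1)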
Example #15
def test_simple_read_moved_dataset(synthetic_dataset, tmpdir):
    """Tests that a dataset may be opened after being moved to a new location"""
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    with Reader('file://{}'.format(a_moved_path), reader_pool=DummyPool()) as reader:
        _check_simple_reader(reader, synthetic_dataset.data)
Example #16
def test_unlimited_epochs(synthetic_dataset):
    """Tests that unlimited epochs works as expected"""
    with Reader(synthetic_dataset.url, reader_pool=DummyPool(), num_epochs=None) as reader:
        # Read many expected entries from the dataset and compare the data to reference
        for _ in range(len(synthetic_dataset.data) * random.randint(10, 30) + random.randint(25, 50)):
            actual = dict(next(reader)._asdict())
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected, actual)
Example #17
def test_single_column_predicate(synthetic_dataset):
    """Test quering a single column with a predicate on the same column """
    with Reader(synthetic_dataset.url, schema_fields=[TestSchema.id], predicate=EqualPredicate({'id': 1}),
                reader_pool=ThreadPool(1)) as reader:
        # Read a bunch of entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id'], actual['id'])
Example #18
def test_read_with_pyarrow_serialization(synthetic_dataset):
    with Reader(synthetic_dataset.url,
                reader_pool=ProcessPool(1, pyarrow_serialize=True)) as reader:
        for actual in reader:
            expected = next(d for d in synthetic_dataset.data
                            if d['id'] == actual.id)
            assert actual.id == expected['id']
            assert Decimal(actual.decimal) == expected['decimal']
            np.testing.assert_equal(actual.matrix, expected['matrix'])
Example #19
def test_reading_subset_of_columns(synthetic_dataset):
    """Just a bunch of read and compares of all values to the expected values"""
    with Reader(synthetic_dataset.url, schema_fields=[TestSchema.id2, TestSchema.id],
                reader_pool=DummyPool()) as reader:
        # Read a bunch of entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id2'], actual['id2'])
Example #20
def test_rowgroup_selector_string_field(synthetic_dataset):
    """ Select row groups to read based on dataset index for string field"""
    with Reader(synthetic_dataset.url,
                rowgroup_selector=SingleIndexSelector(TestSchema.sensor_name.name, ['test_sensor']),
                reader_pool=DummyPool()) as reader:
        count = 0
        for _ in reader:
            count += 1
        # Since we use an artificial dataset, all sensors have the same name,
        # so all row groups should be selected and all 100 generated rows should be returned.
        assert 100 == count
Example #21
def test_predicate_on_single_column(synthetic_dataset):
    reader = Reader(synthetic_dataset.url,
                    schema_fields=[TestSchema.id2],
                    predicate=in_lambda(['id2'], lambda id2: True),
                    reader_pool=DummyPool())
    counter = 0
    for row in reader:
        counter += 1
        actual = dict(row._asdict())
        assert actual['id2'] < 2
    assert counter == len(synthetic_dataset.data)
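The lambda above accepts every row; a more selective predicate on the same column might look like this (a sketch with an arbitrary filter value):

with Reader(synthetic_dataset.url,
            schema_fields=[TestSchema.id2],
            predicate=in_lambda(['id2'], lambda id2: id2 == 1),
            reader_pool=DummyPool()) as selective_reader:
    assert all(row.id2 == 1 for row in selective_reader)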
Example #22
def test_generate(tmpdir):
    temp_url = 'file://' + tmpdir.strpath

    # Generate a dataset
    generate_hello_world_dataset(temp_url)
    assert '_SUCCESS' in os.listdir(tmpdir.strpath)

    # Read from it
    with Reader(temp_url) as reader:
        all_samples = list(reader)
    assert all_samples
Example #23
    def test_metadata_missing_unischema(self):
        """ Produce a BAD _metadata that is missing the unischema pickling first, then load dataset. """

        # Remove the common metadata file with unischema information
        self.vanish_metadata('_common_metadata')

        # Reader will now just get the metadata file which will not have the unischema information
        with self.assertRaises(ValueError) as e:
            Reader(self._dataset_url, reader_pool=DummyPool())
        self.assertTrue('Could not find the unischema' in str(e.exception))
        self.restore_metadata('_common_metadata')
Example #24
def test_rowgroup_selector_nullable_array_field(synthetic_dataset):
    """ Select row groups to read based on dataset index for array field"""
    with Reader(synthetic_dataset.url,
                rowgroup_selector=SingleIndexSelector(TestSchema.string_array_nullable.name, ['100']),
                reader_pool=DummyPool()) as reader:
        count = sum(1 for _ in reader)
        # This field contains id strings, generated like this:
        #   None if id % 5 == 0 else np.asarray([], dtype=np.string_) if id % 4 == 0 else
        #   np.asarray([str(i+id) for i in xrange(2)], dtype=np.string_)
        # Hence '100' can appear in row id 99 (as 99+1) and in row id 100 (as 100+0),
        # but row 100 is skipped by the 'None if id % 5 == 0' condition, so only one row group should be selected.
        assert 10 == count
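The generation rule quoted in the comment is easier to follow as runnable code. A minimal sketch of it (the function name is assumed; np.bytes_ is the type that np.string_ aliased in older NumPy):

import numpy as np

def string_array_nullable_for(id):
    # Mirrors the rule quoted in the comment above (Python 3 spelling of xrange).
    if id % 5 == 0:
        return None
    if id % 4 == 0:
        return np.asarray([], dtype=np.bytes_)
    return np.asarray([str(i + id) for i in range(2)], dtype=np.bytes_)

# '100' appears only for id == 99 (as 99 + 1); id == 100 itself yields None.
assert b'100' in string_array_nullable_for(99)
assert string_array_nullable_for(100) is None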
Example #25
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset):
    """Test non continuous ngram in tf of a certain length. Non continuous here refers
    to that the reader will not necessarily return consecutive ngrams because partition is more
    than one and false is true."""

    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields,
                  delta_threshold=10,
                  timestamp_field=TestSchema.id)
    reader = Reader(
        schema_fields=ngram,
        dataset_url=synthetic_dataset.url,
        reader_pool=ThreadPool(1),
    )

    readout_examples = tf_tensors(reader)

    # Make sure we have static shape info for all fields
    for timestep in readout_examples:
        for field in readout_examples[timestep]:
            assert field.get_shape().dims is not None

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        for _ in range(5):
            actual = sess.run(readout_examples)
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
Example #26
def test_ngram_delta_small_threshold_tf():
    """Test to verify that a small threshold work in ngrams."""

    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [
                TestSchema.id, TestSchema.id2, TestSchema.image_png,
                TestSchema.matrix
            ],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields,
                      delta_threshold=1,
                      timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
Example #27
def test_ngram_delta_threshold_tf(dataset_0_3_8_10_11_20_23):
    """Test to verify that delta threshold work as expected in one partition in the same ngram
    and between consecutive ngrams. delta threshold here refers that each ngram must not be
    more than delta threshold apart for the field specified by timestamp_field."""

    fields = {
        0: [
            TestSchema.id, TestSchema.id2, TestSchema.image_png,
            TestSchema.matrix
        ],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields,
                  delta_threshold=4,
                  timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=DummyPool(),
                shuffle_options=ShuffleOptions(False)) as reader:

        # Ngrams expected: (0, 3), (8, 10), (10, 11)

        with tf.Session() as sess:
            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            first_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 0)
            _assert_equal_ngram(first_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            second_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 3)
            _assert_equal_ngram(second_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            third_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, 5)
            _assert_equal_ngram(third_item, expected_item)

            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))
Example #28
def test_stable_pieces_order(synthetic_dataset):
    """Tests that the reader raises value errors when appropriate"""

    RERUN_THE_TEST_COUNT = 20
    baseline_run = None
    for _ in range(RERUN_THE_TEST_COUNT):
        with Reader(synthetic_dataset.url, schema_fields=[TestSchema.id], shuffle_options=ShuffleOptions(False),
                    reader_pool=DummyPool()) as reader:
            this_run = [row.id for row in reader]
        if baseline_run:
            assert this_run == baseline_run

        baseline_run = this_run
Example #29
def test_predicate_with_invalid_fields(synthetic_dataset):
    """Try passing an invalid field name from a predicate to the reader. An error should be raised."""
    TEST_CASES = [
        {'invalid_field_name': 1},
        dict(),
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11}]

    for predicate_spec in TEST_CASES:
        with Reader(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                    predicate=EqualPredicate(predicate_spec),
                    reader_pool=ThreadPool(1)) as reader:
            with pytest.raises(ValueError):
                next(reader)
Example #30
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(), shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)