def test_rowgroup_selector_partition_key(synthetic_dataset, reader_factory):
    """Select row groups to read via a single-index selector on the partition-key field."""
    selector = SingleIndexSelector(TestSchema.partition_key.name, ['p_1'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        total_rows = len(list(reader))
    # Only the 'p_1' partition should be selected; the assert pins its size at 10 rows.
    assert 10 == total_rows
def test_rowgroup_selector_wrong_index_name(synthetic_dataset):
    """Selecting row groups by an index name the dataset does not have must raise ValueError."""
    bad_selector = SingleIndexSelector('WrongIndexName', ['some_value'])
    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url,
               rowgroup_selector=bad_selector,
               reader_pool=DummyPool())
def test_rowgroup_selector_string_field(synthetic_dataset, reader_factory):
    """Select row groups to read based on a dataset index over a string field."""
    selector = SingleIndexSelector(TestSchema.sensor_name.name, ['test_sensor'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        total_rows = len(list(reader))
    # Every row in the synthetic dataset shares the same sensor name, so all row
    # groups are selected and the whole dataset comes back (the assert implies the
    # dataset holds 100 rows; the original comment's "1000" appears to be stale).
    assert 100 == total_rows
def test_rowgroup_selector_multiple_fields_union(synthetic_dataset, reader_factory):
    """A union of selectors over two indexed fields returns rows matching either one."""
    union_index_selector = UnionIndexSelector(
        [SingleIndexSelector(TestSchema.sensor_name.name, ['test_sensor']),
         SingleIndexSelector(TestSchema.id.name, [2, 18])]
    )
    with reader_factory(synthetic_dataset.url, rowgroup_selector=union_index_selector) as reader:
        rows = list(reader)

    # Both requested ids and the requested sensor name must appear in the result.
    seen_ids = {row.id for row in rows}
    assert 2 in seen_ids
    assert 18 in seen_ids
    assert any(row.sensor_name == 'test_sensor' for row in rows)
    assert 100 == len(rows)
def test_rowgroup_selector_nullable_array_field(synthetic_dataset, reader_factory):
    """Select row groups to read based on a dataset index over a nullable array field."""
    selector = SingleIndexSelector(TestSchema.string_array_nullable.name, ['100'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        total_rows = len(list(reader))
    # The field is generated from the row id roughly as:
    #   None if id % 5 == 0 else np.asarray([], dtype=np.string_) if id % 4 == 0 else
    #   np.asarray([str(i+id) for i in xrange(2)], dtype=np.string_)
    # so '100' can occur in row id 99 (as 99+1) and row id 100 (as 100+0), but row 100
    # is nulled out by the 'id % 5 == 0' branch — hence only one row group (10 rows)
    # is expected to be selected.
    assert 10 == total_rows
def test_rowgroup_selector_integer_field(synthetic_dataset, reader_factory):
    """Select row groups to read based on a dataset index over an integer field."""
    selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        rows = list(reader)

    # Both requested id values must be present in the reader output.
    seen_ids = {row.id for row in rows}
    assert 2 in seen_ids
    assert 18 in seen_ids
    # Two row groups are read; the assert implies 10 rows per row group
    # (the original comment said "100 rows per row group", which is inconsistent
    # with the expected total of 20).
    assert 20 == len(rows)
def test_regenerate_using_row_group_summary_metadata(synthetic_dataset, tmpdir):
    """Regenerating metadata with --use-summary-metadata keeps the dataset readable with selectors."""
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    # Sanity check: the copied dataset is readable before regeneration.
    _check_reader(a_moved_path)

    # Regenerate metadata, taking the schema from the existing _common_metadata,
    # and requesting summary (row-group) metadata.
    dataset_url = 'file://{}'.format(a_moved_path)
    petastorm_generate_metadata._main(['--dataset_url', dataset_url, '--use-summary-metadata'])

    # After regeneration the parquet dataset must expose row-group summary metadata.
    regenerated = pq.ParquetDataset(a_moved_path)
    assert regenerated.metadata

    # The reader must still work with a rowgroup selector, since the index
    # information was present in the original metadata.
    _check_reader(a_moved_path, SingleIndexSelector(TestSchema.id.name, [2, 18]))
def test_regenerate_row_group_metadata(synthetic_dataset, tmpdir):
    """Deleting the _metadata file breaks reading; regenerating it restores full functionality."""
    a_moved_path = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, a_moved_path)

    # Sanity check: the copied dataset is readable before any tampering.
    _check_reader(a_moved_path)

    # Remove only the _metadata file (the _common_metadata stays in place).
    os.remove(pq.ParquetDataset(a_moved_path).metadata_path)

    # With the metadata file gone, reading must fail with ValueError.
    with pytest.raises(ValueError):
        _check_reader(a_moved_path)

    # Regenerate metadata, taking the schema from the surviving _common_metadata.
    dataset_url = 'file://{}'.format(a_moved_path)
    petastorm_generate_metadata._main(['--dataset_url', dataset_url])

    # The reader must work again, including rowgroup selection, since the index
    # information was present in the original metadata.
    _check_reader(a_moved_path, SingleIndexSelector(TestSchema.id.name, [2, 18]))