def test_predicate_on_partition(synthetic_dataset, reader_factory):
    """Reading with a partition-key predicate yields rows from exactly the selected partitions."""
    for keys_subset in [{'p_0', 'p_2'}, {'p_0'}, {'p_1', 'p_2'}]:
        predicate = PartitionKeyInSetPredicate(keys_subset)
        with reader_factory(synthetic_dataset.url, predicate=predicate) as reader:
            observed_keys = {row.partition_key for row in reader}
            # Every selected partition must appear, and nothing else.
            assert observed_keys == keys_subset
def test_predicate_on_partition_filters_out_everything(synthetic_dataset, reader_factory):
    """A predicate that matches no partition should trigger the 'no matching data' warning."""
    # NOTE: `reader_factory` is unused here, but keeping it in the signature preserves
    # the fixture parametrization this test participates in.
    with pytest.warns(UserWarning, match='No matching data is available for loading'):
        # The predicate below filters out every rowgroup, so the reader emits a UserWarning.
        make_reader(synthetic_dataset.url, reader_pool_type='dummy',
                    predicate=PartitionKeyInSetPredicate({'non existing value'}))
def test_predicate_on_partition_batched(synthetic_dataset, reader_factory):
    """Batched reading with a partition predicate returns only the selected partitions."""
    for keys_subset in [{'p_0', 'p_2'}, {'p_0'}, {'p_1', 'p_2'}]:
        # TODO(yevgeni): scalar only reader takes 'vectorized' predicate that processes entire
        # columns. Not yet implemented for the case of a prediction on partition, hence we use
        # a non-vectorized PartitionKeyInSetPredicate here.
        predicate = PartitionKeyInSetPredicate(keys_subset)
        with reader_factory(synthetic_dataset.url, predicate=predicate) as reader:
            seen_keys = set()
            # Each batch carries a column of partition keys; accumulate them all.
            for batch in reader:
                seen_keys.update(batch.partition_key)
            assert seen_keys == keys_subset