import pytest

from petastorm import make_reader

# PartitionKeyInSetPredicate and the synthetic_dataset/reader_factory fixtures are
# test helpers defined alongside these tests in the petastorm test suite (an
# illustrative sketch of the predicate is included at the end of this listing).


def test_predicate_on_partition(synthetic_dataset, reader_factory):
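    # Request a different subset of partition keys on each iteration and verify that
    # the reader yields rows from exactly those partitions.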
    for expected_partition_keys in [{'p_0', 'p_2'}, {'p_0'}, {'p_1', 'p_2'}]:
        with reader_factory(synthetic_dataset.url,
                            predicate=PartitionKeyInSetPredicate(
                                expected_partition_keys)) as reader:
            partition_keys = set(row.partition_key for row in reader)
            assert partition_keys == expected_partition_keys


def test_predicate_on_partition_filters_out_everything(synthetic_dataset,
                                                       reader_factory):
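    # A predicate that matches no partitions should yield no data and trigger a warning.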
    with pytest.warns(UserWarning,
                      match='No matching data is available for loading'):
        # This predicate filters out all row groups, so the reader is expected to emit
        # a 'no matching data is available' warning rather than return any rows.
        make_reader(synthetic_dataset.url,
                    reader_pool_type='dummy',
                    predicate=PartitionKeyInSetPredicate(
                        {'non existing value'}))


def test_predicate_on_partition_batched(synthetic_dataset, reader_factory):
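    # Same scenario as test_predicate_on_partition, but exercised with the batched
    # reader, which yields columnar batches rather than individual rows.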
    for expected_partition_keys in [{'p_0', 'p_2'}, {'p_0'}, {'p_1', 'p_2'}]:
        # TODO(yevgeni): the scalar-only reader takes a 'vectorized' predicate that processes entire
        # columns. This is not yet implemented for a predicate on a partition, hence we use the
        # non-vectorized PartitionKeyInSetPredicate here.
        with reader_factory(synthetic_dataset.url,
                            predicate=PartitionKeyInSetPredicate(expected_partition_keys)) as reader:
            partition_keys = set()
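            # In batched mode each item yielded by the reader is a batch, so
            # row.partition_key holds a column of values; accumulate the unique keys.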
            for row in reader:
                partition_keys |= set(row.partition_key)
            assert partition_keys == expected_partition_keys
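

# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the tests above): PartitionKeyInSetPredicate is
# a test helper rather than a public petastorm API. Assuming petastorm's
# PredicateBase interface (get_fields / do_include) and a dataset that exposes a
# 'partition_key' field, a minimal implementation could look roughly like this:
from petastorm.predicates import PredicateBase


class PartitionKeyInSetPredicate(PredicateBase):
    """Keeps only rows whose partition_key belongs to a given inclusion set."""

    def __init__(self, inclusion_values):
        self._inclusion_values = inclusion_values

    def get_fields(self):
        # Fields the reader must load so the predicate can be evaluated.
        return {'partition_key'}

    def do_include(self, values):
        # Called with a dict mapping the requested field names to their values
        # for a single row; return True to keep the row.
        return values['partition_key'] in self._inclusion_values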