def test_no_nulls(schema, file):
    """Check inclusive evaluation of ``is_null`` for the three null-count columns."""

    def should_read(column):
        # Evaluate an ``is_null`` predicate on ``column`` against the file.
        predicate = Expressions.is_null(column)
        return InclusiveMetricsEvaluator(schema, predicate).eval(file)

    # Should read: at least one null value in all null column
    assert should_read("all_nulls")
    # Should read: column with some nulls contains a null value
    assert should_read("some_nulls")
    # Should skip: non-null column contains no null values
    assert not should_read("no_nulls")
Beispiel #2
0
def test_complex_expansion():
    """A mixed ``and``/``or`` filter string converts to the expected nested expression."""
    # Build the expected tree bottom-up: (a = 1 and (b = 2 and c <> 3)) or d is null
    a_is_one = Expressions.equal("a", 1)
    b_and_c = Expressions.and_(Expressions.equal("b", 2),
                               Expressions.not_equal("c", 3))
    conjunction = Expressions.and_(a_is_one, b_and_c)
    expected_expr = Expressions.or_(conjunction, Expressions.is_null("d"))

    filter_string = "(a=1 and b=2 and c<>3) or d is null"
    conv_expr = Expressions.convert_string_to_expr(filter_string)

    assert expected_expr == conv_expr
Beispiel #3
0
def test_schema_evolution_filter(primitive_type_test_file):
    """Read a file through an expected schema and filter on ``new_col``.

    The expected schema contains ``other_new_col`` and ``new_col``; the
    expected results below treat both as entirely null (presumably because
    they are absent from the fixture file -- confirm against the fixture).

    * ``not_null("new_col")`` must produce an empty table.
    * ``is_null("new_col")`` must produce all five rows, with the two new
      columns filled with nulls.

    :param primitive_type_test_file: path of the parquet fixture file to read
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    # Arrow schema both expected tables are built with; mirrors expected_schema.
    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # Expected result of the not_null filter: zero rows in every column.
    # Each array's type matches the schema field at the same position, so
    # building the table does not depend on an implicit cast.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)

    # Expected result of the is_null filter: all five rows, new columns null.
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    # Same file and schema, opposite predicate.
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
def test_zero_record_file(strict_schema, empty):
    """Every listed predicate on ``no_stats`` must evaluate truthy for ``empty``."""
    column = "no_stats"
    predicates = (
        Expressions.less_than(column, 5),
        Expressions.less_than_or_equal(column, 30),
        Expressions.equal(column, 70),
        Expressions.greater_than(column, 78),
        Expressions.greater_than_or_equal(column, 90),
        Expressions.not_equal(column, 101),
        Expressions.is_null(column),
        Expressions.not_null(column),
    )

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert evaluator.eval(empty)
def test_missing_stats(strict_schema, missing_stats):
    """No listed predicate on ``no_stats`` may evaluate truthy for ``missing_stats``."""
    column = "no_stats"
    predicates = (
        Expressions.less_than(column, 5),
        Expressions.less_than_or_equal(column, 30),
        Expressions.equal(column, 70),
        Expressions.greater_than(column, 78),
        Expressions.greater_than_or_equal(column, 90),
        Expressions.not_equal(column, 101),
        Expressions.is_null(column),
        Expressions.not_null(column),
    )

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert not evaluator.eval(missing_stats)
Beispiel #6
0
def test_is_null():
    """The string ``"col_a is null"`` converts to ``is_null("col_a")``."""
    expected = Expressions.is_null("col_a")
    assert expected == Expressions.convert_string_to_expr("col_a is null")
        'a': 'a'
    }),
     (Expressions.greater_than_or_equal('a', 1), ds.field('a') >= 1, {
         'a': 'a'
     }), (Expressions.less_than('a', 1), ds.field('a') < 1, {
         'a': 'a'
     }),
     (Expressions.less_than_or_equal('a', 1), ds.field('a') <= 1, {
         'a': 'a'
     }), (Expressions.equal('a', 1), ds.field('a') == 1, {
         'a': 'a'
     }), (Expressions.not_equal('a', 1), ds.field('a') != 1, {
         'a': 'a'
     }), (Expressions.not_null('a'), ds.field('a').is_valid(), {
         'a': 'a'
     }), (Expressions.is_null('a'), ~ds.field('a').is_valid(), {
         'a': 'a'
     })])
def test_simple(expr, dataset_filter, column_map):
    """The filter translated from ``expr`` must equal the expected ``dataset_filter``."""
    actual_filter = get_dataset_filter(expr, column_map)
    matches = dataset_filter.equals(actual_filter)
    assert matches


def test_not_conversion():
    """A negated ``greater_than`` translates to the inverted dataset comparison."""
    negated = Expressions.not_(Expressions.greater_than('a', 1))
    actual_filter = get_dataset_filter(negated, {'a': 'a'})

    expected_filter = ~(ds.field("a") > 1)
    assert expected_filter.equals(actual_filter)


def test_complex_expr():
    expr = Expressions.or_(
def test_required_column(schema, file):
    """For the ``required`` column, ``not_null`` evaluates truthy and ``is_null`` falsy."""

    def evaluate(predicate):
        # Run the inclusive evaluator for one predicate against the file.
        return InclusiveMetricsEvaluator(schema, predicate).eval(file)

    assert evaluate(Expressions.not_null("required"))
    assert not evaluate(Expressions.is_null("required"))
def op(request):
    """Yield the ``param`` attribute of ``request`` once."""
    current_param = request.param
    yield current_param


@pytest.fixture(
    scope="session",
    params=[
        # constant predicates
        Expressions.always_false(),
        Expressions.always_true(),
        # comparison predicates
        Expressions.less_than("x", 5),
        Expressions.less_than_or_equal("y", -3),
        Expressions.greater_than("z", 0),
        Expressions.greater_than_or_equal("t", 129),
        Expressions.equal("col", "data"),
        Expressions.not_equal("col", "abc"),
        # null checks
        Expressions.not_null("maybeNull"),
        Expressions.is_null("maybeNull2"),
        # compound expressions
        Expressions.not_(Expressions.greater_than("a", 10)),
        Expressions.and_(
            Expressions.greater_than_or_equal("a", 0),
            Expressions.less_than("a", 3),
        ),
        Expressions.or_(
            Expressions.less_than("a", 0),
            Expressions.greater_than("a", 10),
        ),
        # predicate bound against exp_schema's struct
        Expressions.equal("a", 5).bind(exp_schema.as_struct()),
    ],
)
def expression(request):
    """Session-scoped parametrized fixture yielding one sample expression per param."""
    current_param = request.param
    yield current_param


@pytest.fixture(scope="session",
                params=[
                    Expressions.less_than("no_stats", 5),
                    Expressions.less_than_or_equal("no_stats", 30),
from iceberg.api.expressions import Expressions, InclusiveManifestEvaluator
from iceberg.exceptions import ValidationException
import pytest


@pytest.mark.parametrize(
    "expression,expected",
    [
        (Expressions.not_null("all_nulls"), False),
        (Expressions.not_null("some_nulls"), True),
        (Expressions.not_null("no_nulls"), True),
    ],
)
def test_all_nulls(inc_man_spec, inc_man_file, expression, expected):
    """``not_null`` manifest evaluation must equal the expected flag for each column."""
    evaluator = InclusiveManifestEvaluator(inc_man_spec, expression)
    assert evaluator.eval(inc_man_file) == expected


@pytest.mark.parametrize(
    "expression,expected",
    [
        (Expressions.is_null("all_nulls"), True),
        (Expressions.is_null("some_nulls"), True),
        (Expressions.is_null("no_nulls"), False),
    ],
)
def test_no_nulls(inc_man_spec, inc_man_file, expression, expected):
    """``is_null`` manifest evaluation must equal the expected flag for each column."""
    outcome = InclusiveManifestEvaluator(inc_man_spec, expression).eval(inc_man_file)
    assert outcome == expected


def test_missing_column(inc_man_spec, inc_man_file):
    """Evaluating a predicate on the ``missing`` column raises ``ValidationException``."""
    with pytest.raises(ValidationException):
        # Everything stays inside the context so any step may raise.
        predicate = Expressions.less_than("missing", 5)
        evaluator = InclusiveManifestEvaluator(inc_man_spec, predicate)
        evaluator.eval(inc_man_file)


@pytest.mark.parametrize("expression", [
    Expressions.less_than("id", 5),
    Expressions.less_than_or_equal("id", 30),
    Expressions.equal("id", 70),
def test_required_columns(strict_schema, strict_file):
    """For the ``required`` column, strict ``not_null`` is truthy and ``is_null`` falsy."""
    not_null_eval = StrictMetricsEvaluator(strict_schema, Expressions.not_null("required"))
    assert not_null_eval.eval(strict_file)

    is_null_eval = StrictMetricsEvaluator(strict_schema, Expressions.is_null("required"))
    assert not is_null_eval.eval(strict_file)
def test_no_nulls(strict_schema, strict_file):
    """Strict ``is_null`` is truthy only for ``all_nulls``, falsy for the other columns."""
    # (column, whether the strict is_null evaluation is expected to be truthy)
    expectations = (
        ("all_nulls", True),
        ("some_nulls", False),
        ("no_nulls", False),
    )
    for column, expected_truthy in expectations:
        predicate = Expressions.is_null(column)
        verdict = StrictMetricsEvaluator(strict_schema, predicate).eval(strict_file)
        assert bool(verdict) == expected_truthy