def test_no_nulls(schema, file):
    """InclusiveMetricsEvaluator: is_null matches only columns that may hold nulls."""
    def may_contain_null(column):
        # True when the file cannot be skipped for `is_null(column)`
        return InclusiveMetricsEvaluator(
            schema, Expressions.is_null(column)).eval(file)

    assert may_contain_null("all_nulls")     # every value is null
    assert may_contain_null("some_nulls")    # at least one null present
    assert not may_contain_null("no_nulls")  # no nulls: file can be skipped
def test_missing_reference():
    """Binding an expression that references an unknown column must fail.

    The original try/except silently passed when no exception was raised at
    all; ``pytest.raises`` makes the ValidationException mandatory.
    """
    expr = Expressions.and_(Expressions.equal("t", 5),
                            Expressions.equal("x", 7))
    with pytest.raises(ice_ex.ValidationException) as exc_info:
        Binder.bind(STRUCT, expr)
    assert "Cannot find field 't' in struct" in "{}".format(exc_info.value)
Beispiel #3
0
def test_case_sensitive_int_not_eq_rewritten(inc_man_spec, inc_man_file, val,
                                             expected):
    """Case-sensitive binding of "ID" must raise: the schema field is "id"."""
    predicate = Expressions.not_(Expressions.equal("ID", val))
    with pytest.raises(ValidationException):
        evaluator = InclusiveManifestEvaluator(inc_man_spec, predicate,
                                               case_sensitive=True)
        assert evaluator.eval(inc_man_file) == expected
def test_multiple_references(assert_all_bound):
    """All references across a nested and/or expression get bound."""
    conjunction = Expressions.and_(Expressions.equal("x", 7),
                                   Expressions.less_than("y", 100))
    expr = Expressions.or_(conjunction, Expressions.greater_than("z", -100))

    assert_all_bound("Multiple references", Binder.bind(STRUCT, expr))
def test_integer_gt_eq(schema, file):
    """>= may match only when the bound is at or below the column's upper bound (79)."""
    for bound, expected in ((85, False), (80, False), (79, True), (75, True)):
        result = InclusiveMetricsEvaluator(
            schema, Expressions.greater_than_or_equal("id", bound)).eval(file)
        if expected:
            assert result
        else:
            assert not result
Beispiel #6
0
 def iterator(self, part_filter=None, row_filter=None, columns=None):
     """Iterate filtered manifests for this manifest list.

     When no arguments are supplied, re-enters with match-all filters and
     all columns selected.
     """
     if part_filter is None and row_filter is None and columns is None:
         return self.iterator(Expressions.always_true(),
                              Expressions.always_true(),
                              Filterable.ALL_COLUMNS)
     manifests = [self.get_filtered_manifest(path, part_filter, row_filter,
                                             columns)
                  for path in self._manifest_files]
     return iter(manifests)
def test_not(assert_all_bound, assert_and_unwrap):
    """NOT binds its child predicate to the right field id."""
    bound_expr = Binder.bind(STRUCT,
                             Expressions.not_(Expressions.equal("x", 7)))
    assert_all_bound("Not", bound_expr)

    negation = assert_and_unwrap(bound_expr, Not)
    inner = assert_and_unwrap(negation.child, None)

    # "x" is the first field of STRUCT
    assert inner.ref.field_id == 0
Beispiel #8
0
def test_schema_evolution_filter(primitive_type_test_file):
    """Read an evolved schema (renamed/added/reordered columns) under null filters.

    The expected schema renames field 3 and adds new fields 15/16 that have
    no backing data in the file, so they read as all-null columns.  With a
    ``not_null("new_col")`` filter every row is excluded (empty table); with
    ``is_null("new_col")`` every row is kept.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    # not_null on an all-null new column: no row can satisfy the filter
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # empty columns for the filtered-out case
    # NOTE(review): the third array is typed int32 while the schema declares
    # other_new_col as int64 -- presumably pa.table reconciles the empty
    # array; confirm against the pyarrow version in use.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    # full data for the is_null case: new columns (15/16) are all null
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    # is_null on the all-null new column keeps every row
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
Beispiel #9
0
def test_not_null(assert_and_unwrap):
    """NOT_NULL binds normally on optional fields, folds to true on required ones."""
    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))
    bound = assert_and_unwrap(unbound.bind(optional))
    assert bound.op == Operation.NOT_NULL
    assert bound.ref.field.field_id == 21
    # unary predicates carry no literal
    assert bound.lit is None

    # a required column can never be null, so the predicate is always true
    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert unbound.bind(required) == Expressions.always_true()
def test_integer_lt(schema, file):
    """< may match only when the bound exceeds the column's lower bound (30)."""
    for bound, expected in ((5, False), (30, False), (31, True), (79, True)):
        result = InclusiveMetricsEvaluator(
            schema, Expressions.less_than("id", bound)).eval(file)
        if expected:
            assert result
        else:
            assert not result
 def all_entries(self):
     """Return manifest entries, filtered when any non-trivial filter is set."""
     always_true = Expressions.always_true()
     has_row_filter = (self.row_filter is not None
                       and self.row_filter != always_true)
     has_part_filter = (self.part_filter is not None
                        and self.part_filter != always_true)

     if not (has_row_filter or has_part_filter):
         # no filtering requested: hand back the raw entries
         return self.reader.entries(self.columns)

     evaluator = self.evaluator()
     metrics_evaluator = self.metrics_evaluator()
     return [entry
             for entry in self.reader.entries(self.columns)
             if entry is not None
             and evaluator.eval(entry.file.partition())
             and metrics_evaluator.eval(entry.file)]
Beispiel #12
0
    def truncate_array(name, pred, transform):
        """Project a comparison predicate onto the truncated value space.

        LT/LT_EQ and GT/GT_EQ widen to the inclusive form on the truncated
        boundary; EQ is preserved.  Any other operation falls through and
        returns None.
        """
        boundary = pred.lit.value

        if pred.op in (Operation.LT, Operation.LT_EQ):
            return Expressions.predicate(Operation.LT_EQ, name,
                                         transform.apply(boundary))
        if pred.op in (Operation.GT, Operation.GT_EQ):
            return Expressions.predicate(Operation.GT_EQ, name,
                                         transform.apply(boundary))
        if pred.op == Operation.EQ:
            return Expressions.predicate(pred.op, name,
                                         transform.apply(boundary))
    def iterator(self):
        """Iterate entry copies, applying partition/metrics filters when set."""
        always_true = Expressions.always_true()
        filtered = ((self.row_filter is not None
                     and self.row_filter != always_true)
                    or (self.part_filter is not None
                        and self.part_filter != always_true))

        if not filtered:
            return (entry.copy() for entry in self.reader.iterator(
                self.part_filter, self.columns))

        evaluator = self.evaluator()
        metrics_evaluator = self.metrics_evaluator()
        return (entry.copy()
                for entry in self.reader.iterator(self.part_filter,
                                                  self.columns)
                if entry is not None
                and evaluator.eval(entry.partition())
                and metrics_evaluator.eval(entry))
def test_or(strict_schema, strict_file):
    """StrictMetricsEvaluator: OR holds only if one side holds for every row."""
    def strictly_matches(ge_bound):
        expr = Expressions.or_(
            Expressions.less_than("id", 5),
            Expressions.greater_than_or_equal("id", ge_bound))
        return StrictMetricsEvaluator(strict_schema, expr).eval(strict_file)

    assert not strictly_matches(80)
    assert not strictly_matches(60)
    assert strictly_matches(30)
def test_or(assert_all_bound, assert_and_unwrap):
    """OR binds both children to their field ids."""
    bound_expr = Binder.bind(
        STRUCT,
        Expressions.or_(Expressions.greater_than("z", -100),
                        Expressions.less_than("y", 100)))
    assert_all_bound("Or", bound_expr)

    disjunction = assert_and_unwrap(bound_expr, Or)

    # left child references "z" (field 2), right child "y" (field 1)
    assert assert_and_unwrap(disjunction.left, None).ref.field_id == 2
    assert assert_and_unwrap(disjunction.right, None).ref.field_id == 1
def test_and(assert_all_bound, assert_and_unwrap):
    """AND binds both children to their field ids."""
    bound_expr = Binder.bind(
        STRUCT,
        Expressions.and_(Expressions.equal("x", 7),
                         Expressions.less_than("y", 100)))
    assert_all_bound("And", bound_expr)

    conjunction = assert_and_unwrap(bound_expr, And)

    # left child references "x" (field 0), right child "y" (field 1)
    assert assert_and_unwrap(conjunction.left, None).ref.field_id == 0
    assert assert_and_unwrap(conjunction.right, None).ref.field_id == 1
def test_complex_expr():
    """A nested and/or expression translates to the matching dataset filter."""
    expr = Expressions.or_(
        Expressions.and_(Expressions.greater_than('a', 1),
                         Expressions.equal("b", "US")),
        Expressions.equal("c", True))

    column_map = {'a': 'a', 'b': 'b', 'c': 'c'}
    translated_dataset_filter = get_dataset_filter(expr, column_map)

    expected = (((ds.field("a") > 1) & (ds.field("b") == "US")) |
                (ds.field("c") == True))  # noqa: E712
    assert expected.equals(translated_dataset_filter)
Beispiel #18
0
def test_column_rename(primitive_type_test_file):
    """Reading with renamed columns resolves data by field id, not by name."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # table the reader is expected to produce
    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True)
    ])
    columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    source_table = pa.table(columns, schema=schema)

    assert source_table == reader.read()
Beispiel #19
0
    def iterator(self, part_filter=None, columns=None):
        """Iterate the data files of live (non-deleted) entries.

        NOTE(review): part_filter/columns are normalized to match-all
        defaults but never consulted below -- the same files are produced
        regardless; confirm whether filtering was intended here.
        """
        if part_filter is None and columns is None:
            return self.iterator(Expressions.always_true(),
                                 Filterable.ALL_COLUMNS)

        return (entry.file
                for entry in self.entries
                if entry.status != Status.DELETED)
Beispiel #20
0
def test_invalid_conversions(op):
    """Binding a string literal to a float column must raise ValidationException.

    The original try/except let the test pass silently when bind() did not
    raise at all; ``pytest.raises`` makes the exception mandatory.
    """
    struct = StructType.of([NestedField.required(16, "f", FloatType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("f"), "12.40")

    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith(
        'Invalid Value for conversion to type float: "12.40" (StringLiteral)')
Beispiel #21
0
def test_literal_converison(op, assert_and_unwrap):
    """Binding a string literal to a decimal(9, 2) column converts the value."""
    struct = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    unbound = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    bound = assert_and_unwrap(unbound.bind(struct))

    expected = Decimal(12.40).quantize(Decimal(".01"))
    assert bound.lit.value.as_tuple() == expected.as_tuple()
    assert bound.ref.field.field_id == 15
    assert bound.op == op
Beispiel #22
0
def test_comparison_predicate_binding(op, assert_and_unwrap):
    """Comparison predicates bind both the literal and the referenced field."""
    struct = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    bound = assert_and_unwrap(
        UnboundPredicate(op, Expressions.ref("x"), 5).bind(struct))

    assert bound.lit.value == 5
    assert bound.ref.field.field_id == 14
    assert bound.op == op
Beispiel #23
0
def test_missing_field():
    """Binding a predicate on an unknown column must raise ValidationException.

    The original try/except let the test pass silently when bind() did not
    raise at all; ``pytest.raises`` makes the exception mandatory.
    """
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)

    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith(
        "Cannot find field 'missing' in struct")
Beispiel #24
0
def test_complex_expansion():
    """String parsing expands a chained AND right-associatively under the OR."""
    expected_expr = Expressions.or_(
        Expressions.and_(
            Expressions.equal("a", 1),
            Expressions.and_(Expressions.equal("b", 2),
                             Expressions.not_equal("c", 3))),
        Expressions.is_null("d"))

    parsed = Expressions.convert_string_to_expr(
        "(a=1 and b=2 and c<>3) or d is null")
    assert parsed == expected_expr
def test_integer_not_eq(strict_schema, strict_file):
    """!= holds for every row only when the value lies outside [30, 79]."""
    cases = ((5, True), (29, True), (30, False), (75, False),
             (79, False), (80, True), (85, True))
    for value, expected in cases:
        result = StrictMetricsEvaluator(
            strict_schema, Expressions.not_equal("id", value)).eval(strict_file)
        if expected:
            assert result
        else:
            assert not result
def test_integer_eq(schema, file):
    """== may match only when the value lies inside the stats range [30, 79]."""
    cases = ((5, False), (29, False), (80, False), (85, False),
             (30, True), (75, True), (79, True))
    for value, expected in cases:
        result = InclusiveMetricsEvaluator(
            schema, Expressions.equal("id", value)).eval(file)
        if expected:
            assert result
        else:
            assert not result
Beispiel #27
0
def test_compound_filter(primitive_type_test_file):
    """Push an AND of two predicates into the reader and keep only matching rows.

    The source file holds 5 rows; only the first row satisfies
    string_col == "us" AND int_col == 1, so the result table has one row.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    # compound row filter: both predicates must hold
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    # the single surviving row
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Beispiel #28
0
    def __init__(self, ops, table, snapshot_id=None, columns=None, row_filter=None):
        """Initialize a scan over ``table`` accessed through ``ops``.

        :param ops: table operations object used to access table data
        :param table: the table being scanned
        :param snapshot_id: optional snapshot id to scan (None for current)
        :param columns: columns to project (None selects all columns)
        :param row_filter: row filter expression (None means no filter)
        """
        self.ops = ops
        self.table = table
        self.snapshot_id = snapshot_id
        self.columns = columns
        self.row_filter = row_filter

        # NOTE(review): defaults apply only when BOTH columns and row_filter
        # are None; supplying just one leaves the other as None. Confirm
        # downstream code tolerates a None columns/row_filter in that case.
        if self.columns is None and self.row_filter is None:
            self.columns = Filterable.ALL_COLUMNS
            self.row_filter = Expressions.always_true()
def test_missing_stats(strict_schema, missing_stats):
    """Without column stats, the strict evaluator can never guarantee a match."""
    predicates = [Expressions.less_than("no_stats", 5),
                  Expressions.less_than_or_equal("no_stats", 30),
                  Expressions.equal("no_stats", 70),
                  Expressions.greater_than("no_stats", 78),
                  Expressions.greater_than_or_equal("no_stats", 90),
                  Expressions.not_equal("no_stats", 101),
                  Expressions.is_null("no_stats"),
                  Expressions.not_null("no_stats")]

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert not evaluator.eval(missing_stats)
def test_zero_record_file(strict_schema, empty):
    """Every predicate trivially holds for all rows of a zero-record file."""
    predicates = [Expressions.less_than("no_stats", 5),
                  Expressions.less_than_or_equal("no_stats", 30),
                  Expressions.equal("no_stats", 70),
                  Expressions.greater_than("no_stats", 78),
                  Expressions.greater_than_or_equal("no_stats", 90),
                  Expressions.not_equal("no_stats", 101),
                  Expressions.is_null("no_stats"),
                  Expressions.not_null("no_stats")]

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert evaluator.eval(empty)