def test_no_nulls(schema, file):
    """InclusiveMetricsEvaluator: is_null matches only columns that may hold nulls."""
    def may_contain_null(column):
        # True when the file cannot be skipped for `is_null(column)`
        return InclusiveMetricsEvaluator(
            schema, Expressions.is_null(column)).eval(file)

    assert may_contain_null("all_nulls")     # every value is null
    assert may_contain_null("some_nulls")    # at least one null present
    assert not may_contain_null("no_nulls")  # no nulls: file can be skipped
def test_missing_reference():
    """Binding an expression that references an unknown column must fail.

    The original try/except silently passed when no exception was raised at
    all; ``pytest.raises`` makes the ValidationException mandatory.
    """
    expr = Expressions.and_(Expressions.equal("t", 5),
                            Expressions.equal("x", 7))
    with pytest.raises(ice_ex.ValidationException) as exc_info:
        Binder.bind(STRUCT, expr)
    assert "Cannot find field 't' in struct" in "{}".format(exc_info.value)
Beispiel #3
0
def test_case_sensitive_int_not_eq_rewritten(inc_man_spec, inc_man_file, val,
                                             expected):
    """Case-sensitive binding of "ID" must raise: the schema field is "id"."""
    predicate = Expressions.not_(Expressions.equal("ID", val))
    with pytest.raises(ValidationException):
        evaluator = InclusiveManifestEvaluator(inc_man_spec, predicate,
                                               case_sensitive=True)
        assert evaluator.eval(inc_man_file) == expected
def test_multiple_references(assert_all_bound):
    """All references across a nested and/or expression get bound."""
    conjunction = Expressions.and_(Expressions.equal("x", 7),
                                   Expressions.less_than("y", 100))
    expr = Expressions.or_(conjunction, Expressions.greater_than("z", -100))

    assert_all_bound("Multiple references", Binder.bind(STRUCT, expr))
def test_integer_gt_eq(schema, file):
    """>= may match only when the bound is at or below the column's upper bound (79)."""
    for bound, expected in ((85, False), (80, False), (79, True), (75, True)):
        result = InclusiveMetricsEvaluator(
            schema, Expressions.greater_than_or_equal("id", bound)).eval(file)
        if expected:
            assert result
        else:
            assert not result
Beispiel #6
0
 def iterator(self, part_filter=None, row_filter=None, columns=None):
     """Iterate filtered manifests for this manifest list.

     When no arguments are supplied, re-enters with match-all filters and
     all columns selected.
     """
     if part_filter is None and row_filter is None and columns is None:
         return self.iterator(Expressions.always_true(),
                              Expressions.always_true(),
                              Filterable.ALL_COLUMNS)
     manifests = [self.get_filtered_manifest(path, part_filter, row_filter,
                                             columns)
                  for path in self._manifest_files]
     return iter(manifests)
def test_not(assert_all_bound, assert_and_unwrap):
    """NOT binds its child predicate to the right field id."""
    bound_expr = Binder.bind(STRUCT,
                             Expressions.not_(Expressions.equal("x", 7)))
    assert_all_bound("Not", bound_expr)

    negation = assert_and_unwrap(bound_expr, Not)
    inner = assert_and_unwrap(negation.child, None)

    # "x" is the first field of STRUCT
    assert inner.ref.field_id == 0
Beispiel #8
0
def test_schema_evolution_filter(primitive_type_test_file):
    """Read an evolved schema (renamed/added/reordered columns) under null filters.

    The expected schema renames field 3 and adds new fields 15/16 that have
    no backing data in the file, so they read as all-null columns.  With a
    ``not_null("new_col")`` filter every row is excluded (empty table); with
    ``is_null("new_col")`` every row is kept.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    # not_null on an all-null new column: no row can satisfy the filter
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # empty columns for the filtered-out case
    # NOTE(review): the third array is typed int32 while the schema declares
    # other_new_col as int64 -- presumably pa.table reconciles the empty
    # array; confirm against the pyarrow version in use.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    # full data for the is_null case: new columns (15/16) are all null
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    # is_null on the all-null new column keeps every row
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
Beispiel #9
0
def test_not_null(assert_and_unwrap):
    """NOT_NULL binds normally on optional fields, folds to true on required ones."""
    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))
    bound = assert_and_unwrap(unbound.bind(optional))
    assert bound.op == Operation.NOT_NULL
    assert bound.ref.field.field_id == 21
    # unary predicates carry no literal
    assert bound.lit is None

    # a required column can never be null, so the predicate is always true
    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert unbound.bind(required) == Expressions.always_true()
def test_integer_lt(schema, file):
    """< may match only when the bound exceeds the column's lower bound (30)."""
    for bound, expected in ((5, False), (30, False), (31, True), (79, True)):
        result = InclusiveMetricsEvaluator(
            schema, Expressions.less_than("id", bound)).eval(file)
        if expected:
            assert result
        else:
            assert not result
 def all_entries(self):
     """Return manifest entries, filtered when any non-trivial filter is set."""
     always_true = Expressions.always_true()
     has_row_filter = (self.row_filter is not None
                       and self.row_filter != always_true)
     has_part_filter = (self.part_filter is not None
                        and self.part_filter != always_true)

     if not (has_row_filter or has_part_filter):
         # no filtering requested: hand back the raw entries
         return self.reader.entries(self.columns)

     evaluator = self.evaluator()
     metrics_evaluator = self.metrics_evaluator()
     return [entry
             for entry in self.reader.entries(self.columns)
             if entry is not None
             and evaluator.eval(entry.file.partition())
             and metrics_evaluator.eval(entry.file)]
Beispiel #12
0
    def truncate_array(name, pred, transform):
        """Project a comparison predicate onto the truncated value space.

        LT/LT_EQ and GT/GT_EQ widen to the inclusive form on the truncated
        boundary; EQ is preserved.  Any other operation falls through and
        returns None.
        """
        boundary = pred.lit.value

        if pred.op in (Operation.LT, Operation.LT_EQ):
            return Expressions.predicate(Operation.LT_EQ, name,
                                         transform.apply(boundary))
        if pred.op in (Operation.GT, Operation.GT_EQ):
            return Expressions.predicate(Operation.GT_EQ, name,
                                         transform.apply(boundary))
        if pred.op == Operation.EQ:
            return Expressions.predicate(pred.op, name,
                                         transform.apply(boundary))
    def iterator(self):
        """Iterate entry copies, applying partition/metrics filters when set."""
        always_true = Expressions.always_true()
        filtered = ((self.row_filter is not None
                     and self.row_filter != always_true)
                    or (self.part_filter is not None
                        and self.part_filter != always_true))

        if not filtered:
            return (entry.copy() for entry in self.reader.iterator(
                self.part_filter, self.columns))

        evaluator = self.evaluator()
        metrics_evaluator = self.metrics_evaluator()
        return (entry.copy()
                for entry in self.reader.iterator(self.part_filter,
                                                  self.columns)
                if entry is not None
                and evaluator.eval(entry.partition())
                and metrics_evaluator.eval(entry))
def test_or(strict_schema, strict_file):
    """StrictMetricsEvaluator: OR holds only if one side holds for every row."""
    def strictly_matches(ge_bound):
        expr = Expressions.or_(
            Expressions.less_than("id", 5),
            Expressions.greater_than_or_equal("id", ge_bound))
        return StrictMetricsEvaluator(strict_schema, expr).eval(strict_file)

    assert not strictly_matches(80)
    assert not strictly_matches(60)
    assert strictly_matches(30)
def test_or(assert_all_bound, assert_and_unwrap):
    """OR binds both children to their field ids."""
    bound_expr = Binder.bind(
        STRUCT,
        Expressions.or_(Expressions.greater_than("z", -100),
                        Expressions.less_than("y", 100)))
    assert_all_bound("Or", bound_expr)

    disjunction = assert_and_unwrap(bound_expr, Or)

    # left child references "z" (field 2), right child "y" (field 1)
    assert assert_and_unwrap(disjunction.left, None).ref.field_id == 2
    assert assert_and_unwrap(disjunction.right, None).ref.field_id == 1
def test_and(assert_all_bound, assert_and_unwrap):
    """AND binds both children to their field ids."""
    bound_expr = Binder.bind(
        STRUCT,
        Expressions.and_(Expressions.equal("x", 7),
                         Expressions.less_than("y", 100)))
    assert_all_bound("And", bound_expr)

    conjunction = assert_and_unwrap(bound_expr, And)

    # left child references "x" (field 0), right child "y" (field 1)
    assert assert_and_unwrap(conjunction.left, None).ref.field_id == 0
    assert assert_and_unwrap(conjunction.right, None).ref.field_id == 1
def test_complex_expr():
    """A nested and/or expression translates to the matching dataset filter."""
    expr = Expressions.or_(
        Expressions.and_(Expressions.greater_than('a', 1),
                         Expressions.equal("b", "US")),
        Expressions.equal("c", True))

    column_map = {'a': 'a', 'b': 'b', 'c': 'c'}
    translated_dataset_filter = get_dataset_filter(expr, column_map)

    expected = (((ds.field("a") > 1) & (ds.field("b") == "US")) |
                (ds.field("c") == True))  # noqa: E712
    assert expected.equals(translated_dataset_filter)
Beispiel #18
0
def test_column_rename(primitive_type_test_file):
    """Reading with renamed columns resolves data by field id, not by name."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # table the reader is expected to produce
    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True)
    ])
    columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    source_table = pa.table(columns, schema=schema)

    assert source_table == reader.read()
Beispiel #19
0
    def iterator(self, part_filter=None, columns=None):
        """Iterate the data files of live (non-deleted) entries.

        NOTE(review): part_filter/columns are normalized to match-all
        defaults but never consulted below -- the same files are produced
        regardless; confirm whether filtering was intended here.
        """
        if part_filter is None and columns is None:
            return self.iterator(Expressions.always_true(),
                                 Filterable.ALL_COLUMNS)

        return (entry.file
                for entry in self.entries
                if entry.status != Status.DELETED)
Beispiel #20
0
def test_invalid_conversions(op):
    """Binding a string literal to a float column must raise ValidationException.

    The original try/except let the test pass silently when bind() did not
    raise at all; ``pytest.raises`` makes the exception mandatory.
    """
    struct = StructType.of([NestedField.required(16, "f", FloatType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("f"), "12.40")

    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith(
        'Invalid Value for conversion to type float: "12.40" (StringLiteral)')
Beispiel #21
0
def test_literal_converison(op, assert_and_unwrap):
    """Binding a string literal to a decimal(9, 2) column converts the value."""
    struct = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    unbound = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    bound = assert_and_unwrap(unbound.bind(struct))

    expected = Decimal(12.40).quantize(Decimal(".01"))
    assert bound.lit.value.as_tuple() == expected.as_tuple()
    assert bound.ref.field.field_id == 15
    assert bound.op == op
Beispiel #22
0
def test_comparison_predicate_binding(op, assert_and_unwrap):
    """Comparison predicates bind both the literal and the referenced field."""
    struct = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    bound = assert_and_unwrap(
        UnboundPredicate(op, Expressions.ref("x"), 5).bind(struct))

    assert bound.lit.value == 5
    assert bound.ref.field.field_id == 14
    assert bound.op == op
Beispiel #23
0
def test_missing_field():
    """Binding a predicate on an unknown column must raise ValidationException.

    The original try/except let the test pass silently when bind() did not
    raise at all; ``pytest.raises`` makes the exception mandatory.
    """
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)

    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith(
        "Cannot find field 'missing' in struct")
Beispiel #24
0
def test_complex_expansion():
    """String parsing expands a chained AND right-associatively under the OR."""
    expected_expr = Expressions.or_(
        Expressions.and_(
            Expressions.equal("a", 1),
            Expressions.and_(Expressions.equal("b", 2),
                             Expressions.not_equal("c", 3))),
        Expressions.is_null("d"))

    parsed = Expressions.convert_string_to_expr(
        "(a=1 and b=2 and c<>3) or d is null")
    assert parsed == expected_expr
def test_integer_not_eq(strict_schema, strict_file):
    """!= holds for every row only when the value lies outside [30, 79]."""
    cases = ((5, True), (29, True), (30, False), (75, False),
             (79, False), (80, True), (85, True))
    for value, expected in cases:
        result = StrictMetricsEvaluator(
            strict_schema, Expressions.not_equal("id", value)).eval(strict_file)
        if expected:
            assert result
        else:
            assert not result
def test_integer_eq(schema, file):
    """== may match only when the value lies inside the stats range [30, 79]."""
    cases = ((5, False), (29, False), (80, False), (85, False),
             (30, True), (75, True), (79, True))
    for value, expected in cases:
        result = InclusiveMetricsEvaluator(
            schema, Expressions.equal("id", value)).eval(file)
        if expected:
            assert result
        else:
            assert not result
Beispiel #27
0
def test_compound_filter(primitive_type_test_file):
    """Push an AND of two predicates into the reader and keep only matching rows.

    The source file holds 5 rows; only the first row satisfies
    string_col == "us" AND int_col == 1, so the result table has one row.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    # compound row filter: both predicates must hold
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    # the single surviving row
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Beispiel #28
0
    def __init__(self, ops, table, snapshot_id=None, columns=None, row_filter=None):
        """Initialize a scan over ``table`` accessed through ``ops``.

        :param ops: table operations object used to access table data
        :param table: the table being scanned
        :param snapshot_id: optional snapshot id to scan (None for current)
        :param columns: columns to project (None selects all columns)
        :param row_filter: row filter expression (None means no filter)
        """
        self.ops = ops
        self.table = table
        self.snapshot_id = snapshot_id
        self.columns = columns
        self.row_filter = row_filter

        # NOTE(review): defaults apply only when BOTH columns and row_filter
        # are None; supplying just one leaves the other as None. Confirm
        # downstream code tolerates a None columns/row_filter in that case.
        if self.columns is None and self.row_filter is None:
            self.columns = Filterable.ALL_COLUMNS
            self.row_filter = Expressions.always_true()
def test_missing_stats(strict_schema, missing_stats):
    """Without column stats, the strict evaluator can never guarantee a match."""
    predicates = [Expressions.less_than("no_stats", 5),
                  Expressions.less_than_or_equal("no_stats", 30),
                  Expressions.equal("no_stats", 70),
                  Expressions.greater_than("no_stats", 78),
                  Expressions.greater_than_or_equal("no_stats", 90),
                  Expressions.not_equal("no_stats", 101),
                  Expressions.is_null("no_stats"),
                  Expressions.not_null("no_stats")]

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert not evaluator.eval(missing_stats)
def test_zero_record_file(strict_schema, empty):
    """Every predicate trivially holds for all rows of a zero-record file."""
    predicates = [Expressions.less_than("no_stats", 5),
                  Expressions.less_than_or_equal("no_stats", 30),
                  Expressions.equal("no_stats", 70),
                  Expressions.greater_than("no_stats", 78),
                  Expressions.greater_than_or_equal("no_stats", 90),
                  Expressions.not_equal("no_stats", 101),
                  Expressions.is_null("no_stats"),
                  Expressions.not_null("no_stats")]

    for predicate in predicates:
        evaluator = StrictMetricsEvaluator(strict_schema, predicate)
        assert evaluator.eval(empty)