Exemple #1
0
def test_not_null(assert_and_unwrap):
    """Check how a NOT_NULL predicate binds to optional and required fields.

    ``assert_and_unwrap`` is a fixture supplied by the test harness; it is
    applied to the bound expression and its result is inspected as a bound
    predicate (its exact checks are not visible here).
    """
    not_null = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))

    # Optional field: the predicate survives binding and carries no literal.
    optional_struct = StructType.of(
        [NestedField.optional(21, "s", StringType.get())])
    predicate = assert_and_unwrap(not_null.bind(optional_struct))
    assert Operation.NOT_NULL == predicate.op
    assert 21 == predicate.ref.field.field_id
    assert predicate.lit is None

    # Required field: binding must produce the always-true expression.
    required_struct = StructType.of(
        [NestedField.required(22, "s", StringType.get())])
    assert Expressions.always_true() == not_null.bind(required_struct)
Exemple #2
0
 def get_type(partition_type):
     """Return the struct type describing one data file entry.

     The ``partition`` field (id 102) is typed with ``partition_type``;
     every other field uses a fixed type and a hard-coded field id.
     """
     columns = [
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(
             107, "sort_columns",
             ListType.of_required(112, IntegerType.get())),
         # Per-column statistics: maps with integer keys.
         NestedField.optional(
             108, "column_sizes",
             MapType.of_required(117, 118,
                                 IntegerType.get(), LongType.get())),
         NestedField.optional(
             109, "value_counts",
             MapType.of_required(119, 120,
                                 IntegerType.get(), LongType.get())),
         NestedField.optional(
             110, "null_value_counts",
             MapType.of_required(121, 122,
                                 IntegerType.get(), LongType.get())),
         NestedField.optional(
             125, "lower_bounds",
             MapType.of_required(126, 127,
                                 IntegerType.get(), BinaryType.get())),
         NestedField.optional(
             128, "upper_bounds",
             MapType.of_required(129, 130,
                                 IntegerType.get(), BinaryType.get())),
     ]
     # NEXT ID TO ASSIGN: 131
     return StructType.of(columns)
Exemple #3
0
def test_comparison_predicate_binding(op, assert_and_unwrap):
    """Bind ``x <op> 5`` to an integer field and check the bound predicate.

    ``op`` and ``assert_and_unwrap`` are injected by the test harness
    (presumably a parametrized operation and an unwrapping fixture).
    """
    schema = StructType.of(
        [NestedField.required(14, "x", IntegerType.get())])
    predicate = UnboundPredicate(op, Expressions.ref("x"), 5)

    result = assert_and_unwrap(predicate.bind(schema))

    # Literal, field reference and operation must all survive binding.
    assert 5 == result.lit.value
    assert 14 == result.ref.field.field_id
    assert op == result.op
Exemple #4
0
def get_struct_field(col_type: pa.StructType) -> StructType:
    """Convert an Arrow struct type into an Iceberg type.

    A struct whose first child is named ``"map"`` is delegated to
    ``get_inferred_map``; any other non-empty struct is converted child by
    child with ``get_field``.

    Raises:
        RuntimeError: if ``col_type`` has no fields.
    """
    # Guard clause: an empty struct cannot be converted.
    if col_type.num_fields <= 0:
        raise RuntimeError("Unable to convert type to iceberg %s" % col_type)

    if col_type[0].name == "map":
        return get_inferred_map(col_type)

    converted_children = [get_field(child) for child in col_type]
    return StructType.of(converted_children)
Exemple #5
0
def test_invalid_conversions(op):
    """Binding a string literal to a float field must raise ValidationException.

    ``op`` is injected by the test harness (presumably a parametrized
    comparison operation).
    """
    struct = StructType.of([NestedField.required(16, "f", FloatType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("f"), "12.40")

    try:
        unbound.bind(struct)
    except ValidationException as e:
        assert e.args[0].startswith('Invalid Value for conversion to type float: "12.40" (StringLiteral)')
    else:
        # Without this branch the test would pass vacuously whenever bind()
        # returned normally instead of rejecting the literal.
        raise AssertionError(
            "Expected ValidationException when binding a string literal to a float field")
Exemple #6
0
def test_missing_field():
    """Binding a predicate on a field absent from the struct must raise ValidationException."""
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])

    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)
    try:
        unbound.bind(struct)
    except ValidationException as e:
        assert e.args[0].startswith("Cannot find field 'missing' in struct")
    else:
        # Without this branch the test would pass vacuously whenever bind()
        # returned normally instead of rejecting the unknown field.
        raise AssertionError(
            "Expected ValidationException when binding a reference to a missing field")
Exemple #7
0
def test_literal_converison(op, assert_and_unwrap):
    """Bind the string ``"12.40"`` to a decimal(9, 2) field and check the literal.

    ``op`` and ``assert_and_unwrap`` are injected by the test harness.
    """
    schema = StructType.of(
        [NestedField.required(15, "d", DecimalType.of(9, 2))])
    predicate = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    result = assert_and_unwrap(predicate.bind(schema))

    # Compare as tuples so sign, digits and exponent (the scale) must all
    # match, not just the numeric value.
    expected = Decimal(12.40).quantize(Decimal(".01"))
    assert expected.as_tuple() == result.lit.value.as_tuple()
    assert 15 == result.ref.field.field_id
    assert op == result.op
def test_long_to_integer_conversion(assert_and_unwrap):
    """Check binding of integer-field predicates at and beyond the 32-bit bounds.

    Literals outside ``[JAVA_MIN_INT, JAVA_MAX_INT]`` must fold to
    always-true or always-false depending on the operation; literals exactly
    on a bound must be kept unchanged in the bound predicate.
    """
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])
    above_max = Literal.JAVA_MAX_INT + 1
    below_min = Literal.JAVA_MIN_INT - 1

    def bind(op, value):
        # Build "i <op> value" and bind it against the one-column struct.
        return UnboundPredicate(op, Expressions.ref("i"), value).bind(struct)

    # Comparisons every int satisfies.
    for op, value in ((Operation.LT, above_max),
                      (Operation.LT_EQ, above_max),
                      (Operation.GT, below_min),
                      (Operation.GT_EQ, below_min)):
        assert bind(op, value) == Expressions.always_true()

    # Comparisons no int can satisfy.
    for op, value in ((Operation.GT, above_max),
                      (Operation.GT_EQ, above_max),
                      (Operation.LT, below_min),
                      (Operation.LT_EQ, below_min)):
        assert bind(op, value) == Expressions.always_false()

    # Literals exactly on a bound stay in the predicate as-is.
    for op, limit in ((Operation.LT, Literal.JAVA_MAX_INT),
                      (Operation.LT_EQ, Literal.JAVA_MAX_INT),
                      (Operation.GT, Literal.JAVA_MIN_INT),
                      (Operation.GT_EQ, Literal.JAVA_MIN_INT)):
        bound = assert_and_unwrap(bind(op, limit))
        assert bound.lit.value == limit
def test_double_to_float_conversion(assert_and_unwrap):
    """Check binding of float-field predicates at and beyond the float bounds.

    Literals beyond ``+/- 2 * JAVA_MAX_FLOAT`` must fold to always-true or
    always-false depending on the operation; in-range literals must be kept
    unchanged in the bound predicate.
    """
    struct = StructType.of([NestedField.required(18, "f", FloatType.get())])
    too_large = Literal.JAVA_MAX_FLOAT * 2
    too_small = Literal.JAVA_MAX_FLOAT * -2

    def bind(op, value):
        # Build "f <op> value" and bind it against the one-column struct.
        return UnboundPredicate(op, Expressions.ref("f"), value).bind(struct)

    # Comparisons every float satisfies.
    for op, value in ((Operation.LT, too_large),
                      (Operation.LT_EQ, too_large),
                      (Operation.GT, too_small),
                      (Operation.GT_EQ, too_small)):
        assert bind(op, value) == Expressions.always_true()

    # Comparisons no float can satisfy.
    for op, value in ((Operation.GT, too_large),
                      (Operation.GT_EQ, too_large),
                      (Operation.LT, too_small),
                      (Operation.LT_EQ, too_small)):
        assert bind(op, value) == Expressions.always_false()

    # In-range literals stay in the predicate as-is.
    # NOTE(review): the lower-bound cases use Literal.JAVA_MIN_INT rather
    # than a float minimum; this looks like a copy-paste from the integer
    # test -- confirm whether a float bound was intended.
    for op, limit in ((Operation.LT, Literal.JAVA_MAX_FLOAT),
                      (Operation.LT_EQ, Literal.JAVA_MAX_FLOAT),
                      (Operation.GT, Literal.JAVA_MIN_INT),
                      (Operation.GT_EQ, Literal.JAVA_MIN_INT)):
        bound = assert_and_unwrap(bind(op, limit))
        assert bound.lit.value == limit
Exemple #10
0
def test_multiple_fields(assert_and_unwrap):
    """Bind ``y < 6`` against a three-column struct and check it resolves to ``y``."""
    columns = [
        NestedField.required(10, 'x', IntegerType.get()),
        NestedField.required(11, 'y', IntegerType.get()),
        NestedField.required(12, 'z', IntegerType.get()),
    ]
    schema = StructType.of(columns)

    predicate = UnboundPredicate(Operation.LT, Expressions.ref("y"), 6)
    result = assert_and_unwrap(predicate.bind(schema))

    # The reference must resolve to the middle column (id 11), not x or z.
    assert 11 == result.ref.field.field_id
    assert Operation.LT == result.op
    assert 6 == result.lit.value
Exemple #11
0
def test_nan_errors(row_of):
    """Evaluating is_nan / not_nan must raise NotImplementedError.

    ``row_of`` is a fixture supplied by the test harness; it is called with
    a tuple of values to build the row passed to the evaluator.
    """
    # Placeholder until NaN support is fully implemented
    struct = StructType.of([NestedField.required(34, "f", FloatType.get())])

    def evaluator_for(expression):
        # Evaluator over the single float column for the given expression.
        return exp.evaluator.Evaluator(struct, expression)

    is_nan_evaluator = evaluator_for(exp.expressions.Expressions.is_nan("f"))
    with raises(NotImplementedError):
        is_nan_evaluator.eval(row_of((123.4, )))

    not_nan_evaluator = evaluator_for(exp.expressions.Expressions.not_nan("f"))
    with raises(NotImplementedError):
        not_nan_evaluator.eval(row_of((123.4, )))
class GenericDataFile(DataFile, StructLike):
    """Description of a single data file, stored as plain public attributes."""

    # Shared placeholders used when a file has no partition.
    EMPTY_STRUCT_TYPE = StructType.of([])
    EMPTY_PARTITION_DATA = PartitionData(EMPTY_STRUCT_TYPE)

    def __init__(self, file_path, format, file_size_in_bytes,
                 block_size_in_bytes, row_count=None, partition=None,
                 metrics=None):
        """Record the file's location, sizes, partition and optional metrics.

        Args:
            file_path: stored as ``self.file_path``.
            format: stored as ``self.format``.
            file_size_in_bytes: stored as ``self.file_size_in_bytes``.
            block_size_in_bytes: stored as ``self.block_size_in_bytes``.
            row_count: record count used when ``metrics`` is None.
            partition: partition data; when None the shared empty partition
                is used. Otherwise it must provide ``get_partition_type()``.
            metrics: optional object providing ``row_count``,
                ``column_sizes``, ``value_counts``, ``null_value_counts``,
                ``lower_bounds`` and ``upper_bounds``; when given, its
                ``row_count`` overrides the ``row_count`` argument.
        """
        self.file_path = file_path
        self.format = format
        self.row_count = row_count
        self.file_size_in_bytes = file_size_in_bytes
        self.block_size_in_bytes = block_size_in_bytes
        self.file_ordinal = None
        self.sort_columns = None

        if partition is not None:
            self.partition_data = partition
            self.partition_type = partition.get_partition_type()
        else:
            unpartitioned = GenericDataFile.EMPTY_PARTITION_DATA
            self.partition_data = unpartitioned
            self.partition_type = unpartitioned.partition_type

        if metrics is not None:
            # Metrics take precedence over the row_count argument.
            self.row_count = metrics.row_count
            self.column_sizes = metrics.column_sizes
            self.value_counts = metrics.value_counts
            self.null_value_counts = metrics.null_value_counts
            self.lower_bounds = metrics.lower_bounds
            self.upper_bounds = metrics.upper_bounds
        else:
            self.row_count = row_count
            self.column_sizes = None
            self.value_counts = None
            self.null_value_counts = None
            self.lower_bounds = None
            self.upper_bounds = None

    def partition(self):
        """Return the partition data recorded for this file."""
        return self.partition_data

    @staticmethod
    def get_avro_schema(partition_type):
        """Return the Avro schema for a data file with the given partition type."""
        struct_type = DataFile.get_type(partition_type)
        # NOTE(review): ``DataFile.__class__`` is DataFile's metaclass, so
        # this is the metaclass name rather than "DataFile" -- confirm
        # whether ``DataFile.__name__`` was intended.
        record_name = DataFile.__class__.__name__
        return IcebergToAvro.type_to_schema(struct_type, record_name)
Exemple #13
0
    def convert_record_type(avro_field, next_id=None):
        """Convert an Avro ``record`` definition into an Iceberg struct type.

        Args:
            avro_field: mapping describing the Avro field; its type property
                must be ``"record"``.
            next_id: id counter threaded through the per-field conversion;
                defaults to the number of fields in the record.

        Returns:
            A ``(StructType, next_id)`` tuple, where ``next_id`` is the value
            returned by the last per-field conversion.

        Raises:
            RuntimeError: if the field type is not ``"record"``.
        """
        type_name = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if type_name != "record":
            # NOTE(review): "muse" is presumably a typo for "must"; left
            # unchanged because callers or tests may match this message.
            raise RuntimeError("Field type muse be 'record': %s" % type_name)

        avro_children = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
        if next_id is None:
            next_id = len(avro_children)

        converted = []
        for child in avro_children:
            # Each conversion hands back the updated id counter.
            converted_child, next_id = AvroToIceberg.convert_avro_field_to_iceberg(
                child, next_id=next_id)
            converted.append(converted_child)

        return StructType.of(converted), next_id
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    """Convert a Parquet file with list and struct columns and compare schemas.

    ``unnested_complex_type_test_parquet_file`` is supplied by the test
    harness and passed straight to ``convert_parquet_to_iceberg``.
    """
    int_list = ListType.of_optional(3, IntegerType.get())
    str_list = ListType.of_optional(6, StringType.get())
    two_field_struct = StructType.of([
        NestedField.optional(8, "f1", IntegerType.get()),
        NestedField.optional(9, "f2", StringType.get()),
    ])
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col", int_list),
        NestedField.optional(4, "list_str_col", str_list),
        NestedField.optional(7, "struct_col", two_field_struct),
    ])

    actual_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)

    compare_schema(expected_schema, actual_schema)
    def struct_from_dict(dict_obj):
        """Build a ``StructType`` from its dictionary representation.

        Each entry under ``SchemaParser.FIELDS`` becomes a required or
        optional ``NestedField`` depending on its ``SchemaParser.REQUIRED``
        flag; the field type is parsed with ``SchemaParser.type_from_dict``.
        """
        def to_nested_field(entry):
            # Read the entry's properties in a fixed order, then pick the
            # matching NestedField factory.
            field_id = entry.get(SchemaParser.ID)
            field_name = entry.get(SchemaParser.NAME)
            field_type = SchemaParser.type_from_dict(
                entry.get(SchemaParser.TYPE))
            if entry.get(SchemaParser.REQUIRED):
                factory = NestedField.required
            else:
                factory = NestedField.optional
            return factory(field_id, field_name, field_type)

        entries = dict_obj.get(SchemaParser.FIELDS)
        return StructType.of([to_nested_field(entry) for entry in entries])
Exemple #16
0
def supported_primitives():
    """Return a struct type with one field per primitive type used in tests.

    Every field carries its own field id. The three decimal fields
    previously all used id 114, which made the ids ambiguous; they are now
    numbered 114, 115 and 116.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        # Decimals at three precisions, each with a distinct field id.
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
class GenericDataFile(DataFile, StructLike):
    """Description of a single data file, exposed through accessor methods.

    State lives in underscore-prefixed attributes set by ``__init__``; each
    accessor returns the corresponding attribute unchanged.
    """

    # Shared placeholders used when a file has no partition.
    EMPTY_STRUCT_TYPE = StructType.of([])
    EMPTY_PARTITION_DATA = PartitionData(EMPTY_STRUCT_TYPE)

    def __init__(self, file_path, format, file_size_in_bytes,
                 block_size_in_bytes, row_count=None, partition=None,
                 metrics=None):
        """Record the file's location, sizes, partition and optional metrics.

        Args:
            file_path: value returned by ``path()``.
            format: value returned by ``format()``.
            file_size_in_bytes: value returned by ``file_size_in_bytes()``.
            block_size_in_bytes: value returned by ``block_size_in_bytes()``.
            row_count: record count used when ``metrics`` is None.
            partition: partition data; when None the shared empty partition
                is used. Otherwise it must provide ``get_partition_type()``.
            metrics: optional object providing ``row_count``,
                ``column_sizes``, ``value_counts``, ``null_value_counts``,
                ``lower_bounds`` and ``upper_bounds``; when given, its
                ``row_count`` overrides the ``row_count`` argument.
        """
        self._file_path = file_path
        self._format = format
        self._row_count = row_count
        self._file_size_in_bytes = file_size_in_bytes
        self._block_size_in_bytes = block_size_in_bytes
        self._file_ordinal = None
        self._sort_columns = None

        if partition is not None:
            self._partition_data = partition
            self._partition_type = partition.get_partition_type()
        else:
            unpartitioned = GenericDataFile.EMPTY_PARTITION_DATA
            self._partition_data = unpartitioned
            self._partition_type = unpartitioned.partition_type

        if metrics is not None:
            # Metrics take precedence over the row_count argument.
            self._row_count = metrics.row_count
            self._column_sizes = metrics.column_sizes
            self._value_counts = metrics.value_counts
            self._null_value_counts = metrics.null_value_counts
            self._lower_bounds = metrics.lower_bounds
            self._upper_bounds = metrics.upper_bounds
        else:
            self._row_count = row_count
            self._column_sizes = None
            self._value_counts = None
            self._null_value_counts = None
            self._lower_bounds = None
            self._upper_bounds = None

    def partition(self):
        """Return the partition data recorded for this file."""
        return self._partition_data

    def path(self):
        """Return the file path."""
        return self._file_path

    def format(self):
        """Return the file format."""
        return self._format

    def record_count(self):
        """Return the row count (taken from metrics when they were given)."""
        return self._row_count

    def file_size_in_bytes(self):
        """Return the file size in bytes."""
        return self._file_size_in_bytes

    def block_size_in_bytes(self):
        """Return the block size in bytes."""
        return self._block_size_in_bytes

    def file_ordinal(self):
        """Return the file ordinal (``__init__`` always sets it to None)."""
        return self._file_ordinal

    def sort_columns(self):
        """Return the sort columns (``__init__`` always sets them to None)."""
        return self._sort_columns

    def column_sizes(self):
        """Return the column sizes from metrics, or None without metrics."""
        return self._column_sizes

    def value_counts(self):
        """Return the value counts from metrics, or None without metrics."""
        return self._value_counts

    def null_value_counts(self):
        """Return the null value counts from metrics, or None without metrics."""
        return self._null_value_counts

    def lower_bounds(self):
        """Return the lower bounds from metrics, or None without metrics."""
        return self._lower_bounds

    def upper_bounds(self):
        """Return the upper bounds from metrics, or None without metrics."""
        return self._upper_bounds

    def copy(self):
        """Return a deep copy of this data file (see ``__deepcopy__``)."""
        # Inside the method body ``copy`` resolves to the module, not to
        # this method, because class scope is not searched from functions.
        return copy.deepcopy(self)

    @staticmethod
    def get_avro_schema(partition_type):
        """Return the Avro schema for a data file with the given partition type."""
        struct_type = DataFile.get_type(partition_type)
        # NOTE(review): ``DataFile.__class__`` is DataFile's metaclass, so
        # this is the metaclass name rather than "DataFile" -- confirm
        # whether ``DataFile.__name__`` was intended.
        record_name = DataFile.__class__.__name__
        return IcebergToAvro.type_to_schema(struct_type, record_name)

    def __repr__(self):
        """Return a multi-line ``GenericDataFile(...)`` listing of the main fields."""
        labelled_values = (
            ("file_path", self._file_path),
            ("file_format", self._format),
            ("partition", self._partition_data),
            ("record_count", self._row_count),
            ("file_size_in_bytes", self._file_size_in_bytes),
            ("block_size_in_bytes", self._block_size_in_bytes),
            ("column_sizes", self._column_sizes),
            ("value_counts", self._value_counts),
            ("null_value_counts", self._null_value_counts),
            ("lower_bounds", self._lower_bounds),
            ("upper_bounds", self._upper_bounds),
        )
        rendered = ["{}: {}".format(label, value)
                    for label, value in labelled_values]
        # The separator is a newline followed by a comma, as in the
        # established output format.
        return "GenericDataFile({})".format("\n,".join(rendered))

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def __deepcopy__(self, memodict):
        """Create a new instance without calling ``__init__`` and deep-copy every attribute."""
        duplicate = self.__class__.__new__(self.__class__)
        # Register the copy before descending so self-references resolve.
        memodict[id(self)] = duplicate

        for attr_name, attr_value in vars(self).items():
            setattr(duplicate, attr_name, copy.deepcopy(attr_value, memodict))

        return duplicate
Exemple #18
0
def test_char_seq_value(row_of):
    """An equality predicate on a string column matches only the exact value.

    ``row_of`` is a fixture supplied by the test harness; it is called with
    a tuple of values to build the row passed to the evaluator.
    """
    schema = StructType.of(
        [NestedField.required(34, "s", StringType.get())])
    equals_abc = exp.expressions.Expressions.equal("s", "abc")
    evaluator = exp.evaluator.Evaluator(schema, equals_abc)

    assert evaluator.eval(row_of(("abc",)))
    # A longer string sharing the prefix must not match.
    assert not evaluator.eval(row_of(("abcd",)))
Exemple #19
0
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
                               NestedField,
                               StringType,
                               StructType)
from iceberg.exceptions import ValidationException
from pytest import raises

# Row schema shared by the evaluator tests: required ints "x" (id 13) and
# "y" (id 14), plus an optional int "z" (id 15).
STRUCT = StructType.of([
    NestedField.required(13, "x", IntegerType.get()),
    NestedField.required(14, "y", IntegerType.get()),
    NestedField.optional(15, "z", IntegerType.get()),
])


def test_less_than(row_of):
    """``x < 7`` rejects a row with x == 7 and accepts a row with x == 6."""
    predicate = exp.expressions.Expressions.less_than("x", 7)
    evaluator = exp.evaluator.Evaluator(STRUCT, predicate)

    assert not evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))


def test_less_than_or_equal(row_of):
    """``x <= 7`` accepts rows with x == 7 and with x == 6."""
    predicate = exp.expressions.Expressions.less_than_or_equal("x", 7)
    evaluator = exp.evaluator.Evaluator(STRUCT, predicate)

    assert evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from iceberg.api.expressions import (And, Binder, Expressions, Not, Or)
from iceberg.api.types import (IntegerType, NestedField, StructType)
import iceberg.exceptions as ice_ex
from pytest import raises

# Schema shared by the binder tests: required int columns "x", "y" and "z"
# with field ids 0, 1 and 2 respectively.
STRUCT = StructType.of([
    NestedField.required(field_id, column_name, IntegerType.get())
    for field_id, column_name in enumerate(("x", "y", "z"))
])


def test_missing_reference():
    """Binding an expression that references an unknown field must raise.

    The expression ANDs a predicate on the missing field "t" with one on
    the existing field "x"; the binder is expected to reject it with a
    ValidationException naming the missing field.
    """
    expr = Expressions.and_(Expressions.equal("t", 5),
                            Expressions.equal("x", 7))
    # ``raises`` fails the test when no exception is thrown; the previous
    # try/except form passed silently in that case.
    with raises(ice_ex.ValidationException) as exc_info:
        Binder.bind(STRUCT, expr)
    assert "Cannot find field 't' in struct" in "{}".format(exc_info.value)


def test_bound_expression_fails():
    with raises(RuntimeError):