def test_not_null(assert_and_unwrap):
    """NOT_NULL binds against an optional field; on a required field it folds to always-true."""
    schema_optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    predicate = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))

    bound = assert_and_unwrap(predicate.bind(schema_optional))
    assert bound.op == Operation.NOT_NULL
    assert bound.ref.field.field_id == 21
    assert bound.lit is None  # NOT_NULL carries no literal

    # A required column can never be null, so the predicate is trivially true.
    schema_required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert predicate.bind(schema_required) == Expressions.always_true()
def get_type(partition_type):
    """Return the data-file metadata schema for files with the given partition type."""
    # NEXT ID TO ASSIGN: 131
    fields = [
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns",
                             ListType.of_required(112, IntegerType.get())),
        NestedField.optional(108, "column_sizes",
                             MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
        NestedField.optional(109, "value_counts",
                             MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
        NestedField.optional(110, "null_value_counts",
                             MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
        NestedField.optional(125, "lower_bounds",
                             MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
        NestedField.optional(128, "upper_bounds",
                             MapType.of_required(129, 130, IntegerType.get(), BinaryType.get())),
    ]
    return StructType.of(fields)
def test_comparison_predicate_binding(op, assert_and_unwrap):
    """Binding a comparison predicate resolves the reference and keeps op and literal."""
    schema = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    bound = assert_and_unwrap(
        UnboundPredicate(op, Expressions.ref("x"), 5).bind(schema))
    assert bound.lit.value == 5
    assert bound.ref.field.field_id == 14
    assert bound.op == op
def get_struct_field(col_type: pa.StructType) -> StructType:
    """Convert a non-empty pyarrow struct type to an Iceberg StructType.

    Raises:
        RuntimeError: if the struct has no fields.
    """
    if col_type.num_fields == 0:
        raise RuntimeError("Unable to convert type to iceberg %s" % col_type)
    # First child named "map" presumably marks a map encoded as a struct;
    # delegate to map inference in that case — verify against the callers.
    if col_type[0].name == "map":
        return get_inferred_map(col_type)
    converted_fields = [get_field(child) for child in col_type]
    return StructType.of(converted_fields)
def test_invalid_conversions(op):
    """Binding a float predicate against a non-numeric string literal must raise.

    Bug fix: the original try/except passed silently when bind() did not
    raise at all; the `else` branch now fails the test in that case.
    """
    struct = StructType.of([NestedField.required(16, "f", FloatType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("f"), "12.40")
    try:
        unbound.bind(struct)
    except ValidationException as e:
        assert e.args[0].startswith(
            'Invalid Value for conversion to type float: "12.40" (StringLiteral)')
    else:
        raise AssertionError("Expected ValidationException from bind()")
def test_missing_field():
    """Binding a predicate on a column absent from the struct must raise.

    Bug fix: the original try/except passed silently when bind() did not
    raise at all; the `else` branch now fails the test in that case.
    """
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)
    try:
        unbound.bind(struct)
    except ValidationException as e:
        assert e.args[0].startswith("Cannot find field 'missing' in struct")
    else:
        raise AssertionError("Expected ValidationException from bind()")
def test_literal_converison(op, assert_and_unwrap):
    """String literals are converted to the decimal type of the bound field."""
    schema = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    bound = assert_and_unwrap(
        UnboundPredicate(op, Expressions.ref("d"), "12.40").bind(schema))
    expected = Decimal(12.40).quantize(Decimal(".01"))
    assert bound.lit.value.as_tuple() == expected.as_tuple()
    assert bound.ref.field.field_id == 15
    assert bound.op == op
def test_long_to_integer_conversion(assert_and_unwrap):
    """Long literals outside the int32 range fold to constant expressions; boundary values bind."""
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])
    above_max = Literal.JAVA_MAX_INT + 1
    below_min = Literal.JAVA_MIN_INT - 1

    # Out-of-range literals in the "open" direction make the predicate trivially true.
    for op, lit in [(Operation.LT, above_max), (Operation.LT_EQ, above_max),
                    (Operation.GT, below_min), (Operation.GT_EQ, below_min)]:
        assert UnboundPredicate(op, Expressions.ref("i"), lit).bind(struct) == \
            Expressions.always_true()

    # ...and trivially false in the opposite direction.
    for op, lit in [(Operation.GT, above_max), (Operation.GT_EQ, above_max),
                    (Operation.LT, below_min), (Operation.LT_EQ, below_min)]:
        assert UnboundPredicate(op, Expressions.ref("i"), lit).bind(struct) == \
            Expressions.always_false()

    # Exact int32 boundaries stay bound with the literal value intact.
    for op, lit in [(Operation.LT, Literal.JAVA_MAX_INT),
                    (Operation.LT_EQ, Literal.JAVA_MAX_INT),
                    (Operation.GT, Literal.JAVA_MIN_INT),
                    (Operation.GT_EQ, Literal.JAVA_MIN_INT)]:
        bound = assert_and_unwrap(
            UnboundPredicate(op, Expressions.ref("i"), lit).bind(struct))
        assert bound.lit.value == lit
def test_double_to_float_conversion(assert_and_unwrap):
    """Double literals outside the float range fold to constant expressions; boundary values bind."""
    struct = StructType.of([NestedField.required(18, "f", FloatType.get())])
    over = Literal.JAVA_MAX_FLOAT * 2
    under = Literal.JAVA_MAX_FLOAT * -2

    # Out-of-range literals in the "open" direction make the predicate trivially true.
    for op, lit in [(Operation.LT, over), (Operation.LT_EQ, over),
                    (Operation.GT, under), (Operation.GT_EQ, under)]:
        assert UnboundPredicate(op, Expressions.ref("f"), lit).bind(struct) == \
            Expressions.always_true()

    # ...and trivially false in the opposite direction.
    for op, lit in [(Operation.GT, over), (Operation.GT_EQ, over),
                    (Operation.LT, under), (Operation.LT_EQ, under)]:
        assert UnboundPredicate(op, Expressions.ref("f"), lit).bind(struct) == \
            Expressions.always_false()

    # NOTE(review): the lower-boundary checks use JAVA_MIN_INT rather than a
    # float boundary, mirroring the original test — confirm this is intentional.
    for op, lit in [(Operation.LT, Literal.JAVA_MAX_FLOAT),
                    (Operation.LT_EQ, Literal.JAVA_MAX_FLOAT),
                    (Operation.GT, Literal.JAVA_MIN_INT),
                    (Operation.GT_EQ, Literal.JAVA_MIN_INT)]:
        bound = assert_and_unwrap(
            UnboundPredicate(op, Expressions.ref("f"), lit).bind(struct))
        assert bound.lit.value == lit
def test_multiple_fields(assert_and_unwrap):
    """Binding resolves the reference to the correct field among several candidates."""
    schema = StructType.of([NestedField.required(10, 'x', IntegerType.get()),
                            NestedField.required(11, 'y', IntegerType.get()),
                            NestedField.required(12, 'z', IntegerType.get())])
    bound = assert_and_unwrap(
        UnboundPredicate(Operation.LT, Expressions.ref("y"), 6).bind(schema))
    assert bound.ref.field.field_id == 11
    assert bound.op == Operation.LT
    assert bound.lit.value == 6
def test_nan_errors(row_of):
    """is_nan / not_nan evaluation raises until NaN support is fully implemented."""
    struct = StructType.of([NestedField.required(34, "f", FloatType.get())])
    for predicate in (exp.expressions.Expressions.is_nan("f"),
                      exp.expressions.Expressions.not_nan("f")):
        evaluator = exp.evaluator.Evaluator(struct, predicate)
        with raises(NotImplementedError):
            evaluator.eval(row_of((123.4, )))
class GenericDataFile(DataFile, StructLike):
    """DataFile implementation holding file metadata as plain attributes."""

    EMPTY_STRUCT_TYPE = StructType.of([])
    EMPTY_PARTITION_DATA = PartitionData(EMPTY_STRUCT_TYPE)

    def __init__(self, file_path, format, file_size_in_bytes, block_size_in_bytes,
                 row_count=None, partition=None, metrics=None):
        self.file_path = file_path
        self.format = format
        self.row_count = row_count
        self.file_size_in_bytes = file_size_in_bytes
        self.block_size_in_bytes = block_size_in_bytes
        self.file_ordinal = None
        self.sort_columns = None

        if partition is None:
            # Unpartitioned files share the empty partition singleton.
            self.partition_data = GenericDataFile.EMPTY_PARTITION_DATA
            self.partition_type = GenericDataFile.EMPTY_PARTITION_DATA.partition_type
        else:
            self.partition_data = partition
            self.partition_type = partition.get_partition_type()

        if metrics is None:
            self.row_count = row_count
            self.column_sizes = None
            self.value_counts = None
            self.null_value_counts = None
            self.lower_bounds = None
            self.upper_bounds = None
        else:
            # Metrics, when provided, override the explicit row_count argument.
            self.row_count = metrics.row_count
            self.column_sizes = metrics.column_sizes
            self.value_counts = metrics.value_counts
            self.null_value_counts = metrics.null_value_counts
            self.lower_bounds = metrics.lower_bounds
            self.upper_bounds = metrics.upper_bounds

    def partition(self):
        return self.partition_data

    @staticmethod
    def get_avro_schema(partition_type):
        # Bug fix: DataFile.__class__.__name__ is the metaclass name ("type"),
        # not "DataFile"; use DataFile.__name__ for the record name.
        return IcebergToAvro.type_to_schema(DataFile.get_type(partition_type),
                                            DataFile.__name__)
def convert_record_type(avro_field, next_id=None):
    """Convert an Avro 'record' schema dict to an Iceberg StructType.

    Returns a (StructType, next_id) tuple so callers can keep assigning
    unique field ids across nested conversions.

    Raises:
        RuntimeError: if the Avro field is not a record.
    """
    avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
    if avro_field_type != "record":
        # Bug fix: error-message typo ("muse be" -> "must be").
        raise RuntimeError("Field type must be 'record': %s" % avro_field_type)

    fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
    iceberg_fields = []
    if next_id is None:
        # Seeds the id counter with the child count — presumably reserving one
        # id per immediate field before recursing; verify against callers.
        next_id = len(fields)
    for field in fields:
        iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
        iceberg_fields.append(iceberg_field)
    return StructType.of(iceberg_fields), next_id
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    """Parquet lists and structs convert to the matching Iceberg complex types."""
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col", ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col", ListType.of_optional(6, StringType.get())),
        NestedField.optional(7, "struct_col",
                             StructType.of([NestedField.optional(8, "f1", IntegerType.get()),
                                            NestedField.optional(9, "f2", StringType.get())])),
    ])
    converted = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted)
def struct_from_dict(dict_obj):
    """Build a StructType from a parsed JSON schema dictionary."""
    struct_fields = []
    for field in dict_obj.get(SchemaParser.FIELDS):
        field_id = field.get(SchemaParser.ID)
        field_name = field.get(SchemaParser.NAME)
        field_type = SchemaParser.type_from_dict(field.get(SchemaParser.TYPE))
        # Required-ness selects which NestedField factory to use.
        make_field = NestedField.required if field.get(SchemaParser.REQUIRED) \
            else NestedField.optional
        struct_fields.append(make_field(field_id, field_name, field_type))
    return StructType.of(struct_fields)
def supported_primitives():
    """Return a struct with one field of every supported primitive type.

    Bug fix: field ids must be unique within a schema; the original reused
    id 114 for all three decimal fields. They are now 114, 115, 116 (matching
    the Java test fixture this was ported from).
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
class GenericDataFile(DataFile, StructLike):
    """DataFile implementation exposing file metadata through accessor methods."""

    EMPTY_STRUCT_TYPE = StructType.of([])
    EMPTY_PARTITION_DATA = PartitionData(EMPTY_STRUCT_TYPE)

    def __init__(self, file_path, format, file_size_in_bytes, block_size_in_bytes,
                 row_count=None, partition=None, metrics=None):
        self._file_path = file_path
        self._format = format
        self._row_count = row_count
        self._file_size_in_bytes = file_size_in_bytes
        self._block_size_in_bytes = block_size_in_bytes
        self._file_ordinal = None
        self._sort_columns = None

        if partition is None:
            # Unpartitioned files share the empty partition singleton.
            self._partition_data = GenericDataFile.EMPTY_PARTITION_DATA
            self._partition_type = GenericDataFile.EMPTY_PARTITION_DATA.partition_type
        else:
            self._partition_data = partition
            self._partition_type = partition.get_partition_type()

        if metrics is None:
            self._row_count = row_count
            self._column_sizes = None
            self._value_counts = None
            self._null_value_counts = None
            self._lower_bounds = None
            self._upper_bounds = None
        else:
            # Metrics, when provided, override the explicit row_count argument.
            self._row_count = metrics.row_count
            self._column_sizes = metrics.column_sizes
            self._value_counts = metrics.value_counts
            self._null_value_counts = metrics.null_value_counts
            self._lower_bounds = metrics.lower_bounds
            self._upper_bounds = metrics.upper_bounds

    def partition(self):
        return self._partition_data

    def path(self):
        return self._file_path

    def format(self):
        return self._format

    def record_count(self):
        return self._row_count

    def file_size_in_bytes(self):
        return self._file_size_in_bytes

    def block_size_in_bytes(self):
        return self._block_size_in_bytes

    def file_ordinal(self):
        return self._file_ordinal

    def sort_columns(self):
        return self._sort_columns

    def column_sizes(self):
        return self._column_sizes

    def value_counts(self):
        return self._value_counts

    def null_value_counts(self):
        return self._null_value_counts

    def lower_bounds(self):
        return self._lower_bounds

    def upper_bounds(self):
        return self._upper_bounds

    def copy(self):
        return copy.deepcopy(self)

    @staticmethod
    def get_avro_schema(partition_type):
        # Bug fix: DataFile.__class__.__name__ is the metaclass name ("type"),
        # not "DataFile"; use DataFile.__name__ for the record name.
        return IcebergToAvro.type_to_schema(DataFile.get_type(partition_type),
                                            DataFile.__name__)

    def __repr__(self):
        fields = [
            "file_path: {}".format(self._file_path),
            "file_format: {}".format(self._format),
            "partition: {}".format(self._partition_data),
            "record_count: {}".format(self._row_count),
            "file_size_in_bytes: {}".format(self._file_size_in_bytes),
            "block_size_in_bytes: {}".format(self._block_size_in_bytes),
            "column_sizes: {}".format(self._column_sizes),
            "value_counts: {}".format(self._value_counts),
            "null_value_counts: {}".format(self._null_value_counts),
            "lower_bounds: {}".format(self._lower_bounds),
            "upper_bounds: {}".format(self._upper_bounds),
        ]
        # NOTE(review): the "\n," separator looks like it was meant to be ",\n";
        # kept as-is to avoid changing repr output — confirm before changing.
        return "GenericDataFile({})".format("\n,".join(fields))

    def __str__(self):
        return self.__repr__()

    def __deepcopy__(self, memodict):
        # Manual deepcopy that bypasses __init__ and deep-copies every attribute.
        cls = self.__class__
        result = cls.__new__(cls)
        memodict[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, copy.deepcopy(v, memodict))
        return result
def test_char_seq_value(row_of):
    """String equality compares full values, not prefixes."""
    schema = StructType.of([NestedField.required(34, "s", StringType.get())])
    evaluator = exp.evaluator.Evaluator(
        schema, exp.expressions.Expressions.equal("s", "abc"))
    assert evaluator.eval(row_of(("abc",)))
    assert not evaluator.eval(row_of(("abcd",)))
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
                               NestedField,
                               StringType,
                               StructType)
from iceberg.exceptions import ValidationException
from pytest import raises

# Shared three-column schema used by the evaluator tests below.
STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
                        NestedField.required(14, "y", IntegerType.get()),
                        NestedField.optional(15, "z", IntegerType.get())])


def test_less_than(row_of):
    """x < 7 is false at the boundary and true below it."""
    evaluator = exp.evaluator.Evaluator(
        STRUCT, exp.expressions.Expressions.less_than("x", 7))
    assert not evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))


def test_less_than_or_equal(row_of):
    """x <= 7 is true both at the boundary and below it."""
    evaluator = exp.evaluator.Evaluator(
        STRUCT, exp.expressions.Expressions.less_than_or_equal("x", 7))
    assert evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))
# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from iceberg.api.expressions import (And, Binder, Expressions, Not, Or) from iceberg.api.types import (IntegerType, NestedField, StructType) import iceberg.exceptions as ice_ex from pytest import raises STRUCT = StructType.of([ NestedField.required(0, "x", IntegerType.get()), NestedField.required(1, "y", IntegerType.get()), NestedField.required(2, "z", IntegerType.get()) ]) def test_missing_reference(): expr = Expressions.and_(Expressions.equal("t", 5), Expressions.equal("x", 7)) try: Binder.bind(STRUCT, expr) except ice_ex.ValidationException as e: assert "Cannot find field 't' in struct" in "{}".format(e) def test_bound_expression_fails(): with raises(RuntimeError):