def test_decimal_to_decimal_conversion():
    lit = Literal.of(Decimal("34.11").quantize(Decimal(".01")))

    # Widening precision at the same scale preserves the value exactly.
    assert lit.value.as_tuple() == lit.to(DecimalType.of(9, 2)).value.as_tuple()
    assert lit.value.as_tuple() == lit.to(DecimalType.of(11, 2)).value.as_tuple()

    # Conversions that would change the scale are rejected and return None.
    assert lit.to(DecimalType.of(9, 0)) is None
    assert lit.to(DecimalType.of(9, 1)) is None
    assert lit.to(DecimalType.of(9, 3)) is None
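# For reference (a stdlib-only sketch, not part of the suite): the assertions
# above lean on Decimal.quantize pinning the exponent and as_tuple() exposing
# (sign, digits, exponent), which is what makes scale changes observable.
from decimal import Decimal

d = Decimal("34.11").quantize(Decimal(".01"))
assert d.as_tuple() == (0, (3, 4, 1, 1), -2)  # sign 0, digits 3411, exponent -2 (scale 2)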
def test_string_to_decimal_literal():
    decimal_str = Literal.of("34.560")
    decimal_lit = decimal_str.to(DecimalType.of(9, 3))

    assert 3 == abs(decimal_lit.value.as_tuple().exponent)
    assert Decimal("34.560").as_tuple() == decimal_lit.value.as_tuple()

    # The string's scale must match the target type's scale exactly.
    assert decimal_str.to(DecimalType.of(9, 2)) is None
    assert decimal_str.to(DecimalType.of(9, 4)) is None
def rg_expected_schema():
    return Schema([NestedField.required(1, "string_col", StringType.get()),
                   NestedField.required(2, "long_col", LongType.get()),
                   NestedField.required(3, "int_col", IntegerType.get()),
                   NestedField.optional(4, "float_col", FloatType.get()),
                   NestedField.optional(5, "null_col", StringType.get()),
                   NestedField.optional(6, "missing_col", StringType.get()),
                   NestedField.optional(7, "no_stats_col", StringType.get()),
                   NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
                   NestedField.optional(9, "ts_wotz_col", TimestampType.without_timezone()),
                   NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
                   NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
                   NestedField.optional(12, "date_type", DateType.get())])
def test_literal_conversion(op, assert_and_unwrap):
    struct = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    unbound = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    bound = assert_and_unwrap(unbound.bind(struct))

    # The string literal is converted to a decimal at the column's scale.
    assert Decimal("12.40").as_tuple() == bound.lit.value.as_tuple()
    assert 15 == bound.ref.field.field_id
    assert op == bound.op
def test_big_decimal_to_human_string():
    big_dec = DecimalType.of(9, 2)
    identity = Transforms.identity(big_dec)
    dec_str = "-1.50"
    dec_var = Decimal(dec_str)

    assert identity.to_human_string(dec_var) == dec_str
def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([NestedField.required(1, "int_col", IntegerType.get()),
                              NestedField.optional(2, "bigint_col", LongType.get()),
                              NestedField.optional(3, "str_col", StringType.get()),
                              NestedField.optional(4, "float_col", FloatType.get()),
                              NestedField.optional(5, "dbl_col", DoubleType.get()),
                              NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
                              NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
                              NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
                              NestedField.optional(9, "date_col", DateType.get()),
                              NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
                              NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
                              NestedField.optional(12, "bool_col", BooleanType.get())])

    compare_schema(expected_schema,
                   convert_parquet_to_iceberg(primitive_type_test_parquet_file))
def supported_primitives():
    return StructType.of([NestedField.required(100, "id", LongType.get()),
                          NestedField.optional(101, "data", StringType.get()),
                          NestedField.required(102, "b", BooleanType.get()),
                          NestedField.optional(103, "i", IntegerType.get()),
                          NestedField.required(104, "l", LongType.get()),
                          NestedField.optional(105, "f", FloatType.get()),
                          NestedField.required(106, "d", DoubleType.get()),
                          NestedField.optional(107, "date", DateType.get()),
                          NestedField.required(108, "ts", TimestampType.with_timezone()),
                          NestedField.required(110, "s", StringType.get()),
                          NestedField.required(111, "uuid", UUIDType.get()),
                          NestedField.required(112, "fixed", FixedType.of_length(7)),
                          NestedField.optional(113, "bytes", BinaryType.get()),
                          NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
                          NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
                          NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))])
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    # The last three fields omit "field-id", which the parser must reject.
    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    expected_schema = Schema([NestedField.required(1, "int_col", IntegerType.get()),
                              NestedField.optional(2, "bigint_col", LongType.get()),
                              NestedField.optional(3, "str_col", StringType.get()),
                              NestedField.optional(4, "float_col", FloatType.get()),
                              NestedField.optional(5, "dbl_col", DoubleType.get()),
                              NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
                              NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
                              NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
                              NestedField.optional(9, "date_col", DateType.get()),
                              NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
                              NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
                              NestedField.optional(12, "bool_col", BooleanType.get())])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)
    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    assert reader.read() == source_table
def test_partition_spec(self):
    schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                    NestedField.required(2, "l", LongType.get()),
                    NestedField.required(3, "d", DateType.get()),
                    NestedField.required(4, "t", TimeType.get()),
                    NestedField.required(5, "ts", TimestampType.without_timezone()),
                    NestedField.required(6, "dec", DecimalType.of(9, 2)),
                    NestedField.required(7, "s", StringType.get()),
                    NestedField.required(8, "u", UUIDType.get()),
                    NestedField.required(9, "f", FixedType.of_length(3)),
                    NestedField.required(10, "b", BinaryType.get()))

    specs = [PartitionSpec.builder_for(schema).identity("i").build(),
             PartitionSpec.builder_for(schema).identity("l").build(),
             PartitionSpec.builder_for(schema).identity("d").build(),
             PartitionSpec.builder_for(schema).identity("t").build(),
             PartitionSpec.builder_for(schema).identity("ts").build(),
             PartitionSpec.builder_for(schema).identity("dec").build(),
             PartitionSpec.builder_for(schema).identity("s").build(),
             PartitionSpec.builder_for(schema).identity("u").build(),
             PartitionSpec.builder_for(schema).identity("f").build(),
             PartitionSpec.builder_for(schema).identity("b").build(),
             PartitionSpec.builder_for(schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(schema).bucket("u", 128).build(),
             PartitionSpec.builder_for(schema).bucket("f", 128).build(),
             PartitionSpec.builder_for(schema).bucket("b", 128).build(),
             PartitionSpec.builder_for(schema).year("d").build(),
             PartitionSpec.builder_for(schema).month("d").build(),
             PartitionSpec.builder_for(schema).day("d").build(),
             PartitionSpec.builder_for(schema).year("ts").build(),
             PartitionSpec.builder_for(schema).month("ts").build(),
             PartitionSpec.builder_for(schema).day("ts").build(),
             PartitionSpec.builder_for(schema).hour("ts").build(),
             PartitionSpec.builder_for(schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(schema).truncate("s", 10).build(),
             PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
             PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build()]

    for spec in specs:
        self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def test_bucket_hash(self):
    buckets = [(Transforms.bucket(IntegerType.get(), 100), 34, 2017239379),
               (Transforms.bucket(LongType.get(), 100), 34, 2017239379),
               (Transforms.bucket(DateType.get(), 100), 17486, -653330422),
               (Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989),
               (Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441),
               (Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589),
               (Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089),
               (Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340),
               (Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512),
               (Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207)]

    for transform, value, expected in buckets:
        self.assertEqual(expected, transform.hash(value))
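# Context for the hash values above (a hedged sketch; the helper name is
# illustrative, not the library API): per the Iceberg spec, the bucket
# partition value is derived from the 32-bit hash as
# (hash & Integer.MAX_VALUE) % num_buckets.
def bucket_partition_value(hash_value: int, num_buckets: int) -> int:
    # Mask to a non-negative value, then mod by the bucket count.
    return (hash_value & 0x7FFFFFFF) % num_buckets

assert bucket_partition_value(2017239379, 100) == 79  # the integer 34 above lands in bucket 79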
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([NestedField.required(1, "int_col", IntegerType.get()),
                              NestedField.optional(2, "bigint_col", LongType.get()),
                              NestedField.optional(4, "float_col", FloatType.get()),
                              NestedField.optional(5, "dbl_col", DoubleType.get()),
                              NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    # Projected column 13 does not exist in the file, so it reads back as all nulls.
    pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int32()),
                     pa.array([1, 2, 3, None, 5], type=pa.int64()),
                     pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
                     pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
                     pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))]
    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([pa.field("int_col", pa.int32(), nullable=False),
                                              pa.field("bigint_col", pa.int64(), nullable=True),
                                              pa.field("float_col", pa.float32(), nullable=True),
                                              pa.field("dbl_col", pa.float64(), nullable=True),
                                              pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)]))

    target_table = reader.read()

    assert source_table == target_table
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'

    assert expected == PartitionSpecParser.to_json(spec)
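# A natural companion check (a sketch, not in the original suite): the emitted
# JSON should round-trip through PartitionSpecParser.from_json, whose signature
# is taken from the invalid-JSON test above. Assumes PartitionSpec equality,
# which the serialization round-trip test elsewhere in this suite also relies on.
def test_json_round_trip():
    schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                    NestedField.required(2, "data", StringType.get()))
    spec = PartitionSpec.builder_for(schema).identity("id").bucket("data", 16).build()

    assert spec == PartitionSpecParser.from_json(schema, PartitionSpecParser.to_json(spec))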
def test_to_string_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [PartitionSpec.builder_for(spec_schema).identity("i").build(),
             PartitionSpec.builder_for(spec_schema).identity("l").build(),
             PartitionSpec.builder_for(spec_schema).identity("d").build(),
             PartitionSpec.builder_for(spec_schema).identity("t").build(),
             PartitionSpec.builder_for(spec_schema).identity("ts").build(),
             PartitionSpec.builder_for(spec_schema).identity("dec").build(),
             PartitionSpec.builder_for(spec_schema).identity("s").build(),
             PartitionSpec.builder_for(spec_schema).identity("u").build(),
             PartitionSpec.builder_for(spec_schema).identity("f").build(),
             PartitionSpec.builder_for(spec_schema).identity("b").build(),
             PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(spec_schema).year("d").build(),
             PartitionSpec.builder_for(spec_schema).month("d").build(),
             PartitionSpec.builder_for(spec_schema).day("d").build(),
             PartitionSpec.builder_for(spec_schema).year("ts").build(),
             PartitionSpec.builder_for(spec_schema).month("ts").build(),
             PartitionSpec.builder_for(spec_schema).day("ts").build(),
             PartitionSpec.builder_for(spec_schema).hour("ts").build(),
             PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
             PartitionSpec.builder_for(spec_schema).add_without_field_id(6, "dec_bucket", "bucket[16]").build()]

    expected_spec_strs = ["[\n  i: identity(1)\n]",
                          "[\n  l: identity(2)\n]",
                          "[\n  d: identity(3)\n]",
                          "[\n  t: identity(4)\n]",
                          "[\n  ts: identity(5)\n]",
                          "[\n  dec: identity(6)\n]",
                          "[\n  s: identity(7)\n]",
                          "[\n  u: identity(8)\n]",
                          "[\n  f: identity(9)\n]",
                          "[\n  b: identity(10)\n]",
                          "[\n  i_bucket: bucket[128](1)\n]",
                          "[\n  l_bucket: bucket[128](2)\n]",
                          "[\n  d_bucket: bucket[128](3)\n]",
                          "[\n  t_bucket: bucket[128](4)\n]",
                          "[\n  ts_bucket: bucket[128](5)\n]",
                          "[\n  dec_bucket: bucket[128](6)\n]",
                          "[\n  s_bucket: bucket[128](7)\n]",
                          "[\n  d_year: year(3)\n]",
                          "[\n  d_month: month(3)\n]",
                          "[\n  d_day: day(3)\n]",
                          "[\n  ts_year: year(5)\n]",
                          "[\n  ts_month: month(5)\n]",
                          "[\n  ts_day: day(5)\n]",
                          "[\n  ts_hour: hour(5)\n]",
                          "[\n  i_truncate: truncate[10](1)\n]",
                          "[\n  l_truncate: truncate[10](2)\n]",
                          "[\n  dec_truncate: truncate[10](6)\n]",
                          "[\n  s_truncate: truncate[10](7)\n]",
                          "[\n  dec_bucket: bucket[16](6)\n]"]

    for spec, expected_spec_str in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
@pytest.mark.parametrize("test_input,test_type,expected",
                         [(34, IntegerType, 2017239379),
                          (34, LongType, 2017239379)])
def test_spec_values_int(test_input, test_type, expected):
    assert Bucket.get(test_type.get(), 100).hash(test_input) == expected


@pytest.mark.parametrize("test_input,test_type,expected",
                         [(1, BucketFloat(100), -142385009),
                          (1, BucketDouble(100), -142385009)])
def test_spec_values_dbl(test_input, test_type, expected):
    assert test_type.hash(test_input) == expected


@pytest.mark.parametrize("test_input,test_type,scale_factor,expected",
                         [(Decimal("14.20"), DecimalType.of(9, 2), Decimal(10) ** -2, -500754589),
                          (Decimal("137302769811943318102518958871258.37580"), DecimalType.of(38, 5),
                           Decimal(10) ** -5, -32334285)])
def test_spec_values_dec(test_input, test_type, scale_factor, expected):
    # The second parameter has 38 significant digits; the default context
    # (prec=28) would raise InvalidOperation on quantize, so raise prec to 38.
    getcontext().prec = 38
    assert Bucket.get(test_type, 100).hash(test_input.quantize(scale_factor)) == expected


@pytest.mark.parametrize("test_input,test_type,expected",
                         [(Literal.of("2017-11-16").to(DateType.get()), DateType.get(), -653330422),
                          (Literal.of("22:31:08").to(TimeType.get()), TimeType.get(), -662762989),
                          (Literal.of("2017-11-16T22:31:08").to(TimestampType.without_timezone()),
                           TimestampType.without_timezone(), -2047944441),
                          (Literal.of("2017-11-16T14:31:08-08:00").to(TimestampType.with_timezone()),
                           TimestampType.with_timezone(), -2047944441)
from iceberg.api.types import (BinaryType,
                               BooleanType,
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               LongType,
                               StringType,
                               TimestampType,
                               TimeType,
                               UUIDType)

PRIMITIVES = [BinaryType.get(),
              BooleanType.get(),
              DateType.get(),
              DecimalType.of(9, 2),
              DecimalType.of(11, 2),
              DecimalType.of(9, 3),
              DoubleType.get(),
              FixedType.of_length(3),
              FixedType.of_length(4),
              FloatType.get(),
              IntegerType.get(),
              LongType.get(),
              StringType.get(),
              TimestampType.with_timezone(),
              TimestampType.without_timezone(),
              TimeType.get(),
              UUIDType.get()]
import logging

from iceberg.api.types import (BinaryType,
                               BooleanType,
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               ListType,
                               LongType,
                               MapType,
                               NestedField,
                               StringType,
                               StructType,
                               TimestampType)
from iceberg.api.types import Type
import pyarrow as pa
from pyarrow.parquet import lib, ParquetFile

_logger = logging.getLogger(__name__)

# Maps a pyarrow type id to a factory for the matching iceberg type.
# Parameterized types (decimal, fixed, timestamp) read attributes off the
# pyarrow type instance passed in as x; the rest ignore it.
arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(),
                  lib.Type_DATE32: lambda x=None: DateType.get(),
                  lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale),
                  lib.Type_DOUBLE: lambda x=None: DoubleType.get(),
                  lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width),
                  lib.Type_BINARY: lambda x=None: BinaryType.get(),
                  lib.Type_FLOAT: lambda x=None: FloatType.get(),
                  lib.Type_STRING: lambda x=None: StringType.get(),
                  lib.Type_INT32: lambda x=None: IntegerType.get(),
                  lib.Type_INT64: lambda x=None: LongType.get(),
                  lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone()
                                                      if x.tz is None
                                                      else TimestampType.with_timezone())}


def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    if nullable:
def test_truncate_decimal(input_var, expected):
    trunc = Truncate.get(DecimalType.of(9, 2), 10)
    assert trunc.apply(input_var) == expected
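# The parametrize values for test_truncate_decimal are not shown above. Plausible
# (input_var, expected) pairs, assuming the Iceberg spec's decimal truncation
# (the unscaled value is truncated toward negative infinity to a multiple of the
# width, here 10); the variable name below is illustrative:
from decimal import Decimal

TRUNCATE_DECIMAL_CASES = [(Decimal("12.34"), Decimal("12.30")),
                          (Decimal("12.30"), Decimal("12.30")),
                          (Decimal("12.29"), Decimal("12.20")),
                          (Decimal("0.05"), Decimal("0.00")),
                          (Decimal("-0.05"), Decimal("-0.10"))]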
Literal.of("2017-11-29T11:30:07.123").to( TimestampType.without_timezone()), Literal.of("2017-11-29T11:30:07.123+01:00").to( TimestampType.with_timezone()), Literal.of("abc"), Literal.of(uuid.uuid4()), Literal.of(bytes([0x01, 0x02, 0x03])).to(FixedType.of_length(3)), Literal.of(bytes([0x03, 0x04, 0x05, 0x06])).to(BinaryType.get()), Literal.of(Decimal(122.50).quantize(Decimal(".01"))) ]) def literal(request): yield request.param @pytest.fixture(scope="session", params=[(DecimalType.of(9, 0), "34"), (DecimalType.of(9, 2), "34.00"), (DecimalType.of(9, 4), "34.0000")]) def type_val_tuples(request): yield request.param @pytest.fixture(scope="session", params=[(DecimalType.of(9, 1), "34.6"), (DecimalType.of(9, 2), "34.56"), (DecimalType.of(9, 4), "34.5600")]) def float_type_val_tuples(request): yield request.param