def test_null_handling():
    """Check how binary literals compare against ``None`` and against each other
    after conversion to fixed types of different lengths."""
    payload = bytearray([0x01])

    # ``None`` must order below a literal regardless of which side it is on.
    assert None < Literal.of(payload)
    assert Literal.of(payload) > None

    # NOTE(review): presumably converting a 1-byte value to fixed(3) / fixed(4)
    # yields the same "no value" result on both sides -- confirm in Literal.to.
    as_fixed_3 = Literal.of(payload).to(FixedType.of_length(3))
    as_fixed_4 = Literal.of(payload).to(FixedType.of_length(4))
    assert as_fixed_3 == as_fixed_4
def test_from_bytes(self):
    """Decode a fixed byte string for each primitive type and compare the
    result of ``Conversions.from_byte_buffer`` with the expected value."""
    decode = Conversions.from_byte_buffer

    def expect(expected, iceberg_type, raw):
        # Exact comparison of the decoded value.
        self.assertEqual(expected, decode(iceberg_type, raw))

    def expect_close(expected, iceberg_type, raw, **tolerance):
        # Approximate comparison, used for floating point types.
        self.assertAlmostEqual(expected, decode(iceberg_type, raw), **tolerance)

    expect(False, BooleanType.get(), b'\x00')
    expect(True, BooleanType.get(), b'\x01')
    expect(1234, IntegerType.get(), b'\xd2\x04\x00\x00')
    expect(1234, LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00')
    expect_close(1.2345, FloatType.get(), b'\x19\x04\x9e?', places=5)
    expect_close(1.2345, DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f')
    expect(1234, DateType.get(), b'\xd2\x04\x00\x00')

    # Time and both timestamp flavours share the same 8-byte payload.
    micros_raw = b'\x00\xe8vH\x17\x00\x00\x00'
    expect(100000000000, TimeType.get(), micros_raw)
    expect(100000000000, TimestampType.with_timezone(), micros_raw)
    expect(100000000000, TimestampType.without_timezone(), micros_raw)

    expect("foo", StringType.get(), b'foo')
    expect(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
           UUIDType.get(),
           b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7')
    expect(b'foo', FixedType.of_length(3), b'foo')
    expect(b'foo', BinaryType.get(), b'foo')

    # Decimals: the expected value is quantized to the scale of the type.
    expect(Decimal(123.45).quantize(Decimal(".01")),
           DecimalType.of(5, 2), b'\x30\x39')
    expect(Decimal(123.4567).quantize(Decimal(".0001")),
           DecimalType.of(5, 4), b'\x00\x12\xd6\x87')
    expect(Decimal(-123.4567).quantize(Decimal(".0001")),
           DecimalType.of(5, 4), b'\xff\xed\x29\x79')
def test_partition_spec(self):
    """Build one partition spec per transform/column combination and check
    that each equals the result of ``TestHelpers.round_trip_serialize``."""
    fields = [
        NestedField.required(1, "i", IntegerType.get()),
        NestedField.required(2, "l", LongType.get()),
        NestedField.required(3, "d", DateType.get()),
        NestedField.required(4, "t", TimeType.get()),
        NestedField.required(5, "ts", TimestampType.without_timezone()),
        NestedField.required(6, "dec", DecimalType.of(9, 2)),
        NestedField.required(7, "s", StringType.get()),
        NestedField.required(8, "u", UUIDType.get()),
        NestedField.required(9, "f", FixedType.of_length(3)),
        NestedField.required(10, "b", BinaryType.get()),
    ]
    schema = Schema(*fields)

    def new_builder():
        # A fresh builder per spec, so specs never share builder state.
        return PartitionSpec.builder_for(schema)

    every_column = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")

    specs = [new_builder().identity(name).build() for name in every_column]
    specs += [new_builder().bucket(name, 128).build() for name in every_column]

    # Date-like transforms: year/month/day for "d", then for "ts", then hour.
    for temporal in ("d", "ts"):
        specs.append(new_builder().year(temporal).build())
        specs.append(new_builder().month(temporal).build())
        specs.append(new_builder().day(temporal).build())
    specs.append(new_builder().hour("ts").build())

    specs += [new_builder().truncate(name, 10).build()
              for name in ("i", "l", "dec", "s")]

    # Specs added by transform name, with and without an explicit field id.
    specs.append(new_builder().add_without_field_id(6, "dec_unsupported", "unsupported").build())
    specs.append(new_builder().add(6, 1111, "dec_unsupported", "unsupported").build())

    for spec in specs:
        self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def test_bucket_hash(self):
    """Check the hash produced by the bucket transform for one sample value
    of each supported type against a fixed expected hash."""
    # Each case is (bucket transform, value to hash, expected hash).
    cases = (
        (Transforms.bucket(IntegerType.get(), 100), 34, 2017239379),
        (Transforms.bucket(LongType.get(), 100), 34, 2017239379),
        (Transforms.bucket(DateType.get(), 100), 17486, -653330422),
        (Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989),
        (Transforms.bucket(TimestampType.without_timezone(), 100),
         1510871468000000, -2047944441),
        (Transforms.bucket(DecimalType.of(9, 2), 100),
         decimal.Decimal("14.20"), -500754589),
        (Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089),
        (Transforms.bucket(UUIDType.get(), 100),
         uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340),
        (Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512),
        (Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207),
    )

    for transform, value, expected_hash in cases:
        self.assertEqual(expected_hash, transform.hash(value))
def supported_primitives():
    """Return a struct type with one field per supported primitive type.

    Field IDs must be unique within a struct. The three trailing decimal
    fields previously all reused ID 114; they are now numbered 114, 115 and
    116. ID 109 is not assigned, exactly as before.

    Returns:
        The ``StructType`` built from the field list below.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        # Decimals at three precisions; each needs its own field ID.
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10)),
    ])
def test_to_json_conversion():
    """Compare ``str(spec)`` with a fixed expected string for a spec of every
    transform/column combination (despite the name, no JSON is involved here)."""
    spec_schema = Schema(
        NestedField.required(1, "i", IntegerType.get()),
        NestedField.required(2, "l", LongType.get()),
        NestedField.required(3, "d", DateType.get()),
        NestedField.required(4, "t", TimeType.get()),
        NestedField.required(5, "ts", TimestampType.without_timezone()),
        NestedField.required(6, "dec", DecimalType.of(9, 2)),
        NestedField.required(7, "s", StringType.get()),
        NestedField.required(8, "u", UUIDType.get()),
        NestedField.required(9, "f", FixedType.of_length(3)),
        NestedField.required(10, "b", BinaryType.get()),
    )

    def new_builder():
        # A fresh builder for every spec.
        return PartitionSpec.builder_for(spec_schema)

    identity_columns = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")
    bucket_columns = ("i", "l", "d", "t", "ts", "dec", "s")
    truncate_columns = ("i", "l", "dec", "s")

    specs = [new_builder().identity(name).build() for name in identity_columns]
    specs += [new_builder().bucket(name, 128).build() for name in bucket_columns]
    for temporal in ("d", "ts"):
        specs.append(new_builder().year(temporal).build())
        specs.append(new_builder().month(temporal).build())
        specs.append(new_builder().day(temporal).build())
    specs.append(new_builder().hour("ts").build())
    specs += [new_builder().truncate(name, 10).build() for name in truncate_columns]
    specs.append(new_builder().add(6, "dec_bucket", "bucket[16]").build())

    # Listed in the same order as ``specs`` above; paired positionally below.
    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for built_spec, expected_text in zip(specs, expected_spec_strs):
        assert str(built_spec) == expected_text
# NOTE(review): this is the tail of a decorator call whose opening lies above
# the visible source -- presumably ``@pytest.fixture(...)``; confirm.
# The list holds one sample literal per kind of value exercised by the tests.
params=[
    Literal.of(False),
    Literal.of(34),
    Literal.of(35),
    Literal.of(36.75),
    Literal.of(8.75),
    Literal.of("2017-11-29").to(DateType.get()),
    Literal.of("11:30:0").to(TimeType.get()),
    Literal.of("2017-11-29T11:30:07.123").to(
        TimestampType.without_timezone()),
    Literal.of("2017-11-29T11:30:07.123+01:00").to(
        TimestampType.with_timezone()),
    Literal.of("abc"),
    # A random UUID: this entry differs from run to run.
    Literal.of(uuid.uuid4()),
    Literal.of(bytes([0x01, 0x02, 0x03])).to(FixedType.of_length(3)),
    Literal.of(bytes([0x03, 0x04, 0x05, 0x06])).to(BinaryType.get()),
    Literal.of(Decimal(122.50).quantize(Decimal(".01")))
])
def literal(request):
    # Yields the parameter selected for the current test invocation.
    yield request.param


# Session-scoped fixture parametrized with (decimal type, expected string)
# pairs: the same value 34 rendered at scale 0, 2 and 4.
@pytest.fixture(scope="session",
                params=[(DecimalType.of(9, 0), "34"),
                        (DecimalType.of(9, 2), "34.00"),
                        (DecimalType.of(9, 4), "34.0000")])
def type_val_tuples(request):
    # Yields the (type, string) pair selected for the current test invocation.
    yield request.param
def test_byte_buffer_conversions(self):
    """Check the single-value binary encoding of every primitive type.

    For each type the value/type/bytes triple is handed to
    ``self.assertConversion`` (defined elsewhere in this class), and
    ``Literal.to_byte_buffer`` is checked to produce the same bytes.
    """
    # booleans: 0x00 for 'false', a non-zero byte for 'true'
    false_bytes, true_bytes = b'\x00', b'\x01'
    self.assertConversion(False, BooleanType.get(), false_bytes)
    self.assertConversion(True, BooleanType.get(), true_bytes)
    self.assertEqual(false_bytes, Literal.of(False).to_byte_buffer())
    self.assertEqual(true_bytes, Literal.of(True).to_byte_buffer())

    # integers: 4 bytes, little-endian
    # 84202 is 0...01|01001000|11101010 in binary
    # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
    int_bytes = bytes([234, 72, 1, 0])
    self.assertConversion(84202, IntegerType.get(), int_bytes)
    self.assertEqual(int_bytes, Literal.of(84202).to_byte_buffer())

    # longs: 8 bytes, little-endian
    # 200L is 0...0|11001000 in binary
    # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
    long_bytes = bytes([200, 0, 0, 0, 0, 0, 0, 0])
    self.assertConversion(200, LongType.get(), long_bytes)
    self.assertEqual(long_bytes,
                     Literal.of(200).to(LongType.get()).to_byte_buffer())

    # floats: 4 bytes, little-endian
    # floating point numbers are represented as sign * 2^exponent * mantissa
    # -4.5F is -1 * 2^2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
    # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64)
    float_bytes = bytes([0, 0, 144, 192])
    self.assertConversion(-4.5, FloatType.get(), float_bytes)
    self.assertEqual(float_bytes, Literal.of(-4.5).to_byte_buffer())

    # doubles: 8 bytes, little-endian
    # 6.0 is 1 * 2^2 * 1.5 and encoded as 01000000|00011000|0...0
    # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
    double_bytes = bytes([0, 0, 0, 0, 0, 0, 24, 64])
    self.assertConversion(6.0, DoubleType.get(), double_bytes)
    self.assertEqual(double_bytes,
                     Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

    # dates: days from 1970-01-01 in a 4-byte little-endian int
    # 1000 is 0...0|00000011|11101000 in binary
    # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
    date_bytes = bytes([232, 3, 0, 0])
    self.assertConversion(1000, DateType.get(), date_bytes)
    self.assertEqual(date_bytes,
                     Literal.of(1000).to(DateType.get()).to_byte_buffer())

    # time: microseconds from midnight in an 8-byte little-endian long
    # 10000L is 0...0|00100111|00010000 in binary
    # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
    time_bytes = bytes([16, 39, 0, 0, 0, 0, 0, 0])
    self.assertConversion(10000, TimeType.get(), time_bytes)
    self.assertEqual(
        time_bytes,
        Literal.of(10000).to(LongType.get()).to(TimeType.get()).to_byte_buffer())

    # timestamps: microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
    # little-endian long
    # 400000L is 0...110|00011010|10000000 in binary
    # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
    timestamp_bytes = bytes([128, 26, 6, 0, 0, 0, 0, 0])
    self.assertConversion(400000, TimestampType.without_timezone(), timestamp_bytes)
    self.assertConversion(400000, TimestampType.with_timezone(), timestamp_bytes)
    self.assertEqual(
        timestamp_bytes,
        Literal.of(400000).to(LongType.get()).to(
            TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(
        timestamp_bytes,
        Literal.of(400000).to(LongType.get()).to(
            TimestampType.with_timezone()).to_byte_buffer())

    # strings: UTF-8 bytes (without length)
    # 'A' -> 65, 'B' -> 66, 'C' -> 67
    string_bytes = bytes([65, 66, 67])
    self.assertConversion("ABC", StringType.get(), string_bytes)
    self.assertEqual(string_bytes, Literal.of("ABC").to_byte_buffer())

    # uuids: 16-byte big-endian values
    # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as
    # F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
    # 0xF7 -> 247 (-9), 0x9C -> 156 (-100), 0x3E -> 62, 0x09 -> 9,
    # 0x67 -> 103, 0x7C -> 124, 0x4B -> 75, 0xBD -> 189 (-67),
    # 0xA4 -> 164 (-92), 0x79 -> 121, 0x3F -> 63, 0x34 -> 52,
    # 0x9C -> 156 (-100), 0xB7 -> 183 (-73), 0x85 -> 133 (-123), 0xE7 -> 231 (-25)
    uuid_text = "f79c3e09-677c-4bbd-a479-3f349cb785e7"
    uuid_bytes = bytes([
        247, 156, 62, 9, 103, 124, 75, 189,
        164, 121, 63, 52, 156, 183, 133, 231,
    ])
    self.assertConversion(uuid.UUID(uuid_text), UUIDType.get(), uuid_bytes)
    self.assertEqual(uuid_bytes, Literal.of(uuid.UUID(uuid_text)).to_byte_buffer())

    # fixed values are stored directly
    # 'a' -> 97, 'b' -> 98
    fixed_bytes = bytes([97, 98])
    self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2), fixed_bytes)
    self.assertEqual(fixed_bytes, Literal.of(bytes("ab", "utf8")).to_byte_buffer())

    # binary values are stored directly
    # 'Z' -> 90
    binary_bytes = bytes([90])
    self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(), binary_bytes)
    self.assertEqual(binary_bytes,
                     Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

    # decimals: unscaled value as two's-complement big-endian binary, using
    # the minimum number of bytes for the value
    # 345 is 0...1|01011001 in binary
    # 00000001 -> 1, 01011001 -> 89
    small_decimal_bytes = bytes([1, 89])
    self.assertConversion(
        Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2),
        small_decimal_bytes)
    self.assertEqual(
        small_decimal_bytes,
        Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

    # decimal on 3 bytes: the minimum number of bytes is used, not a power of 2
    # 1234567 is 00010010|11010110|10000111 in binary
    # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
    three_byte_decimal = bytes([18, 214, 135])
    self.assertConversion(
        Decimal(123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
        three_byte_decimal)
    self.assertEqual(
        three_byte_decimal,
        Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # negative decimal, to exercise two's complement
    # -1234567 is 11101101|00101001|01111001 in binary
    # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
    negative_decimal = bytes([237, 41, 121])
    self.assertConversion(
        Decimal(-123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
        negative_decimal)
    self.assertEqual(
        negative_decimal,
        Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # small unscaled value that fits in a single byte
    # 11 is 00001011 in binary
    # 00001011 -> 11
    one_byte_decimal = bytes([11])
    self.assertConversion(
        Decimal(0.011).quantize(Decimal(".001")), DecimalType.of(10, 3),
        one_byte_decimal)
    self.assertEqual(
        one_byte_decimal,
        Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())
MapType, NestedField, StringType, StructType, TimestampType)
from iceberg.api.types import Type
import pyarrow as pa
from pyarrow.parquet import lib, ParquetFile

_logger = logging.getLogger(__name__)

# Maps pyarrow parquet type codes (``lib.Type_*``) to factories that return
# the matching iceberg type. Every factory accepts the arrow type as ``x``
# (default ``None``). The decimal, fixed-size-binary and timestamp factories
# read attributes of ``x`` (precision/scale, byte_width, tz), so they must be
# called with an actual arrow type despite the ``None`` default.
arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(),
                  lib.Type_DATE32: lambda x=None: DateType.get(),
                  lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale),
                  lib.Type_DOUBLE: lambda x=None: DoubleType.get(),
                  lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width),
                  lib.Type_BINARY: lambda x=None: BinaryType.get(),
                  lib.Type_FLOAT: lambda x=None: FloatType.get(),
                  lib.Type_STRING: lambda x=None: StringType.get(),
                  lib.Type_INT32: lambda x=None: IntegerType.get(),
                  lib.Type_INT64: lambda x=None: LongType.get(),
                  # A timestamp without a tz maps to the timezone-less iceberg type.
                  lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone()
                                                      if x.tz is None
                                                      else TimestampType.with_timezone())
                  }


def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    # Nullable columns become optional iceberg fields.
    if nullable:
        return NestedField.optional(field_id, field_name, field_type)
    else:
FloatType, IntegerType, LongType, StringType, TimestampType, TimeType, UUIDType)

# One instance of each primitive type. Decimal and fixed appear several times
# with different parameters (precision/scale, length).
PRIMITIVES = [BinaryType.get(), BooleanType.get(), DateType.get(), DecimalType.of(9, 2), DecimalType.of(11, 2),
              DecimalType.of(9, 3), DoubleType.get(), FixedType.of_length(3), FixedType.of_length(4),
              FloatType.get(), IntegerType.get(), LongType.get(), StringType.get(),
              TimestampType.with_timezone(), TimestampType.without_timezone(), TimeType.get(), UUIDType.get()]


# NOTE(review): the class body continues beyond the visible source.
class TestReadabilityChecks(unittest.TestCase):
    def test_primitive_types(self):
        # TO-DO: Need to implement CheckCompatibility in type_util
def test_fixed_unsigned_comparator():
    """Check that byte values are ordered as unsigned: 0xFF in the second
    position must sort above 0x01 rather than being read as a negative byte."""
    lower_raw = bytearray([0x01, 0x01, 0x02])
    higher_raw = bytearray([0x01, 0xFF, 0x01])

    # Only the lower operand is converted to a fixed(3) literal.
    higher = Literal.of(higher_raw)
    lower_fixed = Literal.of(lower_raw).to(FixedType.of_length(3))

    assert higher > lower_fixed