def expected_metadata_sorting():
    spec_schema = Schema(NestedField.required(1, "x", LongType.get()),
                         NestedField.required(2, "y", LongType.get()),
                         NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .with_spec_id(5) \
        .build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None, timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id, timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.2.avro"),
                                                                   spec_id=spec.spec_id)])

    reversed_snapshot_log = list()
    metadata = TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3, spec_schema, 5, [spec],
                             {"property": "value"}, current_snapshot_id,
                             [previous_snapshot, current_snapshot], reversed_snapshot_log)

    reversed_snapshot_log.append(SnapshotLogEntry(current_snapshot.timestamp_millis, current_snapshot.snapshot_id))
    reversed_snapshot_log.append(SnapshotLogEntry(previous_snapshot.timestamp_millis, previous_snapshot.snapshot_id))

    return metadata
def test_schema_evolution_filter(primitive_type_test_file): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(16, "other_new_col", LongType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()), NestedField.optional(3, "string_col", StringType.get()), NestedField.optional(15, "new_col", StringType.get()) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.not_null("new_col"), True) schema = pa.schema([ pa.field("int_col", pa.int32(), nullable=False), pa.field("bigint_col", pa.int64(), nullable=True), pa.field("other_new_col", pa.int64(), nullable=True), pa.field("float_col", pa.float32(), nullable=True), pa.field("dbl_col", pa.float64(), nullable=True), pa.field("string_col", pa.string(), nullable=True), pa.field("new_col", pa.string(), nullable=True) ]) pyarrow_not_null_array = [ pa.array([], type=pa.int32()), pa.array([], type=pa.int64()), pa.array([], type=pa.int32()), pa.array([], type=pa.float32()), pa.array([], type=pa.float64()), pa.array([], type=pa.string()), pa.array([], type=pa.string()) ] not_null_table = pa.table(pyarrow_not_null_array, schema=schema) pyarrow_null_array = [ pa.array([1, 2, 3, 4, 5], type=pa.int32()), pa.array([1, 2, 3, None, 5], type=pa.int64()), pa.array([None, None, None, None, None], type=pa.int64()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()), pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()), pa.array([None, None, None, None, None], type=pa.string()) ] null_table = pa.table(pyarrow_null_array, schema=schema) target_table = reader.read() assert not_null_table == target_table reader = ParquetReader(input_file, expected_schema, {}, Expressions.is_null("new_col"), True) target_table = reader.read() assert null_table == target_table
def test_to_bytes(self):
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00\x00\x00\x00\x00', Literal.of(1234).to(LongType.get()).to_byte_buffer())
    self.assertEqual(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
    self.assertEqual(b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f', Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to(DateType.get()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00', Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00',
                     Literal.of(100000000000).to(TimestampType.with_timezone()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00',
                     Literal.of(100000000000).to(TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of("foo").to_byte_buffer())
    self.assertEqual(b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
                     Literal.of(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytes(b'foo')).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytearray(b'foo')).to_byte_buffer())
def test_from_bytes(self):
    self.assertEqual(1234, Conversions.from_byte_buffer(IntegerType.get(), b'\xd2\x04\x00\x00'))
    self.assertEqual(1234, Conversions.from_byte_buffer(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
    self.assertAlmostEqual(1.2345, Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
def test_column_rename(primitive_type_test_file): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(3, "string_col", StringType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True) pyarrow_array = [ pa.array([1, 2, 3, 4, 5], type=pa.int32()), pa.array([1, 2, 3, None, 5], type=pa.int64()), pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()) ] schema = pa.schema([ pa.field("int_col", pa.int32(), False), pa.field("bigint_col", pa.int64(), True), pa.field("string_col", pa.string(), True), pa.field("float_col", pa.float32(), True), pa.field("dbl_col", pa.float64(), True) ]) source_table = pa.table(pyarrow_array, schema=schema) target_table = reader.read() assert source_table == target_table
def test_from_bytes(self): self.assertEqual( False, Conversions.from_byte_buffer(BooleanType.get(), b'\x00')) self.assertEqual( True, Conversions.from_byte_buffer(BooleanType.get(), b'\x01')) self.assertEqual( 1234, Conversions.from_byte_buffer(IntegerType.get(), b'\xd2\x04\x00\x00')) self.assertEqual( 1234, Conversions.from_byte_buffer(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00')) self.assertAlmostEqual(1.2345, Conversions.from_byte_buffer( FloatType.get(), b'\x19\x04\x9e?'), places=5) self.assertAlmostEqual( 1.2345, Conversions.from_byte_buffer(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f')) self.assertEqual( 1234, Conversions.from_byte_buffer(DateType.get(), b'\xd2\x04\x00\x00')) self.assertEqual( 100000000000, Conversions.from_byte_buffer(TimeType.get(), b'\x00\xe8vH\x17\x00\x00\x00')) self.assertEqual( 100000000000, Conversions.from_byte_buffer(TimestampType.with_timezone(), b'\x00\xe8vH\x17\x00\x00\x00')) self.assertEqual( 100000000000, Conversions.from_byte_buffer(TimestampType.without_timezone(), b'\x00\xe8vH\x17\x00\x00\x00')) self.assertEqual( "foo", Conversions.from_byte_buffer(StringType.get(), b'foo')) self.assertEqual( uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), Conversions.from_byte_buffer( UUIDType.get(), b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7')) self.assertEqual( b'foo', Conversions.from_byte_buffer(FixedType.of_length(3), b'foo')) self.assertEqual( b'foo', Conversions.from_byte_buffer(BinaryType.get(), b'foo')) self.assertEqual( Decimal(123.45).quantize(Decimal(".01")), Conversions.from_byte_buffer(DecimalType.of(5, 2), b'\x30\x39')) self.assertEqual( Decimal(123.4567).quantize(Decimal(".0001")), Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\x00\x12\xd6\x87')) self.assertEqual( Decimal(-123.4567).quantize(Decimal(".0001")), Conversions.from_byte_buffer(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))
def supported_primitives():
    return StructType.of([NestedField.required(100, "id", LongType.get()),
                          NestedField.optional(101, "data", StringType.get()),
                          NestedField.required(102, "b", BooleanType.get()),
                          NestedField.optional(103, "i", IntegerType.get()),
                          NestedField.required(104, "l", LongType.get()),
                          NestedField.optional(105, "f", FloatType.get()),
                          NestedField.required(106, "d", DoubleType.get()),
                          NestedField.optional(107, "date", DateType.get()),
                          NestedField.required(108, "ts", TimestampType.with_timezone()),
                          NestedField.required(110, "s", StringType.get()),
                          NestedField.required(111, "uuid", UUIDType.get()),
                          NestedField.required(112, "fixed", FixedType.of_length(7)),
                          NestedField.optional(113, "bytes", BinaryType.get()),
                          NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
                          NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
                          NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))])
def get_type(partition_type):
    return StructType.of([
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns", ListType.of_required(112, IntegerType.get())),
        NestedField.optional(108, "column_sizes",
                             MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
        NestedField.optional(109, "value_counts",
                             MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
        NestedField.optional(110, "null_value_counts",
                             MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
        NestedField.optional(125, "lower_bounds",
                             MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
        NestedField.optional(128, "upper_bounds",
                             MapType.of_required(129, 130, IntegerType.get(), BinaryType.get()))
        # NEXT ID TO ASSIGN: 131
    ])
def missing_spec_list():
    schema = Schema(NestedField.required(1, "x", LongType.get()),
                    NestedField.required(2, "y", LongType.get()),
                    NestedField.required(3, "z", LongType.get()))
    spec = PartitionSpec.builder_for(schema).identity("x").with_spec_id(6).build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None, timestamp_millis=previous_snapshot_id,
                                     manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.1.avro"),
                                                                    spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id, timestamp_millis=current_snapshot_id,
                                    manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.2.avro"),
                                                                   spec_id=spec.spec_id)])

    return TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3, schema, 6, (spec,),
                         {"property": "value"}, current_snapshot_id,
                         [previous_snapshot, current_snapshot], [])
def test_column_upcast(primitive_type_test_file): expected_schema = Schema( [NestedField.required(1, "int_col", LongType.get())]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True) pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int32())] source_table = pa.table( pyarrow_array, schema=pa.schema([pa.field("int_col", pa.int64(), nullable=False)])) target_table = reader.read() assert source_table == target_table
def test_partition_spec(self): schema = Schema(NestedField.required(1, "i", IntegerType.get()), NestedField.required(2, "l", LongType.get()), NestedField.required(3, "d", DateType.get()), NestedField.required(4, "t", TimeType.get()), NestedField.required(5, "ts", TimestampType.without_timezone()), NestedField.required(6, "dec", DecimalType.of(9, 2)), NestedField.required(7, "s", StringType.get()), NestedField.required(8, "u", UUIDType.get()), NestedField.required(9, "f", FixedType.of_length(3)), NestedField.required(10, "b", BinaryType.get())) specs = [PartitionSpec.builder_for(schema).identity("i").build(), PartitionSpec.builder_for(schema).identity("l").build(), PartitionSpec.builder_for(schema).identity("d").build(), PartitionSpec.builder_for(schema).identity("t").build(), PartitionSpec.builder_for(schema).identity("ts").build(), PartitionSpec.builder_for(schema).identity("dec").build(), PartitionSpec.builder_for(schema).identity("s").build(), PartitionSpec.builder_for(schema).identity("u").build(), PartitionSpec.builder_for(schema).identity("f").build(), PartitionSpec.builder_for(schema).identity("b").build(), PartitionSpec.builder_for(schema).bucket("i", 128).build(), PartitionSpec.builder_for(schema).bucket("l", 128).build(), PartitionSpec.builder_for(schema).bucket("d", 128).build(), PartitionSpec.builder_for(schema).bucket("t", 128).build(), PartitionSpec.builder_for(schema).bucket("ts", 128).build(), PartitionSpec.builder_for(schema).bucket("dec", 128).build(), PartitionSpec.builder_for(schema).bucket("s", 128).build(), PartitionSpec.builder_for(schema).bucket("u", 128).build(), PartitionSpec.builder_for(schema).bucket("f", 128).build(), PartitionSpec.builder_for(schema).bucket("b", 128).build(), PartitionSpec.builder_for(schema).year("d").build(), PartitionSpec.builder_for(schema).month("d").build(), PartitionSpec.builder_for(schema).day("d").build(), PartitionSpec.builder_for(schema).year("ts").build(), PartitionSpec.builder_for(schema).month("ts").build(), PartitionSpec.builder_for(schema).day("ts").build(), PartitionSpec.builder_for(schema).hour("ts").build(), PartitionSpec.builder_for(schema).truncate("i", 10).build(), PartitionSpec.builder_for(schema).truncate("l", 10).build(), PartitionSpec.builder_for(schema).truncate("dec", 10).build(), PartitionSpec.builder_for(schema).truncate("s", 10).build(), PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(), PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(), ] for spec in specs: self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def rg_expected_schema(): return Schema([ NestedField.required(1, "string_col", StringType.get()), NestedField.required(2, "long_col", LongType.get()), NestedField.required(3, "int_col", IntegerType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "null_col", StringType.get()), NestedField.optional(6, "missing_col", StringType.get()), NestedField.optional(7, "no_stats_col", StringType.get()), NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()), NestedField.optional(9, "ts_wotz_col", TimestampType.without_timezone()), NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)), NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)), NestedField.optional(12, "date_type", DateType.get()), ])
def test_bucket_hash(self): buckets = [ [Transforms.bucket(IntegerType.get(), 100), 34, 2017239379], [Transforms.bucket(LongType.get(), 100), 34, 2017239379], [Transforms.bucket(DateType.get(), 100), 17486, -653330422], [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989], [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441], [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589], [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089], [Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340], [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512], [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207] ] for bucket in buckets: self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True) source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema) num_cols = source_table.num_columns for i in range(1, num_cols - 1): source_table = source_table.remove_column(num_cols - i) assert source_table == reader.read()
def test_primitive_types(primitive_type_test_parquet_file): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(3, "str_col", StringType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()), NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)), NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)), NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)), NestedField.optional(9, "date_col", DateType.get()), NestedField.optional(10, "ts_col", TimestampType.without_timezone()), NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()), NestedField.optional(12, "bool_col", BooleanType.get()) ]) compare_schema( expected_schema, convert_parquet_to_iceberg(primitive_type_test_parquet_file))
def test_to_bytes(self):
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00\x00\x00\x00\x00', Literal.of(1234).to(LongType.get()).to_byte_buffer())
    self.assertEqual(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
    self.assertEqual(b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f', Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())
    self.assertEqual(b'\xd2\x04\x00\x00', Literal.of(1234).to(DateType.get()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00', Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00',
                     Literal.of(100000000000).to(TimestampType.with_timezone()).to_byte_buffer())
    self.assertEqual(b'\x00\xe8vH\x17\x00\x00\x00',
                     Literal.of(100000000000).to(TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of("foo").to_byte_buffer())
    self.assertEqual(b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
                     Literal.of(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytes(b'foo')).to_byte_buffer())
    self.assertEqual(b'foo', Literal.of(bytearray(b'foo')).to_byte_buffer())

    # Decimal on 2-bytes
    self.assertEqual(b'\x30\x39', Literal.of(123.45).to(DecimalType.of(5, 2)).to_byte_buffer())

    # Decimal on 3-bytes to test that we use the minimum number of bytes
    self.assertEqual(b'\x12\xd6\x87', Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # Negative decimal to test two's complement
    self.assertEqual(b'\xed\x29\x79', Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())
def test_compound_filter(primitive_type_test_file): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()), NestedField.optional(3, "string_col", StringType.get()) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader( input_file, expected_schema, {}, Expressions.and_(Expressions.equal("string_col", "us"), Expressions.equal("int_col", 1)), True) pyarrow_array = [ pa.array([1], type=pa.int32()), pa.array([1], type=pa.int64()), pa.array([1.0], type=pa.float32()), pa.array([1.0], type=pa.float64()), pa.array(['us'], type=pa.string()) ] source_table = pa.table(pyarrow_array, schema=pa.schema([ pa.field("int_col", pa.int32(), nullable=False), pa.field("bigint_col", pa.int64(), nullable=True), pa.field("float_col", pa.float32(), nullable=True), pa.field("dbl_col", pa.float64(), nullable=True), pa.field("string_col", pa.string(), nullable=True) ])) target_table = reader.read() assert source_table == target_table
def test_decimal_column_add(primitive_type_test_file): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()), NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9)) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True) pyarrow_array = [ pa.array([1, 2, 3, 4, 5], type=pa.int32()), pa.array([1, 2, 3, None, 5], type=pa.int64()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()), pa.array([None, None, None, None, None], type=pa.decimal128(38, 9)) ] source_table = pa.table(pyarrow_array, schema=pa.schema([ pa.field("int_col", pa.int32(), nullable=False), pa.field("bigint_col", pa.int64(), nullable=True), pa.field("float_col", pa.float32(), nullable=True), pa.field("dbl_col", pa.float64(), nullable=True), pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True) ])) target_table = reader.read() assert source_table == target_table
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema): expected_schema = Schema([ NestedField.required(1, "int_col", IntegerType.get()), NestedField.optional(2, "bigint_col", LongType.get()), NestedField.optional(3, "str_col", StringType.get()), NestedField.optional(4, "float_col", FloatType.get()), NestedField.optional(5, "dbl_col", DoubleType.get()), NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)), NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)), NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)), NestedField.optional(9, "date_col", DateType.get()), NestedField.optional(10, "ts_col", TimestampType.without_timezone()), NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()), NestedField.optional(12, "bool_col", BooleanType.get()) ]) input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}), primitive_type_test_file, {}) reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True) source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema) assert reader.read() == source_table
def test_to_json_conversion(): spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()), NestedField.required(2, "l", LongType.get()), NestedField.required(3, "d", DateType.get()), NestedField.required(4, "t", TimeType.get()), NestedField.required(5, "ts", TimestampType.without_timezone()), NestedField.required(6, "dec", DecimalType.of(9, 2)), NestedField.required(7, "s", StringType.get()), NestedField.required(8, "u", UUIDType.get()), NestedField.required(9, "f", FixedType.of_length(3)), NestedField.required(10, "b", BinaryType.get())) specs = [ PartitionSpec.builder_for(spec_schema).identity("i").build(), PartitionSpec.builder_for(spec_schema).identity("l").build(), PartitionSpec.builder_for(spec_schema).identity("d").build(), PartitionSpec.builder_for(spec_schema).identity("t").build(), PartitionSpec.builder_for(spec_schema).identity("ts").build(), PartitionSpec.builder_for(spec_schema).identity("dec").build(), PartitionSpec.builder_for(spec_schema).identity("s").build(), PartitionSpec.builder_for(spec_schema).identity("u").build(), PartitionSpec.builder_for(spec_schema).identity("f").build(), PartitionSpec.builder_for(spec_schema).identity("b").build(), PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(), PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(), PartitionSpec.builder_for(spec_schema).year("d").build(), PartitionSpec.builder_for(spec_schema).month("d").build(), PartitionSpec.builder_for(spec_schema).day("d").build(), PartitionSpec.builder_for(spec_schema).year("ts").build(), PartitionSpec.builder_for(spec_schema).month("ts").build(), PartitionSpec.builder_for(spec_schema).day("ts").build(), PartitionSpec.builder_for(spec_schema).hour("ts").build(), PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(), PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(), PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(), PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(), PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build() ] expected_spec_strs = [ "[\n i: identity(1)\n]", "[\n l: identity(2)\n]", "[\n d: identity(3)\n]", "[\n t: identity(4)\n]", "[\n ts: identity(5)\n]", "[\n dec: identity(6)\n]", "[\n s: identity(7)\n]", "[\n u: identity(8)\n]", "[\n f: identity(9)\n]", "[\n b: identity(10)\n]", "[\n i_bucket: bucket[128](1)\n]", "[\n l_bucket: bucket[128](2)\n]", "[\n d_bucket: bucket[128](3)\n]", "[\n t_bucket: bucket[128](4)\n]", "[\n ts_bucket: bucket[128](5)\n]", "[\n dec_bucket: bucket[128](6)\n]", "[\n s_bucket: bucket[128](7)\n]", "[\n d_year: year(3)\n]", "[\n d_month: month(3)\n]", "[\n d_day: day(3)\n]", "[\n ts_year: year(5)\n]", "[\n ts_month: month(5)\n]", "[\n ts_day: day(5)\n]", "[\n ts_hour: hour(5)\n]", "[\n i_truncate: truncate[10](1)\n]", "[\n l_truncate: truncate[10](2)\n]", "[\n dec_truncate: truncate[10](6)\n]", "[\n s_truncate: truncate[10](7)\n]", "[\n dec_bucket: bucket[16](6)\n]", ] for (spec, expected_spec_str) in zip(specs, expected_spec_strs): assert str(spec) == expected_spec_str
def wrap_file_schema(file_struct):
    return Schema(NestedField.required(0, "status", IntegerType.get()),
                  NestedField.required(1, "snapshot_id", LongType.get()),
                  NestedField.required(2, "data_file", file_struct))
def test_long_to_double_conversion():
    lit = Literal.of(34).to(LongType.get())
    dbl_lit = lit.to(DoubleType.get())
    assert math.isclose(lit.value, dbl_lit.value)


def test_long_to_decimal_conversion(type_val_tuples):
    lit = Literal.of(34).to(LongType.get())
    assert lit.to(type_val_tuples[0]).value.as_tuple() == Decimal(type_val_tuples[1]).as_tuple()


def test_long_to_integer():
    lit = Literal.of(34).to(LongType.get())
    int_lit = lit.to(IntegerType.get())
    assert lit.value == int_lit.value


def test_long_to_float_conversion():
    lit = Literal.of(34).to(LongType.get())
    float_lit = lit.to(FloatType.get())
    assert math.isclose(lit.value, float_lit.value)


def test_integer_to_long_conversion():
    lit = Literal.of(34)
    long_lit = lit.to(LongType.get())
    assert lit.value == long_lit.value
class AvroToIceberg(object):
    FIELD_ID_PROP = "field-id"
    FIELD_TYPE_PROP = "type"
    FIELD_NAME_PROP = "name"
    FIELD_LOGICAL_TYPE_PROP = "logicalType"
    FIELD_FIELDS_PROP = "fields"
    FIELD_ITEMS_PROP = "items"
    FIELD_ELEMENT_ID_PROP = "element-id"

    AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
    AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")

    TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
                           dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
                           list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}
    COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
                                   "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
                                   "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}
    PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
                                "bytes": BinaryType.get(),
                                "date": DateType.get(),
                                "double": DoubleType.get(),
                                "float": FloatType.get(),
                                "int": IntegerType.get(),
                                "long": LongType.get(),
                                "string": StringType.get(),
                                "time-millis": TimeType.get(),
                                "timestamp-millis": TimestampType.without_timezone()}
    PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
                     TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
                     TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}

    @staticmethod
    def convert_avro_schema_to_iceberg(avro_schema):
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        struct = AvroToIceberg.convert_type(avro_schema, None)

        return Schema(struct[0].fields)

    @staticmethod
    def convert_record_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "record":
            raise RuntimeError("Field type must be 'record': %s" % avro_field_type)

        fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)
        iceberg_fields = []
        if next_id is None:
            next_id = len(fields)
        for field in fields:
            iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
            iceberg_fields.append(iceberg_field)

        return StructType.of(iceberg_fields), next_id

    @staticmethod
    def convert_avro_field_to_iceberg(field, next_id):
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
            return field_type, next_id

        if is_optional:
            return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id
        else:
            return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
                                        field.get(AvroToIceberg.FIELD_NAME_PROP),
                                        field_type), next_id

    @staticmethod
    def convert_type(field, next_id=None):
        avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)
        optional = AvroToIceberg.is_option_schema(avro_field_type)

        processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
        if processing_func is None:
            raise RuntimeError("No function found to process %s" % avro_field_type)

        iceberg_type, next_id = processing_func(field, next_id)
        return iceberg_type, optional, next_id

    @staticmethod
    def convert_str_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if not isinstance(avro_field_type, str):
            raise RuntimeError("Field type must be of type str: %s" % avro_field_type)

        if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
            if logical_type is not None:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(logical_type), next_id
            else:
                return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(avro_field_type), next_id
        elif avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
            if logical_type is not None:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(logical_type)
            else:
                processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(avro_field_type)
            if processing_func is None:
                raise RuntimeError("No function found to process %s" % avro_field_type)

            return processing_func(avro_field, next_id)
        else:
            raise RuntimeError("Unknown type %s" % avro_field_type)

    @staticmethod
    def convert_complex_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, dict):
            raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)

        return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)

    @staticmethod
    def convert_union_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, list):
            raise RuntimeError("Union field type must be of type list: %s" % avro_field_type)

        if len(avro_field_type) > 2:
            raise RuntimeError("Cannot process unions larger than 2 items: %s" % avro_field_type)

        for item in avro_field_type:
            if isinstance(item, str) and item == "null":
                continue
            avro_field_type = item

        avro_field[AvroToIceberg.FIELD_TYPE_PROP] = avro_field_type
        items = AvroToIceberg.convert_type(avro_field, next_id)
        return items[0], items[2]

    @staticmethod
    def convert_array_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)

        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        is_optional = AvroToIceberg.is_option_schema(items)

        if isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP:
            item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(items)
            if item_type is None:
                raise RuntimeError("No mapping found for type %s" % items)
        else:
            raise RuntimeError("Complex list types not yet implemented")

        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        else:
            return ListType.of_required(element_id, item_type), next_id

    @staticmethod
    def convert_map_type(avro_field, next_id=None):
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if avro_field_type != "array" or avro_logical_type != "map":
            raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)

        is_optional = False
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)
        for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, list()):
            if field.get(AvroToIceberg.FIELD_NAME_PROP) == "key":
                key_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map keys not yet implemented")
                key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
            elif field.get(AvroToIceberg.FIELD_NAME_PROP) == "value":
                value_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map values not yet implemented")
                value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))

        if is_optional:
            return MapType.of_optional(key_id, value_id, key_type, value_type), next_id
        else:
            return MapType.of_required(key_id, value_id, key_type, value_type), next_id

    @staticmethod
    def is_option_schema(field_type):
        if isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type:
            return True

        return False

    @staticmethod
    def read_avro_file(iceberg_schema, data_file):
        fo = data_file.new_fo()
        avro_reader = fastavro.reader(fo)
        for avro_row in avro_reader:
            iceberg_row = dict()
            for field in iceberg_schema.as_struct().fields:
                iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
            yield iceberg_row

        fo.close()

    @staticmethod
    def read_avro_row(iceberg_schema, avro_reader):
        try:
            for avro_row in avro_reader:
                iceberg_row = dict()
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        except StopIteration:
            return

    @staticmethod
    def get_field_from_avro(avro_row, field):
        try:
            return AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
                                                   AvroToIceberg.get_field_from_primitive)(avro_row, field)
        except KeyError:
            raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)

    @staticmethod
    def get_field_from_primitive(avro_row, field):
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))

    @staticmethod
    def get_field_from_struct(avro_row, field):
        field_obj = {}
        for nested_field in field.type.fields:
            field_obj[nested_field.name] = AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
        return field_obj

    @staticmethod
    def get_field_from_list(avro_row, field):
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))

    @staticmethod
    def get_field_from_map(avro_row, field):
        val_map = dict()
        try:
            avro_value = avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            else:
                return None

        for val in avro_value:
            val_map[val['key']] = val['value']

        return val_map
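# Usage sketch (not part of the source module): a minimal, hypothetical example of feeding a
# parsed Avro JSON schema dict to AvroToIceberg.convert_avro_schema_to_iceberg. It assumes only
# what the class above already requires -- a top-level "record" whose fields carry Iceberg
# "field-id" properties; the schema contents below are made up for illustration.
def _example_avro_schema_conversion():
    avro_schema = {
        "type": "record",
        "name": "example",
        "fields": [
            {"field-id": 1, "name": "id", "type": "long"},
            {"field-id": 2, "name": "data", "type": ["null", "string"]},
        ],
    }
    # Produces an Iceberg Schema with: 1: id required long, 2: data optional string
    return AvroToIceberg.convert_avro_schema_to_iceberg(avro_schema)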
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from decimal import Decimal

from iceberg.api.transforms import Truncate
from iceberg.api.types import (DecimalType,
                               IntegerType,
                               LongType,
                               StringType)
import pytest


@pytest.mark.parametrize("type_var", [IntegerType.get(), LongType.get()])
@pytest.mark.parametrize("input_var,expected", [(1, 0), (5, 0), (9, 0), (10, 10), (11, 10),
                                                (-1, -10), (-10, -10), (-12, -20)])
def test_truncate_integer(type_var, input_var, expected):
    trunc = Truncate.get(type_var, 10)
    assert trunc.apply(input_var) == expected


@pytest.mark.parametrize(
    "input_var,expected", [(Decimal(12.34).quantize(Decimal(".01")), Decimal("12.30")),
                           (Decimal(12.30).quantize(Decimal(".01")), Decimal("12.30")),
                           (Decimal(12.20).quantize(Decimal(".01")), Decimal("12.20")),
                           (Decimal(0.05).quantize(Decimal(".01")), Decimal("0.00")),
                           (Decimal(-0.05).quantize(Decimal(".01")), Decimal("-0.10"))])
def test_truncate_decimal(input_var, expected):
    trunc = Truncate.get(DecimalType.of(9, 2), 10)
    assert trunc.apply(input_var) == expected
def iceberg_full_read_projection_schema():
    return Schema([NestedField.required(0, "id", LongType.get()),
                   NestedField.optional(1, "data", StringType.get())])
def test_byte_buffer_conversions(self):
    # booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
    self.assertConversion(False, BooleanType.get(), b'\x00')
    self.assertConversion(True, BooleanType.get(), b'\x01')
    self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
    self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())

    # integers are stored as 4 bytes in little-endian order
    # 84202 is 0...01|01001000|11101010 in binary
    # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
    self.assertConversion(84202, IntegerType.get(), bytes([234, 72, 1, 0]))
    self.assertEqual(bytes([234, 72, 1, 0]), Literal.of(84202).to_byte_buffer())

    # longs are stored as 8 bytes in little-endian order
    # 200L is 0...0|11001000 in binary
    # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
    self.assertConversion(200, LongType.get(), bytes([200, 0, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([200, 0, 0, 0, 0, 0, 0, 0]), Literal.of(200).to(LongType.get()).to_byte_buffer())

    # floats are stored as 4 bytes in little-endian order
    # floating point numbers are represented as sign * 2ˆexponent * mantissa
    # -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
    # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64)
    self.assertConversion(-4.5, FloatType.get(), bytes([0, 0, 144, 192]))
    self.assertEqual(bytes([0, 0, 144, 192]), Literal.of(-4.5).to_byte_buffer())

    # doubles are stored as 8 bytes in little-endian order
    # floating point numbers are represented as sign * 2ˆexponent * mantissa
    # 6.0 is 1 * 2ˆ4 * 1.5 and encoded as 01000000|00011000|0...0
    # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
    self.assertConversion(6.0, DoubleType.get(), bytes([0, 0, 0, 0, 0, 0, 24, 64]))
    self.assertEqual(bytes([0, 0, 0, 0, 0, 0, 24, 64]), Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

    # dates are stored as days from 1970-01-01 in a 4-byte little-endian int
    # 1000 is 0...0|00000011|11101000 in binary
    # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
    self.assertConversion(1000, DateType.get(), bytes([232, 3, 0, 0]))
    self.assertEqual(bytes([232, 3, 0, 0]), Literal.of(1000).to(DateType.get()).to_byte_buffer())

    # time is stored as microseconds from midnight in an 8-byte little-endian long
    # 10000L is 0...0|00100111|00010000 in binary
    # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
    self.assertConversion(10000, TimeType.get(), bytes([16, 39, 0, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([16, 39, 0, 0, 0, 0, 0, 0]),
                     Literal.of(10000).to(LongType.get()).to(TimeType.get()).to_byte_buffer())

    # timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
    # 400000L is 0...110|00011010|10000000 in binary
    # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
    self.assertConversion(400000, TimestampType.without_timezone(), bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertConversion(400000, TimestampType.with_timezone(), bytes([128, 26, 6, 0, 0, 0, 0, 0]))
    self.assertEqual(bytes([128, 26, 6, 0, 0, 0, 0, 0]),
                     Literal.of(400000).to(LongType.get()).to(TimestampType.without_timezone()).to_byte_buffer())
    self.assertEqual(bytes([128, 26, 6, 0, 0, 0, 0, 0]),
                     Literal.of(400000).to(LongType.get()).to(TimestampType.with_timezone()).to_byte_buffer())

    # strings are stored as UTF-8 bytes (without length)
    # 'A' -> 65, 'B' -> 66, 'C' -> 67
    self.assertConversion("ABC", StringType.get(), bytes([65, 66, 67]))
    self.assertEqual(bytes([65, 66, 67]), Literal.of("ABC").to_byte_buffer())

    # uuids are stored as 16-byte big-endian values
    # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
    # 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62,
    # 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
    # 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92),
    # 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
    # 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123),
    # 0xE7 -> 11100111 -> 231 (-25)
    self.assertConversion(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType.get(),
                          bytes([247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231]))
    self.assertEqual(bytes([247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183, 133, 231]),
                     Literal.of(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())

    # fixed values are stored directly
    # 'a' -> 97, 'b' -> 98
    self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2), bytes([97, 98]))
    self.assertEqual(bytes([97, 98]), Literal.of(bytes("ab", "utf8")).to_byte_buffer())

    # binary values are stored directly
    # 'Z' -> 90
    self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(), bytes([90]))
    self.assertEqual(bytes([90]), Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

    # decimals are stored as unscaled values in the form of two's-complement big-endian binary,
    # using the minimum number of bytes for the values
    # 345 is 0...1|01011001 in binary
    # 00000001 -> 1, 01011001 -> 89
    self.assertConversion(Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2), bytes([1, 89]))
    self.assertEqual(bytes([1, 89]), Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

    # decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2
    # 1234567 is 00010010|11010110|10000111 in binary
    # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
    self.assertConversion(Decimal(123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4), bytes([18, 214, 135]))
    self.assertEqual(bytes([18, 214, 135]), Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # negative decimal to test two's complement
    # -1234567 is 11101101|00101001|01111001 in binary
    # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
    self.assertConversion(Decimal(-123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4), bytes([237, 41, 121]))
    self.assertEqual(bytes([237, 41, 121]), Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

    # test empty byte in decimal
    # 11 is 00001011 in binary
    # 00001011 -> 11
    self.assertConversion(Decimal(0.011).quantize(Decimal(".001")), DecimalType.of(10, 3), bytes([11]))
    self.assertEqual(bytes([11]), Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())