Python LongType Examples, iceberg.api.types.LongType Python Examples

Example #1

0

Show file

File: conftest.py Project: shenodaguirguis/iceberg-1

def expected_metadata_sorting():
    """Build table metadata whose snapshot log is filled in after construction.

    The schema has three required long columns (x, y, z) and the partition
    spec carries spec id 5. Two snapshots are registered. The log list handed
    to ``TableMetadata`` is empty at construction time; afterwards the current
    snapshot's entry is appended first and the previous snapshot's second.
    """
    columns = [NestedField.required(field_id, name, LongType.get())
               for field_id, name in enumerate("xyz", start=1)]
    spec_schema = Schema(*columns)

    spec = PartitionSpec.builder_for(spec_schema).with_spec_id(5).build()

    # Seed the generator so the random offset below is reproducible.
    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_manifest = GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                            spec_id=spec.spec_id)
    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[previous_manifest])

    current_snapshot_id = int(time.time())
    current_manifest = GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                           spec_id=spec.spec_id)
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[current_manifest])

    # The very same list object is passed to TableMetadata and mutated below.
    reversed_snapshot_log = []
    last_updated = int(time.time())
    metadata = TableMetadata(ops, None, "s3://bucket/test/location",
                             last_updated, 3, spec_schema, 5, [spec], {"property": "value"},
                             current_snapshot_id, [previous_snapshot, current_snapshot],
                             reversed_snapshot_log)

    # Newest entry first, i.e. the opposite of chronological order.
    for snapshot in (current_snapshot, previous_snapshot):
        reversed_snapshot_log.append(
            SnapshotLogEntry(snapshot.timestamp_millis, snapshot.snapshot_id))

    return metadata

Example #2

0

Show file

def test_schema_evolution_filter(primitive_type_test_file):
    """Filter on a column that exists only in the evolved read schema.

    ``new_col`` (id 15) and ``other_new_col`` (id 16) are part of the expected
    schema. The test expects a ``not_null("new_col")`` filter to yield an
    empty table, and an ``is_null("new_col")`` filter to yield all five rows
    with both added columns entirely null.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # Empty columns; every array type must line up with ``schema`` above.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        # "other_new_col" is declared int64 in ``schema``; this was int32.
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    # Same file and schema, opposite predicate.
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table

Example #3

0

Show file

 def test_to_bytes(self):
     """Check the byte buffers produced by literals of several types."""
     check = self.assertEqual
     int_1234 = b'\xd2\x04\x00\x00'
     long_1e11 = b'\x00\xe8vH\x17\x00\x00\x00'
     foo = b'foo'

     # NOTE(review): booleans are expected as two bytes here — confirm
     # against the serializer in use.
     check(b'\x00\x00', Literal.of(False).to_byte_buffer())
     check(b'\x01\x00', Literal.of(True).to_byte_buffer())

     # Integer and long literals.
     check(int_1234, Literal.of(1234).to_byte_buffer())
     check(b'\xd2\x04\x00\x00\x00\x00\x00\x00',
           Literal.of(1234).to(LongType.get()).to_byte_buffer())

     # Float literal, then the same value converted to double.
     check(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
     check(b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f',
           Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())

     # Date, time and both timestamp flavours.
     check(int_1234, Literal.of(1234).to(DateType.get()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(
               TimestampType.with_timezone()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(
               TimestampType.without_timezone()).to_byte_buffer())

     # String, UUID and raw byte inputs.
     check(foo, Literal.of("foo").to_byte_buffer())
     check(b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
           Literal.of(uuid.UUID(
               "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
     check(foo, Literal.of(bytes(b'foo')).to_byte_buffer())
     check(foo, Literal.of(bytearray(b'foo')).to_byte_buffer())

Example #4

0

Show file

File: test_conversions.py Project: rdsr/li-iceberg-rdsr

 def test_from_bytes(self):
     """Decode integer, long and double values from their byte buffers."""
     decode = Conversions.from_byte_buffer

     as_int = decode(IntegerType.get(), b'\xd2\x04\x00\x00')
     self.assertEqual(1234, as_int)

     as_long = decode(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00')
     self.assertEqual(1234, as_long)

     as_double = decode(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f')
     self.assertAlmostEqual(1.2345, as_double)

Example #5

0

Show file

def test_column_rename(primitive_type_test_file):
    """Read five columns with an unfiltered reader and compare to a hand-built table."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # Expected column data, in schema order.
    int_values = pa.array([1, 2, 3, 4, 5], type=pa.int32())
    bigint_values = pa.array([1, 2, 3, None, 5], type=pa.int64())
    string_values = pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string())
    float_values = pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32())
    double_values = pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True)
    ])

    source_table = pa.table(
        [int_values, bigint_values, string_values, float_values, double_values],
        schema=arrow_schema)

    assert source_table == reader.read()

Example #6

0

Show file

File: test_conversions.py Project: snazy/iceberg

 def test_from_bytes(self):
     """Decode one byte buffer per supported type and compare to the expected value."""
     decode = Conversions.from_byte_buffer
     check = self.assertEqual
     long_1e11 = b'\x00\xe8vH\x17\x00\x00\x00'

     # Booleans.
     check(False, decode(BooleanType.get(), b'\x00'))
     check(True, decode(BooleanType.get(), b'\x01'))

     # Integral types.
     check(1234, decode(IntegerType.get(), b'\xd2\x04\x00\x00'))
     check(1234, decode(LongType.get(), b'\xd2\x04\x00\x00\x00\x00\x00\x00'))

     # Floating point; the float comparison is limited to five places.
     self.assertAlmostEqual(1.2345,
                            decode(FloatType.get(), b'\x19\x04\x9e?'),
                            places=5)
     self.assertAlmostEqual(
         1.2345,
         decode(DoubleType.get(), b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))

     # Date, time and timestamps.
     check(1234, decode(DateType.get(), b'\xd2\x04\x00\x00'))
     check(100000000000, decode(TimeType.get(), long_1e11))
     check(100000000000, decode(TimestampType.with_timezone(), long_1e11))
     check(100000000000, decode(TimestampType.without_timezone(), long_1e11))

     # String, UUID, fixed and binary.
     check("foo", decode(StringType.get(), b'foo'))
     check(uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
           decode(UUIDType.get(),
                  b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
     check(b'foo', decode(FixedType.of_length(3), b'foo'))
     check(b'foo', decode(BinaryType.get(), b'foo'))

     # Decimals: expected values are quantized to the scale under test.
     two_places = Decimal(".01")
     four_places = Decimal(".0001")
     check(Decimal(123.45).quantize(two_places),
           decode(DecimalType.of(5, 2), b'\x30\x39'))
     check(Decimal(123.4567).quantize(four_places),
           decode(DecimalType.of(5, 4), b'\x00\x12\xd6\x87'))
     check(Decimal(-123.4567).quantize(four_places),
           decode(DecimalType.of(5, 4), b'\xff\xed\x29\x79'))

Example #7

0

Show file

File: conftest.py Project: rdsr/li-iceberg-rdsr

def supported_primitives():
    """Return a struct type with one field per primitive type used by the tests.

    Each field carries its own field id. The three decimal fields previously
    all used id 114; they are now numbered 114, 115 and 116 so that no two
    fields in the struct share an id.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        # id 109 is not assigned in this struct.
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])

Example #8

0

Show file

File: data_file.py Project: rdsr/li-iceberg-rdsr

 def get_type(partition_type):
     """Return the struct type that describes a data file entry.

     ``partition_type`` becomes the type of the required "partition" field
     (id 102); every other field has a fixed type.
     """
     def int_to_long_map(key_id, value_id):
         # Map from integer keys to long values.
         return MapType.of_required(key_id, value_id,
                                    IntegerType.get(), LongType.get())

     def int_to_binary_map(key_id, value_id):
         # Map from integer keys to binary values.
         return MapType.of_required(key_id, value_id,
                                    IntegerType.get(), BinaryType.get())

     fields = [
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(107, "sort_columns",
                              ListType.of_required(112, IntegerType.get())),
         NestedField.optional(108, "column_sizes", int_to_long_map(117, 118)),
         NestedField.optional(109, "value_counts", int_to_long_map(119, 120)),
         NestedField.optional(110, "null_value_counts",
                              int_to_long_map(121, 122)),
         NestedField.optional(125, "lower_bounds",
                              int_to_binary_map(126, 127)),
         NestedField.optional(128, "upper_bounds",
                              int_to_binary_map(129, 130)),
     ]
     # NEXT ID TO ASSIGN: 131
     return StructType.of(fields)

Example #9

0

Show file

File: conftest.py Project: shenodaguirguis/iceberg-1

def missing_spec_list():
    """Build table metadata with an identity-partitioned spec (id 6) and an empty snapshot log.

    The spec is handed to ``TableMetadata`` inside a one-element tuple.
    """
    columns = [NestedField.required(field_id, name, LongType.get())
               for field_id, name in enumerate("xyz", start=1)]
    schema = Schema(*columns)

    builder = PartitionSpec.builder_for(schema)
    spec = builder.identity("x").with_spec_id(6).build()

    # Seed the generator so the random offset below is reproducible.
    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_manifest = GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.1.avro"),
                                            spec_id=spec.spec_id)
    previous_snapshot = BaseSnapshot(ops, previous_snapshot_id, None,
                                     timestamp_millis=previous_snapshot_id,
                                     manifests=[previous_manifest])

    current_snapshot_id = int(time.time())
    current_manifest = GenericManifestFile(file=Files.local_input("file:/tmp/manfiest.2.avro"),
                                           spec_id=spec.spec_id)
    current_snapshot = BaseSnapshot(ops, current_snapshot_id, previous_snapshot_id,
                                    timestamp_millis=current_snapshot_id,
                                    manifests=[current_manifest])

    last_updated = int(time.time())
    snapshots = [previous_snapshot, current_snapshot]
    return TableMetadata(ops, None, "s3://bucket/test/location", last_updated, 3, schema, 6,
                         (spec,), {"property": "value"}, current_snapshot_id, snapshots,
                         [])

Example #10

0

Show file

def test_column_upcast(primitive_type_test_file):
    """Read ``int_col`` through a schema that declares it as a long."""
    long_field = NestedField.required(1, "int_col", LongType.get())
    expected_schema = Schema([long_field])

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # Values are supplied as int32 while the table schema asks for int64.
    int_values = pa.array([1, 2, 3, 4, 5], type=pa.int32())
    upcast_schema = pa.schema([pa.field("int_col", pa.int64(), nullable=False)])
    source_table = pa.table([int_values], schema=upcast_schema)

    assert source_table == reader.read()

Example #11

0

Show file

    def test_partition_spec(self):
        """Round-trip a range of partition specs through serialization.

        Covers identity and bucket partitioning on every column, the date and
        timestamp transforms, truncate on four columns, and two specs built
        from an "unsupported" transform name.
        """
        schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                        NestedField.required(2, "l", LongType.get()),
                        NestedField.required(3, "d", DateType.get()),
                        NestedField.required(4, "t", TimeType.get()),
                        NestedField.required(5, "ts", TimestampType.without_timezone()),
                        NestedField.required(6, "dec", DecimalType.of(9, 2)),
                        NestedField.required(7, "s", StringType.get()),
                        NestedField.required(8, "u", UUIDType.get()),
                        NestedField.required(9, "f", FixedType.of_length(3)),
                        NestedField.required(10, "b", BinaryType.get()))

        def new_builder():
            # A fresh builder per spec, as each spec is built independently.
            return PartitionSpec.builder_for(schema)

        every_column = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")

        specs = [new_builder().identity(name).build() for name in every_column]
        specs += [new_builder().bucket(name, 128).build() for name in every_column]
        specs += [
            new_builder().year("d").build(),
            new_builder().month("d").build(),
            new_builder().day("d").build(),
            new_builder().year("ts").build(),
            new_builder().month("ts").build(),
            new_builder().day("ts").build(),
            new_builder().hour("ts").build(),
        ]
        specs += [new_builder().truncate(name, 10).build()
                  for name in ("i", "l", "dec", "s")]
        specs += [
            new_builder().add_without_field_id(6, "dec_unsupported", "unsupported").build(),
            new_builder().add(6, 1111, "dec_unsupported", "unsupported").build(),
        ]

        for original in specs:
            self.assertEqual(original, TestHelpers.round_trip_serialize(original))

Example #12

0

Show file

def rg_expected_schema():
    """Return the twelve-column schema: three required fields followed by nine optional ones."""
    required_fields = [
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
    ]
    optional_fields = [
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col", TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ]
    return Schema(required_fields + optional_fields)

Example #13

0

Show file

File: test_bucket.py Project: shenodaguirguis/iceberg-1

    def test_bucket_hash(self):
        """Compare the hash computed by each bucket transform with a fixed expected value."""
        # Each case is (transform, input value, expected hash).
        cases = (
            (Transforms.bucket(IntegerType.get(), 100), 34, 2017239379),
            (Transforms.bucket(LongType.get(), 100), 34, 2017239379),
            (Transforms.bucket(DateType.get(), 100), 17486, -653330422),
            (Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989),
            (Transforms.bucket(TimestampType.without_timezone(), 100),
             1510871468000000, -2047944441),
            (Transforms.bucket(DecimalType.of(9, 2), 100),
             decimal.Decimal("14.20"), -500754589),
            (Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089),
            (Transforms.bucket(UUIDType.get(), 100),
             uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340),
            (Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512),
            (Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207),
        )

        for transform, value, expected_hash in cases:
            self.assertEqual(expected_hash, transform.hash(value))

Example #14

0

Show file

def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    """Project only the first two columns and compare with a trimmed source table."""
    projected_fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
    ]
    expected_schema = Schema(projected_fields)

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    # Drop columns from the last one down to index 2, keeping indexes 0 and 1.
    for column_index in range(source_table.num_columns - 1, 1, -1):
        source_table = source_table.remove_column(column_index)

    assert source_table == reader.read()

Example #15

0

Show file

File: test_parquet_to_iceberg.py Project: shenodaguirguis/iceberg-1

def test_primitive_types(primitive_type_test_parquet_file):
    """Convert the parquet test file to an Iceberg schema and compare it with the expected one."""
    fields = [NestedField.required(1, "int_col", IntegerType.get())]
    # Every column after the first is optional.
    fields.extend([
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ])
    expected_schema = Schema(fields)

    converted_schema = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)

Example #16

0

Show file

File: test_conversions.py Project: snazy/iceberg

 def test_to_bytes(self):
     """Check the byte buffers produced by literals, including decimals."""
     check = self.assertEqual
     int_1234 = b'\xd2\x04\x00\x00'
     long_1e11 = b'\x00\xe8vH\x17\x00\x00\x00'
     foo = b'foo'

     # Booleans.
     check(b'\x00', Literal.of(False).to_byte_buffer())
     check(b'\x01', Literal.of(True).to_byte_buffer())

     # Integer and long literals.
     check(int_1234, Literal.of(1234).to_byte_buffer())
     check(b'\xd2\x04\x00\x00\x00\x00\x00\x00',
           Literal.of(1234).to(LongType.get()).to_byte_buffer())

     # Float literal, then the same value converted to double.
     check(b'\x19\x04\x9e?', Literal.of(1.2345).to_byte_buffer())
     check(b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f',
           Literal.of(1.2345).to(DoubleType.get()).to_byte_buffer())

     # Date, time and both timestamp flavours.
     check(int_1234, Literal.of(1234).to(DateType.get()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(TimeType.get()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(
               TimestampType.with_timezone()).to_byte_buffer())
     check(long_1e11,
           Literal.of(100000000000).to(
               TimestampType.without_timezone()).to_byte_buffer())

     # String, UUID and raw byte inputs.
     check(foo, Literal.of("foo").to_byte_buffer())
     check(b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7',
           Literal.of(uuid.UUID(
               "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())
     check(foo, Literal.of(bytes(b'foo')).to_byte_buffer())
     check(foo, Literal.of(bytearray(b'foo')).to_byte_buffer())

     # Decimal on 2-bytes
     check(b'\x30\x39',
           Literal.of(123.45).to(DecimalType.of(5, 2)).to_byte_buffer())
     # Decimal on 3-bytes to test that we use the minimum number of bytes
     check(b'\x12\xd6\x87',
           Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())
     # Negative decimal to test two's complement
     check(b'\xed\x29\x79',
           Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

Example #17

0

Show file

def test_compound_filter(primitive_type_test_file):
    """Filter on ``string_col == "us" AND int_col == 1`` and expect a single row."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})

    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))
    reader = ParquetReader(input_file, expected_schema, {}, row_filter, True)

    # The one row expected to survive the filter, column by column.
    expected_columns = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True)
    ])
    source_table = pa.table(expected_columns, schema=arrow_schema)

    assert source_table == reader.read()

Example #18

0

Show file

def test_decimal_column_add(primitive_type_test_file):
    """Read with a schema that adds ``new_dec_col`` and expect that column to be all null."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # Expected data; the decimal column holds five nulls.
    expected_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)
    ])
    source_table = pa.table(expected_columns, schema=arrow_schema)

    assert source_table == reader.read()

Example #19

0

Show file

def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    """Read all twelve columns unfiltered and compare with the fixture-built table."""
    fields = [NestedField.required(1, "int_col", IntegerType.get())]
    # Every column after the first is optional.
    fields.extend([
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ])
    expected_schema = Schema(fields)

    file_system = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(file_system, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table

Example #20

0

Show file

File: test_partition_spec.py Project: alanzhang211/incubator-iceberg

def test_to_json_conversion():
    """Check ``str()`` of partition specs built with each transform used here.

    One spec is built per (transform, column) combination -- identity, bucket,
    year/month/day/hour, truncate and an explicitly added bucket field -- and
    its string form is compared with the expected text, position by position.
    """
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    def new_builder():
        # Every spec starts from a fresh builder bound to the same schema.
        return PartitionSpec.builder_for(spec_schema)

    identity_columns = ("i", "l", "d", "t", "ts", "dec", "s", "u", "f", "b")
    bucket_columns = ("i", "l", "d", "t", "ts", "dec", "s")
    truncate_columns = ("i", "l", "dec", "s")

    # The build order below must line up with expected_spec_strs.
    specs = [new_builder().identity(column).build() for column in identity_columns]
    specs.extend(new_builder().bucket(column, 128).build() for column in bucket_columns)
    specs.extend([
        new_builder().year("d").build(),
        new_builder().month("d").build(),
        new_builder().day("d").build(),
        new_builder().year("ts").build(),
        new_builder().month("ts").build(),
        new_builder().day("ts").build(),
        new_builder().hour("ts").build(),
    ])
    specs.extend(new_builder().truncate(column, 10).build() for column in truncate_columns)
    specs.append(new_builder().add(6, "dec_bucket", "bucket[16]").build())

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for built_spec, wanted_text in zip(specs, expected_spec_strs):
        assert str(built_spec) == wanted_text

Example #21

0

Show file

File: manifest_entry.py Project: rdsr/li-iceberg-rdsr

 def wrap_file_schema(file_struct):
     """Build a three-field schema that wraps ``file_struct`` as ``data_file``.

     The schema has required fields ``status`` (id 0, integer),
     ``snapshot_id`` (id 1, long) and ``data_file`` (id 2, ``file_struct``).
     """
     status_field = NestedField.required(0, "status", IntegerType.get())
     snapshot_id_field = NestedField.required(1, "snapshot_id", LongType.get())
     data_file_field = NestedField.required(2, "data_file", file_struct)
     return Schema(status_field, snapshot_id_field, data_file_field)

Example #22

0

Show file

def test_long_to_double_conversion():
    """A long literal converted to a double literal keeps a close value."""
    long_literal = Literal.of(34).to(LongType.get())
    double_literal = long_literal.to(DoubleType.get())

    values_match = math.isclose(long_literal.value, double_literal.value)
    assert values_match

Example #23

0

Show file

def test_long_to_decimal_conversion(type_val_tuples):
    """Convert the long literal 34 to the given type and compare decimal tuples.

    ``type_val_tuples[0]`` is the target type and ``type_val_tuples[1]`` is the
    value passed to ``Decimal`` to build the expected result.
    """
    long_literal = Literal.of(34).to(LongType.get())

    actual = long_literal.to(type_val_tuples[0]).value.as_tuple()
    wanted = Decimal(type_val_tuples[1]).as_tuple()
    assert actual == wanted

Example #24

0

Show file

def test_long_to_integer():
    """A long literal converted to an integer literal keeps the same value."""
    long_literal = Literal.of(34).to(LongType.get())
    integer_literal = long_literal.to(IntegerType.get())

    assert long_literal.value == integer_literal.value

Example #25

0

Show file

def test_long_to_float_conversion():
    """A long literal converted to a float literal keeps a close value."""
    long_literal = Literal.of(34).to(LongType.get())
    float_literal = long_literal.to(FloatType.get())

    values_match = math.isclose(long_literal.value, float_literal.value)
    assert values_match

Example #26

0

Show file

def test_integer_to_long_conversion():
    """The literal built from 34 keeps its value when converted to long."""
    source_literal = Literal.of(34)
    long_literal = source_literal.to(LongType.get())

    assert source_literal.value == long_literal.value

Example #27

0

Show file

class AvroToIceberg(object):
    """Convert parsed Avro JSON schemas and Avro rows to Iceberg equivalents.

    The ``convert_*`` static methods walk an Avro schema given as plain
    ``dict`` / ``list`` / ``str`` objects and build Iceberg types.  Each of
    them returns the converted type together with ``next_id``, which is passed
    through the recursion (only ``convert_record_type`` initialises it, to the
    number of record fields, when it is ``None``).

    The ``read_*`` and ``get_field_from_*`` static methods turn Avro rows into
    dictionaries keyed by Iceberg field name.
    """

    # Keys looked up in the Avro schema dictionaries.
    FIELD_ID_PROP = "field-id"
    FIELD_TYPE_PROP = "type"
    FIELD_NAME_PROP = "name"
    FIELD_LOGICAL_TYPE_PROP = "logicalType"
    FIELD_FIELDS_PROP = "fields"
    FIELD_ITEMS_PROP = "items"
    FIELD_ELEMENT_ID_PROP = "element-id"

    # Values of the "type" key that convert_str_type accepts.
    AVRO_JSON_PRIMITIVE_TYPES = ("boolean", "int", "long", "float", "double", "bytes", "string")
    AVRO_JSON_COMPLEX_TYPES = ("record", "array", "enum", "fixed")

    # Dispatch on the Python type of the "type" value.  The lambdas defer the
    # lookup of ``AvroToIceberg`` until call time, because the class name is
    # not bound yet while the class body is being executed.
    TYPE_PROCESSING_MAP = {str: lambda x, y: AvroToIceberg.convert_str_type(x, y),
                           dict: lambda x, y: AvroToIceberg.convert_complex_type(x, y),
                           list: lambda x, y: AvroToIceberg.convert_union_type(x, y)}

    # Dispatch on the logical type when present, else on the "type" string;
    # "map" is reached through the logical type of an "array" field.
    COMPLEX_TYPE_PROCESSING_MAP = {"record": lambda x, y: AvroToIceberg.convert_record_type(x, y),
                                   "array": lambda x, y: AvroToIceberg.convert_array_type(x, y),
                                   "map": lambda x, y: AvroToIceberg.convert_map_type(x, y)}

    # Avro primitive / logical type name -> Iceberg primitive type.
    PRIMITIVE_FIELD_TYPE_MAP = {"boolean": BooleanType.get(),
                                "bytes": BinaryType.get(),
                                "date": DateType.get(),
                                "double": DoubleType.get(),
                                "float": FloatType.get(),
                                "int": IntegerType.get(),
                                "long": LongType.get(),
                                "string": StringType.get(),
                                "time-millis": TimeType.get(),
                                "timestamp-millis": TimestampType.without_timezone()}

    # Row extraction functions for nested Iceberg types; every other type id
    # falls back to get_field_from_primitive (see get_field_from_avro).
    PROCESS_FUNCS = {TypeID.STRUCT: lambda avro_row, field: AvroToIceberg.get_field_from_struct(avro_row, field),
                     TypeID.LIST: lambda avro_row, field: AvroToIceberg.get_field_from_list(avro_row, field),
                     TypeID.MAP: lambda avro_row, field: AvroToIceberg.get_field_from_map(avro_row, field)}

    @staticmethod
    def convert_avro_schema_to_iceberg(avro_schema):
        """Convert a top-level Avro record schema to an Iceberg ``Schema``.

        Raises:
            RuntimeError: if the schema's "type" is not "record".
        """
        if avro_schema.get(AvroToIceberg.FIELD_TYPE_PROP) != "record":
            raise RuntimeError("Cannot convert avro schema to iceberg %s" % avro_schema)

        struct = AvroToIceberg.convert_type(avro_schema, None)

        # convert_type returns (type, optional, next_id); only the struct's
        # fields are needed to build the schema.
        return Schema(struct[0].fields)

    @staticmethod
    def convert_record_type(avro_field, next_id=None):
        """Convert an Avro "record" to ``(StructType, next_id)``.

        When ``next_id`` is ``None`` it is initialised to the number of fields
        in the record before the fields are converted.

        Raises:
            RuntimeError: if the field's "type" is not "record".
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)

        if avro_field_type != "record":
            raise RuntimeError("Field type must be 'record': %s" % avro_field_type)

        fields = avro_field.get(AvroToIceberg.FIELD_FIELDS_PROP)

        iceberg_fields = []
        if next_id is None:
            next_id = len(fields)
        for field in fields:
            iceberg_field, next_id = AvroToIceberg.convert_avro_field_to_iceberg(field, next_id=next_id)
            iceberg_fields.append(iceberg_field)

        return StructType.of(iceberg_fields), next_id

    @staticmethod
    def convert_avro_field_to_iceberg(field, next_id):
        """Convert one Avro field to ``(field_or_type, next_id)``.

        If the Avro field carries no "field-id" the bare converted type is
        returned; otherwise it is wrapped in an optional or required
        ``NestedField`` depending on whether the Avro type is a nullable union.
        """
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        field_id = field.get(AvroToIceberg.FIELD_ID_PROP)
        if field_id is None:
            return field_type, next_id

        field_name = field.get(AvroToIceberg.FIELD_NAME_PROP)
        if is_optional:
            return NestedField.optional(field_id, field_name, field_type), next_id
        return NestedField.required(field_id, field_name, field_type), next_id

    @staticmethod
    def convert_type(field, next_id=None):
        """Convert the "type" of ``field`` to ``(iceberg_type, optional, next_id)``.

        The converter is chosen by the exact Python type of the "type" value
        (``str``, ``dict`` or ``list``).

        Raises:
            RuntimeError: if the "type" value is none of those Python types.
        """
        avro_field_type = field.get(AvroToIceberg.FIELD_TYPE_PROP)

        # Nullability must be read before conversion: the union converter
        # resolves the union to its non-null branch.
        optional = AvroToIceberg.is_option_schema(avro_field_type)

        processing_func = AvroToIceberg.TYPE_PROCESSING_MAP.get(type(avro_field_type))
        if processing_func is None:
            raise RuntimeError("No function found to process %s" % avro_field_type)

        iceberg_type, next_id = processing_func(field, next_id)

        return iceberg_type, optional, next_id

    @staticmethod
    def convert_str_type(avro_field, next_id=None):
        """Convert a field whose "type" is a string to ``(iceberg_type, next_id)``.

        When a "logicalType" is present it takes precedence over the "type"
        string for the lookup.

        Raises:
            RuntimeError: if "type" is not a ``str``, names an unknown type, or
                names a complex type that has no converter.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if not isinstance(avro_field_type, str):
            raise RuntimeError("Field type must be of type str: %s" % avro_field_type)

        lookup_key = logical_type if logical_type is not None else avro_field_type

        if avro_field_type in AvroToIceberg.AVRO_JSON_PRIMITIVE_TYPES:
            # NOTE(review): a logical type missing from PRIMITIVE_FIELD_TYPE_MAP
            # yields None here rather than an error -- confirm callers cope.
            return AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(lookup_key), next_id

        if avro_field_type in AvroToIceberg.AVRO_JSON_COMPLEX_TYPES:
            processing_func = AvroToIceberg.COMPLEX_TYPE_PROCESSING_MAP.get(lookup_key)
            if processing_func is None:
                raise RuntimeError("No function found to process %s" % avro_field_type)

            return processing_func(avro_field, next_id)

        raise RuntimeError("Unknown type %s" % avro_field_type)

    @staticmethod
    def convert_complex_type(avro_field, next_id=None):
        """Convert a field whose "type" is itself a schema dictionary.

        Raises:
            RuntimeError: if "type" is not a ``dict``.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, dict):
            raise RuntimeError("Complex field type must be of type dict: %s" % avro_field_type)

        return AvroToIceberg.convert_avro_field_to_iceberg(avro_field_type, next_id)

    @staticmethod
    def convert_union_type(avro_field, next_id=None):
        """Convert a union of at most two branches to ``(iceberg_type, next_id)``.

        The union is resolved to its non-"null" branch (the last one, if there
        are several) and that branch is converted.  The caller's ``avro_field``
        is left untouched, so the same schema can be converted again with the
        same result.

        Raises:
            RuntimeError: if "type" is not a ``list``, has more than two
                branches, or has no non-null branch.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if not isinstance(avro_field_type, list):
            raise RuntimeError("Union field type must be of type list: %s" % avro_field_type)

        if len(avro_field_type) > 2:
            raise RuntimeError("Cannot process unions larger than 2 items: %s" % avro_field_type)

        resolved_type = None
        for item in avro_field_type:
            if isinstance(item, str) and item == "null":
                continue
            resolved_type = item

        # Without this guard the list would be handed back to convert_type and
        # dispatched to this function again, recursing without end.
        if resolved_type is None:
            raise RuntimeError("Union has no non-null branch: %s" % avro_field_type)

        # Work on a shallow copy so the input schema keeps its union type.
        resolved_field = dict(avro_field)
        resolved_field[AvroToIceberg.FIELD_TYPE_PROP] = resolved_type
        iceberg_type, _, next_id = AvroToIceberg.convert_type(resolved_field, next_id)
        return iceberg_type, next_id

    @staticmethod
    def convert_array_type(avro_field, next_id=None):
        """Convert an Avro "array" of primitives to ``(ListType, next_id)``.

        Raises:
            RuntimeError: if "type" is not "array", or "items" is not the name
                of a type in ``PRIMITIVE_FIELD_TYPE_MAP``.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        if avro_field_type != "array":
            raise RuntimeError("Avro type must be array: %s" % avro_field_type)
        element_id = avro_field.get(AvroToIceberg.FIELD_ELEMENT_ID_PROP)
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)

        is_optional = AvroToIceberg.is_option_schema(items)

        if not (isinstance(items, str) and items in AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP):
            raise RuntimeError("Complex list types not yet implemented")
        item_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP[items]

        # is_optional is only true for list-valued "items", which are rejected
        # above, so the optional branch is currently never taken.
        if is_optional:
            return ListType.of_optional(element_id, item_type), next_id
        return ListType.of_required(element_id, item_type), next_id

    @staticmethod
    def convert_map_type(avro_field, next_id=None):
        """Convert an "array" with logical type "map" to ``(MapType, next_id)``.

        The key and value ids and types are read from the fields named "key"
        and "value" under "items"; both must have primitive (string) types.

        Raises:
            RuntimeError: if the field is not an array with logical type map,
                if a key or value type is not a string, or if the "key" or
                "value" field is missing.
        """
        avro_field_type = avro_field.get(AvroToIceberg.FIELD_TYPE_PROP)
        avro_logical_type = avro_field.get(AvroToIceberg.FIELD_LOGICAL_TYPE_PROP)
        if avro_field_type != "array" or avro_logical_type != "map":
            raise RuntimeError("Avro type must be array and logical type must be map: %s" % avro_logical_type)
        is_optional = False
        items = avro_field.get(AvroToIceberg.FIELD_ITEMS_PROP)

        key_found = value_found = False
        key_id = value_id = key_type = value_type = None
        for field in items.get(AvroToIceberg.FIELD_FIELDS_PROP, list()):
            field_name = field.get(AvroToIceberg.FIELD_NAME_PROP)
            if field_name == "key":
                key_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map keys not yet implemented")
                key_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
                key_found = True
            elif field_name == "value":
                value_id = field.get(AvroToIceberg.FIELD_ID_PROP)
                if not isinstance(field.get(AvroToIceberg.FIELD_TYPE_PROP), str):
                    raise RuntimeError("Support for complex map values not yet implemented")
                value_type = AvroToIceberg.PRIMITIVE_FIELD_TYPE_MAP.get(field.get(AvroToIceberg.FIELD_TYPE_PROP))
                value_found = True

        # Fail with a clear message instead of an UnboundLocalError below.
        if not (key_found and value_found):
            raise RuntimeError("Map items must declare both 'key' and 'value' fields: %s" % items)

        if is_optional:
            return MapType.of_optional(key_id, value_id, key_type, value_type), next_id
        return MapType.of_required(key_id, value_id, key_type, value_type), next_id

    @staticmethod
    def is_option_schema(field_type):
        """Return True if ``field_type`` is a two-branch union containing "null"."""
        return isinstance(field_type, list) and len(field_type) == 2 and "null" in field_type

    @staticmethod
    def read_avro_file(iceberg_schema, data_file):
        """Yield one dictionary per Avro row in ``data_file``.

        Each dictionary maps the name of every top-level field of
        ``iceberg_schema`` to the value extracted from the row.  The file
        object obtained from ``data_file.new_fo()`` is closed when iteration
        finishes, fails, or the generator is closed early.
        """
        fo = data_file.new_fo()
        try:
            avro_reader = fastavro.reader(fo)
            for avro_row in avro_reader:
                iceberg_row = dict()
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        finally:
            fo.close()

    @staticmethod
    def read_avro_row(iceberg_schema, avro_reader):
        """Yield one dictionary per row of an already opened ``avro_reader``.

        Same row conversion as ``read_avro_file``; the reader is owned by the
        caller and is not closed here.
        """
        try:
            for avro_row in avro_reader:
                iceberg_row = dict()
                for field in iceberg_schema.as_struct().fields:
                    iceberg_row[field.name] = AvroToIceberg.get_field_from_avro(avro_row, field)
                yield iceberg_row
        except StopIteration:
            return

    @staticmethod
    def get_field_from_avro(avro_row, field):
        """Extract the value of ``field`` from ``avro_row``.

        Struct, list and map fields use their entry in ``PROCESS_FUNCS``; all
        other types are read as primitives.

        Raises:
            RuntimeError: if a ``KeyError`` escapes the extraction function
                (for example a struct that is missing from the row).
        """
        try:
            return AvroToIceberg.PROCESS_FUNCS.get(field.type.type_id,
                                                   AvroToIceberg.get_field_from_primitive)(avro_row, field)
        except KeyError:
            raise RuntimeError("Don't know how to get field of type: %s" % field.type.type_id)

    @staticmethod
    def get_field_from_primitive(avro_row, field):
        """Return ``avro_row[field.name]``, or ``None`` if an optional field is absent.

        Raises:
            RuntimeError: if the field is required but missing from the row.
        """
        try:
            return avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            return None

    @staticmethod
    def get_field_from_struct(avro_row, field):
        """Return a dictionary with the value of each nested field of a struct."""
        return {nested_field.name: AvroToIceberg.get_field_from_avro(avro_row[field.name], nested_field)
                for nested_field in field.type.fields}

    @staticmethod
    def get_field_from_list(avro_row, field):
        """Return the list stored under ``field.name``; same rules as a primitive.

        Raises:
            RuntimeError: if the field is required but missing from the row.
        """
        return AvroToIceberg.get_field_from_primitive(avro_row, field)

    @staticmethod
    def get_field_from_map(avro_row, field):
        """Return a dictionary built from the row's list of key/value entries.

        Each entry of ``avro_row[field.name]`` is indexed with ``'key'`` and
        ``'value'``.  Returns ``None`` if an optional field is absent.

        Raises:
            RuntimeError: if the field is required but missing from the row.
        """
        try:
            avro_value = avro_row[field.name]
        except KeyError:
            if field.is_required:
                raise RuntimeError("Field is required but missing in source %s\n%s:" % (field, avro_row))
            return None

        return {val['key']: val['value'] for val in avro_value}

Example #28

0

Show file

#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from decimal import Decimal

from iceberg.api.transforms import Truncate
from iceberg.api.types import (DecimalType, IntegerType, LongType, StringType)
import pytest


@pytest.mark.parametrize("type_var", [IntegerType.get(), LongType.get()])
@pytest.mark.parametrize("input_var,expected", [(1, 0), (5, 0), (9, 0),
                                                (10, 10), (11, 10), (-1, -10),
                                                (-10, -10), (-12, -20)])
def test_truncate_integer(type_var, input_var, expected):
    """Truncating ``input_var`` with width 10 yields ``expected``.

    Runs for both the integer and the long type; the negative cases
    (-1 -> -10, -12 -> -20) pin the expected rounding toward negative infinity.
    """
    width_ten_transform = Truncate.get(type_var, 10)
    truncated = width_ten_transform.apply(input_var)
    assert truncated == expected


@pytest.mark.parametrize(
    "input_var,expected",
    [(Decimal(12.34).quantize(Decimal(".01")), Decimal("12.30")),
     (Decimal(12.30).quantize(Decimal(".01")), Decimal("12.30")),
     (Decimal(12.20).quantize(Decimal(".01")), Decimal("12.20")),
     (Decimal(0.05).quantize(Decimal(".01")), Decimal("0.00")),
     (Decimal(-0.05).quantize(Decimal(".01")), Decimal("-0.10"))])

Example #29

0

Show file

File: conftest.py Project: rdsr/li-iceberg-rdsr

def iceberg_full_read_projection_schema():
    """Return a schema with a required long ``id`` and an optional string ``data``."""
    id_field = NestedField.required(0, "id", LongType.get())
    data_field = NestedField.optional(1, "data", StringType.get())
    return Schema([id_field, data_field])

Example #30

0

Show file

File: test_conversions.py Project: snazy/iceberg

    def test_byte_buffer_conversions(self):
        # booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
        self.assertConversion(False, BooleanType.get(), b'\x00')
        self.assertConversion(True, BooleanType.get(), b'\x01')
        self.assertEqual(b'\x00', Literal.of(False).to_byte_buffer())
        self.assertEqual(b'\x01', Literal.of(True).to_byte_buffer())

        # integers are stored as 4 bytes in little-endian order
        # 84202 is 0...01|01001000|11101010 in binary
        # 11101010 -> 234 (-22), 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
        self.assertConversion(84202, IntegerType.get(), bytes([234, 72, 1, 0]))
        self.assertEqual(bytes([234, 72, 1, 0]),
                         Literal.of(84202).to_byte_buffer())

        # longs are stored as 8 bytes in little-endian order
        # 200L is 0...0|11001000 in binary
        # 11001000 -> 200 (-56), 00000000 -> 0, ... , 00000000 -> 0
        self.assertConversion(200, LongType.get(),
                              bytes([200, 0, 0, 0, 0, 0, 0, 0]))
        self.assertEqual(bytes([200, 0, 0, 0, 0, 0, 0, 0]),
                         Literal.of(200).to(LongType.get()).to_byte_buffer())

        # floats are stored as 4 bytes in little-endian order
        # floating point numbers are represented as sign * 2ˆexponent * mantissa
        # -4.5F is -1 * 2ˆ2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
        # 00000000 -> 0, 00000000 -> 0, 10010000 -> 144 (-112), 11000000 -> 192 (-64),
        self.assertConversion(-4.5, FloatType.get(), bytes([0, 0, 144, 192]))
        self.assertEqual(bytes([0, 0, 144, 192]),
                         Literal.of(-4.5).to_byte_buffer())

        # doubles are stored as 8 bytes in little-endian order
        # floating point numbers are represented as sign * 2ˆexponent * mantissa
        # 6.0 is 1 * 2ˆ2 * 1.5 and encoded as 01000000|00011000|0...0
        # 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
        self.assertConversion(6.0, DoubleType.get(),
                              bytes([0, 0, 0, 0, 0, 0, 24, 64]))
        self.assertEqual(bytes([0, 0, 0, 0, 0, 0, 24, 64]),
                         Literal.of(6.0).to(DoubleType.get()).to_byte_buffer())

        # dates are stored as days from 1970-01-01 in a 4-byte little-endian int
        # 1000 is 0...0|00000011|11101000 in binary
        # 11101000 -> 232 (-24), 00000011 -> 3, ... , 00000000 -> 0
        self.assertConversion(1000, DateType.get(), bytes([232, 3, 0, 0]))
        self.assertEqual(bytes([232, 3, 0, 0]),
                         Literal.of(1000).to(DateType.get()).to_byte_buffer())

        # time is stored as microseconds from midnight in an 8-byte little-endian long
        # 10000L is 0...0|00100111|00010000 in binary
        # 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
        self.assertConversion(10000, TimeType.get(),
                              bytes([16, 39, 0, 0, 0, 0, 0, 0]))
        self.assertEqual(
            bytes([16, 39, 0, 0, 0, 0, 0, 0]),
            Literal.of(10000).to(LongType.get()).to(
                TimeType.get()).to_byte_buffer())

        # timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
        # 400000L is 0...110|00011010|10000000 in binary
        # 10000000 -> 128 (-128), 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
        self.assertConversion(400000, TimestampType.without_timezone(),
                              bytes([128, 26, 6, 0, 0, 0, 0, 0]))
        self.assertConversion(400000, TimestampType.with_timezone(),
                              bytes([128, 26, 6, 0, 0, 0, 0, 0]))
        self.assertEqual(
            bytes([128, 26, 6, 0, 0, 0, 0, 0]),
            Literal.of(400000).to(LongType.get()).to(
                TimestampType.without_timezone()).to_byte_buffer())
        self.assertEqual(
            bytes([128, 26, 6, 0, 0, 0, 0, 0]),
            Literal.of(400000).to(LongType.get()).to(
                TimestampType.with_timezone()).to_byte_buffer())

        # strings are stored as UTF-8 bytes (without length)
        # 'A' -> 65, 'B' -> 66, 'C' -> 67
        self.assertConversion("ABC", StringType.get(), bytes([65, 66, 67]))
        self.assertEqual(bytes([65, 66, 67]),
                         Literal.of("ABC").to_byte_buffer())

        # uuids are stored as 16-byte big-endian values
        # f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
        # 0xF7 -> 11110111 -> 247 (-9), 0x9C -> 10011100 -> 156 (-100), 0x3E -> 00111110 -> 62,
        # 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
        # 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> 189 (-67), 0xA4 -> 10100100 -> 164 (-92),
        # 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
        # 0x9C -> 10011100 -> 156 (-100), 0xB7 -> 10110111 -> 183 (-73), 0x85 -> 10000101 -> 133 (-123),
        # 0xE7 -> 11100111 -> 231 (-25)
        self.assertConversion(
            uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), UUIDType.get(),
            bytes([
                247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183,
                133, 231
            ]))
        self.assertEqual(
            bytes([
                247, 156, 62, 9, 103, 124, 75, 189, 164, 121, 63, 52, 156, 183,
                133, 231
            ]),
            Literal.of(uuid.UUID(
                "f79c3e09-677c-4bbd-a479-3f349cb785e7")).to_byte_buffer())

        # fixed values are stored directly
        # 'a' -> 97, 'b' -> 98
        self.assertConversion(bytes("ab", "utf8"), FixedType.of_length(2),
                              bytes([97, 98]))
        self.assertEqual(bytes([97, 98]),
                         Literal.of(bytes("ab", "utf8")).to_byte_buffer())

        # binary values are stored directly
        # 'Z' -> 90
        self.assertConversion(bytearray("Z", "utf8"), BinaryType.get(),
                              bytes([90]))
        self.assertEqual(bytes([90]),
                         Literal.of(bytearray("Z", "utf8")).to_byte_buffer())

        # decimals are stored as unscaled values in the form of two's-complement big-endian binary,
        # using the minimum number of bytes for the values
        # 345 is 0...1|01011001 in binary
        # 00000001 -> 1, 01011001 -> 89
        self.assertConversion(
            Decimal(3.45).quantize(Decimal(".01")), DecimalType.of(3, 2),
            bytes([1, 89]))
        self.assertEqual(
            bytes([1, 89]),
            Literal.of(3.45).to(DecimalType.of(3, 2)).to_byte_buffer())

        # decimal on 3-bytes to test that we use the minimum number of bytes and not a power of 2
        # 1234567 is 00010010|11010110|10000111 in binary
        # 00010010 -> 18, 11010110 -> 214, 10000111 -> 135
        self.assertConversion(
            Decimal(123.4567).quantize(Decimal(".0001")), DecimalType.of(7, 4),
            bytes([18, 214, 135]))
        self.assertEqual(
            bytes([18, 214, 135]),
            Literal.of(123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

        # negative decimal to test two's complement
        # -1234567 is 11101101|00101001|01111001 in binary
        # 11101101 -> 237, 00101001 -> 41, 01111001 -> 121
        self.assertConversion(
            Decimal(-123.4567).quantize(Decimal(".0001")),
            DecimalType.of(7, 4), bytes([237, 41, 121]))
        self.assertEqual(
            bytes([237, 41, 121]),
            Literal.of(-123.4567).to(DecimalType.of(7, 4)).to_byte_buffer())

        # test empty byte in decimal
        # 11 is 00001011 in binary
        # 00001011 -> 11
        self.assertConversion(
            Decimal(0.011).quantize(Decimal(".001")), DecimalType.of(10, 3),
            bytes([11]))
        self.assertEqual(
            bytes([11]),
            Literal.of(0.011).to(DecimalType.of(10, 3)).to_byte_buffer())