Example #1
0
def test_decimal_to_decimal_conversion():
    lit = Literal.of(Decimal("34.11").quantize(Decimal(".01")))

    assert lit.value.as_tuple() == lit.to(DecimalType.of(9, 2)).value.as_tuple()
    assert lit.value.as_tuple() == lit.to(DecimalType.of(11, 2)).value.as_tuple()
    assert lit.to(DecimalType.of(9, 0)) is None
    assert lit.to(DecimalType.of(9, 1)) is None
    assert lit.to(DecimalType.of(9, 3)) is None
def test_string_to_decimal_literal():
    decimal_str = Literal.of("34.560")
    decimal_lit = decimal_str.to(DecimalType.of(9, 3))

    assert 3 == abs(decimal_lit.value.as_tuple().exponent)
    assert Decimal("34.560").as_tuple() == decimal_lit.value.as_tuple()

    assert decimal_str.to(DecimalType.of(9, 2)) is None
    assert decimal_str.to(DecimalType.of(9, 4)) is None
Example #3
0
def rg_expected_schema():
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])
Example #4
0
def test_literal_converison(op, assert_and_unwrap):
    struct = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    unbound = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    bound = assert_and_unwrap(unbound.bind(struct))

    assert Decimal(12.40).quantize(Decimal(".01")).as_tuple() == bound.lit.value.as_tuple()
    assert 15 == bound.ref.field.field_id
    assert op == bound.op
Example #5
0
def test_big_decimal_to_human_string():
    big_dec = DecimalType.of(9, 2)
    identity = Transforms.identity(big_dec)

    dec_str = "-1.50"
    dec_var = Decimal(dec_str)

    assert identity.to_human_string(dec_var) == dec_str
def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    compare_schema(
        expected_schema,
        convert_parquet_to_iceberg(primitive_type_test_parquet_file))
Example #7
0
def supported_primitives():
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(114, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(114, "dec_38_10", DecimalType.of(38, 10))
    ])
Example #8
0
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
Example #9
0
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table
Example #10
0
    def test_partition_spec(self):
        schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                        NestedField.required(2, "l", LongType.get()),
                        NestedField.required(3, "d", DateType.get()),
                        NestedField.required(4, "t", TimeType.get()),
                        NestedField.required(5, "ts", TimestampType.without_timezone()),
                        NestedField.required(6, "dec", DecimalType.of(9, 2)),
                        NestedField.required(7, "s", StringType.get()),
                        NestedField.required(8, "u", UUIDType.get()),
                        NestedField.required(9, "f", FixedType.of_length(3)),
                        NestedField.required(10, "b", BinaryType.get()))
        specs = [PartitionSpec.builder_for(schema).identity("i").build(),
                 PartitionSpec.builder_for(schema).identity("l").build(),
                 PartitionSpec.builder_for(schema).identity("d").build(),
                 PartitionSpec.builder_for(schema).identity("t").build(),
                 PartitionSpec.builder_for(schema).identity("ts").build(),
                 PartitionSpec.builder_for(schema).identity("dec").build(),
                 PartitionSpec.builder_for(schema).identity("s").build(),
                 PartitionSpec.builder_for(schema).identity("u").build(),
                 PartitionSpec.builder_for(schema).identity("f").build(),
                 PartitionSpec.builder_for(schema).identity("b").build(),
                 PartitionSpec.builder_for(schema).bucket("i", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("l", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("d", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("t", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("s", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("u", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("f", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("b", 128).build(),
                 PartitionSpec.builder_for(schema).year("d").build(),
                 PartitionSpec.builder_for(schema).month("d").build(),
                 PartitionSpec.builder_for(schema).day("d").build(),
                 PartitionSpec.builder_for(schema).year("ts").build(),
                 PartitionSpec.builder_for(schema).month("ts").build(),
                 PartitionSpec.builder_for(schema).day("ts").build(),
                 PartitionSpec.builder_for(schema).hour("ts").build(),
                 PartitionSpec.builder_for(schema).truncate("i", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("l", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("s", 10).build(),
                 PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
                 PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
                 ]

        for spec in specs:
            self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
Example #11
0
    def test_bucket_hash(self):
        buckets = [
            [Transforms.bucket(IntegerType.get(), 100), 34, 2017239379],
            [Transforms.bucket(LongType.get(), 100), 34, 2017239379],
            [Transforms.bucket(DateType.get(), 100), 17486, -653330422],
            [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989],
            [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441],
            [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589],
            [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089],
            [Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340],
            [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512],
            [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207]
        ]

        for bucket in buckets:
            self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))
Example #12
0
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Example #13
0
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'
    assert expected == PartitionSpecParser.to_json(spec)
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()
    ]

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
Example #15
0
                          (34, IntegerType, 2017239379),
                          (34, LongType, 2017239379)])
def test_spec_values_int(test_input, test_type, expected):
    assert Bucket.get(test_type.get(), 100).hash(test_input) == expected


@pytest.mark.parametrize("test_input,test_type,expected",
                         [(1, BucketFloat(100), -142385009),
                          (1, BucketDouble(100), -142385009)])
def test_spec_values_dbl(test_input, test_type, expected):
    assert test_type.hash(test_input) == expected


@pytest.mark.parametrize(
    "test_input,test_type,scale_factor,expected",
    [(Decimal("14.20"), DecimalType.of(9, 2), Decimal(10)**-2, -500754589),
     (Decimal("137302769811943318102518958871258.37580"), DecimalType.of(
         38, 5), Decimal(10)**-5, -32334285)])
def test_spec_values_dec(test_input, test_type, scale_factor, expected):
    getcontext().prec = 38
    assert Bucket.get(test_type,
                      100).hash(test_input.quantize(scale_factor)) == expected


@pytest.mark.parametrize("test_input,test_type,expected", [
    (Literal.of("2017-11-16").to(DateType.get()), DateType.get(), -653330422),
    (Literal.of("22:31:08").to(TimeType.get()), TimeType.get(), -662762989),
    (Literal.of("2017-11-16T22:31:08").to(TimestampType.without_timezone()),
     TimestampType.without_timezone(), -2047944441),
    (Literal.of("2017-11-16T14:31:08-08:00").to(TimestampType.with_timezone()),
     TimestampType.with_timezone(), -2047944441)
                               DateType,
                               DecimalType,
                               DoubleType,
                               FixedType,
                               FloatType,
                               IntegerType,
                               LongType,
                               StringType,
                               TimestampType,
                               TimeType,
                               UUIDType)

PRIMITIVES = [BinaryType.get(),
              BooleanType.get(),
              DateType.get(),
              DecimalType.of(9, 2),
              DecimalType.of(11, 2),
              DecimalType.of(9, 3),
              DoubleType.get(),
              FixedType.of_length(3),
              FixedType.of_length(4),
              FloatType.get(),
              IntegerType.get(),
              LongType.get(),
              StringType.get(),
              TimestampType.with_timezone(),
              TimestampType.without_timezone(),
              TimeType.get(),
              UUIDType.get()]

Example #17
0
                               ListType,
                               LongType,
                               MapType,
                               NestedField,
                               StringType,
                               StructType,
                               TimestampType)
from iceberg.api.types import Type
import pyarrow as pa
from pyarrow.parquet import lib, ParquetFile

_logger = logging.getLogger(__name__)

arrow_type_map = {lib.Type_BOOL: lambda x=None: BooleanType.get(),
                  lib.Type_DATE32: lambda x=None: DateType.get(),
                  lib.Type_DECIMAL128: lambda x=None: DecimalType.of(x.precision, x.scale),
                  lib.Type_DOUBLE: lambda x=None: DoubleType.get(),
                  lib.Type_FIXED_SIZE_BINARY: lambda x=None: FixedType.of_length(x.byte_width),
                  lib.Type_BINARY: lambda x=None: BinaryType.get(),
                  lib.Type_FLOAT: lambda x=None: FloatType.get(),
                  lib.Type_STRING: lambda x=None: StringType.get(),
                  lib.Type_INT32: lambda x=None: IntegerType.get(),
                  lib.Type_INT64: lambda x=None: LongType.get(),
                  lib.Type_TIMESTAMP: lambda x=None: (TimestampType.without_timezone()
                                                      if x.tz is None
                                                      else TimestampType.with_timezone())
                  }


def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    if nullable:
Example #18
0
def test_truncate_decimal(input_var, expected):
    trunc = Truncate.get(DecimalType.of(9, 2), 10)
    assert trunc.apply(input_var) == expected
Example #19
0
                    Literal.of("2017-11-29T11:30:07.123").to(
                        TimestampType.without_timezone()),
                    Literal.of("2017-11-29T11:30:07.123+01:00").to(
                        TimestampType.with_timezone()),
                    Literal.of("abc"),
                    Literal.of(uuid.uuid4()),
                    Literal.of(bytes([0x01, 0x02,
                                      0x03])).to(FixedType.of_length(3)),
                    Literal.of(bytes([0x03, 0x04, 0x05,
                                      0x06])).to(BinaryType.get()),
                    Literal.of(Decimal(122.50).quantize(Decimal(".01")))
                ])
def literal(request):
    yield request.param


@pytest.fixture(scope="session",
                params=[(DecimalType.of(9, 0), "34"),
                        (DecimalType.of(9, 2), "34.00"),
                        (DecimalType.of(9, 4), "34.0000")])
def type_val_tuples(request):
    yield request.param


@pytest.fixture(scope="session",
                params=[(DecimalType.of(9, 1), "34.6"),
                        (DecimalType.of(9, 2), "34.56"),
                        (DecimalType.of(9, 4), "34.5600")])
def float_type_val_tuples(request):
    yield request.param