def schema():
    return Schema(NestedField.required(1, "id", IntegerType.get()),
                  NestedField.optional(2, "no_stats", IntegerType.get()),
                  NestedField.required(3, "required", StringType.get()),
                  NestedField.optional(4, "all_nulls", StringType.get()),
                  NestedField.optional(5, "some_nulls", StringType.get()),
                  NestedField.optional(6, "no_nulls", StringType.get()))
Exemple #2
0
def test_multiple_fields(assert_and_unwrap):
    struct = StructType.of([NestedField.required(10, 'x', IntegerType.get()),
                           NestedField.required(11, 'y', IntegerType.get()),
                           NestedField.required(12, 'z', IntegerType.get())])

    unbound = UnboundPredicate(Operation.LT, Expressions.ref("y"), 6)
    expr = unbound.bind(struct)

    bound = assert_and_unwrap(expr)
    assert 11 == bound.ref.field.field_id
    assert Operation.LT == bound.op
    assert 6 == bound.lit.value
Exemple #3
0
def inc_man_file():
    return TestManifestFile(
        "manifest-list.avro", 1024, 0, int(time.time() * 1000), 5,
        10, 0, (TestFieldSummary(
            False, Conversions.to_byte_buffer(IntegerType.get(), 30),
            Conversions.to_byte_buffer(
                IntegerType.get(), 79)), TestFieldSummary(True, None, None),
                TestFieldSummary(
                    True, Conversions.to_byte_buffer(StringType.get(), 'a'),
                    Conversions.to_byte_buffer(StringType.get(), 'z')),
                TestFieldSummary(
                    False, Conversions.to_byte_buffer(StringType.get(), 'a'),
                    Conversions.to_byte_buffer(StringType.get(), 'z'))))
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(
            7, "struct_col",
            StructType.of([
                NestedField.optional(8, "f1", IntegerType.get()),
                NestedField.optional(9, "f2", StringType.get())
            ]))
    ])
    converted_schema = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
 def test_from_bytes(self):
     self.assertEqual(1234, Conversions.from_byte_buffer(IntegerType.get(),
                                                         b'\xd2\x04\x00\x00'))
     self.assertEqual(1234, Conversions.from_byte_buffer(LongType.get(),
                                                         b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
     self.assertAlmostEqual(1.2345, Conversions.from_byte_buffer(DoubleType.get(),
                                                                 b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
def strict_file():
    return TestDataFile("file.avro", TestHelpers.Row.of(), 50, {
        4: 50,
        5: 50,
        6: 50
    }, {
        4: 50,
        5: 10,
        6: 0
    }, {
        1: Conversions.to_byte_buffer(IntegerType.get(), 30),
        7: Conversions.to_byte_buffer(IntegerType.get(), 5)
    }, {
        1: Conversions.to_byte_buffer(IntegerType.get(), 79),
        7: Conversions.to_byte_buffer(IntegerType.get(), 5)
    })
Exemple #7
0
def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])

    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table
Exemple #8
0
def test_comparison_predicate_binding(op, assert_and_unwrap):
    struct = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("x"), 5)
    bound = assert_and_unwrap(unbound.bind(struct))

    assert 5 == bound.lit.value
    assert 14 == bound.ref.field.field_id
    assert op == bound.op
Exemple #9
0
def test_missing_field():
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])

    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)
    try:
        unbound.bind(struct)
    except ValidationException as e:
        assert e.args[0].startswith("Cannot find field 'missing' in struct")
Exemple #10
0
 def test_from_bytes(self):
     self.assertEqual(
         False, Conversions.from_byte_buffer(BooleanType.get(), b'\x00'))
     self.assertEqual(
         True, Conversions.from_byte_buffer(BooleanType.get(), b'\x01'))
     self.assertEqual(
         1234,
         Conversions.from_byte_buffer(IntegerType.get(),
                                      b'\xd2\x04\x00\x00'))
     self.assertEqual(
         1234,
         Conversions.from_byte_buffer(LongType.get(),
                                      b'\xd2\x04\x00\x00\x00\x00\x00\x00'))
     self.assertAlmostEqual(1.2345,
                            Conversions.from_byte_buffer(
                                FloatType.get(), b'\x19\x04\x9e?'),
                            places=5)
     self.assertAlmostEqual(
         1.2345,
         Conversions.from_byte_buffer(DoubleType.get(),
                                      b'\x8d\x97\x6e\x12\x83\xc0\xf3\x3f'))
     self.assertEqual(
         1234,
         Conversions.from_byte_buffer(DateType.get(), b'\xd2\x04\x00\x00'))
     self.assertEqual(
         100000000000,
         Conversions.from_byte_buffer(TimeType.get(),
                                      b'\x00\xe8vH\x17\x00\x00\x00'))
     self.assertEqual(
         100000000000,
         Conversions.from_byte_buffer(TimestampType.with_timezone(),
                                      b'\x00\xe8vH\x17\x00\x00\x00'))
     self.assertEqual(
         100000000000,
         Conversions.from_byte_buffer(TimestampType.without_timezone(),
                                      b'\x00\xe8vH\x17\x00\x00\x00'))
     self.assertEqual(
         "foo", Conversions.from_byte_buffer(StringType.get(), b'foo'))
     self.assertEqual(
         uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
         Conversions.from_byte_buffer(
             UUIDType.get(), b'\xf7\x9c>\tg|K\xbd\xa4y?4\x9c\xb7\x85\xe7'))
     self.assertEqual(
         b'foo', Conversions.from_byte_buffer(FixedType.of_length(3),
                                              b'foo'))
     self.assertEqual(
         b'foo', Conversions.from_byte_buffer(BinaryType.get(), b'foo'))
     self.assertEqual(
         Decimal(123.45).quantize(Decimal(".01")),
         Conversions.from_byte_buffer(DecimalType.of(5, 2), b'\x30\x39'))
     self.assertEqual(
         Decimal(123.4567).quantize(Decimal(".0001")),
         Conversions.from_byte_buffer(DecimalType.of(5, 4),
                                      b'\x00\x12\xd6\x87'))
     self.assertEqual(
         Decimal(-123.4567).quantize(Decimal(".0001")),
         Conversions.from_byte_buffer(DecimalType.of(5, 4),
                                      b'\xff\xed\x29\x79'))
Exemple #11
0
 def get_type(partition_type):
     return StructType.of([
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(107, "sort_columns",
                              ListType.of_required(112, IntegerType.get())),
         NestedField.optional(
             108, "column_sizes",
             MapType.of_required(117, 118, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             109, "value_counts",
             MapType.of_required(119, 120, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             110, "null_value_counts",
             MapType.of_required(121, 122, IntegerType.get(),
                                 LongType.get())),
         NestedField.optional(
             125, "lower_bounds",
             MapType.of_required(126, 127, IntegerType.get(),
                                 BinaryType.get())),
         NestedField.optional(
             128, "upper_bounds",
             MapType.of_required(129, 130, IntegerType.get(),
                                 BinaryType.get()))
     ]
                          # NEXT ID TO ASSIGN: 131
                          )
Exemple #12
0
def inc_man_spec():
    inc_schema = Schema(
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()))
    return (PartitionSpec.builder_for(inc_schema).with_spec_id(0).identity(
        "id").identity("all_nulls").identity("some_nulls").identity(
            "no_nulls").build())
def test_table_scan_honors_select_without_case_sensitivity(ts_table):
    scan1 = ts_table.new_scan().case_sensitive(False).select(["ID"])
    # order of refinements shouldn't matter
    scan2 = ts_table.new_scan().select(["ID"]).case_sensitive(False)

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])

    assert scan1.schema.as_struct() == expected_schema.as_struct()
    assert scan2.schema.as_struct() == expected_schema.as_struct()
Exemple #14
0
def test_schema_evolution_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
def test_long_to_integer_conversion(assert_and_unwrap):
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])

    lt = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                          Literal.JAVA_MAX_INT + 1)
    assert lt.bind(struct) == Expressions.always_true()

    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                             Literal.JAVA_MAX_INT + 1)
    assert lt_eq.bind(struct) == Expressions.always_true()

    gt = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                          Literal.JAVA_MIN_INT - 1)
    assert gt.bind(struct) == Expressions.always_true()

    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                             Literal.JAVA_MIN_INT - 1)
    assert gt_eq.bind(struct) == Expressions.always_true()

    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                              Literal.JAVA_MAX_INT + 1)
    assert gt_max.bind(struct) == Expressions.always_false()

    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                                 Literal.JAVA_MAX_INT + 1)
    assert gt_eq_max.bind(struct) == Expressions.always_false()

    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                              Literal.JAVA_MIN_INT - 1)
    assert lt_min.bind(struct) == Expressions.always_false()

    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                                 Literal.JAVA_MIN_INT - 1)
    assert lt_eq_min.bind(struct) == Expressions.always_false()

    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("i"),
                               Literal.JAVA_MAX_INT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_INT

    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MAX_INT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_INT

    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("i"),
                               Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT

    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"),
                                  Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
def file():
    return TestDataFile(
        "file.avro",
        TestHelpers.Row.of(),
        50,
        # value counts
        {
            4: 50,
            5: 50,
            6: 50
        },
        # null value counts
        {
            4: 50,
            5: 10,
            6: 0
        },
        # lower bounds
        {1: Conversions.to_byte_buffer(IntegerType.get(), 30)},
        # upper bounds
        {1: Conversions.to_byte_buffer(IntegerType.get(), 79)})
Exemple #17
0
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()))

    spec = PartitionSpec\
        .builder_for(spec_schema) \
        .identity("id")\
        .bucket("data", 16)\
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2}]}'
    assert expected == PartitionSpecParser.to_json(spec)
Exemple #19
0
    def test_partition_spec(self):
        schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                        NestedField.required(2, "l", LongType.get()),
                        NestedField.required(3, "d", DateType.get()),
                        NestedField.required(4, "t", TimeType.get()),
                        NestedField.required(5, "ts", TimestampType.without_timezone()),
                        NestedField.required(6, "dec", DecimalType.of(9, 2)),
                        NestedField.required(7, "s", StringType.get()),
                        NestedField.required(8, "u", UUIDType.get()),
                        NestedField.required(9, "f", FixedType.of_length(3)),
                        NestedField.required(10, "b", BinaryType.get()))
        specs = [PartitionSpec.builder_for(schema).identity("i").build(),
                 PartitionSpec.builder_for(schema).identity("l").build(),
                 PartitionSpec.builder_for(schema).identity("d").build(),
                 PartitionSpec.builder_for(schema).identity("t").build(),
                 PartitionSpec.builder_for(schema).identity("ts").build(),
                 PartitionSpec.builder_for(schema).identity("dec").build(),
                 PartitionSpec.builder_for(schema).identity("s").build(),
                 PartitionSpec.builder_for(schema).identity("u").build(),
                 PartitionSpec.builder_for(schema).identity("f").build(),
                 PartitionSpec.builder_for(schema).identity("b").build(),
                 PartitionSpec.builder_for(schema).bucket("i", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("l", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("d", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("t", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("s", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("u", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("f", 128).build(),
                 PartitionSpec.builder_for(schema).bucket("b", 128).build(),
                 PartitionSpec.builder_for(schema).year("d").build(),
                 PartitionSpec.builder_for(schema).month("d").build(),
                 PartitionSpec.builder_for(schema).day("d").build(),
                 PartitionSpec.builder_for(schema).year("ts").build(),
                 PartitionSpec.builder_for(schema).month("ts").build(),
                 PartitionSpec.builder_for(schema).day("ts").build(),
                 PartitionSpec.builder_for(schema).hour("ts").build(),
                 PartitionSpec.builder_for(schema).truncate("i", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("l", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
                 PartitionSpec.builder_for(schema).truncate("s", 10).build(),
                 PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
                 PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build(),
                 ]

        for spec in specs:
            self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
Exemple #20
0
def rg_expected_schema():
    return Schema([
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ])
    def test_bucket_hash(self):
        buckets = [
            [Transforms.bucket(IntegerType.get(), 100), 34, 2017239379],
            [Transforms.bucket(LongType.get(), 100), 34, 2017239379],
            [Transforms.bucket(DateType.get(), 100), 17486, -653330422],
            [Transforms.bucket(TimeType.get(), 100), 81068000000, -662762989],
            [Transforms.bucket(TimestampType.without_timezone(), 100), 1510871468000000, -2047944441],
            [Transforms.bucket(DecimalType.of(9, 2), 100), decimal.Decimal("14.20"), -500754589],
            [Transforms.bucket(StringType.get(), 100), "iceberg", 1210000089],
            [Transforms.bucket(UUIDType.get(), 100), uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7"), 1488055340],
            [Transforms.bucket(FixedType.of_length(3), 128), b'foo', -156908512],
            [Transforms.bucket(BinaryType.get(), 128), b'\x00\x01\x02\x03', -188683207]
        ]

        for bucket in buckets:
            self.assertEqual(bucket[2], bucket[0].hash(bucket[1]))
def test_primitive_types(primitive_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    compare_schema(
        expected_schema,
        convert_parquet_to_iceberg(primitive_type_test_parquet_file))
Exemple #23
0
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
Exemple #24
0
def supported_primitives():
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(114, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(114, "dec_38_10", DecimalType.of(38, 10))
    ])
Exemple #25
0
def test_compound_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(
        input_file, expected_schema, {},
        Expressions.and_(Expressions.equal("string_col", "us"),
                         Expressions.equal("int_col", 1)), True)
    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("string_col",
                                         pa.string(),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Exemple #26
0
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)
    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]

    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(),
                                         nullable=False),
                                pa.field("bigint_col",
                                         pa.int64(),
                                         nullable=True),
                                pa.field("float_col",
                                         pa.float32(),
                                         nullable=True),
                                pa.field("dbl_col",
                                         pa.float64(),
                                         nullable=True),
                                pa.field("new_dec_col",
                                         pa.decimal128(38, 9),
                                         nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
Exemple #27
0
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'
    assert expected == PartitionSpecParser.to_json(spec)
Exemple #28
0
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == source_table
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add(6, "dec_bucket", "bucket[16]").build()
    ]

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
Exemple #30
0
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
                               NestedField,
                               StringType,
                               StructType)
from iceberg.exceptions import ValidationException
from pytest import raises

STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
                       NestedField.required(14, "y", IntegerType.get()),
                       NestedField.optional(15, "z", IntegerType.get())])


def test_less_than(row_of):
    evaluator = exp.evaluator.Evaluator(STRUCT,
                                        exp.expressions.Expressions.less_than("x", 7))
    assert not evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))


def test_less_than_or_equal(row_of):
    evaluator = exp.evaluator.Evaluator(STRUCT,
                                        exp.expressions.Expressions.less_than_or_equal("x", 7))
    assert evaluator.eval(row_of((7, 8, None)))