Beispiel #1
0
def test_column_rename(primitive_type_test_file):
    """Read the fixture file with field 3 projected under the name ``string_col``.

    The reader gets a five-field Iceberg schema and an always-true filter;
    its output must equal a hand-built pyarrow table with the same column
    names, types and nullability.
    """
    iceberg_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    reader = ParquetReader(parquet_input, iceberg_schema, {},
                           Expressions.always_true(), True)

    # Expected rows, one pyarrow array per projected column.
    arrow_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
    ])
    expected_table = pa.table(arrow_columns, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def schema():
    """Return a six-column schema: two integer columns followed by four string columns.

    Field ids run from 1 to 6; ``id`` and ``required`` are required, the
    remaining columns are optional.
    """
    columns = (
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(2, "no_stats", IntegerType.get()),
        NestedField.required(3, "required", StringType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    )
    # Fields are passed as separate positional arguments, not as one list.
    return Schema(*columns)
Beispiel #3
0
def inc_man_spec():
    """Return a partition spec (spec id 0) with identity partitions on four columns.

    The spec is built over a schema containing ``id``, ``all_nulls``,
    ``some_nulls`` and ``no_nulls``, and partitions on each of them in that
    order.
    """
    columns = (
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    )
    inc_schema = Schema(*columns)

    builder = PartitionSpec.builder_for(inc_schema).with_spec_id(0)
    for column_name in ("id", "all_nulls", "some_nulls", "no_nulls"):
        # Each call returns the builder to continue the chain with.
        builder = builder.identity(column_name)
    return builder.build()
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    """Convert a parquet file with two list columns and one struct column.

    The converted schema is compared against an expected schema holding an
    optional list of ints, an optional list of strings and an optional
    struct with the members ``f1`` (int) and ``f2`` (string).
    """
    int_list_field = NestedField.optional(
        1, "list_int_col", ListType.of_optional(3, IntegerType.get()))
    str_list_field = NestedField.optional(
        4, "list_str_col", ListType.of_optional(6, StringType.get()))
    struct_members = [
        NestedField.optional(8, "f1", IntegerType.get()),
        NestedField.optional(9, "f2", StringType.get()),
    ]
    struct_field = NestedField.optional(7, "struct_col",
                                        StructType.of(struct_members))
    expected = Schema([int_list_field, str_list_field, struct_field])

    actual = convert_parquet_to_iceberg(
        unnested_complex_type_test_parquet_file)
    compare_schema(expected, actual)
Beispiel #5
0
 def get_type(partition_type):
     """Return a struct type made of file-level metadata fields.

     ``partition_type`` becomes the type of the required ``partition`` field
     (id 102); all other fields (``file_path``, ``file_format``, counts,
     sizes, bounds, ...) have fixed ids and types.
     """

     def int_keyed_map(key_id, value_id, value_type):
         # Every map-valued field below uses integer keys.
         return MapType.of_required(key_id, value_id, IntegerType.get(),
                                    value_type)

     fields = [
         NestedField.required(100, "file_path", StringType.get()),
         NestedField.required(101, "file_format", StringType.get()),
         NestedField.required(102, "partition", partition_type),
         NestedField.required(103, "record_count", LongType.get()),
         NestedField.required(104, "file_size_in_bytes", LongType.get()),
         NestedField.required(105, "block_size_in_bytes", LongType.get()),
         NestedField.optional(106, "file_ordinal", IntegerType.get()),
         NestedField.optional(107, "sort_columns",
                              ListType.of_required(112, IntegerType.get())),
         NestedField.optional(108, "column_sizes",
                              int_keyed_map(117, 118, LongType.get())),
         NestedField.optional(109, "value_counts",
                              int_keyed_map(119, 120, LongType.get())),
         NestedField.optional(110, "null_value_counts",
                              int_keyed_map(121, 122, LongType.get())),
         NestedField.optional(125, "lower_bounds",
                              int_keyed_map(126, 127, BinaryType.get())),
         NestedField.optional(128, "upper_bounds",
                              int_keyed_map(129, 130, BinaryType.get())),
     ]
     # NEXT ID TO ASSIGN: 131
     return StructType.of(fields)
Beispiel #6
0
def supported_primitives():
    """Return a struct type with one field per primitive type used in tests.

    Field ids must be unique within a struct. The three decimal fields
    previously all used id 114; they now use 114, 115 and 116.
    Id 109 is not assigned in this struct.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
Beispiel #7
0
def test_compound_filter(primitive_type_test_file):
    """Read with the filter ``string_col == "us" AND int_col == 1``.

    The expected result is a single row, with columns in the order declared
    by the read schema (``string_col`` last).
    """
    iceberg_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))
    reader = ParquetReader(parquet_input, iceberg_schema, {}, row_filter,
                           True)

    # The one row expected to satisfy both predicates.
    arrow_columns = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string()),
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
    ])
    expected_table = pa.table(arrow_columns, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
Beispiel #8
0
def rg_expected_schema():
    """Return a twelve-column schema (field ids 1-12).

    The first three columns are required; the rest are optional and cover
    float, string, timestamp (with and without zone), decimal and date types.
    """
    columns = [
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col",
                             TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ]
    return Schema(columns)
Beispiel #9
0
def test_not_null(assert_and_unwrap):
    """Bind a NOT_NULL predicate on ``s`` to an optional and to a required struct.

    Against the optional field the bound predicate keeps its operation,
    refers to field id 21 and carries no literal. Against the required
    field the bind result equals ``Expressions.always_true()``.
    """
    optional_struct = StructType.of(
        [NestedField.optional(21, "s", StringType.get())])
    predicate = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))

    bound = assert_and_unwrap(predicate.bind(optional_struct))
    assert Operation.NOT_NULL == bound.op
    assert 21 == bound.ref.field.field_id
    assert bound.lit is None

    required_struct = StructType.of(
        [NestedField.required(22, "s", StringType.get())])
    assert Expressions.always_true() == predicate.bind(required_struct)
Beispiel #10
0
def test_decimal_column_add(primitive_type_test_file):
    """Read with a schema that includes the decimal column ``new_dec_col`` (id 13).

    The expected table has that column as five nulls of type
    ``decimal128(38, 9)`` alongside the four numeric columns.
    """
    iceberg_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9)),
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    reader = ParquetReader(parquet_input, iceberg_schema, {},
                           Expressions.always_true(), True)

    arrow_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        # The added decimal column is expected to be null in every row.
        pa.array([None] * 5, type=pa.decimal128(38, 9)),
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True),
    ])
    expected_table = pa.table(arrow_columns, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
Beispiel #11
0
    def convert_avro_field_to_iceberg(field, next_id):
        """Convert one Avro field description into its Iceberg counterpart.

        Returns a ``(result, next_id)`` tuple. ``result`` is the bare converted
        type when the field has no ``FIELD_ID_PROP`` entry; otherwise it is an
        optional or required ``NestedField`` built from the field's id, name
        and converted type. ``next_id`` is the value handed back by
        ``AvroToIceberg.convert_type``.
        """
        field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

        field_id = field.get(AvroToIceberg.FIELD_ID_PROP)
        if field_id is None:
            return field_type, next_id

        make_field = NestedField.optional if is_optional else NestedField.required
        nested = make_field(field_id,
                            field.get(AvroToIceberg.FIELD_NAME_PROP),
                            field_type)
        return nested, next_id
Beispiel #12
0
def test_schema_evolution_filter(primitive_type_test_file):
    """Filter on ``new_col``, a column expected to be null in every row.

    ``new_col`` (id 15) and ``other_new_col`` (id 16) are all-null in the
    expected data, so ``not_null("new_col")`` must produce an empty table
    and ``is_null("new_col")`` must produce all five rows.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])

    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # Empty result: one zero-length array per column, each typed to match
    # the corresponding field in ``schema``.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int64()),  # other_new_col is int64, not int32
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]

    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    # Same file and schema, opposite predicate: every row should be returned.
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
Beispiel #13
0
    def struct_from_dict(dict_obj):
        """Build a ``StructType`` from the field entries stored in ``dict_obj``.

        Each entry under ``SchemaParser.FIELDS`` supplies an id, a name and a
        type description (resolved through ``SchemaParser.type_from_dict``).
        A truthy ``SchemaParser.REQUIRED`` value yields a required field,
        anything else an optional one.
        """
        nested_fields = []
        for entry in dict_obj.get(SchemaParser.FIELDS):
            entry_id = entry.get(SchemaParser.ID)
            entry_name = entry.get(SchemaParser.NAME)
            entry_type = SchemaParser.type_from_dict(
                entry.get(SchemaParser.TYPE))

            if entry.get(SchemaParser.REQUIRED):
                make_field = NestedField.required
            else:
                make_field = NestedField.optional
            nested_fields.append(make_field(entry_id, entry_name, entry_type))

        return StructType.of(nested_fields)
Beispiel #14
0
def test_projection(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    """Read only ``int_col`` and ``bigint_col`` from the fixture file.

    The expected table is the full fixture table with every column except
    the first two removed.
    """
    projected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    reader = ParquetReader(parquet_input, projected_schema, {},
                           Expressions.always_true(), True)

    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    # Drop columns from the highest index down to index 2, so that only
    # columns 0 and 1 remain.
    for column_index in range(expected_table.num_columns - 1, 1, -1):
        expected_table = expected_table.remove_column(column_index)

    assert expected_table == reader.read()
import io
import pickle
import uuid

from iceberg.api import DataFile
from iceberg.api.expressions import (BoundPredicate, Expressions,
                                     ExpressionVisitors, Literal, Operation,
                                     UnboundPredicate)
from iceberg.api.schema import Schema
from iceberg.api.struct_like import StructLike
from iceberg.api.types import (BinaryType, Conversions, DateType, DecimalType,
                               FixedType, IntegerType, NestedField, StringType,
                               TimestampType, TimeType)
import pytest

# Schema with a single optional integer column "a" (field id 34).
exp_schema = Schema(NestedField.optional(34, "a", IntegerType.get()))


class TestHelpers(object):
    """Static assertion helpers for expression tests.

    NOTE(review): ``CheckReferencesBound`` is referenced below but is not
    defined in the visible part of this class; presumably it is a nested
    visitor class declared further down. Confirm against the full file.
    """

    @staticmethod
    def assert_all_references_bound(message, expr):
        """Visit ``expr`` with a ``TestHelpers.CheckReferencesBound(message)`` visitor."""
        ExpressionVisitors.visit(expr,
                                 TestHelpers.CheckReferencesBound(message))

    @staticmethod
    def assert_and_unwrap(expr, expected=None):
        """Assert the type of ``expr``.

        ``expr`` must be an instance of ``expected`` when one is given, and of
        ``BoundPredicate`` otherwise.

        NOTE(review): despite the name, nothing is returned in the visible
        code; this excerpt may be truncated. Confirm before relying on the
        return value.
        """
        if expected is not None:
            assert isinstance(expr, expected)
        else:
            assert isinstance(expr, BoundPredicate)
Beispiel #16
0
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import iceberg.api.expressions as exp
from iceberg.api.types import (IntegerType,
                               NestedField,
                               StringType,
                               StructType)
from iceberg.exceptions import ValidationException
from pytest import raises

# Struct the evaluator tests run against: required ints "x" (id 13) and
# "y" (id 14), plus an optional int "z" (id 15).
STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()),
                       NestedField.required(14, "y", IntegerType.get()),
                       NestedField.optional(15, "z", IntegerType.get())])


def test_less_than(row_of):
    """``x < 7`` rejects a row with x == 7 and accepts a row with x == 6."""
    predicate = exp.expressions.Expressions.less_than("x", 7)
    evaluator = exp.evaluator.Evaluator(STRUCT, predicate)

    # Boundary value: 7 is not strictly less than 7.
    assert not evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))


def test_less_than_or_equal(row_of):
    """``x <= 7`` accepts rows with x == 7 and x == 6, and rejects x == 8."""
    predicate = exp.expressions.Expressions.less_than_or_equal("x", 7)
    evaluator = exp.evaluator.Evaluator(STRUCT, predicate)

    # Boundary value: 7 satisfies the inclusive comparison.
    assert evaluator.eval(row_of((7, 8, None)))
    assert evaluator.eval(row_of((6, 8, None)))
    assert not evaluator.eval(row_of((8, 8, None)))
def test_primitive_types(primitive_type_test_parquet_file):
    """Convert a parquet file of primitive columns and compare the schema.

    The expected Iceberg schema has twelve fields (ids 1-12): one required
    int column followed by optional long, string, float, double, decimal,
    date, timestamp and boolean columns.
    """
    expected = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ])

    converted = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
    compare_schema(expected, converted)
Beispiel #18
0
def iceberg_full_read_projection_schema():
    """Return a two-column schema: required long ``id`` (0) and optional string ``data`` (1)."""
    id_field = NestedField.required(0, "id", LongType.get())
    data_field = NestedField.optional(1, "data", StringType.get())
    return Schema([id_field, data_field])
Beispiel #19
0
def get_nested_field(field_id: int, field_name: str, field_type: Type, nullable: bool) -> NestedField:
    """Create a ``NestedField``: optional when ``nullable`` is truthy, required otherwise."""
    make_field = NestedField.optional if nullable else NestedField.required
    return make_field(field_id, field_name, field_type)
Beispiel #20
0
import os
import random
import tempfile
import time

from iceberg.api import Files, PartitionSpec, Schema
from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType
from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties,
                          GenericManifestFile, SnapshotLogEntry, TableMetadata,
                          TableMetadataParser, TableOperations,
                          TableProperties)
from iceberg.exceptions import AlreadyExistsException, CommitFailedException
import pytest

# Schema with a single optional boolean column "b" (field id 1).
SCHEMA = Schema([NestedField.optional(1, "b", BooleanType.get())])
# Module-level dicts, empty at import time.
# NOTE(review): presumably registries filled in by the table-operations
# helpers in this module; their keys and values are not visible here.
METADATA = dict()
VERSIONS = dict()


class LocalTableOperations(TableOperations):
    def current(self):
        """Not supported by this test implementation; always raises ``RuntimeError``."""
        raise RuntimeError("Not implemented for tests")

    def refresh(self):
        """Not supported by this test implementation; always raises ``RuntimeError``."""
        raise RuntimeError("Not implemented for tests")

    def commit(self, base, metadata):
        """Not supported by this test implementation; always raises ``RuntimeError``.

        ``base`` and ``metadata`` are accepted but never used.
        """
        raise RuntimeError("Not implemented for tests")

    def new_input_file(self, path):
Beispiel #21
0
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array,
                    pyarrow_schema):
    """Read all twelve columns of the fixture file without filtering.

    The reader output must equal the pyarrow table built from the
    ``pyarrow_primitive_array`` and ``pyarrow_schema`` fixtures.
    """
    iceberg_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ])

    file_system = get_fs(primitive_type_test_file, conf={})
    parquet_input = FileSystemInputFile(file_system,
                                        primitive_type_test_file, {})
    reader = ParquetReader(parquet_input, iceberg_schema, {},
                           Expressions.always_true(), True)

    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == expected_table