def test_column_rename(primitive_type_test_file):
    """Reading with renamed expected-schema columns still resolves data by field id."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # Build the table the reader is expected to produce.
    columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])
    expected_table = pa.table(columns, schema=arrow_schema)

    assert expected_table == reader.read()
def schema():
    """Six-column test schema mixing required and optional fields."""
    fields = [
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(2, "no_stats", IntegerType.get()),
        NestedField.required(3, "required", StringType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()),
    ]
    return Schema(*fields)
def inc_man_spec():
    """Identity partition spec (spec id 0) over id/all_nulls/some_nulls/no_nulls."""
    inc_schema = Schema(
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()))

    builder = PartitionSpec.builder_for(inc_schema).with_spec_id(0)
    # Each column becomes its own identity partition field.
    for column in ("id", "all_nulls", "some_nulls", "no_nulls"):
        builder = builder.identity(column)
    return builder.build()
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    """Parquet lists and structs convert to the matching Iceberg complex types."""
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col",
                             ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col",
                             ListType.of_optional(6, StringType.get())),
        NestedField.optional(7, "struct_col", StructType.of([
            NestedField.optional(8, "f1", IntegerType.get()),
            NestedField.optional(9, "f2", StringType.get())
        ]))
    ])
    actual_schema = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, actual_schema)
def get_type(partition_type):
    """Build the manifest-entry StructType, embedding *partition_type* as "partition"."""
    # NEXT ID TO ASSIGN: 131
    fields = [
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns",
                             ListType.of_required(112, IntegerType.get())),
        NestedField.optional(108, "column_sizes",
                             MapType.of_required(117, 118,
                                                 IntegerType.get(), LongType.get())),
        NestedField.optional(109, "value_counts",
                             MapType.of_required(119, 120,
                                                 IntegerType.get(), LongType.get())),
        NestedField.optional(110, "null_value_counts",
                             MapType.of_required(121, 122,
                                                 IntegerType.get(), LongType.get())),
        NestedField.optional(125, "lower_bounds",
                             MapType.of_required(126, 127,
                                                 IntegerType.get(), BinaryType.get())),
        NestedField.optional(128, "upper_bounds",
                             MapType.of_required(129, 130,
                                                 IntegerType.get(), BinaryType.get())),
    ]
    return StructType.of(fields)
def supported_primitives():
    """Struct covering every supported primitive type, one field per type.

    Fix: the three decimal fields all reused field id 114; Iceberg field ids
    must be unique within a schema, so they are renumbered 114/115/116.
    """
    return StructType.of([
        NestedField.required(100, "id", LongType.get()),
        NestedField.optional(101, "data", StringType.get()),
        NestedField.required(102, "b", BooleanType.get()),
        NestedField.optional(103, "i", IntegerType.get()),
        NestedField.required(104, "l", LongType.get()),
        NestedField.optional(105, "f", FloatType.get()),
        NestedField.required(106, "d", DoubleType.get()),
        NestedField.optional(107, "date", DateType.get()),
        NestedField.required(108, "ts", TimestampType.with_timezone()),
        NestedField.required(110, "s", StringType.get()),
        NestedField.required(111, "uuid", UUIDType.get()),
        NestedField.required(112, "fixed", FixedType.of_length(7)),
        NestedField.optional(113, "bytes", BinaryType.get()),
        NestedField.required(114, "dec_9_0", DecimalType.of(9, 0)),
        NestedField.required(115, "dec_11_2", DecimalType.of(11, 2)),
        NestedField.required(116, "dec_38_10", DecimalType.of(38, 10))
    ])
def test_compound_filter(primitive_type_test_file):
    """An AND of two equality predicates keeps only rows matching both."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))
    reader = ParquetReader(input_file, expected_schema, {}, row_filter, True)

    # Only the first row satisfies string_col == 'us' AND int_col == 1.
    expected_table = pa.table([
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ], schema=pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True)
    ]))

    assert expected_table == reader.read()
def rg_expected_schema():
    """Expected Iceberg schema for the row-group statistics test file."""
    fields = [
        NestedField.required(1, "string_col", StringType.get()),
        NestedField.required(2, "long_col", LongType.get()),
        NestedField.required(3, "int_col", IntegerType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "null_col", StringType.get()),
        NestedField.optional(6, "missing_col", StringType.get()),
        NestedField.optional(7, "no_stats_col", StringType.get()),
        NestedField.optional(8, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(9, "ts_wotz_col", TimestampType.without_timezone()),
        NestedField.optional(10, "big_decimal_type", DecimalType.of(38, 5)),
        NestedField.optional(11, "small_decimal_type", DecimalType.of(10, 2)),
        NestedField.optional(12, "date_type", DateType.get()),
    ]
    return Schema(fields)
def test_not_null(assert_and_unwrap):
    """NOT_NULL binds to a bound predicate on an optional field but folds to
    always-true on a required field (which can never be null)."""
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))

    optional_struct = StructType.of([NestedField.optional(21, "s", StringType.get())])
    bound = assert_and_unwrap(unbound.bind(optional_struct))
    assert bound.op == Operation.NOT_NULL
    assert bound.ref.field.field_id == 21
    # NOT_NULL carries no comparison literal.
    assert bound.lit is None

    required_struct = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert unbound.bind(required_struct) == Expressions.always_true()
def test_decimal_column_add(primitive_type_test_file):
    """A decimal column added by schema evolution reads back as all nulls."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    expected_table = pa.table([
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        # The file has no data for the new column, so every value is null.
        pa.array([None] * 5, type=pa.decimal128(38, 9))
    ], schema=pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)
    ]))

    assert expected_table == reader.read()
def convert_avro_field_to_iceberg(field, next_id):
    """Convert one Avro record field to an Iceberg NestedField.

    Returns ``(converted, next_id)``. When the Avro field carries no
    field-id property the raw converted type is returned instead of a
    NestedField.
    """
    field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

    field_id = field.get(AvroToIceberg.FIELD_ID_PROP)
    if field_id is None:
        # No id to attach — hand back the bare type.
        return field_type, next_id

    factory = NestedField.optional if is_optional else NestedField.required
    nested = factory(field_id, field.get(AvroToIceberg.FIELD_NAME_PROP), field_type)
    return nested, next_id
def test_schema_evolution_filter(primitive_type_test_file):
    """Filtering on a column added by schema evolution: not_null matches no
    rows, is_null matches every row (the file has no data for the column)."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # not_null on an all-null column must select nothing.
    empty_columns = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]
    not_null_table = pa.table(empty_columns, schema=arrow_schema)

    # is_null on an all-null column must select every row.
    full_columns = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None] * 5, type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None] * 5, type=pa.string())
    ]
    null_table = pa.table(full_columns, schema=arrow_schema)

    assert not_null_table == reader.read()

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    assert null_table == reader.read()
def struct_from_dict(dict_obj):
    """Build a StructType from its parsed-JSON dict representation.

    Each entry of ``dict_obj[SchemaParser.FIELDS]`` supplies an id, name,
    type dict, and a required flag; the flag selects the required/optional
    NestedField factory. (Idiom fix: ``list()`` literal and the duplicated
    append branches are collapsed into one factory-dispatched append.)
    """
    struct_fields = []
    for field in dict_obj.get(SchemaParser.FIELDS):
        field_type = SchemaParser.type_from_dict(field.get(SchemaParser.TYPE))
        factory = (NestedField.required if field.get(SchemaParser.REQUIRED)
                   else NestedField.optional)
        struct_fields.append(factory(field.get(SchemaParser.ID),
                                     field.get(SchemaParser.NAME),
                                     field_type))
    return StructType.of(struct_fields)
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Projecting only the first two columns drops everything else."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    # Strip trailing columns until only the two projected ones remain
    # (indices num_columns-1 down to 2, same set as the original loop).
    for idx in range(source_table.num_columns - 1, 1, -1):
        source_table = source_table.remove_column(idx)

    assert source_table == reader.read()
import io
import pickle
import uuid

from iceberg.api import DataFile
from iceberg.api.expressions import (BoundPredicate,
                                     Expressions,
                                     ExpressionVisitors,
                                     Literal,
                                     Operation,
                                     UnboundPredicate)
from iceberg.api.schema import Schema
from iceberg.api.struct_like import StructLike
from iceberg.api.types import (BinaryType,
                               Conversions,
                               DateType,
                               DecimalType,
                               FixedType,
                               IntegerType,
                               NestedField,
                               StringType,
                               TimestampType,
                               TimeType)
import pytest

# Single optional-int-column schema shared by the expression tests in this module.
exp_schema = Schema(NestedField.optional(34, "a", IntegerType.get()))


class TestHelpers(object):
    """Static assertion helpers for expression-binding tests."""

    @staticmethod
    def assert_all_references_bound(message, expr):
        """Visit *expr* and fail with *message* if any reference is unbound."""
        ExpressionVisitors.visit(expr, TestHelpers.CheckReferencesBound(message))

    @staticmethod
    def assert_and_unwrap(expr, expected=None):
        """Assert *expr* is of type *expected* (default: BoundPredicate).

        NOTE(review): this view ends on the assert with no return statement —
        callers that use the return value (e.g. test_not_null) suggest a
        ``return expr`` may have been lost; confirm against the full file.
        """
        if expected is not None:
            assert isinstance(expr, expected)
        else:
            assert isinstance(expr, BoundPredicate)
# KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import iceberg.api.expressions as exp from iceberg.api.types import (IntegerType, NestedField, StringType, StructType) from iceberg.exceptions import ValidationException from pytest import raises STRUCT = StructType.of([NestedField.required(13, "x", IntegerType.get()), NestedField.required(14, "y", IntegerType.get()), NestedField.optional(15, "z", IntegerType.get())]) def test_less_than(row_of): evaluator = exp.evaluator.Evaluator(STRUCT, exp.expressions.Expressions.less_than("x", 7)) assert not evaluator.eval(row_of((7, 8, None))) assert evaluator.eval(row_of((6, 8, None))) def test_less_than_or_equal(row_of): evaluator = exp.evaluator.Evaluator(STRUCT, exp.expressions.Expressions.less_than_or_equal("x", 7)) assert evaluator.eval(row_of((7, 8, None))) assert evaluator.eval(row_of((6, 8, None))) assert not evaluator.eval(row_of((8, 8, None)))
def test_primitive_types(primitive_type_test_parquet_file):
    """Each primitive Parquet column converts to the expected Iceberg type."""
    fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ]
    converted_schema = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
    compare_schema(Schema(fields), converted_schema)
def iceberg_full_read_projection_schema():
    """Two-column schema (required id, optional data) for projection tests."""
    fields = [
        NestedField.required(0, "id", LongType.get()),
        NestedField.optional(1, "data", StringType.get()),
    ]
    return Schema(fields)
def get_nested_field(field_id: int, field_name: str, field_type: Type,
                     nullable: bool) -> NestedField:
    """Build a NestedField, optional when *nullable* else required."""
    factory = NestedField.optional if nullable else NestedField.required
    return factory(field_id, field_name, field_type)
import os import random import tempfile import time from iceberg.api import Files, PartitionSpec, Schema from iceberg.api.types import BooleanType, IntegerType, LongType, NestedField, StringType from iceberg.core import (BaseSnapshot, BaseTable, ConfigProperties, GenericManifestFile, SnapshotLogEntry, TableMetadata, TableMetadataParser, TableOperations, TableProperties) from iceberg.exceptions import AlreadyExistsException, CommitFailedException import pytest SCHEMA = Schema([NestedField.optional(1, "b", BooleanType.get())]) METADATA = dict() VERSIONS = dict() class LocalTableOperations(TableOperations): def current(self): raise RuntimeError("Not implemented for tests") def refresh(self): raise RuntimeError("Not implemented for tests") def commit(self, base, metadata): raise RuntimeError("Not implemented for tests") def new_input_file(self, path):
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """An unfiltered full-schema scan returns the file contents unchanged."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    assert reader.read() == expected_table