def test_column_rename(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
    ]
    schema = pa.schema([
        pa.field("int_col", pa.int32(), False),
        pa.field("bigint_col", pa.int64(), True),
        pa.field("string_col", pa.string(), True),
        pa.field("float_col", pa.float32(), True),
        pa.field("dbl_col", pa.float64(), True)
    ])
    source_table = pa.table(pyarrow_array, schema=schema)

    target_table = reader.read()
    assert source_table == target_table
def expected_metadata_sorting():
    spec_schema = Schema(NestedField.required(1, "x", LongType.get()),
                         NestedField.required(2, "y", LongType.get()),
                         NestedField.required(3, "z", LongType.get()))

    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .with_spec_id(5) \
        .build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_snapshot = BaseSnapshot(
        ops, previous_snapshot_id, None, timestamp_millis=previous_snapshot_id,
        manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.1.avro"),
                                       spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(
        ops, current_snapshot_id, previous_snapshot_id, timestamp_millis=current_snapshot_id,
        manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.2.avro"),
                                       spec_id=spec.spec_id)])

    reversed_snapshot_log = list()
    metadata = TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3,
                             spec_schema, 5, [spec], {"property": "value"}, current_snapshot_id,
                             [previous_snapshot, current_snapshot], reversed_snapshot_log)

    # Entries are appended newest-first, so the metadata holds a reversed snapshot log.
    reversed_snapshot_log.append(SnapshotLogEntry(current_snapshot.timestamp_millis,
                                                  current_snapshot.snapshot_id))
    reversed_snapshot_log.append(SnapshotLogEntry(previous_snapshot.timestamp_millis,
                                                  previous_snapshot.snapshot_id))
    return metadata
def inc_man_spec():
    inc_schema = Schema(
        NestedField.required(1, "id", IntegerType.get()),
        NestedField.optional(4, "all_nulls", StringType.get()),
        NestedField.optional(5, "some_nulls", StringType.get()),
        NestedField.optional(6, "no_nulls", StringType.get()))
    return (PartitionSpec.builder_for(inc_schema)
            .with_spec_id(0)
            .identity("id")
            .identity("all_nulls")
            .identity("some_nulls")
            .identity("no_nulls")
            .build())
def test_not_null(assert_and_unwrap):
    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))
    expr = unbound.bind(optional)
    bound = assert_and_unwrap(expr)

    assert Operation.NOT_NULL == bound.op
    assert 21 == bound.ref.field.field_id
    assert bound.lit is None

    # NOT_NULL on a required field is trivially true.
    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert Expressions.always_true() == unbound.bind(required)
def test_multiple_fields(assert_and_unwrap):
    struct = StructType.of([NestedField.required(10, 'x', IntegerType.get()),
                            NestedField.required(11, 'y', IntegerType.get()),
                            NestedField.required(12, 'z', IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("y"), 6)
    expr = unbound.bind(struct)
    bound = assert_and_unwrap(expr)

    assert 11 == bound.ref.field.field_id
    assert Operation.LT == bound.op
    assert 6 == bound.lit.value
def test_raise_exception_with_invalid_json():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))

    # Invalid: some fields carry an explicit "field-id" while others omit it.
    spec_string = '{"spec-id": 0, "fields": [' \
                  '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
                  '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
                  '{"name": "data1", "transform": "bucket[16]", "source-id": 2}, ' \
                  '{"name": "data2", "transform": "bucket[8]", "source-id": 2}, ' \
                  '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3}]}'

    with pytest.raises(RuntimeError):
        PartitionSpecParser.from_json(spec_schema, spec_string)
def convert_avro_field_to_iceberg(field, next_id):
    field_type, is_optional, next_id = AvroToIceberg.convert_type(field, next_id)

    if field.get(AvroToIceberg.FIELD_ID_PROP) is None:
        return field_type, next_id

    if is_optional:
        return NestedField.optional(field.get(AvroToIceberg.FIELD_ID_PROP),
                                    field.get(AvroToIceberg.FIELD_NAME_PROP),
                                    field_type), next_id
    else:
        return NestedField.required(field.get(AvroToIceberg.FIELD_ID_PROP),
                                    field.get(AvroToIceberg.FIELD_NAME_PROP),
                                    field_type), next_id
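# A minimal illustrative sketch (not part of the original code): how one Avro
# record field might flow through convert_avro_field_to_iceberg. It assumes
# AvroToIceberg.FIELD_ID_PROP and FIELD_NAME_PROP resolve to the "field-id" and
# "name" keys, and that convert_type reports is_optional for ["null", ...]
# unions; verify against the actual constants before relying on this.
#
#   avro_field = {"name": "user_id", "type": "long", "field-id": 1}
#   field, next_id = convert_avro_field_to_iceberg(avro_field, next_id=2)
#   # -> NestedField.required(1, "user_id", LongType.get()), since the Avro
#   #    type is not a ["null", ...] union
#
#   opt_field = {"name": "email", "type": ["null", "string"], "field-id": 2}
#   field, next_id = convert_avro_field_to_iceberg(opt_field, next_id=next_id)
#   # -> NestedField.optional(2, "email", StringType.get())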
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()))
    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2}]}'
    assert expected == PartitionSpecParser.to_json(spec)
def test_partition_spec(self):
    schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                    NestedField.required(2, "l", LongType.get()),
                    NestedField.required(3, "d", DateType.get()),
                    NestedField.required(4, "t", TimeType.get()),
                    NestedField.required(5, "ts", TimestampType.without_timezone()),
                    NestedField.required(6, "dec", DecimalType.of(9, 2)),
                    NestedField.required(7, "s", StringType.get()),
                    NestedField.required(8, "u", UUIDType.get()),
                    NestedField.required(9, "f", FixedType.of_length(3)),
                    NestedField.required(10, "b", BinaryType.get()))

    specs = [PartitionSpec.builder_for(schema).identity("i").build(),
             PartitionSpec.builder_for(schema).identity("l").build(),
             PartitionSpec.builder_for(schema).identity("d").build(),
             PartitionSpec.builder_for(schema).identity("t").build(),
             PartitionSpec.builder_for(schema).identity("ts").build(),
             PartitionSpec.builder_for(schema).identity("dec").build(),
             PartitionSpec.builder_for(schema).identity("s").build(),
             PartitionSpec.builder_for(schema).identity("u").build(),
             PartitionSpec.builder_for(schema).identity("f").build(),
             PartitionSpec.builder_for(schema).identity("b").build(),
             PartitionSpec.builder_for(schema).bucket("i", 128).build(),
             PartitionSpec.builder_for(schema).bucket("l", 128).build(),
             PartitionSpec.builder_for(schema).bucket("d", 128).build(),
             PartitionSpec.builder_for(schema).bucket("t", 128).build(),
             PartitionSpec.builder_for(schema).bucket("ts", 128).build(),
             PartitionSpec.builder_for(schema).bucket("dec", 128).build(),
             PartitionSpec.builder_for(schema).bucket("s", 128).build(),
             PartitionSpec.builder_for(schema).bucket("u", 128).build(),
             PartitionSpec.builder_for(schema).bucket("f", 128).build(),
             PartitionSpec.builder_for(schema).bucket("b", 128).build(),
             PartitionSpec.builder_for(schema).year("d").build(),
             PartitionSpec.builder_for(schema).month("d").build(),
             PartitionSpec.builder_for(schema).day("d").build(),
             PartitionSpec.builder_for(schema).year("ts").build(),
             PartitionSpec.builder_for(schema).month("ts").build(),
             PartitionSpec.builder_for(schema).day("ts").build(),
             PartitionSpec.builder_for(schema).hour("ts").build(),
             PartitionSpec.builder_for(schema).truncate("i", 10).build(),
             PartitionSpec.builder_for(schema).truncate("l", 10).build(),
             PartitionSpec.builder_for(schema).truncate("dec", 10).build(),
             PartitionSpec.builder_for(schema).truncate("s", 10).build(),
             PartitionSpec.builder_for(schema).add_without_field_id(6, "dec_unsupported", "unsupported").build(),
             PartitionSpec.builder_for(schema).add(6, 1111, "dec_unsupported", "unsupported").build()]

    for spec in specs:
        self.assertEqual(spec, TestHelpers.round_trip_serialize(spec))
def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
    expected_schema = Schema([
        NestedField.optional(1, "list_int_col", ListType.of_optional(3, IntegerType.get())),
        NestedField.optional(4, "list_str_col", ListType.of_optional(6, StringType.get())),
        NestedField.optional(7, "struct_col",
                             StructType.of([NestedField.optional(8, "f1", IntegerType.get()),
                                            NestedField.optional(9, "f2", StringType.get())]))
    ])
    converted_schema = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
    compare_schema(expected_schema, converted_schema)
def test_literal_conversion(op, assert_and_unwrap):
    struct = StructType.of([NestedField.required(15, "d", DecimalType.of(9, 2))])
    unbound = UnboundPredicate(op, Expressions.ref("d"), "12.40")
    bound = assert_and_unwrap(unbound.bind(struct))

    # Binding converts the string literal "12.40" to a decimal(9, 2) value.
    assert Decimal(12.40).quantize(Decimal(".01")).as_tuple() == bound.lit.value.as_tuple()
    assert 15 == bound.ref.field.field_id
    assert op == bound.op
def test_comparison_predicate_binding(op, assert_and_unwrap):
    struct = StructType.of([NestedField.required(14, "x", IntegerType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("x"), 5)
    bound = assert_and_unwrap(unbound.bind(struct))

    assert 5 == bound.lit.value
    assert 14 == bound.ref.field.field_id
    assert op == bound.op
def test_missing_field():
    struct = StructType.of([NestedField.required(13, "x", IntegerType.get())])
    unbound = UnboundPredicate(Operation.LT, Expressions.ref("missing"), 6)

    # pytest.raises (rather than a bare try/except) ensures the test fails
    # if bind() does not raise.
    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith("Cannot find field 'missing' in struct")
def test_invalid_conversions(op):
    struct = StructType.of([NestedField.required(16, "f", FloatType.get())])
    unbound = UnboundPredicate(op, Expressions.ref("f"), "12.40")

    # pytest.raises ensures the test fails if bind() does not raise.
    with pytest.raises(ValidationException) as exc_info:
        unbound.bind(struct)
    assert exc_info.value.args[0].startswith(
        'Invalid Value for conversion to type float: "12.40" (StringLiteral)')
def struct_from_dict(dict_obj):
    struct_fields = list()
    fields = dict_obj.get(SchemaParser.FIELDS)
    for field in fields:
        field_id = field.get(SchemaParser.ID)
        field_name = field.get(SchemaParser.NAME)
        field_type = SchemaParser.type_from_dict(field.get(SchemaParser.TYPE))

        if field.get(SchemaParser.REQUIRED):
            struct_fields.append(NestedField.required(field_id, field_name, field_type))
        else:
            struct_fields.append(NestedField.optional(field_id, field_name, field_type))

    return StructType.of(struct_fields)
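# A minimal illustrative sketch (not part of the original code): the dict shape
# struct_from_dict expects, assuming SchemaParser.FIELDS, ID, NAME, TYPE, and
# REQUIRED map to the JSON keys "fields", "id", "name", "type", and "required"
# (an assumption based on the Iceberg schema JSON format; verify against the
# parser's actual constants).
#
#   struct_dict = {"type": "struct", "fields": [
#       {"id": 1, "name": "id", "type": "long", "required": True},
#       {"id": 2, "name": "data", "type": "string", "required": False}]}
#   struct_from_dict(struct_dict)
#   # -> StructType.of([NestedField.required(1, "id", LongType.get()),
#   #                   NestedField.optional(2, "data", StringType.get())])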
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)
    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    # Drop every column after the first two so the source matches the projected schema.
    num_cols = source_table.num_columns
    for i in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - i)

    assert source_table == reader.read()
def test_table_scan_honors_select_without_case_sensitivity(ts_table):
    scan1 = ts_table.new_scan().case_sensitive(False).select(["ID"])
    # order of refinements shouldn't matter
    scan2 = ts_table.new_scan().select(["ID"]).case_sensitive(False)

    expected_schema = Schema([NestedField.required(1, "id", IntegerType.get())])
    assert scan1.schema.as_struct() == expected_schema.as_struct()
    assert scan2.schema.as_struct() == expected_schema.as_struct()
def strict_schema():
    return Schema(NestedField.required(1, "id", IntegerType.get()),
                  NestedField.optional(2, "no_stats", IntegerType.get()),
                  NestedField.required(3, "required", StringType.get()),
                  NestedField.optional(4, "all_nulls", StringType.get()),
                  NestedField.optional(5, "some_nulls", StringType.get()),
                  NestedField.optional(6, "no_nulls", StringType.get()),
                  NestedField.required(7, "always_5", IntegerType.get()))
def test_compound_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.and_(Expressions.equal("string_col", "us"),
                                            Expressions.equal("int_col", 1)),
                           True)

    pyarrow_array = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string())
    ]
    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(), nullable=False),
                                pa.field("bigint_col", pa.int64(), nullable=True),
                                pa.field("float_col", pa.float32(), nullable=True),
                                pa.field("dbl_col", pa.float64(), nullable=True),
                                pa.field("string_col", pa.string(), nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
def test_double_to_float_conversion(assert_and_unwrap):
    struct = StructType.of([NestedField.required(18, "f", FloatType.get())])

    # Doubles beyond the float range fold to constant expressions on binding.
    lt = UnboundPredicate(Operation.LT, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * 2)
    assert lt.bind(struct) == Expressions.always_true()

    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * 2)
    assert lt_eq.bind(struct) == Expressions.always_true()

    gt = UnboundPredicate(Operation.GT, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * -2)
    assert gt.bind(struct) == Expressions.always_true()

    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * -2)
    assert gt_eq.bind(struct) == Expressions.always_true()

    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * 2)
    assert gt_max.bind(struct) == Expressions.always_false()

    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * 2)
    assert gt_eq_max.bind(struct) == Expressions.always_false()

    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * -2)
    assert lt_min.bind(struct) == Expressions.always_false()

    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT * -2)
    assert lt_eq_min.bind(struct) == Expressions.always_false()

    # In-range values survive binding with their original value.
    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_FLOAT

    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("f"), Literal.JAVA_MAX_FLOAT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_FLOAT

    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("f"), Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT

    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("f"), Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
def test_long_to_integer_conversion(assert_and_unwrap):
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])

    # Longs beyond the int range fold to constant expressions on binding.
    lt = UnboundPredicate(Operation.LT, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert lt.bind(struct) == Expressions.always_true()

    lt_eq = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert lt_eq.bind(struct) == Expressions.always_true()

    gt = UnboundPredicate(Operation.GT, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert gt.bind(struct) == Expressions.always_true()

    gt_eq = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert gt_eq.bind(struct) == Expressions.always_true()

    gt_max = UnboundPredicate(Operation.GT, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert gt_max.bind(struct) == Expressions.always_false()

    gt_eq_max = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"), Literal.JAVA_MAX_INT + 1)
    assert gt_eq_max.bind(struct) == Expressions.always_false()

    lt_min = UnboundPredicate(Operation.LT, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert lt_min.bind(struct) == Expressions.always_false()

    lt_eq_min = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"), Literal.JAVA_MIN_INT - 1)
    assert lt_eq_min.bind(struct) == Expressions.always_false()

    # In-range values survive binding with their original value.
    lt_expr = UnboundPredicate(Operation.LT, Expressions.ref("i"), Literal.JAVA_MAX_INT).bind(struct)
    lt_max = assert_and_unwrap(lt_expr)
    assert lt_max.lit.value == Literal.JAVA_MAX_INT

    lt_eq_expr = UnboundPredicate(Operation.LT_EQ, Expressions.ref("i"), Literal.JAVA_MAX_INT).bind(struct)
    lt_eq_max = assert_and_unwrap(lt_eq_expr)
    assert lt_eq_max.lit.value == Literal.JAVA_MAX_INT

    gt_expr = UnboundPredicate(Operation.GT, Expressions.ref("i"), Literal.JAVA_MIN_INT).bind(struct)
    gt_min = assert_and_unwrap(gt_expr)
    assert gt_min.lit.value == Literal.JAVA_MIN_INT

    gt_eq_expr = UnboundPredicate(Operation.GT_EQ, Expressions.ref("i"), Literal.JAVA_MIN_INT).bind(struct)
    gt_eq_min = assert_and_unwrap(gt_eq_expr)
    assert gt_eq_min.lit.value == Literal.JAVA_MIN_INT
def test_decimal_column_add(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    pyarrow_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array([None, None, None, None, None], type=pa.decimal128(38, 9))
    ]
    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([
                                pa.field("int_col", pa.int32(), nullable=False),
                                pa.field("bigint_col", pa.int64(), nullable=True),
                                pa.field("float_col", pa.float32(), nullable=True),
                                pa.field("dbl_col", pa.float64(), nullable=True),
                                pa.field("new_dec_col", pa.decimal128(38, 9), nullable=True)
                            ]))

    target_table = reader.read()
    assert source_table == target_table
def test_to_json_conversion():
    spec_schema = Schema(NestedField.required(1, "id", IntegerType.get()),
                         NestedField.required(2, "data", StringType.get()),
                         NestedField.required(3, "num", DecimalType.of(9, 2)))
    spec = PartitionSpec \
        .builder_for(spec_schema) \
        .identity("id") \
        .bucket("data", 16) \
        .add_without_field_id(2, "data1", "bucket[16]") \
        .add(2, 1010, "data2", "bucket[8]") \
        .bucket("num", 8) \
        .build()

    expected = '{"spec-id": 0, "fields": [' \
               '{"name": "id", "transform": "identity", "source-id": 1, "field-id": 1000}, ' \
               '{"name": "data_bucket", "transform": "bucket[16]", "source-id": 2, "field-id": 1001}, ' \
               '{"name": "data1", "transform": "bucket[16]", "source-id": 2, "field-id": 1002}, ' \
               '{"name": "data2", "transform": "bucket[8]", "source-id": 2, "field-id": 1010}, ' \
               '{"name": "num_bucket", "transform": "bucket[8]", "source-id": 3, "field-id": 1011}]}'
    assert expected == PartitionSpecParser.to_json(spec)
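# Reading the expected JSON above: auto-assigned partition field ids start at
# 1000 and increment ("id" -> 1000, "data_bucket" -> 1001, "data1" -> 1002);
# add(2, 1010, ...) pins an explicit id, and the next auto-assigned field
# ("num_bucket") continues from the highest id seen, yielding 1011.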
def test_nan_errors(row_of):
    # Placeholder until NaN support is fully implemented
    struct = StructType.of([NestedField.required(34, "f", FloatType.get())])

    evaluator = exp.evaluator.Evaluator(struct, exp.expressions.Expressions.is_nan("f"))
    with raises(NotImplementedError):
        evaluator.eval(row_of((123.4,)))

    evaluator = exp.evaluator.Evaluator(struct, exp.expressions.Expressions.not_nan("f"))
    with raises(NotImplementedError):
        evaluator.eval(row_of((123.4,)))
def test_schema_evolution_filter(primitive_type_test_file):
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.not_null("new_col"), True)

    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # "new_col" does not exist in the file, so a not_null filter matches no rows.
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.int64()),  # int64 to match the other_new_col schema field
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]
    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)

    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)

    target_table = reader.read()
    assert not_null_table == target_table

    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.is_null("new_col"), True)
    target_table = reader.read()
    assert null_table == target_table
def missing_spec_list():
    schema = Schema(NestedField.required(1, "x", LongType.get()),
                    NestedField.required(2, "y", LongType.get()),
                    NestedField.required(3, "z", LongType.get()))
    spec = PartitionSpec.builder_for(schema).identity("x").with_spec_id(6).build()

    random.seed(1234)
    previous_snapshot_id = int(time.time()) - random.randint(0, 3600)
    previous_snapshot = BaseSnapshot(
        ops, previous_snapshot_id, None, timestamp_millis=previous_snapshot_id,
        manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.1.avro"),
                                       spec_id=spec.spec_id)])

    current_snapshot_id = int(time.time())
    current_snapshot = BaseSnapshot(
        ops, current_snapshot_id, previous_snapshot_id, timestamp_millis=current_snapshot_id,
        manifests=[GenericManifestFile(file=Files.local_input("file:/tmp/manifest.2.avro"),
                                       spec_id=spec.spec_id)])

    return TableMetadata(ops, None, "s3://bucket/test/location", int(time.time()), 3,
                         schema, 6, (spec,), {"property": "value"}, current_snapshot_id,
                         [previous_snapshot, current_snapshot], [])
def test_column_upcast(primitive_type_test_file):
    # int_col is int32 in the file but long in the read schema; the reader upcasts it.
    expected_schema = Schema([NestedField.required(1, "int_col", LongType.get())])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.always_true(), True)

    # Build the expected table as int64 to match both the read schema and the
    # pa.schema below (pa.table does not cast arrays to the given schema).
    pyarrow_array = [pa.array([1, 2, 3, 4, 5], type=pa.int64())]
    source_table = pa.table(pyarrow_array,
                            schema=pa.schema([pa.field("int_col", pa.int64(), nullable=False)]))

    target_table = reader.read()
    assert source_table == target_table
def test_to_string_conversion():
    spec_schema = Schema(NestedField.required(1, "i", IntegerType.get()),
                         NestedField.required(2, "l", LongType.get()),
                         NestedField.required(3, "d", DateType.get()),
                         NestedField.required(4, "t", TimeType.get()),
                         NestedField.required(5, "ts", TimestampType.without_timezone()),
                         NestedField.required(6, "dec", DecimalType.of(9, 2)),
                         NestedField.required(7, "s", StringType.get()),
                         NestedField.required(8, "u", UUIDType.get()),
                         NestedField.required(9, "f", FixedType.of_length(3)),
                         NestedField.required(10, "b", BinaryType.get()))

    specs = [
        PartitionSpec.builder_for(spec_schema).identity("i").build(),
        PartitionSpec.builder_for(spec_schema).identity("l").build(),
        PartitionSpec.builder_for(spec_schema).identity("d").build(),
        PartitionSpec.builder_for(spec_schema).identity("t").build(),
        PartitionSpec.builder_for(spec_schema).identity("ts").build(),
        PartitionSpec.builder_for(spec_schema).identity("dec").build(),
        PartitionSpec.builder_for(spec_schema).identity("s").build(),
        PartitionSpec.builder_for(spec_schema).identity("u").build(),
        PartitionSpec.builder_for(spec_schema).identity("f").build(),
        PartitionSpec.builder_for(spec_schema).identity("b").build(),
        PartitionSpec.builder_for(spec_schema).bucket("i", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("l", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("d", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("t", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("ts", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("dec", 128).build(),
        PartitionSpec.builder_for(spec_schema).bucket("s", 128).build(),
        PartitionSpec.builder_for(spec_schema).year("d").build(),
        PartitionSpec.builder_for(spec_schema).month("d").build(),
        PartitionSpec.builder_for(spec_schema).day("d").build(),
        PartitionSpec.builder_for(spec_schema).year("ts").build(),
        PartitionSpec.builder_for(spec_schema).month("ts").build(),
        PartitionSpec.builder_for(spec_schema).day("ts").build(),
        PartitionSpec.builder_for(spec_schema).hour("ts").build(),
        PartitionSpec.builder_for(spec_schema).truncate("i", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("l", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("dec", 10).build(),
        PartitionSpec.builder_for(spec_schema).truncate("s", 10).build(),
        PartitionSpec.builder_for(spec_schema).add_without_field_id(6, "dec_bucket", "bucket[16]").build()
    ]

    expected_spec_strs = [
        "[\n i: identity(1)\n]",
        "[\n l: identity(2)\n]",
        "[\n d: identity(3)\n]",
        "[\n t: identity(4)\n]",
        "[\n ts: identity(5)\n]",
        "[\n dec: identity(6)\n]",
        "[\n s: identity(7)\n]",
        "[\n u: identity(8)\n]",
        "[\n f: identity(9)\n]",
        "[\n b: identity(10)\n]",
        "[\n i_bucket: bucket[128](1)\n]",
        "[\n l_bucket: bucket[128](2)\n]",
        "[\n d_bucket: bucket[128](3)\n]",
        "[\n t_bucket: bucket[128](4)\n]",
        "[\n ts_bucket: bucket[128](5)\n]",
        "[\n dec_bucket: bucket[128](6)\n]",
        "[\n s_bucket: bucket[128](7)\n]",
        "[\n d_year: year(3)\n]",
        "[\n d_month: month(3)\n]",
        "[\n d_day: day(3)\n]",
        "[\n ts_year: year(5)\n]",
        "[\n ts_month: month(5)\n]",
        "[\n ts_day: day(5)\n]",
        "[\n ts_hour: hour(5)\n]",
        "[\n i_truncate: truncate[10](1)\n]",
        "[\n l_truncate: truncate[10](2)\n]",
        "[\n dec_truncate: truncate[10](6)\n]",
        "[\n s_truncate: truncate[10](7)\n]",
        "[\n dec_bucket: bucket[16](6)\n]",
    ]

    for (spec, expected_spec_str) in zip(specs, expected_spec_strs):
        assert str(spec) == expected_spec_str
def get_type(partition_type):
    return StructType.of([
        NestedField.required(100, "file_path", StringType.get()),
        NestedField.required(101, "file_format", StringType.get()),
        NestedField.required(102, "partition", partition_type),
        NestedField.required(103, "record_count", LongType.get()),
        NestedField.required(104, "file_size_in_bytes", LongType.get()),
        NestedField.required(105, "block_size_in_bytes", LongType.get()),
        NestedField.optional(106, "file_ordinal", IntegerType.get()),
        NestedField.optional(107, "sort_columns", ListType.of_required(112, IntegerType.get())),
        NestedField.optional(108, "column_sizes",
                             MapType.of_required(117, 118, IntegerType.get(), LongType.get())),
        NestedField.optional(109, "value_counts",
                             MapType.of_required(119, 120, IntegerType.get(), LongType.get())),
        NestedField.optional(110, "null_value_counts",
                             MapType.of_required(121, 122, IntegerType.get(), LongType.get())),
        NestedField.optional(125, "lower_bounds",
                             MapType.of_required(126, 127, IntegerType.get(), BinaryType.get())),
        NestedField.optional(128, "upper_bounds",
                             MapType.of_required(129, 130, IntegerType.get(), BinaryType.get()))
        # NEXT ID TO ASSIGN: 131
    ])
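# A small usage sketch (not part of the original code): the partition struct is
# caller-supplied, so an unpartitioned table passes an empty struct.
#
#   unpartitioned = get_type(StructType.of([]))
#   # fields[2] is still the required "partition" field (id 102); its type is
#   # simply a struct with no fields.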
def test_char_seq_value(row_of):
    struct = StructType.of([NestedField.required(34, "s", StringType.get())])
    evaluator = exp.evaluator.Evaluator(struct, exp.expressions.Expressions.equal("s", "abc"))

    assert evaluator.eval(row_of(("abc",)))
    assert not evaluator.eval(row_of(("abcd",)))