def test_schema_evolution_filter(primitive_type_test_file):
    """Check ``not_null`` / ``is_null`` filters on a column expected to be all nulls.

    The projection asks for ``new_col`` (id 15) and ``other_new_col`` (id 16)
    next to five other columns.  The expectations below treat both of those
    columns as entirely null, so filtering on ``not_null("new_col")`` must
    yield an empty table and ``is_null("new_col")`` must yield all five rows.
    """
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(16, "other_new_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(15, "new_col", StringType.get())
    ])
    input_file = FileSystemInputFile(get_fs(primitive_type_test_file, conf={}),
                                     primitive_type_test_file, {})

    # Arrow schema shared by both expected tables.
    schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("other_new_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
        pa.field("new_col", pa.string(), nullable=True)
    ])

    # --- not_null("new_col"): expected to match no rows ---------------------
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.not_null("new_col"), True)
    pyarrow_not_null_array = [
        pa.array([], type=pa.int32()),
        pa.array([], type=pa.int64()),
        # "other_new_col" is declared int64 in ``schema``; build the empty
        # array with that same type instead of int32 so the data agrees with
        # the schema without relying on an implicit conversion.
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.float32()),
        pa.array([], type=pa.float64()),
        pa.array([], type=pa.string()),
        pa.array([], type=pa.string())
    ]
    not_null_table = pa.table(pyarrow_not_null_array, schema=schema)
    target_table = reader.read()
    assert not_null_table == target_table

    # --- is_null("new_col"): expected to match every row --------------------
    reader = ParquetReader(input_file, expected_schema, {}, Expressions.is_null("new_col"), True)
    pyarrow_null_array = [
        pa.array([1, 2, 3, 4, 5], type=pa.int32()),
        pa.array([1, 2, 3, None, 5], type=pa.int64()),
        pa.array([None, None, None, None, None], type=pa.int64()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['us', 'can', 'us', 'us', 'can'], type=pa.string()),
        pa.array([None, None, None, None, None], type=pa.string())
    ]
    null_table = pa.table(pyarrow_null_array, schema=schema)
    target_table = reader.read()
    assert null_table == target_table
def test_column_rename(primitive_type_test_file):
    """Read five columns, with field id 3 requested under the name ``string_col``.

    NOTE(review): presumably the fixture file stores field 3 under a different
    name and the reader resolves it by id -- confirm against the fixture.
    """
    iceberg_fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
    ]
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, Schema(iceberg_fields), {}, Expressions.always_true(), True)

    # One entry per expected column: (name, arrow type, nullable, values).
    column_specs = [
        ("int_col", pa.int32(), False, [1, 2, 3, 4, 5]),
        ("bigint_col", pa.int64(), True, [1, 2, 3, None, 5]),
        ("string_col", pa.string(), True, ['us', 'can', 'us', 'us', 'can']),
        ("float_col", pa.float32(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("dbl_col", pa.float64(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
    ]
    arrow_arrays = [pa.array(values, type=dtype) for _, dtype, _, values in column_specs]
    arrow_schema = pa.schema([pa.field(name, dtype, nullable)
                              for name, dtype, nullable, _ in column_specs])
    expected_table = pa.table(arrow_arrays, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def test_column_upcast(primitive_type_test_file):
    """Read ``int_col`` with a ``LongType`` projection and expect an int64 column."""
    long_projection = Schema([NestedField.required(1, "int_col", LongType.get())])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, long_projection, {}, Expressions.always_true(), True)

    # NOTE(review): the values are built as int32 while the schema declares
    # int64; this relies on pa.table reconciling the two types -- confirm.
    int32_values = pa.array([1, 2, 3, 4, 5], type=pa.int32())
    int64_schema = pa.schema([pa.field("int_col", pa.int64(), nullable=False)])
    expected_table = pa.table([int32_values], schema=int64_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Project ``int_col`` and ``bigint_col`` and compare with the trimmed fixture table.

    The expected table is built from the full fixture data and then stripped
    of every column except the ones at index 0 and 1.
    """
    projected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
    ])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, projected_schema, {}, Expressions.always_true(), True)

    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    # Remove columns from the last index down to index 2; going backwards keeps
    # the remaining indices valid after each removal.
    for column_index in range(expected_table.num_columns - 1, 1, -1):
        expected_table = expected_table.remove_column(column_index)

    assert expected_table == reader.read()
def test_compound_filter(primitive_type_test_file):
    """Filter on ``string_col == "us" AND int_col == 1`` and expect a single row."""
    projection = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
    ])
    row_filter = Expressions.and_(Expressions.equal("string_col", "us"),
                                  Expressions.equal("int_col", 1))

    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, projection, {}, row_filter, True)

    # The one row expected to satisfy both predicates.
    expected_arrays = [
        pa.array([1], type=pa.int32()),
        pa.array([1], type=pa.int64()),
        pa.array([1.0], type=pa.float32()),
        pa.array([1.0], type=pa.float64()),
        pa.array(['us'], type=pa.string()),
    ]
    expected_arrow_schema = pa.schema([
        pa.field("int_col", pa.int32(), nullable=False),
        pa.field("bigint_col", pa.int64(), nullable=True),
        pa.field("float_col", pa.float32(), nullable=True),
        pa.field("dbl_col", pa.float64(), nullable=True),
        pa.field("string_col", pa.string(), nullable=True),
    ])
    expected_table = pa.table(expected_arrays, schema=expected_arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def test_decimal_column_add(primitive_type_test_file):
    """Project an extra ``decimal(38, 9)`` column and expect it to read back as all nulls."""
    projection = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9)),
    ])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, projection, {}, Expressions.always_true(), True)

    # One entry per expected column: (name, arrow type, nullable, values).
    column_specs = [
        ("int_col", pa.int32(), False, [1, 2, 3, 4, 5]),
        ("bigint_col", pa.int64(), True, [1, 2, 3, None, 5]),
        ("float_col", pa.float32(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("dbl_col", pa.float64(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("new_dec_col", pa.decimal128(38, 9), True, [None, None, None, None, None]),
    ]
    arrow_arrays = [pa.array(values, type=dtype) for _, dtype, _, values in column_specs]
    arrow_schema = pa.schema([pa.field(name, dtype, nullable=nullable)
                              for name, dtype, nullable, _ in column_specs])
    expected_table = pa.table(arrow_arrays, schema=arrow_schema)

    actual_table = reader.read()
    assert expected_table == actual_table
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Read all twelve projected columns unfiltered and compare with the fixture table."""
    all_fields = [
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "str_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
        NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
        NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
        NestedField.optional(9, "date_col", DateType.get()),
        NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
        NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
        NestedField.optional(12, "bool_col", BooleanType.get()),
    ]
    full_schema = Schema(all_fields)

    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, full_schema, {}, Expressions.always_true(), True)

    # Expected contents come straight from the pyarrow fixtures.
    expected_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    actual_table = reader.read()
    assert actual_table == expected_table