def iterator(self, part_filter=None, row_filter=None, columns=None):
    """Return an iterator of filtered manifests for this snapshot.

    When no filter or projection is supplied, recurse once with
    match-everything defaults so every manifest is selected.
    """
    if part_filter is None and row_filter is None and columns is None:
        return self.iterator(Expressions.always_true(),
                             Expressions.always_true(),
                             Filterable.ALL_COLUMNS)

    filtered = [self.get_filtered_manifest(manifest_path, part_filter, row_filter, columns)
                for manifest_path in self._manifest_files]
    return iter(filtered)
def test_double_to_float_conversion(assert_and_unwrap):
    """Binding double literals against a float column folds out-of-range
    values into constant expressions and keeps boundary values intact."""
    struct = StructType.of([NestedField.required(18, "f", FloatType.get())])
    above_float_range = Literal.JAVA_MAX_FLOAT * 2
    below_float_range = Literal.JAVA_MAX_FLOAT * -2

    # Comparisons against values outside the float range collapse to constants.
    for op, lit, expected in [
        (Operation.LT, above_float_range, Expressions.always_true()),
        (Operation.LT_EQ, above_float_range, Expressions.always_true()),
        (Operation.GT, below_float_range, Expressions.always_true()),
        (Operation.GT_EQ, below_float_range, Expressions.always_true()),
        (Operation.GT, above_float_range, Expressions.always_false()),
        (Operation.GT_EQ, above_float_range, Expressions.always_false()),
        (Operation.LT, below_float_range, Expressions.always_false()),
        (Operation.LT_EQ, below_float_range, Expressions.always_false()),
    ]:
        assert UnboundPredicate(op, Expressions.ref("f"), lit).bind(struct) == expected

    # Boundary values bind to real predicates with the literal preserved.
    # NOTE(review): the lower bound uses JAVA_MIN_INT rather than a float
    # minimum, mirroring the original test — confirm this is intentional.
    for op, lit in [
        (Operation.LT, Literal.JAVA_MAX_FLOAT),
        (Operation.LT_EQ, Literal.JAVA_MAX_FLOAT),
        (Operation.GT, Literal.JAVA_MIN_INT),
        (Operation.GT_EQ, Literal.JAVA_MIN_INT),
    ]:
        bound = assert_and_unwrap(
            UnboundPredicate(op, Expressions.ref("f"), lit).bind(struct))
        assert bound.lit.value == lit
def test_long_to_integer_conversion(assert_and_unwrap):
    """Binding long literals against an int column folds out-of-range
    values into constant expressions and keeps boundary values intact."""
    struct = StructType.of([NestedField.required(17, "i", IntegerType.get())])
    above_int_range = Literal.JAVA_MAX_INT + 1
    below_int_range = Literal.JAVA_MIN_INT - 1

    # Comparisons against values outside the int range collapse to constants.
    for op, lit, expected in [
        (Operation.LT, above_int_range, Expressions.always_true()),
        (Operation.LT_EQ, above_int_range, Expressions.always_true()),
        (Operation.GT, below_int_range, Expressions.always_true()),
        (Operation.GT_EQ, below_int_range, Expressions.always_true()),
        (Operation.GT, above_int_range, Expressions.always_false()),
        (Operation.GT_EQ, above_int_range, Expressions.always_false()),
        (Operation.LT, below_int_range, Expressions.always_false()),
        (Operation.LT_EQ, below_int_range, Expressions.always_false()),
    ]:
        assert UnboundPredicate(op, Expressions.ref("i"), lit).bind(struct) == expected

    # Exact boundary values bind to real predicates with the literal preserved.
    for op, lit in [
        (Operation.LT, Literal.JAVA_MAX_INT),
        (Operation.LT_EQ, Literal.JAVA_MAX_INT),
        (Operation.GT, Literal.JAVA_MIN_INT),
        (Operation.GT_EQ, Literal.JAVA_MIN_INT),
    ]:
        bound = assert_and_unwrap(
            UnboundPredicate(op, Expressions.ref("i"), lit).bind(struct))
        assert bound.lit.value == lit
def all_entries(self):
    """Return all manifest entries, applying partition and metrics filters
    only when a non-trivial filter was configured."""
    row_filtering = (self.row_filter is not None
                     and self.row_filter != Expressions.always_true())
    part_filtering = (self.part_filter is not None
                      and self.part_filter != Expressions.always_true())

    if not (row_filtering or part_filtering):
        # No effective filter: hand back the reader's entries untouched.
        return self.reader.entries(self.columns)

    evaluator = self.evaluator()
    metrics_evaluator = self.metrics_evaluator()
    return [entry
            for entry in self.reader.entries(self.columns)
            if entry is not None
            and evaluator.eval(entry.file.partition())
            and metrics_evaluator.eval(entry.file)]
def iterator(self):
    """Yield copies of data-file entries, filtering by partition and column
    metrics only when a non-trivial filter was configured."""
    trivial_row = (self.row_filter is None
                   or self.row_filter == Expressions.always_true())
    trivial_part = (self.part_filter is None
                    or self.part_filter == Expressions.always_true())

    if trivial_row and trivial_part:
        # No effective filter: copy everything the reader yields.
        return (entry.copy()
                for entry in self.reader.iterator(self.part_filter, self.columns))

    evaluator = self.evaluator()
    metrics_evaluator = self.metrics_evaluator()
    return (candidate.copy()
            for candidate in self.reader.iterator(self.part_filter, self.columns)
            if candidate is not None
            and evaluator.eval(candidate.partition())
            and metrics_evaluator.eval(candidate))
def iterator(self, part_filter=None, columns=None):
    """Iterate the live (non-deleted) data files in this manifest.

    With no arguments, recurse once with match-everything defaults.
    NOTE(review): part_filter/columns are not applied in the body itself —
    presumably filtering happens upstream of self.entries; confirm.
    """
    if part_filter is None and columns is None:
        return self.iterator(Expressions.always_true(), Filterable.ALL_COLUMNS)
    return (entry.file
            for entry in self.entries
            if entry.status != Status.DELETED)
def test_column_rename(primitive_type_test_file):
    """Renamed columns in the expected schema are matched to the file's
    columns by field id, so reading succeeds with the new names."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(3, "string_col", StringType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get())
    ])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    # (name, arrow type, nullable, values) for the expected arrow table.
    columns = [
        ("int_col", pa.int32(), False, [1, 2, 3, 4, 5]),
        ("bigint_col", pa.int64(), True, [1, 2, 3, None, 5]),
        ("string_col", pa.string(), True, ['us', 'can', 'us', 'us', 'can']),
        ("float_col", pa.float32(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("dbl_col", pa.float64(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
    ]
    arrays = [pa.array(values, type=arrow_type)
              for _, arrow_type, _, values in columns]
    arrow_schema = pa.schema([pa.field(name, arrow_type, nullable)
                              for name, arrow_type, nullable, _ in columns])
    source_table = pa.table(arrays, schema=arrow_schema)

    assert source_table == reader.read()
def __init__(self, ops, table, snapshot_id=None, columns=None, row_filter=None):
    """Initialize a scan over *table*.

    Only when BOTH columns and row_filter are omitted do we fall back to
    the scan-everything defaults; supplying either one leaves the other
    as passed (possibly None).
    """
    self.ops = ops
    self.table = table
    self.snapshot_id = snapshot_id
    if columns is None and row_filter is None:
        columns = Filterable.ALL_COLUMNS
        row_filter = Expressions.always_true()
    self.columns = columns
    self.row_filter = row_filter
def test_not_null(assert_and_unwrap):
    """NOT_NULL binds to a unary predicate on an optional field and folds
    to always_true on a required (never-null) field."""
    unbound = UnboundPredicate(Operation.NOT_NULL, Expressions.ref("s"))

    optional = StructType.of([NestedField.optional(21, "s", StringType.get())])
    bound = assert_and_unwrap(unbound.bind(optional))
    assert bound.op == Operation.NOT_NULL
    assert bound.ref.field.field_id == 21
    assert bound.lit is None  # unary predicate carries no literal

    required = StructType.of([NestedField.required(22, "s", StringType.get())])
    assert unbound.bind(required) == Expressions.always_true()
def metrics_evaluator(self):
    """Lazily build and cache the inclusive metrics evaluator.

    Falls back to a match-everything filter when no row filter was set.
    """
    if self.lazy_metrics_evaluator is None:
        effective_filter = (self.row_filter
                            if self.row_filter is not None
                            else Expressions.always_true())
        self.lazy_metrics_evaluator = InclusiveMetricsEvaluator(
            self.reader.spec.schema, effective_filter, self.case_sensitive)
    return self.lazy_metrics_evaluator
def evaluator(self):
    """Lazily build and cache the partition evaluator.

    Falls back to a match-everything filter when no partition filter was set.
    """
    if self.lazy_evaluator is None:
        effective_filter = (self.part_filter
                            if self.part_filter is not None
                            else Expressions.always_true())
        self.lazy_evaluator = Evaluator(
            self.reader.spec.partition_type(), effective_filter, self.case_sensitive)
    return self.lazy_evaluator
def test_column_upcast(primitive_type_test_file):
    """An int32 file column reads back as int64 when the expected schema
    declares the field as LongType."""
    expected_schema = Schema([NestedField.required(1, "int_col", LongType.get())])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    arrow_schema = pa.schema([pa.field("int_col", pa.int64(), nullable=False)])
    source_table = pa.table([pa.array([1, 2, 3, 4, 5], type=pa.int32())],
                            schema=arrow_schema)

    assert reader.read() == source_table
def test_basic_simplification(assert_and_unwrap):
    """Binder folds trivially-true/false subtrees and cancels double negation."""
    # OR with an always-satisfiable comparison simplifies to always_true.
    or_expr = Expressions.or_(Expressions.less_than("y", 100),
                              Expressions.greater_than("z", -9999999999))
    assert Binder.bind(STRUCT, or_expr) == Expressions.always_true()

    # AND with an unsatisfiable comparison simplifies to always_false.
    and_expr = Expressions.and_(Expressions.less_than("y", 100),
                                Expressions.less_than("z", -9999999999))
    assert Binder.bind(STRUCT, and_expr) == Expressions.always_false()

    # NOT(NOT(p)) binds to p itself.
    double_not = Expressions.not_(Expressions.not_(Expressions.less_than("y", 100)))
    pred = assert_and_unwrap(Binder.bind(STRUCT, double_not), None)
    assert pred.ref.field_id == 1
def test_projection(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Reading with a two-column expected schema projects the file down to
    just those leading columns."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get())
    ])
    fs = get_fs(primitive_type_test_file, conf={})
    input_file = FileSystemInputFile(fs, primitive_type_test_file, {})
    reader = ParquetReader(input_file, expected_schema, {},
                           Expressions.always_true(), True)

    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)
    # Drop every column after the first two, removing from the end inward.
    num_cols = source_table.num_columns
    for offset in range(1, num_cols - 1):
        source_table = source_table.remove_column(num_cols - offset)

    assert source_table == reader.read()
def __init__(self, ops, table, schema, snapshot_id=None, columns=None,
             row_filter=None, case_sensitive=True, selected_columns=None,
             options=None, minused_cols=None):
    """Initialize a scan over *table* with *schema*.

    Only when BOTH columns and row_filter are omitted do we fall back to
    the scan-everything defaults; supplying either one leaves the other
    as passed (possibly None).
    """
    self.ops = ops
    self.table = table
    self._schema = schema
    self.snapshot_id = snapshot_id
    self._case_sensitive = case_sensitive
    self.selected_columns = selected_columns
    # Falsy (None or empty) minused_cols is replaced by a fresh list,
    # matching the original `or` semantics.
    self.minused_cols = minused_cols if minused_cols else list()
    self.options = options if options is not None else dict()
    if columns is None and row_filter is None:
        columns = Filterable.ALL_COLUMNS
        row_filter = Expressions.always_true()
    self.columns = columns
    self._row_filter = row_filter
    self._stats = dict()
def test_decimal_column_add(primitive_type_test_file):
    """A decimal column newly added to the expected schema reads back as
    all nulls alongside the existing columns."""
    expected_schema = Schema([
        NestedField.required(1, "int_col", IntegerType.get()),
        NestedField.optional(2, "bigint_col", LongType.get()),
        NestedField.optional(4, "float_col", FloatType.get()),
        NestedField.optional(5, "dbl_col", DoubleType.get()),
        NestedField.optional(13, "new_dec_col", DecimalType.of(38, 9))
    ])
    fs = get_fs(primitive_type_test_file, conf={})
    reader = ParquetReader(FileSystemInputFile(fs, primitive_type_test_file, {}),
                           expected_schema, {}, Expressions.always_true(), True)

    # (name, arrow type, nullable, values) for the expected arrow table.
    columns = [
        ("int_col", pa.int32(), False, [1, 2, 3, 4, 5]),
        ("bigint_col", pa.int64(), True, [1, 2, 3, None, 5]),
        ("float_col", pa.float32(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("dbl_col", pa.float64(), True, [1.0, 2.0, 3.0, 4.0, 5.0]),
        ("new_dec_col", pa.decimal128(38, 9), True, [None, None, None, None, None]),
    ]
    source_table = pa.table(
        [pa.array(values, type=arrow_type) for _, arrow_type, _, values in columns],
        schema=pa.schema([pa.field(name, arrow_type, nullable=nullable)
                          for name, arrow_type, nullable, _ in columns]))

    assert reader.read() == source_table
def test_basic_read(primitive_type_test_file, pyarrow_primitive_array, pyarrow_schema):
    """Reading with the full matching schema round-trips every column."""
    optional_fields = [
        (2, "bigint_col", LongType.get()),
        (3, "str_col", StringType.get()),
        (4, "float_col", FloatType.get()),
        (5, "dbl_col", DoubleType.get()),
        (6, "decimal_col", DecimalType.of(9, 2)),
        (7, "big_decimal_col", DecimalType.of(19, 5)),
        (8, "huge_decimal_col", DecimalType.of(38, 9)),
        (9, "date_col", DateType.get()),
        (10, "ts_col", TimestampType.without_timezone()),
        (11, "ts_wtz_col", TimestampType.with_timezone()),
        (12, "bool_col", BooleanType.get()),
    ]
    expected_schema = Schema(
        [NestedField.required(1, "int_col", IntegerType.get())]
        + [NestedField.optional(field_id, name, ice_type)
           for field_id, name, ice_type in optional_fields])

    fs = get_fs(primitive_type_test_file, conf={})
    reader = ParquetReader(FileSystemInputFile(fs, primitive_type_test_file, {}),
                           expected_schema, {}, Expressions.always_true(), True)
    source_table = pa.table(pyarrow_primitive_array, schema=pyarrow_schema)

    assert reader.read() == source_table
def test_always_true():
    """Binding always_true is a no-op: it comes back unchanged."""
    bound = Binder.bind(STRUCT, Expressions.always_true())
    assert bound == Expressions.always_true()
return lambda x: TestHelpers.Row.of(x) @pytest.fixture(scope="session", params=[ Operation.LT, Operation.LT_EQ, Operation.GT, Operation.GT_EQ, Operation.EQ, Operation.NOT_EQ ]) def op(request): yield request.param @pytest.fixture(scope="session", params=[ Expressions.always_false(), Expressions.always_true(), Expressions.less_than("x", 5), Expressions.less_than_or_equal("y", -3), Expressions.greater_than("z", 0), Expressions.greater_than_or_equal("t", 129), Expressions.equal("col", "data"), Expressions.not_equal("col", "abc"), Expressions.not_null("maybeNull"), Expressions.is_null("maybeNull2"), Expressions.not_(Expressions.greater_than("a", 10)), Expressions.and_(Expressions.greater_than_or_equal("a", 0), Expressions.less_than("a", 3)), Expressions.or_(Expressions.less_than("a", 0), Expressions.greater_than("a", 10)), Expressions.equal("a", 5).bind(exp_schema.as_struct()) ])
def filter_partitions(self, expr):
    """Restrict this manifest to partitions matching *expr*; rows and
    columns are left unfiltered."""
    no_row_filter = Expressions.always_true()
    return FilteredManifest(self, expr, no_row_filter,
                            ManifestReader.ALL_COLUMNS, self.case_sensitive)
def select(self, columns):
    """Project this manifest onto *columns*, with no partition or row filtering."""
    selected = list(columns)
    return FilteredManifest(self, Expressions.always_true(),
                            Expressions.always_true(), selected,
                            self.case_sensitive)
def select(self, columns):
    """Project this snapshot onto *columns*, with no partition or row filtering."""
    return FilteredSnapshot(self,
                            Expressions.always_true(),
                            Expressions.always_true(),
                            columns)
def join_filters(expressions):
    """AND together all filter expressions; an empty sequence yields always_true."""
    combined = Expressions.always_true()
    for expr in expressions:
        combined = Expressions.and_(combined, expr)
    return combined
def filter_rows(self, expr):
    """Filter this snapshot's rows by *expr*, keeping all partitions and columns."""
    return FilteredSnapshot(self,
                            Expressions.always_true(),
                            expr,
                            Filterable.ALL_COLUMNS)