def test_preserves_index_output_partitioning(self):
    """Check output partitioning when an index level is appended.

    The expression under test declares that it preserves ``Index([0, 1])``.
    Singleton and Index partitionings over levels 0 and/or 1 are expected
    to come back unchanged; every other partitioning tried here is
    expected to come back as ``Arbitrary``.
    """
    # Empty DataFrame with two columns and two index levels.
    empty_frame = pd.DataFrame(columns=["foo", "bar"], index=[[], []])
    source = expressions.ConstantExpression(empty_frame)

    def append_foo_level(df):
        # Appending adds one more index level, so only partitioning on the
        # two index levels that existed beforehand can be preserved.
        return df.set_index('foo', append=True)

    expr = expressions.ComputedExpression(
        'preserves_partial_index',
        append_foo_level,
        [source],
        requires_partition_by=partitionings.Arbitrary(),
        preserves_partition_by=partitionings.Index([0, 1]))

    should_survive = (
        partitionings.Singleton(),
        partitionings.Index([0]),
        partitionings.Index([1]),
        partitionings.Index([0, 1]),
    )
    for candidate in should_survive:
        actual = expressions.output_partitioning(expr, candidate)
        self.assertEqual(actual, candidate, f"Should preserve {candidate}")

    should_be_lost = (
        partitionings.Index([0, 1, 2]),
        partitionings.Index(),
        partitionings.Arbitrary(),
    )
    for candidate in should_be_lost:
        actual = expressions.output_partitioning(expr, candidate)
        self.assertEqual(
            actual,
            partitionings.Arbitrary(),
            f"Should NOT preserve {candidate}")
def test_preserves_singleton_output_partitioning(self):
    """Check output partitioning when the index is replaced outright.

    The expression under test declares that it preserves only
    ``Singleton``. A Singleton input is expected to come back unchanged;
    the Index and Arbitrary inputs tried here are expected to come back
    as ``Arbitrary``.
    """
    # Empty DataFrame with one column and two index levels.
    empty_frame = pd.DataFrame(columns=["column"], index=[[], []])
    source = expressions.ConstantExpression(empty_frame)

    def replace_index(df):
        # The index is swapped for an entirely new one, so data that was
        # partitioned by Index is no longer partitioned that way.
        return df.set_index('column')

    expr = expressions.ComputedExpression(
        'preserves_only_singleton',
        replace_index,
        [source],
        requires_partition_by=partitionings.Arbitrary(),
        preserves_partition_by=partitionings.Singleton())

    # Only one partitioning is expected to survive, so no loop is needed.
    singleton = partitionings.Singleton()
    self.assertEqual(
        expressions.output_partitioning(expr, singleton),
        singleton,
        f"Should preserve {singleton}")

    should_be_lost = (
        partitionings.Index([0]),
        partitionings.Index(),
        partitionings.Arbitrary(),
    )
    for candidate in should_be_lost:
        actual = expressions.output_partitioning(expr, candidate)
        self.assertEqual(
            actual,
            partitionings.Arbitrary(),
            f"Should NOT preserve {candidate}")
def output_partitioning_in_stage(expr, stage):
    """Return the output partitioning of expr when computed in stage.

    Returns None if the expression cannot be computed in this stage. That
    includes the case where a non-scalar argument cannot itself be computed
    in this stage (its recursive result is None).

    Args:
        expr: The expression whose output partitioning is wanted.
        stage: The candidate stage; its ``inputs`` and ``partitioning``
            attributes are read.

    Returns:
        A partitioning object, or None when expr is not computable here.
    """
    # NOTE(review): `inputs` is a free name resolved from the enclosing
    # scope, which is not visible in this block.
    if expr in stage.inputs or expr in inputs:
        # Inputs are all partitioned by stage.partitioning.
        return stage.partitioning

    # Anything that's not an input must have arguments.
    assert len(expr.args())

    arg_partitionings = {
        output_partitioning_in_stage(arg, stage)
        for arg in expr.args()
        if not is_scalar(arg)
    }

    if not arg_partitionings:
        # All inputs are scalars, output partitioning isn't dependent on the
        # input.
        return expr.preserves_partition_by()

    if len(arg_partitionings) > 1:
        # Arguments must be identically partitioned, can't compute this
        # expression here.
        return None

    arg_partitioning = arg_partitionings.pop()

    if arg_partitioning is None:
        # The argument(s) cannot be computed in this stage, so neither can
        # this expression. None is a "not computable" marker, not a
        # partitioning, so it must not reach the partitioning checks below.
        return None

    if not expr.requires_partition_by().is_subpartitioning_of(
            arg_partitioning):
        # Arguments aren't partitioned sufficiently for this expression.
        return None

    return expressions.output_partitioning(expr, arg_partitioning)