예제 #1
0
    def test_preserves_index_output_partitioning(self):
        """Check output partitioning for an expression preserving Index([0, 1]).

        The expression under test is declared with
        ``preserves_partition_by=Index([0, 1])``. The test asserts which input
        partitionings ``expressions.output_partitioning`` returns unchanged and
        which ones it reports as ``Arbitrary()``.
        """
        # Proxy input: an empty frame with two columns and a two-level index.
        empty_frame = pd.DataFrame(columns=["foo", "bar"], index=[[], []])
        source = expressions.ConstantExpression(empty_frame)

        # Appending 'foo' to the index adds one more index level, so only
        # partitioning on the two levels that existed beforehand is preserved.
        computed = expressions.ComputedExpression(
            'preserves_partial_index',
            lambda df: df.set_index('foo', append=True),
            [source],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Index([0, 1]))

        # Partitionings expected to pass through untouched.
        kept = (
            partitionings.Singleton(),
            partitionings.Index([0]),
            partitionings.Index([1]),
            partitionings.Index([0, 1]),
        )
        for candidate in kept:
            actual = expressions.output_partitioning(computed, candidate)
            self.assertEqual(actual, candidate,
                             f"Should preserve {candidate}")

        # Partitionings expected to come back as Arbitrary().
        dropped = (
            partitionings.Index([0, 1, 2]),
            partitionings.Index(),
            partitionings.Arbitrary(),
        )
        for candidate in dropped:
            actual = expressions.output_partitioning(computed, candidate)
            self.assertEqual(actual, partitionings.Arbitrary(),
                             f"Should NOT preserve {candidate}")
예제 #2
0
    def test_preserves_singleton_output_partitioning(self):
        """Check output partitioning for an expression preserving only Singleton.

        The expression under test is declared with
        ``preserves_partition_by=Singleton()``. The test asserts that a
        ``Singleton()`` input partitioning is returned unchanged by
        ``expressions.output_partitioning`` while the other listed
        partitionings are reported as ``Arbitrary()``.
        """
        # Proxy input: an empty frame with one column and a two-level index.
        empty_frame = pd.DataFrame(columns=["column"], index=[[], []])
        source = expressions.ConstantExpression(empty_frame)

        # set_index without append swaps in a brand-new index, so any prior
        # Index partitioning no longer holds afterwards.
        computed = expressions.ComputedExpression(
            'preserves_only_singleton',
            lambda df: df.set_index('column'),
            [source],
            requires_partition_by=partitionings.Arbitrary(),
            preserves_partition_by=partitionings.Singleton())

        # Partitionings expected to pass through untouched.
        kept = (partitionings.Singleton(), )
        for candidate in kept:
            actual = expressions.output_partitioning(computed, candidate)
            self.assertEqual(actual, candidate,
                             f"Should preserve {candidate}")

        # Partitionings expected to come back as Arbitrary().
        dropped = (
            partitionings.Index([0]),
            partitionings.Index(),
            partitionings.Arbitrary(),
        )
        for candidate in dropped:
            actual = expressions.output_partitioning(computed, candidate)
            self.assertEqual(actual, partitionings.Arbitrary(),
                             f"Should NOT preserve {candidate}")
예제 #3
0
        def output_partitioning_in_stage(expr, stage):
            """Return the output partitioning of expr when computed in stage.

            Returns None when the expression cannot be computed in this
            stage: either its non-scalar arguments are not all partitioned
            identically, or their partitioning does not satisfy what expr
            requires.
            """
            # Inputs are all partitioned by stage.partitioning.
            if expr in stage.inputs or expr in inputs:
                return stage.partitioning

            # Anything that's not an input must have arguments.
            assert len(expr.args()) > 0

            # Distinct partitionings produced by the non-scalar arguments.
            distinct = {
                output_partitioning_in_stage(operand, stage)
                for operand in expr.args()
                if not is_scalar(operand)
            }

            if not distinct:
                # Every argument is a scalar, so the output partitioning
                # does not depend on the input.
                return expr.preserves_partition_by()

            if len(distinct) != 1:
                # Arguments must be identically partitioned; this expression
                # can't be computed here.
                return None

            (shared,) = distinct

            required = expr.requires_partition_by()
            if required.is_subpartitioning_of(shared):
                return expressions.output_partitioning(expr, shared)

            # Arguments aren't partitioned sufficiently for this expression.
            return None