def test_only_caches_same_input(self):
    """A cached input feeding two computations maps to a single placeholder."""
    arg_0_expr = expressions.ConstantExpression(0)
    ident_val = expressions.ComputedExpression(
        'ident', lambda x: x, [arg_0_expr])
    comp_expr = expressions.ComputedExpression(
        'add', lambda x, y: x + y, [ident_val, arg_0_expr])

    self.mock_cache(arg_0_expr)
    replaced = self.cache.replace_with_cached(comp_expr)

    # Verify the mapping contains the cached id *before* indexing into it
    # below, so a missing entry is reported as an assertion failure rather
    # than surfacing as a KeyError.
    self.assertIn(arg_0_expr._id, replaced)

    # Assert that arg_0_expr, being an input to two computations, was replaced
    # with the same placeholder expression.
    expected_trace = [
        expressions.ComputedExpression,
        expressions.ComputedExpression,
        expressions.PlaceholderExpression,
        expressions.PlaceholderExpression,
    ]
    actual_trace = self.create_trace(comp_expr)
    unique_placeholders = {
        t for t in actual_trace
        if isinstance(t, expressions.PlaceholderExpression)
    }
    self.assertTraceTypes(comp_expr, expected_trace)
    self.assertTrue(
        all(e == replaced[arg_0_expr._id] for e in unique_placeholders))
def dot(self, other):
    """Return a deferred frame computing ``self @ other``.

    ``other`` may be a deferred frame or a concrete, indexable operand. The
    right-hand side is handed to the 'dot' expression as a single boxed value.
    """

    # We want to broadcast the right hand side to all partitions of the left.
    # This is OK, as its index must be the same size as the columns set of
    # self, so cannot be too large.
    class AsScalar(object):
        # Plain box holding the right-hand operand as one opaque value.
        def __init__(self, value):
            self.value = value

    if not isinstance(other, frame_base.DeferredFrame):
        # Concrete operand: the proxy gets one column per entry of other[0].
        proxy = pd.DataFrame(columns=range(len(other[0])))
        side = expressions.ConstantExpression(AsScalar(other))
    else:
        proxy = other._expr.proxy()
        # Boxing the whole deferred operand requires Singleton partitioning,
        # which is only permitted inside this context manager.
        with expressions.allow_non_parallel_operations():
            side = expressions.ComputedExpression(
                'as_scalar',
                lambda df: AsScalar(df),
                [other._expr],
                requires_partition_by=partitionings.Singleton())

    product_expr = expressions.ComputedExpression(
        'dot',
        lambda left, right: left @ right.value,
        [self._expr, side],
        requires_partition_by=partitionings.Nothing(),
        preserves_partition_by=partitionings.Index(),
        proxy=proxy)
    return frame_base.DeferredFrame.wrap(product_expr)
def test_only_replaces_cached(self):
    """Only expressions that have been cached are swapped for placeholders."""
    in_expr = expressions.ConstantExpression(0)
    comp_expr = expressions.ComputedExpression('test', lambda x: x, [in_expr])

    # Nothing is cached yet, so the trace must be identical before and after
    # asking the cache to perform replacements.
    untouched_trace = [
        expressions.ComputedExpression,
        expressions.ConstantExpression,
    ]
    self.assertTraceTypes(comp_expr, untouched_trace)
    self.cache.replace_with_cached(comp_expr)
    self.assertTraceTypes(comp_expr, untouched_trace)

    # Now "cache" the input and check that it is replaced by a placeholder.
    self.mock_cache(in_expr)
    replaced = self.cache.replace_with_cached(comp_expr)

    replaced_trace = [
        expressions.ComputedExpression,
        expressions.PlaceholderExpression,
    ]
    self.assertTraceTypes(comp_expr, replaced_trace)
    self.assertIn(in_expr._id, replaced)
def test_preserves_index_output_partitioning(self):
    """Check output_partitioning for an expression preserving Index([0, 1])."""
    # Empty DataFrame with two columns and two index levels
    source = expressions.ConstantExpression(
        pd.DataFrame(columns=["foo", "bar"], index=[[], []]))
    preserves_partial_index = expressions.ComputedExpression(
        'preserves_partial_index',
        # This adds an additional index level, so we'd only preserve
        # partitioning on the two index levels that existed before.
        lambda df: df.set_index('foo', append=True),
        [source],
        requires_partition_by=partitionings.Arbitrary(),
        preserves_partition_by=partitionings.Index([0, 1]))

    kept = (
        partitionings.Singleton(),
        partitionings.Index([0]),
        partitionings.Index([1]),
        partitionings.Index([0, 1]),
    )
    for partitioning in kept:
        result = expressions.output_partitioning(
            preserves_partial_index, partitioning)
        self.assertEqual(
            result, partitioning, f"Should preserve {partitioning}")

    dropped = (
        partitionings.Index([0, 1, 2]),
        partitionings.Index(),
        partitionings.Arbitrary(),
    )
    for partitioning in dropped:
        result = expressions.output_partitioning(
            preserves_partial_index, partitioning)
        self.assertEqual(
            result,
            partitionings.Arbitrary(),
            f"Should NOT preserve {partitioning}")
def test_preserves_singleton_output_partitioning(self):
    """Check output_partitioning for an expression preserving only Singleton."""
    # Empty DataFrame with one column and two index levels
    source = expressions.ConstantExpression(
        pd.DataFrame(columns=["column"], index=[[], []]))
    preserves_only_singleton = expressions.ComputedExpression(
        'preserves_only_singleton',
        # index is replaced with an entirely new one, so
        # if we were partitioned by Index we're not anymore.
        lambda df: df.set_index('column'),
        [source],
        requires_partition_by=partitionings.Arbitrary(),
        preserves_partition_by=partitionings.Singleton())

    kept = (partitionings.Singleton(), )
    for partitioning in kept:
        result = expressions.output_partitioning(
            preserves_only_singleton, partitioning)
        self.assertEqual(
            result, partitioning, f"Should preserve {partitioning}")

    dropped = (
        partitionings.Index([0]),
        partitionings.Index(),
        partitionings.Arbitrary(),
    )
    for partitioning in dropped:
        result = expressions.output_partitioning(
            preserves_only_singleton, partitioning)
        self.assertEqual(
            result,
            partitionings.Arbitrary(),
            f"Should NOT preserve {partitioning}")
def wrapper(*args, **kwargs):
    # Call the pandas function called `name` (taken from the enclosing scope)
    # eagerly. Results whose exact type is registered in
    # DeferredBase._pandas_type_map are wrapped as deferred constants, using
    # an empty slice of the result as the proxy; anything else is returned
    # unchanged.
    result = getattr(pd, name)(*args, **kwargs)
    if type(result) not in frame_base.DeferredBase._pandas_type_map.keys():
        return result
    constant = expressions.ConstantExpression(result, result[0:0])
    return frame_base.DeferredBase.wrap(constant)
def concat(
        objs,
        axis,
        join,
        ignore_index,
        keys,
        levels,
        names,
        verify_integrity,
        sort,
        copy):
    """Build a deferred ``pd.concat`` over deferred objects.

    Args:
        objs: a mapping or iterable of deferred objects; entries may be None.
            For a mapping, ``keys`` defaults to the mapping's keys and the
            values are taken in that key order.
        axis, join, keys, names, verify_integrity: forwarded to ``pd.concat``.
        ignore_index, levels: must be falsy; truthy values are rejected.
        sort, copy: accepted for signature compatibility; this function does
            not forward them to ``pd.concat``.

    Returns:
        The wrapped 'concat' ComputedExpression.

    Raises:
        NotImplementedError: if ``ignore_index`` or ``levels`` is truthy.
    """
    if ignore_index:
        raise NotImplementedError('concat(ignore_index)')
    if levels:
        raise NotImplementedError('concat(levels)')

    if isinstance(objs, Mapping):
        if keys is None:
            keys = list(objs.keys())
        objs = [objs[k] for k in keys]
    else:
        objs = list(objs)

    if keys is None:
        preserves_partitioning = partitionings.Arbitrary()
    else:
        # Index 0 will be a new index for keys, only partitioning by the
        # original indexes (1 to N) will be preserved.
        # None entries are allowed in objs (they become a constant None
        # expression below) and have no proxy, so they are skipped here
        # instead of raising AttributeError on `None._expr`.
        nlevels = min(
            o._expr.proxy().index.nlevels for o in objs if o is not None)
        preserves_partitioning = partitionings.Index(
            list(range(1, nlevels + 1)))

    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]

    if axis in (1, 'columns'):
        required_partitioning = partitionings.Index()
    elif verify_integrity:
        required_partitioning = partitionings.Index()
    else:
        required_partitioning = partitionings.Arbitrary()

    return frame_base.DeferredBase.wrap(
        expressions.ComputedExpression(
            'concat',
            lambda *objs: pd.concat(
                objs,
                axis=axis,
                join=join,
                ignore_index=ignore_index,
                keys=keys,
                levels=levels,
                names=names,
                verify_integrity=verify_integrity),  # yapf break
            exprs,
            requires_partition_by=required_partitioning,
            preserves_partition_by=preserves_partitioning))
def _run_test(self, func, *args):
    """Assert ``func`` gives equal results on concrete and deferred inputs.

    Each argument is wrapped as a deferred constant whose proxy is an empty
    slice of itself; the deferred result is evaluated in a plain Session and
    compared to the eager result via ``expected.equals``.
    """
    deferred_args = []
    for arg in args:
        constant = expressions.ConstantExpression(arg, arg[0:0])
        deferred_args.append(frame_base.DeferredFrame.wrap(constant))

    expected = func(*args)
    deferred_result = func(*deferred_args)
    actual = expressions.Session({}).evaluate(deferred_result._expr)

    self.assertTrue(
        expected.equals(actual),
        'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
def wrapper(*args, **kwargs):
    # Turns a call to `func` into a ComputedExpression over its deferred
    # positional arguments. `func`, `name`, `inplace`, `requires_partition_by`
    # and `preserves_partition_by` come from the enclosing scope.

    # NOTE: restriction checking is currently disabled. The loop iterates an
    # empty tuple instead of restrictions.items(), so its body never runs.
    for key, values in ():  #restrictions.items():
        if key in kwargs:
            value = kwargs[key]
        else:
            try:
                # pylint: disable=deprecated-method
                ix = inspect.getargspec(func).args.index(key)
            except ValueError:
                # TODO: fix for delegation?
                continue
            if len(args) <= ix:
                continue
            value = args[ix]
        if not isinstance(values, list):
            values = [values]
        if value not in values:
            raise NotImplementedError(
                '%s=%s not supported for %s' % (key, value, name))

    # Split positional arguments into deferred inputs (tracked by position)
    # and plain constants (stored in place; deferred slots stay None).
    deferred_arg_indices = []
    deferred_arg_exprs = []
    constant_args = [None] * len(args)
    for position, candidate in enumerate(args):
        if isinstance(candidate, DeferredBase):
            expr = candidate._expr
        elif isinstance(candidate, pd.core.generic.NDFrame):
            # Concrete pandas objects become constants with an empty proxy.
            expr = expressions.ConstantExpression(candidate, candidate[0:0])
        else:
            constant_args[position] = candidate
            continue
        deferred_arg_indices.append(position)
        deferred_arg_exprs.append(expr)

    actual_func = copy_and_mutate(func) if inplace else func

    def apply(*actual_args):
        # Rebuild the full positional list with evaluated deferred inputs.
        full_args = list(constant_args)
        for position, evaluated in zip(deferred_arg_indices, actual_args):
            full_args[position] = evaluated
        return actual_func(*full_args, **kwargs)

    result_expr = expressions.ComputedExpression(
        name,
        apply,
        deferred_arg_exprs,
        requires_partition_by=requires_partition_by,
        preserves_partition_by=preserves_partition_by)

    if not inplace:
        return DeferredFrame.wrap(result_expr)
    # In-place: rebind the first argument's expression and return None.
    args[0]._expr = result_expr
def test_only_replaces_inputs(self):
    """Replacing a cached intermediate drops its own inputs from the tree."""
    arg_0_expr = expressions.ConstantExpression(0)
    ident_val = expressions.ComputedExpression(
        'ident', lambda x: x, [arg_0_expr])
    arg_1_expr = expressions.ConstantExpression(1)
    comp_expr = expressions.ComputedExpression(
        'add', lambda x, y: x + y, [ident_val, arg_1_expr])

    self.mock_cache(ident_val)
    replaced = self.cache.replace_with_cached(comp_expr)

    # ident_val should now be a placeholder, and its argument (arg_0_expr)
    # should no longer appear anywhere in the expression tree.
    expected_trace = [
        expressions.ComputedExpression,
        expressions.PlaceholderExpression,
        expressions.ConstantExpression,
    ]
    self.assertTraceTypes(comp_expr, expected_trace)
    self.assertIn(ident_val._id, replaced)

    remaining = self.create_trace(comp_expr)
    self.assertNotIn(arg_0_expr, remaining)
def _run_test(self, func, *args, distributed=True, expect_error=False):
    """Assert ``func`` behaves the same on concrete and deferred inputs.

    Args:
        func: callable applied first to ``args`` and then to their deferred
            counterparts.
        *args: concrete inputs; each is wrapped as a deferred constant whose
            proxy is an empty slice of itself.
        distributed: evaluate with PartitioningSession when true, otherwise
            with a plain Session. When true, results exposing ``equals`` are
            compared after ``sort_index()``.
        expect_error: when true, both the eager and the deferred evaluation
            must raise, with the same exception type and the same ``str()``.

    Raises:
        AssertionError: if an error was expected but either evaluation
            succeeded, or if the results differ.
    """
    deferred_args = [
        frame_base.DeferredFrame.wrap(
            expressions.ConstantExpression(arg, arg[0:0])) for arg in args
    ]

    try:
        expected = func(*args)
    except Exception as e:
        if not expect_error:
            raise
        expected = e
    else:
        if expect_error:
            raise AssertionError(
                "Expected an error but computing expected result successfully "
                f"returned: {expected}")

    session_type = (
        expressions.PartitioningSession if distributed else expressions.Session)
    try:
        actual = session_type({}).evaluate(func(*deferred_args)._expr)
    except Exception as e:
        if not expect_error:
            raise
        actual = e
    else:
        if expect_error:
            # Both fragments must be f-strings; previously the first one was
            # a plain string and printed "{expected}" literally.
            raise AssertionError(
                f"Expected an error:\n{expected}\nbut successfully "
                f"returned:\n{actual}")

    if not expect_error:
        if hasattr(expected, 'equals'):
            if distributed:
                # Compare after sorting by index on both sides.
                cmp = lambda df: expected.sort_index().equals(df.sort_index())
            else:
                cmp = expected.equals
        elif isinstance(expected, float):
            # NaN matches NaN, zero matches zero, otherwise compare with a
            # symmetric relative tolerance of 1e-8.
            cmp = lambda x: (
                (math.isnan(x) and math.isnan(expected)) or
                x == expected == 0 or
                abs(expected - x) / (abs(expected) + abs(x)) < 1e-8)
        else:
            cmp = expected.__eq__
        self.assertTrue(
            cmp(actual),
            'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
    else:
        self.assertIsInstance(actual, type(expected))
        self.assertEqual(str(actual), str(expected))
def fillna(self, value, method):
    """Return a deferred frame with missing values filled by ``value``.

    ``value`` may be a deferred object or a plain constant. Any non-None
    ``method`` is rejected with WontImplementError('order-sensitive').
    """
    if method is not None:
        raise frame_base.WontImplementError('order-sensitive')

    value_expr = (
        value._expr if isinstance(value, frame_base.DeferredBase) else
        expressions.ConstantExpression(value))

    filled_expr = expressions.ComputedExpression(
        'fillna',
        lambda df, value: df.fillna(value, method=method),
        [self._expr, value_expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    return frame_base.DeferredFrame.wrap(filled_expr)
def test_dataframe_eval_query(self):
    """eval/query agree with pandas and reject ``@local`` references."""
    df = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])

    self._run_test(lambda df: df.eval('foo = a + b - c'), df)
    self._run_test(lambda df: df.query('a > b + c'), df)
    self._run_inplace_test(lambda df: df.eval('foo = a + b - c'), df)

    # Verify that attempting to access locals raises a useful error
    deferred_df = frame_base.DeferredFrame.wrap(
        expressions.ConstantExpression(df, df[0:0]))
    with self.assertRaises(NotImplementedError):
        deferred_df.eval('foo = a + @b - c')
    with self.assertRaises(NotImplementedError):
        deferred_df.query('a > @b + c')
def concat(
        objs,
        axis,
        join,
        ignore_index,
        keys,
        levels,
        names,
        verify_integrity,
        sort,
        copy):
    """Build a deferred ``pd.concat`` over deferred objects.

    ``objs`` is a mapping or iterable of deferred objects (entries may be
    None). Truthy ``ignore_index`` or ``levels`` raise NotImplementedError.
    ``sort`` and ``copy`` are accepted but not forwarded to ``pd.concat``.
    """
    if ignore_index:
        raise NotImplementedError('concat(ignore_index)')
    if levels:
        raise NotImplementedError('concat(levels)')

    if not isinstance(objs, Mapping):
        objs = list(objs)
    else:
        # For mappings, the keys default to the mapping's own keys and the
        # values are taken in that key order.
        if keys is None:
            keys = list(objs.keys())
        objs = [objs[k] for k in keys]

    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]

    # Column-wise concatenation and integrity verification both require
    # Index partitioning; otherwise no particular partitioning is needed.
    if axis in (1, 'columns') or verify_integrity:
        required_partitioning = partitionings.Index()
    else:
        required_partitioning = partitionings.Nothing()

    concat_expr = expressions.ComputedExpression(
        'concat',
        lambda *objs: pd.concat(
            objs,
            axis=axis,
            join=join,
            ignore_index=ignore_index,
            keys=keys,
            levels=levels,
            names=names,
            verify_integrity=verify_integrity),  # yapf break
        exprs,
        requires_partition_by=required_partitioning,
        preserves_partition_by=partitionings.Index())
    return frame_base.DeferredBase.wrap(concat_expr)
def _run_test(self, func, *args, distributed=False):
    """Assert ``func`` gives matching results on concrete and deferred inputs.

    With ``distributed`` true the deferred result is evaluated in a
    PartitioningSession and results exposing ``equals`` are compared after
    ``sort_index()``; otherwise a plain Session and a direct ``equals`` are
    used. Floats are compared with a relative tolerance, everything else
    with ``__eq__``.
    """
    deferred_args = [
        frame_base.DeferredFrame.wrap(
            expressions.ConstantExpression(arg, arg[0:0])) for arg in args
    ]
    expected = func(*args)

    if distributed:
        session_type = expressions.PartitioningSession
    else:
        session_type = expressions.Session
    actual = session_type({}).evaluate(func(*deferred_args)._expr)

    if hasattr(expected, 'equals'):
        if distributed:

            def matches(df):
                return expected.sort_index().equals(df.sort_index())
        else:
            matches = expected.equals
    elif isinstance(expected, float):

        def matches(x):
            # NaN matches NaN and zero matches zero; otherwise use a
            # symmetric relative tolerance of 1e-8.
            if math.isnan(x) and math.isnan(expected):
                return True
            if x == expected == 0:
                return True
            return abs(expected - x) / (abs(expected) + abs(x)) < 1e-8
    else:
        matches = expected.__eq__

    self.assertTrue(
        matches(actual),
        'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
def _maybe_wrap_constant_expr(res):
    """Wrap ``res`` as a deferred constant if its type is registered.

    If the exact type of ``res`` is a key of DeferredBase._pandas_type_map,
    it is wrapped in a ConstantExpression whose proxy is an empty slice of
    ``res``. Any other value is returned unchanged.
    """
    if type(res) not in frame_base.DeferredBase._pandas_type_map.keys():
        return res
    constant = expressions.ConstantExpression(res, res[0:0])
    return frame_base.DeferredBase.wrap(constant)
def wrapper(*args, **kwargs):
    # Turns a call to `func` into a ComputedExpression over its deferred
    # positional and keyword arguments. `func`, `name`, `restrictions`,
    # `inplace`, `requires_partition_by` and `preserves_partition_by` come
    # from the enclosing scope.
    #
    # Raises NotImplementedError when a restricted argument is given a value
    # that its restriction does not allow.
    for key, values in restrictions.items():
        if key in kwargs:
            value = kwargs[key]
        else:
            try:
                ix = _getargspec(func).args.index(key)
            except ValueError:
                # TODO: fix for delegation?
                continue
            if len(args) <= ix:
                continue
            value = args[ix]
        if callable(values):
            check = values
        elif isinstance(values, list):
            check = lambda x, values=values: x in values
        else:
            # A scalar restriction names the single allowed value. Compare
            # against `values` (the restriction), not `value` (the argument
            # itself), which would make the check always pass.
            check = lambda x, values=values: x == values

        if not check(value):
            raise NotImplementedError(
                '%s=%s not supported for %s' % (key, value, name))

    deferred_arg_indices = []
    deferred_arg_exprs = []
    constant_args = [None] * len(args)
    from apache_beam.dataframe.frames import _DeferredIndex
    for ix, arg in enumerate(args):
        if isinstance(arg, DeferredBase):
            deferred_arg_indices.append(ix)
            deferred_arg_exprs.append(arg._expr)
        elif isinstance(arg, _DeferredIndex):
            # TODO(robertwb): Consider letting indices pass through as indices.
            # This would require updating the partitioning code, as indices
            # don't have indices.
            deferred_arg_indices.append(ix)
            deferred_arg_exprs.append(
                expressions.ComputedExpression(
                    'index_as_series',
                    lambda ix: ix.index.to_series(),  # yapf break
                    [arg._frame._expr],
                    preserves_partition_by=partitionings.Singleton(),
                    requires_partition_by=partitionings.Nothing()))
        elif isinstance(arg, pd.core.generic.NDFrame):
            # Concrete pandas objects become constants with an empty proxy.
            deferred_arg_indices.append(ix)
            deferred_arg_exprs.append(
                expressions.ConstantExpression(arg, arg[0:0]))
        else:
            constant_args[ix] = arg

    deferred_kwarg_keys = []
    deferred_kwarg_exprs = []
    constant_kwargs = {key: None for key in kwargs}
    for key, arg in kwargs.items():
        if isinstance(arg, DeferredBase):
            deferred_kwarg_keys.append(key)
            deferred_kwarg_exprs.append(arg._expr)
        elif isinstance(arg, pd.core.generic.NDFrame):
            deferred_kwarg_keys.append(key)
            deferred_kwarg_exprs.append(
                expressions.ConstantExpression(arg, arg[0:0]))
        else:
            constant_kwargs[key] = arg

    # Positional deferred inputs come first, keyword ones after; `apply`
    # relies on this ordering to split its arguments again.
    deferred_exprs = deferred_arg_exprs + deferred_kwarg_exprs

    if inplace:
        actual_func = copy_and_mutate(func)
    else:
        actual_func = func

    def apply(*actual_args):
        actual_args, actual_kwargs = (
            actual_args[:len(deferred_arg_exprs)],
            actual_args[len(deferred_arg_exprs):])

        full_args = list(constant_args)
        for ix, arg in zip(deferred_arg_indices, actual_args):
            full_args[ix] = arg

        full_kwargs = dict(constant_kwargs)
        for key, arg in zip(deferred_kwarg_keys, actual_kwargs):
            full_kwargs[key] = arg

        return actual_func(*full_args, **full_kwargs)

    if (not requires_partition_by.is_subpartitioning_of(partitionings.Index())
            and sum(
                isinstance(arg.proxy(), pd.core.generic.NDFrame)
                for arg in deferred_exprs) > 1):
        # Implicit join on index if there is more than one indexed input.
        actual_requires_partition_by = partitionings.Index()
    else:
        actual_requires_partition_by = requires_partition_by

    result_expr = expressions.ComputedExpression(
        name,
        apply,
        deferred_exprs,
        requires_partition_by=actual_requires_partition_by,
        preserves_partition_by=preserves_partition_by)
    if inplace:
        # In-place: rebind the first argument's expression and return None.
        args[0]._expr = result_expr
    else:
        return DeferredFrame.wrap(result_expr)
def _get_deferred_args(*args):
    """Wrap each argument as a deferred frame over a constant expression.

    The proxy for every constant is an empty slice (``arg[0:0]``) of the
    argument itself. Returns the wrapped objects as a list, in input order.
    """
    deferred = []
    for arg in args:
        constant = expressions.ConstantExpression(arg, arg[0:0])
        deferred.append(frame_base.DeferredFrame.wrap(constant))
    return deferred
def test_constant_expresion(self):
    """Evaluating a ConstantExpression in a Session yields its value."""
    two = expressions.ConstantExpression(2)
    result = expressions.Session({}).evaluate(two)
    self.assertEqual(result, 2)
def _run_test(self, func, *args, distributed=True, expect_error=False):
    """Assert ``func`` behaves the same on concrete and deferred inputs.

    Args:
        func: callable applied first to ``args`` and then to their deferred
            counterparts.
        *args: concrete inputs; each is wrapped as a deferred constant whose
            proxy is an empty slice of itself.
        distributed: evaluate with PartitioningSession when true, otherwise
            with a plain Session. When true, pandas results are put into a
            canonical order before comparison.
        expect_error: when true, both evaluations must raise, with the same
            exception type and the same ``str()``.

    Raises:
        AssertionError: if an error was expected but either evaluation
            succeeded, if the raised errors differ, or if the results differ.
    """
    deferred_args = [
        frame_base.DeferredFrame.wrap(
            expressions.ConstantExpression(arg, arg[0:0])) for arg in args
    ]

    try:
        expected = func(*args)
    except Exception as e:
        if not expect_error:
            raise
        expected = e
    else:
        if expect_error:
            raise AssertionError(
                "Expected an error but computing expected result successfully "
                f"returned: {expected}")

    session_type = (
        expressions.PartitioningSession if distributed else expressions.Session)
    try:
        actual = session_type({}).evaluate(func(*deferred_args)._expr)
    except Exception as e:
        if not expect_error:
            raise
        actual = e
    else:
        if expect_error:
            # Both fragments must be f-strings; previously the first one was
            # a plain string and printed "{expected}" literally.
            raise AssertionError(
                f"Expected an error:\n{expected}\nbut successfully "
                f"returned:\n{actual}")

    if expect_error:
        if not isinstance(actual,
                          type(expected)) or not str(actual) == str(expected):
            raise AssertionError(
                f'Expected {expected!r} to be raised, but got {actual!r}'
            ) from actual
    else:
        if isinstance(expected, pd.core.generic.NDFrame):
            if distributed:
                # Put both sides into a canonical order before comparing.
                if expected.index.is_unique:
                    expected = expected.sort_index()
                    actual = actual.sort_index()
                elif isinstance(expected, pd.Series):
                    # A Series has no `.columns`; sort by its values instead.
                    expected = expected.sort_values()
                    actual = actual.sort_values()
                else:
                    expected = expected.sort_values(list(expected.columns))
                    actual = actual.sort_values(list(actual.columns))

            if isinstance(expected, pd.Series):
                pd.testing.assert_series_equal(expected, actual)
            elif isinstance(expected, pd.DataFrame):
                pd.testing.assert_frame_equal(expected, actual)
            else:
                raise ValueError(
                    f"Expected value is a {type(expected)}, "
                    "not a Series or DataFrame.")
        else:
            # Expectation is not a pandas object
            if isinstance(expected, float):
                cmp = lambda x: np.isclose(expected, x)
            else:
                cmp = expected.__eq__
            self.assertTrue(
                cmp(actual),
                'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual))
def wrapper(*args, **kwargs):
    # Turns a call to `func` into a ComputedExpression over its deferred
    # positional and keyword arguments. `func`, `name`, `restrictions`,
    # `inplace`, `requires_partition_by` and `preserves_partition_by` come
    # from the enclosing scope.
    #
    # Raises NotImplementedError when a restricted argument is given a value
    # that its restriction does not allow.
    for key, values in restrictions.items():
        if key in kwargs:
            value = kwargs[key]
        else:
            try:
                ix = _getargspec(func).args.index(key)
            except ValueError:
                # TODO: fix for delegation?
                continue
            if len(args) <= ix:
                continue
            value = args[ix]
        if callable(values):
            check = values
        elif isinstance(values, list):
            check = lambda x, values=values: x in values
        else:
            # A scalar restriction names the single allowed value. Compare
            # against `values` (the restriction), not `value` (the argument
            # itself), which would make the check always pass.
            check = lambda x, values=values: x == values

        if not check(value):
            raise NotImplementedError(
                '%s=%s not supported for %s' % (key, value, name))

    deferred_arg_indices = []
    deferred_arg_exprs = []
    constant_args = [None] * len(args)
    for ix, arg in enumerate(args):
        if isinstance(arg, DeferredBase):
            deferred_arg_indices.append(ix)
            deferred_arg_exprs.append(arg._expr)
        elif isinstance(arg, pd.core.generic.NDFrame):
            # Concrete pandas objects become constants with an empty proxy.
            deferred_arg_indices.append(ix)
            deferred_arg_exprs.append(
                expressions.ConstantExpression(arg, arg[0:0]))
        else:
            constant_args[ix] = arg

    deferred_kwarg_keys = []
    deferred_kwarg_exprs = []
    constant_kwargs = {key: None for key in kwargs}
    for key, arg in kwargs.items():
        if isinstance(arg, DeferredBase):
            deferred_kwarg_keys.append(key)
            deferred_kwarg_exprs.append(arg._expr)
        elif isinstance(arg, pd.core.generic.NDFrame):
            deferred_kwarg_keys.append(key)
            deferred_kwarg_exprs.append(
                expressions.ConstantExpression(arg, arg[0:0]))
        else:
            constant_kwargs[key] = arg

    # Positional deferred inputs come first, keyword ones after; `apply`
    # relies on this ordering to split its arguments again.
    deferred_exprs = deferred_arg_exprs + deferred_kwarg_exprs

    if inplace:
        actual_func = copy_and_mutate(func)
    else:
        actual_func = func

    def apply(*actual_args):
        actual_args, actual_kwargs = (
            actual_args[:len(deferred_arg_exprs)],
            actual_args[len(deferred_arg_exprs):])

        full_args = list(constant_args)
        for ix, arg in zip(deferred_arg_indices, actual_args):
            full_args[ix] = arg

        full_kwargs = dict(constant_kwargs)
        for key, arg in zip(deferred_kwarg_keys, actual_kwargs):
            full_kwargs[key] = arg

        return actual_func(*full_args, **full_kwargs)

    if any(isinstance(arg.proxy(), pd.core.generic.NDFrame)
           for arg in deferred_exprs
           ) and not requires_partition_by.is_subpartitioning_of(
               partitionings.Index()):
        # Implicit join on index.
        actual_requires_partition_by = partitionings.Index()
    else:
        actual_requires_partition_by = requires_partition_by

    result_expr = expressions.ComputedExpression(
        name,
        apply,
        deferred_exprs,
        requires_partition_by=actual_requires_partition_by,
        preserves_partition_by=preserves_partition_by)
    if inplace:
        # In-place: rebind the first argument's expression and return None.
        args[0]._expr = result_expr
    else:
        return DeferredFrame.wrap(result_expr)