def aggregate(self, func, axis=0, *args, **kwargs):
  """Deferred implementation of Series.agg/aggregate.

  For a list of more than one aggregator, each aggregation is computed
  independently and the (deferred) results are concatenated. For a single
  aggregator, associative functions get a parallel pre-aggregation stage
  followed by a final (possibly non-parallel) combine.
  """
  if isinstance(func, list) and len(func) > 1:
    # Aggregate each column separately, then stick them all together.
    rows = [self.agg([f], *args, **kwargs) for f in func]
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'join_aggregate',
            lambda *rows: pd.concat(rows),
            [row._expr for row in rows]))
  else:
    # We're only handling a single column.
    base_func = func[0] if isinstance(func, list) else func
    if _is_associative(base_func) and not args and not kwargs:
      # Associative aggregations can be partially applied per partition
      # first; the final combine then runs over the partial results.
      intermediate = expressions.elementwise_expression(
          'pre_aggregate',
          lambda s: s.agg([base_func], *args, **kwargs),
          [self._expr])
      allow_nonparallel_final = True
    else:
      intermediate = self._expr
      allow_nonparallel_final = None  # i.e. don't change the value
    with expressions.allow_non_parallel_operations(allow_nonparallel_final):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'aggregate',
              lambda s: s.agg(func, *args, **kwargs),
              [intermediate],
              preserves_partition_by=partitionings.Singleton(),
              requires_partition_by=partitionings.Singleton()))
def _run_patched(func, *args, **kwargs):
  """Invokes `func` with doctest.DocTestRunner patched to run via Beam.

  Consumes the runner-configuration keyword arguments (use_beam, skip,
  wont_implement_ok, not_implemented_ok, extraglobs, optionflags) and
  forwards the rest to `func`. The original DocTestRunner is restored on
  exit.
  """
  set_pandas_options()
  # https://github.com/pandas-dev/pandas/blob/1.0.x/setup.cfg#L63
  flags = kwargs.pop('optionflags', 0)
  flags |= doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL

  env = TestEnvironment()
  use_beam = kwargs.pop('use_beam', True)
  skip = kwargs.pop('skip', {})
  wont_implement_ok = kwargs.pop('wont_implement_ok', {})
  not_implemented_ok = kwargs.pop('not_implemented_ok', {})
  extraglobs = dict(kwargs.pop('extraglobs', {}))
  extraglobs['pd'] = env.fake_pandas_module()

  try:
    # Unfortunately the runner is not injectable.
    original_doc_test_runner = doctest.DocTestRunner

    def _make_runner(**runner_kwargs):
      return BeamDataframeDoctestRunner(
          env,
          use_beam=use_beam,
          wont_implement_ok=wont_implement_ok,
          not_implemented_ok=not_implemented_ok,
          skip=skip,
          **runner_kwargs)

    doctest.DocTestRunner = _make_runner
    with expressions.allow_non_parallel_operations():
      return func(*args, extraglobs=extraglobs, optionflags=flags, **kwargs)
  finally:
    doctest.DocTestRunner = original_doc_test_runner
def _run_patched(func, *args, **kwargs):
  """Invokes `func` with doctest.DocTestRunner patched to run via Beam.

  Sets deterministic numpy/pandas display options (matching the pandas doc
  build), consumes the runner-configuration keyword arguments (use_beam,
  skip, wont_implement_ok, extraglobs, optionflags) and forwards the rest
  to `func`. The original DocTestRunner is restored on exit.
  """
  # Capture the original runner BEFORE entering the try block. Previously
  # this assignment sat deep inside the try, so if any of the setup raised
  # (e.g. TestEnvironment()), the finally clause hit a NameError that masked
  # the real exception.
  # Unfortunately the runner is not injectable.
  original_doc_test_runner = doctest.DocTestRunner
  try:
    # See
    # https://github.com/pandas-dev/pandas/blob/a00202d12d399662b8045a8dd3fdac04f18e1e55/doc/source/conf.py#L319
    np.random.seed(123456)
    np.set_printoptions(precision=4, suppress=True)
    pd.options.display.max_rows = 15
    # https://github.com/pandas-dev/pandas/blob/1.0.x/setup.cfg#L63
    optionflags = kwargs.pop('optionflags', 0)
    optionflags |= (
        doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL)
    env = TestEnvironment()
    use_beam = kwargs.pop('use_beam', True)
    skip = kwargs.pop('skip', {})
    wont_implement_ok = kwargs.pop('wont_implement_ok', {})
    extraglobs = dict(kwargs.pop('extraglobs', {}))
    extraglobs['pd'] = env.fake_pandas_module()
    doctest.DocTestRunner = lambda **kwargs: BeamDataframeDoctestRunner(
        env,
        use_beam=use_beam,
        wont_implement_ok=wont_implement_ok,
        skip=skip,
        **kwargs)
    with expressions.allow_non_parallel_operations():
      return func(
          *args, extraglobs=extraglobs, optionflags=optionflags, **kwargs)
  finally:
    doctest.DocTestRunner = original_doc_test_runner
def teststring(text, report=True, **runner_kwargs):
  """Runs the doctests found in `text` under the Beam dataframe runner.

  Booleans wont_implement_ok / not_implemented_ok (popped from
  runner_kwargs, default False) whitelist every example in the string for
  the corresponding failure mode.
  """
  flags = runner_kwargs.pop('optionflags', 0)
  flags |= doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL

  wont_implement = runner_kwargs.pop('wont_implement_ok', False)
  not_implemented = runner_kwargs.pop('not_implemented_ok', False)

  runner = BeamDataframeDoctestRunner(
      TestEnvironment(),
      optionflags=flags,
      wont_implement_ok={'<string>': ['*']} if wont_implement else None,
      not_implemented_ok={'<string>': ['*']} if not_implemented else None,
      **runner_kwargs)

  globs = {'pd': runner.fake_pandas_module(), 'np': np}
  test = doctest.DocTestParser().get_doctest(
      text, globs, '<string>', '<string>', 0)

  with expressions.allow_non_parallel_operations():
    result = runner.run(test)
  if report:
    runner.summarize()
  return result
def dot(self, other):
  """Deferred dot product with `other` (a deferred frame or an array-like).

  The right-hand side is wrapped in AsScalar so it is broadcast whole to
  every partition of the left-hand side.
  """
  # We want to broadcast the right hand side to all partitions of the left.
  # This is OK, as its index must be the same size as the columns set of self,
  # so cannot be too large.
  class AsScalar(object):
    def __init__(self, value):
      self.value = value

  if isinstance(other, frame_base.DeferredFrame):
    proxy = other._expr.proxy()
    # Collapsing the deferred operand to a single value requires a
    # non-parallel stage.
    with expressions.allow_non_parallel_operations():
      side = expressions.ComputedExpression(
          'as_scalar',
          lambda df: AsScalar(df),
          [other._expr],
          requires_partition_by=partitionings.Singleton())
  else:
    # Concrete (already materialized) operand, e.g. a nested list/ndarray;
    # the proxy's column count comes from the operand's first row.
    proxy = pd.DataFrame(columns=range(len(other[0])))
    side = expressions.ConstantExpression(AsScalar(other))

  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'dot',
          # Unwrap the broadcast value before multiplying.
          lambda left, right: left @ right.value,
          [self._expr, side],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Index(),
          proxy=proxy))
def test_sum_mean(self):
  """groupby sum/mean scenarios run inside a non-parallel-operations scope."""
  animals = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
  speeds = [380., 370., 24., 26.]
  data = pd.DataFrame({'Animal': animals, 'Speed': speeds})
  with expressions.allow_non_parallel_operations():
    self.run_scenario(data, lambda df: df.groupby('Animal').sum())
    self.run_scenario(data, lambda df: df.groupby('Animal').mean())
def aggregate(self, func, axis=0, *args, **kwargs):
  """Deferred implementation of DataFrame.agg/aggregate.

  Dispatches on `axis`: None aggregates everything (columns then rows),
  1/'columns' is elementwise per row, and the default column-wise case
  aggregates each column independently before recombining.
  """
  if axis is None:
    # Aggregate across all elements by first aggregating across columns,
    # then across rows.
    return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
        func, *args, **dict(kwargs, axis=0))
  elif axis in (1, 'columns'):
    # This is an easy elementwise aggregation.
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(func, axis=1, *args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Nothing()))
  elif len(self._expr.proxy().columns) == 0 or args or kwargs:
    # For these corner cases, just colocate everything.
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(func, *args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Singleton()))
  else:
    # In the general case, compute the aggregation of each column separately,
    # then recombine.
    if not isinstance(func, dict):
      # Normalize to a dict mapping each column to the aggregator(s).
      col_names = list(self._expr.proxy().columns)
      func = {col: func for col in col_names}
    else:
      col_names = list(func.keys())
    aggregated_cols = []
    for col in col_names:
      funcs = func[col]
      if not isinstance(funcs, list):
        funcs = [funcs]
      aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
    # The final shape is different depending on whether any of the columns
    # were aggregated by a list of aggregators.
    with expressions.allow_non_parallel_operations():
      if any(isinstance(funcs, list) for funcs in func.values()):
        # At least one column used a list of aggregators: the result is a
        # DataFrame of per-column results.
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'join_aggregate',
                lambda *cols: pd.DataFrame(
                    {col: value for col, value in zip(col_names, cols)}),
                [col._expr for col in aggregated_cols],
                requires_partition_by=partitionings.Singleton()))
      else:
        # Every column used a single aggregator: the result is a Series of
        # one value per column.
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'join_aggregate',
                lambda *cols: pd.Series(
                    {col: value[0] for col, value in zip(col_names, cols)}),
                [col._expr for col in aggregated_cols],
                requires_partition_by=partitionings.Singleton(),
                proxy=self._expr.proxy().agg(func, *args, **kwargs)))
def test_scalar(self):
  """Scalar aggregation results, standalone and fed into later stages."""
  with expressions.allow_non_parallel_operations():
    series = pd.Series([1, 2, 6])
    self.run_scenario(series, lambda a: a.agg(sum))
    self.run_scenario(series, lambda a: a / a.agg(sum))

    # Tests scalar being used as an input to a downstream stage.
    frame = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 6]})
    self.run_scenario(
        frame, lambda df: df.groupby('key').sum().val / df.val.agg(sum))
def apply(self, func, name=None, args=()):
  """Applies `func` to this expression (plus any deferred `args`).

  The expression label defaults to the function's __name__.
  """
  label = func.__name__ if name is None else name
  # When every extra argument is itself a deferred scalar, allow
  # non-parallel operations; otherwise pass None to leave the current
  # setting unchanged.
  scalars_only = all(isinstance(a, _DeferredScalar) for a in args)
  with expressions.allow_non_parallel_operations(scalars_only or None):
    inputs = [self._expr]
    inputs.extend(a._expr for a in args)
    return DeferredFrame.wrap(
        expressions.ComputedExpression(
            label,
            func,
            inputs,
            requires_partition_by=partitionings.Singleton()))
def test_rename(self):
  """rename with and without errors='raise'."""
  frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

  self.run_scenario(
      frame, lambda df: df.rename(columns={'B': 'C'}, index={
          0: 2, 2: 0
      }))

  # errors='raise' needs the non-parallel-operations scope here.
  with expressions.allow_non_parallel_operations():
    self.run_scenario(
        frame,
        lambda df: df.rename(
            columns={'B': 'C'}, index={
                0: 2, 2: 0
            }, errors='raise'))
def test_filter(self):
  """filter by items/regex on columns and index, plus agg scenarios."""
  zoo = pd.DataFrame({
      'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
      'Speed': [5, 2, 35, 40],
  })
  self.run_scenario(zoo, lambda df: df.filter(items=['Animal']))
  self.run_scenario(zoo, lambda df: df.filter(regex='Anim.*'))
  self.run_scenario(
      zoo, lambda df: df.set_index('Animal').filter(regex='F.*', axis='index'))

  with expressions.allow_non_parallel_operations():
    single = pd.DataFrame({'col': [1, 2, 3]})
    self.run_scenario(single, lambda a: a.agg(sum))
    self.run_scenario(single, lambda a: a.agg(['mean', 'min', 'max']))
def nsmallest(self, **kwargs):
  """Deferred nsmallest.

  An explicit keep value other than 'all' raises WontImplementError,
  since that result would be order-sensitive.
  """
  if kwargs.get('keep', 'all') != 'all':
    raise frame_base.WontImplementError('order-sensitive')

  # Take the n smallest of each partition, then the n smallest of those
  # partial results in a final non-parallel stage.
  partial = expressions.ComputedExpression(
      'nsmallest-per-partition',
      lambda df: df.nsmallest(**kwargs),
      [self._expr],
      preserves_partition_by=partitionings.Singleton(),
      requires_partition_by=partitionings.Nothing())
  with expressions.allow_non_parallel_operations(True):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'nsmallest',
            lambda df: df.nsmallest(**kwargs),
            [partial],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))
def teststrings(texts, report=False, **runner_kwargs):
  """Runs the doctests in each named text of `texts` with a shared runner."""
  flags = runner_kwargs.pop('optionflags', 0)
  flags |= doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL

  runner = BeamDataframeDoctestRunner(
      TestEnvironment(), optionflags=flags, **runner_kwargs)
  parser = doctest.DocTestParser()
  globs = {
      'pd': runner.fake_pandas_module(),
      'np': np,
      'option_context': pd.option_context,
  }

  with expressions.allow_non_parallel_operations():
    for name, text in texts.items():
      runner.run(parser.get_doctest(text, globs, name, name, 0))
  if report:
    runner.summarize()
  return runner.summary().result()
def nlargest(self, keep, **kwargs):
  """Deferred nlargest.

  keep must be 'all' or 'any'; 'any' is translated to pandas' 'first'
  before delegating. Anything else raises WontImplementError as
  order-sensitive.
  """
  # TODO(robertwb): Document 'any' option.
  # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no
  # explicit keep parameter is requested.
  if keep not in ('any', 'all'):
    raise frame_base.WontImplementError('order-sensitive')
  kwargs['keep'] = 'first' if keep == 'any' else keep

  # Take the n largest of each partition, then the n largest of those
  # partial results in a final non-parallel stage.
  partial = expressions.ComputedExpression(
      'nlargest-per-partition',
      lambda df: df.nlargest(**kwargs),
      [self._expr],
      preserves_partition_by=partitionings.Singleton(),
      requires_partition_by=partitionings.Nothing())
  with expressions.allow_non_parallel_operations(True):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'nlargest',
            lambda df: df.nlargest(**kwargs),
            [partial],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))
def dot(self, other):
  """Deferred DataFrame dot product with a deferred Series or DataFrame.

  Non-deferred operands are rejected (the result would be non-deferred).
  Partial products are computed per index partition, then summed in a
  final non-parallel stage.
  """
  left = self._expr
  if isinstance(other, DeferredSeries):
    # Lift the Series to a one-column DataFrame so both operands have the
    # same shape of intermediate result.
    right = expressions.ComputedExpression(
        'to_dataframe',
        pd.DataFrame,
        [other._expr],
        requires_partition_by=partitionings.Nothing(),
        preserves_partition_by=partitionings.Index())
    right_is_series = True
  elif isinstance(other, DeferredDataFrame):
    right = other._expr
    right_is_series = False
  else:
    raise frame_base.WontImplementError('non-deferred result')

  dots = expressions.ComputedExpression(
      'dot',
      # Transpose so we can sum across rows.
      (lambda left, right: pd.DataFrame(left @ right).T),
      [left, right],
      requires_partition_by=partitionings.Index())
  with expressions.allow_non_parallel_operations(True):
    sums = expressions.ComputedExpression(
        'sum',
        lambda dots: dots.sum(),
        [dots],
        requires_partition_by=partitionings.Singleton())

    if right_is_series:
      # A Series operand yields a Series result: extract the single column.
      result = expressions.ComputedExpression(
          'extract',
          lambda df: df[0],
          [sums],
          requires_partition_by=partitionings.Singleton())
    else:
      result = sums
    return frame_base.DeferredFrame.wrap(result)
def test_aggregate(self):
  """Single and multi-aggregator agg scenarios on a one-column frame."""
  with expressions.allow_non_parallel_operations():
    frame = pd.DataFrame({'col': [1, 2, 3]})
    self.run_scenario(frame, lambda a: a.agg(sum))
    self.run_scenario(frame, lambda a: a.agg(['mean', 'min', 'max']))