class DeferredSeries(frame_base.DeferredFrame):
  """Deferred (lazy) stand-in for a pandas Series.

  Methods build ComputedExpressions over a proxy instead of executing
  eagerly; operations whose results cannot be deferred raise
  WontImplementError.
  """

  def __array__(self, dtype=None):
    # Implicit materialization (e.g. np.asarray(series)) is rejected.
    # (Fixed grammar of the original message: "a non-deferred a numpy".)
    raise frame_base.WontImplementError(
        'Conversion to a non-deferred numpy array.')

  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  # transform is elementwise only along axis 0.
  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def agg(self, *args, **kwargs):
    """Deferred Series.agg; forces a single partition (global aggregate)."""
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'agg',
            lambda df: df.agg(*args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))

  all = frame_base._associative_agg_method('all')
  any = frame_base._associative_agg_method('any')
  min = frame_base._associative_agg_method('min')
  max = frame_base._associative_agg_method('max')
  prod = product = frame_base._associative_agg_method('prod')
  sum = frame_base._associative_agg_method('sum')

  # Rejected as order-sensitive.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')

  def replace(
      self, to_replace=None, value=None, inplace=False, limit=None,
      *args, **kwargs):
    """Deferred Series.replace.

    A global replacement count (limit) requires a single partition;
    otherwise no particular partitioning is needed.  inplace is emulated by
    swapping the underlying expression; the deferred computation always
    calls pandas with inplace=False.
    """
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    result = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(
                to_replace, value, False, limit, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))
    if inplace:
      self._expr = result._expr
    else:
      return result

  def unstack(self, *args, **kwargs):
    # The resulting columns would depend on the data, which is not known yet.
    raise frame_base.WontImplementError('non-deferred column values')
def __getattr__(self, name):
  # Fallback attribute lookup; presumably a method of a pandas-module proxy
  # class whose header is not visible in this chunk -- TODO confirm.
  # read_* entry points are redirected to apache_beam.dataframe.io.
  if name.startswith('read_'):
    return frame_base.wont_implement_method(
        'Use p | apache_beam.dataframe.io.%s' % name)
  # Anything else is looked up on pandas itself; top-level pandas functions
  # are wrapped as "not implemented", other attributes pass through as-is.
  res = getattr(pd, name)
  if _is_top_level_function(res):
    return frame_base.not_implemented_method(name)
  else:
    return res
class DeferredDataFrame(frame_base.DeferredFrame):
  """Deferred (lazy) stand-in for a pandas DataFrame.

  Methods build ComputedExpressions over a proxy, attaching per-operation
  partitioning requirements, rather than executing eagerly.
  """

  @property
  def T(self):
    # Delegates to transpose(), which rejects the operation.
    return self.transpose()

  def groupby(self, cols):
    # TODO: what happens to the existing index?
    # We set the columns to index as we have a notion of being partitioned by
    # index, but not partitioned by an arbitrary subset of columns.
    return DeferredGroupBy(
        expressions.ComputedExpression(
            'groupbyindex',
            lambda df: df.groupby(level=list(range(df.index.nlevels))),
            [self.set_index(cols)._expr],
            requires_partition_by=partitionings.Index(),
            preserves_partition_by=partitionings.Singleton()))

  def __getattr__(self, name):
    # Column attribute access.
    if name in self._expr.proxy().columns:
      return self[name]
    else:
      return object.__getattribute__(self, name)

  def __getitem__(self, key):
    # Only plain single-column selection is supported.
    if key in self._expr.proxy().columns:
      return self._elementwise(lambda df: df[key], 'get_column')
    else:
      raise NotImplementedError(key)

  def __setitem__(self, key, value):
    if isinstance(key, str):
      # yapf: disable
      return self._elementwise(
          lambda df, key, value: df.__setitem__(key, value),
          'set_column', (key, value),
          inplace=True)
    else:
      raise NotImplementedError(key)

  def set_index(self, keys, **kwargs):
    # Only existing columns may become the index.
    if isinstance(keys, str):
      keys = [keys]
    if not set(keys).issubset(self._expr.proxy().columns):
      raise NotImplementedError(keys)
    return self._elementwise(
        lambda df: df.set_index(keys, **kwargs),
        'set_index',
        inplace=kwargs.get('inplace', False))

  def at(self, *args, **kwargs):
    raise NotImplementedError()

  @property
  def loc(self):
    return _DeferredLoc(self)

  def aggregate(self, *args, **kwargs):
    # axis=None aggregates over both axes: first across columns, then rows.
    if 'axis' in kwargs and kwargs['axis'] is None:
      return self.agg(*args, **dict(kwargs, axis=1)).agg(
          *args, **dict(kwargs, axis=0))
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(*args, **kwargs),
            [self._expr],
            # TODO(robertwb): Sub-aggregate when possible.
            requires_partition_by=partitionings.Singleton()))

  agg = aggregate

  applymap = frame_base._elementwise_method('applymap')
  memory_usage = frame_base.wont_implement_method('non-deferred value')
  all = frame_base._associative_agg_method('all')
  any = frame_base._associative_agg_method('any')
  # Rejected as order-sensitive.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')
  max = frame_base._associative_agg_method('max')
  min = frame_base._associative_agg_method('min')
  mode = frame_base._agg_method('mode')

  def dropna(
      self, axis=0, how='any', thresh=None, subset=None, inplace=False,
      *args, **kwargs):
    # TODO(robertwb): This is a common pattern. Generalize?
    # Dropping columns (axis=1) requires a global view (single partition);
    # dropping rows needs no particular partitioning.
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Singleton()
    else:
      requires_partition_by = partitionings.Nothing()
    result = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dropna',
            lambda df: df.dropna(
                axis, how, thresh, subset, False, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))
    if inplace:
      self._expr = result._expr
    else:
      return result

  items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
      'non-lazy')

  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  prod = product = frame_base._associative_agg_method('prod')

  def quantile(self, q=0.5, axis=0, *args, **kwargs):
    # axis != 0 would produce data-dependent columns.
    if axis != 0:
      raise frame_base.WontImplementError('non-deferred column values')
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'quantile',
            lambda df: df.quantile(q, axis, *args, **kwargs),
            [self._expr],
            #TODO(robertwb): Approximate quantiles?
            requires_partition_by=partitionings.Singleton(),
            preserves_partition_by=partitionings.Singleton()))

  query = frame_base._elementwise_method('query')

  def replace(
      self, to_replace=None, value=None, inplace=False, limit=None,
      *args, **kwargs):
    # A global replacement count (limit) forces a single partition.
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    result = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(
                to_replace, value, False, limit, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))
    if inplace:
      self._expr = result._expr
    else:
      return result

  def reset_index(self, level=None, drop=False, inplace=False, *args, **kwargs):
    if level is not None and not isinstance(level, (tuple, list)):
      level = [level]
    if level is None or len(level) == len(self._expr.proxy().index.levels):
      # TODO: Could do distributed re-index with offsets.
      requires_partition_by = partitionings.Singleton()
    else:
      requires_partition_by = partitionings.Nothing()
    result = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'reset_index',
            lambda df: df.reset_index(level, drop, False, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))
    if inplace:
      self._expr = result._expr
    else:
      return result

  round = frame_base._elementwise_method('round')
  select_dtypes = frame_base._elementwise_method('select_dtypes')

  def shift(self, periods=1, freq=None, axis=0, *args, **kwargs):
    # Shifting along the index requires a single partition; shifting
    # columns needs no particular partitioning.
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'shift',
            lambda df: df.shift(periods, freq, axis, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  @property
  def shape(self):
    # The row count is not known until execution.
    raise frame_base.WontImplementError('scalar value')

  def sort_values(
      self, by, axis=0, ascending=True, inplace=False, *args, **kwargs):
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    result = frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'sort_values',
            lambda df: df.sort_values(
                by, axis, ascending, False, *args, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))
    if inplace:
      self._expr = result._expr
    else:
      return result

  stack = frame_base._elementwise_method('stack')
  sum = frame_base._associative_agg_method('sum')

  to_records = to_dict = to_numpy = to_string = (
      frame_base.wont_implement_method('non-deferred value'))

  to_sparse = to_string  # frame_base._elementwise_method('to_sparse')

  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def transpose(self, *args, **kwargs):
    raise frame_base.WontImplementError('non-deferred column values')

  def unstack(self, *args, **kwargs):
    # Only a single-level index can be unstacked without inspecting data.
    if self._expr.proxy().index.nlevels == 1:
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'unstack',
              lambda df: df.unstack(*args, **kwargs),
              [self._expr],
              requires_partition_by=partitionings.Index()))
    else:
      raise frame_base.WontImplementError('non-deferred column values')

  update = frame_base._proxy_method(
      'update',
      inplace=True,
      requires_partition_by=partitionings.Index(),
      preserves_partition_by=partitionings.Index())
# NOTE(review): this chunk begins mid-iteration -- the first two assignments
# below are clearly the body of a loop over binary formats (compare the
# parallel, older copy of this registration code elsewhere in this file,
# which loops `for format in ('excel', 'feather', 'parquet', 'stata'):`).
# The truncated loop header is restored here; TODO confirm the exact format
# tuple against upstream.
for format in ('excel', 'feather', 'parquet', 'stata'):
  # Expose read_<format>/to_<format>, with docstrings copied from pandas.
  globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
      _binary_reader(format))
  globals()['to_%s' % format] = frame_base.with_docs_from(pd.DataFrame)(
      _binary_writer(format))

# Read-only formats that only exist in some pandas versions.
for format in ('sas', 'spss'):
  if hasattr(pd, 'read_%s' % format):  # Depends on pandas version.
    globals()['read_%s' % format] = frame_base.with_docs_from(pd)(
        _binary_reader(format))

# Clipboard I/O is left unimplemented.
read_clipboard = frame_base.not_implemented_method(
    'read_clipboard', base_type=pd)
to_clipboard = frame_base.not_implemented_method(
    'to_clipboard', base_type=pd.DataFrame)

# msgpack support is deprecated in pandas.
read_msgpack = frame_base.wont_implement_method(
    pd, 'read_msgpack', reason="deprecated")
to_msgpack = frame_base.wont_implement_method(
    pd.DataFrame, 'to_msgpack', reason="deprecated")
# HDF5 is a random access format, which does not fit this model.
read_hdf = frame_base.wont_implement_method(
    pd, 'read_hdf', explanation="because HDF5 is a random access file format")
to_hdf = frame_base.wont_implement_method(
    pd.DataFrame,
    'to_hdf',
    explanation="because HDF5 is a random access file format")

# Any remaining pd.read_* entry points default to "not implemented".
for name in dir(pd):
  if name.startswith('read_') and name not in globals():
    globals()[name] = frame_base.not_implemented_method(name, base_type=pd)
def _binary_writer(format):
  # Returns a deferred writer: converts the deferred frame to a PCollection
  # and applies the _WriteToPandas transform for the given format.
  return (
      lambda df, path, *args, **kwargs: _as_pc(df) | _WriteToPandas(
          f'to_{format}', path, args, kwargs))


# Register read_/to_ pairs for the binary formats pandas supports.
for format in ('excel', 'feather', 'parquet', 'stata'):
  globals()['read_%s' % format] = _binary_reader(format)
  globals()['to_%s' % format] = _binary_writer(format)

# Read-only formats that only exist in some pandas versions.
for format in ('sas', 'spss'):
  if hasattr(pd, 'read_%s' % format):  # Depends on pandas version.
    globals()['read_%s' % format] = _binary_reader(format)

# Explicitly rejected entry points.
read_clipboard = to_clipboard = frame_base.wont_implement_method('clipboard')
read_msgpack = to_msgpack = frame_base.wont_implement_method('deprecated')
read_hdf = to_hdf = frame_base.wont_implement_method('random access files')

# Any remaining pd.read_* entry points default to "not implemented".
for name in dir(pd):
  if name.startswith('read_') and name not in globals():
    globals()[name] = frame_base.not_implemented_method(name)


def _prefix_range_index_with(prefix, df):
  # If df has a default RangeIndex, replace it with a string index whose
  # entries are the old positions prefixed with `prefix` (e.g. to keep
  # indices unique across shards); other index types pass through unchanged.
  if isinstance(df.index, pd.RangeIndex):
    return df.set_index(prefix + df.index.map(str).astype(str))
  else:
    return df
class DeferredSeries(frame_base.DeferredFrame):
  """Deferred (lazy) stand-in for a pandas Series.

  Methods build ComputedExpressions over a proxy instead of executing
  eagerly; operations whose results cannot be deferred raise
  WontImplementError.
  """

  def __array__(self, dtype=None):
    # Implicit materialization (e.g. np.asarray(series)) is rejected.
    # (Fixed grammar of the original message: "a non-deferred a numpy".)
    raise frame_base.WontImplementError(
        'Conversion to a non-deferred numpy array.')

  astype = frame_base._elementwise_method('astype')
  between = frame_base._elementwise_method('between')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def dropna(self, **kwargs):
    """Deferred Series.dropna; elementwise, no partitioning required."""
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dropna',
            lambda df: df.dropna(**kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  items = iteritems = frame_base.wont_implement_method('non-lazy')

  isin = frame_base._elementwise_method('isin')
  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def fillna(self, value, method):
    """Deferred Series.fillna.

    method-based filling (ffill/bfill) is rejected as order-sensitive.
    `value` may itself be deferred, in which case its expression is joined
    into the computation.
    """
    if method is not None:
      raise frame_base.WontImplementError('order-sensitive')
    if isinstance(value, frame_base.DeferredBase):
      value_expr = value._expr
    else:
      value_expr = expressions.ConstantExpression(value)
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'fillna',
            lambda df, value: df.fillna(value, method=method),
            [self._expr, value_expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def aggregate(self, func, axis=0, *args, **kwargs):
    """Deferred Series.agg.

    Associative single aggregations are pre-aggregated per partition before
    the final (non-parallel) combine; everything else is computed on a
    single partition.
    """
    if isinstance(func, list) and len(func) > 1:
      # Aggregate each column separately, then stick them all together.
      rows = [self.agg([f], *args, **kwargs) for f in func]
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'join_aggregate',
              lambda *rows: pd.concat(rows),
              [row._expr for row in rows]))
    else:
      # We're only handling a single column.
      base_func = func[0] if isinstance(func, list) else func
      if _is_associative(base_func) and not args and not kwargs:
        intermediate = expressions.elementwise_expression(
            'pre_aggregate',
            lambda s: s.agg([base_func], *args, **kwargs),
            [self._expr])
        allow_nonparallel_final = True
      else:
        intermediate = self._expr
        allow_nonparallel_final = None  # i.e. don't change the value
      with expressions.allow_non_parallel_operations(
          allow_nonparallel_final):
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'aggregate',
                lambda s: s.agg(func, *args, **kwargs),
                [intermediate],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Singleton()))

  agg = aggregate

  all = frame_base._agg_method('all')
  any = frame_base._agg_method('any')
  min = frame_base._agg_method('min')
  max = frame_base._agg_method('max')
  prod = product = frame_base._agg_method('prod')
  sum = frame_base._agg_method('sum')

  # Rejected as order-sensitive.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def replace(self, limit, **kwargs):
    """Deferred Series.replace; a global `limit` forces one partition."""
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(limit=limit, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  round = frame_base._elementwise_method('round')
  searchsorted = frame_base.wont_implement_method('order-sensitive')
  shift = frame_base.wont_implement_method('order-sensitive')
  take = frame_base.wont_implement_method('deprecated')
  to_dict = frame_base.wont_implement_method('non-deferred')
  to_frame = frame_base._elementwise_method('to_frame')

  def unique(self, as_series=False):
    """Deferred Series.unique; the result is returned as a Series since a
    deferred bare array is not supported."""
    if not as_series:
      raise frame_base.WontImplementError(
          'pass as_series=True to get the result as a (deferred) Series')
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'unique',
            lambda df: pd.Series(df.unique()),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))

  def update(self, other):
    # In-place update: swap in a new expression joining self with other
    # (the `or df` returns df since pandas' update returns None).
    self._expr = expressions.ComputedExpression(
        'update',
        lambda df, other: df.update(other) or df,
        [self._expr, other._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Index())

  unstack = frame_base.wont_implement_method('non-deferred column values')
  values = property(frame_base.wont_implement_method('non-deferred'))
  view = frame_base.wont_implement_method('memory sharing semantics')
class DeferredDataFrame(frame_base.DeferredFrame):
  """Deferred (lazy) stand-in for a pandas DataFrame.

  Uses args_to_kwargs/populate_defaults/maybe_inplace decorators to
  normalize call signatures against pandas' own before deferring.
  """

  @property
  def T(self):
    # Delegates to transpose(), which rejects the operation.
    return self.transpose()

  def groupby(self, cols):
    # TODO: what happens to the existing index?
    # We set the columns to index as we have a notion of being partitioned by
    # index, but not partitioned by an arbitrary subset of columns.
    return DeferredGroupBy(
        expressions.ComputedExpression(
            'groupbyindex',
            lambda df: df.groupby(level=list(range(df.index.nlevels))),
            [self.set_index(cols)._expr],
            requires_partition_by=partitionings.Index(),
            preserves_partition_by=partitionings.Singleton()))

  def __getattr__(self, name):
    # Column attribute access.
    if name in self._expr.proxy().columns:
      return self[name]
    else:
      return object.__getattribute__(self, name)

  def __getitem__(self, key):
    # Only plain single-column selection is supported.
    if key in self._expr.proxy().columns:
      return self._elementwise(lambda df: df[key], 'get_column')
    else:
      raise NotImplementedError(key)

  def __setitem__(self, key, value):
    if isinstance(key, str):
      # yapf: disable
      return self._elementwise(
          lambda df, key, value: df.__setitem__(key, value),
          'set_column', (key, value),
          inplace=True)
    else:
      raise NotImplementedError(key)

  def set_index(self, keys, **kwargs):
    # Only existing columns may become the index.
    if isinstance(keys, str):
      keys = [keys]
    if not set(keys).issubset(self._expr.proxy().columns):
      raise NotImplementedError(keys)
    return self._elementwise(
        lambda df: df.set_index(keys, **kwargs),
        'set_index',
        inplace=kwargs.get('inplace', False))

  def at(self, *args, **kwargs):
    raise NotImplementedError()

  @property
  def loc(self):
    return _DeferredLoc(self)

  def aggregate(self, func, axis=0, *args, **kwargs):
    """Deferred DataFrame.agg with per-branch partitioning strategies."""
    if axis is None:
      # Aggregate across all elements by first aggregating across columns,
      # then across rows.
      return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
          func, *args, **dict(kwargs, axis=0))
    elif axis in (1, 'columns'):
      # This is an easy elementwise aggregation.
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'aggregate',
              lambda df: df.agg(func, axis=1, *args, **kwargs),
              [self._expr],
              requires_partition_by=partitionings.Nothing()))
    elif len(self._expr.proxy().columns) == 0 or args or kwargs:
      # For these corner cases, just colocate everything.
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'aggregate',
              lambda df: df.agg(func, *args, **kwargs),
              [self._expr],
              requires_partition_by=partitionings.Singleton()))
    else:
      # In the general case, compute the aggregation of each column
      # separately, then recombine.
      if not isinstance(func, dict):
        col_names = list(self._expr.proxy().columns)
        func = {col: func for col in col_names}
      else:
        col_names = list(func.keys())
      aggregated_cols = []
      for col in col_names:
        funcs = func[col]
        if not isinstance(funcs, list):
          funcs = [funcs]
        aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
      # The final shape is different depending on whether any of the columns
      # were aggregated by a list of aggregators.
      with expressions.allow_non_parallel_operations():
        if any(isinstance(funcs, list) for funcs in func.values()):
          return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'join_aggregate',
                  lambda *cols: pd.DataFrame(
                      {col: value for col, value in zip(col_names, cols)}),
                  [col._expr for col in aggregated_cols],
                  requires_partition_by=partitionings.Singleton()))
        else:
          return frame_base.DeferredFrame.wrap(
              expressions.ComputedExpression(
                  'join_aggregate',
                  lambda *cols: pd.Series(
                      {col: value[0] for col, value in zip(col_names, cols)}),
                  [col._expr for col in aggregated_cols],
                  requires_partition_by=partitionings.Singleton(),
                  proxy=self._expr.proxy().agg(func, *args, **kwargs)))

  agg = aggregate

  applymap = frame_base._elementwise_method('applymap')
  memory_usage = frame_base.wont_implement_method('non-deferred value')
  all = frame_base._agg_method('all')
  any = frame_base._agg_method('any')
  # Rejected as order-sensitive.
  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')
  max = frame_base._agg_method('max')
  min = frame_base._agg_method('min')

  def mode(self, axis=0, *args, **kwargs):
    # Row-wise mode would produce data-dependent columns.
    if axis == 1 or axis == 'columns':
      raise frame_base.WontImplementError('non-deferred column values')
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'mode',
            lambda df: df.mode(*args, **kwargs),
            [self._expr],
            #TODO(robertwb): Approximate?
            requires_partition_by=partitionings.Singleton(),
            preserves_partition_by=partitionings.Singleton()))

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def dropna(self, axis, **kwargs):
    # TODO(robertwb): This is a common pattern. Generalize?
    # Dropping columns (axis=1) needs a global view; rows do not.
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Singleton()
    else:
      requires_partition_by = partitionings.Nothing()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dropna',
            lambda df: df.dropna(axis=axis, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
      'non-lazy')

  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def fillna(self, value, method, axis, **kwargs):
    # Index-direction method fills (ffill/bfill) are order-sensitive.
    if method is not None and axis in (0, 'index'):
      raise frame_base.WontImplementError('order-sensitive')
    if isinstance(value, frame_base.DeferredBase):
      value_expr = value._expr
    else:
      value_expr = expressions.ConstantExpression(value)
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'fillna',
            lambda df, value: df.fillna(
                value, method=method, axis=axis, **kwargs),
            [self._expr, value_expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  prod = product = frame_base._agg_method('prod')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def quantile(self, axis, **kwargs):
    # Row-wise quantiles would produce data-dependent columns.
    if axis == 1 or axis == 'columns':
      raise frame_base.WontImplementError('non-deferred column values')
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'quantile',
            lambda df: df.quantile(axis=axis, **kwargs),
            [self._expr],
            #TODO(robertwb): Approximate quantiles?
            requires_partition_by=partitionings.Singleton(),
            preserves_partition_by=partitionings.Singleton()))

  query = frame_base._elementwise_method('query')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def replace(self, limit, **kwargs):
    # A global replacement count (limit) forces a single partition.
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(limit=limit, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def reset_index(self, level, **kwargs):
    if level is not None and not isinstance(level, (tuple, list)):
      level = [level]
    if level is None or len(level) == len(self._expr.proxy().index.levels):
      # TODO: Could do distributed re-index with offsets.
      requires_partition_by = partitionings.Singleton()
    else:
      requires_partition_by = partitionings.Nothing()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'reset_index',
            lambda df: df.reset_index(level=level, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  round = frame_base._elementwise_method('round')
  select_dtypes = frame_base._elementwise_method('select_dtypes')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  def shift(self, axis, **kwargs):
    # Shifting along the index requires a single partition.
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'shift',
            lambda df: df.shift(axis=axis, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  @property
  def shape(self):
    # The row count is not known until execution.
    raise frame_base.WontImplementError('scalar value')

  @frame_base.args_to_kwargs(pd.DataFrame)
  @frame_base.populate_defaults(pd.DataFrame)
  @frame_base.maybe_inplace
  def sort_values(self, axis, **kwargs):
    if axis == 1 or axis == 'columns':
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'sort_values',
            lambda df: df.sort_values(axis=axis, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  stack = frame_base._elementwise_method('stack')
  sum = frame_base._agg_method('sum')
  take = frame_base.wont_implement_method('deprecated')

  to_records = to_dict = to_numpy = to_string = (
      frame_base.wont_implement_method('non-deferred value'))

  to_sparse = to_string  # frame_base._elementwise_method('to_sparse')

  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def transpose(self, *args, **kwargs):
    raise frame_base.WontImplementError('non-deferred column values')

  def unstack(self, *args, **kwargs):
    # Only a single-level index can be unstacked without inspecting data.
    if self._expr.proxy().index.nlevels == 1:
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'unstack',
              lambda df: df.unstack(*args, **kwargs),
              [self._expr],
              requires_partition_by=partitionings.Index()))
    else:
      raise frame_base.WontImplementError('non-deferred column values')

  update = frame_base._proxy_method(
      'update',
      inplace=True,
      requires_partition_by=partitionings.Index(),
      preserves_partition_by=partitionings.Index())
class DeferredPandasModule(object):
  """Deferred stand-in for the top-level `pandas` module namespace.

  Attributes either defer directly to pandas, dispatch to the first
  (deferred) argument, or are explicitly rejected.  Two duplicate
  assignments present in the original (`melt` and `notna` were each bound
  twice to identical values) have been removed.
  """

  array = _defer_to_pandas('array')
  bdate_range = _defer_to_pandas('bdate_range')

  @staticmethod
  @frame_base.args_to_kwargs(pd)
  @frame_base.populate_defaults(pd)
  def concat(
      objs,
      axis,
      join,
      ignore_index,
      keys,
      levels,
      names,
      verify_integrity,
      sort,
      copy):
    """Deferred pd.concat.

    ignore_index and levels are not supported.  NOTE(review): `sort` and
    `copy` are accepted but not forwarded to pd.concat below -- confirm
    this is intentional.
    """
    if ignore_index:
      raise NotImplementedError('concat(ignore_index)')
    if levels:
      raise NotImplementedError('concat(levels)')
    if isinstance(objs, Mapping):
      if keys is None:
        keys = list(objs.keys())
      objs = [objs[k] for k in keys]
    else:
      objs = list(objs)
    # None entries are carried through as deferred constants.
    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]
    # Column-wise concat and integrity verification both need the inputs
    # co-partitioned by index.
    if axis in (1, 'columns'):
      required_partitioning = partitionings.Index()
    elif verify_integrity:
      required_partitioning = partitionings.Index()
    else:
      required_partitioning = partitionings.Nothing()
    return frame_base.DeferredBase.wrap(
        expressions.ComputedExpression(
            'concat',
            lambda *objs: pd.concat(
                objs,
                axis=axis,
                join=join,
                ignore_index=ignore_index,
                keys=keys,
                levels=levels,
                names=names,
                verify_integrity=verify_integrity),  # yapf break
            exprs,
            requires_partition_by=required_partitioning,
            preserves_partition_by=partitionings.Index()))

  date_range = _defer_to_pandas('date_range')
  describe_option = _defer_to_pandas('describe_option')
  factorize = _call_on_first_arg('factorize')
  get_option = _defer_to_pandas('get_option')
  interval_range = _defer_to_pandas('interval_range')
  isna = _call_on_first_arg('isna')
  isnull = _call_on_first_arg('isnull')
  json_normalize = _defer_to_pandas('json_normalize')
  melt = _call_on_first_arg('melt')
  merge = _call_on_first_arg('merge')
  merge_ordered = frame_base.wont_implement_method('order-sensitive')
  notna = _call_on_first_arg('notna')
  notnull = _call_on_first_arg('notnull')
  option_context = _defer_to_pandas('option_context')
  period_range = _defer_to_pandas('period_range')
  pivot = _call_on_first_arg('pivot')
  pivot_table = _call_on_first_arg('pivot_table')
  show_versions = _defer_to_pandas('show_versions')
  test = frame_base.wont_implement_method('test')
  timedelta_range = _defer_to_pandas('timedelta_range')
  to_pickle = frame_base.wont_implement_method('order-sensitive')

  def __getattr__(self, name):
    # read_* entry points must go through apache_beam.dataframe.io instead.
    if name.startswith('read_'):
      return frame_base.wont_implement_method(
          'Use p | apache_beam.dataframe.io.%s' % name)
    res = getattr(pd, name)
    if _is_top_level_function(res):
      return frame_base.not_implemented_method(name)
    else:
      return res
class DeferredDataFrame(frame_base.DeferredFrame): @property def T(self): return self.transpose() @property def columns(self): return self._expr.proxy().columns def groupby(self, by): # TODO: what happens to the existing index? # We set the columns to index as we have a notion of being partitioned by # index, but not partitioned by an arbitrary subset of columns. return DeferredGroupBy( expressions.ComputedExpression( 'groupbyindex', lambda df: df.groupby(level=list(range(df.index.nlevels))), [self.set_index(by)._expr], requires_partition_by=partitionings.Index(), preserves_partition_by=partitionings.Singleton())) def __getattr__(self, name): # Column attribute access. if name in self._expr.proxy().columns: return self[name] else: return object.__getattribute__(self, name) def __getitem__(self, key): # TODO: Replicate pd.DataFrame.__getitem__ logic if isinstance(key, frame_base.DeferredBase): # Fail early if key is a DeferredBase as it interacts surprisingly with # key in self._expr.proxy().columns raise NotImplementedError( "Indexing with a deferred frame is not yet supported. 
Consider " "using df.loc[...]") if (isinstance(key, list) and all(key_column in self._expr.proxy().columns for key_column in key)) or key in self._expr.proxy().columns: return self._elementwise(lambda df: df[key], 'get_column') else: raise NotImplementedError(key) def __contains__(self, key): # Checks if proxy has the given column return self._expr.proxy().__contains__(key) def __setitem__(self, key, value): if isinstance(key, str): # yapf: disable return self._elementwise( lambda df, key, value: df.__setitem__(key, value), 'set_column', (key, value), inplace=True) else: raise NotImplementedError(key) @frame_base.args_to_kwargs(pd.DataFrame) @frame_base.populate_defaults(pd.DataFrame) @frame_base.maybe_inplace def set_index(self, keys, **kwargs): if isinstance(keys, str): keys = [keys] if not set(keys).issubset(self._expr.proxy().columns): raise NotImplementedError(keys) return frame_base.DeferredFrame.wrap( expressions.ComputedExpression( 'set_index', lambda df: df.set_index(keys, **kwargs), [self._expr], requires_partition_by=partitionings.Nothing(), preserves_partition_by=partitionings.Nothing())) at = frame_base.not_implemented_method('at') @property def loc(self): return _DeferredLoc(self) _get_index = _set_index = frame_base.not_implemented_method('index') index = property(_get_index, _set_index) @property def axes(self): return (self.index, self.columns) apply = frame_base.not_implemented_method('apply') explode = frame_base.not_implemented_method('explode') isin = frame_base.not_implemented_method('isin') assign = frame_base.not_implemented_method('assign') append = frame_base.not_implemented_method('append') combine = frame_base.not_implemented_method('combine') combine_first = frame_base.not_implemented_method('combine_first') cov = frame_base.not_implemented_method('cov') corr = frame_base.not_implemented_method('corr') count = frame_base.not_implemented_method('count') drop = frame_base.not_implemented_method('drop') eval = 
# Methods and deferred-attribute declarations of DeferredDataFrame.
# (The `class DeferredDataFrame(frame_base.DeferredFrame):` statement appears
# earlier in the file, outside this chunk.)

# NOTE(review): the assignment target of this first expression is truncated at
# the chunk boundary (presumably `eval = `, preceding this view) — left
# verbatim rather than guessed.
frame_base.not_implemented_method('eval')
reindex = frame_base.not_implemented_method('reindex')
melt = frame_base.not_implemented_method('melt')
pivot = frame_base.not_implemented_method('pivot')
pivot_table = frame_base.not_implemented_method('pivot_table')

def aggregate(self, func, axis=0, *args, **kwargs):
  """Deferred DataFrame.aggregate.

  Dispatches on `axis`: column-wise aggregation is elementwise per row;
  row-wise aggregation is computed per column and recombined.
  """
  if axis is None:
    # Aggregate across all elements by first aggregating across columns,
    # then across rows.
    return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
        func, *args, **dict(kwargs, axis=0))
  elif axis in (1, 'columns'):
    # This is an easy elementwise aggregation.
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(func, axis=1, *args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Nothing()))
  elif len(self._expr.proxy().columns) == 0 or args or kwargs:
    # For these corner cases, just colocate everything.
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'aggregate',
            lambda df: df.agg(func, *args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Singleton()))
  else:
    # In the general case, compute the aggregation of each column separately,
    # then recombine.
    if not isinstance(func, dict):
      col_names = list(self._expr.proxy().columns)
      func = {col: func for col in col_names}
    else:
      col_names = list(func.keys())
    aggregated_cols = []
    for col in col_names:
      funcs = func[col]
      if not isinstance(funcs, list):
        funcs = [funcs]
      aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
    # The final shape is different depending on whether any of the columns
    # were aggregated by a list of aggregators.
    with expressions.allow_non_parallel_operations():
      if any(isinstance(funcs, list) for funcs in func.values()):
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'join_aggregate',
                lambda *cols: pd.DataFrame(
                    {col: value for col, value in zip(col_names, cols)}),
                [col._expr for col in aggregated_cols],
                requires_partition_by=partitionings.Singleton()))
      else:
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'join_aggregate',
                lambda *cols: pd.Series(
                    {col: value[0] for col, value in zip(col_names, cols)}),
                [col._expr for col in aggregated_cols],
                requires_partition_by=partitionings.Singleton(),
                proxy=self._expr.proxy().agg(func, *args, **kwargs)))

agg = aggregate

applymap = frame_base._elementwise_method('applymap')

memory_usage = frame_base.wont_implement_method('non-deferred value')
info = frame_base.wont_implement_method('non-deferred value')

all = frame_base._agg_method('all')
any = frame_base._agg_method('any')

cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
    'order-sensitive')
diff = frame_base.wont_implement_method('order-sensitive')

def dot(self, other):
  """Deferred matrix multiplication (``df @ other``)."""
  # We want to broadcast the right hand side to all partitions of the left.
  # This is OK, as its index must be the same size as the columns set of
  # self, so cannot be too large.
  class AsScalar(object):
    # Wrapper marking a value to be treated as a broadcast (scalar-like)
    # side input rather than a partitioned frame.
    def __init__(self, value):
      self.value = value

  if isinstance(other, frame_base.DeferredFrame):
    proxy = other._expr.proxy()
    with expressions.allow_non_parallel_operations():
      side = expressions.ComputedExpression(
          'as_scalar',
          lambda df: AsScalar(df),
          [other._expr],
          requires_partition_by=partitionings.Singleton())
  else:
    # Plain (non-deferred) array-like: its width fixes the proxy's columns.
    proxy = pd.DataFrame(columns=range(len(other[0])))
    side = expressions.ConstantExpression(AsScalar(other))

  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'dot',
          lambda left, right: left @ right.value,
          [self._expr, side],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Index(),
          proxy=proxy))

__matmul__ = dot

head = tail = frame_base.wont_implement_method('order-sensitive')

max = frame_base._agg_method('max')
min = frame_base._agg_method('min')

def mode(self, axis=0, *args, **kwargs):
  """Deferred DataFrame.mode; only axis=0 (per-column) is supported."""
  if axis == 1 or axis == 'columns':
    # Number of columns is max(number mode values for each row), so we can't
    # determine how many there will be before looking at the data.
    raise frame_base.WontImplementError('non-deferred column values')
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'mode',
          lambda df: df.mode(*args, **kwargs),
          [self._expr],
          #TODO(robertwb): Approximate?
          requires_partition_by=partitionings.Singleton(),
          preserves_partition_by=partitionings.Singleton()))

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def dropna(self, axis, **kwargs):
  """Deferred DataFrame.dropna.

  Dropping columns (axis=1) needs the whole frame in one partition; dropping
  rows is embarrassingly parallel.
  """
  # TODO(robertwb): This is a common pattern. Generalize?
  if axis == 1 or axis == 'columns':
    requires_partition_by = partitionings.Singleton()
  else:
    requires_partition_by = partitionings.Nothing()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'dropna',
          lambda df: df.dropna(axis=axis, **kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def fillna(self, value, method, axis, **kwargs):
  """Deferred DataFrame.fillna.

  ``method``-based filling along the index is order-sensitive and rejected.
  """
  if method is not None and axis in (0, 'index'):
    raise frame_base.WontImplementError('order-sensitive')
  if isinstance(value, frame_base.DeferredBase):
    value_expr = value._expr
  else:
    value_expr = expressions.ConstantExpression(value)
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'fillna',
          lambda df, value: df.fillna(
              value, method=method, axis=axis, **kwargs),
          [self._expr, value_expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=partitionings.Nothing()))

isna = frame_base._elementwise_method('isna')
notnull = notna = frame_base._elementwise_method('notna')

items = itertuples = iterrows = iteritems = frame_base.wont_implement_method(
    'non-lazy')

def _cols_as_temporary_index(self, cols, suffix=''):
  """Returns (reindex, revert) helpers that temporarily index by `cols`.

  `reindex(df)` moves the current index levels to uniquely-named columns and
  sets `cols` as the index; `revert(df)` restores the original index.
  """
  original_index_names = list(self._expr.proxy().index.names)
  new_index_names = [
      '__apache_beam_temp_%d_%s' % (ix, suffix)
      for (ix, _) in enumerate(original_index_names)]
  def reindex(df):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'reindex',
            lambda df:
                df.rename_axis(index=new_index_names, copy=False)
                .reset_index().set_index(cols),
            [df._expr],
            preserves_partition_by=partitionings.Nothing(),
            requires_partition_by=partitionings.Nothing()))
  def revert(df):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'join_restoreindex',
            lambda df:
                df.reset_index().set_index(new_index_names)
                .rename_axis(index=original_index_names, copy=False),
            [df._expr],
            preserves_partition_by=partitionings.Nothing(),
            requires_partition_by=partitionings.Nothing()))
  return reindex, revert

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def join(self, other, on, **kwargs):
  """Deferred DataFrame.join, implemented as an index-partitioned join."""
  if on is not None:
    # Join on columns by temporarily making them the index.
    reindex, revert = self._cols_as_temporary_index(on)
    return revert(reindex(self).join(other, **kwargs))
  if isinstance(other, list):
    other_is_list = True
  else:
    other = [other]
    other_is_list = False
  # Deferred operands are fed through the expression graph; constant
  # operands are captured directly and re-spliced via `placeholder`.
  placeholder = object()
  other_exprs = [
      df._expr for df in other if isinstance(df, frame_base.DeferredFrame)]
  const_others = [
      placeholder if isinstance(df, frame_base.DeferredFrame) else df
      for df in other]
  def fill_placeholders(values):
    values = iter(values)
    filled = [
        next(values) if df is placeholder else df for df in const_others]
    if other_is_list:
      return filled
    else:
      return filled[0]
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'join',
          lambda df, *deferred_others: df.join(
              fill_placeholders(deferred_others), **kwargs),
          [self._expr] + other_exprs,
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=partitionings.Index()))

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def merge(
    self,
    right,
    on,
    left_on,
    right_on,
    left_index,
    right_index,
    **kwargs):
  """Deferred DataFrame.merge, reduced to an index-aligned merge."""
  self_proxy = self._expr.proxy()
  right_proxy = right._expr.proxy()
  # Validate with a pandas call.
  _ = self_proxy.merge(
      right_proxy,
      on=on,
      left_on=left_on,
      right_on=right_on,
      left_index=left_index,
      right_index=right_index,
      **kwargs)
  if not any([on, left_on, right_on, left_index, right_index]):
    # BUG FIX: `columns` is a property, not a method; calling it raised
    # TypeError whenever merge() was invoked with no explicit keys.
    on = [col for col in self_proxy.columns if col in right_proxy.columns]
  if not left_on:
    left_on = on
  elif not isinstance(left_on, list):
    left_on = [left_on]
  if not right_on:
    right_on = on
  elif not isinstance(right_on, list):
    right_on = [right_on]
  if left_index:
    indexed_left = self
  else:
    indexed_left = self.set_index(left_on, drop=False)
  if right_index:
    indexed_right = right
  else:
    indexed_right = right.set_index(right_on, drop=False)
  merged = frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'merge',
          lambda left, right: left.merge(
              right, left_index=True, right_index=True, **kwargs),
          [indexed_left._expr, indexed_right._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=partitionings.Index()))
  if left_index or right_index:
    return merged
  else:
    return merged.reset_index(drop=True)

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def nlargest(self, keep, **kwargs):
  """Deferred DataFrame.nlargest.

  Computed per partition, then reduced; keep='any' (non-standard) relaxes
  tie-breaking so the result need not be order-sensitive.
  """
  if keep == 'any':
    keep = 'first'
  elif keep != 'all':
    raise frame_base.WontImplementError('order-sensitive')
  kwargs['keep'] = keep
  per_partition = expressions.ComputedExpression(
      'nlargest-per-partition',
      lambda df: df.nlargest(**kwargs),
      [self._expr],
      preserves_partition_by=partitionings.Singleton(),
      requires_partition_by=partitionings.Nothing())
  with expressions.allow_non_parallel_operations(True):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'nlargest',
            lambda df: df.nlargest(**kwargs),
            [per_partition],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def nsmallest(self, keep, **kwargs):
  """Deferred DataFrame.nsmallest; mirror image of nlargest."""
  if keep == 'any':
    keep = 'first'
  elif keep != 'all':
    raise frame_base.WontImplementError('order-sensitive')
  kwargs['keep'] = keep
  per_partition = expressions.ComputedExpression(
      'nsmallest-per-partition',
      lambda df: df.nsmallest(**kwargs),
      [self._expr],
      preserves_partition_by=partitionings.Singleton(),
      requires_partition_by=partitionings.Nothing())
  with expressions.allow_non_parallel_operations(True):
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'nsmallest',
            lambda df: df.nsmallest(**kwargs),
            [per_partition],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))

@frame_base.args_to_kwargs(pd.DataFrame)
def nunique(self, **kwargs):
  """Deferred DataFrame.nunique; per-row (axis=1) is parallel, per-column
  needs everything colocated."""
  if kwargs.get('axis', None) in (1, 'columns'):
    requires_partition_by = partitionings.Nothing()
  else:
    requires_partition_by = partitionings.Singleton()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'nunique',
          lambda df: df.nunique(**kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

prod = product = frame_base._agg_method('prod')

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def quantile(self, axis, **kwargs):
  """Deferred DataFrame.quantile; only per-column (axis=0) is supported."""
  if axis == 1 or axis == 'columns':
    raise frame_base.WontImplementError('non-deferred column values')
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'quantile',
          lambda df: df.quantile(axis=axis, **kwargs),
          [self._expr],
          #TODO(robertwb): Approximate quantiles?
          requires_partition_by=partitionings.Singleton(),
          preserves_partition_by=partitionings.Singleton()))

query = frame_base._elementwise_method('query')

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.maybe_inplace
def rename(self, **kwargs):
  """Deferred DataFrame.rename.

  errors='raise' with an index rename forces a global check (Singleton).
  """
  rename_index = (
      'index' in kwargs
      or kwargs.get('axis', None) in (0, 'index')
      or ('columns' not in kwargs and 'axis' not in kwargs))
  if rename_index:
    # Technically, it's still partitioned by index, but it's no longer
    # partitioned by the hash of the index.
    preserves_partition_by = partitionings.Nothing()
  else:
    preserves_partition_by = partitionings.Singleton()
  if kwargs.get('errors', None) == 'raise' and rename_index:
    # Renaming index with checking.
    requires_partition_by = partitionings.Singleton()
    proxy = self._expr.proxy()
  else:
    requires_partition_by = partitionings.Nothing()
    proxy = None
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'rename',
          lambda df: df.rename(**kwargs),
          [self._expr],
          proxy=proxy,
          preserves_partition_by=preserves_partition_by,
          requires_partition_by=requires_partition_by))

rename_axis = frame_base._elementwise_method('rename_axis')

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def replace(self, limit, **kwargs):
  """Deferred DataFrame.replace; a `limit` makes it order-sensitive, hence
  Singleton partitioning."""
  if limit is None:
    requires_partition_by = partitionings.Nothing()
  else:
    requires_partition_by = partitionings.Singleton()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'replace',
          lambda df: df.replace(limit=limit, **kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def reset_index(self, level=None, **kwargs):
  """Deferred DataFrame.reset_index.

  Resetting all levels creates a new positional index, which cannot be done
  in parallel (yet).
  """
  if level is not None and not isinstance(level, (tuple, list)):
    level = [level]
  # BUG FIX: `.levels` only exists on a MultiIndex; `nlevels` is defined for
  # every index type (and equals len(levels) for a MultiIndex).
  if level is None or len(level) == self._expr.proxy().index.nlevels:
    # TODO: Could do distributed re-index with offsets.
    requires_partition_by = partitionings.Singleton()
  else:
    requires_partition_by = partitionings.Nothing()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'reset_index',
          lambda df: df.reset_index(level=level, **kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

round = frame_base._elementwise_method('round')
select_dtypes = frame_base._elementwise_method('select_dtypes')

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
def shift(self, axis, **kwargs):
  """Deferred DataFrame.shift; shifting rows is order-sensitive in the
  distributed setting, hence Singleton."""
  if 'freq' in kwargs:
    raise frame_base.WontImplementError('data-dependent')
  if axis == 1 or axis == 'columns':
    requires_partition_by = partitionings.Nothing()
  else:
    requires_partition_by = partitionings.Singleton()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'shift',
          lambda df: df.shift(axis=axis, **kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

@property
def shape(self):
  # The number of rows is not known until execution.
  raise frame_base.WontImplementError('scalar value')

@frame_base.args_to_kwargs(pd.DataFrame)
@frame_base.populate_defaults(pd.DataFrame)
@frame_base.maybe_inplace
def sort_values(self, axis, **kwargs):
  """Deferred DataFrame.sort_values; sorting rows requires a global view."""
  if axis == 1 or axis == 'columns':
    requires_partition_by = partitionings.Nothing()
  else:
    requires_partition_by = partitionings.Singleton()
  return frame_base.DeferredFrame.wrap(
      expressions.ComputedExpression(
          'sort_values',
          lambda df: df.sort_values(axis=axis, **kwargs),
          [self._expr],
          preserves_partition_by=partitionings.Singleton(),
          requires_partition_by=requires_partition_by))

stack = frame_base._elementwise_method('stack')

sum = frame_base._agg_method('sum')

take = frame_base.wont_implement_method('deprecated')

to_records = to_dict = to_numpy = to_string = (
    frame_base.wont_implement_method('non-deferred value'))

to_sparse = to_string # frame_base._elementwise_method('to_sparse')

transform = frame_base._elementwise_method(
    'transform', restrictions={'axis': 0})

transpose = frame_base.wont_implement_method('non-deferred column values')

def unstack(self, *args, **kwargs):
  """Deferred DataFrame.unstack; only a single-level index is supported,
  since unstacking more levels would create data-dependent columns."""
  if self._expr.proxy().index.nlevels == 1:
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'unstack',
            lambda df: df.unstack(*args, **kwargs),
            [self._expr],
            requires_partition_by=partitionings.Index()))
  else:
    raise frame_base.WontImplementError('non-deferred column values')

update = frame_base._proxy_method(
    'update',
    inplace=True,
    requires_partition_by=partitionings.Index(),
    preserves_partition_by=partitionings.Index())
class DeferredSeries(frame_base.DeferredFrame):
  """Deferred (lazy, partitioned) counterpart of ``pd.Series``.

  Each operation builds an expression node instead of computing a value;
  partitioning annotations describe what data colocation each step needs.
  """

  def __array__(self, dtype=None):
    # Materializing to numpy would force execution of the whole pipeline.
    raise frame_base.WontImplementError(
        'Conversion to a non-deferred a numpy array.')

  astype = frame_base._elementwise_method('astype')
  between = frame_base._elementwise_method('between')

  def dot(self, other):
    """Deferred dot product with another DeferredSeries or DeferredDataFrame.

    Partial products are computed per index-partition, then summed globally.
    """
    left = self._expr
    if isinstance(other, DeferredSeries):
      # Lift the right-hand Series to a one-column DataFrame so both branches
      # below can share the same matrix-product code path.
      right = expressions.ComputedExpression(
          'to_dataframe',
          pd.DataFrame, [other._expr],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Index())
      right_is_series = True
    elif isinstance(other, DeferredDataFrame):
      right = other._expr
      right_is_series = False
    else:
      raise frame_base.WontImplementError('non-deferred result')
    dots = expressions.ComputedExpression(
        'dot',
        # Transpose so we can sum across rows.
        (lambda left, right: pd.DataFrame(left @ right).T),
        [left, right],
        requires_partition_by=partitionings.Index())
    with expressions.allow_non_parallel_operations(True):
      # Global reduction: combine the per-partition partial products.
      sums = expressions.ComputedExpression(
          'sum',
          lambda dots: dots.sum(),
          [dots],
          requires_partition_by=partitionings.Singleton())
      if right_is_series:
        # Series @ Series yields a scalar; extract it from the 1-element frame.
        result = expressions.ComputedExpression(
            'extract',
            lambda df: df[0],
            [sums],
            requires_partition_by=partitionings.Singleton())
      else:
        result = sums
    return frame_base.DeferredFrame.wrap(result)

  __matmul__ = dot

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def dropna(self, **kwargs):
    """Deferred Series.dropna; embarrassingly parallel per partition."""
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dropna',
            lambda df: df.dropna(**kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  items = iteritems = frame_base.wont_implement_method('non-lazy')

  isin = frame_base._elementwise_method('isin')
  isna = frame_base._elementwise_method('isna')
  notnull = notna = frame_base._elementwise_method('notna')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def fillna(self, value, method):
    """Deferred Series.fillna.

    ``method``-based filling (ffill/bfill) depends on row order, which is
    not meaningful here, so it is rejected outright.
    """
    if method is not None:
      raise frame_base.WontImplementError('order-sensitive')
    if isinstance(value, frame_base.DeferredBase):
      value_expr = value._expr
    else:
      value_expr = expressions.ConstantExpression(value)
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'fillna',
            lambda df, value: df.fillna(value, method=method),
            [self._expr, value_expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Nothing()))

  reindex = frame_base.not_implemented_method('reindex')

  to_numpy = to_string = frame_base.wont_implement_method('non-deferred value')

  transform = frame_base._elementwise_method(
      'transform', restrictions={'axis': 0})

  def aggregate(self, func, axis=0, *args, **kwargs):
    """Deferred Series.aggregate.

    Associative single aggregators are pre-aggregated per partition and then
    combined; everything else is computed on colocated data.
    """
    if isinstance(func, list) and len(func) > 1:
      # Aggregate each column separately, then stick them all together.
      rows = [self.agg([f], *args, **kwargs) for f in func]
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'join_aggregate',
              lambda *rows: pd.concat(rows),
              [row._expr for row in rows]))
    else:
      # We're only handling a single column.
      base_func = func[0] if isinstance(func, list) else func
      if _is_associative(base_func) and not args and not kwargs:
        # Pre-aggregate per partition; wrapping in a list keeps the result a
        # Series so the final agg sees the same shape either way.
        intermediate = expressions.elementwise_expression(
            'pre_aggregate',
            lambda s: s.agg([base_func], *args, **kwargs),
            [self._expr])
        allow_nonparallel_final = True
      else:
        intermediate = self._expr
        allow_nonparallel_final = None  # i.e. don't change the value
      with expressions.allow_non_parallel_operations(allow_nonparallel_final):
        return frame_base.DeferredFrame.wrap(
            expressions.ComputedExpression(
                'aggregate',
                lambda s: s.agg(func, *args, **kwargs),
                [intermediate],
                preserves_partition_by=partitionings.Singleton(),
                requires_partition_by=partitionings.Singleton()))

  agg = aggregate

  all = frame_base._agg_method('all')
  any = frame_base._agg_method('any')
  min = frame_base._agg_method('min')
  max = frame_base._agg_method('max')
  prod = product = frame_base._agg_method('prod')
  sum = frame_base._agg_method('sum')

  cummax = cummin = cumsum = cumprod = frame_base.wont_implement_method(
      'order-sensitive')
  diff = frame_base.wont_implement_method('order-sensitive')

  head = tail = frame_base.wont_implement_method('order-sensitive')

  memory_usage = frame_base.wont_implement_method('non-deferred value')

  # In Series __contains__ checks the index
  __contains__ = frame_base.wont_implement_method('non-deferred value')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  def nlargest(self, keep, **kwargs):
    """Deferred Series.nlargest: top-k per partition, then a global top-k."""
    # TODO(robertwb): Document 'any' option.
    # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no
    # explicit keep parameter is requested.
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    per_partition = expressions.ComputedExpression(
        'nlargest-per-partition',
        lambda df: df.nlargest(**kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'nlargest',
              lambda df: df.nlargest(**kwargs),
              [per_partition],
              preserves_partition_by=partitionings.Singleton(),
              requires_partition_by=partitionings.Singleton()))

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  def nsmallest(self, keep, **kwargs):
    """Deferred Series.nsmallest; mirror image of nlargest."""
    if keep == 'any':
      keep = 'first'
    elif keep != 'all':
      raise frame_base.WontImplementError('order-sensitive')
    kwargs['keep'] = keep
    per_partition = expressions.ComputedExpression(
        'nsmallest-per-partition',
        lambda df: df.nsmallest(**kwargs),
        [self._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Nothing())
    with expressions.allow_non_parallel_operations(True):
      return frame_base.DeferredFrame.wrap(
          expressions.ComputedExpression(
              'nsmallest',
              lambda df: df.nsmallest(**kwargs),
              [per_partition],
              preserves_partition_by=partitionings.Singleton(),
              requires_partition_by=partitionings.Singleton()))

  rename_axis = frame_base._elementwise_method('rename_axis')

  @frame_base.args_to_kwargs(pd.Series)
  @frame_base.populate_defaults(pd.Series)
  @frame_base.maybe_inplace
  def replace(self, limit, **kwargs):
    """Deferred Series.replace; a `limit` makes the operation order-sensitive,
    so everything must be colocated."""
    if limit is None:
      requires_partition_by = partitionings.Nothing()
    else:
      requires_partition_by = partitionings.Singleton()
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'replace',
            lambda df: df.replace(limit=limit, **kwargs),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=requires_partition_by))

  round = frame_base._elementwise_method('round')

  searchsorted = frame_base.wont_implement_method('order-sensitive')

  shift = frame_base.wont_implement_method('order-sensitive')

  take = frame_base.wont_implement_method('deprecated')

  to_dict = frame_base.wont_implement_method('non-deferred')

  to_frame = frame_base._elementwise_method('to_frame')

  def unique(self, as_series=False):
    """Deferred Series.unique.

    pandas returns an ndarray here, which would be non-deferred; callers must
    opt in to a Series-shaped result instead.
    """
    if not as_series:
      raise frame_base.WontImplementError(
          'pass as_series=True to get the result as a (deferred) Series')
    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'unique',
            lambda df: pd.Series(df.unique()),
            [self._expr],
            preserves_partition_by=partitionings.Singleton(),
            requires_partition_by=partitionings.Singleton()))

  def update(self, other):
    """Deferred, in-place Series.update (rebinds this frame's expression)."""
    self._expr = expressions.ComputedExpression(
        'update',
        # Series.update mutates in place and returns None; `or df` yields the
        # updated frame as the expression's value.
        lambda df, other: df.update(other) or df,
        [self._expr, other._expr],
        preserves_partition_by=partitionings.Singleton(),
        requires_partition_by=partitionings.Index())

  unstack = frame_base.wont_implement_method('non-deferred column values')

  values = property(frame_base.wont_implement_method('non-deferred'))

  view = frame_base.wont_implement_method('memory sharing semantics')

  @property
  def str(self):
    """Deferred accessor for pandas string methods (``s.str``)."""
    expr = expressions.ComputedExpression(
        'str',
        lambda df: df.str,
        [self._expr],
        requires_partition_by=partitionings.Nothing(),
        preserves_partition_by=partitionings.Singleton())
    return _DeferredStringMethods(expr)
class DeferredPandasModule(object):
  """Deferred stand-in for the top-level ``pandas`` module namespace.

  Module-level pandas functions are either deferred, delegated to the first
  (deferred) argument, or explicitly rejected.
  """

  array = _defer_to_pandas('array')
  bdate_range = _defer_to_pandas('bdate_range')

  @staticmethod
  @frame_base.args_to_kwargs(pd)
  @frame_base.populate_defaults(pd)
  def concat(
      objs,
      axis,
      join,
      ignore_index,
      keys,
      levels,
      names,
      verify_integrity,
      sort,
      copy):
    """Deferred ``pd.concat``.

    ignore_index and explicit levels are not supported. ``copy`` is accepted
    for signature compatibility but deliberately not forwarded: it only
    affects in-memory sharing, never the result.
    """
    if ignore_index:
      raise NotImplementedError('concat(ignore_index)')
    if levels:
      raise NotImplementedError('concat(levels)')

    if isinstance(objs, Mapping):
      if keys is None:
        keys = list(objs.keys())
      objs = [objs[k] for k in keys]
    else:
      objs = list(objs)

    if keys is None:
      preserves_partitioning = partitionings.Arbitrary()
    else:
      # Index 0 will be a new index for keys, only partitioning by the
      # original indexes (1 to N) will be preserved.
      nlevels = min(o._expr.proxy().index.nlevels for o in objs)
      preserves_partitioning = partitionings.Index(
          [i for i in range(1, nlevels + 1)])

    # None entries are legal in pd.concat; represent them as constants.
    deferred_none = expressions.ConstantExpression(None)
    exprs = [deferred_none if o is None else o._expr for o in objs]

    if axis in (1, 'columns'):
      required_partitioning = partitionings.Index()
    elif verify_integrity:
      # Duplicate detection needs all rows for an index value colocated.
      required_partitioning = partitionings.Index()
    else:
      required_partitioning = partitionings.Arbitrary()

    return frame_base.DeferredBase.wrap(
        expressions.ComputedExpression(
            'concat',
            lambda *objs: pd.concat(
                objs,
                axis=axis,
                join=join,
                ignore_index=ignore_index,
                keys=keys,
                levels=levels,
                names=names,
                verify_integrity=verify_integrity,
                # BUG FIX: `sort` was accepted but silently dropped even
                # though it changes the output column ordering.
                sort=sort),  # yapf break
            exprs,
            requires_partition_by=required_partitioning,
            preserves_partition_by=preserves_partitioning))

  date_range = _defer_to_pandas('date_range')
  describe_option = _defer_to_pandas('describe_option')
  factorize = _call_on_first_arg('factorize')
  get_option = _defer_to_pandas('get_option')
  interval_range = _defer_to_pandas('interval_range')
  isna = _call_on_first_arg('isna')
  isnull = _call_on_first_arg('isnull')
  json_normalize = _defer_to_pandas('json_normalize')
  # NOTE(review): `melt` and `notna` were each assigned twice with identical
  # values; the redundant assignments have been removed.
  melt = _call_on_first_arg('melt')
  merge = _call_on_first_arg('merge')
  merge_ordered = frame_base.wont_implement_method(
      pd, 'merge_ordered', reason='order-sensitive')
  notna = _call_on_first_arg('notna')
  notnull = _call_on_first_arg('notnull')
  option_context = _defer_to_pandas('option_context')
  period_range = _defer_to_pandas('period_range')
  pivot = _call_on_first_arg('pivot')
  pivot_table = _call_on_first_arg('pivot_table')
  show_versions = _defer_to_pandas('show_versions')
  test = frame_base.wont_implement_method(
      pd, 'test', explanation="because it is an internal pandas testing utility.")
  timedelta_range = _defer_to_pandas('timedelta_range')
  to_pickle = frame_base.wont_implement_method(
      pd, 'to_pickle', reason='order-sensitive')
  to_datetime = _defer_to_pandas_maybe_elementwise('to_datetime')

  def __getattr__(self, name):
    """Fallback attribute access: reject readers, defer known functions,
    pass everything else straight through to pandas."""
    if name.startswith('read_'):
      # I/O must go through Beam's own sources.
      def func(*args, **kwargs):
        raise frame_base.WontImplementError(
            'Use p | apache_beam.dataframe.io.%s' % name)
      return func
    res = getattr(pd, name)
    if _is_top_level_function(res):
      return frame_base.not_implemented_method(name, base_type=pd)
    else:
      # Non-function attributes (dtypes, classes, constants) are safe as-is.
      return res