def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = expr._safe_name
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name
        assert isinstance(name, str)

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        suffix = util.get_join_suffix_for_op(op, parent_table_op)
        return data.loc[:, name + suffix].rename(result_name or name)

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )

    result = coerce_to_output(
        execute(expr, scope=scope, timecontext=timecontext, **kwargs),
        expr,
        data.index,
    )
    return result
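

# A minimal standalone sketch (not part of ibis) of the ops.TableColumn
# fast path above, assuming plain pandas objects: projecting a single
# column reduces to selecting the backing Series and renaming it. The
# helper name `_select_column_sketch` is hypothetical.
def _select_column_sketch(data, name, result_name=None):
    """Select ``name`` from ``data``, renaming it like the fast path above.

    >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    >>> _select_column_sketch(df, 'a', result_name='a_out')
    0    1
    1    2
    Name: a_out, dtype: int64
    """
    if name in data:
        return data[name].rename(result_name or name)
    # In the real function a missing column is only legal under a join,
    # where a suffix disambiguates overlapping column names.
    raise KeyError(name)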


def coerce_to_output(
    result: Any, expr: ir.Expr, index: Optional[pd.Index] = None
) -> Union[dd.Series, dd.DataFrame]:
    """Cast the result to either a Series or a DataFrame, renaming as needed.

    Reimplementation of `coerce_to_output` in the pandas backend, but
    creates dask objects and adds special handling for dd.Scalars.

    Parameters
    ----------
    result: Any
        The result to cast
    expr: ibis.expr.types.Expr
        The expression associated with the result
    index: pd.Index
        Optional. If passed, scalar results will be broadcast according
        to the index.

    Returns
    -------
    result: A `dd.Series` or `dd.DataFrame`

    Raises
    ------
    ValueError
        If unable to coerce result

    Examples
    --------
    For dataframe outputs, see ``_coerce_to_dataframe``. Examples below
    use pandas objects for legibility, but functionality is the same
    on dask objects.

    >>> coerce_to_output(pd.Series(1), expr)
    0    1
    Name: result, dtype: int64
    >>> coerce_to_output(1, expr)
    0    1
    Name: result, dtype: int64
    >>> coerce_to_output(1, expr, [1, 2, 3])
    1    1
    2    1
    3    1
    Name: result, dtype: int64
    >>> coerce_to_output([1, 2, 3], expr)
    0    [1, 2, 3]
    Name: result, dtype: object
    """
    result_name = expr.get_name()
    dataframe_exprs = (
        ir.DestructColumn,
        ir.StructColumn,
        ir.DestructScalar,
        ir.StructScalar,
    )
    if isinstance(expr, dataframe_exprs):
        return _coerce_to_dataframe(
            result, expr.type().names, expr.type().types
        )
    elif isinstance(result, (pd.Series, dd.Series)):
        # Series from https://github.com/ibis-project/ibis/issues/2711
        return result.rename(result_name)
    elif isinstance(expr.op(), ops.Reduction):
        if isinstance(result, dd.core.Scalar):
            # wrap the scalar in a series
            out_dtype = _pandas_dtype_from_dd_scalar(result)
            out_len = 1 if index is None else len(index)
            meta = make_meta_series(dtype=out_dtype, name=result_name)
            # Specify `divisions` so that the created Dask object has
            # known divisions (to be concatenatable with Dask objects
            # created using `dd.from_pandas`)
            series = dd.from_delayed(
                _wrap_dd_scalar(result, result_name, out_len),
                meta=meta,
                divisions=(0, out_len - 1),
            )
            return series
        else:
            return dd.from_pandas(
                pd_util.coerce_to_output(result, expr, index),
                npartitions=1,
            )
    else:
        raise ValueError(f"Cannot coerce_to_output. Result: {result}")
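

# A hedged sketch (hypothetical helpers, not part of ibis) of the scalar
# branch above: a reduction over a dask collection yields a lazy
# ``dd.core.Scalar``, which must be lifted back into a one-row dask Series
# before it can be concatenated with other columns. ``dask.delayed``
# resolves the scalar when the graph runs, mirroring ``_wrap_dd_scalar``.
import dask  # assumed available alongside dask.dataframe


@dask.delayed
def _scalar_to_series_sketch(value, name):
    # By the time this runs, ``value`` is a concrete Python scalar.
    return pd.Series([value], name=name)


def _lift_scalar_sketch(scalar, name, dtype):
    """Wrap a ``dd.core.Scalar`` in a one-row ``dd.Series``.

    >>> ddf = dd.from_pandas(pd.DataFrame({'x': [1, 2, 3]}), npartitions=2)
    >>> _lift_scalar_sketch(ddf['x'].sum(), 'x_sum', 'int64').compute()
    0    6
    Name: x_sum, dtype: int64
    """
    meta = pd.Series([], dtype=dtype, name=name)
    # ``divisions=(0, 0)`` declares known divisions for the single row,
    # keeping the result concatenatable with ``dd.from_pandas`` objects.
    return dd.from_delayed(
        _scalar_to_series_sketch(scalar, name), meta=meta, divisions=(0, 0)
    )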


def execute_aggregation_dataframe(
    op,
    data,
    metrics,
    by,
    having,
    predicates,
    sort_keys,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    assert metrics, 'no metrics found during aggregation execution'

    if sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (
                execute(p, scope=scope, timecontext=timecontext, **kwargs)
                for p in predicates
            ),
        )
        data = data.loc[predicate]

    columns: Dict[str, str] = {}

    if op.by:
        grouping_key_pairs = list(
            zip(by, map(operator.methodcaller('op'), by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(
                by, scope=scope, timecontext=timecontext, **kwargs
            ).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    scope = scope.merge_scope(Scope({op.table.op(): source}, timecontext))

    pieces = [
        coerce_to_output(
            execute(metric, scope=scope, timecontext=timecontext, **kwargs),
            metric,
        )
        for metric in metrics
    ]

    result = pd.concat(pieces, axis=1)

    # If grouping, need a reset to get the grouping key back as a column
    if by:
        result = result.reset_index()
        result.columns = [columns.get(c, c) for c in result.columns]

    if having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(h, scope=scope, timecontext=timecontext, **kwargs)
                for h in having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]

    return result
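

# A pandas-only sketch of the aggregation shape above, assuming one
# grouping key and two metrics. Each metric's groupby result is a Series
# keyed by the grouping values; concatenating column-wise and resetting
# the index is how the function recovers the grouping key as an ordinary
# column. The helper name `_aggregate_sketch` is hypothetical.
def _aggregate_sketch(data):
    """Group, aggregate, and restore the grouping key as a column.

    >>> df = pd.DataFrame({'key': ['a', 'a', 'b'], 'v': [1, 2, 3]})
    >>> _aggregate_sketch(df)
      key  v_sum  v_mean
    0   a      3     1.5
    1   b      3     3.0
    """
    grouped = data.groupby('key')
    pieces = [
        grouped['v'].sum().rename('v_sum'),
        grouped['v'].mean().rename('v_mean'),
    ]
    # axis=1 concat aligns the metric Series on the grouping index;
    # reset_index turns that index back into a 'key' column.
    return pd.concat(pieces, axis=1).reset_index()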