def execute_without_scope(expr, params=None, scope=None, context=None, **kwargs):
    """Execute an expression against data that are bound to it. If no data
    are bound, raise an Exception.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute
    params : Dict[Expr, object]
        Mapping of parameter expressions to concrete values.  Keys that have
        an ``op`` attribute are reduced to their underlying node below, so
        both expressions and raw nodes are accepted as keys.
    scope : Optional[Mapping]
        Pre-existing node-to-data bindings to merge with the discovered data.
    context : Optional[AggregationContext]
        Aggregation context; defaults to ``agg_ctx.Summarize()``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame

    Raises
    ------
    ValueError
        * If no data are bound to the input expression
    """
    # Discover the data sources bound to the expression's leaves.
    data_scope = find_data(expr)
    if not data_scope:
        raise ValueError(
            'No data sources found while trying to execute against the pandas '
            'backend')

    # Use the concrete mapping type of the discovered scope for every mapping
    # we construct, so merged scopes keep the same container semantics.
    factory = type(data_scope)

    if scope is None:
        scope = factory()

    if params is None:
        params = factory()

    # Normalize param keys to operation nodes (expressions expose .op()).
    params = {k.op() if hasattr(k, 'op') else k: v for k, v in params.items()}

    new_scope = toolz.merge(scope, data_scope, params, factory=factory)
    # Give data_preload a chance to transform each bound value.  The keys are
    # unchanged while iterating — only values are replaced — so updating the
    # dict from a generator over its own items does not resize it.
    new_scope.update(
        (node, data_preload(node, data, scope=new_scope))
        for node, data in new_scope.items())

    # By default, our aggregate functions are N -> 1
    return execute(
        expr,
        new_scope,
        context=context if context is not None else agg_ctx.Summarize(),
        **kwargs)
def execute_with_scope(expr, scope, aggcontext=None, clients=None, **kwargs):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute.
    scope : collections.Mapping
        A dictionary mapping :class:`~ibis.expr.operations.Node` subclass
        instances to concrete data such as a pandas DataFrame.
    aggcontext : Optional[ibis.pandas.aggcontext.AggregationContext]
        Aggregation context; defaults to ``agg_ctx.Summarize()``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    node = expr.op()

    if clients is None:
        clients = list(find_backends(expr))
    if aggcontext is None:
        aggcontext = agg_ctx.Summarize()

    # Call pre_execute so that clients can intercept the expression before
    # anything is computed *and* before leaf nodes are associated with data,
    # letting each client supply its own data for each leaf.
    backend_scope = pre_execute(
        node, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    merged_scope = toolz.merge(scope, backend_scope)

    # XXX: the partial deliberately closes over the *original* scope, not the
    # merged one, so post_execute observes the scope of execute_with_scope
    # rather than the scope of execute_until_in_scope.
    finalizer = functools.partial(
        post_execute,
        scope=scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )

    return execute_until_in_scope(
        expr,
        merged_scope,
        aggcontext=aggcontext,
        clients=clients,
        post_execute_=finalizer,
        **kwargs,
    )
def get_aggcontext_window(
    window, *, operand, operand_dtype, parent, group_by, order_by
) -> AggregationContext:
    """Pick the aggregation context for a window operation.

    No order-by and no group-by keys means the default summarization
    context.  An ordered reduction (including cumulative/any/all ops)
    becomes either a moving window (bounded ``preceding``) or an expanding
    one.  Anything else is a group-by transform — a window with a
    partition-by clause in SQL parlance.
    """
    operation = operand.op()
    # Keyword arguments shared by every non-summarize context below.
    common = dict(
        parent=parent,
        group_by=group_by,
        order_by=order_by,
        dtype=operand_dtype,
    )

    # no order by or group by: default summarization aggcontext
    if not group_by and not order_by:
        return agg_ctx.Summarize()

    reduction_like = (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All)
    if order_by and isinstance(operation, reduction_like):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is None:
            # expanding window
            return agg_ctx.Cumulative(**common)
        max_lookback = window.max_lookback
        assert not isinstance(operation, ops.CumulativeOp)
        return agg_ctx.Moving(preceding, max_lookback, **common)

    # groupby transform (window with a partition by clause in SQL parlance)
    return agg_ctx.Transform(**common)
def execute_window_op(op, data, window, scope=None, aggcontext=None, clients=None, **kwargs):
    """Execute a window operation.

    Executes the window's root table, groups/sorts it according to the
    window's partition and order keys, picks an appropriate aggregation
    context, evaluates the operand against the prepared source, and
    re-aligns the result with the input frame via a post-processing hook.
    """
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)

    # Materialize the (single) root table the window operates over.
    root, = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    # Only shift-like operations support a nonzero `following` together with
    # an order_by clause.
    if (order_by
            and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    # Column references can be grouped by name; anything else (e.g. computed
    # keys) must be executed to concrete values first.
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(key, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    # Select the data source and the matching post-processing hook for each
    # combination of grouping and ordering.
    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Bind the prepared source to every root table of the operand so the
    # recursive execute below sees the grouped/sorted data.
    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (isinstance(operand.op(),
                     (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All))
            and ordering_keys):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    # Undo any grouping/sorting so the result lines up with the input frame.
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_frame_window_op(op, data, scope=None, context=None, **kwargs):
    """Execute a window operation over a DataFrame.

    Groups and/or sorts ``data`` according to the window's partition and
    order keys, picks an aggregation context, executes the operand against
    the prepared source, and re-aligns the result with ``data`` through a
    post-processing hook.
    """
    operand, window = op.args

    following = window.following
    order_by = window._order_by

    # A nonzero `following` combined with order_by is unsupported here.
    if order_by and following != 0:
        raise ValueError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.')

    group_by = window._group_by
    # Column references are grouped by name; other key expressions are
    # executed to concrete values first.
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ir.TableColumn)
        else execute(key, context=context, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by

    # Select the data source and the matching post-processing hook for each
    # combination of grouping and ordering.  NOTE(review): compute_sorted_frame
    # is called here as (order_by, frame) — confirm against util's signature.
    if grouping_keys:
        source = data.groupby(grouping_keys, sort=False, as_index=not order_by)
        if order_by:
            # Sort within each group, then re-group the sorted frame.
            sorted_df = source.apply(
                lambda df, order_by=order_by, kwargs=kwargs: (
                    util.compute_sorted_frame(order_by, df, **kwargs)))
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    else:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Bind the prepared source to every root table of the operand so the
    # recursive execute below sees the grouped/sorted data.
    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not order_by:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        preceding = window.preceding
        if preceding is not None:
            context = agg_ctx.Trailing(preceding)
        else:
            context = agg_ctx.Cumulative()
    else:
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    # Re-align the computed column with the original frame's index.
    series = post_process(result, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series
def execute_with_scope(expr, scope, context=None, **kwargs):
    """Execute an expression `expr`, with data provided in `scope`.

    Recursively evaluates the expression tree: backends may pre-populate the
    scope, already-computed nodes are returned directly, ``execute_first``
    handles ops whose root-table data are all in scope, and otherwise the
    op's arguments are computed recursively before dispatching to
    ``execute_node``.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute.
    scope : dict
        A dictionary mapping :class:`~ibis.expr.types.Node` subclass instances
        to concrete data such as a pandas DataFrame.
    context : Optional[AggregationContext]
        Aggregation context; defaults to ``agg_ctx.Summarize()``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    op = expr.op()

    # Call pre_execute, to allow clients to intercept the expression before
    # computing anything *and* before associating leaf nodes with data. This
    # allows clients to provide their own scope.
    scope = toolz.merge(
        scope,
        *map(
            functools.partial(pre_execute, op, scope=scope, **kwargs),
            find_backends(expr)
        )
    )

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    if op in scope:
        return scope[op]

    if context is None:
        context = agg_ctx.Summarize()

    # EAFP: if any root table is missing from scope, a KeyError skips the
    # execute_first fast path entirely.
    try:
        computed_args = [scope[t] for t in op.root_tables()]
    except KeyError:
        pass
    else:
        try:
            # special case: we have a definition of execute_first that matches
            # our current operation and data leaves
            return execute_first(
                op, *computed_args, scope=scope, context=context, **kwargs
            )
        except NotImplementedError:
            # no specialized implementation; fall through to the general path
            pass

    args = op.args

    # recursively compute the op's arguments; only valid input types are
    # forwarded, and non-expression args are passed through untouched
    computed_args = [
        execute(arg, scope, context=context, **kwargs)
        if hasattr(arg, 'op')
        else arg
        for arg in args
        if isinstance(arg, _VALID_INPUT_TYPES)
    ]

    # Compute our op, with its computed arguments
    return execute_node(
        op, *computed_args, scope=scope, context=context, **kwargs
    )
def main_execute(expr: ir.Expr,
                 scope: Optional[Mapping] = None,
                 aggcontext: Optional[agg_ctx.AggregationContext] = None,
                 clients: Sequence[ibis.client.Client] = (),
                 params: Optional[Mapping] = None,
                 **kwargs: Any):
    """Execute an ibis expression against the pandas backend.

    Topologically sorts the expression graph, then evaluates each node in
    dependency order, threading an accumulating scope (node -> computed
    value) through ``pre_execute``/``execute_node``/``post_execute`` hooks.

    Parameters
    ----------
    expr
        The expression to execute.
    scope
        Optional pre-existing node-to-data bindings.
    aggcontext
        Aggregation context; defaults to ``agg_ctx.Summarize()``.
    clients
        Backends involved in execution; discovered from ``expr`` if empty.
    params
        Mapping of parameter expressions/nodes to concrete values.
    """
    toposorted, dependencies = toposort(expr)
    # Normalize param keys to operation nodes.
    params = toolz.keymap(get_node, params if params is not None else {})

    # Add to scope the objects that have no dependencies and are not ibis
    # nodes. We have to filter out nodes for cases--such as zero argument
    # UDFs--that do not have any dependencies yet still need to be evaluated.
    full_scope = toolz.merge(
        scope if scope is not None else {},
        {
            key: key
            for key, parents in dependencies.items()
            if not parents and not isinstance(key, ops.Node)
        },
        params,
    )

    if not clients:
        clients = list(find_backends(expr))

    if aggcontext is None:
        aggcontext = agg_ctx.Summarize()

    # give backends a chance to inject scope if needed
    execute_first_scope = execute_first(
        expr.op(), *clients, scope=full_scope, aggcontext=aggcontext, **kwargs)
    full_scope = toolz.merge(full_scope, execute_first_scope)

    nodes = [node for node in toposorted if node not in full_scope]

    # compute the nodes that are not currently in scope
    for node in nodes:
        # allow clients to pre compute nodes as they like
        pre_executed_scope = pre_execute(
            node, *clients, scope=full_scope, aggcontext=aggcontext, **kwargs)
        # merge the existing scope with whatever was returned from pre_execute
        execute_scope = toolz.merge(full_scope, pre_executed_scope)

        # if after pre_execute our node is in scope, then there's nothing to do
        # in this iteration
        if node in execute_scope:
            full_scope = execute_scope
        else:
            # If we're evaluating a literal then we can be a bit quicker about
            # evaluating the dispatch graph
            if isinstance(node, ops.Literal):
                executor = execute_literal
            else:
                executor = execute_node

            # Gather the inputs we've already computed that the current node
            # depends on
            execute_args = [
                full_scope[get_node(arg)] for arg in dependencies[node]
            ]

            # execute the node with its inputs
            execute_node_result = executor(
                node,
                *execute_args,
                aggcontext=aggcontext,
                scope=execute_scope,
                clients=clients,
                **kwargs,
            )

            # last chance to perform any additional computation on the result
            # before it gets added to scope for the next node
            full_scope[node] = post_execute(
                node,
                execute_node_result,
                clients=clients,
                aggcontext=aggcontext,
                scope=full_scope,
            )

    # the last node in the toposorted graph is the root and maps to the desired
    # result in scope
    last_node = toposorted[-1]
    result = full_scope[last_node]
    return result
def execute_with_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute.
    scope : Scope
        A Scope class, with dictionary mapping
        :class:`~ibis.expr.operations.Node` subclass instances to concrete
        data such as a pandas DataFrame.
    timecontext : Optional[TimeContext]
        A tuple of (begin, end) that is passed from parent Node to children
        see [timecontext.py](ibis/pandas/execution/timecontext.py) for
        detailed usage for this time context.
    aggcontext : Optional[ibis.pandas.aggcontext.AggregationContext]
        Aggregation context; defaults to ``agg_ctx.Summarize()``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    op = expr.op()

    # Call pre_execute, to allow clients to intercept the expression before
    # computing anything *and* before associating leaf nodes with data. This
    # allows clients to provide their own data for each leaf.
    if clients is None:
        clients = list(find_backends(expr))

    if aggcontext is None:
        aggcontext = agg_ctx.Summarize()

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    new_scope = scope.merge_scope(pre_executed_scope)
    # execute_until_in_scope returns a Scope; extract the value computed for
    # our op under the given time context.
    result = execute_until_in_scope(
        expr,
        new_scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        # XXX: we *explicitly* pass in scope and not new_scope here so that
        # post_execute sees the scope of execute_with_scope, not the scope of
        # execute_until_in_scope
        post_execute_=functools.partial(
            post_execute,
            scope=scope,
            timecontext=timecontext,
            aggcontext=aggcontext,
            clients=clients,
            **kwargs,
        ),
        **kwargs,
    ).get_value(op, timecontext)
    return result