def _compute_predicates(
    table_op,
    predicates,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Lazily compute each predicate of a table operation.

    This is a generator: one result is produced per element of
    ``predicates``, in order, by calling ``execute`` on that predicate.

    Parameters
    ----------
    table_op : TableNode
        Forwarded to ``remap_overlapping_column_names``.
    predicates : List[ir.ColumnExpr]
        The predicate expressions to execute.
    data : pd.DataFrame
        The frame every root table of a predicate is mapped to.
    scope : Scope
        Starting scope. It is re-bound (not mutated in place by this
        function) after each predicate, so scopes built for earlier
        predicates stay merged in for later ones.
    timecontext : Optional[TimeContext]
        Time context attached to every scope entry created here.
    kwargs : dict
        Forwarded unchanged to ``execute``.

    Yields
    ------
    computed_predicate
        Whatever ``execute`` returns for the predicate (presumably a
        boolean ``pd.Series`` -- confirm against ``execute``).

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    # handle suffixes. The column set does not depend on the predicate, so
    # build it once instead of once per predicate.
    data_columns = frozenset(data.columns)

    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        additional_scope = Scope()
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                # Select with an explicit list of labels rather than a dict
                # view, then rename the selected columns via the mapping.
                new_data = data.loc[:, list(mapping)].rename(columns=mapping)
            else:
                new_data = data
            additional_scope = additional_scope.merge_scope(
                Scope({root_table: new_data}, timecontext)
            )

        scope = scope.merge_scope(additional_scope)
        yield execute(predicate, scope=scope, **kwargs)
def test_scope_look_up():
    """A Scope yields a value only for the op that was registered in it."""
    day_op = ibis.interval(days=1).op()
    hour_op = ibis.interval(hours=1).op()

    # Register a value for the one-day interval only.
    populated = Scope().merge_scope(Scope({day_op: 1}, None))

    # The unregistered op is missing; the registered one is found.
    assert populated.get_value(hour_op) is None
    assert populated.get_value(day_op) is not None
def main_execute(
    expr,
    params=None,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    **kwargs,
):
    """Execute an expression against data that are bound to it.

    Normalizes the optional arguments, folds ``params`` into the scope and
    delegates to ``execute_with_scope``.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute
    params : Mapping[ibis.expr.types.Expr, object]
        The data that an unbound parameter in `expr` maps to. Keys exposing
        an ``op`` attribute are replaced by ``key.op()``; other keys are
        used as they are. ``None`` is treated as an empty mapping.
    scope : Mapping[ibis.expr.operations.Node, object]
        Additional scope, mapping ibis operations to data. ``None`` is
        replaced by an empty ``Scope``.
    timecontext : Optional[TimeContext]
        timecontext needed for execution. When given, it is passed through
        ``canonicalize_context`` before use.
    aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext]
        An object indicating how to compute aggregations. For example,
        a rolling mean needs to be computed differently than the mean of a
        column.
    kwargs : Dict[str, object]
        Additional arguments that can potentially be used by individual node
        execution

    Returns
    -------
    result
        Whatever ``execute_with_scope`` returns.

    Raises
    ------
    This function raises nothing itself; any error (e.g. for an expression
    with no bound data) comes from the functions it calls.
    """
    base_scope = Scope() if scope is None else scope

    if timecontext is not None:
        # convert timecontext to datetime type, if time strings are provided
        timecontext = canonicalize_context(timecontext)

    raw_params = {} if params is None else params

    # TODO: make expressions hashable so that we can get rid of these .op()
    # calls everywhere
    bound_params = {}
    for key, value in raw_params.items():
        node = key.op() if hasattr(key, 'op') else key
        bound_params[node] = value

    merged_scope = base_scope.merge_scope(Scope(bound_params, timecontext))
    return execute_with_scope(
        expr,
        merged_scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
def execute_until_in_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    post_execute_=None,
    **kwargs,
) -> Scope:
    """Execute until our op is in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
    scope : Scope
    timecontext : Optional[TimeContext]
    aggcontext : Optional[AggregationContext]
        Must not be None (asserted).
    clients : List[ibis.client.Client]
        Must not be None (asserted).
    post_execute_ : callable
        Must not be None (asserted). Called as
        ``post_execute_(op, result, timecontext=timecontext)`` on the value
        produced by ``execute_node``.
    kwargs : Mapping

    Returns
    -------
    Scope
        One of: the input ``scope`` when it already holds a value for the
        op; a one-entry scope for a literal; ``scope`` merged with the
        ``pre_execute`` result when that supplies the op; otherwise a
        one-entry scope mapping the op to its post-processed result.

    Raises
    ------
    com.IbisError
        If ``compute_time_context`` returns a list whose length differs
        from the number of computable arguments.
    com.UnboundExpressionError
        If there are computable arguments but no scopes were produced.
    """
    # these should never be None
    assert aggcontext is not None, 'aggcontext is None'
    assert clients is not None, 'clients is None'
    assert post_execute_ is not None, 'post_execute_ is None'

    op = expr.op()

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    if scope.get_value(op, timecontext) is not None:
        return scope

    if isinstance(op, ops.Literal):
        # special case literals to avoid the overhead of dispatching
        # execute_node
        literal_value = execute_literal(
            op, op.value, expr.type(), aggcontext=aggcontext, **kwargs
        )
        return Scope({op: literal_value}, timecontext)

    # figure out what arguments we're able to compute on based on the
    # expressions inputs. things like expressions, None, and scalar types are
    # computable whereas ``list``s are not
    computable_args = list(filter(is_computable_input, op.inputs))
    arg_count = len(computable_args)

    # one time context per computable argument; each is handed to the
    # execution of the matching argument below
    if timecontext:
        arg_timecontexts = compute_time_context(
            op,
            num_args=arg_count,
            timecontext=timecontext,
            clients=clients,
        )
    else:
        arg_timecontexts = [None] * arg_count

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    new_scope = scope.merge_scope(pre_executed_scope)

    # Short circuit: if pre_execute puts op in scope, then we don't need to
    # execute its computable_args
    if new_scope.get_value(op, timecontext) is not None:
        return new_scope

    # compute_time_context should return a list with the same length as
    # computable_args, because the two lists are zipped together for
    # further execution
    if len(arg_timecontexts) != len(computable_args):
        raise com.IbisError(
            'arg_timecontexts differ with computable_arg in length '
            f'for type:\n{type(op).__name__}.'
        )

    # recursively compute each node's arguments until we've changed type.
    # Each argument is executed under its own time context; non-expression
    # arguments are simply mapped to themselves.
    scopes = []
    for arg, arg_timecontext in zip(computable_args, arg_timecontexts):
        if hasattr(arg, 'op'):
            arg_scope = execute_until_in_scope(
                arg,
                new_scope,
                timecontext=arg_timecontext,
                aggcontext=aggcontext,
                post_execute_=post_execute_,
                clients=clients,
                **kwargs,
            )
        else:
            arg_scope = Scope({arg: arg}, arg_timecontext)
        scopes.append(arg_scope)

    # if we're unable to find data then raise an exception
    if not scopes and computable_args:
        raise com.UnboundExpressionError(
            'Unable to find data for expression:\n{}'.format(repr(expr))
        )

    # there should be exactly one dictionary per computable argument
    assert len(computable_args) == len(scopes)

    new_scope = new_scope.merge_scopes(scopes)

    # pass our computed arguments to this node's execute_node implementation;
    # note the lookups use this node's time context, not the per-argument one
    data = []
    for arg in computable_args:
        if hasattr(arg, 'op'):
            data.append(new_scope.get_value(arg.op(), timecontext))
        else:
            data.append(arg)

    # execute_node receives the scope this function was called with, not
    # the merged one
    result = execute_node(
        op,
        *data,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    computed = post_execute_(op, result, timecontext=timecontext)
    return Scope({op: computed}, timecontext)
def execute_with_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute.
    scope : Scope
        A Scope class, with dictionary mapping
        :class:`~ibis.expr.operations.Node` subclass instances to concrete
        data such as a pandas DataFrame.
    timecontext : Optional[TimeContext]
        A tuple of (begin, end) that is passed from parent Node to children
        see [timecontext.py](ibis/backends/pandas/execution/timecontext.py)
        for detailed usage for this time context.
    aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext]
        Defaults to ``agg_ctx.Summarize()`` when None.
    clients : optional
        Defaults to ``list(find_backends(expr))`` when None.
    kwargs
        Forwarded to ``pre_execute``, ``execute_until_in_scope`` and
        ``post_execute``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
        The value stored for ``expr.op()`` in the scope returned by
        ``execute_until_in_scope``.
    """
    op = expr.op()

    if clients is None:
        clients = list(find_backends(expr))

    if aggcontext is None:
        aggcontext = agg_ctx.Summarize()

    # Call pre_execute, to allow clients to intercept the expression before
    # computing anything *and* before associating leaf nodes with data. This
    # allows clients to provide their own data for each leaf.
    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    seeded_scope = scope.merge_scope(pre_executed_scope)

    # XXX: we *explicitly* bind scope and not the merged scope here so that
    # post_execute sees the scope of execute_with_scope, not the scope of
    # execute_until_in_scope
    finalize = functools.partial(
        post_execute,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )

    result_scope = execute_until_in_scope(
        expr,
        seeded_scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        post_execute_=finalize,
        **kwargs,
    )
    return result_scope.get_value(op, timecontext)
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    """Execute a window operation and return the resulting column.

    Parameters
    ----------
    op
        The window node; ``op.expr`` is the operand evaluated over the
        window and ``op.root_tables()`` must contain exactly one table.
    data
        Ignored as passed in: it is replaced by the result of executing the
        single root table of ``op``.
    window
        Supplies ``following``, ``_order_by`` and ``_group_by``.
    scope : Scope
        Used unconditionally (``scope.merge_scope``), so despite the
        ``None`` default a real scope must be supplied.
    timecontext : Optional[TimeContext]
        Time context requested by the parent of this window op. The result
        is trimmed to it at the end.
    aggcontext
        Used for the pre-execution, root table and grouping key steps; the
        operand itself runs with a context built by ``get_aggcontext``.
    clients
        Unpacked into ``pre_execute``, so it must be iterable.
    kwargs
        Forwarded to the helper calls.

    Returns
    -------
    The post-processed result, after ``trim_with_timecontext``.

    Raises
    ------
    com.OperationNotDefinedError
        If the window has an ``order_by``, a non-zero ``following`` and the
        operand is not an ``ops.ShiftBase``.
    AssertionError
        If the computed column and the root table data differ in length.
    """
    operand = op.expr

    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    # timecontext is the original time context required by parent node
    # of this WindowOp, while adjusted_timecontext is the adjusted context
    # of this Window, since we are doing a manual execution here, use
    # adjusted_timecontext in later execution phases
    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)

    (root,) = op.root_tables()
    data = execute(
        root.to_expr(),
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    is_shift = isinstance(operand_op, ops.ShiftBase)
    if order_by and following != 0 and not is_shift:
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by

    # A plain table column is grouped on by name; any other key is executed
    # and its result used as the grouping key.
    grouping_keys = []
    for key in group_by:
        key_op = key.op()
        if isinstance(key_op, ops.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(
                execute(
                    key,
                    scope=scope,
                    clients=clients,
                    timecontext=adjusted_timecontext,
                    aggcontext=aggcontext,
                    **kwargs,
                )
            )

    # Stays empty unless an order_by branch below replaces it.
    ordering_keys = []

    if group_by and order_by:
        sorted_df, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data,
            order_by,
            group_by=group_by,
            timecontext=adjusted_timecontext,
            **kwargs,
        )
        source = sorted_df.groupby(grouping_keys, sort=True)
        post_process = _post_process_group_by_order_by
    elif group_by:
        source = data.groupby(grouping_keys, sort=False)
        post_process = _post_process_group_by
    elif order_by:
        source, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data, order_by, timecontext=adjusted_timecontext, **kwargs
        )
        post_process = _post_process_order_by
    else:
        source = data
        post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten with a groupby object,
    # so we force an update regardless of time context
    root_table_scopes = [
        Scope({table: source}, adjusted_timecontext)
        for table in operand_op.root_tables()
    ]
    new_scope = scope.merge_scopes(root_table_scopes, overwrite=True)

    # The aggregation context is built against the scope *before* the
    # overwrite above; only the operand execution sees new_scope.
    window_aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=window_aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(
        result,
        data,
        ordering_keys,
        grouping_keys,
        adjusted_timecontext,
    )
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'

    # trim data to original time context
    return trim_with_timecontext(series, timecontext)