def compute_projection_scalar_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)
    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )
    scalar = execute(expr, scope=scope, **kwargs)
    return data.assign(**{name: scalar})[name]

def csv_pre_execute_selection(
    op: ops.Node,
    client: CSVClient,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    tables = filter(
        lambda t: scope.get_value(t, timecontext) is None,
        physical_tables(op.table.op()),
    )

    new_scope = Scope()
    for table in tables:
        path = client.dictionary[table.name]
        usecols = None

        if op.selections:
            header = _read_csv(path, schema=table.schema, header=0, nrows=1)
            usecols = [
                getattr(s.op(), 'name', None) or s.get_name()
                for s in op.selections
            ]

            # we cannot read only the requested columns if some of them are
            # missing from the file's header, so fall back to reading all
            if len(pd.Index(usecols) & header.columns) != len(usecols):
                usecols = None

        result = _read_csv(path, table.schema, usecols=usecols, header=0)
        new_scope = new_scope.merge_scope(Scope({table: result}, timecontext))

    return new_scope

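# A hedged illustration of the ``usecols`` pruning above, with hypothetical
# column names: if the selection asks for ['a', 'b'] but the CSV header only
# contains 'a', the intersection is shorter than the request, so the code
# falls back to reading every column.
#
#   requested = pd.Index(['a', 'b'])
#   header_cols = pd.Index(['a'])
#   len(requested & header_cols) != len(requested)  # True -> usecols = None
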
def compile(self, expr, timecontext=None, params=None, *args, **kwargs):
    """Compile an ibis expression to a PySpark DataFrame object."""
    if timecontext is not None:
        session_timezone = self._session.conf.get(
            'spark.sql.session.timeZone'
        )
        # Since Spark uses the session timezone for tz-naive timestamps,
        # we localize the tz-naive context here to match that behavior
        timecontext = localize_context(
            canonicalize_context(timecontext), session_timezone
        )

    # Insert params into the scope
    if params is None:
        scope = Scope()
    else:
        scope = Scope(
            {param.op(): raw_value for param, raw_value in params.items()},
            timecontext,
        )
    return self.translator.translate(
        expr, scope=scope, timecontext=timecontext
    )

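# A hedged sketch of the timecontext handling above (hypothetical values): a
# timecontext is a (begin, end) pair of timestamps or time strings. With a
# session timezone of, say, 'UTC', a tz-naive context becomes tz-aware so it
# compares correctly against Spark's session-local timestamps.
#
#   timecontext = (pd.Timestamp('20200101'), pd.Timestamp('20200201'))
#   # canonicalize_context validates/coerces the bounds; localize_context
#   # then attaches the session timezone, yielding tz-aware timestamps
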
def compute_sort_key(
    key: ops.SortKey,
    data: dd.DataFrame,
    timecontext: Optional[TimeContext] = None,
    scope: Optional[Scope] = None,
    **kwargs,
):
    """Compute a sort key.

    Note: we use this function instead of the one in
    ``pandas.execution.util`` so that the dask ``execute`` method is used.

    This function borrows the logic from the pandas backend. ``by`` can be a
    string or an expression. If ``by.get_name()`` raises an exception, we
    must ``execute`` the expression and sort by the newly derived column.
    """
    by = key.to_expr()
    name = ibis.util.guid()
    try:
        if isinstance(by, str):
            return name, data[by]
        return name, data[by.get_name()]
    except com.ExpressionError:
        if scope is None:
            scope = Scope()
        scope = scope.merge_scopes(
            Scope({t: data}, timecontext) for t in by.op().root_tables()
        )
        new_column = execute(by, scope=scope, **kwargs)
        new_column.name = name
        return name, new_column

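# A hedged usage sketch (hypothetical table and key): a sort key over an
# unnamed derived expression such as ``t.a + t.b`` has no column name, so
# ``by.get_name()`` raises ``com.ExpressionError`` and the expression is
# executed into a fresh column under a generated guid:
#
#   name, col = compute_sort_key(sort_key, ddf, timecontext=None)
#   ddf = ddf.assign(**{name: col})  # the caller can then sort on ``name``
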
def compute_projection_scalar_expr(
    expr,
    parent,
    data,
    scope: Optional[Scope] = None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    if scope is None:
        scope = Scope()

    data_columns = frozenset(data.columns)
    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )
    scalar = execute(expr, scope=scope, **kwargs)
    # broadcast the scalar over ``data``'s index, then convert back to a
    # dask Series with the same partitioning as the input
    result = pandas.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return dd.from_pandas(result, npartitions=data.npartitions)

def main_execute(
    expr,
    params=None,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    **kwargs,
):
    """Execute an expression against data that are bound to it. If no data
    are bound, raise an exception.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute
    params : Mapping[ibis.expr.types.Expr, object]
        The data that an unbound parameter in `expr` maps to
    scope : Mapping[ibis.expr.operations.Node, object]
        Additional scope, mapping ibis operations to data
    timecontext : Optional[TimeContext]
        timecontext needed for execution
    aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext]
        An object indicating how to compute aggregations. For example, a
        rolling mean needs to be computed differently than the mean of a
        column.
    kwargs : Dict[str, object]
        Additional arguments that can potentially be used by individual node
        execution

    Returns
    -------
    result : Union[
        pandas.Series, pandas.DataFrame,
        ibis.backends.pandas.core.simple_types
    ]

    Raises
    ------
    ValueError
        * If no data are bound to the input expression
    """
    if scope is None:
        scope = Scope()

    if timecontext is not None:
        # convert timecontext to datetime type, if time strings are provided
        timecontext = canonicalize_context(timecontext)

    if params is None:
        params = {}

    # TODO: make expressions hashable so that we can get rid of these .op()
    # calls everywhere
    params = {k.op() if hasattr(k, 'op') else k: v for k, v in params.items()}
    scope = scope.merge_scope(Scope(params, timecontext))
    return execute_with_scope(
        expr,
        scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )

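# A hedged usage sketch (hypothetical table ``t`` bound to a pandas backend):
# ``params`` keys may be expressions; they are normalized to operation nodes
# via ``.op()`` before being merged into the scope.
#
#   p = ibis.param('int64')
#   expr = t.mutate(bumped=t.value + p)
#   result = main_execute(expr, params={p: 1})
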
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        (root_table,) = op.root_tables()
        left_root, right_root = ops.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )

    result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    if np.isscalar(result):
        # broadcast the scalar to a dask Series aligned with the input index
        series = dd.from_array(np.repeat(result, len(data.index)))
        series.name = result_name
        series.index = data.index
        return series
    return result.rename(result_name)

def _compute_predicates(
    table_op,
    predicates,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : Scope
    timecontext : Optional[TimeContext]
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        data_columns = frozenset(data.columns)

        additional_scope = Scope()
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope = additional_scope.merge_scope(
                Scope({root_table: new_data}, timecontext)
            )

        scope = scope.merge_scope(additional_scope)
        yield execute(predicate, scope=scope, **kwargs)

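# A hedged illustration of the suffix handling above (hypothetical columns):
# after a join, overlapping columns are disambiguated with suffixes, e.g.
# 'value_x' / 'value_y'. ``remap_overlapping_column_names`` then yields a
# mapping such as {'value_x': 'value'} for the left root table, so the
# predicate is evaluated against the original, unsuffixed column names:
#
#   mapping = {'value_x': 'value'}
#   new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
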
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name
        assert isinstance(name, str)

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        suffix = util.get_join_suffix_for_op(op, parent_table_op)
        return data.loc[:, name + suffix].rename(result_name or name)

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )

    result = coerce_to_output(
        execute(expr, scope=scope, timecontext=timecontext, **kwargs),
        expr,
        data.index,
    )
    assert result_name is not None, 'Column selection name is None'
    return result

def compute_sort_key(key, data, timecontext, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        if scope is None:
            scope = Scope()
        scope = scope.merge_scopes(
            Scope({t: data}, timecontext) for t in by.op().root_tables()
        )
        new_column = execute(by, scope=scope, **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column

def test_bad_call_to_adjust_context():
    op = "not_a_node"
    context = (pd.Timestamp('20170101'), pd.Timestamp('20170103'))
    scope = Scope()
    with pytest.raises(
        com.IbisError,
        match=r".*Unsupported input type for adjust context.*",
    ):
        adjust_context(op, scope, context)

def compile(self, expr, timecontext=None, params=None, *args, **kwargs):
    """Compile an ibis expression to a PySpark DataFrame object."""
    if timecontext is not None:
        timecontext = canonicalize_context(timecontext)

    # Insert params into the scope
    if params is None:
        scope = Scope()
    else:
        scope = Scope(
            {param.op(): raw_value for param, raw_value in params.items()},
            timecontext,
        )
    return self.translator.translate(
        expr, scope=scope, timecontext=timecontext
    )

def execute_grouped_window_op(
    op,
    data,
    window,
    scope,
    timecontext,
    aggcontext,
    clients,
    **kwargs,
):
    # extract the parent
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    root_data = execute(
        root_expr,
        scope=scope,
        timecontext=timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    group_by = window._group_by
    grouping_keys = [
        key_op.name for key_op in map(operator.methodcaller('op'), group_by)
    ]

    grouped_root_data = root_data.groupby(grouping_keys)
    scope = scope.merge_scopes(
        [
            Scope({t: grouped_root_data}, timecontext)
            for t in op.expr.op().root_tables()
        ],
        overwrite=True,
    )

    result = execute_with_scope(
        expr=op.expr,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    # If the grouped operation we performed is not an analytic UDF we have to
    # realign the output to the input.
    if not isinstance(op.expr._arg, ops.AnalyticVectorizedUDF):
        result = dd.merge(
            root_data[result.index.name].to_frame(),
            result.to_frame(),
            left_on=result.index.name,
            right_index=True,
        )[result.name]
        result.divisions = root_data.divisions

    return result

def test_scope_look_up():
    # test that scope looks up items properly
    scope = Scope()
    one_day = ibis.interval(days=1).op()
    one_hour = ibis.interval(hours=1).op()
    scope = scope.merge_scope(Scope({one_day: 1}, None))
    assert scope.get_value(one_hour) is None
    assert scope.get_value(one_day) is not None

def execute_until_in_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    post_execute_=None,
    **kwargs,
) -> Scope:
    """Execute until our op is in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
    scope : Scope
    timecontext : Optional[TimeContext]
    aggcontext : Optional[AggregationContext]
    clients : List[ibis.client.Client]
    kwargs : Mapping
    """
    # these should never be None
    assert aggcontext is not None, 'aggcontext is None'
    assert clients is not None, 'clients is None'
    assert post_execute_ is not None, 'post_execute_ is None'

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    op = expr.op()
    if scope.get_value(op, timecontext) is not None:
        return scope

    if isinstance(op, ops.Literal):
        # special case literals to avoid the overhead of dispatching
        # execute_node
        return Scope(
            {
                op: execute_literal(
                    op, op.value, expr.type(), aggcontext=aggcontext, **kwargs
                )
            },
            timecontext,
        )

    # figure out what arguments we're able to compute on based on the
    # expression's inputs. things like expressions, None, and scalar types
    # are computable whereas ``list``s are not
    computable_args = [arg for arg in op.inputs if is_computable_input(arg)]

    # arg_timecontexts is a list of time contexts with the same length as
    # computable_args; these contexts are passed to each arg
    if timecontext:
        arg_timecontexts = compute_time_context(
            op,
            num_args=len(computable_args),
            timecontext=timecontext,
            clients=clients,
        )
    else:
        arg_timecontexts = [None] * len(computable_args)

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )

    new_scope = scope.merge_scope(pre_executed_scope)

    # Short circuit: if pre_execute puts op in scope, then we don't need to
    # execute its computable_args
    if new_scope.get_value(op, timecontext) is not None:
        return new_scope

    # recursively compute each node's arguments until we've changed type.
    # compute_time_context should return a list with the same length as
    # computable_args; the two lists are zipped together for further
    # execution
    if len(arg_timecontexts) != len(computable_args):
        raise com.IbisError(
            'arg_timecontexts differ in length from computable_args '
            f'for type:\n{type(op).__name__}.'
        )

    scopes = [
        execute_until_in_scope(
            arg,
            new_scope,
            timecontext=timecontext,
            aggcontext=aggcontext,
            post_execute_=post_execute_,
            clients=clients,
            **kwargs,
        )
        if hasattr(arg, 'op')
        else Scope({arg: arg}, timecontext)
        for (arg, timecontext) in zip(computable_args, arg_timecontexts)
    ]

    # if we're unable to find data then raise an exception
    if not scopes and computable_args:
        raise com.UnboundExpressionError(
            'Unable to find data for expression:\n{}'.format(repr(expr))
        )

    # there should be exactly one dictionary per computable argument
    assert len(computable_args) == len(scopes)

    new_scope = new_scope.merge_scopes(scopes)
    # pass our computed arguments to this node's execute_node implementation
    data = [
        new_scope.get_value(arg.op(), timecontext)
        if hasattr(arg, 'op')
        else arg
        for arg in computable_args
    ]

    result = execute_node(
        op,
        *data,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    computed = post_execute_(op, result, timecontext=timecontext)
    return Scope({op: computed}, timecontext)

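# A hedged illustration of the literal short-circuit above (hypothetical
# wiring; ``agg_ctx.Summarize()`` and a pass-through ``post_execute_``
# stand in for what real call sites supply):
#
#   lit = ibis.literal(2)
#   s = execute_until_in_scope(
#       lit,
#       Scope(),
#       aggcontext=agg_ctx.Summarize(),
#       clients=[],
#       post_execute_=lambda op, result, **kw: result,
#   )
#   assert s.get_value(lit.op(), None) == 2
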
def execute_with_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        The expression to execute.
    scope : Scope
        A Scope class, with dictionary mapping
        :class:`~ibis.expr.operations.Node` subclass instances to concrete
        data such as a pandas DataFrame.
    timecontext : Optional[TimeContext]
        A tuple of (begin, end) that is passed from parent Node to children;
        see [timecontext.py](ibis/backends/pandas/execution/timecontext.py)
        for detailed usage of this time context.
    aggcontext : Optional[ibis.backends.pandas.aggcontext.AggregationContext]

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    op = expr.op()

    # Call pre_execute, to allow clients to intercept the expression before
    # computing anything *and* before associating leaf nodes with data. This
    # allows clients to provide their own data for each leaf.
    if clients is None:
        clients = list(find_backends(expr))

    if aggcontext is None:
        aggcontext = agg_ctx.Summarize()

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    new_scope = scope.merge_scope(pre_executed_scope)
    result = execute_until_in_scope(
        expr,
        new_scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        # XXX: we *explicitly* pass in scope and not new_scope here so that
        # post_execute sees the scope of execute_with_scope, not the scope of
        # execute_until_in_scope
        post_execute_=functools.partial(
            post_execute,
            scope=scope,
            timecontext=timecontext,
            aggcontext=aggcontext,
            clients=clients,
            **kwargs,
        ),
        **kwargs,
    ).get_value(op, timecontext)
    return result

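# A hedged usage sketch (hypothetical table and data): bind a DataFrame to an
# unbound table's op directly through the scope, then execute a column
# projection against it:
#
#   df = pd.DataFrame({'a': [1, 2, 3]})
#   t = ibis.table([('a', 'int64')], name='t')
#   result = execute_with_scope(t.a, Scope({t.op(): df}, None))
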
def test_pre_execute(op, client, **kwargs):
    called[0] += 1
    return Scope()

def execute_window_op(
    op,
    data,
    window,
    scope: Optional[Scope] = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    if scope is None:
        scope = Scope()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )
        # timecontext is the original time context required by the parent
        # node of this WindowOp, while adjusted_timecontext is the adjusted
        # context of this Window; since we are doing a manual execution here,
        # use adjusted_timecontext in later execution phases
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )
    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    if not order_by:
        ordering_keys = []

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten by a groupby object, so
    # we force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(
        result,
        data,
        ordering_keys,
        grouping_keys,
        adjusted_timecontext,
    )
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'

    # trim data to the original time context
    series = trim_with_timecontext(series, timecontext)
    return series

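# A hedged sketch of the kind of expression routed here (hypothetical table
# ``t`` with columns ``g``, ``ts``, and ``value``): a grouped, ordered,
# trailing window over which a reduction is applied.
#
#   w = ibis.trailing_window(3, group_by=t.g, order_by=t.ts)
#   expr = t.value.mean().over(w)
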
def pre_execute_timestamp_now(op, *args, **kwargs):
    timecontext = kwargs.get('timecontext', None)
    return Scope({op: pd.Timestamp('now')}, timecontext)

def pre_execute_default(node, *clients, **kwargs):
    return Scope()

def execute_aggregation_dataframe(
    op,
    data,
    scope=None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (
                execute(p, scope=scope, timecontext=timecontext, **kwargs)
                for p in predicates
            ),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(
                by, scope=scope, timecontext=timecontext, **kwargs
            ).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    scope = scope.merge_scope(Scope({op.table.op(): source}, timecontext))

    pieces = [
        coerce_to_output(
            execute(metric, scope=scope, timecontext=timecontext, **kwargs),
            metric,
        )
        for metric in op.metrics
    ]

    result = pd.concat(pieces, axis=1)

    # If grouping, need a reset to get the grouping key back as a column
    if op.by:
        result = result.reset_index()

    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(
                    having, scope=scope, timecontext=timecontext, **kwargs
                )
                for having in op.having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]

    return result

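# A hedged sketch of an expression this node handles (hypothetical table
# ``t`` with columns ``key`` and ``value``): a grouped aggregation with a
# predicate and a having clause.
#
#   expr = (
#       t[t.value > 0]
#       .group_by(t.key)
#       .having(t.count() > 1)
#       .aggregate(total=t.value.sum())
#   )
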
def pre_execute_test(op, *clients, scope=None, **kwargs):
    return Scope({op: 4}, None)