def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    """Execute an aggregation operation against a DataFrame.

    Parameters
    ----------
    op
        The aggregation operation. Its ``metrics``, ``sort_keys``,
        ``predicates``, ``by``, ``having`` and ``table`` attributes are read.
    data : pd.DataFrame
        The data to aggregate.
    scope : dict, optional
        Mapping of operations to already computed data.
    kwargs : dict
        Forwarded to every nested ``execute`` call.

    Returns
    -------
    result : pd.DataFrame
        One column per metric, plus whatever ``reset_index`` restores from
        the index of the concatenated metrics.

    Raises
    ------
    NotImplementedError
        If ``op.sort_keys`` is non-empty.
    ValueError
        If ``op.having`` is non-empty but ``op.by`` is empty.
    """
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    # Filter rows before aggregating.
    where_clauses = op.predicates
    if where_clauses:
        row_mask = functools.reduce(
            operator.and_,
            (execute(clause, scope, **kwargs) for clause in where_clauses),
        )
        data = data.loc[row_mask]

    # Maps an underlying column name to the name its grouping expression
    # should carry in the output.
    renames = {}

    if not op.by:
        source = data
    else:
        key_pairs = [(by, by.op()) for by in op.by]

        keys = []
        for by, by_op in key_pairs:
            if isinstance(by_op, ir.TableColumn):
                # plain columns can be grouped on by name
                keys.append(by_op.name)
            else:
                # computed keys are materialized and named after the expr
                computed_key = execute(by, scope, **kwargs)
                keys.append(computed_key.rename(by.get_name()))

        for by, by_op in key_pairs:
            if hasattr(by_op, 'name'):
                renames[by_op.name] = by.get_name()

        source = data.groupby(keys)

    new_scope = toolz.merge(scope, {op.table.op(): source})

    metric_columns = []
    for metric in op.metrics:
        values = execute(metric, new_scope, **kwargs)
        metric_columns.append(pd.Series(values, name=metric.get_name()))

    result = pd.concat(metric_columns, axis=1).reset_index()
    result.columns = [renames.get(label, label) for label in result.columns]

    if not op.having:
        return result

    # .having(...) is only accessible on groupby, so this should never
    # raise
    if not op.by:
        raise ValueError(
            'Filtering out aggregation values is not allowed without at '
            'least one grouping key')

    # TODO(phillipc): Don't recompute identical subexpressions
    having_mask = functools.reduce(
        operator.and_,
        (execute(clause, new_scope, **kwargs) for clause in op.having),
    )
    assert len(having_mask) == len(result), \
        'length of predicate does not match length of DataFrame'
    return result.loc[having_mask.values].reset_index(drop=True)
def execute_with_scope(expr, scope, context=None, **kwargs):
    """Execute an expression `expr`, with data provided in `scope`.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute.
    scope : dict
        A dictionary mapping :class:`~ibis.expr.types.Node` subclass instances
        to concrete data such as a pandas DataFrame.
    context : optional
        Aggregation context forwarded to the dispatched functions. When
        ``None``, ``ctx.Summarize()`` is used.
    kwargs : dict
        Forwarded unchanged to every dispatched call.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame
    """
    op = expr.op()

    # Base case: the op is already computed (or is a leaf data node).
    if op in scope:
        return scope[op]

    if context is None:
        context = ctx.Summarize()

    # Gather the data of every root table; a single missing table means the
    # ``execute_first`` shortcut cannot be attempted.
    try:
        root_data = [scope[table] for table in op.root_tables()]
    except KeyError:
        root_data = None

    if root_data is not None:
        try:
            # Special case: a definition of execute_first matches the current
            # operation and its data leaves.
            return execute_first(
                op, *root_data, scope=scope, context=context, **kwargs)
        except NotImplementedError:
            # No such definition; fall through to the generic path.
            pass

    # Recursively compute the op's arguments, skipping those that are not
    # valid inputs and passing non-expression values through untouched.
    evaluated = []
    for arg in op.args:
        if not isinstance(arg, _VALID_INPUT_TYPES):
            continue
        if hasattr(arg, 'op'):
            arg = execute(arg, scope, context=context, **kwargs)
        evaluated.append(arg)

    # Compute the op itself from its computed arguments.
    return execute_node(
        op, *evaluated, scope=scope, context=context, **kwargs)
def compute_sort_key(key, data, **kwargs):
    """Resolve a sort key to a column name and, if needed, a new column.

    Parameters
    ----------
    key
        Sort key whose first argument (``key.args[0]``) is the expression to
        sort by.
    data
        Data bound to every root table of the sort expression when the
        expression has to be computed.
    kwargs : dict
        Forwarded to ``execute``.

    Returns
    -------
    name, new_column : tuple
        ``(name, None)`` when the sort expression already has a name.
        Otherwise ``(generated_name, column)``, where ``column`` is the
        executed expression with its ``name`` set to the generated name.
    """
    sort_expr = key.args[0]
    try:
        known_name = sort_expr.get_name()
    except com.ExpressionError:
        # Unnamed expression: compute it under a generated name.
        temp_name = ibis.util.guid()
        key_scope = dict.fromkeys(sort_expr.op().root_tables(), data)
        computed = execute(sort_expr, key_scope, **kwargs)
        computed.name = temp_name
        return temp_name, computed
    else:
        return known_name, None
def _compute_join_column(column_expr, **kwargs):
    """Return a join column together with the single root table it uses.

    Parameters
    ----------
    column_expr
        Column expression used in a join.
    kwargs : dict
        Forwarded to ``execute`` when the column has to be computed.

    Returns
    -------
    new_column, root_table : tuple
        ``new_column`` is the column name when the expression is a plain
        ``ops.TableColumn``, otherwise the result of executing the expression.

    Raises
    ------
    ValueError
        If the expression does not have exactly one root table (raised by the
        single-element unpacking).
    """
    column_op = column_expr.op()
    is_plain_column = isinstance(column_op, ops.TableColumn)
    new_column = (
        column_op.name if is_plain_column
        else execute(column_expr, **kwargs)
    )
    # Exactly one root table is expected.
    (root_table,) = column_op.root_tables()
    return new_column, root_table
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    """Execute an aggregation operation (without ``having``) on a DataFrame.

    Parameters
    ----------
    op
        The aggregation operation. Its ``metrics``, ``having``,
        ``sort_keys``, ``predicates``, ``by`` and ``table`` attributes are
        read.
    data : pd.DataFrame
        The data to aggregate.
    scope : dict, optional
        Mapping of operations to already computed data.
    kwargs : dict
        Forwarded to every nested ``execute`` call.

    Returns
    -------
    df : pd.DataFrame
        One column per metric, plus whatever ``reset_index`` restores from
        the index of the concatenated metrics.

    Raises
    ------
    NotImplementedError
        If ``op.having`` or ``op.sort_keys`` is non-empty.
    """
    assert op.metrics

    if op.having:
        raise NotImplementedError('having expressions not yet implemented')

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented')

    # Filter rows before aggregating.
    filters = op.predicates
    if filters:
        mask = functools.reduce(
            operator.and_,
            (execute(condition, scope, **kwargs) for condition in filters),
        )
        data = data.loc[mask]

    # Maps an underlying column name to the output name of its grouping
    # expression.
    output_names = {}
    source = data

    if op.by:
        pairs = [(by, by.op()) for by in op.by]

        group_keys = []
        for by, by_op in pairs:
            if isinstance(by_op, ir.TableColumn):
                group_keys.append(by_op.name)
            else:
                key_values = execute(by, scope, **kwargs)
                group_keys.append(key_values.rename(by.get_name()))

        for by, by_op in pairs:
            if hasattr(by_op, 'name'):
                output_names[by_op.name] = by.get_name()

        source = data.groupby(group_keys)

    new_scope = toolz.merge(scope, {op.table.op(): source})

    pieces = []
    for metric in op.metrics:
        computed = execute(metric, new_scope, **kwargs)
        pieces.append(pd.Series(computed, name=metric.get_name()))

    df = pd.concat(pieces, axis=1).reset_index()
    df.columns = [output_names.get(col, col) for col in df.columns]
    return df
def execute_without_scope(
        expr, params=None, scope=None, context=None, **kwargs):
    """Execute an expression against data that are bound to it.

    If no data are bound, raise an Exception.

    Parameters
    ----------
    expr : ir.Expr
        The expression to execute
    params : Dict[Expr, object], optional
        Keys that have an ``op`` attribute are replaced by ``key.op()``
        before being merged into the scope.
    scope : dict, optional
        Additional pre-computed data; defaults to an empty mapping of the
        same type as the discovered data scope.
    context : optional
        Aggregation context; ``agg_ctx.Summarize()`` is used when ``None``.
    kwargs : dict
        Forwarded to ``execute``.

    Returns
    -------
    result : scalar, pd.Series, pd.DataFrame

    Raises
    ------
    ValueError
        * If no data are bound to the input expression
    """
    data_scope = find_data(expr)
    if not data_scope:
        raise ValueError(
            'No data sources found while trying to execute against the pandas '
            'backend'
        )

    # Build every mapping with the same type as the discovered data scope.
    factory = type(data_scope)
    scope = factory() if scope is None else scope
    params = factory() if params is None else params

    normalized_params = {}
    for key, value in params.items():
        node = key.op() if hasattr(key, 'op') else key
        normalized_params[node] = value

    new_scope = toolz.merge(
        scope, data_scope, normalized_params, factory=factory)

    # data_preload: only values of existing keys are replaced, so the
    # mapping's size does not change while it is being iterated.
    new_scope.update(
        (node, data_preload(node, bound, scope=new_scope))
        for node, bound in new_scope.items()
    )

    # By default, our aggregate functions are N -> 1
    if context is None:
        context = agg_ctx.Summarize()
    return execute(expr, new_scope, context=context, **kwargs)
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    """Compute a scalar selection and broadcast it over the rows of ``data``.

    Parameters
    ----------
    expr
        Named scalar expression being selected.
    parent
        The parent operation; ``parent.table.op()`` is used to remap
        overlapping column names.
    data : pd.DataFrame
        The data the selection is computed against.
    scope : dict, optional
        Mapping of operations to already computed data.
    kwargs : dict
        Forwarded to ``execute``.

    Returns
    -------
    pd.Series
        A series named after the expression, aligned to ``data.index``, with
        the scalar result repeated once per row.
    """
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    # Bind every root table of the expression to ``data`` with overlapping
    # column names remapped.
    additional_scope = OrderedDict(
        (
            table,
            map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, table, data_columns),
                data,
            ),
        )
        for table in op.root_tables()
    )

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    result = execute(expr, new_scope, **kwargs)

    # A one-element list combined with an n-row index makes pandas raise
    # "Length of values does not match length of index" unless n == 1, so
    # repeat the scalar once per row. Keeping the value wrapped in a list
    # also stops a list-like scalar from being unpacked element-wise.
    return pd.Series(
        [result] * len(data.index), name=name, index=data.index)
def _compute_predicates(table_op, predicates, data, scope, **kwargs):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : dict
    kwargs : dict

    Yields
    ------
    computed_predicate : pd.Series[bool]
        One executed predicate per input predicate, in order.

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        data_columns = frozenset(data.columns)
        per_table_data = {}
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            per_table_data[root_table] = (
                data if mapping is None
                else data.loc[:, mapping.keys()].rename(columns=mapping)
            )

        yield execute(
            predicate, toolz.merge(scope, per_table_data), **kwargs)
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    """Compute a column selection against ``data``.

    Parameters
    ----------
    expr
        Column expression being selected.
    parent
        The parent operation; ``parent.table.op()`` is inspected to resolve
        join suffixes and overlapping column names.
    data : pd.DataFrame
        The data the selection is computed against.
    scope : dict, optional
        Mapping of operations to already computed data.
    kwargs : dict
        Forwarded to ``execute``.

    Returns
    -------
    pd.Series
        The selected column, renamed to the expression's name (or kept under
        the column's own name when a plain column expression is unnamed).

    Raises
    ------
    KeyError
        If a plain column is not present in ``data`` and the parent table is
        not a join.
    """
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if not isinstance(op, ir.TableColumn):
        # General path: execute the expression with every root table bound
        # to ``data`` (overlapping column names remapped).
        data_columns = frozenset(data.columns)
        additional_scope = {}
        for table in op.root_tables():
            additional_scope[table] = map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, table, data_columns),
                data,
            )

        new_scope = toolz.merge(scope, additional_scope)
        result = execute(expr, new_scope, **kwargs)
        assert result_name is not None, 'Column selection name is None'
        return result.rename(result_name)

    # slightly faster path for simple column selection
    name = op.name

    if name in data:
        return data[name].rename(result_name or name)

    if not isinstance(parent_table_op, ops.Join):
        raise KeyError(name)

    # The column is only available under a join suffix; pick the suffix that
    # belongs to the side the column originates from.
    (root_table,) = op.root_tables()
    left_root, right_root = ir.distinct_roots(
        parent_table_op.left, parent_table_op.right
    )
    suffixes = {
        left_root: constants.LEFT_JOIN_SUFFIX,
        right_root: constants.RIGHT_JOIN_SUFFIX,
    }
    suffixed_name = name + suffixes[root_table]
    return data.loc[:, suffixed_name].rename(result_name or name)
def execute_node_value_list(op, **kwargs):
    """Execute every value of ``op.values`` and return the results as a list.

    ``kwargs`` are forwarded unchanged to each ``execute`` call.
    """
    evaluated = []
    for value in op.values:
        evaluated.append(execute(value, **kwargs))
    return evaluated
def execute_frame_window_op(op, data, scope=None, context=None, **kwargs):
    """Execute a window operation against a DataFrame.

    Parameters
    ----------
    op
        The window operation; ``op.args`` unpacks to ``(operand, window)``.
    data : pd.DataFrame
        The data the window is computed over.
    scope : dict, optional
        Mapping of operations to already computed data.
    context : optional
        Used when executing computed grouping keys; replaced by a context
        chosen from the window's shape before the operand is executed.
    kwargs : dict
        Forwarded to nested ``execute`` and sorting calls.

    Returns
    -------
    series
        The post-processed result, with the same length as ``data``.

    Raises
    ------
    ValueError
        If the window has an ``order_by`` and ``following`` is not 0.
    """
    operand, window = op.args

    order_by = window._order_by
    if order_by and window.following != 0:
        raise ValueError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.')

    # Plain columns are grouped by name; anything else is computed first.
    grouping_keys = []
    for key in window._group_by:
        key_op = key.op()
        if isinstance(key_op, ir.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(execute(key, context=context, **kwargs))

    def _sort_group(df):
        # Sort a single group by the window's ordering keys.
        return util.compute_sorted_frame(order_by, df, **kwargs)

    if not grouping_keys:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty
    else:
        source = data.groupby(
            grouping_keys, sort=False, as_index=not order_by)
        if order_by:
            # Sort within each group, then regroup the sorted frame.
            sorted_df = source.apply(_sort_group)
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by

    new_scope = toolz.merge(
        scope,
        OrderedDict((table, source) for table in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not (grouping_keys or order_by):
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        preceding = window.preceding
        if preceding is None:
            context = agg_ctx.Cumulative()
        else:
            context = agg_ctx.Trailing(preceding)
    else:
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(result, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series
def execute_selection_dataframe(op, data, scope=None, **kwargs):
    """Execute a selection (projection, filter, sort) against a DataFrame.

    Parameters
    ----------
    op
        The selection operation. Its ``selections``, ``predicates``,
        ``sort_keys`` and ``table`` attributes are read.
    data : pd.DataFrame
        The data to select from.
    scope : dict, optional
        Mapping of operations to already computed data.
    kwargs : dict
        Forwarded to nested ``execute`` and sorting calls.

    Returns
    -------
    pd.DataFrame
        The projected, filtered and sorted data with a fresh index
        (``reset_index(drop=True)``).

    Raises
    ------
    TypeError
        If a selection is not a scalar, column or table expression.
    """
    selections = op.selections
    predicates = op.predicates
    sort_keys = op.sort_keys

    result = data

    if selections:
        data_pieces = []

        for selection in selections:
            table_op = op.table.op()
            selection_operation = selection.op()

            if op.table is selection:
                # selecting the parent table itself: keep everything
                piece = data
            elif isinstance(selection, ir.ScalarExpr):
                # bind every root table of the scalar to the input data
                root_tables = selection_operation.root_tables()
                additional_scope = collections.OrderedDict(
                    (table, data) for table in root_tables)
                new_scope = toolz.merge(
                    scope,
                    additional_scope,
                    factory=collections.OrderedDict,
                )
                piece = execute(selection, new_scope, **kwargs)
            elif isinstance(selection, ir.ColumnExpr):
                if isinstance(selection_operation, ir.TableColumn):
                    # slightly faster path for simple column selection
                    piece = data[selection_operation.name]
                else:
                    # On a join, bind the data to the table the selection
                    # operation refers to; otherwise to the parent table.
                    if isinstance(table_op, ops.Join):
                        bound_op = selection_operation.table.op()
                    else:
                        bound_op = op.table.op()
                    piece = execute(
                        selection,
                        toolz.merge(scope, {bound_op: data}),
                        **kwargs)
            elif isinstance(selection, ir.TableExpr):
                # These two statements should never raise unless our
                # assumptions are wrong because:
                # 1. If we're selecting ourself, then we've already caught
                #    that case above
                # 2. We've checked that `s` originates from its parent before
                #    executing
                assert isinstance(table_op, ops.Join)
                assert selection.equals(table_op.left) or selection.equals(
                    table_op.right)
                piece = data[selection.columns]
            else:
                raise TypeError(
                    "Don't know how to compute selection of type {}".format(
                        type(selection_operation).__name__))

            if isinstance(piece, pd.Series):
                # name the column after the selection when it has a name
                piece = piece.rename(
                    getattr(selection, '_name', piece.name))

            data_pieces.append(piece)

        result = pd.concat(data_pieces, axis=1)

    if predicates:
        row_mask = functools.reduce(
            operator.and_,
            (execute(predicate, scope, **kwargs) for predicate in predicates),
        )
        result = result.loc[row_mask]

    if sort_keys:
        result = _compute_sorted_frame(sort_keys, result, **kwargs)

    return result.reset_index(drop=True)