def execute_selection_dataframe(op, data, scope=None, **kwargs):
    """Apply a selection's projections, predicates and sort keys to ``data``.

    Parameters
    ----------
    op
        Selection node; ``selections``, ``predicates``, ``sort_keys`` and
        ``table`` are read from it.
    data
        Input frame. It is used unchanged when ``op.selections`` is empty.
    scope
        Forwarded to ``compute_projection`` and ``_compute_predicates``.
    **kwargs
        Forwarded to every helper called here.

    Returns
    -------
    The resulting frame with its index replaced via
    ``reset_index(drop=True)``.

    Raises
    ------
    AssertionError
        If the combined predicate and the projected frame differ in length.
    """
    frame = data

    projections = op.selections
    if projections:
        # One pandas object per selection expression, glued column-wise.
        columns = [
            compute_projection(expr, op, data, scope=scope, **kwargs)
            for expr in projections
        ]
        frame = pd.concat(columns, axis=1)

    filters = op.predicates
    if filters:
        # Predicates are computed against the *input* data, then AND-ed
        # together into a single mask applied to the projected frame.
        masks = _compute_predicates(
            op.table.op(), filters, data, scope, **kwargs
        )
        mask = functools.reduce(operator.and_, masks)
        assert len(mask) == len(frame), \
            'Selection predicate length does not match underlying table'
        frame = frame.loc[mask]

    ordering = op.sort_keys
    if ordering:
        frame = util.compute_sorted_frame(ordering, frame, **kwargs)

    return frame.reset_index(drop=True)
def execute_selection_dataframe(op, data, scope, timecontext: Optional[TimeContext], **kwargs):
    """Apply a selection's projections, predicates and sort keys to ``data``.

    Parameters
    ----------
    op
        Selection node; ``selections``, ``predicates``, ``sort_keys`` and
        ``table`` are read from it.
    data
        Input frame. It is used unchanged when ``op.selections`` is empty.
    scope
        Forwarded to ``compute_projection``, ``_compute_predicates`` and
        ``util.compute_sorted_frame``.
    timecontext
        Forwarded to ``compute_projection`` only.
    **kwargs
        Forwarded to every helper called here.

    Returns
    -------
    The selected frame, minus any ordering columns that are not columns of
    ``data``.

    Raises
    ------
    AssertionError
        If the combined predicate and the projected frame differ in length,
        or if sorting reports any grouping keys.
    """
    frame = data

    projections = op.selections
    if projections:
        pieces = [
            compute_projection(
                expr,
                op,
                data,
                scope=scope,
                timecontext=timecontext,
                **kwargs,
            )
            for expr in projections
        ]

        # Pieces carrying a multi-level index keep only level 0, so that all
        # pieces can be concatenated side by side.
        flattened = []
        for piece in pieces:
            depth = piece.index.nlevels
            if depth > 1:
                piece = piece.reset_index(
                    level=list(range(1, depth)), drop=True
                )
            flattened.append(piece)
        frame = pd.concat(flattened, axis=1)

    filters = op.predicates
    if filters:
        # Predicates are computed against the *input* data and AND-ed into
        # one mask.
        masks = _compute_predicates(
            op.table.op(), filters, data, scope, **kwargs
        )
        mask = functools.reduce(operator.and_, masks)
        assert len(mask) == len(
            frame
        ), 'Selection predicate length does not match underlying table'
        frame = frame.loc[mask]

    ordering = op.sort_keys
    if ordering:
        frame, group_cols, order_cols = util.compute_sorted_frame(
            frame, order_by=ordering, scope=scope, **kwargs
        )
    else:
        group_cols = order_cols = ()

    assert not group_cols, 'group by should never show up in Selection'

    if order_cols:
        # Columns used for ordering that the input did not have are treated
        # as temporary and removed; drop() is skipped when there are none.
        extras = pd.Index(concatv(group_cols, order_cols)).difference(
            data.columns
        )
        if not extras.empty:
            return frame.drop(extras, axis=1)

    return frame
def execute_selection_dataframe(op, data, scope=None, **kwargs):
    """Apply a selection's projections, predicates and sort keys to ``data``.

    Parameters
    ----------
    op
        Selection node; ``selections``, ``predicates``, ``sort_keys`` and
        ``table`` are read from it.
    data
        Input frame. It is returned as-is when the selection has no
        projections, predicates or sort keys.
    scope
        Forwarded to ``compute_projection``, ``_compute_predicates`` and
        ``util.compute_sorted_frame``.
    **kwargs
        Forwarded to every helper called here.

    Returns
    -------
    The selected frame, minus any ordering columns that are not columns of
    ``data``.

    Raises
    ------
    AssertionError
        If the combined predicate and the projected frame differ in length,
        or if sorting reports any grouping keys.
    """
    frame = data

    projections = op.selections
    if projections:
        pieces = [
            compute_projection(expr, op, data, scope=scope, **kwargs)
            for expr in projections
        ]

        # Pieces carrying a multi-level index keep only level 0, so that all
        # pieces can be concatenated side by side.
        flattened = []
        for piece in pieces:
            depth = piece.index.nlevels
            if depth > 1:
                piece = piece.reset_index(
                    level=list(range(1, depth)), drop=True
                )
            flattened.append(piece)
        frame = pd.concat(flattened, axis=1)

    filters = op.predicates
    if filters:
        # Predicates are computed against the *input* data and AND-ed into
        # one mask.
        masks = _compute_predicates(
            op.table.op(), filters, data, scope, **kwargs
        )
        mask = functools.reduce(operator.and_, masks)
        assert len(mask) == len(
            frame
        ), 'Selection predicate length does not match underlying table'
        frame = frame.loc[mask]

    ordering = op.sort_keys
    if ordering:
        frame, group_cols, order_cols = util.compute_sorted_frame(
            frame, order_by=ordering, scope=scope, **kwargs
        )
    else:
        group_cols = order_cols = ()

    assert not group_cols, 'group by should never show up in Selection'

    if order_cols:
        # Columns used for ordering that the input did not have are treated
        # as temporary and removed; drop() is skipped when there are none.
        extras = pd.Index(concatv(group_cols, order_cols)).difference(
            data.columns
        )
        if not extras.empty:
            return frame.drop(extras, axis=1)

    return frame
def execute_window_op(op, data, window, scope=None, aggcontext=None, clients=None, **kwargs):
    """Execute a window operation by driving its operand's execution by hand.

    Parameters
    ----------
    op
        Window node; ``op.expr`` is the operand and ``op.root_tables()``
        must yield exactly one root table.
    data
        Overwritten by the result of executing the root table expression.
    window
        Provides ``following``, ``_order_by`` and ``_group_by``.
    scope
        Mapping merged with the pre-executed scope via ``toolz.merge``.
    aggcontext
        Forwarded to the preliminary executions, then replaced by the
        context built with ``get_aggcontext``.
    clients
        Unpacked with ``*clients`` into ``pre_execute``.
        NOTE(review): the ``None`` default cannot be unpacked, so callers
        presumably always pass an iterable — confirm.
    **kwargs
        Forwarded to the helpers called here.

    Returns
    -------
    The post-processed result, asserted to be as long as the executed root
    table data.

    Raises
    ------
    com.OperationNotDefinedError
        If the window has an order-by, ``following`` is not 0 and the
        operand is not an ``ops.ShiftBase``.
    AssertionError
        If the result length differs from the root table data length.
    """
    operand = op.expr

    # Pre-execute "manually": execution of the child operand is managed by
    # hand here, so its scope changes would otherwise not be picked up.
    operand_op = operand.op()
    child_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, child_scope)

    (root,) = op.root_tables()
    data = execute(
        root.to_expr(),
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    lookahead = window.following
    order_by = window._order_by
    if order_by and lookahead != 0:
        if not isinstance(operand_op, ops.ShiftBase):
            raise com.OperationNotDefinedError(
                'Window functions affected by following with order_by are not '
                'implemented'
            )

    # A plain table column is grouped on by name; any other key expression
    # is executed and its result used as the key.
    group_by = window._group_by
    grouping_keys = []
    for key in group_by:
        key_op = key.op()
        if isinstance(key_op, ops.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(
                execute(
                    key,
                    scope=scope,
                    clients=clients,
                    aggcontext=aggcontext,
                    **kwargs,
                )
            )

    # Choose the data source handed to the operand and the matching
    # post-processing step for each group-by / order-by combination.
    if group_by and order_by:
        sorted_df, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data, order_by, group_by=group_by, **kwargs
        )
        source = sorted_df.groupby(grouping_keys, sort=True)
        post_process = _post_process_group_by_order_by
    elif group_by:
        ordering_keys = ()
        source = data.groupby(grouping_keys, sort=False)
        post_process = _post_process_group_by
    elif order_by:
        source, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data, order_by, **kwargs
        )
        post_process = _post_process_order_by
    else:
        ordering_keys = ()
        source = data
        post_process = _post_process_empty

    # Map every root table of the operand to the chosen source.
    overrides = OrderedDict((t, source) for t in operand.op().root_tables())
    new_scope = toolz.merge(scope, overrides, factory=OrderedDict)

    # figure out what the dtype of the operand is
    operand_dtype = operand.type().to_pandas()

    aggcontext = get_aggcontext(
        window,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
    )
    raw = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(raw, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_window_op(op, data, window, scope=None, aggcontext=None, clients=None, **kwargs):
    """Execute a window operation by driving its operand's execution by hand.

    Parameters
    ----------
    op
        Window node; ``op.expr`` is the operand and ``op.root_tables()``
        must yield exactly one root table.
    data
        Overwritten by the result of executing the root table expression.
    window
        Provides ``following``, ``preceding``, ``max_lookback``,
        ``_order_by`` and ``_group_by``.
    scope
        Mapping merged with the pre-executed scope via ``toolz.merge``.
    aggcontext
        Forwarded to the preliminary executions, then replaced by the
        aggregation context selected below.
    clients
        Unpacked with ``*clients`` into ``pre_execute``.
        NOTE(review): the ``None`` default cannot be unpacked, so callers
        presumably always pass an iterable — confirm.
    **kwargs
        Forwarded to the helpers called here.

    Returns
    -------
    The post-processed result, asserted to be as long as the executed root
    table data.

    Raises
    ------
    com.OperationNotDefinedError
        If the window has an order-by, ``following`` is not 0 and the
        operand is not an ``ops.ShiftBase``.
    AssertionError
        If a cumulative operand is combined with a ``preceding`` bound, or
        if the result length differs from the root table data length.
    """
    operand = op.expr

    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)

    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by
    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    # A plain table column is grouped on by name; any other key expression
    # has to be executed. That execution must receive the same merged scope
    # and clients as the root table execution above, otherwise the key is
    # evaluated outside the pre-executed scope.
    group_by = window._group_by
    grouping_keys = []
    for key in group_by:
        key_op = key.op()
        if isinstance(key_op, ops.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(
                execute(
                    key,
                    scope=scope,
                    clients=clients,
                    aggcontext=aggcontext,
                    **kwargs,
                )
            )

    if not order_by:
        ordering_keys = ()

    # Choose the data source handed to the operand and the matching
    # post-processing step for each group-by / order-by combination.
    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Map every root table of the operand to the chosen source.
    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (
        isinstance(
            operand.op(),
            (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All),
        )
        and ordering_keys
    ):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_frame_window_op(op, data, scope=None, context=None, **kwargs):
    """Execute a window operation's operand against ``data``.

    Parameters
    ----------
    op
        Window node; ``op.args`` unpacks to ``(operand, window)``.
    data
        Frame the window is evaluated over.
    scope
        Mapping merged with the per-root-table source via ``toolz.merge``.
    context
        Forwarded when executing computed group keys, then replaced by the
        aggregation context selected below.
    **kwargs
        Forwarded to the helpers called here.

    Returns
    -------
    The post-processed result, asserted to be as long as ``data``.

    Raises
    ------
    ValueError
        If the window has an order-by and ``following`` is not 0.
    AssertionError
        If the result length differs from ``len(data)``.
    """
    operand, window = op.args

    lookahead = window.following
    sort_exprs = window._order_by
    if sort_exprs and lookahead != 0:
        raise ValueError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.'
        )

    # A plain table column is grouped on by name; any other key expression
    # is executed and its result used as the key.
    grouping_keys = []
    for key in window._group_by:
        key_op = key.op()
        if isinstance(key_op, ir.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(execute(key, context=context, **kwargs))

    def _sort_group(df):
        # Sort one group's rows by the window's order-by expressions.
        return util.compute_sorted_frame(sort_exprs, df, **kwargs)

    # Choose the data source handed to the operand and the matching
    # post-processing step for each group-by / order-by combination.
    if grouping_keys:
        source = data.groupby(
            grouping_keys, sort=False, as_index=not sort_exprs
        )
        if sort_exprs:
            # Sort within each group, then regroup the sorted rows.
            sorted_df = source.apply(_sort_group)
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    elif sort_exprs:
        source = util.compute_sorted_frame(sort_exprs, data, **kwargs)
        post_process = _post_process_order_by
    else:
        source = data
        post_process = _post_process_empty

    # Map every root table of the operand to the chosen source.
    overrides = OrderedDict((t, source) for t in operand.op().root_tables())
    new_scope = toolz.merge(scope, overrides, factory=OrderedDict)

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not sort_exprs:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and sort_exprs:
        lookback = window.preceding
        if lookback is None:
            context = agg_ctx.Cumulative()
        else:
            context = agg_ctx.Trailing(lookback)
    else:
        context = agg_ctx.Transform()

    raw = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(raw, data.index)
    assert len(data) == len(series), \
        'input data source and computed column do not have the same length'
    return series
def execute_window_op(
    op, data, window, scope=None, aggcontext=None, clients=None, **kwargs
):
    """Execute a window operation by driving its operand's execution by hand.

    Parameters
    ----------
    op
        Window node; ``op.expr`` is the operand and ``op.root_tables()``
        must yield exactly one root table.
    data
        Overwritten by the result of executing the root table expression.
    window
        Provides ``following``, ``preceding``, ``_order_by`` and
        ``_group_by``.
    scope
        Mapping merged with the pre-executed scope via ``toolz.merge``.
    aggcontext
        Forwarded to the preliminary executions, then replaced by the
        aggregation context selected below.
    clients
        Unpacked with ``*clients`` into ``pre_execute``.
        NOTE(review): the ``None`` default cannot be unpacked, so callers
        presumably always pass an iterable — confirm.
    **kwargs
        Forwarded to the helpers called here.

    Returns
    -------
    The post-processed result, asserted to be as long as the executed root
    table data.

    Raises
    ------
    com.OperationNotDefinedError
        If the window has an order-by, ``following`` is not 0 and the
        operand is not an ``ops.ShiftBase``.
    AssertionError
        If the result length differs from the root table data length.
    """
    operand = op.expr

    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)

    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(root_expr, scope=scope, aggcontext=aggcontext, **kwargs)

    following = window.following
    order_by = window._order_by
    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    # A plain table column is grouped on by name; any other key expression
    # has to be executed. That execution must receive the same merged scope
    # as the root table execution above, otherwise the key is evaluated
    # outside the pre-executed scope.
    group_by = window._group_by
    grouping_keys = []
    for key in group_by:
        key_op = key.op()
        if isinstance(key_op, ops.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(
                execute(key, scope=scope, aggcontext=aggcontext, **kwargs)
            )

    if not order_by:
        ordering_keys = ()

    # Choose the data source handed to the operand and the matching
    # post-processing step for each group-by / order-by combination.
    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Map every root table of the operand to the chosen source.
    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is; nullable integer operands
    # are given float64 instead of their own pandas dtype
    operand_type = operand.type()
    if isinstance(operand_type, dt.Integer) and operand_type.nullable:
        operand_dtype = np.float64
    else:
        operand_dtype = operand.type().to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and ordering_keys:
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            aggcontext = agg_ctx.Moving(
                preceding,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(operand, scope=new_scope, aggcontext=aggcontext, **kwargs)
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    """Execute a window operation by driving its operand's execution by hand.

    Parameters
    ----------
    op
        Window node; ``op.expr`` is the operand and ``op.root_tables()``
        must yield exactly one root table.
    data
        Overwritten by the result of executing the root table expression.
    window
        Provides ``following``, ``_order_by`` and ``_group_by``.
    scope
        Scope object; ``merge_scope`` and ``merge_scopes`` are called on it,
        so the ``None`` default is not usable as-is.
    timecontext
        Time context required by the caller. When truthy, an adjusted
        context is derived with ``compute_time_context`` and used for all
        executions here; the final result is trimmed with the original.
    aggcontext
        Forwarded to the preliminary executions, then replaced by the
        context built with ``get_aggcontext``.
    clients
        Unpacked with ``*clients`` into ``pre_execute``.
        NOTE(review): the ``None`` default cannot be unpacked, so callers
        presumably always pass an iterable — confirm.
    **kwargs
        Forwarded to the helpers called here.

    Returns
    -------
    The post-processed result after ``trim_with_timecontext``.

    Raises
    ------
    com.OperationNotDefinedError
        If the window has an order-by, ``following`` is not 0 and the
        operand is not an ``ops.ShiftBase``.
    AssertionError
        If the untrimmed result length differs from the root table data
        length.
    """
    operand = op.expr

    # Pre-execute "manually": execution of the child operand is managed by
    # hand here, so its scope changes would otherwise not be picked up.
    operand_op = operand.op()

    # `timecontext` is what the parent of this window asked for, while the
    # adjusted context is the one this window itself needs. Because
    # execution is manual, the adjusted one is used in every later phase.
    if timecontext:
        adjusted_timecontext = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )[0]
    else:
        adjusted_timecontext = None

    child_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(child_scope)

    (root,) = op.root_tables()
    data = execute(
        root.to_expr(),
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    lookahead = window.following
    order_by = window._order_by
    if order_by and lookahead != 0:
        if not isinstance(operand_op, ops.ShiftBase):
            raise com.OperationNotDefinedError(
                'Window functions affected by following with order_by are not '
                'implemented'
            )

    # A plain table column is grouped on by name; any other key expression
    # is executed and its result used as the key.
    group_by = window._group_by
    grouping_keys = []
    for key in group_by:
        key_op = key.op()
        if isinstance(key_op, ops.TableColumn):
            grouping_keys.append(key_op.name)
        else:
            grouping_keys.append(
                execute(
                    key,
                    scope=scope,
                    clients=clients,
                    timecontext=adjusted_timecontext,
                    aggcontext=aggcontext,
                    **kwargs,
                )
            )

    # Choose the data source handed to the operand and the matching
    # post-processing step for each group-by / order-by combination.
    if group_by and order_by:
        sorted_df, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data,
            order_by,
            group_by=group_by,
            timecontext=adjusted_timecontext,
            **kwargs,
        )
        source = sorted_df.groupby(grouping_keys, sort=True)
        post_process = _post_process_group_by_order_by
    elif group_by:
        ordering_keys = []
        source = data.groupby(grouping_keys, sort=False)
        post_process = _post_process_group_by
    elif order_by:
        source, grouping_keys, ordering_keys = util.compute_sorted_frame(
            data, order_by, timecontext=adjusted_timecontext, **kwargs
        )
        post_process = _post_process_order_by
    else:
        ordering_keys = []
        source = data
        post_process = _post_process_empty

    # The chosen source (possibly a groupby object) has to replace the data
    # stored for each root table of the operand, so the update is forced
    # with overwrite=True regardless of time context.
    overrides = [
        Scope({t: source}, adjusted_timecontext)
        for t in operand.op().root_tables()
    ]
    new_scope = scope.merge_scopes(overrides, overwrite=True)

    # figure out what the dtype of the operand is
    operand_dtype = operand.type().to_pandas()

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )
    raw = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(raw, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'

    # trim data to original time context
    return trim_with_timecontext(series, timecontext)