def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (execute(p, scope=scope, **kwargs) for p in predicates),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(by, scope=scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(
            execute(metric, scope=new_scope, **kwargs),
            name=metric.get_name(),
        )
        for metric in op.metrics
    ]

    # group by always needs a reset to get the grouping key back as a column
    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(having, scope=new_scope, **kwargs)
                for having in op.having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]

    return result
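# A minimal sketch (not from the source; frame and column names are
# hypothetical) of the predicate-combination pattern used above:
# functools.reduce with operator.and_ folds a sequence of boolean Series
# into a single mask that can then be used with .loc.
import functools
import operator

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [10, 20, 30, 40]})
masks = [df.a > 1, df.b < 40]
combined = functools.reduce(operator.and_, masks)
assert combined.tolist() == [False, True, True, False]
filtered = df.loc[combined]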
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)
    additional_scope = OrderedDict(
        (
            t,
            map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, t, data_columns
                ),
                data,
            ),
        )
        for t in op.root_tables()
    )

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    scalar = execute(expr, new_scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
def insert(
    self, path, key, expr, format='table', data_columns=True, **kwargs
):
    path = self.root / path
    data = execute(expr)
    data.to_hdf(
        str(path), key, format=format, data_columns=data_columns, **kwargs
    )
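# Hypothetical usage sketch (file and key names assumed, not from the
# source; requires the PyTables extra): insert() above boils down to
# pandas' to_hdf on the executed expression.
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3]})
df.to_hdf('example.h5', key='my_key', format='table', data_columns=True)
roundtrip = pd.read_hdf('example.h5', 'my_key')
assert roundtrip.equals(df)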
def compute_projection_scalar_expr(
    expr,
    parent,
    data,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    **kwargs,
):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )
    scalar = execute(expr, scope=scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
def compute_projection_scalar_expr(expr, parent, data, scope=None, **kwargs):
    name = expr._name
    assert name is not None, 'Scalar selection name is None'

    op = expr.op()
    parent_table_op = parent.table.op()

    data_columns = frozenset(data.columns)
    additional_scope = OrderedDict(
        (
            t,
            map_new_column_names_to_data(
                remap_overlapping_column_names(
                    parent_table_op, t, data_columns
                ),
                data,
            ),
        )
        for t in op.root_tables()
    )

    new_scope = toolz.merge(scope, additional_scope, factory=OrderedDict)
    scalar = execute(expr, scope=new_scope, **kwargs)
    result = pd.Series([scalar], name=name).repeat(len(data.index))
    result.index = data.index
    return result
def insert(
    self, path, key, expr, format='table', data_columns=True, **kwargs
):
    path = self.root / path
    data = execute(expr)
    data.to_hdf(
        str(path), key, format=format, data_columns=data_columns, **kwargs
    )
def _compute_join_column(column_expr, **kwargs):
    column_op = column_expr.op()

    if isinstance(column_op, ops.TableColumn):
        new_column = column_op.name
    else:
        new_column = execute(column_expr, **kwargs)
    (root_table,) = column_op.root_tables()
    return new_column, root_table
def execute(self, expr, params=None, **kwargs):  # noqa
    assert isinstance(expr, ir.Expr), "Expected ir.Expr, got {}".format(
        type(expr)
    )
    return execute_last(
        expr.op(),
        execute(expr, params=params, **kwargs),
        params=params,
        **kwargs,
    )
def compute_sort_key(key, data, **kwargs):
    by = key.args[0]
    try:
        return by.get_name(), None
    except com.ExpressionError:
        name = ibis.util.guid()
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(by, new_scope, **kwargs)
        new_column.name = name
        return name, new_column
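# Illustrative sketch (hypothetical frame; not from the source): when a sort
# key is a computed expression rather than a named column, compute_sort_key
# materializes it under a throwaway name so pandas can sort on it, after
# which the temporary column is dropped.
import pandas as pd

df = pd.DataFrame({'a': [3, 1, 2], 'b': [30, 10, 20]})
tmp_name = '_sort_key_0'  # stand-in for ibis.util.guid()
computed = (df.a * -1).rename(tmp_name)  # the "computed" sort key
result = (
    df.assign(**{tmp_name: computed})
    .sort_values(tmp_name)
    .drop(columns=[tmp_name])
)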
def _compute_join_column(column_expr, **kwargs):
    column_op = column_expr.op()

    if isinstance(column_op, ops.TableColumn):
        new_column = column_op.name
    else:
        new_column = execute(column_expr, **kwargs)
    (root_table,) = column_op.root_tables()
    return new_column, root_table
def compute_projection_column_expr(
    expr,
    parent,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        (root_table,) = op.root_tables()
        left_root, right_root = ops.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)

    scope = scope.merge_scopes(
        Scope(
            {
                t: map_new_column_names_to_data(
                    remap_overlapping_column_names(
                        parent_table_op, t, data_columns
                    ),
                    data,
                )
            },
            timecontext,
        )
        for t in op.root_tables()
    )

    result = execute(expr, scope=scope, timecontext=timecontext, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    if np.isscalar(result):
        return pd.Series(
            np.repeat(result, len(data.index)),
            index=data.index,
            name=result_name,
        )
    return result.rename(result_name)
def compute_sort_key(key, data, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        new_scope = {t: data for t in by.op().root_tables()}
        new_column = execute(
            by, scope=toolz.merge(scope, new_scope), **kwargs
        )
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
def _compute_predicates(
    table_op,
    predicates,
    data,
    scope: Scope,
    timecontext: Optional[TimeContext],
    **kwargs,
):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : Scope
    timecontext : Optional[TimeContext]
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        data_columns = frozenset(data.columns)

        additional_scope = Scope()
        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope = additional_scope.merge_scope(
                Scope({root_table: new_data}, timecontext)
            )

        scope = scope.merge_scope(additional_scope)
        yield execute(predicate, scope=scope, **kwargs)
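# A small sketch (hypothetical column names) of the remapping step above:
# after a join, overlapping columns carry suffixes, and a predicate must see
# them under their original names, so we select and rename via a mapping.
import pandas as pd

joined = pd.DataFrame(
    {'key': [1, 2], 'value_x': [10, 20], 'value_y': [100, 200]}
)
# mapping from suffixed post-join names back to the root table's names
mapping = {'key': 'key', 'value_x': 'value'}
left_view = joined.loc[:, list(mapping.keys())].rename(columns=mapping)
assert list(left_view.columns) == ['key', 'value']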
def execute(self, query, params=None, limit='default', **kwargs):
    if limit != 'default':
        raise ValueError(
            'limit parameter to execute is not yet implemented in the '
            'pandas backend'
        )

    if not isinstance(query, ir.Expr):
        raise TypeError(
            "`query` has type {!r}, expected ibis.expr.types.Expr".format(
                type(query).__name__
            )
        )

    result = execute(query, params=params, **kwargs)

    query_op = query.op()
    return execute_last(query_op, result, params=params, **kwargs)
def compute_sort_key(key, data, timecontext, scope=None, **kwargs):
    by = key.to_expr()
    try:
        if isinstance(by, str):
            return by, None
        return by.get_name(), None
    except com.ExpressionError:
        if scope is None:
            scope = Scope()
        scope = scope.merge_scopes(
            Scope({t: data}, timecontext) for t in by.op().root_tables()
        )
        new_column = execute(by, scope=scope, **kwargs)
        name = ibis.util.guid()
        new_column.name = name
        return name, new_column
def _compute_predicates(table_op, predicates, data, scope, **kwargs):
    """Compute the predicates for a table operation.

    Parameters
    ----------
    table_op : TableNode
    predicates : List[ir.ColumnExpr]
    data : pd.DataFrame
    scope : dict
    kwargs : dict

    Returns
    -------
    computed_predicate : pd.Series[bool]

    Notes
    -----
    This handles the cases where the predicates are computed columns, in
    addition to the simple case of named columns coming directly from the
    input table.
    """
    for predicate in predicates:
        # Map each root table of the predicate to the data so that we compute
        # predicates on the result instead of any left or right tables if the
        # Selection is on a Join. Project data to only include columns from
        # the root table.
        root_tables = predicate.op().root_tables()

        # handle suffixes
        additional_scope = {}
        data_columns = frozenset(data.columns)

        for root_table in root_tables:
            mapping = remap_overlapping_column_names(
                table_op, root_table, data_columns
            )
            if mapping is not None:
                new_data = data.loc[:, mapping.keys()].rename(columns=mapping)
            else:
                new_data = data
            additional_scope[root_table] = new_data

        new_scope = toolz.merge(scope, additional_scope)
        yield execute(predicate, scope=new_scope, **kwargs)
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        (root_table,) = op.root_tables()
        left_root, right_root = ops.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)
    additional_scope = {
        t: map_new_column_names_to_data(
            remap_overlapping_column_names(parent_table_op, t, data_columns),
            data,
        )
        for t in op.root_tables()
    }

    new_scope = toolz.merge(scope, additional_scope)
    result = execute(expr, scope=new_scope, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    return result.rename(result_name)
def compute_projection_column_expr(expr, parent, data, scope=None, **kwargs):
    result_name = getattr(expr, '_name', None)
    op = expr.op()
    parent_table_op = parent.table.op()

    if isinstance(op, ops.TableColumn):
        # slightly faster path for simple column selection
        name = op.name

        if name in data:
            return data[name].rename(result_name or name)

        if not isinstance(parent_table_op, ops.Join):
            raise KeyError(name)

        (root_table,) = op.root_tables()
        left_root, right_root = ops.distinct_roots(
            parent_table_op.left, parent_table_op.right
        )
        suffixes = {
            left_root: constants.LEFT_JOIN_SUFFIX,
            right_root: constants.RIGHT_JOIN_SUFFIX,
        }
        return data.loc[:, name + suffixes[root_table]].rename(
            result_name or name
        )

    data_columns = frozenset(data.columns)
    additional_scope = {
        t: map_new_column_names_to_data(
            remap_overlapping_column_names(parent_table_op, t, data_columns),
            data,
        )
        for t in op.root_tables()
    }

    new_scope = toolz.merge(scope, additional_scope)
    result = execute(expr, scope=new_scope, **kwargs)
    assert result_name is not None, 'Column selection name is None'
    return result.rename(result_name)
def execute_node_expr_list(op, sequence, **kwargs):
    # TODO: no true approx count distinct for pandas, so we use exact for now
    columns = [e.get_name() for e in op.exprs]
    schema = ibis.schema(list(zip(columns, (e.type() for e in op.exprs))))
    data = {
        col: [execute(el, **kwargs)] for col, el in zip(columns, sequence)
    }
    return schema.apply_to(pd.DataFrame(data, columns=columns))
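# Sketch (hypothetical values) of the construction pattern above: each scalar
# result is wrapped in a single-element list so the dict-of-lists constructor
# yields a one-row DataFrame with a predictable column order.
import pandas as pd

columns = ['total', 'avg']
scalars = [42, 3.5]
data = {col: [val] for col, val in zip(columns, scalars)}
frame = pd.DataFrame(data, columns=columns)
assert frame.shape == (1, 2)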
def execute_window_op(op, data, window, scope=None, context=None, **kwargs):
    operand = op.expr
    (root,) = op.root_tables()
    try:
        data = scope[root]
    except KeyError:
        data = execute(
            root.to_expr(), scope=scope, context=context, **kwargs
        )

    following = window.following
    order_by = window._order_by

    if order_by and following != 0:
        raise com.OperationNotDefinedError(
            'Following with a value other than 0 (current row) with order_by '
            'is not yet implemented in the pandas backend. Use '
            'ibis.trailing_window or ibis.cumulative_window to '
            'construct windows when using the pandas backend.'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(key, context=context, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by

    if grouping_keys:
        source = data.groupby(
            grouping_keys, sort=False, as_index=not order_by
        )

        if order_by:
            sorted_df = source.apply(
                lambda df, order_by=order_by, kwargs=kwargs: (
                    util.compute_sorted_frame(order_by, df, **kwargs)
                )
            )
            source = sorted_df.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by_order_by
        else:
            post_process = _post_process_group_by
    else:
        if order_by:
            source = util.compute_sorted_frame(order_by, data, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # no order by or group by: default summarization context
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not order_by:
        context = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and order_by:
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            context = agg_ctx.Moving(preceding)
        else:
            # expanding window
            context = agg_ctx.Cumulative()
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        context = agg_ctx.Transform()

    result = execute(operand, new_scope, context=context, **kwargs)
    series = post_process(result, data.index)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
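# A rough sketch (hypothetical data; plain pandas, not the ibis aggregation
# context classes) of what the four contexts chosen above correspond to:
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
g = pd.Series(['a', 'a', 'b', 'b'])

summarize = s.mean()                            # Summarize: one scalar
moving = s.rolling(2, min_periods=1).mean()     # Moving: bounded preceding
cumulative = s.expanding().mean()               # Cumulative: unbounded preceding
transform = s.groupby(g).transform('mean')      # Transform: partitioned window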
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )
        # timecontext is the original time context required by the parent
        # node of this WindowOp, while adjusted_timecontext is the adjusted
        # context of this Window; since we are doing a manual execution here,
        # use adjusted_timecontext in later execution phases
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = []

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten by a groupby object, so
    # we force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )

    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'

    # trim data to original time context
    series = trim_with_timecontext(series, timecontext)
    return series
def execute_window_op(
    op, data, window, scope=None, aggcontext=None, clients=None, **kwargs
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext, **kwargs
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
    )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_window_op(
    op, data, window, scope=None, aggcontext=None, clients=None, **kwargs
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)

    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(root_expr, scope=scope, aggcontext=aggcontext, **kwargs)

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(key, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    if isinstance(operand_type, dt.Integer) and operand_type.nullable:
        operand_dtype = np.float64
    else:
        operand_dtype = operand.type().to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif isinstance(operand.op(), ops.Reduction) and ordering_keys:
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            aggcontext = agg_ctx.Moving(
                preceding,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand, scope=new_scope, aggcontext=aggcontext, **kwargs
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute(self, expr, params=None, **kwargs):  # noqa
    assert isinstance(expr, ir.Expr)
    scope = kwargs.pop('scope', {})
    return execute(expr, scope=scope, params=params, **kwargs)
def execute_aggregation_dataframe(op, data, scope=None, **kwargs):
    assert op.metrics, 'no metrics found during aggregation execution'

    if op.sort_keys:
        raise NotImplementedError(
            'sorting on aggregations not yet implemented'
        )

    predicates = op.predicates
    if predicates:
        predicate = functools.reduce(
            operator.and_,
            (execute(p, scope=scope, **kwargs) for p in predicates),
        )
        data = data.loc[predicate]

    columns = {}

    if op.by:
        grouping_key_pairs = list(
            zip(op.by, map(operator.methodcaller('op'), op.by))
        )
        grouping_keys = [
            by_op.name
            if isinstance(by_op, ops.TableColumn)
            else execute(by, scope=scope, **kwargs).rename(by.get_name())
            for by, by_op in grouping_key_pairs
        ]
        columns.update(
            (by_op.name, by.get_name())
            for by, by_op in grouping_key_pairs
            if hasattr(by_op, 'name')
        )
        source = data.groupby(grouping_keys)
    else:
        source = data

    new_scope = toolz.merge(scope, {op.table.op(): source})
    pieces = [
        pd.Series(
            execute(metric, scope=new_scope, **kwargs),
            name=metric.get_name(),
        )
        for metric in op.metrics
    ]

    # group by always needs a reset to get the grouping key back as a column
    result = pd.concat(pieces, axis=1).reset_index()
    result.columns = [columns.get(c, c) for c in result.columns]

    if op.having:
        # .having(...) is only accessible on groupby, so this should never
        # raise
        if not op.by:
            raise ValueError(
                'Filtering out aggregation values is not allowed without at '
                'least one grouping key'
            )

        # TODO(phillipc): Don't recompute identical subexpressions
        predicate = functools.reduce(
            operator.and_,
            (
                execute(having, scope=new_scope, **kwargs)
                for having in op.having
            ),
        )
        assert len(predicate) == len(
            result
        ), 'length of predicate does not match length of DataFrame'
        result = result.loc[predicate.values]

    return result
def insert(self, path, expr, index=False, **kwargs):
    path = self.root / path
    data = execute(expr)
    data.to_csv(str(path), index=index, **kwargs)
def execute_window_op(
    op, data, window, scope=None, aggcontext=None, clients=None, **kwargs
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pickup
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(
        operand_op, *clients, scope=scope, aggcontext=aggcontext, **kwargs
    )
    scope = toolz.merge(scope, pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(key, aggcontext=aggcontext, **kwargs)
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    order_by = window._order_by
    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data, order_by, group_by=group_by, **kwargs
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (
        isinstance(
            operand.op(), (ops.Reduction, ops.CumulativeOp, ops.Any, ops.All)
        )
        and ordering_keys
    ):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'
    return series
def execute_node_value_list(op, _, **kwargs):
    return [execute(arg, **kwargs) for arg in op.values]
def insert(self, path, expr, **kwargs):
    path = self.root / path
    df = execute(expr)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, str(path))
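# Hypothetical usage sketch (file name assumed, not from the source):
# round-tripping a DataFrame through Parquet with pyarrow, mirroring what
# insert() above does with an executed expression.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'x': [1, 2, 3]})
table = pa.Table.from_pandas(df)
pq.write_table(table, 'example.parquet')
roundtrip = pq.read_table('example.parquet').to_pandas()
assert roundtrip.x.tolist() == [1, 2, 3]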