def translate(self, expr, scope, **kwargs):
    """
    Translate Ibis expression into a PySpark object.

    All translated expressions are cached within scope. If an expression
    is found within scope, it's returned. Otherwise, it's translated and
    cached for future reference.

    :param expr: ibis expression
    :param scope: dictionary mapping from operation to translated result
    :param kwargs: parameters passed as keyword args (e.g. window)
    :return: translated PySpark DataFrame or Column object
    """
    # The operation node type the typed expression wraps
    op = expr.op()

    if op in scope:
        return scope[op]
    elif type(op) in self._registry:
        formatter = self._registry[type(op)]
        result = formatter(self, expr, scope, **kwargs)
        scope[op] = result
        return result
    else:
        raise com.OperationNotDefinedError(
            'No translation rule for {}'.format(type(op)))
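# For context, a minimal self-contained sketch (hypothetical FakeOp /
# FakeTranslator names, not the real Ibis classes) of the memoized registry
# dispatch that translate() implements: results are keyed by the operation
# node in `scope`, so shared subexpressions are only translated once.
class FakeOp:
    """Hypothetical stand-in for an Ibis operation node."""

class FakeTranslator:
    # maps operation *types* to formatter callables, like _registry above
    _registry = {FakeOp: lambda self, op, scope: object()}

    def translate(self, op, scope):
        if op in scope:                        # cache hit: reuse earlier result
            return scope[op]
        formatter = self._registry[type(op)]
        result = formatter(self, op, scope)
        scope[op] = result                     # cache for future lookups
        return result

op, scope = FakeOp(), {}
t = FakeTranslator()
assert t.translate(op, scope) is t.translate(op, scope)  # second call hits the cache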
def execute_arbitrary_series_groupby(op, data, _, aggcontext=None, **kwargs):
    how = op.how
    if how is None:
        how = 'first'

    if how not in {'first', 'last'}:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(how))

    return aggcontext.agg(data, how)
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        index = -1
    else:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(op.how))

    data = data[mask] if mask is not None else data
    return data.iloc[index]
def translate(self, expr, scope, **kwargs):
    # The operation node type the typed expression wraps
    op = expr.op()

    if type(op) in self._registry:
        formatter = self._registry[type(op)]
        return formatter(self, expr, scope, **kwargs)
    else:
        raise com.OperationNotDefinedError(
            'No translation rule for {}'.format(type(op))
        )
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    """
    Note: we cannot use the pandas version because Dask does not support
    .iloc
    """
    data = data[mask] if mask is not None else data
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        # .loc is label-based, so -1 would not resolve to the last row;
        # assume the default RangeIndex so the last label is len(data) - 1
        index = len(data) - 1
    else:
        raise com.OperationNotDefinedError(
            'Arbitrary {!r} is not supported'.format(op.how))

    return data.loc[index]
def execute_arbitrary_series_mask(op, data, mask, aggcontext=None, **kwargs):
    """
    Note: we cannot use the pandas version because Dask does not support
    .iloc. See https://docs.dask.org/en/latest/dataframe-indexing.html.
    .loc will only work if our index lines up with the label.
    """
    data = data[mask] if mask is not None else data
    if op.how == 'first':
        index = 0
    elif op.how == 'last':
        index = len(data) - 1  # TODO - computation
    else:
        raise com.OperationNotDefinedError(
            f'Arbitrary {op.how!r} is not supported')

    return data.loc[index]
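# A quick illustrative sketch (toy data, not from the source) of the
# indexing constraint the docstring above describes: Dask series lack
# positional row access, and label-based .loc only matches positions when
# the index is the default RangeIndex.
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(['a', 'b', 'c']), npartitions=2)

# s.iloc[-1]                       # raises: positional row indexing is unsupported
print(s.loc[0].compute())          # first element, found via the label 0
print(s.loc[len(s) - 1].compute()) # last element; note that len() triggers a computation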
def translate(self, expr):
    # The operation node type the typed expression wraps
    op = expr.op()

    if type(op) in self._rewrites:  # even if type(op) is in self._registry
        expr = self._rewrites[type(op)](expr)
        op = expr.op()

    # TODO: use op MRO for subclasses instead of this isinstance spaghetti
    if isinstance(op, ops.ScalarParameter):
        return self._trans_param(expr)
    elif isinstance(op, ops.TableNode):
        # HACK/TODO: revisit for more complex cases
        return '*'
    elif type(op) in self._registry:
        formatter = self._registry[type(op)]
        return formatter(self, expr)
    else:
        raise com.OperationNotDefinedError(
            f'No translation rule for {type(op)}')
def execute_window_op(op, data, window, scope=None, aggcontext=None,
                      clients=None, **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op, *clients, scope=scope,
                                     aggcontext=aggcontext, **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)

    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext,
            **kwargs)
        for key, key_op in zip(group_by,
                               map(operator.methodcaller('op'), group_by))
    ]

    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data, order_by, group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    aggcontext = get_aggcontext(
        window,
        operand=operand,
        operand_dtype=operand_dtype,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
    )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(series), (
        'input data source and computed column do not have the same length')
    return series
def operation(op, expr):
    raise com.OperationNotDefinedError(f'No translation rule for {type(op)}')
def execute_window_op(
    op,
    data,
    window,
    scope: Scope = None,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    **kwargs,
):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()

    adjusted_timecontext = None
    if timecontext:
        arg_timecontexts = compute_time_context(
            op, timecontext=timecontext, clients=clients
        )
        # timecontext is the original time context required by the parent
        # node of this WindowOp, while adjusted_timecontext is the adjusted
        # context of this Window. Since we are doing a manual execution
        # here, use adjusted_timecontext in later execution phases.
        adjusted_timecontext = arg_timecontexts[0]

    pre_executed_scope = pre_execute(
        operand_op,
        *clients,
        scope=scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )
    scope = scope.merge_scope(pre_executed_scope)
    (root,) = op.root_tables()
    root_expr = root.to_expr()

    data = execute(
        root_expr,
        scope=scope,
        timecontext=adjusted_timecontext,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (
        order_by
        and following != 0
        and not isinstance(operand_op, ops.ShiftBase)
    ):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented'
        )

    group_by = window._group_by
    grouping_keys = [
        key_op.name
        if isinstance(key_op, ops.TableColumn)
        else execute(
            key,
            scope=scope,
            clients=clients,
            timecontext=adjusted_timecontext,
            aggcontext=aggcontext,
            **kwargs,
        )
        for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by)
        )
    ]

    if not order_by:
        ordering_keys = []

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(
                data,
                order_by,
                group_by=group_by,
                timecontext=adjusted_timecontext,
                **kwargs,
            )
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, timecontext=adjusted_timecontext, **kwargs
            )
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    # Here the groupby object should be added to the corresponding node in
    # scope for execution; data will be overwritten with a groupby object,
    # so we force an update regardless of time context
    new_scope = scope.merge_scopes(
        [
            Scope({t: source}, adjusted_timecontext)
            for t in operand.op().root_tables()
        ],
        overwrite=True,
    )

    aggcontext = get_aggcontext(
        window,
        scope=scope,
        operand=operand,
        parent=source,
        group_by=grouping_keys,
        order_by=ordering_keys,
        **kwargs,
    )

    result = execute(
        operand,
        scope=new_scope,
        timecontext=adjusted_timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(
        result, data, ordering_keys, grouping_keys, adjusted_timecontext
    )
    assert len(data) == len(
        series
    ), 'input data source and computed column do not have the same length'

    # trim data to the original time context
    series = trim_with_timecontext(series, timecontext)
    return series
def execute_window_op(op, data, window, scope=None, aggcontext=None,
                      clients=None, **kwargs):
    operand = op.expr
    # pre execute "manually" here because otherwise we wouldn't pick up
    # relevant scope changes from the child operand since we're managing
    # execution of that by hand
    operand_op = operand.op()
    pre_executed_scope = pre_execute(operand_op, *clients, scope=scope,
                                     aggcontext=aggcontext, **kwargs)
    scope = toolz.merge(scope, pre_executed_scope)

    (root,) = op.root_tables()
    root_expr = root.to_expr()
    data = execute(
        root_expr,
        scope=scope,
        clients=clients,
        aggcontext=aggcontext,
        **kwargs,
    )

    following = window.following
    order_by = window._order_by

    if (order_by and following != 0
            and not isinstance(operand_op, ops.ShiftBase)):
        raise com.OperationNotDefinedError(
            'Window functions affected by following with order_by are not '
            'implemented')

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else execute(
            key, scope=scope, clients=clients, aggcontext=aggcontext,
            **kwargs)
        for key, key_op in zip(group_by,
                               map(operator.methodcaller('op'), group_by))
    ]

    if not order_by:
        ordering_keys = ()

    if group_by:
        if order_by:
            (
                sorted_df,
                grouping_keys,
                ordering_keys,
            ) = util.compute_sorted_frame(data, order_by, group_by=group_by,
                                          **kwargs)
            source = sorted_df.groupby(grouping_keys, sort=True)
            post_process = _post_process_group_by_order_by
        else:
            source = data.groupby(grouping_keys, sort=False)
            post_process = _post_process_group_by
    else:
        if order_by:
            source, grouping_keys, ordering_keys = util.compute_sorted_frame(
                data, order_by, **kwargs)
            post_process = _post_process_order_by
        else:
            source = data
            post_process = _post_process_empty

    new_scope = toolz.merge(
        scope,
        OrderedDict((t, source) for t in operand.op().root_tables()),
        factory=OrderedDict,
    )

    # figure out what the dtype of the operand is
    operand_type = operand.type()
    operand_dtype = operand_type.to_pandas()

    # no order by or group by: default summarization aggcontext
    #
    # if we're reducing and we have an order by expression then we need to
    # expand or roll.
    #
    # otherwise we're transforming
    if not grouping_keys and not ordering_keys:
        aggcontext = agg_ctx.Summarize()
    elif (isinstance(operand.op(), (ops.Reduction, ops.CumulativeOp,
                                    ops.Any, ops.All))
            and ordering_keys):
        # XXX(phillipc): What a horror show
        preceding = window.preceding
        if preceding is not None:
            max_lookback = window.max_lookback
            assert not isinstance(operand.op(), ops.CumulativeOp)
            aggcontext = agg_ctx.Moving(
                preceding,
                max_lookback,
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
        else:
            # expanding window
            aggcontext = agg_ctx.Cumulative(
                parent=source,
                group_by=grouping_keys,
                order_by=ordering_keys,
                dtype=operand_dtype,
            )
    else:
        # groupby transform (window with a partition by clause in SQL
        # parlance)
        aggcontext = agg_ctx.Transform(
            parent=source,
            group_by=grouping_keys,
            order_by=ordering_keys,
            dtype=operand_dtype,
        )

    result = execute(
        operand,
        scope=new_scope,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    series = post_process(result, data, ordering_keys, grouping_keys)
    assert len(data) == len(series), (
        'input data source and computed column do not have the same length')
    return series
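# A rough pandas analogy (toy data, hypothetical mapping) for the four
# aggregation contexts chosen above: Summarize reduces the whole column;
# Moving and Cumulative aggregate over rolling/expanding windows when an
# ordering is present; Transform broadcasts a per-group value back to every
# row, like a SQL window with a PARTITION BY clause.
import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1.0, 2.0, 3.0]})

summarize = df['x'].mean()                          # Summarize: one scalar
moving = df['x'].rolling(2, min_periods=1).mean()   # Moving: bounded preceding window
cumulative = df['x'].expanding().mean()             # Cumulative: expanding window
transform = df.groupby('g')['x'].transform('mean')  # Transform: per-group, same length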