def _post_process_empty( result: Any, parent: pd.DataFrame, order_by: List[str], group_by: List[str], timecontext: Optional[TimeContext], ) -> pd.Series: # This is the post process of the no groupby nor orderby window # `result` could be a Series or a scalar. generated by `agg` method # of class `Window`. For window without grouby or orderby, `agg` # calls pands method directly. So if timecontext is present, we # need to insert 'time' column into index for trimming the result. # For cases when grouby or orderby is present, `agg` calls # Ibis method `window_agg_built_in` and `window_agg_udf`, time # context is already inserted there. assert not order_by and not group_by if isinstance(result, pd.Series): # `result` is a Series when an analytic operation is being # applied over the window, since analytic operations are N->N if timecontext: result = construct_time_context_aware_series(result, parent) return result else: # `result` is a scalar when a reduction operation is being # applied over the window, since reduction operations are N->1 index = parent.index result = pd.Series([result]).repeat(len(index)) result.index = index if timecontext: result = construct_time_context_aware_series(result, parent) return result
def test_construct_time_context_aware_series(time_df3): """Unit test for `construct_time_context_aware_series` """ # Series without 'time' index will result in a MultiIndex with 'time' df = time_df3 expected = df['value'] time_index = pd.Index(df['time']) expected.index = pd.MultiIndex.from_arrays( [expected.index, time_index], names=expected.index.names + ['time'], ) result = construct_time_context_aware_series(df['value'], df) tm.assert_series_equal(result, expected) # Series with 'time' as index will not change time_indexed_df = time_df3.set_index('time') expected_time_aware = time_indexed_df['value'] result_time_aware = construct_time_context_aware_series( time_indexed_df['value'], time_indexed_df) tm.assert_series_equal(result_time_aware, expected_time_aware) # Series with a MultiIndex, where 'time' is in the MultiIndex, # will not change multi_index_time_aware_series = result_time_aware expected_multi_index_time_aware = result_time_aware result_multi_index_time_aware = construct_time_context_aware_series( multi_index_time_aware_series, time_indexed_df) tm.assert_series_equal(result_multi_index_time_aware, expected_multi_index_time_aware) # Series with a MultiIndex, where 'time' is NOT in the MultiIndex, # 'time' will be added into the MultiIndex multi_index_series = df['id'] expected_multi_index = df['id'].copy() other_index = pd.Index(df['value']) expected_multi_index.index = pd.MultiIndex.from_arrays( [expected_multi_index.index, other_index, time_index], names=expected_multi_index.index.names + ['value', 'time'], ) multi_index_series.index = pd.MultiIndex.from_arrays( [multi_index_series.index, other_index], names=multi_index_series.index.names + ['value'], ) result_multi_index = construct_time_context_aware_series( multi_index_series, df) tm.assert_series_equal(result_multi_index, expected_multi_index)
def agg(
    self,
    grouped_data: Union[pd.Series, SeriesGroupBy],
    function: Union[str, Callable],
    *args: Any,
    **kwargs: Any,
) -> pd.Series:
    """Apply ``function`` over a grouped and/or ordered window.

    ``function`` may be a pandas aggregation name (str) dispatched through
    ``window_agg_built_in``, or a callable (window UDF) dispatched through
    ``window_agg_udf`` with explicitly computed per-row window bounds.
    Requires at least one of ``self.group_by`` / ``self.order_by``; the
    key-less case is handled elsewhere (see ``_post_process_empty``).
    Returns a Series cast to ``self.dtype`` when that cast is possible.
    """
    # avoid a pandas warning about numpy arrays being passed through
    # directly
    group_by = self.group_by
    order_by = self.order_by
    assert group_by or order_by
    # Get the DataFrame from which the operand originated
    # (passed in when constructing this context object in
    # execute_node(ops.WindowOp))
    parent = self.parent
    # `parent` may be a DataFrame or a GroupBy; `.obj` unwraps the latter.
    frame = getattr(parent, 'obj', parent)
    obj = getattr(grouped_data, 'obj', grouped_data)
    name = obj.name
    # If the operand is not literally a column of `frame`, or its name
    # collides with a grouping/ordering key, attach it under a unique name
    # so the column lookups below are unambiguous.
    if frame[name] is not obj or name in group_by or name in order_by:
        name = f"{name}_{ibis.util.guid()}"
        frame = frame.assign(**{name: obj})
    # set the index to our order_by keys and append it to the existing
    # index
    # TODO: see if we can do this in the caller, when the context
    # is constructed rather than pulling out the data
    columns = group_by + order_by + [name]
    # Create a new frame to avoid mutating the original one
    indexed_by_ordering = frame[columns].copy()
    # placeholder column to compute window_sizes below
    indexed_by_ordering['_placeholder'] = 0
    # Stable sort keeps the original row order among ties, which the
    # positional window-index computation below depends on.
    indexed_by_ordering = indexed_by_ordering.set_index(
        order_by
    ).sort_index(kind="stable")
    # regroup if needed
    if group_by:
        grouped_frame = indexed_by_ordering.groupby(group_by)
    else:
        grouped_frame = indexed_by_ordering
    grouped = grouped_frame[name]

    if callable(function):
        # To compute the window_size, we need to contruct a
        # RollingGroupby and compute count using construct_window.
        # However, if the RollingGroupby is not numeric, e.g.,
        # we are calling window UDF on a timestamp column, we
        # cannot compute rolling count directly because:
        # (1) windowed.count() will exclude NaN observations
        #     , which results in incorrect window sizes.
        # (2) windowed.apply(len, raw=True) will include NaN
        #     obversations, but doesn't work on non-numeric types.
        #     https://github.com/pandas-dev/pandas/issues/23002
        # To deal with this, we create a _placeholder column
        # (all zeros, hence numeric and never NaN) and count over it.
        windowed_frame = self.construct_window(grouped_frame)
        window_sizes = (
            windowed_frame['_placeholder'].count().reset_index(drop=True)
        )
        mask = ~(window_sizes.isna())
        # Convert per-row window sizes into [lower, upper) positional
        # index bounds for the UDF executor.
        window_upper_indices = pd.Series(range(len(window_sizes))) + 1
        window_lower_indices = window_upper_indices - window_sizes
        # The result Series of udf may need to be trimmed by
        # timecontext. In order to do so, 'time' must be added
        # as an index to the Series, if present. Here We extract
        # time column from the parent Dataframe `frame`.
        if get_time_col() in frame:
            result_index = construct_time_context_aware_series(
                obj, frame
            ).index
        else:
            result_index = obj.index
        result = window_agg_udf(
            grouped_data,
            function,
            window_lower_indices,
            window_upper_indices,
            mask,
            result_index,
            self.dtype,
            self.max_lookback,
            *args,
            **kwargs,
        )
    else:
        # perform the per-group rolling operation
        windowed = self.construct_window(grouped)
        result = window_agg_built_in(
            frame,
            windowed,
            function,
            self.max_lookback,
            *args,
            **kwargs,
        )
    # Best-effort cast to the expected ibis dtype; some results (e.g.
    # non-castable object dtypes) are returned as-is.
    try:
        return result.astype(self.dtype, copy=False)
    except (TypeError, ValueError):
        return result