def test_rolling_mean(min_periods, window_length, gap, rolling_series_pd):
    """Compare ``RollingMean`` with a direct ``_roll_series_with_gap(...).mean()``.

    The expected values are built from the numeric equivalents of ``gap`` and
    ``window_length`` (via ``get_number_from_offset``), while the primitive is
    given the parameters exactly as supplied. The test then checks both the
    number of NaN results and element-wise equality of the two series.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # The series is uniform, so rolling with the numeric parameters is a valid
    # reference for whatever form the original parameters take.
    reference_roll = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(reference_roll.mean().values)

    rolling_mean = RollingMean(
        window_length=window_length,
        gap=gap,
        min_periods=min_periods,
    ).get_function()
    actual = pd.Series(
        rolling_mean(
            rolling_series_pd.index,
            pd.Series(rolling_series_pd.values),
        )
    )

    # A falsy min_periods (e.g. 0) is treated the same as a min_periods of 1.
    effective_min_periods = min_periods or 1
    expected_nan_count = gap_num + effective_min_periods - 1

    assert actual.isna().sum() == expected_nan_count
    pd.testing.assert_series_equal(expected, actual)
def test_rolling_std(min_periods, window_length, gap, rolling_series_pd):
    """Compare ``RollingSTD`` with a direct ``_roll_series_with_gap(...).std()``.

    The expected values are built from the numeric equivalents of ``gap`` and
    ``window_length`` (via ``get_number_from_offset``), while the primitive is
    given the parameters exactly as supplied. The test then checks both the
    number of NaN results and element-wise equality of the two series.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # The series is uniform, so rolling with the numeric parameters is a valid
    # reference for whatever form the original parameters take.
    reference_roll = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(reference_roll.std().values)

    rolling_std = RollingSTD(
        window_length=window_length,
        gap=gap,
        min_periods=min_periods,
    ).get_function()
    actual = pd.Series(
        rolling_std(
            rolling_series_pd.index,
            pd.Series(rolling_series_pd.values),
        )
    )

    # The pandas std function returns NaN when a window holds only one value,
    # so a min_periods of 0 or 1 (or a falsy value) is counted as if it were 2.
    if min_periods in (0, 1):
        effective_min_periods = 2
    else:
        effective_min_periods = min_periods or 2
    expected_nan_count = gap_num + effective_min_periods - 1

    assert actual.isna().sum() == expected_nan_count
    pd.testing.assert_series_equal(expected, actual)
def test_roll_series_with_gap(window_length, gap, rolling_series_pd):
    """Check rolling min/max from ``_roll_series_with_gap`` row by row.

    For every row position the expected window bounds are derived from the
    numeric gap and window length, and the rolling min/max are compared with
    those bounds. Rows whose window lies entirely before the start of the
    series are expected to be null.
    """
    rolling_max = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).max()
    rolling_min = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).min()

    n_rows = len(rolling_series_pd)
    assert len(rolling_max) == n_rows
    assert len(rolling_min) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)
    gap_is_offset_string = isinstance(gap, str)

    for pos, (observed_min, observed_max) in enumerate(zip(rolling_min, rolling_max)):
        window_start = pos - gap_num - window_length_num + 1
        # With a string gap no gap handling happens at this level, so the end
        # of the window behaves as if the gap were 0 and includes the row
        # itself; otherwise the window ends gap_num rows earlier.
        window_end = pos if gap_is_offset_string else pos - gap_num

        # Both bounds negative: the window is entirely before the series.
        if window_start < 0 and window_end < 0:
            assert pd.isnull(observed_max)
            assert pd.isnull(observed_min)
            continue

        # The row values are a range from 0 to 20, so the rolling min equals
        # the (clamped) start index and the rolling max equals the end index.
        assert observed_min == max(window_start, 0)
        assert observed_max == window_end
def test_roll_series_with_gap_early_values(window_length, gap, rolling_series_pd):
    """Check how ``_roll_series_with_gap`` treats the earliest, partial windows.

    Two scenarios are exercised using rolling ``count()``:

    * default ``min_periods`` -- partial windows are kept, so the test counts
      empty (count == 0) and partial (0 < count < window length) aggregates;
    * ``min_periods`` equal to the numeric window length -- the test counts
      null aggregates and aggregates smaller than the window length.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)
    gap_is_offset_string = isinstance(gap, str)

    # Scenario 1: default min_periods is 1, so every partial window is included.
    default_counts = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).count()
    is_empty = default_counts == 0
    is_partial = (default_counts != 0) & (default_counts < window_length_num)
    num_empty_aggregates = is_empty.sum()
    num_partial_aggregates = is_partial.sum()

    assert num_partial_aggregates == window_length_num - 1
    # A string gap isn't handled here, so each window at least includes the
    # row itself and is never empty; a numeric gap leaves gap_num empty windows.
    expected_empty = 0 if gap_is_offset_string else gap_num
    assert num_empty_aggregates == expected_empty

    # Scenario 2: require a full window before producing a value.
    full_window_counts = _roll_series_with_gap(
        rolling_series_pd,
        window_length,
        gap=gap,
        min_periods=window_length_num,
    ).count()
    num_null_aggregates = pd.isna(full_window_counts).sum()
    num_partial_aggregates = (full_window_counts < window_length_num).sum()

    # Because of the shift, the gap shows up as NaN values in the series.
    # count() does not count NaNs in a window, yet the gap rows are still
    # included when deciding whether a window satisfies "min periods". This
    # differs from max, for example, which does not count NaNs in a window as
    # values towards "min periods".
    assert num_null_aggregates == window_length_num - 1
    # A string gap isn't handled here, so there are never partial aggregates;
    # a numeric gap yields gap_num of them.
    expected_partial = 0 if gap_is_offset_string else gap_num
    assert num_partial_aggregates == expected_partial
def test_rolling_count(window_length, gap, rolling_series_pd):
    """Compare ``RollingCount`` with ``_roll_series_with_gap(...).count()``.

    Both sides use ``min_periods`` equal to the numeric window length. The
    test checks the number of NaN results from the primitive and then compares
    only the values that follow the leading NaN rows.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    reference_roll = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=window_length_num,
    )
    expected = pd.Series(reference_roll.count().values)

    rolling_count = RollingCount(
        window_length=window_length,
        gap=gap,
        min_periods=window_length_num,
    ).get_function()
    actual = pd.Series(rolling_count(rolling_series_pd.index))

    leading_nans = gap_num + window_length_num - 1
    assert actual.isna().sum() == leading_nans

    # RollingCount will not match the raw _roll_series_with_gap call exactly,
    # because the primitive handles the min_periods difference internally;
    # only the rows after the leading NaNs are compared.
    pd.testing.assert_series_equal(
        expected.iloc[leading_nans:],
        actual.iloc[leading_nans:],
    )
def test_apply_roll_with_offset_gap(window_length, gap, rolling_series_pd):
    """Check rolling min/max computed through ``_apply_roll_with_offset_gap``.

    The rolling object from ``_roll_series_with_gap`` is aggregated with
    ``apply`` using wrappers around ``_apply_roll_with_offset_gap`` (with
    ``min_periods=1``). Each row's result is compared with the window bounds
    derived from the numeric gap and window length; rows whose window lies
    entirely before the start of the series are expected to be null.
    """

    def _gap_aware(aggregation):
        # Build the single-argument callable that ``Rolling.apply`` expects.
        def _apply_to_window(sub_s):
            return _apply_roll_with_offset_gap(sub_s, gap, aggregation, min_periods=1)

        return _apply_to_window

    rolling_max_series = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).apply(_gap_aware(max))
    rolling_min_series = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).apply(_gap_aware(min))

    n_rows = len(rolling_series_pd)
    assert len(rolling_max_series) == n_rows
    assert len(rolling_min_series) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    for pos, (observed_min, observed_max) in enumerate(
        zip(rolling_min_series, rolling_max_series)
    ):
        window_start = pos - gap_num - window_length_num + 1
        # With the _apply call in place the gap is honoured, so the window
        # always ends gap_num rows before the current row.
        window_end = pos - gap_num

        # Both bounds negative: the window is entirely before the series.
        if window_start < 0 and window_end < 0:
            assert pd.isnull(observed_max)
            assert pd.isnull(observed_min)
            continue

        # The row values are a range from 0 to 20, so the rolling min equals
        # the (clamped) start index and the rolling max equals the end index.
        assert observed_min == max(window_start, 0)
        assert observed_max == window_end