Esempio n. 1
0
def test_roll_series_with_gap_different_input_types_same_result_uniform(
    rolling_series_pd,
):
    """Compare rolling max results across numeric and offset parameters.

    Offset inputs will only produce the same results as numeric inputs when
    the data has a uniform frequency, which is what this test relies on.
    """
    numeric_window, numeric_gap = 5, 2
    offset_window, offset_gap = "5d", "2d"

    # Baseline: window length and gap are both integers.
    numeric_result = _roll_series_with_gap(
        rolling_series_pd, window_size=numeric_window, gap=numeric_gap
    ).max()

    # Window length and gap are both offsets; every window is passed through
    # _apply_roll_with_offset_gap together with the offset gap.
    offset_roller = _roll_series_with_gap(
        rolling_series_pd, window_size=offset_window, gap=offset_gap
    )
    offset_result = offset_roller.apply(
        lambda window: _apply_roll_with_offset_gap(
            window, offset_gap, max, min_periods=1
        )
    )

    # Matching input types must agree with one another.
    pd.testing.assert_series_equal(numeric_result, offset_result)

    # Mismatched input types: offset window length with a numeric gap.
    mixed_result = _roll_series_with_gap(
        rolling_series_pd, window_size=offset_window, gap=numeric_gap
    ).max()

    # The mismatched combination must produce the same values as well.
    pd.testing.assert_series_equal(numeric_result, mixed_result)
Esempio n. 2
0
def test_roll_series_with_gap(window_length, gap, rolling_series_pd):
    """Check the rolling min and max of ``_roll_series_with_gap`` row by row.

    Expected values are derived from positional arithmetic on the row number,
    the numeric gap and the numeric window length.
    """
    highs = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap).max()
    lows = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap).min()

    n_rows = len(rolling_series_pd)
    assert len(highs) == n_rows
    assert len(lows) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)
    gap_is_offset = isinstance(gap, str)

    for row in range(n_rows):
        first = row - gap_num - window_length_num + 1
        # With an offset-string gap no gap functionality is happening here, so
        # it's like the gap is 0 and the window ends on the row itself.
        last = row if gap_is_offset else row - gap_num

        # Both bounds negative: the window lies entirely before the data.
        if first < 0 and last < 0:
            assert pd.isnull(highs.iloc[row])
            assert pd.isnull(lows.iloc[row])
            continue

        # Because the row values are a range from 0 to 20, the rolling min is
        # the (clamped) start index and the rolling max is the end index.
        assert lows.iloc[row] == max(first, 0)
        assert highs.iloc[row] == last
Esempio n. 3
0
def test_roll_series_with_gap_nullable_types_with_nans(rolling_series_pd):
    """Check rolling max over data containing NaNs for float64 and Int64.

    The values 1 and 3 are replaced with NaN; both dtypes must produce the
    same rolling max, and that result must match hand-written expectations.
    """
    window_length, gap = 3, 2

    floats_with_nans = rolling_series_pd.astype("float64").replace(
        {1: np.nan, 3: np.nan}
    )
    ints_with_nans = floats_with_nans.astype("Int64")

    int_rolling_max = _roll_series_with_gap(
        ints_with_nans, window_length, gap=gap
    ).max()
    float_rolling_max = _roll_series_with_gap(
        floats_with_nans, window_length, gap=gap
    ).max()

    pd.testing.assert_series_equal(int_rolling_max, float_rolling_max)

    n_rows = len(rolling_series_pd)
    # The first seven rows are written out by hand; the rest follow a range.
    expected_values = [np.nan, np.nan, 0, 0, 2, 2, 4]
    expected_values.extend(range(7 - gap, n_rows - gap))

    for row in range(n_rows):
        actual = float_rolling_max.iloc[row]
        expected = expected_values[row]

        if not pd.isnull(actual):
            assert actual == expected
        else:
            assert pd.isnull(expected)
Esempio n. 4
0
def test_roll_series_with_gap_incorrect_types(rolling_series_pd):
    """Check that non-integer, non-string window length or gap raises TypeError.

    ``pytest.raises(match=...)`` treats its argument as a regular expression,
    so the expected messages are escaped to make the trailing ``.`` match a
    literal period rather than any character.
    """
    import re

    error = "Window length must be either an offset string or an integer."
    with pytest.raises(TypeError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size=4.2, gap=4)

    error = "Gap must be either an offset string or an integer."
    with pytest.raises(TypeError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size=4, gap=4.2)
Esempio n. 5
0
def test_roll_series_with_gap_negative_inputs(rolling_series_pd):
    """Check that a negative window length or gap raises ValueError.

    ``pytest.raises(match=...)`` treats its argument as a regular expression,
    so the expected messages are escaped to make the trailing ``.`` match a
    literal period rather than any character.
    """
    import re

    error = "Window length must be greater than zero."
    with pytest.raises(ValueError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size=-4, gap=4)

    error = "Gap must be greater than or equal to zero."
    with pytest.raises(ValueError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size=4, gap=-4)
Esempio n. 6
0
def test_apply_roll_with_offset_data_frequency_higher_than_parameters_frequency():
    """Check NaN counts when hourly data is rolled with day-based parameters.

    A 200-row hourly series is rolled over a "5D" window with ``min_periods``
    equal to the number of hours in that window, for three different gaps.
    """
    window_length = "5D"  # 120 hours
    hours_per_day = 24
    # In order for min periods to be the length of the window, we need
    # 24 hours * 5 days.
    min_periods = 5 * hours_per_day

    hourly_index = list(pd.date_range(start="2017-01-01", freq="1H", periods=200))
    high_frequency_series = pd.Series(range(200), index=hourly_index)

    def count_nans(gap):
        """Return how many NaNs the rolling max contains for ``gap``."""
        roller = _roll_series_with_gap(
            high_frequency_series, window_length, min_periods=min_periods, gap=gap
        )
        rolled = roller.apply(
            lambda sub_s: _apply_roll_with_offset_gap(
                sub_s, gap, max, min_periods=min_periods
            )
        )
        return rolled.isna().sum()

    # (gap, number of rows the gap covers at hourly frequency)
    cases = [
        ("0d", 0),  # without gap
        ("3H", 3),  # small gap
        ("2D", 2 * hours_per_day),  # large gap, in days, so multiply by 24 hours
    ]
    for gap, gap_rows in cases:
        assert count_nans(gap) == (min_periods - 1) + gap_rows
Esempio n. 7
0
def test_roll_series_with_gap_nullable_types(rolling_series_pd):
    """Check that Int64 and int64 inputs produce the same rolling max."""
    window_length, gap = 3, 2

    def rolling_max_for(dtype):
        """Cast the fixture to ``dtype`` and return its rolling max."""
        converted = rolling_series_pd.astype(dtype)
        return _roll_series_with_gap(converted, window_length, gap=gap).max()

    # Because we're inserting nans, confirm that the nullability of the dtype
    # doesn't have an impact on the results.
    nullable_result = rolling_max_for("Int64")
    non_nullable_result = rolling_max_for("int64")

    pd.testing.assert_series_equal(nullable_result, non_nullable_result)
        def rolling_count(datetime):
            """Return the number of rows in each rolling window over ``datetime``.

            A series of ones indexed by ``datetime`` is rolled using the
            enclosing object's ``window_length``, ``gap`` and ``min_periods``.

            Args:
                datetime: Values used as the index of the series being rolled.

            Returns:
                The ``.values`` array of the rolled counts, with NaN where a
                window does not qualify.
            """
            ones = pd.Series(1, index=datetime)
            rolled_series = _roll_series_with_gap(ones,
                                                  self.window_length,
                                                  gap=self.gap,
                                                  min_periods=self.min_periods)

            if isinstance(self.gap, str):
                # Since _apply_roll_with_offset_gap doesn't artificially add nans before rolling,
                # it produces correct results
                additional_args = (self.gap, len, self.min_periods)
                return rolled_series.apply(_apply_roll_with_offset_gap,
                                           args=additional_args).values

            rolling_count_series = rolled_series.count()

            # The shift made to account for gap adds NaNs to the rolled series
            # Those values get counted towards min_periods when they shouldn't.
            # So we need to replace any of those partial values with NaNs
            if not self.min_periods:
                # when min periods is 0 or None it's treated the same as if it's 1
                num_nans = self.gap
            else:
                num_nans = self.min_periods - 1 + self.gap

            # A slice clamps to the series length, whereas indexing with
            # range(num_nans) raises IndexError when num_nans exceeds it.
            rolling_count_series.iloc[:num_nans] = np.nan
            return rolling_count_series.values
Esempio n. 9
0
def test_apply_roll_with_offset_gap_non_uniform():
    """Check rolling counts with offset parameters on non-uniform data.

    The index is made of three date ranges (daily, every two days, daily), so
    no single frequency can be inferred for it.
    """
    window_length = "3d"
    gap = "3d"

    # When the data isn't uniform, this impacts the number of values in each
    # rolling window.
    datetimes = []
    for start, freq in (
        ("2017-01-01", "1d"),
        ("2017-02-01", "2d"),
        ("2017-03-01", "1d"),
    ):
        datetimes.extend(pd.date_range(start=start, freq=freq, periods=7))
    no_freq_series = pd.Series(range(len(datetimes)), index=datetimes)

    assert pd.infer_freq(no_freq_series.index) is None

    daily_counts = [None, None, None, 1, 2, 3, 3]
    two_day_counts = [None, None, 1, 1, 1, 1, 1]
    expected_series = pd.Series(
        daily_counts + two_day_counts + daily_counts,
        index=datetimes,
    )

    roller = _roll_series_with_gap(no_freq_series, window_length, gap=gap)
    rolling_count_series = roller.apply(
        lambda sub_s: _apply_roll_with_offset_gap(sub_s, gap, len, min_periods=1)
    )

    pd.testing.assert_series_equal(rolling_count_series, expected_series)
def test_rolling_mean(min_periods, window_length, gap, rolling_series_pd):
    """Compare the RollingMean primitive against a numeric rolling mean."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # Since we're using a uniform series we can check correctness using
    # numeric parameters.
    numeric_roller = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(numeric_roller.mean().values)

    primitive_func = RollingMean(
        window_length=window_length,
        gap=gap,
        min_periods=min_periods,
    ).get_function()

    times = rolling_series_pd.index
    numbers = pd.Series(rolling_series_pd.values)
    actual = pd.Series(primitive_func(times, numbers))

    # Since min_periods of 0 is the same as min_periods of 1
    effective_min_periods = min_periods or 1

    assert actual.isna().sum() == gap_num + effective_min_periods - 1
    pd.testing.assert_series_equal(expected, actual)
def test_rolling_std(min_periods, window_length, gap, rolling_series_pd):
    """Compare the RollingSTD primitive against a numeric rolling std."""
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    # Since we're using a uniform series we can check correctness using
    # numeric parameters.
    numeric_roller = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=min_periods,
    )
    expected = pd.Series(numeric_roller.std().values)

    primitive_func = RollingSTD(
        window_length=window_length,
        gap=gap,
        min_periods=min_periods,
    ).get_function()

    times = rolling_series_pd.index
    numbers = pd.Series(rolling_series_pd.values)
    actual = pd.Series(primitive_func(times, numbers))

    if min_periods in [0, 1]:
        # The additional nan is because the pandas std function returns NaN
        # if there's only one value.
        num_nans = gap_num + 1
    else:
        # A falsy min_periods that is neither 0 nor 1 is counted as 2 here.
        num_nans = gap_num + (min_periods or 2) - 1

    assert actual.isna().sum() == num_nans
    pd.testing.assert_series_equal(expected, actual)
Esempio n. 12
0
def test_roll_series_with_gap_early_values(window_length, gap, rolling_series_pd):
    """Check how many early rows are empty, partial or null after rolling.

    Counts are checked once with the default ``min_periods`` and once with
    ``min_periods`` equal to the numeric window length.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)
    gap_is_offset = isinstance(gap, str)

    # Default min periods is 1 - will include all
    default_counts = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap
    ).count()
    is_empty = default_counts == 0
    is_partial = (default_counts != 0) & (default_counts < window_length_num)
    num_empty_aggregates = len(default_counts.loc[is_empty])
    num_partial_aggregates = len(default_counts.loc[is_partial])

    assert num_partial_aggregates == window_length_num - 1
    # When gap is an offset string it isn't handled, so we'll always at least
    # include the row itself.
    expected_empty = 0 if gap_is_offset else gap_num
    assert num_empty_aggregates == expected_empty

    # Make min periods the size of the window
    strict_counts = _roll_series_with_gap(
        rolling_series_pd, window_length, gap=gap, min_periods=window_length_num
    ).count()
    num_null_aggregates = len(strict_counts.loc[pd.isna(strict_counts)])
    num_partial_aggregates = len(
        strict_counts.loc[strict_counts < window_length_num]
    )

    # because we shift, gap is included as nan values in the series.
    # Count treats nans in a window as values that don't get counted,
    # so the gap rows get included in the count for whether a window has "min periods".
    # This is different than max, for example, which does not count nans in a window as values towards "min periods"
    assert num_null_aggregates == window_length_num - 1
    # When gap is an offset string it isn't handled, so we'll never have any
    # partial aggregates.
    expected_partial = 0 if gap_is_offset else gap_num
    assert num_partial_aggregates == expected_partial
        def rolling_mean(datetime, numeric):
            """Return the rolling mean of ``numeric`` indexed by ``datetime``.

            The enclosing object's ``window_length``, ``gap`` and
            ``min_periods`` configure the roll. The result is the ``.values``
            array of the rolled series.
            """
            indexed_values = pd.Series(numeric.values, index=datetime.values)
            roller = _roll_series_with_gap(indexed_values,
                                           self.window_length,
                                           gap=self.gap,
                                           min_periods=self.min_periods)

            if not isinstance(self.gap, str):
                return roller.mean().values

            # For an offset-string gap, each window goes through
            # _apply_roll_with_offset_gap along with the gap and min periods.
            offset_args = (self.gap, np.mean, self.min_periods)
            rolled = roller.apply(_apply_roll_with_offset_gap, args=offset_args)
            return rolled.values
Esempio n. 14
0
def test_apply_roll_with_offset_gap(window_length, gap, rolling_series_pd):
    """Check rolling min/max row by row when using ``_apply_roll_with_offset_gap``.

    Expected values are derived from positional arithmetic on the row number,
    the numeric gap and the numeric window length.
    """

    def roll_and_apply(aggregation):
        """Roll the fixture and aggregate every window with ``aggregation``."""
        roller = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap)
        return roller.apply(
            lambda sub_s: _apply_roll_with_offset_gap(
                sub_s, gap, aggregation, min_periods=1
            )
        )

    highs = roll_and_apply(max)
    lows = roll_and_apply(min)

    n_rows = len(rolling_series_pd)
    assert len(highs) == n_rows
    assert len(lows) == n_rows

    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    for row in range(n_rows):
        first = row - gap_num - window_length_num + 1
        # Now that we have the _apply call, the gap is reflected in the end
        # index as expected.
        last = row - gap_num

        # Both bounds negative: the window lies entirely before the data.
        if first < 0 and last < 0:
            assert pd.isnull(highs.iloc[row])
            assert pd.isnull(lows.iloc[row])
            continue

        # Because the row values are a range from 0 to 20, the rolling min is
        # the (clamped) start index and the rolling max is the end index.
        assert lows.iloc[row] == max(first, 0)
        assert highs.iloc[row] == last
Esempio n. 15
0
def test_apply_roll_with_offset_data_min_periods_too_big(rolling_series_pd):
    """Check that an unreachable ``min_periods`` yields only NaN results."""
    window_length = "5D"
    gap = "2d"

    # Since the data has a daily frequency, there will only be, at most,
    # 5 rows in the window, so 6 periods can never be reached.
    min_periods = 6

    roller = _roll_series_with_gap(
        rolling_series_pd, window_length, min_periods=min_periods, gap=gap
    )
    rolled = roller.apply(
        lambda sub_s: _apply_roll_with_offset_gap(
            sub_s, gap, max, min_periods=min_periods
        )
    )

    # The resulting series is comprised entirely of nans
    nan_total = rolled.isna().sum()
    assert nan_total == len(rolling_series_pd)
Esempio n. 16
0
def test_roll_series_with_non_offset_string_inputs(rolling_series_pd):
    """Check the errors raised for invalid or mismatched string parameters.

    ``pytest.raises(match=...)`` treats its argument as a regular expression,
    so each expected message is escaped to make every ``.`` match a literal
    period rather than any character.
    """
    import re

    error = "Cannot roll series. The specified gap, test, is not a valid offset alias."
    with pytest.raises(ValueError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size="4D", gap="test")

    error = "Cannot roll series. The specified window length, test, is not a valid offset alias."
    with pytest.raises(ValueError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size="test", gap="7D")

    # Test mismatched types error
    error = (
        "Cannot roll series with offset gap, 2d, and numeric window length, 7. "
        "If an offset alias is used for gap, the window length must also be defined as an offset alias. "
        "Please either change gap to be numeric or change window length to be an offset alias."
    )
    with pytest.raises(TypeError, match=re.escape(error)):
        _roll_series_with_gap(rolling_series_pd, window_size=7, gap="2d").max()
Esempio n. 17
0
def test_apply_roll_with_offset_gap_min_periods(min_periods, rolling_series_pd):
    """Check empty and partial window counts for varying ``min_periods``."""
    window_length, window_length_num = "5d", 5
    gap, gap_num = "3d", 3

    roller = _roll_series_with_gap(rolling_series_pd, window_length, gap=gap)
    counts = roller.apply(
        lambda sub_s: _apply_roll_with_offset_gap(
            sub_s, gap, len, min_periods=min_periods
        )
    )

    # gap essentially creates rolling series that have no elements; which
    # should be nan to differentiate from when a window only has null values
    num_empty_aggregates = counts.isna().sum()

    # Partial windows are non-zero but smaller than the full window length.
    is_partial = (counts != 0) & (counts < window_length_num)
    num_partial_aggregates = len(counts.loc[is_partial])

    assert num_empty_aggregates == min_periods - 1 + gap_num
    assert num_partial_aggregates == window_length_num - min_periods
def test_rolling_count(window_length, gap, rolling_series_pd):
    """Compare the RollingCount primitive against a numeric rolling count.

    ``min_periods`` is set to the numeric window length for both sides.
    """
    gap_num = get_number_from_offset(gap)
    window_length_num = get_number_from_offset(window_length)

    numeric_roller = _roll_series_with_gap(
        rolling_series_pd,
        window_length_num,
        gap=gap_num,
        min_periods=window_length_num,
    )
    expected = pd.Series(numeric_roller.count().values)

    primitive_func = RollingCount(
        window_length=window_length,
        gap=gap,
        min_periods=window_length_num,
    ).get_function()
    actual = pd.Series(primitive_func(rolling_series_pd.index))

    num_nans = gap_num + window_length_num - 1
    assert actual.isna().sum() == num_nans

    # RollingCount will not match the exact _roll_series_with_gap call,
    # because it handles the min_periods difference within the primitive,
    # so only the rows after the leading NaNs are compared.
    pd.testing.assert_series_equal(expected.iloc[num_nans:], actual.iloc[num_nans:])
Esempio n. 19
0
def test_roll_series_with_no_gap(window_length, rolling_series_pd):
    """Check that rolling without a gap matches a plain pandas rolling mean."""
    # Reference: pandas' own rolling with min_periods of 1.
    pandas_roller = rolling_series_pd.rolling(window_length, min_periods=1)
    expected_rolling = pandas_roller.mean()

    gapless_roller = _roll_series_with_gap(rolling_series_pd, window_length)
    actual_rolling = gapless_roller.mean()

    pd.testing.assert_series_equal(actual_rolling, expected_rolling)