Esempio n. 1
0
def test_diff_detector_require_thresholds(require_threshold: bool):
    """
    Check how ``anomaly`` behaves depending on ``require_thresholds``.

    With thresholds required, the test expects ``anomaly`` to raise
    ``AttributeError`` until ``cross_validate`` has been called, and to
    succeed afterwards. With thresholds not required, ``anomaly`` is expected
    to work directly after ``fit``.
    """
    features = pd.DataFrame(np.random.random((100, 5)))
    targets = pd.DataFrame(np.random.random((100, 2)))

    detector = DiffBasedAnomalyDetector(
        base_estimator=MultiOutputRegressor(LinearRegression()),
        require_thresholds=require_threshold,
    )
    detector.fit(features, targets)

    if not require_threshold:
        # Thresholds are optional: anomaly() must work right after fit().
        detector.anomaly(features, targets)
        return

    # Thresholds are mandatory, but cross_validate() has not been run yet,
    # so the call is expected to fail.
    with pytest.raises(AttributeError):
        detector.anomaly(features, targets)

    # Once cross-validation has run, the very same call must go through.
    detector.cross_validate(X=features, y=targets)
    detector.anomaly(features, targets)
Esempio n. 2
0
def test_diff_detector(scaler, index, lookback, with_thresholds: bool):
    """
    Run DiffBasedAnomalyDetector end to end and re-derive its anomaly columns.

    The detector is fitted on random data (optionally cross-validated first).
    The scaled and unscaled per-tag and total anomaly columns returned by
    ``anomaly`` are then compared against values recomputed here from the
    base prediction dataframe. The confidence columns are expected to be
    present only when ``cross_validate`` was called.

    ``lookback`` is not referenced in the body; presumably it is supplied by
    the test's parametrization or a fixture, so it stays in the signature.
    """
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    confidence_columns = ("anomaly-confidence", "total-anomaly-confidence")

    # Some dataset.
    X = pd.DataFrame(np.random.random((10, 3)))
    y = pd.DataFrame(np.random.random((10, 3)))

    regressor = MultiOutputRegressor(estimator=LinearRegression())
    detector = DiffBasedAnomalyDetector(
        base_estimator=regressor,
        scaler=scaler,
        require_thresholds=False,
    )

    assert isinstance(detector, AnomalyDetectorBase)

    expected_params = {"base_estimator": regressor, "scaler": scaler}
    assert detector.get_params() == expected_params

    if with_thresholds:
        detector.cross_validate(X=X, y=y)

    detector.fit(X, y)

    predictions: np.ndarray = detector.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"],
        model_input=X,
        model_output=predictions,
        index=index,
    )

    # The plain prediction dataframe carries no anomaly columns yet.
    for column in anomaly_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic.
    anomaly_df = detector.anomaly(X, y, timedelta(days=1))

    # Every error column must have been added by anomaly().
    for column in anomaly_columns:
        assert column in anomaly_df.columns

    # Unscaled errors: absolute difference per tag, mean of squares per row.
    tag_error_unscaled = np.abs(base_df["model-output"].values - y.values)
    total_error_unscaled = np.square(tag_error_unscaled).mean(axis=1)
    assert np.allclose(
        tag_error_unscaled, anomaly_df["tag-anomaly-unscaled"].values
    )
    assert np.allclose(
        total_error_unscaled, anomaly_df["total-anomaly-unscaled"].values
    )

    # Scaled errors: same computation after passing both sides through scaler.
    scaled_output = scaler.transform(base_df["model-output"].values)
    scaled_target = scaler.transform(y)
    tag_error_scaled = np.abs(scaled_output - scaled_target)
    total_error_scaled = np.square(tag_error_scaled).mean(axis=1)
    assert np.allclose(
        tag_error_scaled, anomaly_df["tag-anomaly-scaled"].values
    )
    assert np.allclose(
        total_error_scaled, anomaly_df["total-anomaly-scaled"].values
    )

    # Confidence columns appear exactly when cross_validate() was called.
    expect_confidence = bool(with_thresholds)
    for column in confidence_columns:
        assert (column in anomaly_df.columns) == expect_confidence
Esempio n. 3
0
def test_diff_detector_with_window(scaler, len_x_y: int, time_index: bool,
                                   lookback: int, with_thresholds: bool):
    """
    Run DiffBasedAnomalyDetector with ``window=144`` and check its output.

    Besides the plain scaled/unscaled anomaly columns, the ``smooth-*``
    columns are compared against a rolling median (over ``model.window``)
    recomputed here from the base prediction dataframe. When the data is at
    least one window long, the number of NaNs in the smoothed scaled columns
    is also checked. The confidence columns are expected only when
    ``cross_validate`` was called.

    ``lookback`` is not referenced in the body; presumably it is supplied by
    the test's parametrization or a fixture, so it stays in the signature.
    """
    window = 144
    raw_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    smooth_columns = (
        "smooth-total-anomaly-scaled",
        "smooth-total-anomaly-unscaled",
        "smooth-tag-anomaly-scaled",
        "smooth-tag-anomaly-unscaled",
    )
    anomaly_columns = raw_columns + smooth_columns

    # Some dataset.
    X = pd.DataFrame(np.random.random((len_x_y, 3)))
    y = pd.DataFrame(np.random.random((len_x_y, 3)))
    tags = ["A", "B", "C"]
    index = (
        pd.date_range("2019-01-01", "2019-01-11", periods=len_x_y)
        if time_index
        else range(len_x_y)
    )

    regressor = MultiOutputRegressor(estimator=LinearRegression())
    detector = DiffBasedAnomalyDetector(
        base_estimator=regressor,
        scaler=scaler,
        require_thresholds=False,
        window=window,
    )

    assert isinstance(detector, AnomalyDetectorBase)

    expected_params = {
        "base_estimator": regressor,
        "scaler": scaler,
        "window": window,
    }
    assert detector.get_params() == expected_params

    if with_thresholds:
        detector.cross_validate(X=X, y=y)

    detector.fit(X, y)

    predictions: np.ndarray = detector.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=tags,
        model_input=X,
        model_output=predictions,
        index=index,
    )

    # The plain prediction dataframe carries no anomaly columns yet.
    for column in anomaly_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic.
    anomaly_df = detector.anomaly(X, y)

    # Every raw and smoothed error column must have been added by anomaly().
    for column in anomaly_columns:
        assert column in anomaly_df.columns

    def rolling_median(data):
        # Rolling median over the detector's window, with NaN rows removed.
        return data.rolling(detector.window).median().dropna()

    # --- Unscaled data ---------------------------------------------------
    tag_error_unscaled = pd.DataFrame(
        data=np.abs(base_df["model-output"].to_numpy() - y.to_numpy()),
        index=index,
        columns=tags,
    )
    total_error_unscaled = pd.Series(
        data=np.square(tag_error_unscaled).mean(axis=1)
    )
    assert np.allclose(
        tag_error_unscaled.to_numpy(),
        anomaly_df["tag-anomaly-unscaled"].to_numpy(),
    )
    assert np.allclose(
        total_error_unscaled.to_numpy(),
        anomaly_df["total-anomaly-unscaled"].to_numpy(),
    )

    smooth_tag_error_unscaled = rolling_median(tag_error_unscaled)
    smooth_total_error_unscaled = rolling_median(total_error_unscaled)
    assert np.allclose(
        smooth_tag_error_unscaled.to_numpy(),
        anomaly_df["smooth-tag-anomaly-unscaled"].dropna().to_numpy(),
    )
    assert np.allclose(
        smooth_total_error_unscaled.to_numpy(),
        anomaly_df["smooth-total-anomaly-unscaled"].dropna().to_numpy(),
    )

    # --- Scaled data -----------------------------------------------------
    scaled_output = scaler.transform(base_df["model-output"].to_numpy())
    scaled_target = scaler.transform(y)
    tag_error_scaled = pd.DataFrame(
        data=np.abs(scaled_output - scaled_target),
        index=index,
        columns=tags,
    )
    total_error_scaled = pd.Series(
        data=np.square(tag_error_scaled).mean(axis=1)
    )
    assert np.allclose(
        tag_error_scaled.to_numpy(),
        anomaly_df["tag-anomaly-scaled"].to_numpy(),
    )
    assert np.allclose(
        total_error_scaled,
        anomaly_df["total-anomaly-scaled"].to_numpy(),
    )

    smooth_tag_error_scaled = rolling_median(tag_error_scaled)
    smooth_total_error_scaled = rolling_median(total_error_scaled)
    assert np.allclose(
        smooth_tag_error_scaled.to_numpy(),
        anomaly_df["smooth-tag-anomaly-scaled"].dropna().to_numpy(),
    )
    assert np.allclose(
        smooth_total_error_scaled.to_numpy(),
        anomaly_df["smooth-total-anomaly-scaled"].dropna().to_numpy(),
    )

    # With at least one full window of rows, each smoothed column should
    # contain exactly (window - 1) NaNs.
    if len_x_y >= detector.window:
        smooth_tags = anomaly_df["smooth-tag-anomaly-scaled"]
        smooth_total = anomaly_df["smooth-total-anomaly-scaled"]
        missing_per_column = detector.window - 1
        assert (smooth_tags.isna().sum().sum() ==
                missing_per_column * smooth_tags.shape[1])
        assert smooth_total.isna().sum() == missing_per_column

    # Confidence columns appear exactly when cross_validate() was called.
    expect_confidence = bool(with_thresholds)
    for column in ("anomaly-confidence", "total-anomaly-confidence"):
        assert (column in anomaly_df.columns) == expect_confidence