Beispiel #1
0
def test_tukey_no_outliers():
    """A series of identical values must produce an all-False mask."""
    constant_values = pd.Series([1] * 10)
    expected = pd.Series([False] * 10)
    assert_series_equal(expected, outliers.tukey(constant_values))
Beispiel #2
0
def test_tukey_outlier_below():
    """The one unusually low value is the only point tukey flags."""
    data = pd.Series([5, 9, 9, 8, 7, 1, 7, 8, 6, 8])
    # The value 1 sits at position 5; every other position must be False.
    low_position = 5
    expected = pd.Series(
        [position == low_position for position in range(len(data))]
    )
    assert_series_equal(expected, outliers.tukey(data))
Beispiel #3
0
def test_tukey_lower_criteria():
    """With k=3 the single small value is no longer flagged as an outlier."""
    data = pd.Series([5, 9, 9, 8, 7, 1, 7, 8, 6, 8])
    expected = pd.Series([False] * len(data))
    flagged = outliers.tukey(data, k=3)
    assert_series_equal(expected, flagged)
Beispiel #4
0
def test_tukey_outlier_above():
    """The one unusually high value is the only point tukey flags."""
    data = pd.Series([0, 1, 3, 2, 1, 3, 4, 1, 1, 9, 2])
    # The value 9 sits at position 9; every other position must be False.
    high_position = 9
    expected = pd.Series(
        [position == high_position for position in range(len(data))]
    )
    assert_series_equal(expected, outliers.tukey(data))
Beispiel #5
0
import pandas as pd
import pathlib

# %%
# Start by loading the ac_power_inv_7539_outliers example. The
# "value_normalized" column holds min-max normalized AC power, and the
# boolean "outlier" column is True for the inserted outliers and False for
# every other value. The outliers were added to the data set by hand so that
# the behavior of each outlier detection function can be illustrated.
# The normalized time series comes from the PV Fleets Initiative and is
# adapted from the DuraMAT DataHub clipping data set:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data
pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
ac_power_file_1 = pvanalytics_dir.joinpath(
    'data', 'ac_power_inv_7539_outliers.csv'
)
data = pd.read_csv(ac_power_file_1, index_col=0, parse_dates=True)
print(data.head(10))

# %%
# Next, :py:func:`pvanalytics.quality.outliers.tukey` is applied to the time
# series to build an outlier mask, and the series is plotted with the
# detected outliers overlaid as markers.
normalized_power = data['value_normalized']
tukey_outlier_mask = tukey(data=normalized_power, k=0.5)
normalized_power.plot()
# Draw the flagged points as markers only, on top of the full series.
normalized_power.loc[tukey_outlier_mask].plot(ls='', marker='o')
plt.legend(labels=["AC Power", "Detected Outlier"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()