def __init__(
    self,
    dataframe: pd.DataFrame,
    line_shape_fallback: str = "linear",
) -> None:
    """Validate and normalize the UNSMRY dataframe and build vector metadata.

    Raises KeyError if any column in REQUIRED_COLUMNS is absent from the input.
    """
    for required in self.REQUIRED_COLUMNS:
        if required not in dataframe.columns:
            raise KeyError(f"{required} column is missing from UNSMRY data")

    # Work on a copy so the caller's dataframe is left untouched
    normalized_df = dataframe.copy()
    normalized_df["REAL"] = normalized_df["REAL"].astype(int)
    # Ensure consistent date representation in the "DATE" column
    normalized_df["DATE"] = pd.to_datetime(normalized_df["DATE"])
    make_date_column_datetime_object(normalized_df)

    self._dataframe = self._remove_columns_with_only_0(normalized_df)

    self.line_shape_fallback = set_simulation_line_shape_fallback(line_shape_fallback)
    self._dates = sorted(self._dataframe["DATE"].unique())
    self._vector_names = self._determine_vector_names()

    # Populate the vector selector with every available vector
    self.vector_selector_data: list = []
    for vector_name in self._vector_names:
        add_vector_to_vector_selector_data(self.vector_selector_data, vector_name)
# Example 2
def test_make_date_column_datetime_object_no_rows_df() -> None:
    """An empty (no-row) dataframe should pass through unchanged."""
    # Operate on a copy so the module-level fixture is not mutated
    df_copy = INPUT_NO_ROWS_DF.copy()

    make_date_column_datetime_object(df_copy)

    assert_frame_equal(df_copy, INPUT_NO_ROWS_DF)
# Example 3
def create_relative_to_date_df(
    df: pd.DataFrame, relative_date: datetime.datetime
) -> pd.DataFrame:
    """
    Create dataframe where data for relative_date is subtracted from respective
    vector data.

    I.e. subtract realization data at the given relative date from the
    corresponding realization data at each individual date, for each vector
    column in the dataframe.

    `Assume:`
    The set of realizations is equal for each date in the "DATE" column of the
    input dataframe.

    `Input:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]

    `Output:`
    * df - `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]

    `Raises:` ValueError if "DATE" or "REAL" is missing from the input columns.

    NOTE:
    - This function iterates over realization groups in the input dataframe.
    - The for-loop makes it possible to handle realizations not present in
      relative_date_df; such realizations are excluded from the output.
    """

    assert_date_column_is_datetime_object(df)

    if not {"DATE", "REAL"}.issubset(df.columns):
        raise ValueError('Expect column "DATE" and "REAL" in input dataframe!')

    # Empty output frame with columns of correct dtype
    _columns = {name: pd.Series(dtype=df.dtypes[name]) for name in df.columns}
    output_df = pd.DataFrame(_columns)

    relative_date_df: pd.DataFrame = df.loc[df["DATE"] == relative_date].drop(
        columns=["DATE"]
    )
    if relative_date_df.empty:
        # relative_date not present in data: dataframe with columns, but no rows
        return output_df

    vectors = [elm for elm in df.columns if elm not in ("DATE", "REAL")]

    # NOTE: This for-loop makes it possible to get real not represented in relative_date_df!
    for real, real_df in df.groupby("REAL"):
        relative_date_data = relative_date_df.loc[
            relative_date_df["REAL"] == real
        ].drop(columns=["REAL"])

        # If realization does not exist in relative_date_df, exclude it
        if relative_date_data.empty:
            continue

        # Copy before assigning into the group slice to avoid pandas
        # chained-assignment issues (SettingWithCopyWarning)
        real_df = real_df.copy()
        real_df[vectors] = real_df[vectors].sub(relative_date_data.iloc[0], axis=1)
        output_df = pd.concat([output_df, real_df], ignore_index=True)

    make_date_column_datetime_object(output_df)
    return output_df
# Example 4
def test_make_date_column_datetime_object_datetime_year_2263_df() -> None:
    """Dates in year 2263 (beyond pd.Timestamp range) convert as expected."""
    _verify_expected_df_date_column_data(EXPECTED_YEAR_2263_DF)

    # Operate on a copy so the module-level fixture is not mutated
    df_under_test = INPUT_DATETIME_YEAR_2263_DF.copy()

    make_date_column_datetime_object(df_under_test)

    assert_frame_equal(df_under_test, EXPECTED_YEAR_2263_DF)
# Example 5
def test_make_date_column_datetime_object_input_date_year_2020_df() -> None:
    """A "DATE" column holding datetime.date objects must raise ValueError."""
    dates = [datetime.date(2020, month, 15) for month in (1, 2, 3)]
    input_date_year_2020_df = pd.DataFrame({"DATE": dates, "A": [1.0, 2.0, 3.0]})

    with pytest.raises(ValueError) as err:
        make_date_column_datetime_object(input_date_year_2020_df)
    assert str(err.value) == f'Column "DATE" of type {datetime.date} is not handled!'
    def __create_delta_ensemble_vectors_df(
        self,
        vector_names: List[str],
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:
        """
        Get vectors dataframe with delta vectors for ensemble A and B, for common realizations

        `Return:` Dataframe with delta ensemble data for common vectors and realizations in
        ensemble A and B.

        `Output:`
        * DataFrame with columns ["DATE", "REAL", vector1, ..., vectorN]

        `Input:`
        * vector_names: List[str] - List of vector names to get data for
        * resampling_frequency: Optional[Frequency] - Optional resampling frequency
        * realizations: Optional[Sequence[int]] - Optional sequence of realization numbers
        for vectors

        NOTE:
        - Performs "inner join": only matching ["DATE", "REAL"] index pairs are kept, i.e. a
        "DATE"-"REAL" combination present in only one ensemble is neglected.
        - Ensures equal date samples and realizations by dropping nan-values.
        """

        if not vector_names:
            raise ValueError("List of requested vector names is empty")

        # Index on ["DATE", "REAL"] in that order so reset_index() below
        # restores the expected column order.
        index_columns = ["DATE", "REAL"]
        vectors_a_df = self._provider_a.get_vectors_df(
            vector_names, resampling_frequency, realizations
        ).set_index(index_columns)
        vectors_b_df = self._provider_b.get_vectors_df(
            vector_names, resampling_frequency, realizations
        ).set_index(index_columns)

        # Subtract B from A, drop rows not present in both ensembles, then
        # sort by "REAL" and thereafter "DATE" to group realizations and
        # order each group by date.
        delta_df = vectors_a_df.sub(vectors_b_df).dropna(axis=0, how="any")
        ensembles_delta_vectors_df = delta_df.reset_index().sort_values(
            ["REAL", "DATE"], ignore_index=True
        )

        make_date_column_datetime_object(ensembles_delta_vectors_df)

        return ensembles_delta_vectors_df
# Example 7
def test_make_date_column_datetime_object_input_timestamp_datetime_df() -> None:
    """A non-datetimelike "DATE" column surfaces pandas' .dt accessor error."""
    with pytest.raises(AttributeError) as exc_info:
        make_date_column_datetime_object(INPUT_TIMESTAMP_DATETIME_DF)
    assert str(exc_info.value) == "Can only use .dt accessor with datetimelike values"
# Example 8
def test_make_date_column_datetime_object_input_empty_df() -> None:
    """A dataframe without a "DATE" column must be rejected with ValueError."""
    with pytest.raises(ValueError) as exc_info:
        make_date_column_datetime_object(INPUT_EMPTY_DF)
    assert str(exc_info.value) == 'df does not contain column "DATE"'
def test_create_vector_realization_traces() -> None:
    """Expect one trace per realization; only the first shows a legend entry."""
    vector_df = pd.DataFrame(
        {
            "DATE": [
                datetime.datetime(2020, 1, 1),
                datetime.datetime(2020, 2, 1),
                datetime.datetime(2031, 5, 10),
                datetime.datetime(2031, 6, 10),
            ],
            "REAL": [1, 1, 2, 2],
            "A": [1.0, 2.0, 5.0, 6.0],
        }
    )
    make_date_column_datetime_object(vector_df)

    created_traces = create_vector_realization_traces(
        vector_df=vector_df,
        ensemble="Test ensemble",
        color="red",
        legend_group="Test group",
        line_shape="linear",
        hovertemplate="Test hovertemplate ",
        show_legend=True,
        legendrank=2,
    )

    # (realization, x-dates, y-values, showlegend) per expected trace
    per_realization = [
        (
            1,
            [datetime.datetime(2020, 1, 1), datetime.datetime(2020, 2, 1)],
            [1.0, 2.0],
            True,
        ),
        (
            2,
            [datetime.datetime(2031, 5, 10), datetime.datetime(2031, 6, 10)],
            [5.0, 6.0],
            False,
        ),
    ]
    expected_traces = [
        {
            "line": {"width": 1, "shape": "linear"},
            "x": dates,
            "y": values,
            "hovertemplate": f"Test hovertemplate Realization: {real}, Ensemble: Test ensemble",
            "name": "Test group",
            "legendgroup": "Test group",
            "marker": {"color": "red"},
            "legendrank": 2,
            "showlegend": show_legend,
        }
        for real, dates, values, show_legend in per_realization
    ]

    assert expected_traces == created_traces
    def create_calculated_vectors_df(
        self, realizations: Optional[Sequence[int]] = None
    ) -> pd.DataFrame:
        """Get dataframe with calculated vector data for provided vectors.

        The returned dataframe contains columns with name of vector and corresponding
        calculated data.

        The calculated vectors for delta ensembles are created by first creating the
        calculated vector data for ensemble A and B separately, and thereafter
        subtracting the data in ensemble B from A - thereby obtaining the delta
        ensemble of the resulting calculated vectors.

        Calculated vectors are created with the same sampling frequency as the
        providers are set with, i.e. resampling frequency is given for providers
        supporting resampling, otherwise sampling frequency is fixed.

        `Input:`
        * realizations: Sequence[int] - Sequence of realization numbers to include
        in calculation

        `Output:`
        * dataframe with vector names in columns and their calculated data in rows
        `Columns` in dataframe: ["DATE", "REAL", vector1, ..., vectorN]

        `Raises:` ValueError if no vector calculator expressions are configured.
        """
        if not self.has_vector_calculator_expressions():
            raise ValueError(
                f'Assembled vector data accessor for provider "{self._name}" '
                "has no vector calculator expressions"
            )

        def _inner_merge_dataframes(
            first: pd.DataFrame, second: pd.DataFrame
        ) -> pd.DataFrame:
            # First accumulated frame is empty before the first merge
            if first.empty:
                return second
            return pd.merge(first, second, how="inner")

        provider_a_calculated_vectors_df = pd.DataFrame()
        provider_b_calculated_vectors_df = pd.DataFrame()
        for expression in self._vector_calculator_expressions:
            provider_a_calculated_vector_df = create_calculated_vector_df(
                expression, self._provider_a, realizations, self._resampling_frequency
            )
            provider_b_calculated_vector_df = create_calculated_vector_df(
                expression, self._provider_b, realizations, self._resampling_frequency
            )

            if (
                provider_a_calculated_vector_df.empty
                or provider_b_calculated_vector_df.empty
            ):
                # TODO: Consider raising ValueError if vector calculation in one provider fails?
                # If both fail, it's okay?
                continue

            provider_a_calculated_vectors_df = _inner_merge_dataframes(
                provider_a_calculated_vectors_df, provider_a_calculated_vector_df
            )
            provider_b_calculated_vectors_df = _inner_merge_dataframes(
                provider_b_calculated_vectors_df, provider_b_calculated_vector_df
            )

        # NOTE: index order ["DATE","REAL"] to obtain column order when
        # performing reset_index() later.
        # NOTE(review): if every expression was skipped above, the frames have no
        # "DATE"/"REAL" columns and set_index raises KeyError - confirm intended.
        provider_a_calculated_vectors_df.set_index(["DATE", "REAL"], inplace=True)
        provider_b_calculated_vectors_df.set_index(["DATE", "REAL"], inplace=True)

        # Reset index, sort values by "REAL" and thereafter by "DATE" to
        # group realizations and order by date
        delta_ensemble_calculated_vectors_df = (
            provider_a_calculated_vectors_df.sub(provider_b_calculated_vectors_df)
            .dropna(axis=0, how="any")
            .reset_index()
            .sort_values(["REAL", "DATE"], ignore_index=True)
        )

        make_date_column_datetime_object(delta_ensemble_calculated_vectors_df)

        if self._relative_date:
            return dataframe_utils.create_relative_to_date_df(
                delta_ensemble_calculated_vectors_df,
                self._relative_date,
            )
        return delta_ensemble_calculated_vectors_df
# Expected per-day rate vectors for weekly sampled input: interval values
# divided by 7 days (see the 50.0/7.0 style entries); final date rows are 0.0.
EXPECTED_PER_DAY_WEEKLY_DF = pd.DataFrame(
    columns=["DATE", "REAL", "PER_DAY_A", "PER_DAY_B"],
    data=[
        [datetime.datetime(2021, 1, 1),  1, 50.0/7.0,  250.0/7.0],
        [datetime.datetime(2021, 1, 8),  1, 50.0/7.0,  250.0/7.0],
        [datetime.datetime(2021, 1, 15), 1, 0.0,       0.0      ],
        [datetime.datetime(2021, 1, 1),  2, 100.0/7.0, 350.0/7.0],
        [datetime.datetime(2021, 1, 8),  2, 100.0/7.0, 350.0/7.0],
        [datetime.datetime(2021, 1, 15), 2, 0.0,       0.0      ],
        [datetime.datetime(2021, 1, 1),  4, 200.0/7.0, 450.0/7.0],
        [datetime.datetime(2021, 1, 8),  4, 200.0/7.0, 450.0/7.0],
        [datetime.datetime(2021, 1, 15), 4, 0.0,       0.0      ],
    ],
)
# Convert date columns to datetime.datetime
make_date_column_datetime_object(INPUT_WEEKLY_DF)
make_date_column_datetime_object(EXPECTED_PER_INTVL_WEEKLY_DF)
make_date_column_datetime_object(EXPECTED_PER_DAY_WEEKLY_DF)

# Monthly frequency - rate per day implies divide on days in month
INPUT_MONTHLY_DF = pd.DataFrame(
    columns=["DATE", "REAL", "A", "B"],
    data=[
        [datetime.datetime(2021, 1, 1), 1, 50.0,   250.0 ],
        [datetime.datetime(2021, 2, 1), 1, 100.0,  500.0 ],
        [datetime.datetime(2021, 3, 1), 1, 150.0,  750.0 ],
        [datetime.datetime(2021, 1, 1), 2, 300.0,  350.0 ],
        [datetime.datetime(2021, 2, 1), 2, 400.0,  700.0 ],
        [datetime.datetime(2021, 3, 1), 2, 500.0,  1050.0],
        [datetime.datetime(2021, 1, 1), 4, 1000.0, 450.0 ],
        [datetime.datetime(2021, 2, 1), 4, 1200.0, 900.0 ],
# Example 12
# Expected sum of col "A" and "B" vectors in the delta ensemble dataframe
EXPECTED_SUM_A_AND_B_DF = pd.DataFrame(
    columns=["DATE", "REAL", "Sum A and B"],
    data=[
        [datetime.datetime(2000, 1, 1), 1, 459.0],
        [datetime.datetime(2000, 2, 1), 1, 918.0],
        [datetime.datetime(2000, 3, 1), 1, 1377.0],
        [datetime.datetime(2000, 1, 1), 2, 2754.0],
        [datetime.datetime(2000, 2, 1), 2, 3663.0],
        [datetime.datetime(2000, 3, 1), 2, 4572.0],
        [datetime.datetime(2000, 1, 1), 4, 9099.0],
        [datetime.datetime(2000, 2, 1), 4, 10908.0],
        [datetime.datetime(2000, 3, 1), 4, 12717.0],
    ])
# Convert date columns to datetime.datetime
make_date_column_datetime_object(INPUT_A_DF)
make_date_column_datetime_object(INPUT_B_DF)
make_date_column_datetime_object(EXPECTED_DELTA_DF)
make_date_column_datetime_object(EXPECTED_DELTA_INVTL_DF)
make_date_column_datetime_object(EXPECTED_SUM_A_AND_B_DF)

# Dates AFTER year 2262!
AFTER_2262_DATES = pd.Series([
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
    datetime.datetime(2265, 3, 1),
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
    datetime.datetime(2265, 3, 1),
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
# Example 13
def create_vectors_statistics_df(vectors_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create vectors statistics dataframe for given vectors in columns of provided vectors dataframe

    Calculate min, max, mean, p10, p90 and p50 for each vector column in the dataframe.

    `Input:`
    * vectors_df: pd.DataFrame - Dataframe with vector data and columns:
        ["DATE", "REAL", vector1, ... , vectorN]

    `Returns:`
    * Dataframe with double column level:\n
      [ "DATE",     vector1,                        ... vectorN
                    MEAN, MIN, MAX, P10, P90, P50   ... MEAN, MIN, MAX, P10, P90, P50]
    """
    assert_date_column_is_datetime_object(vectors_df)

    # Get vector names, keeping original column order.
    # NOTE: set difference (not symmetric difference "^") - identical result
    # when "DATE" and "REAL" are both present, but never leaks those names
    # into vector_names if one of them were missing.
    columns_list = list(vectors_df.columns)
    vector_names = sorted(
        set(columns_list) - {"DATE", "REAL"}, key=columns_list.index
    )

    # If no rows of data: return empty dataframe with statistics column layout
    if not vectors_df.shape[0]:
        columns_tuples = [("DATE", "")]
        for vector in vector_names:
            columns_tuples.extend([
                (vector, StatisticsOptions.MEAN),
                (vector, StatisticsOptions.MIN),
                (vector, StatisticsOptions.MAX),
                (vector, StatisticsOptions.P10),
                (vector, StatisticsOptions.P90),
                (vector, StatisticsOptions.P50),
            ])
        return pd.DataFrame(columns=pd.MultiIndex.from_tuples(columns_tuples))

    # Invert p10 and p90 due to oil industry convention.
    def p10(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=90)

    def p90(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=10)

    def p50(x: List[float]) -> np.floating:
        return np.nanpercentile(x, q=50)

    # One row per unique date; the aggregation function names ("nanmean",
    # "p10", ...) become the second column level, renamed below.
    statistics_df: pd.DataFrame = (vectors_df[["DATE"] + vector_names].groupby(
        ["DATE"]).agg([np.nanmean, np.nanmin, np.nanmax, p10, p90,
                       p50]).reset_index(level=["DATE"], col_level=0))

    # Rename columns to StatisticsOptions enum types for strongly typed format
    col_stat_label_map = {
        "nanmin": StatisticsOptions.MIN,
        "nanmax": StatisticsOptions.MAX,
        "nanmean": StatisticsOptions.MEAN,
        "p10": StatisticsOptions.P10,
        "p90": StatisticsOptions.P90,
        "p50": StatisticsOptions.P50,
    }
    statistics_df.rename(columns=col_stat_label_map, level=1, inplace=True)

    make_date_column_datetime_object(statistics_df)

    return statistics_df
# Example 14
# Expected sum of col "A" and "B" vectors in the input dataframe
EXPECTED_SUM_A_AND_B_DF = pd.DataFrame(
    columns=["DATE", "REAL", "Sum A and B"],
    data=[
        [datetime.datetime(2000, 1, 1), 1, 51.0],
        [datetime.datetime(2000, 2, 1), 1, 102.0],
        [datetime.datetime(2000, 3, 1), 1, 153.0],
        [datetime.datetime(2000, 1, 1), 2, 306.0],
        [datetime.datetime(2000, 2, 1), 2, 407.0],
        [datetime.datetime(2000, 3, 1), 2, 508.0],
        [datetime.datetime(2000, 1, 1), 4, 1011.0],
        [datetime.datetime(2000, 2, 1), 4, 1212.0],
        [datetime.datetime(2000, 3, 1), 4, 1413.0],
    ])
# Convert date columns to datetime.datetime
make_date_column_datetime_object(INPUT_DF)
make_date_column_datetime_object(EXPECTED_PER_INTVL_DF)
make_date_column_datetime_object(EXPECTED_SUM_A_AND_B_DF)

# Dates AFTER year 2262!
# NOTE: pd.Timestamp cannot represent dates beyond April 2262, so these stay
# as datetime.datetime objects in the object-dtype Series.
AFTER_2262_DATES = pd.Series([
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
    datetime.datetime(2265, 3, 1),
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
    datetime.datetime(2265, 3, 1),
    datetime.datetime(2265, 1, 1),
    datetime.datetime(2265, 2, 1),
    datetime.datetime(2265, 3, 1),
])
# Example 15
                                          [datetime(2000, 2, 1), 0, 23.0],
                                          [datetime(2000, 3, 1), 0, 33.0],
                                          [datetime(2000, 4, 1), 0, 43.0],
                                          [datetime(2000, 5, 1), 0, 53.0],
                                      ])
# Data of real = 0 for "WBH" and "WAH"
EXPECTED_WB_WA_HISTORY_DF = pd.DataFrame(
    columns=["DATE", "REAL", "WB", "WA"],
    data=[
        [datetime(2000, 1, 1), 0, 17.0, 13.0],
        [datetime(2000, 2, 1), 0, 27.0, 23.0],
        [datetime(2000, 3, 1), 0, 37.0, 33.0],
        [datetime(2000, 4, 1), 0, 47.0, 43.0],
        [datetime(2000, 5, 1), 0, 57.0, 53.0],
    ])
# Convert date columns to datetime.datetime
make_date_column_datetime_object(INPUT_DF)
make_date_column_datetime_object(EXPECTED_WA_HISTORY_DF)
make_date_column_datetime_object(EXPECTED_WB_WA_HISTORY_DF)

# Dates AFTER year 2262!
# NOTE: datetime.datetime after year 2262 is not converted to pd.Timestamp, thus
# no need to make date column datetime object
INPUT_YEAR_2265_DF = pd.DataFrame(
    columns=["DATE", "REAL", "WA", "WAH", "WB", "WBH"],
    data=[
        [datetime(2265, 1, 1), 4, 11.0, 13.0, 15.0, 17.0],
        [datetime(2265, 2, 1), 4, 21.0, 23.0, 25.0, 27.0],
        [datetime(2265, 3, 1), 4, 31.0, 33.0, 35.0, 37.0],
        [datetime(2265, 4, 1), 4, 41.0, 43.0, 45.0, 47.0],
        [datetime(2265, 5, 1), 4, 51.0, 53.0, 55.0, 57.0],
        [datetime(2265, 1, 1), 2, 110.0, 115.0, 135.0, 139.0],
          [datetime.datetime(2020, 1, 1), 2, 20.0, 60.0],
          [datetime.datetime(2020, 2, 1), 2, 200.0, 600.0],
          [datetime.datetime(2020, 3, 1), 2, 2000.0, 6000.0],
          [datetime.datetime(2020, 4, 1), 2, 20000.0, 60000.0],
          [datetime.datetime(2020, 5, 1), 2, 200000.0, 600000.0],
          [datetime.datetime(2020, 1, 1), 4, 30.0, 70.0],
          [datetime.datetime(2020, 2, 1), 4, 300.0, 700.0],
          [datetime.datetime(2020, 3, 1), 4, 3000.0, 7000.0],
          [datetime.datetime(2020, 4, 1), 4, 30000.0, 70000.0],
          [datetime.datetime(2020, 5, 1), 4, 300000.0, 700000.0],
          [datetime.datetime(2020, 1, 1), 5, 40.0, 80.0],
          [datetime.datetime(2020, 2, 1), 5, 400.0, 800.0],
          [datetime.datetime(2020, 3, 1), 5, 4000.0, 8000.0],
          [datetime.datetime(2020, 4, 1), 5, 40000.0, 80000.0],
          [datetime.datetime(2020, 5, 1), 5, 400000.0, 800000.0]])
make_date_column_datetime_object(INPUT_YEAR_2020_DF)

# pylint: disable=line-too-long
# Columns are:
#           ["DATE", "A"                             "B"                          ]
#           [        MEAN, MIN, MAX, P10, P90, P50,  MEAN, MIN, MAX, P10, P90, P50]
# NOTE: P10 is the 90th percentile and P90 is the 10th percentile per oil-industry convention
EXPECTED_STATISTICS_YEAR_2020_DF = pd.DataFrame(
    columns=pd.MultiIndex.from_tuples([
        ("DATE", ""),
        ("A", StatisticsOptions.MEAN),
        ("A", StatisticsOptions.MIN),
        ("A", StatisticsOptions.MAX),
        ("A", StatisticsOptions.P10),
        ("A", StatisticsOptions.P90),
        ("A", StatisticsOptions.P50),
# Statistics fixture for a single vector: flat (single-level) columns keyed by
# StatisticsOptions members, one row per date.
INPUT_VECTOR_STATISTICS_DF = pd.DataFrame(
    columns=[
        "DATE",
        StatisticsOptions.MEAN,
        StatisticsOptions.MIN,
        StatisticsOptions.MAX,
        StatisticsOptions.P10,
        StatisticsOptions.P90,
        StatisticsOptions.P50,
    ],
    data=[
        [datetime.datetime(2020, 1, 1), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
        [datetime.datetime(2020, 1, 2), 1.5, 2.5, 3.5, 4.5, 5.5, 6.5],
    ],
)
# Convert date column to datetime.datetime
make_date_column_datetime_object(INPUT_VECTOR_STATISTICS_DF)

# *******************************************************************
#####################################################################
#
# UNIT TESTS
#
#####################################################################
# *******************************************************************


def test_crate_vector_observation_traces() -> None:
    first_observation = {
        "date": datetime.datetime(2020, 1, 1),
        "value": 2.0,
        "comment": "first obs",