Example #1
    def test_run_reloaded_simple_pipeline(self):
        pipeline = Pipeline()

        imputer_power_statistics = LinearInterpolater(
            method="nearest", dim="time",
            name="imputer_power")(x=pipeline["load_power_statistics"])
        imputer_price = LinearInterpolater(
            method="nearest", dim="time",
            name="imputer_price")(x=pipeline["price_day_ahead"])
        scaler = SKLearnWrapper(StandardScaler())(x=imputer_price)
        SKLearnWrapper(LinearRegression())(x=scaler,
                                           target1=imputer_price,
                                           target2=imputer_power_statistics)

        pipeline.to_folder("./pipe1")
        sleep(1)

        pipeline2 = Pipeline.from_folder("./pipe1")

        data = pd.read_csv("data/getting_started_data.csv",
                           index_col="time",
                           sep=",",
                           parse_dates=["time"],
                           infer_datetime_format=True)
        train = data[6000:]
        test = data[:6000]
        pipeline2.train(train)
        pipeline2.test(test)
Example #2
    def test_add_pipeline_to_pipeline_and_test(self, fm_mock):
        # Add some steps to the pipeline

        # Assert that the computation mode is set to transform if the ComputationMode was default

        step = MagicMock()
        step.computation_mode = ComputationMode.Default
        step.finished = False
        time = pd.date_range('2000-01-01', freq='24H', periods=7)

        ds = xr.Dataset({'foo': ('time', [2, 3, 4, 5, 6, 7, 8]), 'time': time})

        subpipeline = Pipeline()
        subpipeline.add(module=step)
Example #3
    def test_from_folder(self, isdir_mock, mock_file, json_mock, pickle_mock,
                         fm_mock):
        scaler = StandardScaler()
        linear_regression = LinearRegression()

        isdir_mock.return_value = True
        json_mock.load.return_value = pipeline_json

        pickle_mock.load.side_effect = [scaler, linear_regression]

        pipeline = Pipeline.from_folder("test_pipeline")
        calls_open = [
            call(os.path.join("test_pipeline", "StandardScaler.pickle"), "rb"),
            call(os.path.join("test_pipeline", "LinearRegression.pickle"),
                 "rb"),
            call(os.path.join("test_pipeline", "pipeline.json"), "r")
        ]

        mock_file.assert_has_calls(calls_open, any_order=True)

        json_mock.load.assert_called_once()
        assert pickle_mock.load.call_count == 2

        isdir_mock.assert_called_once()
        self.assertEqual(3, len(pipeline.id_to_step))
Example #4
    def test_add_pipeline_to_pipeline_and_train(self, fm_mock,
                                                create_summary_mock):
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()

        detector(dataset=sub_pipeline["regression"])

        regressor = SKLearnWrapper(LinearRegression(), name="regression")(
            x=self.pipeline["test"], target=self.pipeline["target"])
        sub_pipeline(regression=regressor)

        summary_formatter_mock = MagicMock()
        self.pipeline.train(pd.DataFrame({
            "test": [24, 24],
            "target": [12, 24]
        },
                                         index=pd.to_datetime([
                                             '2015-06-03 00:00:00',
                                             '2015-06-03 01:00:00'
                                         ])),
                            summary_formatter=summary_formatter_mock)

        for step in self.pipeline.id_to_step.values():
            assert step.current_run_setting.computation_mode == ComputationMode.FitTransform

        create_summary_mock.assert_has_calls(
            [call(summary_formatter_mock),
             call(summary_formatter_mock)])
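Example #5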
def create_test_pipeline(modules):
    regressor_svr, regressor_lin_reg = modules

    # Create a test pipeline that works with a batch size of one hour.
    pipeline = Pipeline("../results/test_pipeline", batch=pd.Timedelta("1h"))

    # Add the SVR regressor to the pipeline. This regressor should be called if it is not daytime
    regressor_svr_power_statistics = regressor_svr(ClockShift=pipeline["ClockShift"],
                                                   ClockShift_1=pipeline["ClockShift_1"],
                                                   condition=lambda x, y: not is_daytime(x, y),
                                                   computation_mode=ComputationMode.Transform,
                                                   callbacks=[LinePlotCallback('SVR')])

    # Add the linear regressor to the pipeline. This regressor should be called if it is daytime
    regressor_lin_reg_power_statistics = regressor_lin_reg(ClockShift=pipeline["ClockShift"],
                                                           ClockShift_1=pipeline["ClockShift_1"],
                                                           condition=lambda x, y: is_daytime(x, y),
                                                           computation_mode=ComputationMode.Transform,
                                                           callbacks=[LinePlotCallback('LinearRegression')])

    # Calculate the root mean squared error (RMSE) between the predictions and the true values, and save it as a CSV file
    RmseCalculator()(
        y_hat=(regressor_svr_power_statistics, regressor_lin_reg_power_statistics), y=pipeline["load_power_statistics"],
        callbacks=[LinePlotCallback('RMSE'), CSVCallback('RMSE')])

    return pipeline
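
The helper is_daytime used in the conditions above is not shown on this page. A minimal sketch, assuming the condition receives the step's xarray inputs and that the hours from 6 to 18 count as daytime, might look like this:

def is_daytime(x, y):
    # Hypothetical helper, not part of the original example: treat the batch
    # as daytime if its first timestamp falls between 06:00 and 18:00.
    hour = x.indexes["time"][0].hour
    return 6 <= hour < 18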
Example #6
    def pipe(params):
        keras_model = get_keras_model(params)

        pipeline = Pipeline(path="../results")

        imputer_power_statistics = LinearInterpolater(
            method='nearest', dim='time',
            name='imputer_power')(x=pipeline['load_power_statistics'])

        power_scaler = SKLearnWrapper(module=StandardScaler(),
                                      name='scaler_power')
        scale_power_statistics = power_scaler(x=imputer_power_statistics)

        shift_power_statistics = ClockShift(
            lag=1, name='ClockShift_Lag1')(x=scale_power_statistics)
        shift_power_statistics2 = ClockShift(
            lag=2, name='ClockShift_Lag2')(x=scale_power_statistics)

        keras_wrapper = KerasWrapper(keras_model,
                                     fit_kwargs={'batch_size': 32, 'epochs': 100, 'verbose': 0},
                                     compile_kwargs={'loss': 'mse', 'optimizer': 'Adam', 'metrics': ['mse']}) \
            (ClockShift_Lag1=shift_power_statistics,
             ClockShift_Lag2=shift_power_statistics2,
             target=scale_power_statistics)

        inverse_power_scale_dl = power_scaler(
            x=keras_wrapper,
            computation_mode=ComputationMode.Transform,
            use_inverse_transform=True,
            callbacks=[LinePlotCallback('prediction')])

        rmse_dl = RmseCalculator()(keras_model=inverse_power_scale_dl,
                                   y=pipeline['load_power_statistics'],
                                   callbacks=[CSVCallback('RMSE')])

        pipeline.train(train)
        result = pipeline.test(test)

        return {
            "loss": float(result['RmseCalculator'].values),
            "status": STATUS_OK,
            "eval_time": time.time() - start
        }
Example #7
    def test_load(self, from_folder_mock, fm_mock):
        created_pipeline = MagicMock()
        from_folder_mock.return_value = created_pipeline
        pipeline = Pipeline.load({
            'name': 'Pipeline',
            'class': 'Pipeline',
            'module': 'pywatts.core.pipeline',
            'pipeline_path': 'save_path'
        })

        from_folder_mock.assert_called_once_with("save_path")
        self.assertEqual(created_pipeline, pipeline)
Example #8
    def test_create_and_run_simple_pipeline(self):
        pipeline = Pipeline()
        imputer_power_statistics = LinearInterpolater(
            method="nearest", dim="time",
            name="imputer_power")(x=pipeline["load_power_statistics"])
        imputer_price = LinearInterpolater(
            method="nearest", dim="time",
            name="imputer_price")(x=pipeline["price_day_ahead"])
        scaler = SKLearnWrapper(StandardScaler())(x=imputer_price)
        lin_regression = SKLearnWrapper(LinearRegression())(
            x=scaler, target1=imputer_price, target2=imputer_power_statistics)

        RmseCalculator(name="Load")(y=imputer_power_statistics,
                                    pred=lin_regression["target2"])
        RmseCalculator(name="Price")(y=imputer_price,
                                     pred=lin_regression["target1"])
        data = pd.read_csv("data/getting_started_data.csv",
                           index_col="time",
                           sep=",",
                           parse_dates=["time"],
                           infer_datetime_format=True)
        train = data[6000:]
        test = data[:6000]
        pipeline.train(train)
        pipeline.test(test)
Example #9
    def test_add_pipeline_to_pipeline_and_save(self, open_mock, json_mock,
                                               fm_mock):
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["regressor"])

        regressor = SKLearnWrapper(LinearRegression())(x=self.pipeline["test"])
        sub_pipeline(regression=regressor)

        self.pipeline.to_folder(path="path")

        self.assertEqual(json_mock.dump.call_count, 2)
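Example #10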
def create_preprocessing_pipeline(power_scaler):
    pipeline = Pipeline(path="../results/preprocessing")

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(method="nearest", dim="time",
                                                  name="imputer_power")(x=pipeline["scaler_power"])
    # Scale the data using a standard SKLearn scaler
    scale_power_statistics = power_scaler(x=imputer_power_statistics)

    # Create lagged time series to later be used in the regression
    ClockShift(lag=1)(x=scale_power_statistics)
    ClockShift(lag=2)(x=scale_power_statistics)
    return pipeline
Example #11
    def test_save(self, os_mock, to_folder_mock, fm_mock):
        os_mock.path.join.return_value = "save_path"
        os_mock.path.isdir.return_value = False
        sub_pipeline = Pipeline(batch=pd.Timedelta("1h"))
        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["test"])
        fm_mock = MagicMock()
        fm_mock.basic_path = "path_to_save"
        result = sub_pipeline.save(fm_mock)

        to_folder_mock.assert_called_once_with("save_path")
        os_mock.path.join.assert_called_once_with("path_to_save", "Pipeline")
        self.assertEqual(
            {
                'name': 'Pipeline',
                'class': 'Pipeline',
                'module': 'pywatts.core.pipeline',
                'params': {
                    'batch': '0 days 01:00:00'
                },
                'pipeline_path': 'save_path'
            }, result)
Example #12
    def test_add_pipeline_to_pipeline_and_train(self, fm_mock):
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()

        detector(dataset=sub_pipeline["regression"])

        regressor = SKLearnWrapper(LinearRegression(), name="regression")(
            x=self.pipeline["test"], target=self.pipeline["target"])
        sub_pipeline(regression=regressor)

        self.pipeline.train(
            pd.DataFrame({
                "test": [24, 24],
                "target": [12, 24]
            },
                         index=pd.to_datetime(
                             ['2015-06-03 00:00:00', '2015-06-03 01:00:00'])))

        for step in self.pipeline.id_to_step.values():
            assert step.computation_mode == ComputationMode.FitTransform
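Example #13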
def create_test_pipeline(modules):
    regressor_svr, regressor_lin_reg = modules

    # Create a test pipeline that works with a batch size of one hour.
    pipeline = Pipeline("../results/test_pipeline", batch=pd.Timedelta("1h"))

    # Add the SVR regressor to the pipeline. This regressor should be called if it is not daytime
    regressor_svr_power_statistics = regressor_svr(
        ClockShift=pipeline["ClockShift"],
        ClockShift_1=pipeline["ClockShift_1"],
        condition=lambda x, y: not is_daytime(x, y),
        computation_mode=ComputationMode.Transform,
        callbacks=[LinePlotCallback('SVR')])

    # Add the linear regressor to the pipeline. This regressor should be called if it is daytime
    regressor_lin_reg_power_statistics = regressor_lin_reg(
        ClockShift=pipeline["ClockShift"],
        ClockShift_1=pipeline["ClockShift_1"],
        condition=lambda x, y: is_daytime(x, y),
        computation_mode=ComputationMode.Transform,
        callbacks=[LinePlotCallback('LinearRegression')])

    # TODO: What kind of RMSE has to be used here?
    #   * Rolling would not work, since the complete RMSE should be calculated for each time point
    #   * Summaries do not work, since summaries are only executed once
    #   Is the current solution useful?
    #   Possible solution: window_size=-1 means that the window spans from the start until the current point in time.
    #                      In that case, online learning has to be built such that modules only calculate
    #                      data for the desired/requested time steps.

    # Calculate the rolling root mean squared error (RMSE) between the predictions and the true values, and save it as a CSV file
    RollingRMSE(window_size=1, window_size_unit="d")(
        y_hat=(regressor_svr_power_statistics,
               regressor_lin_reg_power_statistics),
        y=pipeline["load_power_statistics"],
        callbacks=[LinePlotCallback('RMSE'),
                   CSVCallback('RMSE')])

    return pipeline
Example #14
    def test_batch_2H_transform(self, concat_mock, fm_mock):
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})
        pipeline = Pipeline(batch=pd.Timedelta("2h"))
        step_one = MagicMock()
        step_one.get_result.return_value = {"step": da}
        step_one.name = "step"
        result_mock = MagicMock()
        concat_mock.return_value = result_mock
        pipeline.start_steps["foo"] = StartStep("foo"), None
        pipeline.start_steps["foo"][0].last = False
        step_one.further_elements.side_effect = [True, True, True, True, False]
        pipeline.add(module=step_one, input_ids=[1])

        result = pipeline.transform(foo=da)

        self.assertEqual(concat_mock.call_count, 3)
        self.assertEqual(step_one.get_result.call_count, 4)
        self.assertEqual(step_one.further_elements.call_count, 5)
        self.assertEqual({"step": result_mock}, result)
Example #15
    def test_get_params(self, fm_mock):
        result = Pipeline(batch=pd.Timedelta("1h")).get_params()
        self.assertEqual(result, {"batch": pd.Timedelta("1h")})
Example #16
from pywatts.callbacks import CSVCallback, LinePlotCallback
# From pyWATTS the pipeline is imported
from pywatts.core.pipeline import Pipeline
# All modules required for the pipeline are imported
from pywatts.modules import FunctionModule


def custom_multiplication(x: xr.Dataset):
    # Multiply the given dataset by 1000.
    return x * 1000


# The main function is where the pipeline is created and run
if __name__ == "__main__":
    # Create a pipeline
    pipeline = Pipeline(path="../results")

    # Add a custom function to the FunctionModule and add the module to the pipeline
    function_module = FunctionModule(
        custom_multiplication, name="Multiplication")(
            x=pipeline["load_power_statistics"],
            callbacks=[CSVCallback("Mul"),
                       LinePlotCallback("Mul")])

    # Now, the pipeline is complete so we can run it and explore the results
    # Start the pipeline
    df = pd.read_csv("../data/getting_started_data.csv",
                     parse_dates=["time"],
                     infer_datetime_format=True,
                     index_col="time")
Example #17
# Other modules required for the pipeline are imported
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima_model import ARIMA

from pywatts.callbacks import CSVCallback, LinePlotCallback
from pywatts.core.computation_mode import ComputationMode
from pywatts.core.pipeline import Pipeline

# All modules required for the pipeline are imported
from pywatts.modules import CalendarExtraction, CalendarFeature, ClockShift, LinearInterpolater, RmseCalculator, \
    SKLearnWrapper, SmTimeSeriesModelWrapper

if __name__ == "__main__":
    # Create a pipeline
    pipeline = Pipeline(path="../results/statsmodel")

    # Extract dummy calendar features, using holidays from Germany
    cal_features = CalendarExtraction(features=[CalendarFeature.hour, CalendarFeature.weekday, CalendarFeature.month],
                                      continent="Europe", country="Germany"
                                      )(x=pipeline["load_power_statistics"])

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time", name="imputer_power"
    )(x=pipeline["load_power_statistics"])

    # Scale the data using a standard SKLearn scaler
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
    scale_power_statistics = power_scaler(x=imputer_power_statistics)
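
This excerpt also stops mid-pipeline. Judging from the imported modules (ClockShift, SmTimeSeriesModelWrapper) and the parallel examples, a plausible next step is creating the lagged inputs for the time series model; the ARIMA wrapping itself is not reconstructed here:

    # Hypothetical continuation, mirroring the other examples: create a lagged
    # time series that can later be fed into the time series model
    shift_power_statistics = ClockShift(
        lag=1, name="ClockShift_Lag1")(x=scale_power_statistics)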
Example #18
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# From pyWATTS the pipeline is imported
from pywatts.callbacks import LinePlotCallback
from pywatts.core.computation_mode import ComputationMode
from pywatts.core.pipeline import Pipeline
# All modules required for the pipeline are imported
from pywatts.modules import CalendarExtraction, CalendarFeature, ClockShift, LinearInterpolater, SKLearnWrapper
from pywatts.summaries import RMSE

# The main function is where the pipeline is created and run
if __name__ == "__main__":
    # Create a pipeline
    pipeline = Pipeline(path="../results")

    # Extract dummy calendar features, using holidays from Germany
    # NOTE: CalendarExtraction can't return multiple features.
    calendar = CalendarExtraction(continent="Europe",
                                  country="Germany",
                                  features=[
                                      CalendarFeature.month,
                                      CalendarFeature.weekday,
                                      CalendarFeature.weekend
                                  ])(x=pipeline["load_power_statistics"])

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time",
        name="imputer_power")(x=pipeline["load_power_statistics"])
Example #19
    input_2 = layers.Input(
        shape=(1, ),
        name='ClockShift_Lag2')  # layer name must match time series name
    merged = layers.Concatenate(axis=1)([input_1, input_2])
    hidden = layers.Dense(H, input_dim=D_in, activation='tanh',
                          name='hidden')(merged)
    output = layers.Dense(D_out, activation='linear', name='target')(
        hidden)  # layer name must match time series name
    model = Model(inputs=[input_1, input_2], outputs=output)
    return model


if __name__ == "__main__":
    keras_model = get_keras_model()

    pipeline = Pipeline(path="../results")

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time",
        name="imputer_power")(x=pipeline["load_power_statistics"])

    # Scale the data using a standard SKLearn scaler
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
    scale_power_statistics = power_scaler(x=imputer_power_statistics)

    # Create lagged time series to later be used in the regression
    # sampler_module -> 2D time series
    shift_power_statistics = ClockShift(
        lag=1, name="ClockShift_Lag1")(x=scale_power_statistics)
    shift_power_statistics2 = ClockShift(
        lag=2, name="ClockShift_Lag2")(x=scale_power_statistics)
Example #20
    # H is hidden dimension; D_out is output dimension.
    D_in, H, D_out = 2, 10, 1

    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out),
    )

    return model


if __name__ == "__main__":
    pytorch_model = get_sequential_model()

    pipeline = Pipeline(path="../results")

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time",
        name="imputer_power")(x=pipeline["load_power_statistics"])

    # Scale the data using a standard SKLearn scaler
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
    scale_power_statistics = power_scaler(x=imputer_power_statistics)

    # Create lagged time series to later be used in the regression
    shift_power_statistics = ClockShift(
        lag=1, name="ClockShift_Lag1")(x=scale_power_statistics)
    shift_power_statistics2 = ClockShift(
        lag=2, name="ClockShift_Lag2")(x=scale_power_statistics)
if __name__ == "__main__":
    # Read the data via pandas.
    data = pd.read_csv("../data/getting_started_data.csv", parse_dates=["time"], infer_datetime_format=True,
                       index_col="time")

    # Split the data into train and test data.
    train = data[:6000]
    test = data[8700:]

    # Create all modules which are used multiple times.
    regressor_lin_reg = SKLearnWrapper(module=LinearRegression(fit_intercept=True), name="Regression")
    regressor_svr = SKLearnWrapper(module=SVR(), name="Regression")
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")

    # Build a train pipeline. In this pipeline, each step processes all data at once.
    train_pipeline = Pipeline(path="../results/train")

    # Create preprocessing pipeline for the preprocessing steps
    preprocessing_pipeline = create_preprocessing_pipeline(power_scaler)
    preprocessing_pipeline = preprocessing_pipeline(scaler_power=train_pipeline["load_power_statistics"])

    # Add the regressors to the train pipeline
    regressor_lin_reg(ClockShift=preprocessing_pipeline["ClockShift"],
                      ClockShift_1=preprocessing_pipeline["ClockShift_1"],
                      target=train_pipeline["load_power_statistics"],
                      callbacks=[LinePlotCallback('LinearRegression')])
    regressor_svr(ClockShift=preprocessing_pipeline["ClockShift"],
                  ClockShift_1=preprocessing_pipeline["ClockShift_1"],
                  target=train_pipeline["load_power_statistics"],
                  callbacks=[LinePlotCallback('SVR')])
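
The training run itself is cut off in this excerpt. Assuming the standard calls used throughout these examples, it would continue with:

    # Hypothetical continuation: fit all modules of the train pipeline
    train_pipeline.train(train)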
Example #22
class TestPipeline(unittest.TestCase):
    @patch("pywatts.core.pipeline.FileManager")
    def setUp(self, fm_mock) -> None:
        self.fm_mock = fm_mock()
        self.pipeline = Pipeline()

    def tearDown(self) -> None:
        self.pipeline = None

    def test_add_input_as_positional(self):
        # Should fail with a better error message
        SKLearnWrapper(LinearRegression())(x=self.pipeline["input"])

    def test_add_only_module(self):
        SKLearnWrapper(LinearRegression())(x=self.pipeline["input"])
        # One module node plus the start step
        self.assertEqual(len(self.pipeline.id_to_step), 2)

    def test_add_module_which_is_not_in_a_list(self):
        wrapper = SKLearnWrapper(
            LinearRegression())(input=self.pipeline["input"])
        SKLearnWrapper(LinearRegression())(x=wrapper)
        # Two module nodes plus the start step
        self.assertEqual(len(self.pipeline.id_to_step), 3)

    def test_add_pipeline_without_index(self):
        # This should raise an exception, since the pipeline might get multiple columns in the input dataframe
        with self.assertRaises(Exception) as context:
            SKLearnWrapper(StandardScaler())(
                x=self.pipeline)  # This should fail
        self.assertEqual(
            "Adding a pipeline as input might be ambigious. Specifiy the desired column of your dataset by using pipeline[<column_name>]",
            str(context.exception))

    def test_add_module_with_inputs(self):
        scaler1 = SKLearnWrapper(StandardScaler())(x=self.pipeline["x"])
        scaler2 = SKLearnWrapper(StandardScaler())(x=self.pipeline["test1"])
        SKLearnWrapper(LinearRegression())(input_1=scaler1, input_2=scaler2)

        # Three modules plus two start steps
        self.assertEqual(5, len(self.pipeline.id_to_step))

    def test_add_module_with_one_input_without_a_list(self):
        scaler = SKLearnWrapper(StandardScaler())(input=self.pipeline["test"])
        SKLearnWrapper(LinearRegression())(input=scaler)

        # Two modules plus one start step
        self.assertEqual(3, len(self.pipeline.id_to_step))

    @patch('pywatts.core.pipeline.FileManager')
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    def test_to_folder(self, mock_file, json_mock, fm_mock):
        scaler = SKLearnWrapper(StandardScaler())(input=self.pipeline["input"])
        SKLearnWrapper(LinearRegression())(x=scaler)
        fm_mock_object = MagicMock()
        fm_mock.return_value = fm_mock_object
        fm_mock_object.get_path.side_effect = [
            os.path.join('test_pipeline', 'StandardScaler.pickle'),
            os.path.join('test_pipeline', 'LinearRegression.pickle'),
            os.path.join('test_pipeline', 'pipeline.json'),
        ]

        self.pipeline.to_folder("test_pipeline")

        calls_open = [
            call(os.path.join('test_pipeline', 'StandardScaler.pickle'), 'wb'),
            call(os.path.join('test_pipeline', 'LinearRegression.pickle'),
                 'wb'),
            call(os.path.join('test_pipeline', 'pipeline.json'), 'w')
        ]
        mock_file.assert_has_calls(calls_open, any_order=True)
        args, kwargs = json_mock.dump.call_args
        assert kwargs["obj"]["id"] == pipeline_json["id"]
        assert kwargs["obj"]["name"] == pipeline_json["name"]

        assert kwargs["obj"]["modules"] == pipeline_json["modules"]
        assert kwargs["obj"]["steps"] == pipeline_json["steps"]

    @patch('pywatts.core.pipeline.FileManager')
    @patch('pywatts.modules.sklearn_wrapper.pickle')
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    @patch('pywatts.core.pipeline.os.path.isdir')
    def test_from_folder(self, isdir_mock, mock_file, json_mock, pickle_mock,
                         fm_mock):
        scaler = StandardScaler()
        linear_regression = LinearRegression()

        isdir_mock.return_value = True
        json_mock.load.return_value = pipeline_json

        pickle_mock.load.side_effect = [scaler, linear_regression]

        pipeline = Pipeline.from_folder("test_pipeline")
        calls_open = [
            call(os.path.join("test_pipeline", "StandardScaler.pickle"), "rb"),
            call(os.path.join("test_pipeline", "LinearRegression.pickle"),
                 "rb"),
            call(os.path.join("test_pipeline", "pipeline.json"), "r")
        ]

        mock_file.assert_has_calls(calls_open, any_order=True)

        json_mock.load.assert_called_once()
        assert pickle_mock.load.call_count == 2

        isdir_mock.assert_called_once()
        self.assertEqual(3, len(pipeline.id_to_step))

    def test_module_naming_conflict(self):
        # This test should check that modules with the same name do not lead to an error
        # What should this test?
        # self.fail()
        pass

    def test_add_with_target(self):
        SKLearnWrapper(LinearRegression())(input=self.pipeline["input"],
                                           target=self.pipeline["target"])
        self.assertEqual(3, len(self.pipeline.id_to_step))

    def test_multiple_same_module(self):
        reg_module = SKLearnWrapper(module=LinearRegression())
        reg_one = reg_module(x=self.pipeline["test"],
                             target=self.pipeline["target"])
        reg_two = reg_module(x=self.pipeline["test2"],
                             target=self.pipeline["target"])
        detector = MissingValueDetector()
        detector(dataset=reg_one)
        detector(dataset=reg_two)

        # Three start steps (test, test2, target), two regressors, and two detectors
        self.assertEqual(7, len(self.pipeline.id_to_step))
        modules = []
        for element in self.pipeline.id_to_step.values():
            if isinstance(element, Step) and not element.module in modules:
                modules.append(element.module)
        # One SKLearn wrapper, one missing value detector
        self.assertEqual(2, len(modules))

        self.pipeline.train(
            pd.DataFrame(
                {
                    "test": [1, 2, 2, 3, 4],
                    "test2": [2, 2, 2, 2, 2],
                    "target": [2, 2, 4, 4, -5]
                },
                index=pd.DatetimeIndex(
                    pd.date_range('2000-01-01', freq='24H', periods=5))))

    @patch('pywatts.core.pipeline.Pipeline._create_summary')
    @patch('pywatts.core.pipeline.FileManager')
    def test_add_pipeline_to_pipeline_and_train(self, fm_mock,
                                                create_summary_mock):
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()

        detector(dataset=sub_pipeline["regression"])

        regressor = SKLearnWrapper(LinearRegression(), name="regression")(
            x=self.pipeline["test"], target=self.pipeline["target"])
        sub_pipeline(regression=regressor)

        summary_formatter_mock = MagicMock()
        self.pipeline.train(pd.DataFrame({
            "test": [24, 24],
            "target": [12, 24]
        },
                                         index=pd.to_datetime([
                                             '2015-06-03 00:00:00',
                                             '2015-06-03 01:00:00'
                                         ])),
                            summary_formatter=summary_formatter_mock)

        for step in self.pipeline.id_to_step.values():
            assert step.current_run_setting.computation_mode == ComputationMode.FitTransform

        create_summary_mock.assert_has_calls(
            [call(summary_formatter_mock),
             call(summary_formatter_mock)])

    @patch('pywatts.core.pipeline.FileManager')
    def test_add_pipeline_to_pipeline_and_test(self, fm_mock):
        # Add some steps to the pipeline

        # Assert that the computation mode is set to transform if the ComputationMode was default

        step = MagicMock()
        step.computation_mode = ComputationMode.Default
        step.finished = False
        time = pd.date_range('2000-01-01', freq='24H', periods=7)

        ds = xr.Dataset({'foo': ('time', [2, 3, 4, 5, 6, 7, 8]), 'time': time})

        subpipeline = Pipeline()
        subpipeline.add(module=step)

        # BUG: In step_factory.py -> create_step the file_manager of the pipeline is accessed
        # and the pipeline is None...
        # subpipeline(self.pipeline)

        # self.pipeline.test(ds)

        # step.set_computation_mode.assert_called_once_with(ComputationMode.Transform)

        # step.reset.assert_called_once()

    @patch("pywatts.core.pipeline.FileManager")
    @patch('pywatts.core.pipeline.json')
    @patch("builtins.open", new_callable=mock_open)
    def test_add_pipeline_to_pipeline_and_save(self, open_mock, json_mock,
                                               fm_mock):
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["regressor"])

        regressor = SKLearnWrapper(LinearRegression())(x=self.pipeline["test"])
        sub_pipeline(regression=regressor)

        self.pipeline.to_folder(path="path")

        self.assertEqual(json_mock.dump.call_count, 2)

    def create_summary_in_subpipelines(self):
        assert False

    @patch('pywatts.core.pipeline.FileManager')
    def test__collect_batch_results_naming_conflict(self, fm_mock):
        step_one = MagicMock()
        step_one.name = "step"
        step_two = MagicMock()
        step_two.name = "step"
        result_step_one = MagicMock()
        result_step_two = MagicMock()
        merged_result = {"step": result_step_one, "step_1": result_step_two}

        step_one.get_result.return_value = {"step": result_step_one}
        step_two.get_result.return_value = {"step_1": result_step_two}

        result = self.pipeline._collect_results([step_one, step_two])

        # Assert that the steps are correctly called.
        step_one.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)
        step_two.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)

        # Assert return value is correct
        self.assertEqual(merged_result, result)

    @patch("pywatts.core.pipeline.FileManager")
    def test_get_params(self, fm_mock):
        result = Pipeline(batch=pd.Timedelta("1h")).get_params()
        self.assertEqual(result, {"batch": pd.Timedelta("1h")})

    def test_set_params(self):
        self.pipeline.set_params(batch=pd.Timedelta("2h"))
        self.assertEqual(self.pipeline.get_params(),
                         {"batch": pd.Timedelta("2h")})

    def test__collect_batch_results(self):
        step_one = MagicMock()
        step_one.name = "step_one"
        step_two = MagicMock()
        step_two.name = "step_two"
        result_step_one = MagicMock()
        result_step_two = MagicMock()
        merged_result = {
            "step_one": result_step_one,
            "step_two": result_step_two
        }

        step_one.get_result.return_value = {"step_one": result_step_one}
        step_two.get_result.return_value = {"step_two": result_step_two}

        result = self.pipeline._collect_results([step_one, step_two])

        # Assert that the steps are correctly called.
        step_one.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)
        step_two.get_result.assert_called_once_with(None,
                                                    None,
                                                    return_all=True)

        # Assert return value is correct
        self.assertEqual(merged_result, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batched_pipeline(self, concat_mock, fm_mock):
        # Add some steps to the pipeline

        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        # Assert that the computation mode is set to transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.run_setting = RunSetting(ComputationMode.Default)
        first_step.finished = False
        first_step.further_elements.side_effect = [
            True, True, True, True, False
        ]

        first_step.get_result.return_value = {"one": da}
        self.pipeline.set_params(pd.Timedelta("24h"))
        self.pipeline.add(module=first_step)

        data = pd.DataFrame({
            "test": [1, 2, 2, 3],
            "test2": [2, 2, 2, 2]
        },
                            index=pd.DatetimeIndex(
                                pd.date_range('2000-01-01',
                                              freq='24H',
                                              periods=4)))
        self.pipeline.test(data)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)
        calls = [
            call(pd.Timestamp('2000-01-01 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-02 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-02 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-03 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-03 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-04 00:00:00', freq='24H'),
                 return_all=True),
            call(pd.Timestamp('2000-01-04 00:00:00', freq='24H'),
                 pd.Timestamp('2000-01-05 00:00:00', freq='24H'),
                 return_all=True),
        ]
        first_step.get_result.assert_has_calls(calls, any_order=True)
        self.assertEqual(concat_mock.call_count, 3)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batch_2H_transform(self, concat_mock, fm_mock):
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})
        pipeline = Pipeline(batch=pd.Timedelta("2h"))
        step_one = MagicMock()
        step_one.get_result.return_value = {"step": da}
        step_one.name = "step"
        result_mock = MagicMock()
        concat_mock.return_value = result_mock
        pipeline.start_steps["foo"] = StartStep("foo"), None
        pipeline.start_steps["foo"][0].last = False
        step_one.further_elements.side_effect = [True, True, True, True, False]
        pipeline.add(module=step_one, input_ids=[1])

        result = pipeline.transform(foo=da)

        self.assertEqual(concat_mock.call_count, 3)
        self.assertEqual(step_one.get_result.call_count, 4)
        self.assertEqual(step_one.further_elements.call_count, 5)
        self.assertEqual({"step": result_mock}, result)

    @patch('pywatts.core.pipeline.FileManager')
    @patch("pywatts.core.pipeline._get_time_indexes", return_value=["time"])
    def test_transform_pipeline(self, get_time_indexes_mock, fm_mock):
        input_mock = MagicMock()
        input_mock.indexes = {"time": ["20.12.2020"]}
        step_two = MagicMock()
        result_mock = MagicMock()
        step_two.name = "mock"
        step_two.get_result.return_value = {"mock": result_mock}
        self.pipeline.add(module=step_two, input_ids=[1])

        result = self.pipeline.transform(x=input_mock)

        step_two.get_result.assert_called_once_with("20.12.2020",
                                                    None,
                                                    return_all=True)
        get_time_indexes_mock.assert_called_once_with({"x": input_mock})
        self.assertEqual({"mock": result_mock}, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.Pipeline.from_folder")
    def test_load(self, from_folder_mock, fm_mock):
        created_pipeline = MagicMock()
        from_folder_mock.return_value = created_pipeline
        pipeline = Pipeline.load({
            'name': 'Pipeline',
            'class': 'Pipeline',
            'module': 'pywatts.core.pipeline',
            'pipeline_path': 'save_path'
        })

        from_folder_mock.assert_called_once_with("save_path")
        self.assertEqual(created_pipeline, pipeline)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.Pipeline.to_folder")
    @patch("pywatts.core.pipeline.os")
    def test_save(self, os_mock, to_folder_mock, fm_mock):
        os_mock.path.join.return_value = "save_path"
        os_mock.path.isdir.return_value = False
        sub_pipeline = Pipeline(batch=pd.Timedelta("1h"))
        detector = MissingValueDetector()
        detector(dataset=sub_pipeline["test"])
        fm_mock = MagicMock()
        fm_mock.basic_path = "path_to_save"
        result = sub_pipeline.save(fm_mock)

        to_folder_mock.assert_called_once_with("save_path")
        os_mock.path.join.assert_called_once_with("path_to_save", "Pipeline")
        self.assertEqual(
            {
                'name': 'Pipeline',
                'class': 'Pipeline',
                'module': 'pywatts.core.pipeline',
                'params': {
                    'batch': '0 days 01:00:00'
                },
                'pipeline_path': 'save_path'
            }, result)

    @patch("pywatts.core.pipeline.FileManager")
    @patch("pywatts.core.pipeline.xr.concat")
    def test_batch_1_transform(self, concat_mock, fm_mock):
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})
        pipeline = Pipeline(batch=pd.Timedelta("1h"))
        step_one = MagicMock()
        step_one.get_result.return_value = {"step": da}
        step_one.name = "step"
        result_mock = MagicMock()
        concat_mock.return_value = result_mock
        pipeline.start_steps["foo"] = StartStep("foo"), None
        pipeline.start_steps["foo"][0].last = False
        step_one.further_elements.side_effect = [
            True, True, True, True, True, True, True, False
        ]
        pipeline.add(module=step_one, input_ids=[1])

        result = pipeline.transform(foo=da)

        self.assertEqual(concat_mock.call_count, 6)
        self.assertEqual(step_one.get_result.call_count, 7)
        self.assertEqual(step_one.further_elements.call_count, 8)
        self.assertEqual({"step": result_mock}, result)

    @patch('pywatts.core.pipeline.FileManager')
    def test_test(self, fm_mock):
        # Add some steps to the pipeline

        # Assert that the computation mode is set to transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.computation_mode = ComputationMode.Default
        first_step.finished = False
        time = pd.date_range('2000-01-01', freq='1H', periods=7)

        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        first_step.get_result.return_value = {"first": da}
        second_step = MagicMock()
        second_step.computation_mode = ComputationMode.Train
        second_step.finished = False
        second_step.get_result.return_value = {"Second": da}

        self.pipeline.add(module=first_step)
        self.pipeline.add(module=second_step)

        self.pipeline.test(
            pd.DataFrame({
                "test": [1, 2, 2, 3, 4],
                "test2": [2, 2, 2, 2, 2]
            },
                         index=pd.DatetimeIndex(
                             pd.date_range('2000-01-01', freq='24H',
                                           periods=5))))

        first_step.get_result.assert_called_once_with(pd.Timestamp(
            '2000-01-01 00:00:00', freq='24H'),
                                                      None,
                                                      return_all=True)
        second_step.get_result.assert_called_once_with(pd.Timestamp(
            '2000-01-01 00:00:00', freq='24H'),
                                                       None,
                                                       return_all=True)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)
        second_step.set_run_setting.assert_called_once()
        self.assertEqual(
            second_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)

        first_step.reset.assert_called_once()
        second_step.reset.assert_called_once()

    @patch('pywatts.core.pipeline.FileManager')
    def test_train(self, fmmock):
        # Add some steps to the pipeline
        time = pd.date_range('2000-01-01', freq='1H', periods=7)

        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        # Assert that the computation is set to fit_transform if the ComputationMode was default
        first_step = MagicMock()
        first_step.computation_mode = ComputationMode.Default
        first_step.finished = False
        first_step.get_result.return_value = {"first": da}

        second_step = MagicMock()
        second_step.computation_mode = ComputationMode.Train
        second_step.finished = False
        second_step.get_result.return_value = {"second": da}

        self.pipeline.add(module=first_step)
        self.pipeline.add(module=second_step)

        data = pd.DataFrame({
            "test": [1, 2, 2, 3, 4],
            "test2": [2, 2, 2, 2, 2]
        },
                            index=pd.DatetimeIndex(
                                pd.date_range('2000-01-01',
                                              freq='24H',
                                              periods=5)))
        result, summary = self.pipeline.train(data, summary=True)

        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.FitTransform)
        second_step.set_run_setting.assert_called_once()
        self.assertEqual(
            second_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.FitTransform)

        first_step.get_result.assert_called_once_with(pd.Timestamp(
            '2000-01-01 00:00:00', freq='24H'),
                                                      None,
                                                      return_all=True)
        second_step.get_result.assert_called_once_with(pd.Timestamp(
            '2000-01-01 00:00:00', freq='24H'),
                                                       None,
                                                       return_all=True)

        first_step.reset.assert_called_once()
        second_step.reset.assert_called_once()
        xr.testing.assert_equal(result["second"], da)

    @patch("builtins.open", new_callable=mock_open)
    def test_horizon_greater_one_regression_inclusive_summary_file(
            self, open_mock):
        lin_reg = LinearRegression()
        self.fm_mock.get_path.return_value = "summary_path"

        multi_regressor = SKLearnWrapper(lin_reg)(
            foo=self.pipeline["foo"],
            target=self.pipeline["target"],
            target2=self.pipeline["target2"])
        RMSE()(y=self.pipeline["target"], prediction=multi_regressor["target"])

        time = pd.date_range('2000-01-01', freq='24H', periods=5)

        foo = xr.DataArray([1, 2, 3, 4, 5],
                           dims=["time"],
                           coords={'time': time})
        target = xr.DataArray([[2, 3], [2, 4], [2, 5], [2, 6], [2, 7]],
                              dims=["time", "horizon"],
                              coords={
                                  'time': time,
                                  "horizon": [1, 2]
                              })
        target2 = xr.DataArray([3, 3, 3, 3, 3],
                               dims=["time"],
                               coords={'time': time})

        ds = xr.Dataset({'foo': foo, "target": target, "target2": target2})

        result, summary = self.pipeline.train(ds, summary=True)

        self.assertTrue("Training Time" in summary)
        self.assertTrue("RMSE" in summary)

        self.fm_mock.get_path.assert_called_once_with("summary.md")
        open_mock().__enter__.return_value.write.assert_called_once_with(
            summary)

        self.assertTrue("target" in result.keys())
Example #23
    def setUp(self, fm_mock) -> None:
        self.fm_mock = fm_mock()
        self.pipeline = Pipeline()
Example #24
from pywatts.modules.trend_extraction import TrendExtraction
from pywatts.utils._xarray_time_series_utils import numpy_to_xarray
from pywatts.wrapper.function_module import FunctionModule

# NOTE: If you choose a horizon greater than 24, you have to shift the profile. Otherwise, future values may be considered when calculating the profile.
HORIZON = 24


def get_diff(x, profile):
    return numpy_to_xarray(x.values - profile.values, x, "difference")


drift_occured = False

if __name__ == "__main__":
    pipeline = Pipeline("pnn_pipeline")

    profile_moving = RollingMean(
        window_size=28,
        group_by=RollingGroupBy.WorkdayWeekend)(x=(pipeline["BldgX"]))
    difference = FunctionModule(get_diff)(x=pipeline["BldgX"],
                                          profile=profile_moving)
    trend = TrendExtraction(168, 5)(x=difference)
    calendar = CalendarExtraction(
        country="BadenWurttemberg",
        features=[
            CalendarFeature.hour_sine, CalendarFeature.month_sine,
            CalendarFeature.day_sine, CalendarFeature.monday,
            CalendarFeature.tuesday, CalendarFeature.wednesday,
            CalendarFeature.thursday, CalendarFeature.friday,
            CalendarFeature.hour_cos, CalendarFeature.day_cos,