Ejemplo n.º 1
0
    def test_add_module_with_inputs(self):
        """Feed two scaled pipeline columns into one regressor and count the steps."""
        first_scaled = SKLearnWrapper(StandardScaler())(x=self.pipeline["x"])
        second_scaled = SKLearnWrapper(StandardScaler())(
            x=self.pipeline["test1"])
        regression = SKLearnWrapper(LinearRegression())
        regression(input_1=first_scaled, input_2=second_scaled)

        # Three modules plus the steps the pipeline registers for their
        # two inputs: five entries in total.
        registered_steps = len(self.pipeline.id_to_step)
        self.assertEqual(5, registered_steps)
Ejemplo n.º 2
0
    def test_transform_multiple_output(self):
        """Fit a multi-output regressor on two constant targets and predict both.

        The wrapper is fitted on five daily samples and then asked to
        transform a single later sample; each target must come back with
        its constant value and shape ``(1, 1)``.
        """
        wrapper = SKLearnWrapper(
            module=MultiOutputRegressor(LinearRegression()))
        train_index = pd.date_range('2000-01-01', freq='24H', periods=5)
        predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

        def time_series(values, index):
            # All fixtures share the single "time" dimension.
            return xr.DataArray(values, dims=["time"], coords={'time': index})

        train_input = time_series([1, 2, 3, 4, 5], train_index)
        predict_input = time_series([1], predict_index)
        first_target = time_series([2, 2, 2, 2, 2], train_index)
        second_target = time_series([3, 3, 3, 3, 3], train_index)

        wrapper.fit(bar=train_input,
                    target1=first_target,
                    target2=second_target)
        result = wrapper.transform(bar=predict_input)

        for key, expected in (("target1", 2.0), ("target2", 3.0)):
            self.assertAlmostEqual(result[key].values[0], expected)
        for key in ("target1", "target2"):
            self.assertEqual(result[key].shape, (1, 1))
Ejemplo n.º 3
0
    def test_to_folder(self, mock_file, json_mock, fm_mock):
        """Check that ``to_folder`` pickles both modules and dumps the pipeline JSON.

        :param mock_file: mock standing in for ``open``; its calls are inspected.
        :param json_mock: mock standing in for the ``json`` module.
        :param fm_mock: mock of the file-manager class; its return value
            hands out the target paths.

        The dumped object is compared field by field against
        ``pipeline_json``, which is defined outside this method.
        """
        scaler = SKLearnWrapper(StandardScaler())(input=self.pipeline["input"])
        SKLearnWrapper(LinearRegression())(x=scaler)

        # Build every expected path once so the mock set-up and the
        # assertions cannot drift apart.
        scaler_path = os.path.join('test_pipeline', 'StandardScaler.pickle')
        regression_path = os.path.join('test_pipeline',
                                       'LinearRegression.pickle')
        json_path = os.path.join('test_pipeline', 'pipeline.json')

        fm_mock_object = MagicMock()
        fm_mock.return_value = fm_mock_object
        fm_mock_object.get_path.side_effect = [
            scaler_path,
            regression_path,
            json_path,
        ]

        self.pipeline.to_folder("test_pipeline")

        calls_open = [
            call(scaler_path, 'wb'),
            call(regression_path, 'wb'),
            call(json_path, 'w'),
        ]
        mock_file.assert_has_calls(calls_open, any_order=True)

        # Only the keyword arguments of the dump call are of interest.
        _, dump_kwargs = json_mock.dump.call_args
        dumped = dump_kwargs["obj"]
        # assertEqual (instead of a bare assert) survives ``python -O`` and
        # prints a readable diff when the structures differ.
        for key in ("id", "name", "modules", "steps"):
            self.assertEqual(dumped[key], pipeline_json[key])
Ejemplo n.º 4
0
    def test_fit_regression_multiple_datavariables(self):
        """Fit a linear regression on two input variables and predict one sample."""
        train_index = pd.date_range('2000-01-01', freq='24H', periods=7)
        predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

        def time_series(values, index):
            # All fixtures share the single "time" dimension.
            return xr.DataArray(values, dims=["time"], coords={'time': index})

        train_bar = time_series([2, 2, 2, 2, 3, 3, 3], train_index)
        train_foo = time_series([4, 4, 4, 4, 6, 6, 6], train_index)
        train_target = time_series([6, 6, 6, 6, 9, 9, 9], train_index)

        lin_reg = LinearRegression()
        wrapper = SKLearnWrapper(module=lin_reg)
        # The estimator must not expose fitted coefficients before fit().
        self.assertNotIn("coef_", lin_reg.__dir__())

        wrapper.fit(bar=train_bar, foo=train_foo, target=train_target)
        predict_bar = time_series([2], predict_index)
        predict_foo = time_series([4], predict_index)
        result = wrapper.transform(bar=predict_bar, foo=predict_foo)

        prediction = result["target"]
        self.assertAlmostEqual(prediction.values[0, 0], 6.0)
        self.assertEqual(prediction.shape, (1, 1))
Ejemplo n.º 5
0
    def test_fit_TransformerMixin(self):
        """Fitting the wrapper must fit the wrapped StandardScaler."""
        scaler = StandardScaler()
        wrapper = SKLearnWrapper(module=scaler)
        # Before fitting, the scaler exposes no learned mean.
        self.assertNotIn("mean_", scaler.__dir__())

        samples = xr.DataArray([1, 2, 3, 4, 5])
        wrapper.fit(test=samples)

        # After fitting, the learned mean is present and set.
        self.assertIn("mean_", scaler.__dir__())
        self.assertIsNotNone(scaler.mean_)
Ejemplo n.º 6
0
    def test_fit_RegressorMixin(self):
        """Fitting the wrapper must fit the wrapped LinearRegression."""
        lin_reg = LinearRegression()
        wrapper = SKLearnWrapper(module=lin_reg)
        # Before fitting, the regressor exposes no coefficients.
        self.assertNotIn("coef_", lin_reg.__dir__())

        samples = xr.DataArray([1, 2, 3, 4, 5])
        labels = xr.DataArray([2, 2, 2, 2, 2])
        wrapper.fit(test=samples, target=labels)

        # After fitting, the coefficients are present and set.
        self.assertIn("coef_", lin_reg.__dir__())
        self.assertIsNotNone(lin_reg.coef_)
Ejemplo n.º 7
0
    def test_transform_RegressorMixin(self):
        """Fit an SVR on a constant target and check the single prediction.

        The shape is verified first so that the scalar element can be read
        with ``values[0, 0]``. The prediction is then compared with
        ``assertAlmostEqual`` rather than exact float equality in a bare
        ``assert``: that tolerates floating-point rounding and is not
        stripped when Python runs with ``-O``.
        """
        svr = SVR()
        wrapper = SKLearnWrapper(module=svr)
        time = pd.date_range('2000-01-08', freq='24H', periods=1)
        bar = xr.DataArray([1], dims=["time"], coords={'time': time})

        wrapper.fit(test=xr.DataArray([1, 2, 3, 4, 5]),
                    target=xr.DataArray([2, 2, 2, 2, 2]))

        result = wrapper.transform(bar=bar)
        self.assertEqual(result["target"].shape, (1, 1))
        self.assertAlmostEqual(result["target"].values[0, 0], 2.0)
Ejemplo n.º 8
0
    def test_multiple_same_module(self):
        """Reuse one regressor wrapper and one detector on two branches.

        Both modules are called twice, so the pipeline must hold a
        separate step per call while the steps share the same two module
        instances. The pipeline is finally trained on a small frame.
        """
        reg_module = SKLearnWrapper(module=LinearRegression())
        reg_one = reg_module(x=self.pipeline["test"],
                             target=self.pipeline["target"])
        reg_two = reg_module(x=self.pipeline["test2"],
                             target=self.pipeline["target"])
        detector = MissingValueDetector()
        detector(dataset=reg_one)
        detector(dataset=reg_two)

        # Three start steps (test, test2, target), two regressor steps and
        # two detector steps.
        self.assertEqual(7, len(self.pipeline.id_to_step))

        # Collect the distinct module instances behind the steps. A list is
        # used so the modules are compared with == and need not be hashable.
        modules = []
        for element in self.pipeline.id_to_step.values():
            if isinstance(element, Step) and element.module not in modules:
                modules.append(element.module)
        # One sklearn wrapper and one missing value detector.
        self.assertEqual(2, len(modules))

        self.pipeline.train(
            pd.DataFrame(
                {
                    "test": [1, 2, 2, 3, 4],
                    "test2": [2, 2, 2, 2, 2],
                    "target": [2, 2, 4, 4, -5]
                },
                index=pd.DatetimeIndex(
                    pd.date_range('2000-01-01', freq='24H', periods=5))))
Ejemplo n.º 9
0
    def test_add_pipeline_to_pipeline_and_train(self, fm_mock,
                                                create_summary_mock):
        """Train a pipeline that contains a sub-pipeline and check the run state.

        :param fm_mock: injected file-manager mock (not inspected here).
        :param create_summary_mock: mock of the summary creation routine;
            it must be called twice with the summary formatter.

        After training, every step of the outer pipeline must be in
        ``ComputationMode.FitTransform``.
        """
        sub_pipeline = Pipeline()

        detector = MissingValueDetector()

        detector(dataset=sub_pipeline["regression"])

        regressor = SKLearnWrapper(LinearRegression(), name="regression")(
            x=self.pipeline["test"], target=self.pipeline["target"])
        sub_pipeline(regression=regressor)

        summary_formatter_mock = MagicMock()
        train_data = pd.DataFrame(
            {
                "test": [24, 24],
                "target": [12, 24]
            },
            index=pd.to_datetime([
                '2015-06-03 00:00:00',
                '2015-06-03 01:00:00'
            ]))
        self.pipeline.train(train_data,
                            summary_formatter=summary_formatter_mock)

        # assertEqual instead of a bare assert: it is not stripped under
        # ``python -O`` and reports both values on failure.
        for step in self.pipeline.id_to_step.values():
            self.assertEqual(step.current_run_setting.computation_mode,
                             ComputationMode.FitTransform)

        create_summary_mock.assert_has_calls(
            [call(summary_formatter_mock),
             call(summary_formatter_mock)])
Ejemplo n.º 10
0
    def test_create_and_run_simple_pipeline(self):
        """Build an impute/scale/regress pipeline with two RMSE steps and run it.

        The pipeline is trained on the rows from index 6000 onwards of the
        fixture CSV and tested on the first 6000 rows.
        """
        pipeline = Pipeline()

        power_imputer = LinearInterpolater(method="nearest",
                                           dim="time",
                                           name="imputer_power")
        imputer_power_statistics = power_imputer(
            x=pipeline["load_power_statistics"])

        price_imputer = LinearInterpolater(method="nearest",
                                           dim="time",
                                           name="imputer_price")
        imputer_price = price_imputer(x=pipeline["price_day_ahead"])

        scaler = SKLearnWrapper(StandardScaler())(x=imputer_price)
        regression = SKLearnWrapper(LinearRegression())
        lin_regression = regression(x=scaler,
                                    target1=imputer_price,
                                    target2=imputer_power_statistics)

        # One error metric per regression target.
        RMSE(name="Load")(y=imputer_power_statistics,
                          pred=lin_regression["target2"])
        RMSE(name="Price")(y=imputer_price,
                           pred=lin_regression["target1"])

        data = pd.read_csv(f"{FIXTURE_DIR}/getting_started_data.csv",
                           index_col="time",
                           sep=",",
                           parse_dates=["time"],
                           infer_datetime_format=True)
        train, test = data[6000:], data[:6000]
        pipeline.train(train)
        pipeline.test(test)
Ejemplo n.º 11
0
 def test_add_pipeline_without_index(self):
     """Passing the pipeline itself (not a column of it) as input must raise."""
     # The message is compared verbatim with the text raised by the library.
     expected_message = (
         "Adding a pipeline as input might be ambigious. Specifiy the desired column of your dataset by using pipeline[<column_name>]"
     )
     # The pipeline may receive several columns in its input dataframe, so
     # using it directly as a module input is expected to fail.
     with self.assertRaises(Exception) as raised:
         SKLearnWrapper(StandardScaler())(x=self.pipeline)
     self.assertEqual(expected_message, str(raised.exception))
Ejemplo n.º 12
0
    def test_fit_ClassifierMixin(self):
        """Fit an SVC on a two-class series and classify one new sample.

        The training labels are 0 for the inputs 1 and 2 and 1 for the
        inputs 3 to 5; the test expects the input 1 to be classified as 0.
        """
        svc = SVC()
        wrapper = SKLearnWrapper(module=svc)
        time = pd.date_range('2000-01-01', freq='24H', periods=5)
        time2 = pd.date_range('2000-01-08', freq='24H', periods=1)
        bar = xr.DataArray([1, 2, 3, 4, 5],
                           dims=["time"],
                           coords={'time': time})
        foo = xr.DataArray([1], dims=["time"], coords={'time': time2})
        target = xr.DataArray([0, 0, 1, 1, 1],
                              dims=["time"],
                              coords={'time': time})

        wrapper.fit(bar=bar, target=target)

        result = wrapper.transform(bar=foo)
        # assertEqual instead of a bare assert: it is not stripped under
        # ``python -O`` and reports the mismatching values on failure.
        self.assertEqual(result["target"].values[0], 0)
        self.assertEqual(result["target"].shape, (1, 1))
Ejemplo n.º 13
0
    def test_run_reloaded_simple_pipeline(self):
        """Store a pipeline to disk, reload it and run the reloaded copy.

        The pipeline is written to ``./pipe1``, read back with
        ``Pipeline.from_folder`` and then trained and tested on the
        fixture CSV.
        """
        pipeline = Pipeline()

        power_imputer = LinearInterpolater(method="nearest",
                                           dim="time",
                                           name="imputer_power")
        imputer_power_statistics = power_imputer(
            x=pipeline["load_power_statistics"])

        price_imputer = LinearInterpolater(method="nearest",
                                           dim="time",
                                           name="imputer_price")
        imputer_price = price_imputer(x=pipeline["price_day_ahead"])

        scaler = SKLearnWrapper(StandardScaler())(x=imputer_price)
        regression = SKLearnWrapper(LinearRegression())
        regression(x=scaler,
                   target1=imputer_price,
                   target2=imputer_power_statistics)

        pipeline.to_folder("./pipe1")
        # NOTE(review): presumably gives the filesystem time to settle before
        # the reload; confirm whether this pause is still required.
        sleep(1)

        pipeline2 = Pipeline.from_folder("./pipe1")

        data = pd.read_csv(f"{FIXTURE_DIR}/getting_started_data.csv",
                           index_col="time",
                           sep=",",
                           parse_dates=["time"],
                           infer_datetime_format=True)
        train, test = data[6000:], data[:6000]
        pipeline2.train(train)
        pipeline2.test(test)
Ejemplo n.º 14
0
    def test_DensityMixin(self):
        """Fit a two-component Gaussian mixture and compare two transformed points.

        The inputs 5 and 2 must lead to different outputs, and each output
        must have shape ``(1,)``.
        """
        # A fixed seed keeps the randomly initialised mixture, and with it
        # the test, reproducible between runs.
        gauss_density = GaussianMixture(n_components=2, random_state=0)
        wrapper = SKLearnWrapper(module=gauss_density)

        time = pd.date_range('2000-01-01', freq='24H', periods=10)
        time2 = pd.date_range('2000-01-08', freq='24H', periods=1)

        bar = xr.DataArray([2, 4, 5, 4, 3, 2, 1, 5, 5, 5],
                           dims=["time"],
                           coords={'time': time})
        wrapper.fit(bar=bar)

        bar1 = xr.DataArray([5], dims=["time"], coords={'time': time2})
        bar2 = xr.DataArray([2], dims=["time"], coords={'time': time2})

        result1 = wrapper.transform(bar=bar1)
        result0 = wrapper.transform(bar=bar2)

        # assertNotEqual instead of a bare assert: it is not stripped under
        # ``python -O`` and reports both values on failure.
        self.assertNotEqual(result1.values[0], result0.values[0])

        self.assertEqual(result1.shape, (1, ))
        self.assertEqual(result0.shape, (1, ))
Ejemplo n.º 15
0
    def test_add_pipeline_to_pipeline_and_save(self, open_mock, json_mock,
                                               fm_mock):
        """Saving a pipeline that contains a sub-pipeline must dump JSON twice."""
        inner_pipeline = Pipeline()

        # NOTE(review): the detector reads the column "regressor" while the
        # sub-pipeline is fed with the keyword "regression" below - confirm
        # whether this mismatch is intended.
        MissingValueDetector()(dataset=inner_pipeline["regressor"])

        regression_wrapper = SKLearnWrapper(LinearRegression())
        regressor = regression_wrapper(x=self.pipeline["test"])
        inner_pipeline(regression=regressor)

        self.pipeline.to_folder(path="path")

        dump_calls = json_mock.dump.call_count
        self.assertEqual(dump_calls, 2)
Ejemplo n.º 16
0
    def test_fit_ClusterMixin(self):
        """Fit a two-cluster KMeans and check that two points are separated.

        The inputs 5 and 2 must yield outputs whose largest entry sits at
        different positions, and each output must have shape ``(1, 2)``.
        """
        # A fixed seed keeps the randomly initialised centroids, and with
        # them the test, reproducible between runs.
        kmeans = KMeans(n_clusters=2, random_state=0)
        wrapper = SKLearnWrapper(module=kmeans)

        time = pd.date_range('2000-01-01', freq='24H', periods=10)
        time2 = pd.date_range('2000-01-08', freq='24H', periods=1)

        bar = xr.DataArray([2, 4, 5, 4, 2, 2, 1, 5, 5, 5],
                           dims=["time"],
                           coords={'time': time})
        foo1 = xr.DataArray([5], dims=["time"], coords={'time': time2})
        foo2 = xr.DataArray([2], dims=["time"], coords={'time': time2})

        wrapper.fit(bar=bar)

        result1 = wrapper.transform(foo=foo1)
        result0 = wrapper.transform(foo=foo2)

        # Assert that both tested datapoints are in different clusters.
        # assertNotEqual is used because a bare assert is stripped under -O.
        self.assertNotEqual(result1.values[0].argmax(),
                            result0.values[0].argmax())

        self.assertEqual(result1.shape, (1, 2))
        self.assertEqual(result0.shape, (1, 2))
Ejemplo n.º 17
0
    def test_horizon_greater_one_regression_inclusive_summary_file(
            self, open_mock):
        """Train with a two-horizon target and verify the written summary.

        :param open_mock: mock standing in for ``open``; the summary text
            must be written through it exactly once.
        """
        self.fm_mock.get_path.return_value = "summary_path"

        regression = SKLearnWrapper(LinearRegression())
        multi_regressor = regression(foo=self.pipeline["foo"],
                                     target=self.pipeline["target"],
                                     target2=self.pipeline["target2"])
        RMSE()(y=self.pipeline["target"], prediction=multi_regressor["target"])

        time = pd.date_range('2000-01-01', freq='24H', periods=5)
        time_coords = {'time': time}

        foo = xr.DataArray([1, 2, 3, 4, 5], dims=["time"], coords=time_coords)
        # The first target carries an additional "horizon" dimension of
        # length two.
        target = xr.DataArray([[2, 3], [2, 4], [2, 5], [2, 6], [2, 7]],
                              dims=["time", "horizon"],
                              coords={
                                  'time': time,
                                  "horizon": [1, 2]
                              })
        target2 = xr.DataArray([3, 3, 3, 3, 3],
                               dims=["time"],
                               coords=time_coords)

        dataset = xr.Dataset({'foo': foo, "target": target, "target2": target2})

        result, summary = self.pipeline.train(dataset, summary=True)

        for section in ("Training Time", "RMSE"):
            self.assertIn(section, summary)

        self.fm_mock.get_path.assert_called_once_with("summary.md")
        written = open_mock().__enter__.return_value.write
        written.assert_called_once_with(summary)

        self.assertIn("target", result.keys())
Ejemplo n.º 18
0
    return pipeline


if __name__ == "__main__":
    # Read the data via pandas.
    data = pd.read_csv("../data/getting_started_data.csv",
                       parse_dates=["time"],
                       infer_datetime_format=True,
                       index_col="time")

    # Split the data into train and test data.
    train = data[:6000]
    test = data[8700:]

    # Create all modules which are used multiple times.
    regressor_lin_reg = SKLearnWrapper(
        module=LinearRegression(fit_intercept=True), name="Regression")
    regressor_svr = SKLearnWrapper(module=SVR(), name="Regression")
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")

    # Build a train pipeline. In this pipeline, each step processes all data at once.
    train_pipeline = Pipeline(path="../results/train")

    # Create preprocessing pipeline for the preprocessing steps
    preprocessing_pipeline = create_preprocessing_pipeline(power_scaler)
    preprocessing_pipeline = preprocessing_pipeline(
        scaler_power=train_pipeline["load_power_statistics"])

    # Add the regressors to the train pipeline
    regressor_lin_reg(ClockShift=preprocessing_pipeline["ClockShift"],
                      ClockShift_1=preprocessing_pipeline["ClockShift_1"],
                      target=train_pipeline["load_power_statistics"],
Ejemplo n.º 19
0
 def test_set_params(self):
     """set_params on the wrapper must reach the wrapped scaler."""
     wrapped_scaler = StandardScaler()
     wrapper = SKLearnWrapper(module=wrapped_scaler)

     initial_value = wrapped_scaler.get_params()["with_mean"]
     self.assertEqual(initial_value, True)

     wrapper.set_params(with_mean=False)

     updated_value = wrapped_scaler.get_params()["with_mean"]
     self.assertEqual(updated_value, False)
Ejemplo n.º 20
0
 def test_get_params(self):
     """The wrapper must report the same parameters as the wrapped scaler."""
     wrapped_scaler = StandardScaler()
     wrapper = SKLearnWrapper(module=wrapped_scaler)
     wrapper_params = wrapper.get_params()
     scaler_params = wrapped_scaler.get_params()
     self.assertEqual(wrapper_params, scaler_params)
Ejemplo n.º 21
0
 def test_add_with_target(self):
     """A regressor with an input and a target column yields three steps."""
     regression = SKLearnWrapper(LinearRegression())
     regression(input=self.pipeline["input"],
                target=self.pipeline["target"])
     registered_steps = len(self.pipeline.id_to_step)
     self.assertEqual(3, registered_steps)
Ejemplo n.º 22
0
    model = Model(inputs=[input_1, input_2], outputs=output)
    return model


if __name__ == "__main__":
    # Example script: impute, scale and lag the "load_power_statistics"
    # column, then feed the lagged series into a wrapped Keras model.
    keras_model = get_keras_model()

    pipeline = Pipeline(path="../results")

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time",
        name="imputer_power")(x=pipeline["load_power_statistics"])

    # Scale the data using a standard SKLearn scaler
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
    scale_power_statistics = power_scaler(x=imputer_power_statistics)

    # Create lagged time series to later be used in the regression
    # sampler_module -> 2D time series
    shift_power_statistics = ClockShift(
        lag=1, name="ClockShift_Lag1")(x=scale_power_statistics)
    shift_power_statistics2 = ClockShift(
        lag=2, name="ClockShift_Lag2")(x=scale_power_statistics)

    # Wrap the Keras model: the two lagged series are passed under the
    # keywords ClockShift_Lag1 / ClockShift_Lag2 and the scaled series as
    # target.
    # NOTE(review): fit_kwargs / compile_kwargs are presumably forwarded to
    # the Keras fit / compile calls - confirm in KerasWrapper.
    keras_wrapper = KerasWrapper(keras_model,
                                 fit_kwargs={"batch_size": 8, "epochs": 1},
                                 compile_kwargs={"loss": "mse", "optimizer": "Adam", "metrics": ["mse"]}) \
        (ClockShift_Lag1=shift_power_statistics,
         ClockShift_Lag2=shift_power_statistics2,
         target=scale_power_statistics)
Ejemplo n.º 23
0
    # NOTE: CalendarExtraction can't return multiple features.
    calendar = CalendarExtraction(continent="Europe",
                                  country="Germany",
                                  features=[
                                      CalendarFeature.month,
                                      CalendarFeature.weekday,
                                      CalendarFeature.weekend
                                  ])(x=pipeline["load_power_statistics"])

    # Deal with missing values through linear interpolation
    imputer_power_statistics = LinearInterpolater(
        method="nearest", dim="time",
        name="imputer_power")(x=pipeline["load_power_statistics"])

    # Scale the data using a standard SKLearn scaler
    power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
    scale_power_statistics = power_scaler(x=imputer_power_statistics)

    # Create lagged time series to later be used in the regression
    shift_power_statistics = ClockShift(
        lag=1, name="ClockShift_Lag1")(x=scale_power_statistics)
    shift_power_statistics2 = ClockShift(
        lag=2, name="ClockShift_Lag2")(x=scale_power_statistics)

    # Create a linear regression that uses the lagged values to predict the current value
    # NOTE: SKLearnWrapper has to collect all **kwargs itself and fit it against target.
    #       It is also possible to implement a join/collect class
    regressor_power_statistics = SKLearnWrapper(module=LinearRegression(
        fit_intercept=True))(
            power_lag1=shift_power_statistics,
            power_lag2=shift_power_statistics2,
Ejemplo n.º 24
0
    def test_add_module_with_one_input_without_a_list(self):
        """Chain a scaler and a regressor, each taking one plain input."""
        scaling = SKLearnWrapper(StandardScaler())
        scaled = scaling(input=self.pipeline["test"])
        regression = SKLearnWrapper(LinearRegression())
        regression(input=scaled)

        # Two modules plus the step registered for the "test" input.
        registered_steps = len(self.pipeline.id_to_step)
        self.assertEqual(3, registered_steps)
Ejemplo n.º 25
0
 def test_add_input_as_positional(self):
     """Add a regressor to the pipeline without asserting on the outcome.

     NOTE(review): despite the name, the input is passed by keyword and
     nothing is asserted; the original note says this should fail with a
     better error message - confirm the intended check.
     """
     regression = SKLearnWrapper(LinearRegression())
     regression(x=self.pipeline["input"])
Ejemplo n.º 26
0
 def test_add_only_module(self):
     """A single module on one pipeline column yields two steps."""
     regression = SKLearnWrapper(LinearRegression())
     regression(x=self.pipeline["input"])
     # One module step plus the start step.
     registered_steps = len(self.pipeline.id_to_step)
     self.assertEqual(registered_steps, 2)
Ejemplo n.º 27
0
 def test_add_module_which_is_not_in_a_list(self):
     """Chain two regressors, passing the first one's output directly."""
     first_regression = SKLearnWrapper(LinearRegression())
     first_output = first_regression(input=self.pipeline["input"])
     second_regression = SKLearnWrapper(LinearRegression())
     second_regression(x=first_output)
     # Two module steps plus the start step.
     registered_steps = len(self.pipeline.id_to_step)
     self.assertEqual(registered_steps, 3)