def test_add_module_with_inputs(self):
    """Two scalers feeding one regression must register five steps."""
    first_scaled = SKLearnWrapper(StandardScaler())(x=self.pipeline["x"])
    second_scaled = SKLearnWrapper(StandardScaler())(x=self.pipeline["test1"])
    SKLearnWrapper(LinearRegression())(input_1=first_scaled,
                                       input_2=second_scaled)

    # Three wrapped modules plus the steps the pipeline creates for its
    # inputs (presumably one start step per requested column).
    step_count = len(self.pipeline.id_to_step)
    self.assertEqual(5, step_count)
def test_transform_multiple_output(self):
    """Fit a multi-output regressor on two targets and predict both of them."""

    def series(values, index):
        # Small factory for a one-dimensional, time-indexed DataArray.
        return xr.DataArray(values, dims=["time"], coords={'time': index})

    wrapper = SKLearnWrapper(
        module=MultiOutputRegressor(LinearRegression()))

    train_index = pd.date_range('2000-01-01', freq='24H', periods=5)
    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

    wrapper.fit(bar=series([1, 2, 3, 4, 5], train_index),
                target1=series([2, 2, 2, 2, 2], train_index),
                target2=series([3, 3, 3, 3, 3], train_index))
    result = wrapper.transform(bar=series([1], predict_index))

    # Both targets are constant, so the prediction has to reproduce them.
    expected = {"target1": 2.0, "target2": 3.0}
    for name, value in expected.items():
        self.assertAlmostEqual(result[name].values[0], value)
    for name in expected:
        self.assertEqual(result[name].shape, (1, 1))
def test_to_folder(self, mock_file, json_mock, fm_mock):
    """Saving opens one pickle per module plus the JSON file and dumps the description."""
    scaled = SKLearnWrapper(StandardScaler())(input=self.pipeline["input"])
    SKLearnWrapper(LinearRegression())(x=scaled)

    # Paths handed out by the mocked file manager, in call order.
    expected_paths = [
        os.path.join('test_pipeline', file_name)
        for file_name in ('StandardScaler.pickle',
                          'LinearRegression.pickle',
                          'pipeline.json')
    ]
    file_manager = MagicMock()
    file_manager.get_path.side_effect = expected_paths
    fm_mock.return_value = file_manager

    self.pipeline.to_folder("test_pipeline")

    # The two pickles are opened in binary mode, the JSON file in text mode.
    expected_opens = [
        call(path, mode)
        for path, mode in zip(expected_paths, ('wb', 'wb', 'w'))
    ]
    mock_file.assert_has_calls(expected_opens, any_order=True)

    _, dump_kwargs = json_mock.dump.call_args
    dumped = dump_kwargs["obj"]
    for key in ("id", "name", "modules", "steps"):
        assert dumped[key] == pipeline_json[key]
def test_fit_regression_multiple_datavariables(self):
    """A regression fitted on two input variables predicts the target for a new sample."""

    def series(values, index):
        # Small factory for a one-dimensional, time-indexed DataArray.
        return xr.DataArray(values, dims=["time"], coords={'time': index})

    fit_index = pd.date_range('2000-01-01', freq='24H', periods=7)
    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

    bar = series([2, 2, 2, 2, 3, 3, 3], fit_index)
    foo = series([4, 4, 4, 4, 6, 6, 6], fit_index)
    target = series([6, 6, 6, 6, 9, 9, 9], fit_index)

    lin_reg = LinearRegression()
    wrapper = SKLearnWrapper(module=lin_reg)

    # The estimator must not expose fitted coefficients before fit is called.
    self.assertNotIn("coef_", dir(lin_reg))
    wrapper.fit(bar=bar, foo=foo, target=target)

    result = wrapper.transform(bar=series([2], predict_index),
                               foo=series([4], predict_index))

    self.assertAlmostEqual(result["target"].values[0, 0], 6.0)
    self.assertEqual(result["target"].shape, (1, 1))
def test_fit_TransformerMixin(self):
    """Fitting the wrapper fits the wrapped scaler, which then exposes ``mean_``."""
    scaler = StandardScaler()
    wrapper = SKLearnWrapper(module=scaler)

    # No fitted attribute before the fit ...
    self.assertNotIn("mean_", dir(scaler))

    samples = xr.DataArray([1, 2, 3, 4, 5])
    wrapper.fit(test=samples)

    # ... and a populated one afterwards.
    self.assertIn("mean_", dir(scaler))
    self.assertIsNotNone(scaler.mean_)
def test_fit_RegressorMixin(self):
    """Fitting the wrapper fits the wrapped regressor, which then exposes ``coef_``."""
    lin_reg = LinearRegression()
    wrapper = SKLearnWrapper(module=lin_reg)

    # No fitted attribute before the fit ...
    self.assertNotIn("coef_", dir(lin_reg))

    features = xr.DataArray([1, 2, 3, 4, 5])
    labels = xr.DataArray([2, 2, 2, 2, 2])
    wrapper.fit(test=features, target=labels)

    # ... and a populated one afterwards.
    self.assertIn("coef_", dir(lin_reg))
    self.assertIsNotNone(lin_reg.coef_)
def test_transform_RegressorMixin(self):
    """An SVR fitted on a constant target predicts that constant for a new sample."""
    wrapper = SKLearnWrapper(module=SVR())

    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)
    sample = xr.DataArray([1], dims=["time"], coords={'time': predict_index})

    wrapper.fit(test=xr.DataArray([1, 2, 3, 4, 5]),
                target=xr.DataArray([2, 2, 2, 2, 2]))
    prediction = wrapper.transform(bar=sample)["target"]

    assert prediction.values[0] == 2.0
    self.assertEqual(prediction.shape, (1, 1))
def test_multiple_same_module(self):
    """Re-using one module instance creates separate steps that share the module."""
    regression = SKLearnWrapper(module=LinearRegression())
    first_fit = regression(x=self.pipeline["test"],
                           target=self.pipeline["target"])
    second_fit = regression(x=self.pipeline["test2"],
                            target=self.pipeline["target"])

    missing_values = MissingValueDetector()
    missing_values(dataset=first_fit)
    missing_values(dataset=second_fit)

    # Three start steps (test, test2, target), two regressor steps and
    # two detector steps.
    self.assertEqual(7, len(self.pipeline.id_to_step))

    # Collect every distinct module that backs a (non-start) Step.
    distinct_modules = []
    for step in self.pipeline.id_to_step.values():
        if not isinstance(step, Step):
            continue
        if step.module in distinct_modules:
            continue
        distinct_modules.append(step.module)

    # One sklearn wrapper and one missing value detector.
    self.assertEqual(2, len(distinct_modules))

    frame = pd.DataFrame(
        {
            "test": [1, 2, 2, 3, 4],
            "test2": [2, 2, 2, 2, 2],
            "target": [2, 2, 4, 4, -5]
        },
        index=pd.DatetimeIndex(
            pd.date_range('2000-01-01', freq='24H', periods=5)))
    self.pipeline.train(frame)
def test_add_pipeline_to_pipeline_and_train(self, fm_mock, create_summary_mock):
    """Training a pipeline with a nested sub-pipeline runs every step in FitTransform mode."""
    sub_pipeline = Pipeline()

    detector = MissingValueDetector()
    detector(dataset=sub_pipeline["regression"])

    regression = SKLearnWrapper(LinearRegression(), name="regression")
    regressor = regression(x=self.pipeline["test"],
                           target=self.pipeline["target"])
    sub_pipeline(regression=regressor)

    formatter = MagicMock()
    training_frame = pd.DataFrame(
        {"test": [24, 24], "target": [12, 24]},
        index=pd.to_datetime(['2015-06-03 00:00:00',
                              '2015-06-03 01:00:00']))
    self.pipeline.train(training_frame, summary_formatter=formatter)

    for step in self.pipeline.id_to_step.values():
        mode = step.current_run_setting.computation_mode
        assert mode == ComputationMode.FitTransform

    # The summary is expected to be created twice with the same formatter.
    create_summary_mock.assert_has_calls([call(formatter)] * 2)
def test_create_and_run_simple_pipeline(self):
    """Build an impute -> scale -> regress -> RMSE pipeline and run train and test on fixture data."""
    pipeline = Pipeline()

    power_imputer = LinearInterpolater(method="nearest", dim="time",
                                       name="imputer_power")
    imputed_power = power_imputer(x=pipeline["load_power_statistics"])

    price_imputer = LinearInterpolater(method="nearest", dim="time",
                                       name="imputer_price")
    imputed_price = price_imputer(x=pipeline["price_day_ahead"])

    scaled_price = SKLearnWrapper(StandardScaler())(x=imputed_price)
    regression = SKLearnWrapper(LinearRegression())(
        x=scaled_price, target1=imputed_price, target2=imputed_power)

    # One error metric per regression target.
    RMSE(name="Load")(y=imputed_power, pred=regression["target2"])
    RMSE(name="Price")(y=imputed_price, pred=regression["target1"])

    csv_path = f"{FIXTURE_DIR}/getting_started_data.csv"
    data = pd.read_csv(csv_path,
                       index_col="time",
                       sep=",",
                       parse_dates=["time"],
                       infer_datetime_format=True)

    # Rows from 6000 onwards are used for training, the first 6000 for testing.
    pipeline.train(data[6000:])
    pipeline.test(data[:6000])
def test_add_pipeline_without_index(self):
    """Passing a whole pipeline (no column selected) as a module input must raise."""
    expected_message = (
        "Adding a pipeline as input might be ambigious. Specifiy the desired column of your dataset by using pipeline[<column_name>]"
    )

    # The input dataframe of a pipeline may hold several columns, so the
    # pipeline itself is not accepted as an input.
    with self.assertRaises(Exception) as context:
        SKLearnWrapper(StandardScaler())(x=self.pipeline)

    raised_message = str(context.exception)
    self.assertEqual(expected_message, raised_message)
def test_fit_ClassifierMixin(self):
    """An SVC fitted on a two-class series labels a new sample with class 0."""

    def series(values, index):
        # Small factory for a one-dimensional, time-indexed DataArray.
        return xr.DataArray(values, dims=["time"], coords={'time': index})

    wrapper = SKLearnWrapper(module=SVC())

    train_index = pd.date_range('2000-01-01', freq='24H', periods=5)
    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

    features = series([1, 2, 3, 4, 5], train_index)
    sample = series([1], predict_index)
    labels = series([0, 0, 1, 1, 1], train_index)

    wrapper.fit(bar=features, target=labels)
    prediction = wrapper.transform(bar=sample)["target"]

    assert prediction.values[0] == 0
    self.assertEqual(prediction.shape, (1, 1))
def test_run_reloaded_simple_pipeline(self):
    """Save a pipeline to a folder, load it again and run train and test on the reloaded copy."""
    pipeline = Pipeline()

    power_imputer = LinearInterpolater(method="nearest", dim="time",
                                       name="imputer_power")
    imputed_power = power_imputer(x=pipeline["load_power_statistics"])

    price_imputer = LinearInterpolater(method="nearest", dim="time",
                                       name="imputer_price")
    imputed_price = price_imputer(x=pipeline["price_day_ahead"])

    scaled_price = SKLearnWrapper(StandardScaler())(x=imputed_price)
    SKLearnWrapper(LinearRegression())(
        x=scaled_price, target1=imputed_price, target2=imputed_power)

    # Round trip through the file system; the pause is kept from the
    # original test (NOTE(review): its purpose is not visible here).
    folder = "./pipe1"
    pipeline.to_folder(folder)
    sleep(1)
    reloaded = Pipeline.from_folder(folder)

    csv_path = f"{FIXTURE_DIR}/getting_started_data.csv"
    data = pd.read_csv(csv_path,
                       index_col="time",
                       sep=",",
                       parse_dates=["time"],
                       infer_datetime_format=True)

    # Rows from 6000 onwards are used for training, the first 6000 for testing.
    reloaded.train(data[6000:])
    reloaded.test(data[:6000])
def test_DensityMixin(self):
    """A fitted Gaussian mixture yields different results for two different samples."""

    def series(values, index):
        # Small factory for a one-dimensional, time-indexed DataArray.
        return xr.DataArray(values, dims=["time"], coords={'time': index})

    wrapper = SKLearnWrapper(module=GaussianMixture(n_components=2))

    fit_index = pd.date_range('2000-01-01', freq='24H', periods=10)
    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

    wrapper.fit(bar=series([2, 4, 5, 4, 3, 2, 1, 5, 5, 5], fit_index))

    high_sample = series([5], predict_index)
    low_sample = series([2], predict_index)
    result1 = wrapper.transform(bar=high_sample)
    result0 = wrapper.transform(bar=low_sample)

    assert result1.values[0] != result0.values[0]
    for outcome in (result1, result0):
        self.assertEqual(outcome.shape, (1, ))
def test_add_pipeline_to_pipeline_and_save(self, open_mock, json_mock, fm_mock):
    """Saving a pipeline that contains a sub-pipeline dumps JSON twice.

    Args:
        open_mock: Injected mock (presumably patching ``open``); unused here.
        json_mock: Injected mock of the ``json`` module used while saving.
        fm_mock: Injected mock (presumably the file manager); unused here.
    """
    sub_pipeline = Pipeline()

    detector = MissingValueDetector()
    # The requested sub-pipeline input has to carry the same name as the
    # keyword used when the sub-pipeline is called below; the previous
    # "regressor" key never matched the "regression" keyword.
    detector(dataset=sub_pipeline["regression"])

    regressor = SKLearnWrapper(LinearRegression())(x=self.pipeline["test"])
    sub_pipeline(regression=regressor)

    self.pipeline.to_folder(path="path")

    # Two dumps are expected (presumably one per pipeline: outer and nested).
    self.assertEqual(json_mock.dump.call_count, 2)
def test_fit_ClusterMixin(self):
    """A fitted two-cluster KMeans separates a high and a low sample."""

    def series(values, index):
        # Small factory for a one-dimensional, time-indexed DataArray.
        return xr.DataArray(values, dims=["time"], coords={'time': index})

    wrapper = SKLearnWrapper(module=KMeans(n_clusters=2))

    fit_index = pd.date_range('2000-01-01', freq='24H', periods=10)
    predict_index = pd.date_range('2000-01-08', freq='24H', periods=1)

    training = series([2, 4, 5, 4, 2, 2, 1, 5, 5, 5], fit_index)
    high_sample = series([5], predict_index)
    low_sample = series([2], predict_index)

    wrapper.fit(bar=training)
    result1 = wrapper.transform(foo=high_sample)
    result0 = wrapper.transform(foo=low_sample)

    # Assert that both tested datapoints are in different clusters
    first_peak = result1.values[0].argmax()
    second_peak = result0.values[0].argmax()
    assert first_peak != second_peak

    for outcome in (result1, result0):
        self.assertEqual(outcome.shape, (1, 2))
def test_horizon_greater_one_regression_inclusive_summary_file(
        self, open_mock):
    """Train on a target with a horizon dimension and check that the summary is written."""
    self.fm_mock.get_path.return_value = "summary_path"

    regression = SKLearnWrapper(LinearRegression())
    multi_regressor = regression(foo=self.pipeline["foo"],
                                 target=self.pipeline["target"],
                                 target2=self.pipeline["target2"])
    RMSE()(y=self.pipeline["target"], prediction=multi_regressor["target"])

    index = pd.date_range('2000-01-01', freq='24H', periods=5)
    dataset = xr.Dataset({
        'foo': xr.DataArray([1, 2, 3, 4, 5],
                            dims=["time"],
                            coords={'time': index}),
        # The only variable with a second ("horizon") dimension.
        "target": xr.DataArray(
            [[2, 3], [2, 4], [2, 5], [2, 6], [2, 7]],
            dims=["time", "horizon"],
            coords={'time': index, "horizon": [1, 2]}),
        "target2": xr.DataArray([3, 3, 3, 3, 3],
                                dims=["time"],
                                coords={'time': index}),
    })

    result, summary = self.pipeline.train(dataset, summary=True)

    for section in ("Training Time", "RMSE"):
        self.assertIn(section, summary)

    # The summary must be requested as "summary.md" and written exactly once.
    self.fm_mock.get_path.assert_called_once_with("summary.md")
    summary_file = open_mock().__enter__.return_value
    summary_file.write.assert_called_once_with(summary)

    self.assertIn("target", result.keys())
return pipeline if __name__ == "__main__": # Read the data via pandas. data = pd.read_csv("../data/getting_started_data.csv", parse_dates=["time"], infer_datetime_format=True, index_col="time") # Split the data into train and test data. train = data[:6000] test = data[8700:] # Create all modules which are used multiple times. regressor_lin_reg = SKLearnWrapper( module=LinearRegression(fit_intercept=True), name="Regression") regressor_svr = SKLearnWrapper(module=SVR(), name="Regression") power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power") # Build a train pipeline. In this pipeline, each step processes all data at once. train_pipeline = Pipeline(path="../results/train") # Create preprocessing pipeline for the preprocessing steps preprocessing_pipeline = create_preprocessing_pipeline(power_scaler) preprocessing_pipeline = preprocessing_pipeline( scaler_power=train_pipeline["load_power_statistics"]) # Addd the regressors to the train pipeline regressor_lin_reg(ClockShift=preprocessing_pipeline["ClockShift"], ClockShift_1=preprocessing_pipeline["ClockShift_1"], target=train_pipeline["load_power_statistics"],
def test_set_params(self):
    """Parameters set on the wrapper are forwarded to the wrapped sklearn module."""
    scaler = StandardScaler()
    wrapper = SKLearnWrapper(module=scaler)

    def with_mean():
        # Always read the current value from the wrapped module itself.
        return scaler.get_params()["with_mean"]

    self.assertEqual(with_mean(), True)
    wrapper.set_params(with_mean=False)
    self.assertEqual(with_mean(), False)
def test_get_params(self):
    """The wrapper reports exactly the parameters of the wrapped sklearn module."""
    module = StandardScaler()
    wrapped_params = SKLearnWrapper(module=module).get_params()
    module_params = module.get_params()
    self.assertEqual(wrapped_params, module_params)
def test_add_with_target(self):
    """A regression with one input and one target registers three steps."""
    regression = SKLearnWrapper(LinearRegression())
    regression(input=self.pipeline["input"],
               target=self.pipeline["target"])

    # Presumably two start steps (input, target) plus the regression step.
    registered_steps = len(self.pipeline.id_to_step)
    self.assertEqual(3, registered_steps)
model = Model(inputs=[input_1, input_2], outputs=output) return model if __name__ == "__main__": keras_model = get_keras_model() pipeline = Pipeline(path="../results") # Deal with missing values through linear interpolation imputer_power_statistics = LinearInterpolater( method="nearest", dim="time", name="imputer_power")(x=pipeline["load_power_statistics"]) # Scale the data using a standard SKLearn scaler power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power") scale_power_statistics = power_scaler(x=imputer_power_statistics) # Create lagged time series to later be used in the regression # sampler_module -> 2D-Zeitreihe shift_power_statistics = ClockShift( lag=1, name="ClockShift_Lag1")(x=scale_power_statistics) shift_power_statistics2 = ClockShift( lag=2, name="ClockShift_Lag2")(x=scale_power_statistics) keras_wrapper = KerasWrapper(keras_model, fit_kwargs={"batch_size": 8, "epochs": 1}, compile_kwargs={"loss": "mse", "optimizer": "Adam", "metrics": ["mse"]}) \ (ClockShift_Lag1=shift_power_statistics, ClockShift_Lag2=shift_power_statistics2, target=scale_power_statistics)
# NOTE: CalendarExtraction can't return multiple features. calendar = CalendarExtraction(continent="Europe", country="Germany", features=[ CalendarFeature.month, CalendarFeature.weekday, CalendarFeature.weekend ])(x=pipeline["load_power_statistics"]) # Deal with missing values through linear interpolation imputer_power_statistics = LinearInterpolater( method="nearest", dim="time", name="imputer_power")(x=pipeline["load_power_statistics"]) # Scale the data using a standard SKLearn scaler power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power") scale_power_statistics = power_scaler(x=imputer_power_statistics) # Create lagged time series to later be used in the regression shift_power_statistics = ClockShift( lag=1, name="ClockShift_Lag1")(x=scale_power_statistics) shift_power_statistics2 = ClockShift( lag=2, name="ClockShift_Lag2")(x=scale_power_statistics) # Create a linear regression that uses the lagged values to predict the current value # NOTE: SKLearnWrapper has to collect all **kwargs itself and fit it against target. # It is also possible to implement a join/collect class regressor_power_statistics = SKLearnWrapper(module=LinearRegression( fit_intercept=True))( power_lag1=shift_power_statistics, power_lag2=shift_power_statistics2,
def test_add_module_with_one_input_without_a_list(self):
    """Chaining a scaler and a regression on one input registers three steps."""
    scaled = SKLearnWrapper(StandardScaler())(input=self.pipeline["test"])
    SKLearnWrapper(LinearRegression())(input=scaled)

    # Two wrapped modules plus (presumably) the start step for "test".
    registered_steps = len(self.pipeline.id_to_step)
    self.assertEqual(3, registered_steps)
def test_add_input_as_positional(self):
    """Add a regression to the pipeline; passes as long as nothing raises."""
    # Should fail with a better error message.
    # NOTE(review): despite the test name, the input is passed as keyword
    # ``x`` and nothing is asserted - confirm the intended positional call
    # and the expected error before tightening this test.
    regression = SKLearnWrapper(LinearRegression())
    regression(x=self.pipeline["input"])
def test_add_only_module(self):
    """A single module on a single input registers two steps."""
    regression = SKLearnWrapper(LinearRegression())
    regression(x=self.pipeline["input"])

    # One module step plus the start step.
    registered_steps = len(self.pipeline.id_to_step)
    self.assertEqual(registered_steps, 2)
def test_add_module_which_is_not_in_a_list(self):
    """Feeding one wrapped regression into another registers three steps."""
    first_stage = SKLearnWrapper(LinearRegression())(
        input=self.pipeline["input"])
    SKLearnWrapper(LinearRegression())(x=first_stage)

    # Two module steps plus (presumably) the start step for "input".
    registered_steps = len(self.pipeline.id_to_step)
    self.assertEqual(registered_steps, 3)