def test_foreshadow_param_optimize():  # TODO: Make this test faster
    """Compare the keys of param_mapping's output with a pickled truth file.

    Builds a ``Foreshadow`` object from the optimizer test config, wraps its
    preparer and estimator in a ``Pipeline``, runs ``param_mapping`` on a
    training split of the Boston housing data, and asserts that the first
    result has the same keys as the first entry of the stored truth.
    """
    import pickle
    import json

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_json_path = get_file_path("configs", "optimizer_test.json")
    truth_path = get_file_path("configs", "search_space_optimize.pkl")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is closed promptly
    # instead of being left for the garbage collector.
    with open(test_json_path, "r") as config_file:
        js = json.load(config_file)

    fs = Foreshadow(
        DataPreparer(from_json=js),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preparer", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # If you change the default configs or the file structure, you will need
    # to verify the outputs are correct manually and regenerate the pickle
    # truth file.
    with open(truth_path, "rb") as truth_file:
        truth = pickle.load(truth_file)

    # Only the key sets are compared, not the values of the search space.
    assert results[0].keys() == truth[0].keys()
def test_foreshadow_param_optimize_invalid_dict_key():
    """Expect a ValueError when a combinations entry uses an unknown key.

    The preparer is configured with a ``combinations`` entry keyed by
    ``"fake.fake"``; ``param_mapping`` is expected to raise ``ValueError``
    with the message ``"Invalid JSON Key fake in {}"``.
    """
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    # Imported locally, like the sibling test that exercises param_mapping;
    # without it the call below fails with NameError, not ValueError.
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    data = pd.read_csv(boston_path)

    fs = Foreshadow(
        DataPreparer(
            cache_manager=CacheManager(),
            from_json={"combinations": [{"fake.fake": "[1,2]"}]},
        ),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preprocessor", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    with pytest.raises(ValueError) as e:
        param_mapping(fs.pipeline, x_train, y_train)

    # Checked outside the ``with`` block: statements after the raising call
    # inside the block would never execute.
    assert str(e.value) == "Invalid JSON Key fake in {}"
def test_foreshadow_param_optimize_invalid_array_idx():
    """Expect a ValueError when the optimizer config is invalid.

    Loads ``invalid_optimizer_config.json`` into the preparer and checks that
    ``param_mapping`` raises ``ValueError`` with a message starting with
    ``"Attempted to index list"``.
    """
    import json

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    # Imported locally, like the sibling test that exercises param_mapping;
    # without it the call below fails with NameError, not ValueError.
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "invalid_optimizer_config.json")

    data = pd.read_csv(boston_path)
    # Use a context manager so the config file handle is closed promptly.
    with open(test_path, "r") as config_file:
        cfg = json.load(config_file)

    fs = Foreshadow(
        DataPreparer(CacheManager(), from_json=cfg),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preprocessor", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    with pytest.raises(ValueError) as e:
        param_mapping(fs.pipeline, x_train, y_train)

    # Checked outside the ``with`` block: statements after the raising call
    # inside the block would never execute.
    assert str(e.value).startswith("Attempted to index list")
def test_foreshadow_param_optimize_no_combinations():
    """Compare param_mapping's output keys when the config is empty.

    The preparer is built with an empty ``from_json`` dict; the keys of the
    first ``param_mapping`` result must equal the keys of the first entry in
    the pickled ``search_space_no_combo.pkl`` truth file.
    """
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    # Imported locally, like the sibling test that exercises param_mapping;
    # without it the call below fails with NameError.
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "search_space_no_combo.pkl")

    data = pd.read_csv(boston_path)

    fs = Foreshadow(
        DataPreparer(cache_manager=CacheManager(), from_json={}),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )
    fs.pipeline = Pipeline(
        [("preprocessor", fs.X_preparer), ("estimator", fs.estimator)]
    )

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]
    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # Use a context manager so the truth file handle is closed promptly.
    with open(test_path, "rb") as truth_file:
        truth = pickle.load(truth_file)

    # Only the key sets are compared, not the values of the search space.
    assert results[0].keys() == truth[0].keys()