def test_foreshadow_serialization_boston_housing_regression_multiprocessing(
    tmpdir,
):
    """Fit and score a LinearRegression Foreshadow with multiprocessing on.

    Loads the Boston housing data from scikit-learn, holds out 20% of the
    rows, calls ``configure_multiprocessing(n_job=-1)`` before fitting and
    prints the held-out score. Nothing is asserted; the test passes as long
    as no step raises.

    Args:
        tmpdir: pytest's ``tmpdir`` fixture (not used in the body).
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
    # the scikit-learn version pinned by this project still ships it.
    dataset = load_boston()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    target = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2
    )

    shadow = Foreshadow(
        estimator=LinearRegression(), problem_type=ProblemType.REGRESSION
    )
    shadow.configure_multiprocessing(n_job=-1)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
def test_foreshadow_abort_on_empty_data_frame_after_cleaning(
    filename, problem_type, X_start, X_end, target
):
    """Expect ``fit`` to abort when cleaning leaves no usable columns.

    Reads the CSV named ``filename`` from the test data folder, builds a
    Foreshadow around an ``AutoEstimator(auto="tpot")`` and checks that
    fitting on an 80% split raises a ``ValueError`` whose text contains the
    "All columns are dropped ..." message.

    Args:
        filename: CSV file name, resolved via ``get_file_path("data", ...)``.
        problem_type: Passed unchanged to ``AutoEstimator`` and
            ``Foreshadow``.
        X_start: Label where the feature-column ``.loc`` slice starts.
        X_end: Label where the feature-column ``.loc`` slice ends.
        target: Label of the target column.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    frame = pd.read_csv(get_file_path("data", filename))
    features = frame.loc[:, X_start:X_end]
    labels = frame.loc[:, target]
    # Only the training part is needed; fit is expected to raise.
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, labels, test_size=0.2
    )

    shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=problem_type,
            auto="tpot",
            estimator_kwargs={"max_time_mins": 1},
        ),
        problem_type=problem_type,
    )

    expected_msg = (
        "All columns are dropped since they all have over 90% of "
        "missing values. Aborting foreshadow."
    )
    with pytest.raises(ValueError) as excinfo:
        shadow.fit(X_train, y_train)
    # Checked after the ``with`` block so the assertion actually runs.
    assert expected_msg in str(excinfo.value)
def test_foreshadow_titanic(tmpdir):
    """Fit and score a TPOT-backed classifier on the Titanic training CSV.

    Uses columns "Pclass" through "Embarked" minus "SibSp", "Parch" and
    "Cabin" as features and "Survived" as the label, overrides the intent
    of the "Name" column to ``IntentType.TEXT`` before fitting, and prints
    the score on a 20% hold-out. Nothing is asserted.

    Args:
        tmpdir: pytest's ``tmpdir`` fixture (not used in the body).
    """
    import pandas as pd

    from foreshadow.estimators import AutoEstimator

    titanic = pd.read_csv(get_file_path("data", "titanic-train.csv"))
    features = titanic.loc[:, "Pclass":"Embarked"].drop(
        columns=["SibSp", "Parch", "Cabin"]
    )
    labels = titanic.loc[:, "Survived"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    tpot_kwargs = {"max_time_mins": 1, "random_state": 42}
    shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=ProblemType.CLASSIFICATION,
            auto="tpot",
            estimator_kwargs=tpot_kwargs,
        ),
        problem_type=ProblemType.CLASSIFICATION,
    )
    shadow.override_intent(column_name="Name", intent=IntentType.TEXT)
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))
def test_foreshadow_serialization_adults_small_classification_override():
    """Check that intent overrides are still reported after a refit.

    Fits a LogisticRegression Foreshadow on ``adult_small.csv`` (columns
    "age" through "workclass", label "class"), then overrides "age" and
    "workclass" to ``IntentType.CATEGORICAL``, fits again and asserts that
    ``get_intent`` returns the overridden intent for both columns. The
    scores before and after the override are printed, not asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.intents import IntentType

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    adult = pd.read_csv(get_file_path("data", "adult_small.csv"))
    features = adult.loc[:, "age":"workclass"]
    labels = adult.loc[:, "class"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2
    )

    shadow = Foreshadow(
        estimator=LogisticRegression(),
        problem_type=ProblemType.CLASSIFICATION,
    )

    # First pass: no overrides.
    shadow.fit(X_train, y_train)
    score_before = shadow.score(X_test, y_test)

    # Second pass: force both columns to categorical, then refit.
    overridden_columns = ("age", "workclass")
    for column in overridden_columns:
        shadow.override_intent(column, IntentType.CATEGORICAL)
    shadow.fit(X_train, y_train)

    for column in overridden_columns:
        assert shadow.get_intent(column) == IntentType.CATEGORICAL

    score_after = shadow.score(X_test, y_test)
    print(score_before, score_after)
def test_foreshadow_integration_data_cleaner_can_drop(
    filename, problem_type, X_start, X_end, target, tmpdir
):
    """Check scoring still works when a test column is entirely NaN.

    Fits a Foreshadow around an ``AutoEstimator(auto="tpot")``, pickles the
    fitted pipeline into ``tmpdir`` and loads it back, sets the ``X_start``
    column of the held-out features to NaN, and asserts that the live
    object and the unpickled pipeline produce scores within 0.005 of each
    other.

    Args:
        filename: CSV file name, resolved via ``get_file_path("data", ...)``.
        problem_type: Passed unchanged to ``AutoEstimator`` and
            ``Foreshadow``.
        X_start: Label where the feature-column ``.loc`` slice starts; also
            the column that is blanked out in the test split.
        X_end: Label where the feature-column ``.loc`` slice ends.
        target: Label of the target column.
        tmpdir: pytest's ``tmpdir`` fixture; receives the pickle file.
    """
    import math
    import pickle

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    data = pd.read_csv(get_file_path("data", filename))
    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]
    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.2
    )

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    pickled_fitted_pipeline_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    # The file was written by this test a moment ago, so unpickling it is
    # safe here.
    with open(pickled_fitted_pipeline_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    # If there are new empty columns in the test set, the program should
    # not fail. Work on a copy so the assignment does not write into the
    # frame handed back by train_test_split (pandas may warn about that).
    X_test = X_test.copy()
    X_test[X_start] = np.nan

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    # Given the randomness of TPOT and the one-minute budget configured
    # above, there is no guarantee the search converges, so no absolute
    # score threshold is asserted. Only agreement between the live object
    # and the unpickled pipeline is checked, at the tolerance the previous
    # assertAlmostEqual(places=2) used.
    assert math.isclose(score1, score2, rel_tol=0.0, abs_tol=5e-3)
def test_text_classification_foreshadow():
    """Fit and score a TPOT-backed classifier on four 20-newsgroups topics.

    Fetches the train and test subsets of the 20 newsgroups data for four
    categories, wraps the raw documents in a one-column ("text") DataFrame
    with the labels in a Series named "category", fits Foreshadow with an
    ``AutoEstimator(auto="tpot")`` and prints the test score. Nothing is
    asserted.
    """
    import pandas as pd
    from sklearn.datasets import fetch_20newsgroups

    from foreshadow.estimators import AutoEstimator

    categories = [
        "alt.atheism",
        "soc.religion.christian",
        "comp.graphics",
        "sci.med",
    ]
    fetch_kwargs = {
        "categories": categories,
        "shuffle": True,
        "random_state": 42,
    }

    train_bunch = fetch_20newsgroups(subset="train", **fetch_kwargs)
    X_train = pd.DataFrame(
        data=train_bunch.data,
        columns=["text"],
        index=list(range(len(train_bunch.data))),
    )
    y_train = pd.Series(
        data=train_bunch.target,
        name="category",
        index=list(range(len(train_bunch.target))),
    )

    test_bunch = fetch_20newsgroups(subset="test", **fetch_kwargs)
    X_test = pd.DataFrame(data=test_bunch.data, columns=["text"])
    y_test = pd.Series(data=test_bunch.target, name="category")

    shadow = Foreshadow(
        estimator=AutoEstimator(
            problem_type=ProblemType.CLASSIFICATION,
            auto="tpot",
            estimator_kwargs={"max_time_mins": 1, "random_state": 42},
        ),
        problem_type=ProblemType.CLASSIFICATION,
    )
    shadow.fit(X_train, y_train)

    # this gives about 87.5%
    print(shadow.score(X_test, y_test))
def test_foreshadow_pickling_and_unpickling_tpot(tmpdir):
    """Check a pickled TPOT pipeline scores like the live Foreshadow object.

    Fits a Foreshadow around an ``AutoEstimator(auto="tpot")`` on the
    scikit-learn breast-cancer data, pickles the fitted pipeline into
    ``tmpdir``, loads it back and asserts that both objects score the 20%
    hold-out within 0.005 of each other.

    Args:
        tmpdir: pytest's ``tmpdir`` fixture; receives the pickle file.
    """
    import math
    import pickle

    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from foreshadow.estimators import AutoEstimator
    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    cancer = load_breast_cancer()
    features = pd.DataFrame(cancer.data, columns=cancer.feature_names)
    labels = pd.DataFrame(cancer.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2
    )

    estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )
    shadow = Foreshadow(
        estimator=estimator, problem_type=ProblemType.CLASSIFICATION
    )

    pickled_file_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_file_location)

    # The file was written by this test a moment ago, so unpickling it is
    # safe here.
    with open(pickled_file_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    # Given the randomness of TPOT and the one-minute budget configured
    # above, there is no guarantee the search converges, so only agreement
    # between the live object and the unpickled pipeline is checked.
    # NOTE(review): an earlier comment said the precision had been relaxed
    # to 1 decimal place because of an Azure pipeline failure that could
    # not be reproduced locally, yet the assertion used places=2. The
    # places=2 tolerance (0.005) is kept here -- confirm which was meant.
    assert math.isclose(score1, score2, rel_tol=0.0, abs_tol=5e-3)
def test_foreshadow_param_optimize_fit(mocker):
    """Check where the estimator lands in the pipeline when optimizing.

    With the default preparers and ``optimizer=DummySearch``, the last
    pipeline step must expose the ``DummyRegressor`` through its
    ``estimator`` attribute. With ``X_preparer=False`` and
    ``y_preparer=False``, the last pipeline step must itself be the
    ``DummyRegressor``.

    Args:
        mocker: pytest-mock fixture used to patch
            ``foreshadow.preparer.DataPreparer``.
    """
    import pandas as pd
    from sklearn.model_selection._search import BaseSearchCV

    from foreshadow.base import BaseEstimator, TransformerMixin
    from foreshadow.foreshadow import Foreshadow

    class DummyRegressor(BaseEstimator, TransformerMixin):
        # Estimator stand-in whose fit does nothing.
        def fit(self, X, y):
            return self

    class DummySearch(BaseSearchCV):
        # Search stand-in: records the estimator, never searches.
        def __init__(self, estimator, params):
            self.best_estimator_ = estimator

        def fit(self, X, y=None, **fit_params):
            return self

    class DummyDataPreparer(BaseEstimator, TransformerMixin):
        # Preparer stand-in whose fit does nothing.
        def fit(self, X, y):
            return self

    # NOTE(review): return_value is the DummyDataPreparer class itself,
    # not an instance -- confirm that is what the patch should hand back.
    mocker.patch(
        "foreshadow.preparer.DataPreparer", return_value=DummyDataPreparer
    )

    housing = pd.read_csv(get_file_path("data", "boston_housing.csv"))
    x = housing.drop(columns=["medv"])
    y = housing[["medv"]]

    with_preparers = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    with_preparers.fit(x, y)
    final_step = with_preparers.pipeline.steps[-1][1]
    assert isinstance(final_step.estimator, DummyRegressor)

    without_preparers = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        X_preparer=False,
        y_preparer=False,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    without_preparers.fit(x, y)
    final_step = without_preparers.pipeline.steps[-1][1]
    assert isinstance(final_step, DummyRegressor)
def test_foreshadow_y_preparer(mocker):
    """Check predictions and score with a patched-in ``y_preparer``.

    Replaces ``Foreshadow.y_preparer`` with a ``PropertyMock`` that returns
    a one-step ``StandardScaler`` pipeline, fits a LinearRegression
    Foreshadow on a synthetic single-column dataset, and compares the
    predictions and the score on the 10% hold-out with hard-coded values.

    Args:
        mocker: pytest-mock fixture used to patch ``Foreshadow.y_preparer``.
    """
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG: both the target draw and the unseeded
    # split below depend on it, and the expected values are hard-coded.
    np.random.seed(0)

    y_pipeline = Pipeline([("yohe", StandardScaler())])
    # The pipeline carries a ``pipeline`` attribute pointing at itself.
    y_pipeline.pipeline = y_pipeline

    X = pd.DataFrame(
        np.array([0] * 50 + [1] * 50).reshape((-1, 1)), columns=["col1"]
    )
    y = pd.DataFrame(
        np.random.normal(100, 10, 100).reshape((-1, 1)), columns=["y"]
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    # Foreshadow keeps its defaults; the class-level property is swapped
    # so that y_preparer yields the pipeline built above.
    mocker.patch.object(
        Foreshadow,
        "y_preparer",
        mocker.PropertyMock(return_value=y_pipeline),
    )

    model = Foreshadow(
        problem_type=ProblemType.REGRESSION, estimator=LinearRegression()
    )
    model.fit(X_train, y_train)
    actual_predict = model.predict(X_test)
    actual_score = model.score(X_test, y_test)

    # Only two distinct predictions appear in the expected output.
    high = 102.19044770619593
    low = 100.05275170774354
    expected_predict = np.array(
        [
            [high],
            [high],
            [high],
            [low],
            [high],
            [high],
            [high],
            [high],
            [low],
            [low],
        ]
    )
    expected_score = -0.3576910440975052

    assert np.allclose(actual_predict, expected_predict)
    assert np.allclose(actual_score, expected_score)
def test_core_foreshadow_example_regression():
    """Run the basic regression example on the Boston housing data.

    Fits a LinearRegression Foreshadow on an 80% split and prints the R^2
    of its predictions on the remaining 20%. Nothing is asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(0)

    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
    # the scikit-learn version pinned by this project still ships it.
    dataset = load_boston()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    target = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2
    )

    model = Foreshadow(
        estimator=LinearRegression(), problem_type=ProblemType.REGRESSION
    )
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    print("Boston score: %f" % score)
def test_core_foreshadow_example_classification():
    """Run the basic classification example on the iris data.

    Fits a LogisticRegression Foreshadow on an 80% split and prints the
    weighted F1 of its predictions on the remaining 20%. Nothing is
    asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(0)

    dataset = load_iris()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2
    )

    model = Foreshadow(
        estimator=LogisticRegression(),
        problem_type=ProblemType.CLASSIFICATION,
    )
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score = f1_score(y_test, predictions, average="weighted")
    print("Iris score: %f" % score)
def test_foreshadow_serialization_breast_cancer_non_auto_estimator():
    """Fit and score a plain LogisticRegression on the breast-cancer data.

    Fits a Foreshadow built around a LogisticRegression (no AutoEstimator)
    on an 80% split and prints the score on the remaining 20%. Nothing is
    asserted.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    from foreshadow.foreshadow import Foreshadow

    # Seed the global NumPy RNG so the unseeded split below is repeatable.
    np.random.seed(1337)

    dataset = load_breast_cancer()
    features = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    labels = pd.DataFrame(dataset.target, columns=["target"])
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2
    )

    shadow = Foreshadow(
        estimator=LogisticRegression(),
        problem_type=ProblemType.CLASSIFICATION,
    )
    shadow.fit(X_train, y_train)

    print(shadow.score(X_test, y_test))