Example #1
def test_foreshadow_titanic(tmpdir):
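    """Fit and score Foreshadow on the Titanic data with a TPOT
    AutoEstimator, overriding the Name column to the TEXT intent."""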
    import pandas as pd

    train_data = pd.read_csv(get_file_path("data", "titanic-train.csv"))
    X_train_df = train_data.loc[:, "Pclass":"Embarked"]
    y_train_df = train_data.loc[:, "Survived"]

    X_train_df = X_train_df.drop(columns=["SibSp", "Parch", "Cabin"])

    X_train, X_test, y_train, y_test = train_test_split(X_train_df,
                                                        y_train_df,
                                                        test_size=0.2,
                                                        random_state=42)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=ProblemType.CLASSIFICATION,
        auto="tpot",
        estimator_kwargs={
            "max_time_mins": 1,
            "random_state": 42
        },
    )

    shadow = Foreshadow(estimator=estimator,
                        problem_type=ProblemType.CLASSIFICATION)

    shadow.override_intent(column_name="Name", intent=IntentType.TEXT)
    shadow.fit(X_train, y_train)

    score = shadow.score(X_test, y_test)
    print(score)
Example #2
def test_foreshadow_sampling_performance_comparison():
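    """Check that data preparation with sampling enabled is faster than
    without it on a dataset of more than 40,000 rows."""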
    X_train, X_test, y_train, y_test = train_test_split_local_file_common(
        file_path=get_file_path("data", "adult_small.csv"),
        X_start="age",
        X_end="workclass",
        target="class",
    )
    shadow = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION)
    import time

    start = time.time()
    shadow.X_preparer.fit_transform(X_train, y_train)
    end = time.time()
    time_taken1 = end - start

    shadow2 = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION)
    shadow2.configure_sampling(enable_sampling=False)

    start = time.time()
    shadow2.X_preparer.fit_transform(X_train, y_train)
    end = time.time()
    time_taken2 = end - start

    # Preparation with sampling enabled should be faster than without
    # it on this dataset, since it has more than 40,000 rows.
    assert time_taken1 < time_taken2
Example #3
def test_foreshadow_serialization_adults_small_classification_override():
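    """Fit a LogisticRegression-backed Foreshadow, then refit after
    overriding age and workclass to the CATEGORICAL intent."""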
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    np.random.seed(1337)

    data_path = get_file_path("data", "adult_small.csv")

    adult = pd.read_csv(data_path)
    X_df = adult.loc[:, "age":"workclass"]
    y_df = adult.loc[:, "class"]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    shadow = Foreshadow(estimator=LogisticRegression(),
                        problem_type=ProblemType.CLASSIFICATION)
    shadow.fit(X_train, y_train)
    score1 = shadow.score(X_test, y_test)

    from foreshadow.intents import IntentType

    shadow.override_intent("age", IntentType.CATEGORICAL)
    shadow.override_intent("workclass", IntentType.CATEGORICAL)
    shadow.fit(X_train, y_train)

    assert shadow.get_intent("age") == IntentType.CATEGORICAL
    assert shadow.get_intent("workclass") == IntentType.CATEGORICAL
    score2 = shadow.score(X_test, y_test)
    print(score1, score2)
Example #4
def test_smarttransformer_function(smart_child):
    """Test overall SmartTransformer functionality

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    import numpy as np
    import pandas as pd

    from foreshadow.concrete import StandardScaler

    boston_path = get_file_path("data", "boston_housing.csv")

    df = pd.read_csv(boston_path)

    smart = smart_child(cache_manager=CacheManager())
    smart_data = smart.fit_transform(df[["crim"]])

    std = StandardScaler()
    std_data = std.fit_transform(df[["crim"]])

    assert smart_data.equals(std_data)

    smart.fit(df[["crim"]])
    smart_data = smart.transform(df[["crim"]])

    std.fit(df[["crim"]])
    std_data = std.transform(df[["crim"]])

    # TODO: remove when SmartTransformer is no longer wrapped.
    # Column names will differ, so np.allclose() is used instead of equals().
    assert np.allclose(smart_data, std_data)
Example #5
def test_foreshadow_abort_on_empty_data_frame_after_cleaning(
        filename, problem_type, X_start, X_end, target):
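    """Expect a ValueError when cleaning drops every column for having
    over 90% missing values."""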
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    data_path = get_file_path("data", filename)

    data = pd.read_csv(data_path)
    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    with pytest.raises(ValueError) as excinfo:
        shadow.fit(X_train, y_train)
    error_msg = ("All columns are dropped since they all have over 90% of "
                 "missing values. Aborting foreshadow.")
    assert error_msg in str(excinfo.value)
Example #6
def test_transformer_wrapper_function():
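    """Compare the wrapped foreshadow StandardScaler against the plain
    sklearn StandardScaler on a single column."""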
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from foreshadow.concrete import StandardScaler as CustomScaler

    boston_path = get_file_path("data", "boston_housing.csv")

    df = pd.read_csv(boston_path)

    custom = CustomScaler()
    sklearn = StandardScaler()

    custom.fit(df[["crim"]])
    sklearn.fit(df[["crim"]])

    custom_tf = custom.transform(df[["crim"]])
    sklearn_tf = sklearn.transform(df[["crim"]])

    assert np.array_equal(custom_tf.values, sklearn_tf)

    custom_tf = custom.fit_transform(df[["crim"]])
    sklearn_tf = sklearn.fit_transform(df[["crim"]])

    assert np.array_equal(custom_tf.values, sklearn_tf)
Example #7
def test_foreshadow_param_optimize():  # TODO: Make this test faster
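    """Map the search space of a JSON-configured pipeline and compare
    its keys against a pickled truth file."""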
    import pickle
    import json

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_json_path = get_file_path("configs", "optimizer_test.json")

    truth_path = get_file_path("configs", "search_space_optimize.pkl")

    data = pd.read_csv(boston_path)
    with open(test_json_path, "r") as fopen:
        js = json.load(fopen)

    fs = Foreshadow(
        DataPreparer(from_json=js),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    fs.pipeline = Pipeline([("preparer", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    # If you change the default configs or the file structure, you will
    # need to verify the outputs manually and regenerate the pickled
    # truth file.
    with open(truth_path, "rb") as fopen:
        truth = pickle.load(fopen)

    assert results[0].keys() == truth[0].keys()
Example #8
def test_console_generate_level3(filename, y_var, problem_type, estimator):
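    """Check that --level 3 generates a model backed by an AutoEstimator."""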
    data_path = get_file_path("data", filename)

    args = ["--level", "3", data_path, y_var, problem_type]

    model = generate_model(args)

    assert isinstance(model[0].estimator, AutoEstimator)
Example #9
def test_foreshadow_integration_data_cleaner_can_drop(filename, problem_type,
                                                      X_start, X_end, target,
                                                      tmpdir):
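    """Fit, pickle, and reload a pipeline, then check that both score
    consistently when a test-set column becomes entirely empty."""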
    from foreshadow.foreshadow import Foreshadow
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    np.random.seed(1337)

    data_path = get_file_path("data", filename)

    data = pd.read_csv(data_path)

    X_df = data.loc[:, X_start:X_end]
    y_df = data.loc[:, target]

    X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                        y_df,
                                                        test_size=0.2)

    from foreshadow.estimators import AutoEstimator

    estimator = AutoEstimator(
        problem_type=problem_type,
        auto="tpot",
        estimator_kwargs={"max_time_mins": 1},
    )

    shadow = Foreshadow(estimator=estimator, problem_type=problem_type)

    pickled_fitted_pipeline_location = tmpdir.join("fitted_pipeline.p")
    shadow.fit(X_train, y_train)
    shadow.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    import pickle

    with open(pickled_fitted_pipeline_location, "rb") as fopen:
        pipeline = pickle.load(fopen)

    # If there are new empty columns in the test set, the program should
    # not fail.
    X_test[X_start] = np.nan
    score1 = shadow.score(X_test, y_test)
    score2 = pipeline.score(X_test, y_test)

    import unittest

    assertions = unittest.TestCase("__init__")
    # Given the randomness of the TPOT algorithm and the short run time
    # configured, there is no guarantee that performance converges. The
    # test checks that both runs produce a reasonable score and that the
    # difference between them is small.
    # assert score1 > 0.76 and score2 > 0.76
    assertions.assertAlmostEqual(score1, score2, places=2)
Example #10
def test_smart_impute_simple_mean():
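    """Compare SimpleFillImputer mean imputation against a saved truth file."""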
    import numpy as np
    import pandas as pd
    from foreshadow.smart import SimpleFillImputer

    heart_path = get_file_path("data", "heart-h.csv")
    heart_impute_path = get_file_path("data", "heart-h_impute_mean.csv")

    impute = SimpleFillImputer()
    df = pd.read_csv(heart_path)

    data = df[["chol"]]

    impute.fit(data)
    out = impute.transform(data)
    truth = pd.read_csv(heart_impute_path, index_col=0)

    assert np.array_equal(out, truth)
Example #11
def test_console_generate_ignore_time():
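    """Expect a UserWarning that the time parameter is not applicable
    at level 2."""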
    from foreshadow.console import generate_model

    data_path = get_file_path("data", "boston_housing.csv")

    args = [data_path, "medv", "--level", "2", "--time", "20"]

    with pytest.warns(UserWarning, match="Time parameter not applicable"):
        generate_model(args)
Example #12
def test_smart_impute_multiple():
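    """Compare MultiImputer output on four numeric columns against a
    saved truth file."""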
    import numpy as np
    import pandas as pd
    from foreshadow.smart import MultiImputer

    heart_path = get_file_path("data", "heart-h.csv")
    heart_impute_path = get_file_path("data", "heart-h_impute_multi.csv")

    impute = MultiImputer()
    df = pd.read_csv(heart_path)

    data = df[["thalach", "chol", "trestbps", "age"]]

    impute.fit(data)
    out = impute.transform(data)
    truth = pd.read_csv(heart_impute_path, index_col=0)

    assert np.allclose(truth.values, out.values)
Example #13
def test_console_generate_invalid_file():
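    """Expect a ValueError when the given data file does not exist."""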
    from foreshadow.console import generate_model

    data_path = get_file_path("data", "missing_file.csv")
    args = ["--level", "5", data_path, "badtarget", "regression"]

    with pytest.raises(ValueError) as e:
        generate_model(args)

    assert "Failed to load file." in str(e.value)
Example #14
def test_transformer_fancy_impute_set_params():
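    """Check that FancyImputer forwards impute_kwargs to the underlying
    SimpleFill method."""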
    import numpy as np
    import pandas as pd
    from foreshadow.concrete import FancyImputer

    impute_kwargs = {"fill_method": "mean"}

    impute = FancyImputer(method="SimpleFill", impute_kwargs=impute_kwargs)
    heart_path = get_file_path("data", "heart-h.csv")
    heart_impute_path = get_file_path("data", "heart-h_impute_mean.csv")

    df = pd.read_csv(heart_path)

    data = df[["chol"]]

    impute.fit(data)
    out = impute.transform(data)
    truth = pd.read_csv(heart_impute_path, index_col=0)

    assert np.array_equal(out, truth)
Example #15
def test_console_generate_invalid_target():
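    """Expect a ValueError when the target column is not in the data."""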
    from foreshadow.console import generate_model

    data_path = get_file_path("data", "boston_housing.csv")

    args = ["--level", "5", data_path, "badtarget", "regression"]

    with pytest.raises(ValueError) as e:
        generate_model(args)

    assert "Invalid target variable" in str(e.value)
Example #16
def test_transformer_keep_cols():
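    """Check that keep_columns=True retains the original column alongside
    the scaled output."""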
    import pandas as pd
    from foreshadow.concrete import StandardScaler as CustomScaler

    boston_path = get_file_path("data", "boston_housing.csv")

    df = pd.read_csv(boston_path)

    custom = CustomScaler(keep_columns=True)
    custom_tf = custom.fit_transform(df[["crim"]])

    assert custom_tf.shape[1] == 2
Example #17
def test_console_parse_args_multiprocess():
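    """Check that the --multiprocess flag toggles the parsed option."""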
    from foreshadow.console import process_argument

    data_path = get_file_path("data", "boston_housing.csv")

    args = ["--level", "1", data_path, "medv", "regression"]
    cargs = process_argument(args)
    assert cargs.multiprocess is False

    args = ["--level", "1", "--multiprocess", data_path, "medv", "regression"]
    cargs = process_argument(args)
    assert cargs.multiprocess is True
Example #18
def test_transformer_naming_default():
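    """Check that the transformed output keeps the input column name."""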
    from foreshadow.concrete import StandardScaler
    import pandas as pd

    boston_path = get_file_path("data", "boston_housing.csv")

    df = pd.read_csv(boston_path)

    scaler = StandardScaler(keep_columns=False)
    out = scaler.fit_transform(df[["crim"]])

    assert out.iloc[:, 0].name == "crim"
Example #19
def test_foreshadow_param_optimize_invalid_array_idx():
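    """Expect a ValueError when an optimizer config indexes a list out
    of range."""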
    import json

    import pandas as pd

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "invalid_optimizer_config.json")

    data = pd.read_csv(boston_path)
    with open(test_path, "r") as fopen:
        cfg = json.load(fopen)

    fs = Foreshadow(
        DataPreparer(CacheManager(), from_json=cfg),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    fs.pipeline = Pipeline([("preprocessor", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    with pytest.raises(ValueError) as e:
        param_mapping(fs.pipeline, x_train, y_train)

    assert str(e.value).startswith("Attempted to index list")
Example #20
def test_console_generate_and_execute_model(filename, family, y_var,
                                            problem_type, estimator):
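    """Generate a model for the given estimator family and execute it."""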
    from foreshadow.console import generate_model, execute_model

    data_path = get_file_path("data", filename)

    args = ["--family", family, data_path, y_var, problem_type]

    model = generate_model(args)

    assert isinstance(model[0].estimator, estimator)

    execute_model(*model)
Example #21
def test_foreshadow_param_optimize_no_combinations():
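    """Map the search space of a pipeline with an empty config and
    compare its keys against a pickled truth file."""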
    import pickle

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from foreshadow.foreshadow import Foreshadow
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    from foreshadow.optimizers.param_mapping import param_mapping

    boston_path = get_file_path("data", "boston_housing.csv")
    test_path = get_file_path("configs", "search_space_no_combo.pkl")

    data = pd.read_csv(boston_path)

    fs = Foreshadow(
        DataPreparer(cache_manager=CacheManager(), from_json={}),
        False,
        LinearRegression(),
        ProblemType.REGRESSION,
        GridSearchCV,
    )

    fs.pipeline = Pipeline([("preprocessor", fs.X_preparer),
                            ("estimator", fs.estimator)])

    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    x_train, _, y_train, _ = train_test_split(x, y, test_size=0.25)

    results = param_mapping(fs.pipeline, x_train, y_train)

    with open(test_path, "rb") as fopen:
        truth = pickle.load(fopen)

    assert results[0].keys() == truth[0].keys()
Example #22
def test_smarttransformer_attributeerror(smart_child, mocker):
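    """Expect TransformerNotFound when pick_transformer returns an
    invalid transformer name."""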
    import pandas as pd
    from foreshadow.exceptions import TransformerNotFound

    boston_path = get_file_path("data", "boston_housing.csv")

    df = pd.read_csv(boston_path)

    smart = smart_child()
    smart.pick_transformer = mocker.Mock()
    smart.pick_transformer.return_value = "INVALID"

    with pytest.raises(TransformerNotFound):
        smart.fit(df[["crim"]])
Example #23
def test_foreshadow_adults_classification():
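    """Fit and score Foreshadow on the full adult census dataset."""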
    X_train, X_test, y_train, y_test = train_test_split_local_file_common(
        file_path=get_file_path("data", "adult.csv"),
        X_start="age",
        X_end="native-country",
        target="class",
    )
    shadow = construct_foreshadow_object_common(
        problem_type=ProblemType.CLASSIFICATION)

    shadow.fit(X_train, y_train)

    score = shadow.score(X_test, y_test)
    print(score)
Example #24
def test_foreshadow_param_optimize_fit(mocker):
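    """Check how fitting with a search optimizer populates the final
    pipeline step, with and without data preparers."""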
    import pandas as pd
    from foreshadow.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection._search import BaseSearchCV

    from foreshadow.foreshadow import Foreshadow

    boston_path = get_file_path("data", "boston_housing.csv")
    data = pd.read_csv(boston_path)

    class DummyRegressor(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    class DummySearch(BaseSearchCV):
        def __init__(self, estimator, params):
            self.best_estimator_ = estimator

        def fit(self, X, y=None, **fit_params):
            return self

    class DummyDataPreparer(BaseEstimator, TransformerMixin):
        def fit(self, X, y):
            return self

    mocker.patch("foreshadow.preparer.DataPreparer",
                 return_value=DummyDataPreparer)

    fs = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )
    x = data.drop(["medv"], axis=1, inplace=False)
    y = data[["medv"]]

    fs.fit(x, y)
    assert isinstance(fs.pipeline.steps[-1][1].estimator, DummyRegressor)

    fs2 = Foreshadow(
        problem_type=ProblemType.REGRESSION,
        X_preparer=False,
        y_preparer=False,
        estimator=DummyRegressor(),
        optimizer=DummySearch,
    )

    fs2.fit(x, y)
    assert isinstance(fs2.pipeline.steps[-1][1], DummyRegressor)
Example #25
def test_smart_impute_simple_none():
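    """Check that SimpleFillImputer leaves the column unchanged, NaNs
    included, for the given threshold."""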
    import numpy as np
    import pandas as pd
    from foreshadow.smart import SimpleFillImputer

    heart_path = get_file_path("data", "heart-h.csv")

    impute = SimpleFillImputer(threshold=0.05)
    df = pd.read_csv(heart_path)

    data = df[["chol"]]

    impute.fit(data)
    out = impute.transform(data)

    assert np.allclose(data, out, equal_nan=True)
Example #26
def test_console_generate_ignore_method():
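    """Expect a UserWarning that the method argument is ignored at level 3."""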
    from foreshadow.console import generate_model

    data_path = get_file_path("data", "boston_housing.csv")

    args = [
        "--level",
        "3",
        data_path,
        "medv",
        "regression",
        "--method",
        "method",
    ]

    with pytest.warns(UserWarning, match="Method will be ignored"):
        generate_model(args)
Example #27
def test_data_preparer_fit(cleaner_kwargs):
    """Test fitting of DataPreparer after creation with kwargs.

    Args:
          cleaner_kwargs: kwargs to CleanerMapper step

    """
    from foreshadow.preparer import DataPreparer
    from foreshadow.cachemanager import CacheManager
    import pandas as pd

    boston_path = get_file_path("data", "boston_housing.csv")
    data = pd.read_csv(boston_path)

    cs = CacheManager()
    dp = DataPreparer(cs, cleaner_kwargs=cleaner_kwargs)
    dp.fit(data)
Example #28
def test_smart_impute_multiple_none():
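    """Check that MultiImputer builds a null pipeline when the columns
    need no imputation."""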
    import pandas as pd
    from sklearn.pipeline import Pipeline
    from foreshadow.smart import MultiImputer
    from foreshadow.utils import PipelineStep

    boston_path = get_file_path("data", "boston_housing.csv")

    impute = MultiImputer()
    df = pd.read_csv(boston_path)

    data = df[["crim", "nox", "indus"]]

    impute.fit(data)
    impute.transform(data)

    assert isinstance(impute.transformer, Pipeline)
    assert impute.transformer.steps[0][PipelineStep["NAME"]] == "null"
Example #29
def test_smarttransformer_function_override(smart_child):
    """Test SmartTransformer override through parameter specification.

    Args:
        smart_child: A subclass of SmartTransformer.

    """
    import numpy as np
    import pandas as pd

    from foreshadow.concrete import SimpleImputer

    boston_path = get_file_path("data", "boston_housing.csv")
    df = pd.read_csv(boston_path)

    smart = smart_child(
        transformer="SimpleImputer",
        name="impute",
        cache_manager=CacheManager(),
    )
    smart_data = smart.fit_transform(df[["crim"]])

    assert isinstance(smart.transformer, SimpleImputer)
    # The smart.transformer.name == "impute" check is no longer relevant.

    std = SimpleImputer()
    std_data = std.fit_transform(df[["crim"]])

    assert smart_data.equals(std_data)

    smart.fit(df[["crim"]])
    smart_data = smart.transform(df[["crim"]])

    std.fit(df[["crim"]])
    std_data = std.transform(df[["crim"]])

    assert std_data.columns[0] == "crim"

    # TODO: remove when SmartTransformer is no longer wrapped.
    # Column names will differ, so np.allclose() is used instead of equals().
    assert np.allclose(smart_data, std_data)
Example #30
def test_get_config_only_sys():
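    """Compare the resolved default config against a pickled truth file."""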
    import pickle

    from foreshadow.config import config
    from foreshadow.utils.testing import get_file_path

    resolved = config.get_config()

    test_data_path = get_file_path("configs", "configs_default.pkl")

    # If you change the default configs or the file structure, you will
    # need to verify the outputs manually and regenerate the pickled
    # truth file.
    # with open(test_data_path, "wb") as fopen:
    #     pickle.dump(config[cfg_hash], fopen)

    with open(test_data_path, "rb") as fopen:
        test_data = pickle.load(fopen)

    assert resolved == test_data