Exemple #1
0
def test_regular_df_X_y():
    """Fit on DataFrames built without explicit column names.

    Checks that the fitted regressor scores at least 0.999 on both the raw
    and the transformed inputs, and that the first three output columns are
    named "0", "1" and "2".
    """
    features, y = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)
    transformed = model.fit_transform(pd.DataFrame(features), pd.DataFrame(y))

    # Score on a freshly built frame of the original inputs ...
    raw_score = model.score(pd.DataFrame(features), y)
    assert raw_score >= 0.999, "R^2 should be 1."

    # ... and on the frame returned by fit_transform.
    transformed_score = model.score(transformed, y)
    assert transformed_score >= 0.999, "R^2 should be 1."

    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == ["0", "1", "2"], "Wrong column names"
Exemple #2
0
def test_regular_X_y():
    """Fit directly on the arrays returned by ``get_random_data``.

    Checks that the fitted regressor scores at least 0.999 on both the raw
    and the transformed inputs, and that the first three output columns are
    named "x000", "x001" and "x002".
    """
    features, y = get_random_data()
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)
    transformed = model.fit_transform(features, y)

    raw_score = model.score(features, y)
    assert raw_score >= 0.999, "R^2 should be 1."

    transformed_score = model.score(transformed, y)
    assert transformed_score >= 0.999, "R^2 should be 1."

    expected_columns = ["x000", "x001", "x002"]
    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == expected_columns, "Wrong column names"
Exemple #3
0
def test_weird_colnames():
    """Fit on a DataFrame whose column labels contain spaces, dots, slashes and an int.

    Checks the score, that the first three output columns are the
    stringified labels, and that scoring a frame with a different set of
    labels raises ``ValueError``.
    """
    features, y = get_random_data()
    fitted_labels = ("x 1.1", 2, "x/3")
    model = AutoFeatRegressor(verbose=1, feateng_steps=3)

    # A fresh frame (and a fresh label list) is built for every call.
    transformed = model.fit_transform(
        pd.DataFrame(features, columns=list(fitted_labels)),
        pd.DataFrame(y),
    )
    score = model.score(pd.DataFrame(features, columns=list(fitted_labels)), y)
    assert score >= 0.999, "R^2 should be 1."

    leading_columns = list(transformed.columns)[:3]
    assert leading_columns == ["x 1.1", "2", "x/3"], "Wrong column names"

    # Scoring with labels that differ from the fitted ones must be rejected.
    mismatch_rejected = False
    try:
        model.score(pd.DataFrame(features, columns=["x 11", 2, "x/3"]), y)
    except ValueError:
        mismatch_rejected = True
    if not mismatch_rejected:
        raise AssertionError("Should throw error on mismatch column names")
Exemple #4
0
def test_categorical_cols():
    """Exercise the ``categorical_cols`` option of ``AutoFeatRegressor``.

    First verifies that naming a column that is absent from the frame
    ("x5") makes ``fit_transform`` raise ``ValueError``. Then fits with
    only "x4" declared categorical and checks, for both ``fit_transform``
    and ``transform``, that columns 3..5 of the output are the
    "cat_x4_*" columns and that "x4" itself is gone. Finally checks the
    score.
    """
    n_samples = 1000
    np.random.seed(15)
    # The draw order (rand, randn, rand) determines the generated data.
    x1 = np.random.rand(n_samples)
    x2 = np.random.randn(n_samples)
    x3 = np.random.rand(n_samples)
    x4 = np.array([4] * 200 + [5] * 300 + [2] * 500, dtype=int)

    # Terms are summed left to right, exactly as in the one-line formula.
    linear_term = 15 * x1
    ratio_term = 3 / (x2 - 1 / x3)
    cubic_term = 5 * (x2 + np.log(x1))**3
    target = 2 + linear_term + ratio_term + cubic_term + x4

    X = np.vstack((x1, x2, x3, x4)).T

    def build_frame():
        # Return a brand-new frame on every call so no call sees another's input.
        return pd.DataFrame(X, columns=["x1", "x2", "x3", "x4"])

    # "x5" does not exist in the frame, so fitting has to fail.
    bad_model = AutoFeatRegressor(verbose=1,
                                  categorical_cols=["x4", "x5"],
                                  feateng_steps=3)
    missing_col_rejected = False
    try:
        bad_model.fit_transform(build_frame(), target)
    except ValueError:
        missing_col_rejected = True
    if not missing_col_rejected:
        raise AssertionError(
            "categorical_cols not in df should throw an error")

    model = AutoFeatRegressor(verbose=1,
                              categorical_cols=["x4"],
                              feateng_steps=3)
    expected_cat_columns = ["cat_x4_2.0", "cat_x4_4.0", "cat_x4_5.0"]

    fitted = model.fit_transform(build_frame(), target)
    assert list(fitted.columns)[3:6] == expected_cat_columns, \
        "categorical_cols were not transformed correctly"
    assert "x4" not in fitted.columns, "categorical_cols weren't deleted from df"

    retransformed = model.transform(build_frame())
    assert list(retransformed.columns)[3:6] == expected_cat_columns, \
        "categorical_cols were not transformed correctly"
    assert "x4" not in retransformed.columns, "categorical_cols weren't deleted from df"

    score = model.score(build_frame(), target)
    assert score >= 0.999, "R^2 should be 1."
Exemple #5
0
def test_units():
    """Fit with a ``units`` mapping for two of the three columns and check the score."""
    n_samples = 1000
    np.random.seed(15)
    # The draw order (rand, randn, rand) determines the generated data.
    x1 = np.random.rand(n_samples)
    x2 = np.random.randn(n_samples)
    x3 = np.random.rand(n_samples)

    # Terms are summed left to right, exactly as in the one-line formula.
    linear_term = 15 * x1
    ratio_term = 3 / (x2 - 1 / x3)
    cubic_term = 5 * (x2 * np.log(x1))**3
    target = 2 + linear_term + ratio_term + cubic_term

    X = np.vstack((x1, x2, x3)).T

    def build_frame():
        # Return a brand-new frame on every call.
        return pd.DataFrame(X, columns=["x1", "x2", "x3"])

    # "x1" is deliberately left without a unit entry.
    units = {"x2": "m/sec", "x3": "min/mm"}
    model = AutoFeatRegressor(verbose=1, units=units, feateng_steps=3)
    model.fit_transform(build_frame(), target)

    score = model.score(build_frame(), target)
    assert score >= 0.999, "R^2 should be 1."
Exemple #6
0
from autofeat import FeatureSelector, AutoFeatRegressor
from sklearn.pipeline import make_pipeline
import pickle
def main():
    """Load ``dataset.csv`` from the user's GitHub repository and print a summary.

    The repository is read from the ``GITHUB_REPOSITORY`` environment
    variable (a ``KeyError`` propagates if it is unset) and the file is
    fetched from the ``master`` branch via raw.githubusercontent.com.
    """
    url_prefix = "https://raw.githubusercontent.com/"
    url_suffix = "/master/dataset.csv"
    repository = os.environ["GITHUB_REPOSITORY"]
    dataset_path = url_prefix + repository + url_suffix

    dataset = pd.read_csv(dataset_path)
    # Blank line to separate the summary from any earlier output.
    print()
    print(dataset.describe())

# Fit and evaluate one AutoFeatRegressor per feateng_steps value in 0..4.
# Each run prints its score and draws its own scatter figure.
for steps in range(5):
    # reseed NumPy's global RNG with the same value before every run
    np.random.seed(55)
    print("### AutoFeat with %i feateng_steps" % steps)
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=steps)
    df = afreg.fit_transform(df_org, target)
    # score on the same (df_org, target) pair the model was just fitted on
    r2 = afreg.score(df_org, target)
    print("## Final R^2: %.4f" % r2)
    # new figure per run: model predictions plotted against the targets
    plt.figure()
    plt.scatter(afreg.predict(df_org), target, s=2);
    # the title reports the step count, the score and the number of entries in new_feat_cols_
    plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_)))
# Refit with 3 feature engineering steps, using target_noisy as the training target.
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
# train on noisy data
df = afreg.fit_transform(df_org, target_noisy)
# test on real targets
# (scoring and prediction here use the transformed frame `df`, not df_org)
print("Final R^2: %.4f" % afreg.score(df, target))
plt.figure()
plt.scatter(afreg.predict(df), target, s=2);
# Same setup again, this time fitted against target_very_noisy.
afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)
# train on noisy data
df = afreg.fit_transform(df_org, target_very_noisy)
# test on real targets