Exemple #1
0
def test_nans():
    """Check how AutoFeatRegressor reacts to NaNs in its input.

    The test expects ``fit_transform`` and ``predict`` to raise a
    ``ValueError`` when the data contains NaNs (the original author
    attributes this to the underlying sklearn model), while ``transform``
    must accept them and keep the NaNs in the original feature columns.
    """
    # Deliberately awkward column names: a dotted/space name, an int and a
    # name containing an operator character.
    columns = ["x 1.1", 2, "x/3"]

    X, target = get_random_data()
    # Inject one NaN into each of the last two rows; rows 0..899 are not
    # touched, so they can be used for a regular fit below.
    X[998, 0] = np.nan
    X[999, 1] = np.nan
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=3)

    # Fitting on data that contains NaNs must be rejected.
    try:
        _ = afreg.fit_transform(pd.DataFrame(X, columns=columns),
                                pd.DataFrame(target))
    except ValueError:
        pass
    else:
        raise AssertionError("fit with NaNs should throw an error")

    # Fit on the untouched part so that predict/transform can be exercised.
    _ = afreg.fit_transform(pd.DataFrame(X[:900], columns=columns),
                            pd.DataFrame(target[:900]))

    # Predicting on the slice that holds the NaN rows must be rejected too.
    try:
        _ = afreg.predict(pd.DataFrame(X[900:], columns=columns))
    except ValueError:
        pass
    else:
        raise AssertionError("predict with NaNs should throw an error")

    # transform, in contrast, has to cope with NaNs.
    df = afreg.transform(pd.DataFrame(X, columns=columns))
    assert all([pd.isna(df.iloc[998, 0]),
                pd.isna(df.iloc[999, 1])]), "Original features should be NaNs"
    # Each affected row must contain at least two NaNs in the transformed
    # frame. The message now states the threshold that is actually checked
    # (it previously claimed "3" for row 999 while testing ">= 2").
    for row in (998, 999):
        n_missing = np.sum(np.array(pd.isna(df.iloc[row]), dtype=int))
        assert n_missing >= 2, \
            f"There should be at least 2 NaNs in row {row}"
Exemple #2
0
def test_categorical_cols():
    """Exercise the ``categorical_cols`` option of AutoFeatRegressor.

    Two things are checked: naming a categorical column that is missing
    from the data frame must raise a ``ValueError``, and a valid categorical
    column must be replaced by ``cat_<name>_<value>`` columns in the output
    of both ``fit_transform`` and ``transform``.
    """
    col_names = ["x1", "x2", "x3", "x4"]
    expected_dummies = ["cat_x4_2.0", "cat_x4_4.0", "cat_x4_5.0"]

    # Synthetic data: three continuous features plus one integer feature
    # (x4) that only takes the values 4, 5 and 2.
    np.random.seed(15)
    x1 = np.random.rand(1000)
    x2 = np.random.randn(1000)
    x3 = np.random.rand(1000)
    x4 = np.array([4] * 200 + [5] * 300 + [2] * 500, dtype=int)
    target = 2 + 15 * x1 + 3 / (x2 - 1 / x3) + 5 * (x2 + np.log(x1))**3 + x4
    X = np.vstack([x1, x2, x3, x4]).T

    def make_frame():
        # A fresh frame for every call, exactly like the inline constructions.
        return pd.DataFrame(X, columns=col_names)

    def check_dummies(frame):
        # Columns 3..5 must be the dummy columns; the raw column must be gone.
        assert list(frame.columns)[3:6] == expected_dummies, \
            "categorical_cols were not transformed correctly"
        assert "x4" not in frame.columns, \
            "categorical_cols weren't deleted from df"

    # "x5" does not exist in the frame, so fitting has to fail.
    broken = AutoFeatRegressor(verbose=1,
                               categorical_cols=["x4", "x5"],
                               feateng_steps=3)
    rejected = False
    try:
        broken.fit_transform(make_frame(), target)
    except ValueError:
        rejected = True
    if not rejected:
        raise AssertionError(
            "categorical_cols not in df should throw an error")

    # With a valid categorical column, fit_transform and transform must agree.
    afreg = AutoFeatRegressor(verbose=1,
                              categorical_cols=["x4"],
                              feateng_steps=3)
    check_dummies(afreg.fit_transform(make_frame(), target))
    check_dummies(afreg.transform(make_frame()))

    r2 = afreg.score(make_frame(), target)
    assert r2 >= 0.999, "R^2 should be 1."
Exemple #3
0
def test_do_almost_nothing():
    """With zero engineering steps and zero selection runs, expect a no-op.

    Both ``fit_transform`` and ``transform`` must return a frame whose
    columns are exactly the original input columns.
    """
    names = ["x1", "x2", "x3"]
    expected = list(names)

    X, target = get_random_data()
    afreg = AutoFeatRegressor(verbose=1, feateng_steps=0, featsel_runs=0)

    fitted = afreg.fit_transform(pd.DataFrame(X, columns=names), target)
    assert list(fitted.columns) == expected, "Only original columns"

    transformed = afreg.transform(pd.DataFrame(X, columns=names))
    assert list(transformed.columns) == expected, "Only original columns"
Exemple #4
0
    def regression(self):
        """Run the autofeat regression experiment and persist its results.

        Data is obtained from ``self.dataFunction``. An ``AutoFeatRegressor``
        is fitted on the training split and dumped under ``self.savePath``;
        four regressors are then trained on the extended training features
        and their mean squared errors on the extended test features are
        pickled together with ``model.new_feat_cols_``.

        Returns:
            The fitted ``AutoFeatRegressor`` instance.
        """
        (X_train, X_test, y_train, y_test,
         categoricalCols) = self.dataFunction(
             preprocessed=True,
             specifics="AUTOFEAT",
             trainSize=self.trainSize,
             steps=self.steps,
             nDataPoints=self.nDataPoints)

        # None is passed through to AutoFeatRegressor's feateng_cols argument.
        featureEngColumns = None
        # If feature engineering not wanted for categorical values, uncomment
        # if categoricalCols is not None:
        #    featureEngColumns = X_train.columns.values.tolist()
        #    featureEngColumns = [i for i in featureEngColumns + categoricalCols if i not in featureEngColumns or i not in categoricalCols]

        # Runtime measurement starts before the model is even constructed.
        start_time = time.time()
        print(f"Start time: {start_time}")

        # Automated feature engineering with autofeat
        model = AutoFeatRegressor(
            verbose=1,
            feateng_steps=self.feateng_steps,
            featsel_runs=self.featuresel_steps,
            categorical_cols=categoricalCols,
            feateng_cols=featureEngColumns,
        )

        # Fitting returns the training frame extended by the new features.
        x_train_extended = model.fit_transform(X_train, y_train)
        # Elapsed wall-clock time, truncated to whole minutes.
        total_time = int((time.time() - start_time) // 60)
        print(f"Time: {total_time}")

        # Export model
        model_path = f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_time{total_time}_model.joblib"
        dump(model, model_path)

        x_test_extended = model.transform(X_test)

        # Downstream regressors, evaluated in this order. Each one is only
        # instantiated right before it is fitted.
        candidates = (
            ("DecisionTree", DecisionTreeRegressor, {}),
            ("RandomForest", RandomForestRegressor, {"n_estimators": 10}),
            ("LinearRegression", LinearRegression, {}),
            ("LassoLarsCV", LassoLarsCV, {"cv": 5}),
        )

        predictions = {}
        for label, estimator_cls, params in candidates:
            predictionModel = estimator_cls(**params)
            predictionModel.fit(x_train_extended, y_train)
            predictions[label] = mean_squared_error(
                y_test, predictionModel.predict(x_test_extended))
            print(f"Final MSE prediction score: {predictions[label]}")

        # Additionally save transformations steps since not saved in joblib file
        predictions["new_features"] = model.new_feat_cols_

        # Export predictions
        performance_path = f"{self.savePath}/feng{model.feateng_steps}_fsel{model.featsel_runs}_performance.pkl"
        with open(performance_path, 'wb') as file:
            pickle.dump(predictions, file)

        return model