def test_all_strategies():
    """Check that every column is imputed with its own configured strategy.

    The last input row is entirely NaN, so the last row of the expected frame
    is produced purely by the per-column strategies: mean for ``A``, median
    for ``B``, the constant 100 for ``C`` and the most frequent value for
    ``D``.
    """
    column_names = ['A', 'B', 'C', 'D']
    complete_rows = [[2, 4, 6, "a"], [4, 6, 8, "a"], [6, 4, 8, "b"]]

    # Input: three complete rows followed by one row that is all NaN.
    X = pd.DataFrame(complete_rows + [[np.nan, np.nan, np.nan, np.nan]])
    X.columns = column_names

    # Expected: same complete rows, with the NaN row replaced by the fills.
    X_expected = pd.DataFrame(complete_rows + [[4, 4, 100, "a"]])
    X_expected.columns = column_names

    strategies = {
        'A': {"impute_strategy": "mean"},
        'B': {"impute_strategy": "median"},
        'C': {"impute_strategy": "constant", "fill_value": 100},
        'D': {"impute_strategy": "most_frequent"},
    }

    X_t = PerColumnImputer(impute_strategies=strategies).fit_transform(X)
    assert_frame_equal(X_expected, X_t, check_dtype=False)
def test_transform_drop_all_nan_columns():
    """Check that ``transform`` drops an all-NaN column and leaves ``X`` intact.

    The expected output has no ``all_nan`` column, and the input frame is
    compared against a freshly built copy afterwards to confirm it was not
    modified.
    """

    def build_input():
        # Fresh copy of the input data, used both as X and as the reference
        # for the "input was not modified" check at the end.
        return pd.DataFrame({
            "all_nan": [np.nan, np.nan, np.nan],
            "some_nan": [np.nan, 1, 0],
            "another_col": [0, 1, 2]
        })

    X = build_input()
    strategies = {
        column: {"impute_strategy": "most_frequent"}
        for column in ("all_nan", "some_nan", "another_col")
    }

    transformer = PerColumnImputer(impute_strategies=strategies)
    transformer.fit(X)

    X_expected_arr = pd.DataFrame({
        "some_nan": [0, 1, 0],
        "another_col": [0, 1, 2]
    })
    X_t = transformer.transform(X)
    assert_frame_equal(X_expected_arr, X_t, check_dtype=False)

    # The original frame must still hold its NaNs and its all-NaN column.
    assert_frame_equal(X, build_input())
def test_invalid_parameters():
    """Check that a tuple or a list passed as ``impute_strategies`` raises ``ValueError``."""
    invalid_values = (
        ("impute_strategy", 'mean'),  # a tuple instead of a mapping
        ['mean'],                     # a list instead of a mapping
    )
    for strategies in invalid_values:
        with pytest.raises(ValueError):
            PerColumnImputer(impute_strategies=strategies)
Ejemplo n.º 4
0
def test_per_column_imputer_woodwork_custom_overrides_returned_by_components(X_df, has_nan):
    """Check that a logical type set on column 0 is still reported after transform.

    For each candidate logical type, column 0 of ``X_df`` is wrapped in a
    woodwork ``DataTable`` with that override; the imputer output must be a
    ``DataTable`` whose ``logical_types`` equal ``{0: logical_type}``.

    Arguments:
        X_df: fixture providing the input frame (defined outside this block).
        has_nan: fixture flag; when truthy, a NaN is written into the last
            row of column 0 before the tables are built.
    """
    y = pd.Series([1, 2, 1])
    if has_nan:
        last_row = len(X_df) - 1
        X_df.iloc[last_row, 0] = np.nan

    for logical_type in (Integer, Double, Categorical, NaturalLanguage, Boolean):
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            # DataTable construction rejected this override for the given
            # data, so there is nothing to check for this logical type.
            continue

        imputer = PerColumnImputer()
        imputer.fit(X, y)
        transformed = imputer.transform(X, y)

        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {0: logical_type}
Ejemplo n.º 5
0
def test_transform_drop_all_nan_columns_empty():
    """Check that an all-NaN frame yields an empty result and is left untouched.

    Both the one-shot ``fit_transform`` path and the separate ``fit`` then
    ``transform`` path are exercised.
    """
    X = pd.DataFrame([[np.nan, np.nan, np.nan]])

    def new_transformer():
        # NOTE(review): the strategy key is the string '0', while the frame
        # above uses pandas' default integer column labels — confirm whether
        # the key is meant to match a column.
        return PerColumnImputer(impute_strategies={'0': {"impute_strategy": "most_frequent"}})

    def assert_input_untouched():
        assert_frame_equal(X, pd.DataFrame([[np.nan, np.nan, np.nan]]))

    # One-shot path.
    one_shot = new_transformer()
    assert one_shot.fit_transform(X).to_dataframe().empty
    assert_input_untouched()

    # Two-step path.
    two_step = new_transformer()
    two_step.fit(X)
    assert two_step.transform(X).to_dataframe().empty
    assert_input_untouched()
Ejemplo n.º 6
0
def test_all_strategies():
    """Check that every column is imputed with its own configured strategy.

    Each input column ends in a NaN. The expected fills are the mean for
    ``A``, the median for ``B``, the constant 100 for ``C`` and the most
    frequent value for ``D`` (expected as a categorical column).
    """
    # Observed (non-missing) values shared by the input and expected frames.
    observed = {
        "A": [2, 4, 6],
        "B": [4, 6, 4],
        "C": [6, 8, 8],
        "D": ["a", "a", "b"],
    }

    X = pd.DataFrame({
        name: pd.Series(values + [np.nan]) for name, values in observed.items()
    })

    X_expected = pd.DataFrame({
        "A": pd.Series(observed["A"] + [4]),
        "B": pd.Series(observed["B"] + [4]),
        "C": pd.Series(observed["C"] + [100]),
        "D": pd.Series(observed["D"] + ["a"], dtype="category"),
    })

    strategies = {
        'A': {"impute_strategy": "mean"},
        'B': {"impute_strategy": "median"},
        'C': {"impute_strategy": "constant", "fill_value": 100},
        'D': {"impute_strategy": "most_frequent"},
    }

    X_t = PerColumnImputer(impute_strategies=strategies).fit_transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe(), check_dtype=False)
Ejemplo n.º 7
0
def test_non_numeric_valid(non_numeric_df):
    """Check the strategies that are valid on non-numeric columns.

    Two scenarios are run against the same fixture frame:

    * ``most_frequent`` configured for column ``C``;
    * ``constant`` with ``fill_value=100`` configured for column ``D``.

    In both cases the transformed table, converted with ``to_dataframe()``,
    must equal a frame of categorical columns ``A``-``D``.

    Arguments:
        non_numeric_df: fixture defined outside this block; presumably a frame
            of string columns ``A``-``D`` containing a missing value — verify
            against the fixture definition.
    """
    X = non_numeric_df

    # most frequent with all strings
    strategies = {'C': {"impute_strategy": "most_frequent"}}
    transformer = PerColumnImputer(impute_strategies=strategies)

    X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "B": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "C": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "D": pd.Series(["a", "b", "a", "a"], dtype="category")})

    X_t = transformer.fit_transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())

    # constant with all strings
    strategies = {'D': {"impute_strategy": "constant", "fill_value": 100}}
    transformer = PerColumnImputer(impute_strategies=strategies)

    # Only the last value of D differs from the previous scenario: it is the
    # configured fill value 100 rather than the most frequent string.
    X_expected = pd.DataFrame({"A": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "B": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "C": pd.Series(["a", "b", "a", "a"], dtype="category"),
                               "D": pd.Series(["a", "b", "a", 100], dtype="category")})
    X_t = transformer.fit_transform(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
def test_fit_transform():
    """Check that ``fit`` followed by ``transform`` matches ``fit_transform``.

    Only the consistency of the two code paths is asserted here; the imputed
    values themselves are not compared against a fixed expected frame.
    """
    X = pd.DataFrame([[2], [4], [6], [np.nan]])
    X.columns = ['A']

    strategies = {'A': {"impute_strategy": "median"}}

    # Two-step path: fit, then transform, on one transformer instance.
    transformer = PerColumnImputer(impute_strategies=strategies)
    transformer.fit(X)
    X_t = transformer.transform(X)

    # One-shot path on a fresh instance so no fitted state is shared.
    transformer = PerColumnImputer(impute_strategies=strategies)
    X_fit_transform = transformer.fit_transform(X)

    assert_frame_equal(X_t, X_fit_transform, check_dtype=False)
def test_non_numeric_valid(non_numeric_df):
    """Check the strategies that are valid on non-numeric columns.

    Runs ``most_frequent`` on column ``C`` and then ``constant`` (fill value
    100) on column ``D`` against the same fixture frame, comparing each result
    with a hand-written expected frame (dtypes are not compared).

    Arguments:
        non_numeric_df: fixture defined outside this block; presumably a frame
            of string columns ``A``-``D`` — verify against the fixture.
    """
    X = non_numeric_df
    column_names = ['A', 'B', 'C', 'D']

    # most frequent with all strings
    most_frequent_imputer = PerColumnImputer(
        impute_strategies={'C': {"impute_strategy": "most_frequent"}})
    X_expected = pd.DataFrame(
        [
            ["a", "a", "a", "a"],
            ["b", "b", "b", "b"],
            ["a", "a", "a", "a"],
            ["a", "a", "a", "a"],
        ],
        columns=column_names,
    )
    assert_frame_equal(X_expected, most_frequent_imputer.fit_transform(X), check_dtype=False)

    # constant with all strings
    constant_imputer = PerColumnImputer(
        impute_strategies={'D': {"impute_strategy": "constant", "fill_value": 100}})
    X_expected = pd.DataFrame(
        [
            ["a", "a", "a", "a"],
            ["b", "b", "b", "b"],
            ["a", "a", "a", "a"],
            ["a", "a", "a", 100],
        ],
        columns=column_names,
    )
    assert_frame_equal(X_expected, constant_imputer.fit_transform(X), check_dtype=False)
Ejemplo n.º 10
0
def test_describe_component():
    """Check ``describe(return_dict=True)`` for transformers and estimators.

    Each component is instantiated (mostly with default arguments) and its
    description is compared with a literal ``{'name': ..., 'parameters': ...}``
    dict. XGBoost, CatBoost and LightGBM components are built inside
    ``try``/``except ImportError`` blocks, so an ``ImportError`` raised there
    skips the corresponding assertions instead of failing the test.
    """
    # Transformers / preprocessing components.
    enc = OneHotEncoder()
    imputer = Imputer()
    simple_imputer = SimpleImputer("mean")
    column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
    scaler = StandardScaler()
    feature_selection_clf = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    feature_selection_reg = RFRegressorSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
    drop_col_transformer = DropColumns(columns=['col_one', 'col_two'])
    drop_null_transformer = DropNullColumns()
    # NOTE(review): this local name shadows the stdlib ``datetime`` module
    # name within this function, if that module is imported by the file.
    datetime = DateTimeFeaturizer()
    text_featurizer = TextFeaturizer()
    lsa = LSA()
    pca = PCA()
    lda = LinearDiscriminantAnalysis()
    ft = DFSTransformer()
    us = Undersampler()
    assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10,
                                                                                        'features_to_encode': None,
                                                                                        'categories': None,
                                                                                        'drop': 'if_binary',
                                                                                        'handle_unknown': 'ignore',
                                                                                        'handle_missing': 'error'}}
    assert imputer.describe(return_dict=True) == {'name': 'Imputer', 'parameters': {'categorical_impute_strategy': "most_frequent",
                                                                                    'categorical_fill_value': None,
                                                                                    'numeric_impute_strategy': "mean",
                                                                                    'numeric_fill_value': None}}
    assert simple_imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean', 'fill_value': None}}
    assert column_imputer.describe(return_dict=True) == {'name': 'Per Column Imputer', 'parameters': {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}}
    assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}}
    assert feature_selection_clf.describe(return_dict=True) == {'name': 'RF Classifier Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert feature_selection_reg.describe(return_dict=True) == {'name': 'RF Regressor Select From Model', 'parameters': {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}}
    assert drop_col_transformer.describe(return_dict=True) == {'name': 'Drop Columns Transformer', 'parameters': {'columns': ['col_one', 'col_two']}}
    assert drop_null_transformer.describe(return_dict=True) == {'name': 'Drop Null Columns Transformer', 'parameters': {'pct_null_threshold': 1.0}}
    assert datetime.describe(return_dict=True) == {'name': 'DateTime Featurization Component',
                                                   'parameters': {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
                                                                  'encode_as_categories': False}}
    assert text_featurizer.describe(return_dict=True) == {'name': 'Text Featurization Component', 'parameters': {}}
    assert lsa.describe(return_dict=True) == {'name': 'LSA Transformer', 'parameters': {}}
    assert pca.describe(return_dict=True) == {'name': 'PCA Transformer', 'parameters': {'n_components': None, 'variance': 0.95}}
    assert lda.describe(return_dict=True) == {'name': 'Linear Discriminant Analysis Transformer', 'parameters': {'n_components': None}}
    assert ft.describe(return_dict=True) == {'name': 'DFS Transformer', 'parameters': {"index": "index"}}
    assert us.describe(return_dict=True) == {'name': 'Undersampler', 'parameters': {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1}}
    # testing estimators
    base_classifier = BaselineClassifier()
    base_regressor = BaselineRegressor()
    lr_classifier = LogisticRegressionClassifier()
    en_classifier = ElasticNetClassifier()
    en_regressor = ElasticNetRegressor()
    et_classifier = ExtraTreesClassifier(n_estimators=10, max_features="auto")
    et_regressor = ExtraTreesRegressor(n_estimators=10, max_features="auto")
    rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=3)
    rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_regressor = LinearRegressor()
    svm_classifier = SVMClassifier()
    svm_regressor = SVMRegressor()
    assert base_classifier.describe(return_dict=True) == {'name': 'Baseline Classifier', 'parameters': {'strategy': 'mode'}}
    assert base_regressor.describe(return_dict=True) == {'name': 'Baseline Regressor', 'parameters': {'strategy': 'mean'}}
    assert lr_classifier.describe(return_dict=True) == {'name': 'Logistic Regression Classifier', 'parameters': {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}}
    assert en_classifier.describe(return_dict=True) == {'name': 'Elastic Net Classifier', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}}
    assert en_regressor.describe(return_dict=True) == {'name': 'Elastic Net Regressor', 'parameters': {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}}
    assert et_classifier.describe(return_dict=True) == {'name': 'Extra Trees Classifier', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert et_regressor.describe(return_dict=True) == {'name': 'Extra Trees Regressor', 'parameters': {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}}
    assert rf_classifier.describe(return_dict=True) == {'name': 'Random Forest Classifier', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert rf_regressor.describe(return_dict=True) == {'name': 'Random Forest Regressor', 'parameters': {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}}
    assert linear_regressor.describe(return_dict=True) == {'name': 'Linear Regressor', 'parameters': {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}}
    assert svm_classifier.describe(return_dict=True) == {'name': 'SVM Classifier', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}}
    assert svm_regressor.describe(return_dict=True) == {'name': 'SVM Regressor', 'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}}
    # XGBoost components: an ImportError raised in this block skips its checks.
    try:
        xgb_classifier = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgb_regressor = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        assert xgb_classifier.describe(return_dict=True) == {'name': 'XGBoost Classifier', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
        assert xgb_regressor.describe(return_dict=True) == {'name': 'XGBoost Regressor', 'parameters': {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}}
    except ImportError:
        pass
    # CatBoost components: same ImportError handling as above.
    # NOTE(review): the expected 'silent' value differs between classifier
    # (True) and regressor (False) — confirm this matches the components.
    try:
        cb_classifier = CatBoostClassifier()
        cb_regressor = CatBoostRegressor()
        assert cb_classifier.describe(return_dict=True) == {'name': 'CatBoost Classifier', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True}}
        assert cb_regressor.describe(return_dict=True) == {'name': 'CatBoost Regressor', 'parameters': {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': False}}
    except ImportError:
        pass
    # LightGBM components: same ImportError handling as above.
    try:
        lg_classifier = LightGBMClassifier()
        lg_regressor = LightGBMRegressor()
        assert lg_classifier.describe(return_dict=True) == {'name': 'LightGBM Classifier', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31,
                                                                                                          'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
        assert lg_regressor.describe(return_dict=True) == {'name': 'LightGBM Regressor', 'parameters': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20, 'max_depth': 0, 'num_leaves': 31,
                                                                                                        'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}}
    except ImportError:
        pass
def test_non_numeric_errors(non_numeric_df):
    """Check that mean and median strategies raise ``ValueError`` on the fixture.

    For each strategy, both ``fit_transform`` and ``fit`` are expected to
    raise ``ValueError`` with the strategy-specific message.

    Arguments:
        non_numeric_df: fixture defined outside this block; the inline
            comments describe its columns as containing only strings.
    """
    # test col with all strings
    X = non_numeric_df

    cases = (
        # mean with all strings
        ({'A': {"impute_strategy": "mean"}},
         "Cannot use mean strategy with non-numeric data"),
        # median with all strings
        ({'B': {"impute_strategy": "median"}},
         "Cannot use median strategy with non-numeric data"),
    )

    for strategies, message in cases:
        # Each entry point gets its own freshly constructed transformer.
        for method_name in ("fit_transform", "fit"):
            with pytest.raises(ValueError, match=message):
                transformer = PerColumnImputer(impute_strategies=strategies)
                getattr(transformer, method_name)(X)