def test_error_if_all_constant_and_quasi_constant_features():
    """Check that fitting raises ``ValueError`` when no column can survive.

    Every column of the input either holds a single value (``col1``,
    ``col2``) or holds the same value in 3 of its 4 rows (``col3``,
    ``col4``).
    """
    # test case 7: when input contains all constant and quasi constant features
    df = pd.DataFrame({
        "col1": [1, 1, 1, 1],
        "col2": [1, 1, 1, 1],
        "col3": [1, 1, 1, 2],
        "col4": [1, 1, 1, 2],
    })
    transformer = DropConstantFeatures(tol=0.7)

    # Only the fit is expected to fail. The frame and the transformer are
    # built outside the context manager so that a ValueError raised while
    # constructing either of them cannot make the test pass by accident.
    with pytest.raises(ValueError):
        transformer.fit_transform(df)
def test_drop_constant_features_with_list_of_variables(df_constant_features):
    """Check that only listed variables are examined and dropped.

    The transformer is restricted to ``Name``, ``const_feat_num`` and
    ``quasi_feat_num``. The expected output therefore still contains
    ``const_feat_cat`` and ``quasi_feat_cat``, which are not in the list.

    Parameters
    ----------
    df_constant_features : pandas.DataFrame
        Fixture supplied by the test suite; the assertions below expect it to
        have 4 rows and 9 columns.
    """
    # test case 3: drop features showing threshold more than 0.7 with variable list
    transformer = DropConstantFeatures(
        tol=0.7, variables=["Name", "const_feat_num", "quasi_feat_num"])
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        # "min" is the minute alias; the older spelling "T" is deprecated
        # since pandas 2.2 and produces the same timestamps.
        "dob": pd.date_range("2020-02-24", periods=4, freq="min"),
        "const_feat_cat": ["a", "a", "a", "a"],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == [
        "Name", "const_feat_num", "quasi_feat_num"
    ]

    # fit attr
    assert transformer.constant_features_ == [
        "const_feat_num", "quasi_feat_num"
    ]
    assert transformer.input_shape_ == (4, 9)

    # transform params
    pd.testing.assert_frame_equal(X, df)
def clean_data(X):
    """Impute, encode, cap, rebalance and prune the raw data frame.

    The pipeline runs in this order:

    1. drop rows without a ``target`` and split the target off;
    2. drop ``ID`` and convert ``v22`` with ``az_to_int``;
    3. median-impute numeric columns; fill, rare-label-group and
       frequency-encode object columns;
    4. winsorize with quantile capping on both tails;
    5. randomly undersample (``sampling_strategy=0.7``, fixed seed);
    6. drop quasi-constant, duplicated and highly correlated features,
       printing what each step removes.

    The work is done on a copy of ``X``, so the caller's frame is never
    modified and the function can be called again on the same input.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data. The code reads the columns ``target``, ``ID`` and ``v22``.

    Returns
    -------
    pandas.DataFrame
        The resampled, reduced feature frame with the resampled target
        re-attached as the ``target`` column.
    """
    # Copy up front: the in-place operations below used to strip 'target' and
    # 'ID' from (and partially encode) the caller's own DataFrame.
    data = X.copy()

    data.dropna(subset=['target'], inplace=True)
    y = data.pop('target')
    data.drop(columns='ID', inplace=True)
    data['v22'] = data['v22'].apply(az_to_int)

    # Column groups are taken after the v22 conversion so that v22 is
    # classified by its converted dtype.
    cat_cols = data.select_dtypes(include=['object']).columns.tolist()
    con_cols = data.select_dtypes(include=['number']).columns.tolist()

    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")

    data[con_cols] = num_missing_imputer.fit_transform(data[con_cols])
    data[cat_cols] = cat_missing_imputer.fit_transform(data[cat_cols])
    data[cat_cols] = rare_label_encoder.fit_transform(data[cat_cols])
    data[cat_cols] = cat_freq_encoder.fit_transform(data[cat_cols])

    # more cleaning
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    data = trimmer.fit_transform(data)

    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    data, y_resampled = undersampler.fit_resample(data, y)

    quasi_constant = DropConstantFeatures(tol=0.998)
    data = quasi_constant.fit_transform(data)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features
    duplicates = DropDuplicateFeatures()
    data = duplicates.fit_transform(data)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    data = drop_corr.fit_transform(data)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    print(f"Dropping correlared features {drop_corr.features_to_drop_}")

    # Re-attach the resampled target so features and label travel together.
    data['target'] = y_resampled
    return data
# Example #4
# 0
def test_drop_constant_and_quasiconstant_features(df_constant_features):
    """Check that ``tol=0.7`` drops the ``const_*`` and ``quasi_*`` columns.

    With ``variables=None`` the assertions expect the transformer to have
    examined all nine input columns and to have removed ``const_feat_num``,
    ``const_feat_cat``, ``quasi_feat_num`` and ``quasi_feat_cat``.

    Parameters
    ----------
    df_constant_features : pandas.DataFrame
        Fixture supplied by the test suite; the assertions below expect it to
        have 4 rows and 9 columns.
    """
    transformer = DropConstantFeatures(tol=0.7, variables=None)
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "City": ["London", "Manchester", "Liverpool", "Bristol"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
            # "min" is the minute alias; the older spelling "T" is deprecated
            # since pandas 2.2 and produces the same timestamps.
            "dob": pd.date_range("2020-02-24", periods=4, freq="min"),
        }
    )

    # init params
    assert transformer.tol == 0.7
    assert transformer.variables == [
        "Name",
        "City",
        "Age",
        "Marks",
        "dob",
        "const_feat_num",
        "const_feat_cat",
        "quasi_feat_num",
        "quasi_feat_cat",
    ]

    # fit attr
    assert transformer.constant_features_ == [
        "const_feat_num",
        "const_feat_cat",
        "quasi_feat_num",
        "quasi_feat_cat",
    ]
    assert transformer.input_shape_ == (4, 9)

    # transform params
    pd.testing.assert_frame_equal(X, df)
def test_drop_constant_features(df_constant_features):
    """Check that ``tol=1`` drops only the ``const_*`` columns.

    The expected output keeps ``quasi_feat_num`` and ``quasi_feat_cat``,
    each of which holds two distinct values, and omits ``const_feat_num``
    and ``const_feat_cat``.

    Parameters
    ----------
    df_constant_features : pandas.DataFrame
        Fixture supplied by the test suite.
    """
    transformer = DropConstantFeatures(tol=1, variables=None)
    X = transformer.fit_transform(df_constant_features)

    # expected result
    df = pd.DataFrame({
        "Name": ["tom", "nick", "krish", "jack"],
        "City": ["London", "Manchester", "Liverpool", "Bristol"],
        "Age": [20, 21, 19, 18],
        "Marks": [0.9, 0.8, 0.7, 0.6],
        # "min" is the minute alias; the older spelling "T" is deprecated
        # since pandas 2.2 and produces the same timestamps.
        "dob": pd.date_range("2020-02-24", periods=4, freq="min"),
        "quasi_feat_num": [1, 1, 1, 2],
        "quasi_feat_cat": ["a", "a", "a", "b"],
    })

    # fit attribute
    assert transformer.features_to_drop_ == [
        "const_feat_num", "const_feat_cat"
    ]

    # transform output
    pd.testing.assert_frame_equal(X, df)