Example no. 1
0
def test_user_provides_grouping_label_name_and_variable_list(df_enc_big):
    """Case 2: user supplies a custom grouping label and an explicit variable list."""
    encoder = RareLabelEncoder(
        tol=0.15, n_categories=5, variables=["var_A", "var_B"], replace_with="Other"
    )
    X = encoder.fit_transform(df_enc_big)

    # Expected frame: infrequent labels in the listed columns become "Other";
    # var_C keeps all of its original labels because it was not listed.
    expected = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["Other"] * 4 + ["D"] * 10
            + ["Other"] * 4 + ["G"] * 6,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["Other"] * 4 + ["D"] * 10
            + ["Other"] * 4 + ["G"] * 6,
            "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10
            + ["E"] * 2 + ["F"] * 2 + ["G"] * 6,
        }
    )

    # init params are stored unchanged
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == "Other"
    assert encoder.variables == ["var_A", "var_B"]
    # attributes learned during fit
    assert encoder.variables_ == ["var_A", "var_B"]
    assert encoder.n_features_in_ == 3
    # transform output
    pd.testing.assert_frame_equal(X, expected)
Example no. 2
0
def test_defo_params_plus_automatically_find_variables(df_enc_big):
    """Case 1: default-style params; the encoder selects variables automatically."""
    encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )
    X = encoder.fit_transform(df_enc_big)

    # Expected frame: low-frequency labels in every column collapse to "Rare".
    expected = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
        }
    )

    # init params are stored unchanged
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == "Rare"
    assert encoder.variables is None
    # attributes learned during fit
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 3
    # transform output
    pd.testing.assert_frame_equal(X, expected)
Example no. 3
0
def test_variables_cast_as_category(df_enc_big):
    """Automatic variable selection also picks up columns of 'category' dtype."""
    encoder = RareLabelEncoder(
        tol=0.06, n_categories=5, variables=None, replace_with="Rare"
    )

    # Work on a copy so the shared fixture is not mutated.
    data = df_enc_big.copy()
    data["var_B"] = data["var_B"].astype("category")

    X = encoder.fit_transform(data)

    # Expected frame: low-frequency labels in every column collapse to "Rare".
    expected = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["C"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_C": ["A"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
        }
    )

    # attributes learned during fit
    assert encoder.variables_ == ["var_A", "var_B", "var_C"]
    assert encoder.n_features_in_ == 3
    # transform output
    pd.testing.assert_frame_equal(X, expected)
Example no. 4
0
def clean_data(X):
    """Clean, impute, encode and rebalance the raw training frame; return it
    with a 'target' column re-attached.

    NOTE(review): mutates the frame passed in (dropna/drop use inplace=True)
    and fits every transformer on the full data — acceptable for a one-off
    script, but not safe for separate train/test splits.
    """
    # Drop rows without a label, then split target from features.
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    # v22 is converted with the az_to_int helper (defined elsewhere in this
    # module) — presumably letter codes to integers; confirm against helper.
    X['v22'] = X['v22'].apply(az_to_int)

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Impute missing values, then encode categoricals: group rare labels
    # first so the frequency encoding does not learn tiny categories.
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # Cap outliers at the 0.5% quantiles on both tails.
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # Rebalance classes (minority:majority = 0.7) before feature selection.
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    # Drop quasi-constant features.
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features.
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    # Remove highly correlated features.
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    # fixed message typo: "correlared" -> "correlated"
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")

    # Re-attach the (resampled) target so the caller gets a single frame.
    X['target'] = y_resampled
    return X
Example no. 5
0
def test_max_n_categories(df_enc_big):
    """Case 6: user caps the number of retained categories via max_n_categories."""
    rare_encoder = RareLabelEncoder(tol=0.10, max_n_categories=4, n_categories=5)
    X = rare_encoder.fit_transform(df_enc_big)

    # Only the 4 most frequent labels per column survive; the rest -> "Rare".
    expected = pd.DataFrame(
        {
            "var_A": ["A"] * 6 + ["B"] * 10 + ["Rare"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_B": ["A"] * 10 + ["B"] * 6 + ["Rare"] * 4 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
            "var_C": ["Rare"] * 4 + ["B"] * 6 + ["C"] * 10 + ["D"] * 10
            + ["Rare"] * 4 + ["G"] * 6,
        }
    )
    pd.testing.assert_frame_equal(X, expected)
Example no. 6
0
def test_max_n_categories_with_numeric_var(df_enc_numeric):
    """max_n_categories on a numeric variable with ignore_format=True."""
    rare_encoder = RareLabelEncoder(tol=0.10,
                                    max_n_categories=2,
                                    n_categories=1,
                                    ignore_format=True)

    X = rare_encoder.fit_transform(df_enc_numeric[["var_A", "var_B"]])

    df = df_enc_numeric[["var_A", "var_B"]].copy()
    df.replace({3: "Rare"}, inplace=True)

    # Compare element-wise on string representations: the transform changes
    # cell dtypes (numbers become the string "Rare"), which made a strict
    # pd.testing.assert_frame_equal report identical-looking columns as
    # different.  (Previous version rebuilt list(X[col]) on every loop
    # iteration — O(n^2); zip iterates each column once.)
    assert len(X) == len(df)
    for col in ("var_A", "var_B"):
        for got, want in zip(X[col], df[col]):
            assert str(got) == str(want)
Example no. 7
0
    def transform(self, X, y=None):
        """Group rare labels in each column listed in ``self.categories``.

        Missing values are temporarily filled with the sentinel 'MISS' so the
        encoder can process them; when ``self.impute_missing_label`` is False
        they are restored to NaN afterwards.  Mutates and returns *X*.
        """
        # Silence pandas' chained-assignment warning: we overwrite column data
        # on purpose.  The try/finally guarantees the option is restored even
        # if fitting raises (previously an exception left it disabled
        # process-wide).
        pd.options.mode.chained_assignment = None
        try:
            for category in self.categories:
                col = X[category].copy()
                # Remember where the NaNs were so they can be restored later.
                idx_nan = col.loc[pd.isnull(col)].index

                # Replace missing values with a sentinel label.
                col[idx_nan] = 'MISS'
                encoder = RareLabelEncoder(tol=self.tol, n_categories=self.n_categories,
                                           max_n_categories=self.max_n_categories,
                                           replace_with=self.replace_with)

                col = col.to_frame(name=category)  # encoder expects a DataFrame
                col = encoder.fit_transform(col)
                X[category] = col
                if not self.impute_missing_label:
                    X[category].loc[idx_nan] = np.nan
        finally:
            pd.options.mode.chained_assignment = 'warn'  # restore default
        return X
Example no. 8
0
# To avoid the model ranking these categorical features ordinally, we apply
# rare-label + weight-of-evidence (WOE) encoding to C1..C12.
# This is for the Logistic Regression model only, as ordinality isn't a
# problem for tree-based models.

# Rare label encoding:
# - the threshold (tol) is set to 0.1
# - categories with proportion lower than 0.1 may not have any class-1 labels
#   due to the label imbalance, and this would impede the application of WOE
#   encoding (log 0 is undefined)

encoder = RareLabelEncoder(tol=0.1,
                           n_categories=2,
                           variables=[
                               'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
                               'C9', 'C10', 'C11', 'C12'
                           ],
                           replace_with='Rare')
train_enc = encoder.fit_transform(X_sm_c)

# WOE encoding over the same categorical columns; the target used to compute
# the weights is X_sm['newlabel'].
woe_encoder = WoEEncoder(variables=[
    'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12'
])
train_enc1 = woe_encoder.fit_transform(train_enc, X_sm['newlabel'])

train_enc1
"""# 3. Model Building

# Logistic Regression
"""

# Reassemble the training dataset.
# For categorical features, use the datasets after applying rare label + WOE encoding.