def train_models(ModelClass,
                 invoices,
                 observation_end_dates,
                 rfe=False,
                 **kwargs):
    """Fit one model per observation end date on a rolling training window.

    For every date the train/test split is obtained from
    ``get_train_test_data``.  The training parts seen so far are accumulated
    and the most recent ``lag`` of them are concatenated to form the training
    set for that date.  ``lag`` is not a parameter and is not assigned in this
    function, so it must be defined in the surrounding scope.

    The ``MostBoughtItem`` column is rare-label encoded and then mean
    encoded; both encoders are fitted on the training set only and applied
    to the training and the test set.

    Args:
        ModelClass: Called as ``ModelClass(**kwargs)`` to build each
            estimator; the result must provide ``fit(X, y)``.
        invoices: Passed through unchanged to ``get_train_test_data``.
        observation_end_dates: Iterable of dates; one model is trained per
            element, in iteration order.
        rfe: When true, ``RFE`` (keeping 8 features) chooses the columns
            and only those columns are used for the final fit and stored.
        **kwargs: Forwarded to every ``ModelClass`` construction.

    Returns:
        A dict with the keys ``models``, ``observation_end_dates``,
        ``X_train``, ``y_train``, ``X_test`` and ``y_test``.  Each list
        holds one entry per observation end date;
        ``observation_end_dates`` is the argument exactly as received.
    """
    history_X = []
    history_y = []
    train_results = {
        'models': [],
        'observation_end_dates': observation_end_dates,
        'X_train': [],
        'y_train': [],
        'X_test': [],
        'y_test': [],
    }

    for end_date in observation_end_dates:
        split_X, split_y, X_test, y_test = get_train_test_data(
            invoices, end_date)
        history_X.append(split_X)
        history_y.append(split_y)

        # Rolling window: only the last `lag` splits feed the training set
        # (the original author's note describes these as months).
        X_train = pd.concat(history_X[-lag:]).reset_index(drop=True)
        y_train = pd.concat(history_y[-lag:]).reset_index(drop=True)
        y_train = y_train.astype(int)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True).astype(int)

        # Encode "MostBoughtItem": group rare labels first, then mean-encode.
        # Training sets with fewer than 100 rows get the larger tolerance.
        if len(X_train) < 100:
            rare_tol = 0.02
        else:
            rare_tol = 0.01
        rare_encoder = RareLabelCategoricalEncoder(
            tol=rare_tol, variables=['MostBoughtItem']).fit(X_train)
        X_train = rare_encoder.transform(X_train)
        X_test = rare_encoder.transform(X_test)

        mean_enc = MeanCategoricalEncoder(
            variables=['MostBoughtItem']).fit(X_train, y_train)
        X_train = mean_enc.transform(X_train)
        X_test = mean_enc.transform(X_test)

        if not rfe:
            model = ModelClass(**kwargs)
            model.fit(X_train, y_train)
            train_results['X_train'].append(X_train)
            train_results['X_test'].append(X_test)
        else:
            # A throwaway estimator drives the elimination; a fresh one is
            # then fitted on the surviving columns only.
            selector = RFE(ModelClass(**kwargs), n_features_to_select=8)
            selector.fit(X_train, y_train)
            kept_columns = X_train.columns[selector.get_support()]
            model = ModelClass(**kwargs).fit(X_train[kept_columns], y_train)
            train_results['X_train'].append(X_train[kept_columns])
            train_results['X_test'].append(X_test[kept_columns])

        train_results['models'].append(model)
        train_results['y_train'].append(y_train)
        train_results['y_test'].append(y_test)

    return train_results
Ejemplo n.º 2
0
def test_RareLabelEncoder():
    """Check rare-label grouping of one column at two tolerance values.

    The input has 63 rows: A, B, C, D and I occur 10 times each, K occurs
    5 times and E, F, G, H occur twice each.
    """
    leading = ['A'] * 10 + ['B'] * 10 + ['C'] * 10 + ['D'] * 10
    infrequent = ['E'] * 2 + ['F'] * 2 + ['G'] * 2 + ['H'] * 2
    trailing = ['I'] * 10 + ['K'] * 5

    def build_frame(middle):
        # Fresh lists on every call so the frames never share data.
        return pd.DataFrame({
            'category': leading + middle + trailing,
            'target': [1] * 63,
        })

    # tol=0.05: the four labels with 2 of 63 rows are expected to be
    # replaced by 'Rare'; K (5 of 63 rows) is expected to survive.
    source = build_frame(infrequent)
    expected = build_frame(['Rare'] * 8)

    encoder = RareLabelCategoricalEncoder(tol=0.05,
                                          n_categories=9,
                                          variables=['category'])
    encoder.fit(source)
    result = encoder.transform(source)

    pd.testing.assert_frame_equal(result, expected)
    assert encoder.variables == ['category']
    assert encoder.input_shape_ == (63, 2)

    # tol=0.01: the frame is expected to come back unchanged.
    source = build_frame(infrequent)
    expected = build_frame(infrequent)

    encoder = RareLabelCategoricalEncoder(tol=0.01,
                                          n_categories=9,
                                          variables=['category'])
    encoder.fit(source)
    result = encoder.transform(source)

    pd.testing.assert_frame_equal(result, expected)
    assert encoder.variables == ['category']
    assert encoder.input_shape_ == (63, 2)
Ejemplo n.º 3
0
def test_RareLabelEncoder(dataframe_enc_big, dataframe_enc_big_na):
    """Exercise RareLabelCategoricalEncoder end to end.

    Covers grouping with automatically selected variables, grouping with a
    custom replacement label and explicit variable list, rejection of
    invalid constructor arguments, the warning raised for ``n_categories=10``
    and the errors raised when the data contains missing values.
    """

    def column(*runs):
        # Expand (label, count) pairs into a flat list of repeated labels.
        return [label for label, count in runs for _ in range(count)]

    # test case 1: default-style params, variables selected automatically
    encoder = RareLabelCategoricalEncoder(tol=0.06,
                                          n_categories=5,
                                          variables=None,
                                          replace_with='Rare')
    transformed = encoder.fit_transform(dataframe_enc_big)

    expected = pd.DataFrame({
        'var_A': column(('A', 6), ('B', 10), ('C', 4), ('D', 10),
                        ('Rare', 4), ('G', 6)),
        'var_B': column(('A', 10), ('B', 6), ('C', 4), ('D', 10),
                        ('Rare', 4), ('G', 6)),
        'var_C': column(('A', 4), ('B', 6), ('C', 10), ('D', 10),
                        ('Rare', 4), ('G', 6)),
    })

    # init params
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Rare'
    assert encoder.variables == ['var_A', 'var_B', 'var_C']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(transformed, expected)

    # test case 2: user provides alternative grouping value and variable list
    encoder = RareLabelCategoricalEncoder(tol=0.15,
                                          n_categories=5,
                                          variables=['var_A', 'var_B'],
                                          replace_with='Other')
    transformed = encoder.fit_transform(dataframe_enc_big)

    # var_C is not in the variable list, so it is expected back untouched.
    expected = pd.DataFrame({
        'var_A': column(('A', 6), ('B', 10), ('Other', 4), ('D', 10),
                        ('Other', 4), ('G', 6)),
        'var_B': column(('A', 10), ('B', 6), ('Other', 4), ('D', 10),
                        ('Other', 4), ('G', 6)),
        'var_C': column(('A', 4), ('B', 6), ('C', 10), ('D', 10), ('E', 2),
                        ('F', 2), ('G', 6)),
    })

    # init params
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Other'
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(transformed, expected)

    # invalid constructor arguments must be rejected with ValueError
    for invalid_kwargs in ({'tol': 5},
                           {'n_categories': 0.5},
                           {'replace_with': 0.5}):
        with pytest.raises(ValueError):
            RareLabelCategoricalEncoder(**invalid_kwargs)

    # test case 3: when the variable has low cardinality
    with pytest.warns(UserWarning):
        encoder = RareLabelCategoricalEncoder(n_categories=10)
        encoder.fit(dataframe_enc_big)

    # test case 4: when dataset contains na, fit method
    with pytest.raises(ValueError):
        encoder = RareLabelCategoricalEncoder(n_categories=4)
        encoder.fit(dataframe_enc_big_na)

    # test case 5: when dataset contains na, transform method
    with pytest.raises(ValueError):
        encoder = RareLabelCategoricalEncoder(n_categories=4)
        encoder.fit(dataframe_enc_big)
        encoder.transform(dataframe_enc_big_na)
Ejemplo n.º 4
0
used_car_price_pipeline = Pipeline([
    ("preprocessing", preprocessors.Preprocessing()),
    ("random_num_impute",
     ArbitraryNumberImputer(arbitrary_number=-9,
                            variables=['year', 'odometer'])),
    ("cylinder_impute",
     CategoricalVariableImputer(imputation_method='missing',
                                fill_value='-1',
                                variables=['cylinders'])),
    ("categorical_impute",
     CategoricalVariableImputer(imputation_method='missing',
                                fill_value='missing')),
    ("rare_label_manufacturer",
     RareLabelCategoricalEncoder(tol=0.01,
                                 variables='manufacturer',
                                 n_categories=5,
                                 replace_with='rare')),
    ("rare_label_cylinder",
     RareLabelCategoricalEncoder(tol=0.01,
                                 variables='cylinders',
                                 n_categories=5,
                                 replace_with='-1')),
    ("rare_label_condition",
     RareLabelCategoricalEncoder(tol=0.07,
                                 variables=['condition'],
                                 n_categories=3,
                                 replace_with='rare')),
    ('rare_label_type',
     RareLabelCategoricalEncoder(tol=0.04,
                                 variables=['type'],
                                 n_categories=3,
Ejemplo n.º 5
0
         variables=config.model_config.categorical_vars,
         transformer=SimpleImputer(strategy="constant", fill_value="missing"),
     ),
 ),
 (
     "temporal_variable",
     pp.TemporalVariableEstimator(
         variables=config.model_config.temporal_vars,
         reference_variable=config.model_config.drop_features,
     ),
 ),
 (
     "rare_label_encoder",
     RareLabelCategoricalEncoder(
         tol=config.model_config.rare_label_tol,
         n_categories=config.model_config.rare_label_n_categories,
         variables=config.model_config.categorical_vars,
     ),
 ),
 (
     "categorical_encoder",
     pp.SklearnTransformerWrapper(
         variables=config.model_config.categorical_vars,
         transformer=OrdinalEncoder(),
     ),
 ),
 (
     "drop_features",
     pp.DropUnecessaryFeatures(
         variables_to_drop=config.model_config.drop_features,
     ),
Ejemplo n.º 6
0
    
    X_test[variable] = np.where(X_test[variable].isin(
        frequent_cat), X_test[variable], 'Rare')

    return X_train, X_test

# Transforming: run rare_encoding once per listed column, feeding the
# returned train/test frames back in so the changes accumulate.
# NOTE(review): 0.05 is rare_encoding's fourth positional argument,
# presumably the rarity tolerance; confirm against its signature.
for variable in ['Neighborhood', 'Exterior1st', 'Exterior2nd']:
    X_train, X_test = rare_encoding(X_train, X_test, variable, 0.05)


# With Feature-Engine
# Rare value encoder
# Only constructed here; no fit/transform call follows in this snippet.
rare_encoder = RareLabelCategoricalEncoder(
    tol=0.05,  # minimal percentage to be considered non-rare
    n_categories=4, # minimal number of categories the variable should have to re-group rare categories
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd',
               'MasVnrType', 'ExterQual', 'BsmtCond'] # variables to re-group
)  




# ---- OUTLIERS
# Let's make boxplots to visualise outliers in the continuous variables
# and histograms to get an idea of the distribution.
# NOTE(review): only the boxplot panel (subplot 1 of a 1x2 grid) is drawn
# below; the histogram panel is not present in this snippet.

for var in numerical:
    # One new 6x4 figure per variable in `numerical`.
    plt.figure(figsize=(6,4))
    plt.subplot(1, 2, 1)
    fig = data.boxplot(column=var)
    # Set an empty title on the object returned by boxplot.
    fig.set_title('')


## Remove category 'l' from variable A4 and category 'gg' from variable A5.
## A single pass is sufficient: applying the same two filters again would
## not drop any further rows.
read_data = read_data[read_data.A4 != 'l']
read_data = read_data[read_data.A5 != 'gg']

## Encode rare categories in the dataset.
## The encoder is fitted on read_data, and read_data is then replaced by the
## transformed frame.
rare_encoder = RareLabelCategoricalEncoder(
    tol=0.05,  # minimal percentage to be considered non-rare
    n_categories=2, # minimal number of categories the variable should have to re-group rare categories
    variables=['A1','A4','A5','A6','A7','A9','A10','A12'] # variables to re-group
) 
rare_encoder.fit(read_data)
# Bare expression: has no effect when run as a script; presumably left over
# from inspecting the value interactively.
rare_encoder.variables
read_data = rare_encoder.transform(read_data)


# A13 is handled by a separate encoder with tol=0.1 instead of 0.05. It is
# fitted on read_data as already transformed by rare_encoder above.
rare_encoder_A13 = RareLabelCategoricalEncoder(
    tol=0.1,  # minimal percentage to be considered non-rare
    n_categories=2, # minimal number of categories the variable should have to re-group rare categories
    variables=['A13'] # variables to re-group
) 
rare_encoder_A13.fit(read_data)
# Bare expression: has no effect when run as a script (see note above).
rare_encoder_A13.variables
read_data = rare_encoder_A13.transform(read_data)