예제 #1
0
    def _fit_catboost(self, df, y, target, parameter):
        """Fit a CatBoost encoder on the ``target`` columns of ``df`` and record it.

        The fitted encoder, the generated output column names, and the source
        columns are appended to ``self.trans_ls`` so the transform can be
        replayed later.  ``parameter`` is accepted for interface parity.
        """
        encoder = ce.CatBoostEncoder()
        encoder.fit(df[target].map(to_str), df[y])
        out_names = [
            'continuous_' + remove_continuous_discrete_prefix(feat) + '_catboost'
            for feat in encoder.get_feature_names()
        ]
        self.trans_ls.append(('catboost', out_names, target, encoder))
 def test_catBoost_reference2(self):
     # The reference is from:
     #   https://www.youtube.com/watch?v=hqYQ8Yj9vB0
     # time:
     #     35:03
     # as obtained on 21 Aug 2019.
     # Note: they have an error at line [smooth 6 4.3 4.1]. It should be [smooth 6 4 4.1 3.9]
     X = pd.DataFrame({
         'col1': [
             'fuzzy', 'soft', 'smooth', 'fuzzy', 'smooth', 'soft', 'smooth',
             'smooth'
         ]
     })
     y = pd.Series([4, 1, 4, 3, 6, 0, 7, 5])
     enc = encoders.CatBoostEncoder()
     obtained = enc.fit_transform(X, y)
     # CatBoost encodes each row from the running target sum of the
     # preceding rows of the same category, smoothed with the sample
     # prior mean(y) = 30/8.
     prior = 30. / 8
     expected = [
         prior, prior, prior, (4 + prior) / 2, (4 + prior) / 2,
         (1 + prior) / 2, (10 + prior) / 3, (17 + prior) / 4
     ]
     # Fix: removed the stray debug print() that merely duplicated the
     # expected list on stdout after the assertion.
     self.assertEqual(list(obtained['col1']), expected)
def catboost_encoder(X_train, Y_train, X_val, Y_val, target_col: str, cat_features=None, features=None):
    """CatBoost target-encode categorical columns of train/validation data.

    A ``CatBoostEncoder`` is fit on the training split only (fitting on
    validation or test data would leak the target) and the encoded columns
    are appended with a ``_cb`` suffix.
    https://www.kaggle.com/matleonard/categorical-encodings

    Returns the encoded train frame, the encoded validation frame (both
    without the target column) and the resulting feature-name list.
    """
    train_df = pd.DataFrame(X_train, columns=features).join(
        pd.DataFrame(Y_train, columns=[target_col]))
    valid_df = pd.DataFrame(X_val, columns=features).join(
        pd.DataFrame(Y_val, columns=[target_col]))

    cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7)

    # Fit on the training split only to avoid target leakage.
    cb_enc.fit(train_df[cat_features], train_df[target_col])

    train_encoded = train_df.join(
        cb_enc.transform(train_df[cat_features]).add_suffix("_cb"))
    valid_encoded = valid_df.join(
        cb_enc.transform(valid_df[cat_features]).add_suffix("_cb"))

    feature_names = train_encoded.drop(target_col, axis=1).columns.to_list()

    return (train_encoded.drop(target_col, axis=1),
            valid_encoded.drop(target_col, axis=1),
            feature_names)
예제 #4
0
def impute_column(data, column, target_col='diabetes_mellitus'):
    # Impute missing values of ``column`` by training a LightGBM regressor on
    # the rows where the column is present and predicting the missing rows.
    # ``target_col`` is dropped up front so the imputer never sees the label.
    class WidsColumnImputerCVTrainer(CVTrainer):

        # Return 'Score' - MSE between 'ytrue' and 'ypred'
        def score(self, typ: str):
            ypred, ytrue = self.predict(typ).align(self.ds.labeled.y,
                                                   join='right')
            return mean_squared_error(ytrue, ypred)

        # Dividing Data into 'X' and 'y'

    dt = data.drop(columns=[target_col]).copy()
    # Rows with a known value act as the train split; missing rows are the
    # "test" split that will be predicted.
    dt[f'is_train_{column}'] = (~dt[column].isnull()).astype(int).values
    X = dt.drop(columns=[column])
    y = dt[column].values

    # Defining the 'Categorial' columns (of types 'object' & 'category') & Encoder type (CatBoostEncoder)

    cat_columns = dt.select_dtypes(['object', 'category']).columns.tolist()
    encoder = ce.CatBoostEncoder(cols=cat_columns)
    encoder.fit(X, y)
    dt = encoder.transform(X)
    dt[column] = y

    # Data Block & Data Manager

    data_block = WidsDataBlock(dt)

    ds = create_data_manager(data_block=data_block,
                             cv_column='cv_fold',
                             train_split_column=f'is_train_{column}',
                             label_columns=[column],
                             cv_object=KFold(n_splits=5,
                                             shuffle=True,
                                             random_state=42))

    # Defining Lgbm (light Gradient Boosting Machine) - Params & Trainer Model
    lgbm_params = dict(init_params=dict(metric='rmse',
                                        max_depth=-1,
                                        num_leaves=31,
                                        min_data_per_group=10,
                                        learning_rate=0.1),
                       fit_params=dict(verbose=-1, early_stopping_rounds=100))

    lgbm_trainer = WidsColumnImputerCVTrainer(
        fold_trainer_cls=LightGBMRegressorFoldTrainer,
        ds=ds,
        model_name=f'{column}_impute',
        params=lgbm_params,
        save_path=config.outputs_path)
    lgbm_trainer.fit()

    # Fill the original column: predictions for the missing rows (averaged
    # over duplicate index entries via groupby(level=0).mean()) are added to
    # the existing values, with NaNs on either side treated as 0.
    dt = (data.join(
        lgbm_trainer.predict('tst').iloc[:, 0].to_frame(f'tmp_{column}').
        groupby(level=0).mean()).assign(
            **{
                column:
                lambda dx: dx[column].fillna(0) + dx[f'tmp_{column}'].fillna(0)
            }).drop(columns=[f'tmp_{column}']))
    return dt
def get_catboost_encoder(df, cols, target):
    """Append a CatBoost-encoded version of ``cols`` (against ``target``) to ``df``."""
    encoder = ce.CatBoostEncoder(cols=cols, random_state=42)
    encoded = encoder.fit_transform(X=df[cols], y=df[target])
    # NOTE(review): assigning a DataFrame to a single column only works when
    # ``cols`` selects exactly one column — confirm callers pass one column.
    df[f'catboost_encode_{cols}_{target}'] = encoded
    return df
def kFold_encoder_features(train, test, features, seed_seed=2019):
    # Out-of-fold CatBoost target encoding of ``features``:
    #  - ``test`` is encoded with an encoder fit on the full train set;
    #  - ``train`` is encoded fold-by-fold (fit on the other folds) to limit
    #    target leakage.
    # Returns train and test concatenated with a fresh index.
    print("k-fold ce encoder ...")
    train = train.copy()
    test = test.copy()
    kfold = StratifiedKFold(n_splits=5, random_state=seed_seed, shuffle=True)
    encoder = ce.CatBoostEncoder(cols=features)
    # encoder = ce.WOEEncoder(cols=[feat])
    # category_encoders fills NaN with the target mean by default
    train[features] = train[features].astype(str)
    test[features] = test[features].astype(str)
    # test: encode with statistics from the whole training set
    encoder.fit(train, train['label'])
    test = encoder.transform(test)
    test[features] = test[features].astype(np.float32)
    # train: out-of-fold encoding, written into a working copy
    feat_encoder = train.copy()

    for n_fold, (train_idx,
                 valid_idx) in enumerate(kfold.split(train, train['label'])):
        print('processing fold: ', n_fold + 1)
        # Refit on this fold's train part only, then encode its held-out part.
        encoder.fit(train.loc[train_idx], train.loc[train_idx, 'label'])

        v_df = encoder.transform(train.loc[valid_idx])  # DataFrame

        feat_encoder.loc[valid_idx, features] = v_df[features].values

    train[features] = feat_encoder[features].astype(np.float32)
    print('ce encoder done.')

    return pd.concat([train, test], ignore_index=True)
예제 #7
0
def dataset_numerical_train():
    """Convert the raw training CSV to an all-numeric frame and persist it.

    Categorical columns are CatBoost target-encoded (suffix ``_process``),
    the raw categorical columns and ``id`` are dropped, ``FIELD_7`` is
    reduced to an element count, and 'None' strings become NaN.
    """
    df = pd.read_csv('./csv/dataset.csv', low_memory=False)

    cat_features = [
        'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10',
        'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19', 'FIELD_20',
        'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25', 'FIELD_26', 'FIELD_27',
        'FIELD_28', 'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_35', 'FIELD_36',
        'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42',
        'FIELD_43', 'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49'
    ]

    drop_cols = cat_features + ['id']
    encoder = ce.CatBoostEncoder(cols=cat_features)
    encoder.fit(df[cat_features], df['label'])

    df = df.join(encoder.transform(df[cat_features]).add_suffix('_process'))
    df = df.drop(columns=drop_cols)

    # FIELD_7 holds a stringified list (NaN for missing, detected via the
    # x != x NaN trick); substitute '[]', parse it, and keep the length only.
    parsed = df['FIELD_7'].apply(lambda v: '[]'
                                 if v != v else v).apply(literal_eval)
    df['FIELD_7'] = parsed.apply(len)

    df = df.replace(to_replace='None', value=np.nan)

    df.to_csv('./csv/preprocess/numerical_train.csv')

    return True
예제 #8
0
def fit(X, y, output_dir, **kwargs):
    """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit CatBoost encoder], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [a pickle file containing a pre-fit CatBoost Encoder] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Transform categorical columns into a numeric representation using
    # CatBoost target encoding (every column of X is encoded).
    encoder_catboost = ce.CatBoostEncoder(cols=X.columns)
    encoder_catboost.fit(X, y)

    # dump the trained object
    # into an artifact [in this example - catboost.pkl]
    # and save it into output_dir so that it can be used later to impute on new data
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open("{}/catboost.pkl".format(output_dir), "wb") as fp:
            pickle.dump(encoder_catboost, fp)
예제 #9
0
def Train_test_random_forest(hdf, ord_feat_num, ord_feat_cat, nom_feat, cont_feat):
    """Split ``hdf`` into encoded train/test sets for a house-price model.

    Rows with a known 'SalePrice' form the train set (target is log1p of the
    price); rows with a missing 'SalePrice' form the test set.  Ordinal
    features are ordinal-encoded and nominal features CatBoost-encoded, both
    fit on the training data only.

    Bug fix: after outlier rows are dropped from ``X_train``, ``y_train`` is
    now realigned to the surviving index — previously the two stayed
    misaligned, which breaks the encoder fits below.
    """
    ord_feat = ord_feat_cat.union(ord_feat_num)
    X_train = hdf.loc[~hdf['SalePrice'].isnull(), :]

    y_train = np.log1p(X_train.loc[~X_train['SalePrice'].isnull(), 'SalePrice'])

    X_train.drop(columns = ['SalePrice'], inplace = True)

    # Remove extreme outliers on living area, lot size and lot frontage.
    X_train.drop((X_train.loc[X_train['GrLivArea']>4400, :]).index, inplace = True)

    X_train.drop((X_train.loc[X_train['LotArea']>100000, :]).index, inplace = True)

    X_train.drop((X_train.loc[X_train['LotFrontage']>250, :]).index, inplace = True)

    # Keep the target aligned with the rows that survived the outlier drops.
    y_train = y_train.loc[X_train.index]

    X_test = hdf.loc[hdf['SalePrice'].isnull(), :].drop(columns = ['SalePrice'])

    ord_enc = ce.OrdinalEncoder(cols=ord_feat).fit(X_train,y_train)
    X_train = ord_enc.transform(X_train)

    X_test = ord_enc.transform(X_test)

    # Shuffle train rows (and target in lockstep) before the CatBoost fit,
    # since CatBoost encoding is order-dependent.
    perm = np.random.permutation(len(X_train))
    X_train = X_train.iloc[perm].reset_index(drop=True)
    y_train = y_train.iloc[perm].reset_index(drop=True)

    nom_enc = ce.CatBoostEncoder(cols=nom_feat).fit(X_train,y_train)
    X_train = nom_enc.transform(X_train)

    X_test = nom_enc.transform(X_test)

    X_train.drop(columns = ['Id'], inplace = True)
    return X_train, y_train, X_test
예제 #10
0
    def catboost_encoder(train_df,
                         valid_df,
                         target_col: str,
                         cat_features=None):
        """CatBoost target-encode the categorical columns of train/valid frames.

        Fits a ``CatBoostEncoder`` on ``train_df`` only (fitting on the
        validation set would leak the target) and appends the encoded columns
        to both frames with a ``_cb`` suffix.
        https://www.kaggle.com/matleonard/categorical-encodings

        Returns the encoded train and validation frames.
        """
        # conda install -c conda-forge category_encoders
        import category_encoders as ce

        if cat_features is None:
            # Default: every object/category/bool column of the train frame.
            cat_features = train_df.select_dtypes(
                include=["object", "category", "bool"]).columns.to_list()

        encoder = ce.CatBoostEncoder(cols=cat_features, random_state=7)

        # Fit on the training split only to avoid target leakage.
        encoder.fit(train_df[cat_features], train_df[target_col])

        encoded_train = train_df.join(
            encoder.transform(train_df[cat_features]).add_suffix("_cb"))
        encoded_valid = valid_df.join(
            encoder.transform(valid_df[cat_features]).add_suffix("_cb"))
        return encoded_train, encoded_valid
예제 #11
0
def catboost_enc(df_norm):
    """CatBoost-encode the categorical columns of ``df_norm`` against 'precio'.

    Encoded columns are appended with a ``_cb`` suffix and the raw location /
    property-type columns are dropped.  Returns the new frame and the fitted
    encoder.
    """
    # NOTE(review): ``cat_features`` comes from enclosing/module scope — TODO
    # confirm it is defined before this function is called.
    encoder = ce.CatBoostEncoder(cols=cat_features)
    encoder.fit(df_norm[cat_features], df_norm['precio'])
    encoded = df_norm.join(
        encoder.transform(df_norm[cat_features]).add_suffix('_cb'))
    encoded = encoded.drop(['tipodepropiedad', 'provincia', 'ciudad'], axis=1)
    return encoded, encoder
예제 #12
0
def target_encoder_catboost(df, train_df, cols, target):
    """Append CatBoost target-encoded versions of ``cols`` to ``df``.

    The encoder is fit on ``train_df`` only; encoded columns are renamed to
    ``<col>_targetenc_ce_cbe``.
    """
    encoder = ce.CatBoostEncoder(cols=cols, random_state=42)
    encoder.fit(X=train_df[cols], y=train_df[target])
    encoded = encoder.transform(df[cols]).rename(
        columns={col: f'{col}_targetenc_ce_cbe' for col in cols})
    return pd.concat([df, encoded], axis=1)
    def preprocess(self, df, train=True):
        '''
        Process data for training model

        Drops columns with any missing values and the 'Id' column, splits the
        remainder into numeric and categorical features, then standard-scales
        the numeric features and CatBoost-encodes the categorical ones.  On
        the training pass the scaler/encoder and feature lists are stored on
        ``self`` for reuse; on the inference pass the stored transformers are
        applied and remaining NaNs are mean-filled.

        :param df: pandas Dataframe
               train: boolean
        :return processed pandas dataframe and pd.Series with target
        '''
        print('Creating dataframe for data manipulation')
        # One row per input column with its missing-value percentage and dtype.
        cons = pd.DataFrame({
            'column': df.columns,
            'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
            'dtype': df.dtypes
        })
        print('Droping columns with missing values')
        cons = cons[cons['missing_perc'] == 0]
        print('Dropping column with id')
        cons = cons[cons['column'] != 'Id']
        print('Creating list with numeric features')
        numeric_features = list(
            cons[(cons['dtype'] == 'int64') | (cons['dtype'] == 'float') |
                 (cons['dtype'] == 'bool')]['column'])
        print('Creating list with categorical features')
        categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])
        # NOTE(review): also assigned again inside the train branch below.
        self.categoric_features = categoric_features
        print('removing target')
        if train == True:
            # The target 'y' is numeric, so it must not stay in the feature list.
            numeric_features.remove('y')
        else:
            pass
        print(self.categoric_features)
        print('feature encoder')
        print('feature normalization and encoding')
        std_scaler = StandardScaler()
        if train == True:
            y = df['y']
            df = df.drop(columns={'y'})
            # Persist feature lists and transformers for the inference pass.
            self.numeric_features = numeric_features
            self.categoric_features = categoric_features
            self.feature_names = self.numeric_features + self.categoric_features
            self.scaler = std_scaler
            self.catb = ce.CatBoostEncoder(cols=self.categoric_features)
            df[self.numeric_features] = self.scaler.fit_transform(
                df[self.numeric_features])
            df[self.categoric_features] = self.catb.fit_transform(
                df[self.categoric_features], y=y)
            self.train_features = self.numeric_features + self.categoric_features
            return df[self.categoric_features + self.numeric_features], y
        else:
            # Inference: reuse the scaler/encoder fitted during training.
            df[self.numeric_features] = self.scaler.transform(
                df[self.numeric_features])
            df[self.categoric_features] = self.catb.transform(
                df[self.categoric_features])
            # Mean-fill any NaNs the transforms produced or left behind.
            for column in df[self.categoric_features +
                             self.numeric_features].columns:
                df[column] = df[column].fillna(df[column].mean())
            return df[self.categoric_features + self.numeric_features]
예제 #14
0
def catboost_multiple(df, cols):
    """CatBoost-encode ``cols`` of ``df`` against the 'set_clicked' target.

    Bug fix: ``cols`` was previously passed positionally, which bound it to
    CatBoostEncoder's first parameter (``verbose``) and left ``cols=None``,
    so every column was encoded instead of the requested ones.  It is now
    passed by keyword.  Returns the transformed frame.
    """
    encoder = ce.CatBoostEncoder(cols=cols,
                                 return_df=True,
                                 drop_invariant=True,
                                 handle_missing='return_nan',
                                 sigma=None,
                                 a=2)
    encoder.fit(X=df, y=df['set_clicked'])
    df = encoder.transform(df)
    return df
예제 #15
0
    def test_catBoost(self):
        # Train-time: each row is encoded from the target values of the rows
        # above it only, smoothed with the prior mean(y) = 3/5 = 0.6.
        X = pd.DataFrame({'col1':['A', 'B', 'B', 'C', 'A']})
        y = pd.Series([1, 0, 1, 0, 1])
        enc = encoders.CatBoostEncoder()
        obtained = enc.fit_transform(X, y)
        self.assertEqual(list(obtained['col1']), [0.6, 0.6, 0.6/2, 0.6, 1.6/2], 'The nominator is incremented by the prior. The denominator by 1.')

        # Transform-time: statistics computed over all training rows are used.
        X_t = pd.DataFrame({'col1': ['B', 'B', 'A']})
        obtained = enc.transform(X_t)
        self.assertEqual(list(obtained['col1']), [1.6/3, 1.6/3, 2.6/3])
예제 #16
0
    def test_catBoost_missing(self):
        # With handle_missing='value', None behaves like any other category;
        # the prior here is mean(y) = 4/8 = 0.5.
        X = pd.DataFrame({'col1':['A', 'B', 'B', 'C', 'A', None, None, None]})
        y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0])
        enc = encoders.CatBoostEncoder(handle_missing='value')
        obtained = enc.fit_transform(X, y)
        self.assertEqual(list(obtained['col1']), [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3], 'We treat None as another category.')

        # Transform-time: full-training-set statistics, including for None.
        X_t = pd.DataFrame({'col1': ['B', 'B', 'A', None]})
        obtained = enc.transform(X_t)
        self.assertEqual(list(obtained['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4])
예제 #17
0
def categoryEncoder(dt, col):
    """Append a CatBoost target-encoded copy of ``col`` as ``<col>_ctr``.

    The encoder is fit only on rows where 'target' is present; all rows
    (including those with a missing target) are then transformed with the
    fitted statistics.
    """
    subset = dt[[col, 'target']]
    labelled = subset[subset['target'].isna() == False]
    encoder = ce.CatBoostEncoder(cols=col)
    encoder.fit(labelled, labelled['target'])
    transformed = encoder.transform(subset)
    dt[col + '_ctr'] = transformed[col]
    # Free the working frames eagerly; they can be large.
    del labelled, transformed
    gc.collect()
    return dt
예제 #18
0
 def process(self, df, etapa_treino=True):
     '''
     Process data for training the model.

     Drops columns with any missing values and the 'Id' column, splits the
     remainder into numeric and categoric features, standard-scales the
     numeric features and CatBoost-encodes the categoric ones.  On the
     training pass (etapa_treino=True) the transformers and feature lists
     are stored on ``self``; otherwise the stored transformers are applied
     and remaining NaNs are mean-filled.

     :param df: Pandas DataFrame
     :param etapa_treino: Boolean
     :return: processed Pandas Data Frame
     '''
     print('Creating DataFrame for Data Manipulation')
     # One row per input column with its missing-value percentage and dtype.
     cons = pd.DataFrame({
         'column': df.columns,
         'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
         'dtype': df.dtypes
     })
     print('Droping columns with missing values')
     cons = cons[cons['missing_perc'] == 0]
     print('Dropping column with Id')
     cons = cons[cons['column'] != 'Id']
     print('Creating list with numeric features')
     numeric_features = list(cons[(cons['dtype'] == 'int64') |
                                  (cons['dtype'] == 'float')]['column'])
     print('Creating list with categoric features')
     categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])
     print('Removing target')
     if etapa_treino == True:
         # The target 'SalePrice' is numeric and must leave the feature list.
         numeric_features.remove('SalePrice')
     else:
         pass
     print('Feature encoder')
     print('Feature Normalization and Encoding')
     std_scaler = StandardScaler()
     if etapa_treino == True:
         y = df['SalePrice']
         df = df.drop(columns={'SalePrice'})
         # Persist feature lists and transformers for the inference pass.
         self.numeric_features = numeric_features
         self.categoric_features = categoric_features
         self.feature_names = self.numeric_features + self.categoric_features
         self.scaler = std_scaler
         self.catb = ce.CatBoostEncoder(cols=self.categoric_features)
         df[self.numeric_features] = self.scaler.fit_transform(
             df[self.numeric_features])
         df[self.categoric_features] = self.catb.fit_transform(
             df[self.categoric_features], y=y)
         self.train_features = self.numeric_features + self.categoric_features
         return df[self.categoric_features + self.numeric_features], y
     else:
         # Inference: reuse the scaler/encoder fitted during training.
         df[self.numeric_features] = self.scaler.transform(
             df[self.numeric_features])
         df[self.categoric_features] = self.catb.transform(
             df[self.categoric_features])
         # Mean-fill any NaNs left after the transforms.
         for column in df[self.categoric_features +
                          self.numeric_features].columns:
             df[column] = df[column].fillna(df[column].mean())
         return df[self.categoric_features + self.numeric_features]
예제 #19
0
    def __init__(self, encoder_type, columns_name=None):
        """Build a category_encoders encoder chosen by name.

        :param encoder_type: key selecting the encoder class (see mapping below)
        :param columns_name: list of feature/column names to encode
        :raises ValueError: if ``encoder_type`` is not a known key
        """
        # Name -> encoder-class dispatch table (replaces a long if/elif chain).
        encoder_classes = {
            "BackwardDe": ce.BackwardDifferenceEncoder,  # backward difference coding
            "BaseN": ce.BaseNEncoder,  # base-N coding
            "Binary": ce.BinaryEncoder,  # binary coding
            "Catboost": ce.CatBoostEncoder,
            "Hash": ce.HashingEncoder,
            "Helmert": ce.HelmertEncoder,
            "JamesStein": ce.JamesSteinEncoder,
            "LOO": ce.LeaveOneOutEncoder,  # leave-one-out coding
            "ME": ce.MEstimateEncoder,  # M-estimate encoder
            "OneHot": ce.OneHotEncoder,
            "OridinalEncoder": ce.OrdinalEncoder,  # (sic) ordinal coding; key kept for compatibility
            "Sum": ce.SumEncoder,  # sum coding
            "Polynomial": ce.PolynomialEncoder,  # polynomial coding
            "Target": ce.TargetEncoder,  # target coding
            "WOE": ce.WOEEncoder,  # weight-of-evidence coding
        }

        if encoder_type not in encoder_classes:
            raise ValueError("请选择正确的编码方式")

        self.encoder = encoder_classes[encoder_type](cols=columns_name)
예제 #20
0
def preprocessing(train, test):
    # End-to-end preprocessing: CatBoost-encode the categorical columns
    # (fit on train only), drop the raw categoricals, mean-impute, remove
    # outlier rows from train, separate the label, and standard-scale both
    # frames with a scaler fit on train.
    cat_features = [
        'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10',
        'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19', 'FIELD_20',
        'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25', 'FIELD_26', 'FIELD_27',
        'FIELD_28', 'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_35', 'FIELD_36',
        'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42',
        'FIELD_43', 'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49'
    ]
    # End catBoostEncoder We must remove columns and label from training data and test data

    cat_features_remove = cat_features
    target_enc = ce.CatBoostEncoder(cols=cat_features)

    # Fit on train only; applying the same encoder to test avoids leakage.
    target_enc.fit(train[cat_features], train['label'])

    train = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_process'))
    test = test.join(
        target_enc.transform(test[cat_features]).add_suffix('_process'))

    train = train.drop(columns=cat_features_remove)
    test = test.drop(columns=cat_features_remove)

    # The literal string 'None' marks missing values in the raw data.
    train = train.replace(to_replace='None', value=np.nan)
    test = test.replace(to_replace='None', value=np.nan)

    my_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

    # Fit the mean imputer on train and apply it to both frames.
    convert_tool = my_imputer.fit(train)

    train = pd.DataFrame(convert_tool.transform(train), columns=train.columns)
    test = pd.DataFrame(convert_tool.transform(test), columns=test.columns)

    special_column = "FIELD_55".split(" ")
    index_outlier_data = detection_outlier(train, special_column).astype(int)

    train = train.drop(index_outlier_data, axis=0).reset_index(drop=True)

    train_label = train['label']
    # print(train[index_outlier_data.astype(int)])
    train = train.drop(columns=['label'])
    test = test.drop(columns=['label'])
    print(len(index_outlier_data) / len(train))
    # Scale with statistics from train only.
    scaler = StandardScaler()
    scaler.fit(train)
    train = pd.DataFrame(scaler.transform(train), columns=train.columns)
    test = pd.DataFrame(scaler.transform(test), columns=test.columns)

    return train, train_label, test
예제 #21
0
    def test_catBoost(self):
        # Train-time encoding: every row only sees the target values of the
        # rows above it, smoothed with the prior mean(y) = 0.6.
        X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A']})
        y = pd.Series([1, 0, 1, 0, 1])
        enc = encoders.CatBoostEncoder()
        obtained = enc.fit_transform(X, y)
        expected_train = [0.6, 0.6, 0.6 / 2, 0.6, 1.6 / 2]
        self.assertEqual(
            list(obtained['col1']), expected_train,
            'The nominator is incremented by the prior. The denominator by 1.')

        # For testing set, use statistics calculated on all the training data.
        # See: CatBoost: unbiased boosting with categorical features, page 4.
        X_t = pd.DataFrame({'col1': ['B', 'B', 'A']})
        expected_test = [1.6 / 3, 1.6 / 3, 2.6 / 3]
        self.assertEqual(list(enc.transform(X_t)['col1']), expected_test)
예제 #22
0
파일: ex2.py 프로젝트: zsyed-gg/learntools
def cb_encodings_solution():
    """CatBoost-encode the click-data categoricals, fit on the train split only.

    Returns the train and validation frames with the encoded columns appended
    under a ``_cb`` suffix.
    """
    cat_features = ['app', 'device', 'os', 'channel']
    encoder = ce.CatBoostEncoder(cols=cat_features, random_state=7)

    train, valid, _ = get_data_splits()

    # Learn the encoding from the training set only (no leakage).
    encoder.fit(train[cat_features], train["is_attributed"])

    # Apply the fitted encoding to both splits.
    encoded_train = train.join(
        encoder.transform(train[cat_features]).add_suffix('_cb'))
    encoded_valid = valid.join(
        encoder.transform(valid[cat_features]).add_suffix('_cb'))

    return encoded_train, encoded_valid
    def categoricalEncoding(self):
        '''
        Numerically encode categorical data with a CatBoost encoder that is
        fit on the training split and applied to both train and test.

        Returns
        -------
        None.

        '''
        encoder = ce.CatBoostEncoder(drop_invariant=True,
                                     return_df=True,
                                     random_state=2020)
        encoder.fit(self.trainX, self.trainY)
        self.trainX = encoder.transform(self.trainX)
        self.testX = encoder.transform(self.testX)
    def categorical_encoding(self):
        '''
        Encode X_train and X_test categorical features with a CatBoost
        encoder fit on the training data only.

        Returns
        -------
        None.

        '''
        print('Encoding...')
        encoder = ce.CatBoostEncoder(drop_invariant=True,
                                     return_df=True,
                                     random_state=2020)
        encoder.fit(self.X_train, self.y_train)
        # Transform both splits with the train-fitted statistics.
        self.X_train = encoder.transform(self.X_train)
        self.X_test = encoder.transform(self.X_test)
예제 #25
0
    def treat_categorical_features(self, categorical_features, categorical_method, X_train, y_train, X_test):
        """Fill missing categorical values and CatBoost-encode train/test frames.

        Returns (X_train, X_test) unchanged when there are no categorical
        features or no method was selected.
        """
        # Guard clause: nothing to encode.
        if categorical_features is None or categorical_method is None:
            return X_train, X_test

        # Replace NaNs with an explicit sentinel category before encoding.
        X_train[categorical_features] = X_train[categorical_features].fillna('MISSING')
        X_test[categorical_features] = X_test[categorical_features].fillna('MISSING')

        encoder = ce.CatBoostEncoder(cols=categorical_features,
                                     drop_invariant=False)
        encoder.fit(X_train, y_train)

        return encoder.transform(X_train), encoder.transform(X_test)
예제 #26
0
def non_to_num(option=None):
    # Convert the raw train/test CSVs to all-numeric frames and persist them.
    # ``option == 2`` selects the real test set; anything else reuses the
    # full dataset as "test".
    train = pd.read_csv('./csv/base/train.csv', low_memory=False)
    if option == 2:
        test = pd.read_csv('./csv/testset.csv', low_memory=False)
    else:
        test = pd.read_csv('./csv/dataset.csv', low_memory=False)

    cat_features = [
        'province', 'district', 'maCv', 'FIELD_8', 'FIELD_9', 'FIELD_10',
        'FIELD_12', 'FIELD_13', 'FIELD_17', 'FIELD_18', 'FIELD_19', 'FIELD_20',
        'FIELD_22', 'FIELD_23', 'FIELD_24', 'FIELD_25', 'FIELD_26', 'FIELD_27',
        'FIELD_28', 'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_35', 'FIELD_36',
        'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_40', 'FIELD_41', 'FIELD_42',
        'FIELD_43', 'FIELD_44', 'FIELD_47', 'FIELD_48', 'FIELD_49'
    ]
    # End catBoostEncoder We must remove columns and label from training data and test data

    cat_features_remove = cat_features + ['id']
    target_enc = ce.CatBoostEncoder(cols=cat_features)

    # Fit on train only; the same encoder is applied to test.
    target_enc.fit(train[cat_features], train['label'])

    train = train.join(
        target_enc.transform(train[cat_features]).add_suffix('_process'))
    test = test.join(
        target_enc.transform(test[cat_features]).add_suffix('_process'))

    # The real test set may not carry an 'id' column.
    if 'id' in test.columns:
        test = test.drop(columns=cat_features_remove)
    else:
        test = test.drop(columns=cat_features)
    train = train.drop(columns=cat_features_remove)

    # FIELD_7 holds a stringified list (NaN detected via x != x); substitute
    # '[]', parse it, and keep only the element count.
    f7_array = train['FIELD_7'].apply(lambda x: '[]'
                                      if x != x else x).apply(literal_eval)
    train['FIELD_7'] = f7_array.apply(len)

    f7_array = test['FIELD_7'].apply(lambda x: '[]'
                                     if x != x else x).apply(literal_eval)
    test['FIELD_7'] = f7_array.apply(len)

    # The literal string 'None' marks missing values in the raw data.
    train = train.replace(to_replace='None', value=np.nan)
    test = test.replace(to_replace='None', value=np.nan)
    test.to_csv('./csv/preprocess/numerical_test.csv')
    train.to_csv('./csv/preprocess/numerical_train.csv')
    return True
예제 #27
0
 def test_catBoost_reference(self):
     # The reference is from:
     #   https://catboost.ai/docs/concepts/algorithm-main-stages_cat-to-numberic.html
     # paragraph:
     #     Transforming categorical features to numerical features in classification
     # as obtained on 17 Aug 2019.
     X = pd.DataFrame({
         'col1': ['rock', 'indie', 'rock', 'rock', 'pop', 'indie', 'rock']
     })
     y = pd.Series([0, 0, 1, 1, 1, 0, 0])
     enc = encoders.CatBoostEncoder()
     obtained = enc.fit_transform(X, y)
     prior = 3. / 7  # Since we do not support prior passing, we replace the prior in the reference = 0.05 with the sample prior = 3/7.
     # Each row's value is (running target sum of earlier same-category rows
     # + prior) / (count of earlier same-category rows + 1).
     self.assertEqual(list(obtained['col1']), [
         prior, prior, prior / 2, (1 + prior) / 3, prior, prior / 2,
         (2 + prior) / 4
     ])
예제 #28
0
    def __handle_cat_features(self, test_size):
        """Fill categorical NaNs, CatBoost-encode a train/test split, and keep
        the encoded held-out split (with its target re-attached) as ``self.df``.
        The pre-encoding frame is preserved in ``self.original_df``.
        """
        self.df[self.categorical_feat] = self.df[self.categorical_feat].fillna(
            'MISSING')
        self.original_df = self.df.copy()

        train_part, test_part = train_test_split(self.df,
                                                 test_size=test_size,
                                                 random_state=9999)

        # pop() removes the target column in place from each split.
        train_target = train_part.pop(self.target)
        test_target = test_part.pop(self.target)

        encoder = ce.CatBoostEncoder(cols=self.categorical_feat,
                                     drop_invariant=False).fit(train_part,
                                                               train_target)
        train_part = encoder.transform(train_part)
        test_part = encoder.transform(test_part)

        self.df = test_part.join(test_target)
예제 #29
0
    def __init__(self, model_name, df, teams_df):
        # Store inputs and set up the preprocessing pipeline pieces.
        self.model_name = model_name
        self.df = df
        self.teams_df = teams_df
        # NOTE(review): ``ca`` is presumably category_encoders imported under
        # that alias — confirm against the file's imports (other examples use
        # ``ce``).
        self.cat_encoding = ca.CatBoostEncoder()
        self.scaler = StandardScaler()
        self.clf = None
        self.save_folder = 'saved_models'

        # Feature list: every column not blacklisted in NOT_FEATURES.
        self.features = list(
            filter(lambda x: x not in NOT_FEATURES, df.columns))
        self.category_columns = df[self.features].select_dtypes(
            'category').columns
        self.to_standard_features = df[self.features].select_dtypes(
            [int, float]).columns

        # NOTE(review): the CatBoost encoder is attached to ``self.features``
        # (all features), not ``self.category_columns`` — verify this is
        # intentional.
        self.column_transformer = make_column_transformer(
            (self.cat_encoding, self.features),
            (self.scaler, self.to_standard_features),
            remainder='passthrough')
예제 #30
0
    def categoricalEncoding(self):
        '''
        Fits a CatBoost encoder on the training data (against a label-encoded
        target) and transforms the train, test and external categorical data
        with it.

        Returns
        -------
        None.

        '''
        print('Encoding categorical variable')
        # The target must be numeric for the encoder, hence the LabelEncoder.
        label_enc = LabelEncoder()
        encoded_target = label_enc.fit_transform(self.y_train)
        encoder = ce.CatBoostEncoder(drop_invariant=True,
                                     return_df=True,
                                     random_state=2020)
        encoder.fit(self.X_train, encoded_target)
        self.X_train = encoder.transform(self.X_train)
        self.X_test = encoder.transform(self.X_test)
        self.external_data = encoder.transform(self.external_data)