Example No. 1
def target_encoding(train,
                    target,
                    test=None,
                    feat_to_encode=None,
                    smooth=0.2,
                    random_state=9527):
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    oof = pd.DataFrame([])
    for tr_idx, oof_idx in StratifiedKFold(n_splits=5,
                                           random_state=random_state,
                                           shuffle=True).split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode,
                                          smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        # DataFrame.append was removed in pandas 2.0; concat instead
        oof = pd.concat(
            [oof, ce_target_encoder.transform(train.iloc[oof_idx, :])])
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
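
A minimal usage sketch of the helper above (the toy frame, column names, and values are invented; it assumes pandas, StratifiedKFold, and TargetEncoder are imported exactly as the function requires):

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from category_encoders import TargetEncoder

# Ten rows with five per class so the 5-fold stratified split is valid.
train = pd.DataFrame({'color': list('rgbrgbrgbr'),
                      'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]})
test = pd.DataFrame({'color': ['r', 'b']})

train_enc, test_enc, features, target = target_encoding(
    train, target='label', test=test, feat_to_encode=['color'])
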
def run(dataset_version, params):
    train, val, test = load_data(dataset_version)

    X = train.drop(columns='target_pct_vunerable')
    y = train.target_pct_vunerable

    # Will use this as local val score and compare with CV score
    X_val = val.drop(columns='target_pct_vunerable')
    y_val = val.target_pct_vunerable

    X_test = test.copy()

    # Create categorical encoder
    cat_cols = X.select_dtypes('object').columns.tolist()
    enc = TargetEncoder(cols=cat_cols)

    # Tune no. estimators on validation set
    X_train = enc.fit_transform(X, y)
    X_val = enc.transform(X_val)

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train,
              y,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              verbose=25,
              early_stopping_rounds=50)

    params.update({'n_estimators': model.best_iteration_})

    # Combine validation set back with train set
    data = pd.concat([train, val], axis=0, sort=False)

    X = data.drop(columns='target_pct_vunerable')
    y = data.target_pct_vunerable

    X = enc.fit_transform(X, y)

    model = lgb.LGBMRegressor(**params)
    model.fit(X, y)

    # Make a submission file
    X_test = enc.transform(X_test)

    test_preds = model.predict(X_test)

    sub = pd.DataFrame({'ward': X_test.index, y.name: test_preds})

    now = datetime.now()
    fname = f'lgbm_{dataset_version}_{now.year}-{now.month}-{now.day}--{now.hour}-{now.minute}.csv'

    sub.to_csv(f'../data/submissions/{fname}', index=False)
Example No. 3
def TargetEncode(data, target):
    # Select all categorical columns
    data_to_encode = data.select_dtypes(include=['object'])
    cols = list(data_to_encode.columns)
    print('Data to be encoded:')
    print(len(cols))
    print('\n'.join(cols))
    print('\n')
    # Encode every categorical column with a single target encoder
    model = TargetEncoder().fit(X=data[cols], y=data[target])
    # Save the fitted target-encoding model to disk
    filename = "targetencodemodel.sav"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print("Model saved in", filename)
    print("\n")
    data[cols] = model.transform(X=data[cols])
    # Return encoded data
    return data
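
A sketch of loading the pickled encoder back and applying it to fresh rows (new_data is a hypothetical frame; cols is the attribute category_encoders populates with the fitted column list):

import pickle

with open("targetencodemodel.sav", 'rb') as f:
    loaded_model = pickle.load(f)

# new_data stands in for unseen rows carrying the same categorical columns
new_data[loaded_model.cols] = loaded_model.transform(new_data[loaded_model.cols])
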
Example No. 4
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat(
            [new_X, self.model.transform(X[self.transform_cols])], axis=1)

        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
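
A quick sketch of DFMeanEncoder on a toy frame (names invented): the listed column is mean-encoded while every other column passes through untouched.

import pandas as pd

df = pd.DataFrame({'city': ['a', 'b', 'a', 'b'], 'n': [1, 2, 3, 4]})
y = pd.Series([1, 0, 1, 0])

enc = DFMeanEncoder(columns=['city'])
out = enc.fit_transform(df, y)
print(out.columns.tolist())  # ['n', 'city'] - 'n' untouched, 'city' encoded
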
Example No. 5
    def data_prepare(self):
        self.__train_feature_before = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_before_df.csv"))
        self.__train_feature_after = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_after_df.csv"))
        self.__train = pd.concat(
            [self.__train_feature_before, self.__train_feature_after])
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"].copy()
        self.__train_feature = (self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                  axis=1)).copy()
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()].copy()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()

        encoder = TargetEncoder()
        encoder.fit(self.__train_feature[self.__categorical_columns],
                    self.__train_label)
        self.__train_feature[self.__categorical_columns] = encoder.transform(
            self.__train_feature[self.__categorical_columns])
Example No. 6
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        Xc = X.copy()

        # encode emp_length
        lookup = {
            '< 1 year': 0,
            '1 year': 1,
            '2 years': 2,
            '3 years': 3,
            '4 years': 4,
            '5 years': 5,
            '6 years': 6,
            '7 years': 7,
            '8 years': 8,
            '9 years': 9,
            '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)

        # issue date
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        # keep the raw dates around for the earliest-credit-line feature below
        tmp = Xc["issue_d"].copy()
        # whole months since 2000-01-01 (the old .astype('timedelta64[M]') cast
        # is rejected by recent pandas versions)
        Xc["issue_d"] = (tmp.dt.year - 2000) * 12 + (tmp.dt.month - 1)

        # earliest credit line: whole months between issue date and first credit line
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            (tmp.dt.year - Xc["earliest_cr_line"].dt.year) * 12 +
            (tmp.dt.month - Xc["earliest_cr_line"].dt.month))

        # imputation for home_ownership, purpose, addr_state, and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])

        # encode zip code
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])

        return Xc

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
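
Because the class implements the fit/transform protocol, it can be used like any scikit-learn transformer; a sketch (loans_train, loans_test, and y_train are hypothetical and must carry the columns referenced above):

prep = CategoricalPreprocessing()
train_clean = prep.fit_transform(loans_train, y_train)  # fit imputer + encoder on train only
test_clean = prep.transform(loans_test)                 # reuse the fitted state on test
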
Example No. 7
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
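
A usage sketch with invented names (it assumes the imports the function itself needs: pandas, numpy, PredefinedSplit, TargetEncoder, and tqdm). cv_id assigns each training row to a fold, which PredefinedSplit turns into the out-of-fold scheme the function expects.

import numpy as np
import pandas as pd

X_train = pd.DataFrame({'cat': list('ababab')})
y_train = pd.Series([1, 0, 1, 0, 1, 0])
X_test = pd.DataFrame({'cat': ['a', 'b']})
cv_id = np.array([0, 0, 1, 1, 2, 2])  # row i is held out in fold cv_id[i]

train_te, test_te = target_encoding(X_train, y_train, X_test,
                                    cols=['cat'], cv_id=cv_id)
print(train_te.columns.tolist())  # ['cat_target']
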
Example No. 8
def fit_model(X_train, y_train, X_val, y_val, **params):

    if args.model == "catboost":

        if args.gpu:
            model = CatBoostRegressor(**params, loss_function="RMSE", random_state=42, use_best_model=True,
                                      task_type="GPU")
        else:
            model = CatBoostRegressor(**params, loss_function="RMSE", random_state=42, use_best_model=True,
                                      task_type="CPU")
        model.fit(X_train, y_train,
                  cat_features=cat_cols,
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  eval_set=(X_val, y_val),
                  plot=False)
        return model, None

    elif args.model == "xgboost":

        te = TargetEncoder(cols=cat_cols, smoothing=300)
        te.fit(X_train, y_train)
        X_train = te.transform(X_train)
        X_val = te.transform(X_val)
        if args.gpu:
            model = XGBRegressor(**params, random_state=42, verbosity=1,
                                 tree_method='gpu_hist', gpu_id=0, predictor="cpu_predictor")
        else:
            model = XGBRegressor(**params, random_state=42, verbosity=1)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train),
                            (X_val, y_val)],
                  eval_metric="rmse",
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  verbose=True)
        return model, te

    else:

        raise ValueError("Invalid value passed to model. Has to be either 'catboost' or 'xgboost'.")
Example No. 9
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    if method is None:
        return df_X.values, df_X.columns

    # note: impute_missing is only accepted by older category_encoders releases
    target_enc = TargetEncoder(cols=cat_vars,
                               drop_invariant=False,
                               return_df=True,
                               impute_missing=False,
                               handle_unknown='error')
    target_enc.fit(df_X.iloc[id_train], pd.Series(y).iloc[id_train])
    df_X = target_enc.transform(df_X)

    return df_X.values, df_X.columns
    def _create_feature(cls, conf) -> pd.DataFrame:
        df = Application.get_df(conf)

        # fit with train data and transform both data
        categorical_columns = [
            col for col in df.columns if df[col].dtype == 'object'
        ]
        train_df = df[df['TARGET'].notnull()].copy()
        test_df = df[df['TARGET'].isnull()].copy()

        feature = pd.DataFrame()
        folds = StratifiedKFold(**conf.model.kfold_params)
        for n_fold, (train_idx, valid_idx) in tqdm(
                enumerate(
                    folds.split(train_df[categorical_columns],
                                train_df['TARGET'])),
                total=conf.model.kfold_params.n_splits):
            encoder = TargetEncoder(cols=categorical_columns).fit(
                train_df.iloc[train_idx][categorical_columns + ['SK_ID_CURR']],
                train_df.iloc[train_idx]['TARGET'])
            valid_te = encoder.transform(
                train_df.iloc[valid_idx][categorical_columns +
                                         ['SK_ID_CURR']]).rename(columns={
                                             col: f"{col}_target_encode"
                                             for col in categorical_columns
                                         })
            test_te = encoder.transform(
                test_df[categorical_columns + ['SK_ID_CURR']]).rename(columns={
                    col: f"{col}_target_encode"
                    for col in categorical_columns
                })
            # DataFrame.append was removed in pandas 2.0; concat instead
            feature = pd.concat([feature, valid_te, test_te], sort=True)

        # take mean of oof target mean for test data
        feature = feature.groupby('SK_ID_CURR').mean()

        return feature
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        X = X.copy(deep=True)
        y = y.copy(deep=True)

        self.__target_encoder = TargetEncoder()
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)

        self.__max_target, self.__max_param = optimize_rf(X, y)
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))

        self.__clf.fit(X, y)
        gc.collect()

        return self

    def transform(self, X):
        X = X.copy(deep=True)

        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()

        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X=X, y=y)

        return self.transform(X)
Example No. 12
def frontend_preproc(df, y):
    '''
    Preprocess the DataFrame before applying the model on the front-end.
    :df: concat of the user's df_input and the model's X features
    :y: target
    '''
    ### Feature Engineering
    ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']

    # OHE
    ohe = OneHotEncoder(categories='auto')
    feature_arr = ohe.fit_transform(df[ohe_cols]).toarray()
    feature_labels = ohe.categories_

    # Using a dictionary to produce all the new OHE columns
    feature_cols = []
    for k, v in dict(zip(ohe_cols, feature_labels)).items():
        for i in v:
            el = k + '_' + str(i)
            feature_cols.append(el)

    ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(ohe_cols, axis=1)

    # Target Encoding
    cat_cols = df.select_dtypes(exclude=["number"]).columns
    cols_encoded = list(map(lambda c: c + '_encoded', cat_cols))

    t_encoder = TargetEncoder()
    # fit on every row except the first (the user-input row), so lengths match y
    t_encoder.fit(df[1:][cat_cols], y)
    df[cols_encoded] = t_encoder.transform(df[cat_cols])
    df = df.drop(cat_cols, axis=1)

    # Column Transformation: QuantileTransformer
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=33)

    data = qt.fit_transform(df)
    df = pd.DataFrame(data, columns=df.columns)
    
    return df
def clean_train_data_target_encoded(data):
    # uses a target encoder instead
    data = data.reset_index(drop=True)
    train_y = data.iloc[:,-1]
    train_y = train_y.reset_index(drop=True)
    train_X = data.iloc[:,:-1]
    
    train_X = process_features(train_X)
    
    
    encoder = TargetEncoder(cols = ["Hair Color",
         "Wears Glasses","University Degree","Gender","Country","Profession", 
         "Housing Situation", "Satisfation with employer"], smoothing = 300)

    encoder.fit(train_X,train_y)
    data2 =  pd.concat([encoder.transform(train_X,train_y).reset_index(drop=True),train_y.reset_index(drop=True)],axis=1)
    #data2 = data2.fillna(method="ffill")
    
    return (data2,encoder)
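
The encoder returned alongside the encoded frame can then be reused on the test split; a sketch (raw_train and raw_test are hypothetical frames that go through the same pipeline):

train_encoded, encoder = clean_train_data_target_encoded(raw_train)
test_X = process_features(raw_test)       # same preprocessing as training
test_encoded = encoder.transform(test_X)  # no target is passed at prediction time
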
Example No. 14
class ScatterPlot(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        self.__train = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]

        self.__encoder = None
        self.__pca, self.__t_sne = [None for _ in range(2)]

    def data_read(self):
        self.__train = pd.read_csv(os.path.join(self.__input_path,
                                                "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature, self.__train_label = (self.__train.drop(
            ["target"],
            axis=1).copy(deep=True), self.__train["target"].copy(deep=True))
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)

        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(self.__train_feature,
                                            columns=["col_1", "col_2"])

        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        _, ax = plt.subplots(figsize=(16, 9))
        ax = sns.scatterplot(x="col_1",
                             y="col_2",
                             hue=self.__train_label,
                             data=self.__train_feature,
                             ax=ax)
        ax.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))
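
A sketch of driving the class above (paths are placeholders):

plotter = ScatterPlot(input_path='./input', output_path='./output')
plotter.data_read()
plotter.data_prepare()
plotter.scatter_plot()
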
def ProcessRawData(df, schemaCols=None):

    medianSimpleImputer = SimpleImputer(strategy='median')
    standardScaler = preprocessing.StandardScaler()

    # Adding extra features AgeLog and HeightLog
    df['AgeLog'] = np.log(df['Age'].values)
    df['HeightLog'] = np.log(df['Body Height [cm]'].values)

    # Fill missing values
    df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']] = medianSimpleImputer.fit_transform(df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']].values)

    # Scale numeric columns 1
    df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']] = standardScaler.fit_transform(df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']].values)

    # Scale numeric columns 2
    if 'Income in EUR' in df.columns:
        global YScaler
        YScaler = preprocessing.StandardScaler()
        df[['Income in EUR']] = YScaler.fit_transform(df[['Income in EUR']].values)

    # Reducing complexity of features
    df.Profession = list(df.Profession.map(S2))

    # To be used while writing results to CSV
    instances = df['Instance'].values
    df = df.drop(['Instance'], axis=1)

    print('Columns available 1 - ', df.columns)

    # Target encoding the data - could have been done with a single encoder object
    if schemaCols is None:  # skip fitting on the prediction dataset and only transform there
        global t1, t2, t3, t4, t5
        t1 = TargetEncoder()
        t2 = TargetEncoder()
        t3 = TargetEncoder()
        t4 = TargetEncoder()
        t5 = TargetEncoder()
        t1.fit(df.Country.values, df['Income in EUR'].values)
        t2.fit(df.Profession.values, df['Income in EUR'].values)
        t3.fit(df.Gender.values, df['Income in EUR'].values)
        t4.fit(df['University Degree'].values, df['Income in EUR'].values)
        t5.fit(df['Hair Color'].values, df['Income in EUR'].values)

    df.Country = t1.transform(df.Country.values)
    df.Profession = t2.transform(df.Profession.values)
    df.Gender = t3.transform(df.Gender.values)
    df['University Degree'] = t4.transform(df['University Degree'].values)
    df['Hair Color'] = t5.transform(df['Hair Color'].values)

    if (schemaCols is not None):
        newdf = pd.DataFrame()
        for columnName in schemaCols:
            if columnName not in df.columns:
                newdf[columnName] = 0
            else:
                newdf[columnName] = df[columnName].values
        df = newdf

    df = df.sort_index(axis=1)

    # standardize datasets prediction and training to use the same code from there on
    if 'Income in EUR' not in df.columns:
        df['Income in EUR'] = np.zeros(df.values.shape[0])

    if 'Income' in df.columns:
        df = df.drop('Income', axis=1)

    X = df.drop('Income in EUR', axis=1).values
    Y = df['Income in EUR'].values

    print('Shape - ', df.shape)

    global featSel
    if featSel is None:
        print('k = ? ')
        featSel = SelectKBest(f_regression, k=10)
        featSel.fit(X, Y)

    X = featSel.transform(X)
    print('Shape after feature selection - ', X.shape)
    return instances, X, Y, df.columns
class LightGbmOneFold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None

        # model fit
        self.__folds = None
        self.__train_preds = None
        self.__test_preds = None
        self.__gbm = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path, "sample_submission.csv"))

        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()].copy()

        self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns], self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        feature_importance_df = pd.DataFrame()

        self.__gbm = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.0128,
            max_depth=8,
            num_leaves=11,
            min_split_gain=0.0018,
            min_child_weight=2.6880,
            colsample_bytree=0.5672,
            subsample=0.6406,
            reg_alpha=3.5025,
            reg_lambda=0.9549,
            n_jobs=-1
        )

        self.__gbm.fit(self.__train_feature, self.__train_label, verbose=True)
        self.__train_preds = self.__gbm.predict_proba(self.__train_feature)[:, 1]
        self.__test_preds = self.__gbm.predict_proba(self.__test_feature)[:, 1]

        feature_importance_df["feature"] = pd.Series(self.__train_feature.columns)
        feature_importance_df["importance"] = self.__gbm.feature_importances_
        feature_importance_df.to_csv(os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        print("Train AUC score %.6f" % roc_auc_score(self.__train_label, self.__train_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__test_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
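
A sketch of running the three stages in order (paths are placeholders):

model = LightGbmOneFold(input_path='./input', output_path='./output')
model.data_prepare()
model.model_fit()
model.model_predict()
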
Example No. 17
#training['Hair Color'] = training['Hair Color'].replace( '0' ,'Nan_data')
#training['Hair Color'] = training['Hair Color'].replace( 'Unknown' ,'Nan_data')

#test['Hair Color'] = test['Hair Color'].replace( np.nan ,'Nan_data')
#test['Hair Color'] = test['Hair Color'].replace( '0' ,'Nan_data')
#test['Hair Color'] = test['Hair Color'].replace( 'Unknown' ,'Nan_data')

X = training.iloc[:, :-1]
y = training.iloc[:, -1]

#Target encoding for categorical features.

te = TargetEncoder()
te.fit(X, y)

X = te.transform(X)

predict_dataset = te.transform(test)

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=42)

#from catboost import CatBoostRegressor

# Using CatBoost
#cat_model3 = CatBoostRegressor(iterations=125000)
#cat_model3.fit(x_train, y_train)
Example No. 18
import pandas as pd
from category_encoders import TargetEncoder
import joblib

data = pd.read_csv('./京东万象数据填充2.csv', encoding='GBK')
data = data.dropna(subset=['价格'])
data = data.dropna(subset=['数据标签'])
data = data.dropna(subset=['数据名称'])
data = data.dropna(subset=['店铺'])

enc = TargetEncoder(cols=['数据名称', '店铺', '数据标签'])
# print(type(enc))
dataframe = data[['数据名称', '店铺', '数据标签', '数据大小', '浏览量', '价格']]
enc.fit(dataframe, dataframe['价格'])

data1 = enc.transform(dataframe)
# print(type(data1))
# dataframe = pd.DataFrame({'数据名称': data1['数据名称'], '店铺': data1['店铺'],
#                           '数据标签': data1['数据标签'], '数据大小': data1['数据大小'],
#                           '浏览量': data1['浏览量'], '价格': data1['价格']})
joblib.dump(enc, 'encoding.joblib')
data1.to_csv('final_data.csv', encoding='GBK', sep=',')
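
A sketch of reloading the dumped encoder later to transform new rows (here the dataframe defined above stands in for genuinely new data):

loaded_enc = joblib.load('encoding.joblib')
encoded_again = loaded_enc.transform(dataframe.head())
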

train_data_1=pd.concat([train_myVolts_Null_item_type, train_myVolts_Not_Null_item_type], axis=0)
train_data_2=pd.concat([train_myVolts_Null_cbf_parser, train_myVolts_Not_Null_cbf_parser], axis=0)

train_myVolts['item_type']=train_data_1['item_type']
train_myVolts['cbf_parser']=train_data_2['cbf_parser']
train_myVolts['country_by_ip']=train_myVolts['country_by_ip'].fillna('missing')
print('Values with NANs Train',train_myVolts[feature_cols].isnull().sum())

y = train_myVolts.set_clicked
X = train_myVolts[feature_cols]
from category_encoders import TargetEncoder
t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=1234)

##check
X_train.to_csv('X_train4.csv',index=False)
y_train.to_csv('y_train4.csv',index=False)


from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
logreg1 =LogisticRegression ()
# logreg1 = RandomForestClassifier(n_estimators=500)
# for random forest is 0.9920 but logistic is 0.9922
class LightGbmKfold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path

        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_feature_stacking_tree, self.__test_feature_stacking_tree = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_linear, self.__test_feature_stacking_linear = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_network, self.__test_feature_stacking_network = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_gp, self.__test_feature_stacking_gp = [
            None for _ in range(2)
        ]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None

        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None
        # self.__metric_weight = []

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))

        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))
        # stacking tree
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__test_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_test.csv"))
        # stacking linear
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__test_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_test.csv"))
        # stacking network
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__test_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_test.csv"))
        # gp
        self.__train_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__test_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_test_feature.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()].copy()

        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(
            self.__train_feature.loc[:, self.__categorical_columns],
            self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns]))
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns]))

        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network,
            self.__train_feature_stacking_gp
        ],
                                         axis=1)
        self.__test_feature = pd.concat([
            self.__test_feature, self.__test_feature_stacking_tree,
            self.__test_feature_stacking_linear,
            self.__test_feature_stacking_network,
            self.__test_feature_stacking_gp
        ],
                                        axis=1)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=4,
                                       shuffle=True,
                                       random_state=8)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
        # self.__sub_preds = np.zeros(shape=(self.__test_feature.shape[0], 5))

        feature_importance_df = pd.DataFrame()
        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[
                trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[
                val_idx], self.__train_label.iloc[val_idx]

            self.__gbm = LGBMClassifier(colsample_bytree=0.6659,
                                        learning_rate=0.0197,
                                        max_depth=8,
                                        min_child_weight=1.0652,
                                        min_split_gain=0.058,
                                        n_estimators=501,
                                        num_leaves=11,
                                        reg_alpha=2.2487,
                                        reg_lambda=6.2587,
                                        subsample=0.9401)

            self.__gbm.fit(trn_x,
                           trn_y,
                           eval_set=[(trn_x, trn_y), (val_x, val_y)],
                           eval_metric="auc",
                           verbose=True,
                           early_stopping_rounds=5)
            pred_val = self.__gbm.predict_proba(
                val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(
                self.__test_feature,
                num_iteration=self.__gbm.best_iteration_)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            # self.__sub_preds[:, n_fold] = pred_test

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(
                self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            # save the per-fold weight
            # self.__metric_weight.append(roc_auc_score(val_y, self.__oof_preds[val_idx]))
            print(
                "Fold %2d AUC : %.6f" %
                (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))

        feature_importance_df.to_csv(os.path.join(self.__output_path,
                                                  "feature_importance.csv"),
                                     index=False)
        print("Full AUC score %.6f" %
              roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        # weight sum
        # self.__metric_weight = pd.Series(self.__metric_weight).rank()
        # self.__metric_weight = self.__metric_weight / self.__metric_weight.sum()
        # self.__metric_weight = self.__metric_weight.values.reshape((5, 1))
        # self.__sub_preds = np.dot(self.__sub_preds, self.__metric_weight)
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path,
                                                     "sample_submission.csv"),
                                        index=False)
Example No. 21
# SKLEARN TARGET ENCODING

#!pip install category_encoders
from category_encoders import TargetEncoder

us_adults = pd.read_csv("./adult.csv", na_values="?")

us_adults.head()

features_original = [f for f in us_adults.columns if f != "income"]

features_original

# assumed label mapping for the adult-census income column (not shown in this snippet)
target_mapping = {'<=50K': 0, '>50K': 1}

#Remap outcome variable
us_adults.loc[:, "income"] = us_adults.income.map(target_mapping)

us_adults.income.value_counts()

te = TargetEncoder(return_df=True, smoothing=0)

te.fit(X=us_adults[features_original], y=us_adults.income)

encoded_df_sk = te.transform(X=us_adults[features_original])

encoded_df_sk.shape

encoded_df_sk.head()
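
With smoothing=0 the encoder blends essentially nothing toward the global mean, so the encoded values should sit close to the per-category target means; a spot check against one column ('workclass' is assumed to be among features_original):

group_means = us_adults.groupby('workclass')['income'].mean()
print(group_means.head())
print(encoded_df_sk['workclass'].head())
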
Example No. 22
class CatBoostKfold(object):

    def __init__(self, *, input_path_1, input_path_2, output_path):
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path

        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_res, self.__test_res = [None for _ in range(2)]

        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None

        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)

        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )

        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )

        # There are NaNs in the test dataset (feature number 77) but there were no NaNs in the learn dataset
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )

        # blending would normally need a shuffle first; not strictly required here,
        # since StratifiedKFold shuffles later anyway
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        for n_fold, (trn_idx, val_idx) in enumerate(self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )
            pred_val = self.__cat.predict_proba(val_x)[:, 1]
            pred_test = self.__cat.predict_proba(self.__test_feature)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
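
As with the one-fold variant, a driver sketch (paths are placeholders):

kfold = CatBoostKfold(input_path_1='./input_1', input_path_2='./input_2',
                      output_path='./output')
kfold.data_prepare()
kfold.model_fit()
kfold.model_predict()
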
Example No. 23
        X_tst = X_test.copy()
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

        # with timer('weight of evidence'):
        #     cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
        #     woe = WeightOfEvidence(cols=cat_cols, suffix='woe')
        #     X_trn = pd.concat([X_trn, woe.fit_transform(X_trn.loc[:, cat_cols], y_trn)], axis=1)
        #     X_val = pd.concat([X_val, woe.transform(X_val.loc[:, cat_cols])], axis=1)
        #     X_tst = pd.concat([X_tst, woe.transform(X_tst.loc[:, cat_cols])], axis=1)

        with timer('target encoding'):
            cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
            te = TargetEncoder(cols=cat_cols)
            X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols],
                                                      y_trn)
            X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
            X_tst.loc[:, cat_cols] = te.transform(X_tst.loc[:, cat_cols])

        # with timer('calc sample weight'):
        #     X_trn['is_test'] = 0
        #     X_tst['is_test'] = 1
        #     df = pd.concat([X_trn, X_tst])
        #     X = df.drop('is_test', axis=1)
        #     y = df.is_test.ravel()
        #     model = lgb.LGBMClassifier(**calc_weight_params)
        #     model.fit(X, y)
        #     proba = np.sqrt(rankdata(model.predict_proba(X)[:len(X_trn), 1])/len(X_trn))
        #     X_trn.drop('is_test', axis=1)
        #     X_tst.drop('is_test', axis=1)

        with timer('fit'):
Example No. 24
M = pd.read_csv('prediction_data.csv')
M['Year of Record'] = simpleimputermedian.fit_transform(
    M['Year of Record'].values.reshape(-1, 1))
M['Age'] = simpleimputermedian.fit_transform(M['Age'].values.reshape(-1, 1))
M['Body Height [cm]'] = simpleimputermedian.fit_transform(
    M['Body Height [cm]'].values.reshape(-1, 1))
Mnoncateg = M.drop(
    ['Instance', 'Hair Color', 'Wears Glasses', 'Income'],
    axis=1)

X = datasetnoncateg.drop('Income in EUR', axis=1).values
Y = datasetnoncateg['Income in EUR'].values
#target encoding
t1 = TargetEncoder()
t1.fit(X, Y)
X = t1.transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                Y,
                                                test_size=0.33,
                                                random_state=0)
# regressor = BayesianRidge()
regressor = RandomForestRegressor()
#regressor = AdaBoostRegressor()
#regressor = = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)

fitResult = regressor.fit(Xtrain, Ytrain)
YPredTest = regressor.predict(Xtest)
#learningTest = pd.DataFrame({'Predicted': YPredTest, 'Actual': Ytest })
np.sqrt(metrics.mean_squared_error(Ytest, YPredTest))
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7, random_state=100)

# Installing category_encoders to import Target Encoder
!pip install category_encoders

# Importing Target Encoder
from category_encoders import TargetEncoder

# creating an object "te" for Target Encoder
te=TargetEncoder()

# Fitting Target Encoder on X_train and y_train (Training Data)
te.fit(X_train,y_train)

#Transforming X_train (Training Data)
X_train=te.transform(X_train)

#Transforming X_test (Test Data)
X_test=te.transform(X_test)

#Importing Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression

# Creating object for Logistic Regression
lr=LogisticRegression()

#Fitting Logistic Regression on X_train and y_train (Training Data)
lr.fit(X_train,y_train)
#Predicting on X_test (Test Data)
y_pred_train=lr.predict(X_test)
Example No. 26
y = X.Income
y = y - X['Additional_income']
X = X.drop('Income', axis=1)
X = X.drop('Instance', axis=1)
X = X.drop('Additional_income', axis=1)

y1 = X1.Income
y1 = y1 - X1['Additional_income']
X1 = X1.drop('Income', axis=1)
X1 = X1.drop('Instance', axis=1)
temp = X1['Additional_income']
X1 = X1.drop('Additional_income', axis=1)

t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)
X1 = t1.transform(X1)

mm_scaler = preprocessing.MinMaxScaler()
X = mm_scaler.fit_transform(X)
X1 = mm_scaler.transform(X1)

from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                y,
                                                test_size=0.10,
                                                random_state=0)

#from sklearn.linear_model import BayesianRidge
#regressor = BayesianRidge()
#reg = regressor.fit(X, y)
class BayesianOptimizationGoss(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path

        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__train_feature_stacking_tree = None
        self.__train_feature_stacking_linear = None
        self.__train_feature_stacking_network = None
        self.__train_feature_gp = None
        self.__encoder = None
        self.__categorical_columns = None

        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__train_feature_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ],
                                                 axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network
        ],
                                         axis=1)

    def parameter_tuning(self):
        def __cv(drop_rate, max_drop, skip_drop, n_estimators, learning_rate,
                 max_depth, num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, subsample, reg_alpha, reg_lambda):
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="dart",
                    drop_rate=max(min(drop_rate, 1.0), 0),
                    max_drop=max(round(max_drop), 1),
                    skip_drop=max(min(skip_drop, 1.0), 0),
                    n_estimators=max(round(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=max(round(max_depth), 1),
                    # cap leaves at 2 ** max_depth ('**', not the XOR operator '^')
                    num_leaves=max(
                        round(2 ** round(max_depth)
                              if num_leaves > 2 ** round(max_depth)
                              else round(num_leaves)), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                # must match the StratifiedKFold used by the blending lightgbm
                cv=StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=8)).mean()

            return val

        self.__gbm_params = {
            # dart parameter
            "drop_rate": (0, 1.0),
            "max_drop": (10, 200),
            "skip_drop": (0, 1.0),
            # Gradient boosting parameter
            "n_estimators": (500, 3000),
            "learning_rate": (0.001, 0.1),
            # tree parameter
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameter
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # reg parameter
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=30, n_iter=130, **self.__gp_params)
with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx].copy()
        y_trn = y_train[trn_idx].copy()
        X_val = X_train.iloc[val_idx].copy()
        y_val = y_train[val_idx].copy()
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)
        with timer('target encoding'):
            cat_cols = [f for f in X_trn.columns if X_trn[f].dtype=='object']
            te = TargetEncoder(cols=cat_cols)
            X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn).values
            X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols]).values
            X_test_ = X_test.copy()
            X_test_.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols]).values
            # fillna returns a new frame; assign it back (the bare call was a no-op)
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)
        
        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params)
        
        p = model.predict_proba(X_val)[:, 1]
        val_series.iloc[val_idx] = p
        cv_results.append(roc_auc_score(y_val, p))
        test_df[i] = model.predict_proba(X_test_)[:, 1]
        feat_df[i] = model.feature_importances_
Example No. 29
    "D:\PythonProjects\ML_Group_Data/tcd-ml-comp-201920-income-pred-group/test.csv"
)

train_data = preprocessing(train)
test_data = preprocessing(test)

y = train_data[target]
train_data.drop(target, axis=1, inplace=True)
test_data.drop(target, axis=1, inplace=True)

enc = TargetEncoder(cols=[
    'Gender', 'Country', 'Profession', 'University Degree',
    'Housing Situation', 'Satisfation with employer'
])
enc.fit(train_data, y)
train_data = enc.transform(train_data)
test_data = enc.transform(test_data)
train_data.head()
test_data.head()

#X_Train, X_Test, y_train, y_test = train_test_split(train_data, y, test_size=0.3, random_state=1)
X_Train = train_data
y_train = y

y_train_log = np.log(y_train)

training = lgb.Dataset(X_Train, y_train_log)
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['num_leaves'] = 140
Example No. 30
with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx]
        y_trn = y_train[trn_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train[val_idx]
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)
        with timer('target encoding'):
            te = TargetEncoder()
            X_trn = te.fit_transform(X_trn, y_trn)
            X_val = te.transform(X_val)
            X_test_ = te.transform(X_test)
            # assign the fillna results back; the bare calls were no-ops
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)
        
        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params)
        
        p = model.predict_proba(X_val)[:, 1]
        val_series.iloc[val_idx] = p
        cv_results.append(roc_auc_score(y_val, p))
        test_df[i] = model.predict_proba(X_test_)[:, 1]
        feat_df[i] = model.feature_importances_
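
After the loop, the fold scores are typically summarized, the out-of-fold predictions scored once, and the per-fold test predictions averaged; a sketch consistent with the variables above (assumes numpy as np and roc_auc_score are already imported):

print('CV AUC: %.6f +/- %.6f' % (np.mean(cv_results), np.std(cv_results)))
print('OOF AUC: %.6f' % roc_auc_score(y_train, val_series))
test_preds = test_df.mean(axis=1)  # average the per-fold test predictions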