# Imports assumed by these snippets (not shown in the original excerpt):
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV


def train(x_train, y_train, x_valid, y_valid, n_estimators_0, objective,
          eval_metric, scoring, rmspe_xg, kfold, esr):
    # 1 - Set initial parameter values
    print("1 - Set initial parameter values")
    reg = XGBRegressor(
        # General Parameters
        booster="gbtree",
        silent=1,
        nthread=-1,
        n_jobs=-1,
        # Booster Parameters
        learning_rate=0.1,
        n_estimators=n_estimators_0,
        gamma=0,
        max_depth=7,
        min_child_weight=0.001,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0,
        reg_lambda=1,
        max_delta_step=0,
        scale_pos_weight=1,
        # Learning Task Parameters
        objective=objective,
        eval_metric=eval_metric,
        seed=0)

    # 2 - Find the optimal number of boosting rounds: n_estimators_1
    print("2 - Find the optimal number of boosting rounds: n_estimators_1")
    xgb_param = reg.get_xgb_params()
    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    watchlist = [(d_train, "train"), (d_valid, "valid")]

    t_begin = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_1 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_1)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_1, (t1 - t_begin)))

    # 3 - Coarse search: learning_rate
    print("3 - Coarse search: learning_rate")
    param = {"learning_rate": [0.1, 0.2, 0.3]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_3 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    # model_3.cv_results_; model_3.best_score_; model_3.best_estimator_  (grid_scores_ in older scikit-learn)
    best_param = model_3.best_params_["learning_rate"]
    reg.set_params(learning_rate=best_param)
    xgb_param = reg.get_xgb_params()
    print("learning_rate:%s, 用时:%s" % (best_param, (t1 - t0)))

    # 4 - Coarse search: max_depth, min_child_weight
    print("4 - Coarse search: max_depth, min_child_weight")
    param = {
        "max_depth": [3, 5, 7, 9, 11],
        "min_child_weight": [0.001, 0.01, 0.1, 1]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_4 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_4.best_params_["max_depth"]
    best_param_2 = model_4.best_params_["min_child_weight"]
    # apply min_child_weight now; max_depth is refined further in step 5
    reg.set_params(min_child_weight=best_param_2)
    print("max_depth: %s, min_child_weight: %s, elapsed: %s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 5 - Fine search: max_depth
    print("5 - Fine search: max_depth")
    param = {"max_depth": [best_param_1 - 1, best_param_1, best_param_1 + 1]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_5 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_5.best_params_["max_depth"]
    reg.set_params(max_depth=best_param_1)
    xgb_param = reg.get_xgb_params()
    print("max_depth:%s,用时:%s" % (best_param_1, (t1 - t0)))

    # 6 - Coarse search: gamma
    print("6 - Coarse search: gamma")
    param = {"gamma": [0, 0.5, 1, 1.5, 2, 2.5]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_6 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_6.best_params_["gamma"]
    print("gamma:%s,用时:%s" % (best_param, (t1 - t0)))

    # 7 - Fine search: gamma
    print("7 - Fine search: gamma")
    if best_param == 0:
        param = {"gamma": [0, 0.1, 0.2, 0.3, 0.4]}
    else:
        param = {"gamma": np.arange(best_param - 0.2, best_param + 0.3, 0.1)}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_7 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_7.best_params_["gamma"]
    reg.set_params(gamma=best_param)
    xgb_param = reg.get_xgb_params()
    print("gamma:%s,用时:%s" % (best_param, (t1 - t0)))

    # 8 - Re-tune the number of boosting rounds: n_estimators_2
    print("8 - Re-tune the number of boosting rounds: n_estimators_2")
    reg.set_params(n_estimators=n_estimators_0)
    xgb_param = reg.get_xgb_params()

    t0 = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_2 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_2)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_2, (t1 - t0)))

    # 9 - Coarse search: subsample, colsample_bytree
    print("9 - Coarse search: subsample, colsample_bytree")
    param = {
        "subsample": [0.6, 0.7, 0.8, 0.9],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_8 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_8.best_params_["subsample"]
    best_param_2 = model_8.best_params_["colsample_bytree"]
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 10 - Fine search: subsample, colsample_bytree
    print("10 - Fine search: subsample, colsample_bytree")
    param = {
        "subsample": [best_param_1 - 0.05, best_param_1, best_param_1 + 0.05],
        "colsample_bytree":
        [best_param_2 - 0.05, best_param_2, best_param_2 + 0.05]
    }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_9 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_9.best_params_["subsample"]
    best_param_2 = model_9.best_params_["colsample_bytree"]
    reg.set_params(subsample=best_param_1, colsample_bytree=best_param_2)
    xgb_param = reg.get_xgb_params()
    print("subsample:%s,colsample_bytree:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 11 - Coarse search: reg_alpha
    print("11 - Coarse search: reg_alpha")
    param = {"reg_alpha": [0, 1, 2, 3]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_11 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_11.best_params_["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_alpha:%s,用时:%s" % (best_param, (t1 - t0)))

    # 12 - Fine search: reg_alpha
    print("12 - Fine search: reg_alpha")
    if best_param == 0:
        param = {"reg_alpha": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}
    else:
        param = {
            "reg_alpha": np.arange(best_param - 0.5, best_param + 0.5, 0.2)
        }
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_12 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_12.best_params_["reg_alpha"]
    reg.set_params(reg_alpha=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_alpha:%s,用时:%s" % (best_param, (t1 - t0)))

    # 13 - Coarse search: reg_lambda
    print("13 - Coarse search: reg_lambda")
    param = {"reg_lambda": [1, 3, 5, 7]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_13 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_13.best_params_["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_lambda:%s,用时:%s" % (best_param, (t1 - t0)))

    # 14 - Fine search: reg_lambda
    print("14 - Fine search: reg_lambda")
    param = {"reg_lambda": np.arange(best_param - 1, best_param + 1, 0.2)}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_14 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_14.best_params_["reg_lambda"]
    reg.set_params(reg_lambda=best_param)
    xgb_param = reg.get_xgb_params()
    print("reg_lambda:%s,用时:%s" % (best_param, (t1 - t0)))

    # 15 - Fine search: max_delta_step, scale_pos_weight
    print("15 - Fine search: max_delta_step, scale_pos_weight")
    param = {"max_delta_step": [0, 1, 3, 5], "scale_pos_weight": [1, 3, 5, 7]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_15 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param_1 = model_15.best_params_["max_delta_step"]
    best_param_2 = model_15.best_params_["scale_pos_weight"]
    reg.set_params(max_delta_step=best_param_1, scale_pos_weight=best_param_2)
    xgb_param = reg.get_xgb_params()
    print("max_delta_step:%s,scale_pos_weight:%s,用时:%s" %
          (best_param_1, best_param_2, (t1 - t0)))

    # 16 - Re-tune the number of boosting rounds: n_estimators_3
    print("16 - Re-tune the number of boosting rounds: n_estimators_3")
    reg.set_params(n_estimators=n_estimators_0)
    xgb_param = reg.get_xgb_params()

    t0 = pd.Timestamp.now()
    xgb_cv = xgb.cv(
        params=xgb_param,
        dtrain=d_train,
        num_boost_round=xgb_param["n_estimators"],
        nfold=kfold,
        feval=rmspe_xg,
        #metrics=eval_metric,
        early_stopping_rounds=int(xgb_param["n_estimators"] / esr),
        verbose_eval=None)
    t1 = pd.Timestamp.now()
    n_estimators_3 = xgb_cv.shape[0]
    reg.set_params(n_estimators=n_estimators_3)
    xgb_param = reg.get_xgb_params()
    print("分类器个数:%s, 用时:%s" % (n_estimators_3, (t1 - t0)))

    # 17 - Fine search: learning_rate
    print("17 - Fine search: learning_rate")
    lr = xgb_param["learning_rate"]
    param = {"learning_rate": [lr - 0.05, lr, lr + 0.05]}
    reg_gscv = GridSearchCV(estimator=reg,
                            param_grid=param,
                            scoring=scoring,
                            n_jobs=-1,
                            iid=False,
                            cv=kfold)

    t0 = pd.Timestamp.now()
    model_16 = reg_gscv.fit(x_train, y_train)
    t1 = pd.Timestamp.now()
    best_param = model_16.best_params_["learning_rate"]
    reg.set_params(learning_rate=best_param)
    xgb_param = reg.get_xgb_params()
    print("learning_rate:%s,用时:%s" % (best_param, (t_1 - t0)))

    # 18 - Final training
    print("18 - Final training")
    model_res = xgb.train(params=xgb_param,
                          dtrain=d_train,
                          num_boost_round=xgb_param["n_estimators"],
                          evals=watchlist,
                          feval=rmspe_xg,
                          early_stopping_rounds=int(xgb_param["n_estimators"] /
                                                    esr))
    t_end = pd.Timestamp.now()
    print("参数训练完毕,总用时:%s" % (t_end - t_begin))
    return model_res, reg
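
# A sketch of how this tuner might be invoked (all names and values below are
# illustrative, not from the original source). rmspe_xg is a custom feval with
# the xgboost signature (preds, DMatrix) -> (name, value); one possible
# definition is sketched under Example No. 9 below.
model_res, tuned_reg = train(x_train, np.log1p(y_train),
                             x_valid, np.log1p(y_valid),
                             n_estimators_0=3000,
                             objective="reg:linear",
                             eval_metric="rmse",
                             scoring="neg_mean_squared_error",
                             rmspe_xg=rmspe_xg,
                             kfold=5,
                             esr=10)  # early stopping after n_estimators / 10 rounds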
Example No. 2
####################################################### CV-based parameter tuning
xgb1 = XGBRegressor(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='reg:gamma',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=1024)

##### parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
# Dtrain is assumed to be an xgb.DMatrix built from the training data (not shown here).
cvresult = xgb.cv(xgb_param,
                  Dtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics='rmse',
                  early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

param_test1 = {
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [3, 4, 5, 6, 7]
}
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                               n_estimators=1000,
                                               gamma=0,
Example No. 3
# Imports assumed by this class (not shown in the excerpt): csv, operator,
# numpy as np, pandas as pd, seaborn as sns, xgboost as xgb, joblib,
# XGBClassifier / XGBRegressor from xgboost, and metrics from sklearn.
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        # note: newer xgboost exposes get_booster() instead of the removed booster()
        feature_importance = sorted(list(self.clf.get_booster().get_fscore().items()), key=operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # 'wb' breaks csv.writer on Python 3
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)
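
# A minimal usage sketch for the wrapper above, assuming a DataFrame train_df
# with a binary target column 'label' and an id column 'id' (all names here
# are illustrative):
model = Xgb(train_df, target_column='label', id_column='id',
            target_type='binary', num_training_rounds=500,
            early_stopping_rounds=50)
model.train()                        # cross-validates n_estimators, then fits
preds = model.predict(test_df)       # probabilities of the positive class
model.feature_importance(num_print=15)
model.write_csv('predictions.csv')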
Example No. 4
                        gamma=0,
                        subsample=0.7,
                        colsample_bytree=0.7,
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)
grid = GridSearchCV(estimator=xgb_best, param_grid=param_test, cv=5)
grid.fit(source_X, source_y)
grid.cv_results_  # grid_scores_ in scikit-learn < 0.20
grid.best_estimator_

xgb_best.fit(train_X, train_y)
xgb_best.score(test_X, test_y)
print(xgb_best.score(test_X, test_y))

xgb_param = xgb_best.get_xgb_params()
# xgtrain is assumed to be an xgb.DMatrix built from source_X / source_y (not shown here).
xgb.cv(xgb_param,
       xgtrain,
       num_boost_round=5000,
       nfold=15,
       metrics=['auc'],
       early_stopping_rounds=50,
       stratified=True,
       seed=1301)

full_xy = pd.concat([source_X, source_y], axis=1)
target = 'count'


def modelfit(alg,
             dtrain,
Example No. 5
                             min_child_weight=3,
                             gamma=0,
                             reg_alpha=100,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             objective='reg:linear',
                             base_score=0.5,
                             nthread=-1,
                             scale_pos_weight=1,
                             silent=0)

    #xgbParams = {'booster':'gbtree','objective':'reg:linear','gamma':0,'max_depth':5,'lambda':100,'subsample':0.8,'colsample_bytree':0.7,'min_child_weight':3,'eta':0.1}
    dtrain = xgb.DMatrix(train[train_features].values,
                         label=np.log1p(train[target].values))
    watchlist = [(dtrain, 'train')]
    model = xgb.train(xgbParams.get_xgb_params(),
                      dtrain,
                      5000,
                      watchlist,
                      early_stopping_rounds=50)
    model.save_model('./../sberbank.model')
    #model = xgb.Booster({'nthread':-1})
    #model.load_model('./../sberbank.model')
    dtest = xgb.DMatrix(test[test_features].values)
    pred = model.predict(dtest)
    answer = pd.DataFrame({'id': test[IDCol], 'price_doc': np.expm1(pred)})
    answer.to_csv('./../answer.csv')
    #gsearch = GridSearchCV(estimator = xgbParams, param_grid = param,scoring = 'neg_mean_squared_error', cv=5)
    #gsearch.fit(train[train_features], train[target])
    #print("best_params: %s"%gsearch.best_params_)
    #print('best_score: %s'%gsearch.best_score_)
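
# The snippet above trains on np.log1p(train[target]) and inverts with
# np.expm1 at prediction time; a quick round-trip check of that transform:
import numpy as np
y = np.array([0.0, 9.0, 99.0])
y_log = np.log1p(y)                      # log(1 + y): safe at zero, compresses large targets
assert np.allclose(np.expm1(y_log), y)   # expm1 is the exact inverse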
Example No. 6
                    subsample=0.8,
                    learning_rate=0.05,
                    random_state=42)

param_xgb = {'min_child_weight': [1, 2, 3, 4, 5]}
xgbc = model_fit(xgbc, xtrain, ytrain_casual, param_xgb, False)
xgbc = model_fit(xgbc, xtrain, ytrain_registered, param_xgb, False)

ypred_xgb_count = Test_Set_Report("XGBoost", xgbc)

result_xgb = pd.concat([ytest_count, pd.Series(ypred_xgb_count)], axis=1)

# xgb.cv is used to find how many n_estimators are actually needed at the chosen
# learning rate; early_stopping_rounds stops boosting once the CV metric plateaus.
xdtrain = xgb.DMatrix(xtrain, label=ytrain_casual)
cvresult_xgb = xgb.cv(xgbc.get_xgb_params(),
                      xdtrain,
                      nfold=5,
                      num_boost_round=5000,
                      metrics='rmse',
                      early_stopping_rounds=50)
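
# The CV result above is computed but never applied in this excerpt; a plausible
# follow-up (an assumption, not in the original) would adopt the early-stopped
# round count:
xgbc.set_params(n_estimators=cvresult_xgb.shape[0])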

bestpred_xgb = print_feature_importance(xgbc)
# Storing the predicted results along with the actual prediction for each phone number in a csv file

FinalResult_Python = FinalResult_Python.reset_index()
FinalResult_Python.drop(columns=['index'], axis=1, inplace=True)
FinalResult_Python = pd.concat([FinalResult_Python, result_xgb[0]], axis=1)
FinalResult_Python.rename(columns={0: 'Predicted Count'}, inplace=True)
FinalResult_Python.to_csv("PredictedRentalCount_Python.csv", index=False)
Example No. 7
class Xgb:
    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=[],
                 drop_columns=[],
                 numeric_columns=[],
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0 / sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='binary:logistic',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='multi:softmax',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                            objective='reg:linear')
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            xgtrain = xgb.DMatrix(current_df[self.predictors],
                                  label=current_df[self.target_column],
                                  missing=np.nan)
            try:
                cvresult = xgb.cv(
                    xgb_param,
                    xgtrain,
                    num_boost_round=self.clf.get_params()['n_estimators'],
                    nfold=5,
                    metrics=[self.scoring],
                    early_stopping_rounds=self.early_stopping_rounds,
                    show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(
                        current_df[self.target_column].unique())
                    cvresult = xgb.cv(
                        xgb_param,
                        xgtrain,
                        num_boost_round=self.clf.get_params()['n_estimators'],
                        nfold=5,
                        metrics=[self.scoring],
                        early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(current_df[self.predictors],
                         current_df[self.target_column],
                         eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(
                current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(
                    current_df[self.predictors])[:, 1]
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    current_df[self.target_column].values,
                    train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(
                        current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(
                    current_df[self.target_column].values,
                    train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(
                    metrics.mean_squared_error(
                        current_df[self.target_column].values,
                        train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(
                        self.test_df[self.predictors])[:, 1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(
                            self.test_df[self.predictors])[:, 1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(
                            self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        # note: newer xgboost exposes get_booster() instead of the removed booster()
        feature_importance = sorted(list(
            self.clf.get_booster().get_fscore().items()),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([
                df,
                pd.get_dummies(
                    df[col]).rename(columns=lambda x: col + '_' + str(x))
            ],
                           axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt / float(
                        len(df[col])
                ) > 0.6:  # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col is not self.target_column:
                    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                        if df[col].std() == 0:
                            print('will drop', col)
                            self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[
                    col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices (default: return every sample produced)
        if n_samples is None:
            n_samples = len(slice_list)
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s], :])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # 'wb' breaks csv.writer on Python 3
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
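
# A usage sketch for this sampling-enabled variant (names illustrative): it
# fits one model per random sample of the data and averages their predictions.
model = Xgb(train_df, target_column='price', id_column='id',
            target_type='linear', sample_fraction=0.25, n_samples=4,
            early_stopping_rounds=50, prefix='price_model')
model.train()                                     # one fitted model per sample
mean_pred, std_pred = model.predict(test_df, return_mean_std=True)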
Example No. 8
                         scale_pos_weight=1,
                         seed=27)


# results = cross_val_score(xgb_model, train_df_drop, log_target_df, cv=kfold, n_jobs=1, scoring='neg_mean_squared_error', verbose=True)
# results = np.sqrt( np.abs( results ) ) #MSLE TO RMSLE


# print(np.sqrt(np.abs(results)))


xgtrain = xgb.DMatrix(train_df_drop, log_target_df)
xgb_param = xgb_model.get_xgb_params()


results = xgb.cv(xgb_param,
                 xgtrain,
                 num_boost_round=n_estimators,
                 nfold=cv,
                 metrics='rmse',
                 early_stopping_rounds=early_stopping_rounds,
                 verbose_eval=20)


xgb_model.fit(X_train,
              y_train,
Example No. 9
watchlist = [(d_train, "train"), (d_valid, "valid")]

# model
reg = XGBRegressor(booster="gbtree",
                   silent=1,
                   n_jobs=-1,
                   learning_rate=learning_rate,
                   n_estimators=n_estimators,
                   max_depth=max_depth,
                   subsample=subsample,
                   colsample_bytree=colsample_bytree,
                   objective=objective,
                   eval_metric=eval_metric,
                   seed=0)

xgb_param = reg.get_xgb_params()
model_allstores = xgb.train(params=xgb_param,
                            dtrain=d_train,
                            num_boost_round=xgb_param["n_estimators"],
                            evals=watchlist,
                            feval=fd.rmspe_xg,
                            early_stopping_rounds=1000)
f = open(txt_path + "model_allstores.txt", "wb")
pickle.dump(model_allstores, f)
f.close()

# model validation
y_hat_valid = model_allstores.predict(d_valid)
y_hat_valid = np.expm1(y_hat_valid).astype(np.int64)
fd.rmspe(np.expm1(y_valid), y_hat_valid)  # validation-set RMSPE: 0.13831
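
# The helpers fd.rmspe and fd.rmspe_xg are not shown in these excerpts; a
# common Rossmann-style definition (a sketch, assuming log1p-transformed
# labels as in the snippet above) is:
import numpy as np

def rmspe(y, yhat):
    # root mean squared percentage error; rows with y == 0 are ignored
    w = np.zeros_like(y, dtype=float)
    w[y != 0] = 1.0 / y[y != 0]
    return np.sqrt(np.mean(((y - yhat) * w) ** 2))

def rmspe_xg(yhat, dtrain):
    # xgboost feval signature: (preds, DMatrix) -> (metric_name, value)
    y = np.expm1(dtrain.get_label())     # undo the log1p target transform
    return "rmspe", rmspe(y, np.expm1(yhat))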
Example No. 10
class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction results in a sample smaller than one row
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            xgtrain  = xgb.DMatrix(current_df[self.predictors], label=current_df[self.target_column], missing=np.nan)
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
            except:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
                except:
                    xgb_param['num_class'] = len(current_df[self.target_column].unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                        metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(current_df[self.predictors], current_df[self.target_column], eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(current_df[self.predictors])

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                train_df_predprob = self.clf.predict_proba(current_df[self.predictors])[:,1]
                print("Accuracy : %.4g" % metrics.accuracy_score(current_df[self.target_column].values, train_df_predictions))
                if self.target_type == 'binary':
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(current_df[self.target_column], train_df_predprob))
            elif self.target_type == 'linear':
                print("Mean squared error: %f" % metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions))
                print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(current_df[self.target_column].values, train_df_predictions)))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                xgb = self
                if self.target_type == 'binary':
                    output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = xgb.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        # note: newer xgboost exposes get_booster() instead of the removed booster()
        feature_importance = sorted(list(self.clf.get_booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.cols_to_remove:
            if self.verbose:
                print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))  # list() so .pop() works on Python 3
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices (default: return every sample produced)
        if n_samples is None:
            n_samples = len(slice_list)
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'w', newline='') as csvfile:  # 'wb' breaks csv.writer on Python 3
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        xgb = joblib.load(model_file)
        return xgb
Example No. 11
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='reg:gamma',
 nthread=4,
 scale_pos_weight=1,
 seed=1024)

##### parameter 1: max_depth

xgb_param = xgb1.get_xgb_params()
cvresult = xgb.cv(xgb_param, Dtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5,
        metrics='rmse', early_stopping_rounds=50)
xgb1.set_params(n_estimators=cvresult.shape[0])

param_test1 = {
 'max_depth':[3,4,5,6,7],
 'min_child_weight':[3,4,5,6,7]
}
gsearch1 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,