Python XGBRegressor.booster Examples

Programming Language: Python

Namespace/Package Name: xgboost.sklearn

Class/Type: XGBRegressor

Method/Function: booster

Examples at hotexamples.com: 8

Python XGBRegressor.booster - 8 examples found. These are the top-rated real-world Python examples of xgboost.sklearn.XGBRegressor.booster, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

XGBRegressor(30)

fit(30)

predict(30)

set_params(25)

score(17)

get_xgb_params(8)

booster(6)

get_params(6)

evals_result(5)

apply(5)

predict_proba(2)

save_model(2)

load_model(1)

plot_importance(1)

loss_(1)

get_booster(1)

staged_predict(1)

Example #1

Show file

def get_feat_imp(train,ID='id',target='price_doc'):
    """
    Fit an XGBoost regressor on ``train`` and rank its features by F-score.

    input params:
    - train (DataFrame): training data holding the feature columns plus
      the ``ID`` and ``target`` columns
    - ID (string): name of the id column, excluded from the predictors
    - target (string): name of the column to regress on, excluded from
      the predictors

    returns:
    - pandas Series of the booster's ``get_fscore()`` values indexed by
      predictor name, sorted in descending order. The score mapping is
      re-indexed on the predictor list, so predictors that are absent
      from the mapping come out as NaN.
    """
    predictors = [x for x in train.columns if x not in (ID, target)]
    model = XGBRegressor( max_depth=5, learning_rate=0.05, n_estimators=385,
                          silent=True, objective='reg:linear', nthread=-1, min_child_weight=1,
                          max_delta_step=0, subsample=0.93, seed=27)
    model.fit(train[predictors],train[target])
    # ``booster()`` is the legacy accessor; current xgboost releases expose
    # ``get_booster()`` instead, so prefer it and fall back only when absent.
    if hasattr(model, 'get_booster'):
        booster = model.get_booster()
    else:
        booster = model.booster()
    feat_imp = pd.Series(booster.get_fscore(),index=predictors).sort_values(ascending=False)
    return feat_imp

Example #2

Show file

# Script fragment (Python 2 syntax: note the ``print`` statements).
# gpfinal / test / final_target / clf are defined outside this excerpt.

# Remove, in place, columns that are not fed to the models.
gpfinal.drop('units_y', axis=1, inplace=True)

gpfinal.drop('releaseDate', axis=1, inplace=True)

test.drop('releaseDate', axis=1, inplace=True)

# Show the test rows that contain at least one null value.
print test[test.isnull().any(axis=1)]

# Fit the first model; the training data itself is passed as the
# evaluation set, with 'mae' as the evaluation metric.
clf.fit(gpfinal,
        final_target,
        eval_metric='mae',
        eval_set=[(gpfinal, final_target)])

preds = clf.predict(test)

# presumably clf is an XGBoost sklearn estimator, since it exposes
# booster() — TODO confirm against the part of the script not shown here
print clf.booster().get_score()
# Second model: a random forest fitted on the same frame and target.
# NOTE(review): a classifier is used here although the first model is
# evaluated with 'mae' — confirm the target really is categorical.
clf2 = RandomForestClassifier(n_jobs=2, random_state=0)
clf2.fit(gpfinal, final_target)
preds2 = clf2.predict(test)

#from sklearn.model_selection import train_test_split
#X_train, X_validation, y_train, y_validation = train_test_split(gpfinal, final_target, train_size=0.7, random_state=seed)
#categorical_features_indices = np.where(gpfinal.dtypes != np.float)[0]
#from catboost import CatBoostRegressor
#model=CatBoostRegressor(iterations=100, depth=3, learning_rate=0.1, loss_function='RMSE')
#model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation),plot=True)
#
#preds3 = model.predict(test)
#preds = (np.array(preds) + np.array(preds2) + np.array(preds3)) / 3
#preds = (np.array(preds) + np.array(preds2)) / 2

Example #3

Show file

File: __init__.py Project: softman123g/xgbmagic

class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, early_stopping_rounds=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.num_training_rounds = num_training_rounds
                # init the classifier
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        """
        Preprocess the stored frame, pick the number of boosting rounds by
        5-fold cross-validation, fit the estimator and print training
        metrics.

        Sets self.df (preprocessed frame) and self.predictors (all columns
        except the target and id columns), and updates n_estimators of
        self.clf to the number of rows of the cross-validation result.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        features = self.df[self.predictors]
        labels = self.df[self.target_column]
        xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
        cv_kwargs = dict(
            num_boost_round=self.clf.get_params()['n_estimators'],
            nfold=5,
            metrics=[self.scoring],
            early_stopping_rounds=self.early_stopping_rounds)
        # The verbosity keyword of xgb.cv differs between releases
        # (show_progress vs. verbose_eval), so try each spelling and finally
        # no verbosity keyword at all. ``except Exception`` keeps that
        # best-effort cascade but lets KeyboardInterrupt/SystemExit through,
        # which a bare ``except:`` would have swallowed.
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, show_progress=self.verbose, **cv_kwargs)
        except Exception:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, verbose_eval=self.verbose, **cv_kwargs)
            except Exception:
                cvresult = xgb.cv(xgb_param, xgtrain, **cv_kwargs)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(features, labels, eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(features)

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(features)[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, train_df_predprob))
        elif self.target_type == 'linear':
            # compute the MSE once and reuse it for the RMSE
            mse = metrics.mean_squared_error(labels.values, train_df_predictions)
            print("Mean squared error: %f" % mse)
            print("Root mean squared error: %f" % np.sqrt(mse))

    def predict(self, test_df):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        if self.target_type == 'binary':
            self.output = self.clf.predict_proba(self.test_df[self.predictors])[:,1]
        elif self.target_type == 'linear':
            self.output = self.clf.predict(self.test_df[self.predictors])
        return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if col not in self.cols_to_remove:
                if self.verbose:
                    print('converting', col)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                if self.verbose:
                    print(df[col].dtype)

        # drop those marked for dropping
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            for idx, value in enumerate(self.output):
                test_id = self.test_df[self.id_column][idx]
                test_output = self.output[idx]
                to_write = [test_id, test_output]
                if include_actual:
                    to_write.append(self.test_df[self.target_column][idx])
                writer.writerow(to_write)

    def save(self, filename='xgb.pkl'):
        """
        Persist this whole object (settings, data frames and fitted
        estimator held as attributes) to ``filename`` via joblib.dump.
        """
        joblib.dump(self, filename)

Example #4

Show file


@sym_predict.register(XGBRegressor)
def sym_predict_xgb_regressor(estimator):
    """Build a symbolic ``Function`` reproducing ``estimator.predict``.

    Each tree of the booster dump is parsed with ``Node.from_str`` and
    wrapped in its own ``Function`` of the input variables; the result is
    the sum of the per-tree output variables plus the model's base score.

    The input variables are named after the booster's ``feature_names``.
    NOTE(review): this presumably requires the estimator to have been
    fitted on data with column names — confirm for plain-array input.
    """
    booster = estimator.get_booster()
    inputs = tuple(map(RealVariable, booster.feature_names))
    Var = VariableFactory(existing=inputs)
    calls = tuple(
        map(
            lambda x: ((Var(), ), (x, inputs)),
            map(lambda x: Function(inputs, tuple(), (x.expression(), )),
                map(Node.from_str, booster.get_dump()))))
    # The constant added to the tree sum is XGBoost's ``base_score`` (the
    # initial prediction, historically 0.5 by default), which the tree
    # dump does not include. Use the estimator's own value when one is
    # set instead of always assuming 0.5.
    base_score = getattr(estimator, 'base_score', None)
    if base_score is None:
        base_score = 0.5
    output = reduce(__add__, map(compose(first, first), calls)) + RealNumber(
        float(base_score))
    return Function(inputs, calls, (output, ))


if __name__ == '__main__':
    # Demo: fit a tiny regressor, print its symbolic form and generated
    # code, then compare the generated module's predictions with the
    # model's own on the first rows.
    model = XGBRegressor(n_estimators=2, max_depth=1)
    X, y = make_regression()
    X = DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
    model.fit(X, y)
    print(sym_predict(model))
    code = sklearn2code(model, ['predict'], numpy_flat)
    print(code)
    # get_booster() matches the accessor used by sym_predict_xgb_regressor;
    # the legacy booster() call is not available on current xgboost.
    print(model.get_booster().get_dump()[0])
    module = exec_module('module', code)
    print(module.predict(**X.loc[:10, :]))
    print(model.predict(X.loc[:10, :]))

Example #5

Show file

File: xgbmagic.py Project: mirri66/xgbmagic

class Xgb:
    def __init__(self,
                 df,
                 target_column='',
                 id_column='',
                 target_type='binary',
                 categorical_columns=[],
                 drop_columns=[],
                 numeric_columns=[],
                 num_training_rounds=500,
                 verbose=1,
                 sample_fraction=1.0,
                 n_samples=1,
                 early_stopping_rounds=None,
                 prefix='xgb_model',
                 scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0 / sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction is results in sample smaller than 1
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0 / len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='binary:logistic',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(learning_rate=0.1,
                                             n_estimators=num_training_rounds,
                                             subsample=0.8,
                                             colsample_bytree=0.8,
                                             objective='multi:softmax',
                                             scale_pos_weight=1,
                                             seed=123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(n_estimators=num_training_rounds,
                                            objective='reg:linear')
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        """
        Preprocess the stored frame and train on it, or on each random
        sample of it when sample_fraction is not 1.0.

        For every sample: the number of boosting rounds is chosen by
        5-fold cross-validation, self.clf is fitted, training metrics are
        printed and the whole object is saved to
        '<prefix>_<sample index>.pkl'.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [
            x for x in self.df.columns
            if x not in [self.target_column, self.id_column]
        ]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df,
                                         fraction=self.sample_fraction,
                                         n_samples=self.n_samples)
        print(df_list)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx + 1) + ' of ' + str(self.n_samples) +
                  ', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            labels = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=labels, missing=np.nan)
            # rebuilt per sample: n_estimators is updated after every cv run
            cv_kwargs = dict(
                num_boost_round=self.clf.get_params()['n_estimators'],
                nfold=5,
                metrics=[self.scoring],
                early_stopping_rounds=self.early_stopping_rounds)
            # The verbosity keyword of xgb.cv differs between releases, and
            # the last attempt additionally supplies num_class. ``except
            # Exception`` keeps this best-effort cascade but no longer
            # swallows KeyboardInterrupt/SystemExit like a bare ``except:``.
            try:
                cvresult = xgb.cv(xgb_param,
                                  xgtrain,
                                  show_progress=self.verbose,
                                  **cv_kwargs)
            except Exception:
                try:
                    cvresult = xgb.cv(xgb_param,
                                      xgtrain,
                                      verbose_eval=self.verbose,
                                      **cv_kwargs)
                except Exception:
                    xgb_param['num_class'] = len(labels.unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, **cv_kwargs)
            self.clf.set_params(n_estimators=cvresult.shape[0])
            print('fitting model')
            self.clf.fit(features, labels, eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(features)

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                print("Accuracy : %.4g" % metrics.accuracy_score(
                    labels.values, train_df_predictions))
                if self.target_type == 'binary':
                    # class-1 probabilities are only needed for the AUC
                    train_df_predprob = self.clf.predict_proba(features)[:, 1]
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(
                        labels, train_df_predprob))
            elif self.target_type == 'linear':
                # compute the MSE once and reuse it for the RMSE
                mse = metrics.mean_squared_error(labels.values,
                                                 train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self,
                test_df,
                return_multi_outputs=False,
                return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                if self.target_type == 'binary':
                    output = self.clf.predict_proba(
                        self.test_df[self.predictors])[:, 1]
                elif self.target_type == 'linear':
                    output = self.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb_load = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb_load.clf.predict_proba(
                            self.test_df[self.predictors])[:, 1]
                    elif self.target_type == 'linear':
                        output = xgb_load.clf.predict(
                            self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(
            self.clf.booster().get_fscore().items()),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature",
                                  "importance",
                                  kind="barh",
                                  color=sns.color_palette("deep", 3))

    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([
                df,
                pd.get_dummies(
                    df[col]).rename(columns=lambda x: col + '_' + str(x))
            ],
                           axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt / float(
                        len(df[col])
                ) > 0.6:  # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if col is not self.target_column:
                    if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                        if df[col].std() == 0:
                            print('will drop', col)
                            self.cols_to_remove.append(col)
        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[
                    col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = list(range(0, num_rows))
        print('INDICES', indices)
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s], :])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        """
        Persist this whole object (settings, data frames and fitted
        estimator held as attributes) to ``filename`` via joblib.dump.
        """
        joblib.dump(self, filename)

    def load(self, model_file='xgb.pkl'):
        """
        Read ``model_file`` with joblib.load and return the loaded object.

        The loaded object is returned, not merged into ``self``.
        """
        # NOTE(review): joblib persistence is pickle-based, so loading a
        # file can execute arbitrary code — only load files you trust.
        # The local name ``xgb`` shadows any module-level ``xgb`` inside
        # this method only.
        xgb = joblib.load(model_file)
        return xgb

Example #6

Show file

    for sample in range(p.N_SAMPLES):
        print('Sample {} size: {}'.format(sample, len(indexes[sample])))

    # List containing all predictions for each sample
    predictions = []
    # Train each sample individually and average the predictions
    for sample in range(p.N_SAMPLES):
        print('\n\nTraining sample {}'.format(sample))

        # Train sample from weeks 8 and 9
        train_week([8, 9], indexes[sample], model)

        # Test sample is the next one in the list
        sample_to_test = (sample + 1) % p.N_SAMPLES
        # Get best features and save them
        pd.Series(model.booster().get_fscore()).sort_values(ascending=False). \
            to_csv('{}/best_features_sample_{}.csv'.format(FEAT_DIR, sample))

        # Test
        print('\nTesting on sample {}'.format(sample_to_test))
        rmsle = test_week([8, 9], indexes[sample_to_test], model)
        print('\n--> RMSLE = {}'.format(rmsle))

        # Get predictions, average week10 with previous ones if the list is not
        # empty
        print('\nGetting predictions')
        week10 = []
        if p.WEEK10_OLD_PRED:
            # get IDs from week 10 (need to load week 10...)
            data_10_ids = list(load_week(10, 1).id)

Example #7

Show file

File: xgbmagic.py Project: gamblenull1996/xgbmagic

class Xgb:
    def __init__(self, df, target_column='', id_column='', target_type='binary', categorical_columns=[], drop_columns=[], numeric_columns=[], num_training_rounds=500, verbose=1, sample_fraction=1.0, n_samples=1, early_stopping_rounds=None, prefix='xgb_model', scoring=None):
        """
        input params:
        - df (DataFrame): dataframe of training data
        - target_column (string): name of target column
        - id_column (string): name of id column
        - target_type (string): 'linear' or 'binary'
        - categorical_columns (list): list of column names of categorical data. Will perform one-hot encoding
        - drop_columns (list): list of columns to drop
        - numeric_columns (list): list of columns to convert to numeric
        - verbose (bool): verbosity of printouts
        """
        # checks for sampling
        sample_fraction = float(sample_fraction)
        if sample_fraction > 1:
            sample_fraction = 1.0
        if sample_fraction * n_samples > 1:
            n_samples = round(1.0/sample_fraction)
        if sample_fraction <= 0:
            print('sample_fraction 0 or negative, switching to 0.1')
            sample_fraction = 0.1
        # if sample_fraction is results in sample smaller than 1
        if round(sample_fraction * len(df)) == 0:
            sample_fraction = 1.0/len(df)
        # check if data is dataframe
        if type(df) == pd.core.frame.DataFrame:
            self.df = df
            self.early_stopping_rounds = early_stopping_rounds
            if target_column:
                self.target_column = target_column
                self.id_column = id_column
                self.target_type = target_type
                self.categorical_columns = categorical_columns
                self.numeric_columns = numeric_columns
                self.drop_columns = drop_columns
                self.verbose = verbose
                self.sample_fraction = sample_fraction
                self.n_samples = n_samples
                self.num_training_rounds = num_training_rounds
                self.prefix = prefix
                # init the classifier:
                if self.target_type == 'binary':
                    self.scoring = 'auc'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'binary:logistic',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'multiclass':
                    self.scoring = 'merror'
                    self.clf = XGBClassifier(
                        learning_rate =0.1,
                        n_estimators = num_training_rounds,
                        subsample = 0.8,
                        colsample_bytree = 0.8,
                        objective = 'multi:softmax',
                        scale_pos_weight = 1,
                        seed = 123)
                elif self.target_type == 'linear':
                    self.scoring = 'rmse'
                    self.clf = XGBRegressor(
                            n_estimators = num_training_rounds,
                            objective = 'reg:linear'
                            )
                # if preferred scoring metric is stated:
                if scoring:
                    self.scoring = scoring
            else:
                print('please provide target column name')
        else:
            print('please provide pandas dataframe')

    def train(self):
        """
        preprocess self.df, then cross-validate and fit self.clf on every sample
        - with sample_fraction == 1.0 the whole dataframe is the only sample,
          otherwise random_sample() supplies the samples
        - the number of rounds kept by xgb.cv becomes n_estimators of self.clf
        - training-set metrics are printed after each fit
        - the whole object is saved to '<prefix>_<idx>.pkl' after each fit
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        # if subsampling
        if self.sample_fraction == 1.0:
            df_list = [self.df]
        else:
            df_list = self.random_sample(df=self.df, fraction=self.sample_fraction, n_samples=self.n_samples)
        for idx, current_df in enumerate(df_list):
            print('ITERATION ' + str(idx) + ' of ' + str(self.n_samples) +', sample_fraction=' + str(self.sample_fraction))
            features = current_df[self.predictors]
            target = current_df[self.target_column]
            xgtrain = xgb.DMatrix(features, label=target, missing=np.nan)
            cv_kwargs = {
                'num_boost_round': self.clf.get_params()['n_estimators'],
                'nfold': 5,
                'metrics': [self.scoring],
                'early_stopping_rounds': self.early_stopping_rounds,
            }
            # the verbosity keyword of xgb.cv differs between xgboost releases,
            # so try 'show_progress' first and 'verbose_eval' second
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, show_progress=self.verbose, **cv_kwargs)
            except Exception:
                try:
                    cvresult = xgb.cv(xgb_param, xgtrain, verbose_eval=self.verbose, **cv_kwargs)
                except Exception:
                    # last resort: state the number of classes explicitly
                    xgb_param['num_class'] = len(target.unique())
                    cvresult = xgb.cv(xgb_param, xgtrain, **cv_kwargs)

            # fit on the sample whichever of the xgb.cv calls succeeded
            self.clf.set_params(n_estimators=cvresult.shape[0])
            self.clf.fit(features, target, eval_metric=self.scoring)

            #Predict training set:
            train_df_predictions = self.clf.predict(features)

            if self.target_type == 'binary' or self.target_type == 'multiclass':
                print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_df_predictions))
                if self.target_type == 'binary':
                    # probabilities are only needed for the AUC
                    train_df_predprob = self.clf.predict_proba(features)[:,1]
                    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_df_predprob))
            elif self.target_type == 'linear':
                mse = metrics.mean_squared_error(target.values, train_df_predictions)
                print("Mean squared error: %f" % mse)
                print("Root mean squared error: %f" % np.sqrt(mse))
            filename = self.prefix + '_' + str(idx) + '.pkl'
            self.save(filename)

    def predict(self, test_df, return_multi_outputs=False, return_mean_std=False):
        print('### predicting ###')
        print('## preprocessing test set')
        if self.id_column in test_df:
            ids = test_df[self.id_column]
        if self.target_column in test_df.columns:
            targets = test_df[self.target_column]
        self.test_df = self.preprocess(test_df, train=False)
        if self.id_column in test_df:
            self.test_df[self.id_column] = ids
        if self.target_column in test_df.columns:
            self.test_df[self.target_column] = targets
        for col in self.predictors:
            if col not in self.test_df.columns:
                self.test_df[col] = np.nan

        # prediction
        print('## predicting from test set')
        output_list = []
        output = None
        for idx, ns in enumerate(range(self.n_samples)):
            if self.n_samples == 1:
                xgb = self
                if self.target_type == 'binary':
                    output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                elif self.target_type == 'linear':
                    output = xgb.clf.predict(self.test_df[self.predictors])
            else:
                try:
                    filename = self.prefix + '_' + str(idx) + '.pkl'
                    xgb = self.load(filename)
                    if self.target_type == 'binary':
                        output = xgb.clf.predict_proba(self.test_df[self.predictors])[:,1]
                    elif self.target_type == 'linear':
                        output = xgb.clf.predict(self.test_df[self.predictors])
                    output_list.append(list(output))
                except IOError:
                    print('no file found, skipping')
        # average the outputs if n_samples is more than one
        if self.n_samples == 1:
            self.output = output
            try:
                self.multi_outputs = [list(output)]
            except:
                self.multi_outputs = None
        else:
            self.output = np.mean(output_list, axis=0)
            self.multi_outputs = output_list
        if return_multi_outputs:
            return self.multi_outputs
        elif return_mean_std:
            return (self.output, np.std(output_list, axis=0))
        else:
            return self.output

    def feature_importance(self, num_print=10, display=True):
        feature_importance = sorted(list(self.clf.booster().get_fscore().items()), key = operator.itemgetter(1), reverse=True)

        impt = pd.DataFrame(feature_importance)
        impt.columns = ['feature', 'importance']
        print(impt[:num_print])
        if display:
            impt[:num_print].plot("feature", "importance", kind="barh", color=sns.color_palette("deep", 3))


    def preprocess(self, df, train=True):
        # one hot encoding of categorical variables
        print('## one hot encoding of categorical variables')
        for col in self.categorical_columns:
            if self.verbose:
                print('one hot encoding: ', col)
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col+'_'+str(x))], axis=1)
            df = df.drop([col], axis=1)

        # if training, determine columns to be removed
        if train:
            # drop columns that are too sparse to be informative
            self.cols_to_remove = []
            print('## dropping columns below sparsity threshold')
            for col in df.columns:
                nan_cnt = 0
                for x in df[col]:
                    try:
                        if np.isnan(x):
                            nan_cnt += 1
                    except:
                        pass
                if nan_cnt/float(len(df[col])) > 0.6: # arbitrary cutoff, if more than 60% missing then drop
                    if self.verbose:
                        print('will drop', col)
                    self.cols_to_remove.append(col)

            # drop columns that have no standard deviation (not informative)
            print('## dropping columns with no variation')
            for col in df.columns:
                if df[col].dtype == 'int64' or df[col].dtype == 'float64':
                    if df[col].std() == 0:
                        print('will drop', col)
                        self.cols_to_remove.append(col)
        if self.verbose and self.cols_to_remove:
            print('dropping the following columns:', self.cols_to_remove)
            df = df.drop(self.cols_to_remove, axis=1)

        if self.verbose:
            print('## DataFrame shape is now:', df.shape)

        # convert to numerical where possible
        #print('## converting numerical data to numeric dtype')
        #df = df.convert_objects(convert_numeric=True)

        # convert columns specified to be int and float
        for col in self.numeric_columns:
            if self.verbose:
                print('converting', col)
            df[col] = pd.to_numeric(df[col], errors='coerce')
            if self.verbose:
                print(df[col].dtype)
        df = df.drop(self.drop_columns, axis=1)

        # drop all those that are object type
        print('## dropping non-numerical columns')
        for col in df.columns:
            if df[col].dtype == 'int64' or df[col].dtype == 'float64' or df[col].dtype == 'bool':
                pass
            else:
                if self.verbose:
                    print('dropping because not int, float, or bool:', col)
                df = df.drop([col], axis=1)
        return df

    def _to_int(self, num):
        try:
            return int(num)
        except:
            return

    def _to_float(self, num):
        try:
            return float(num)
        except:
            return

    def random_sample(self, df, fraction=0.2, n_samples=None):
        """
        splits into random samples
        - n_samples: how many samples you want returned (default = All)
        - fraction : what fraction of data to include in the sample (default = 0.2)
        """
        print('constructing random samples')
        num_rows = len(df)
        len_sample = round(fraction * num_rows)
        # create list of slice index lists
        indices = range(0,num_rows)
        slice_list = []
        tmp_idx_list = []
        while len(indices) > 0:
            while len(tmp_idx_list) < len_sample and len(indices) > 0:
                idx = indices.pop(random.randrange(len(indices)))
                tmp_idx_list.append(idx)
            slice_list.append(tmp_idx_list)
            tmp_idx_list = []
        # get slices
        sample_list = []
        for s in range(n_samples):
            try:
                sample_list.append(df.loc[slice_list[s],:])
            except:
                pass
        return sample_list

    def write_csv(self, filename, include_actual=False):
        """
        write results to csv
        - include actual: if actual values are known for test set, and we want to print them
        """
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            headers = [self.id_column, self.target_column]
            if include_actual:
                headers.append('actual')
            writer.writerow(headers)
            try:
                for idx, value in enumerate(self.output):
                    test_id = self.test_df[self.id_column][idx]
                    test_output = self.output[idx]
                    to_write = [test_id, test_output]
                    if include_actual:
                        to_write.append(self.test_df[self.target_column][idx])
                    writer.writerow(to_write)
                print('results written to ' + filename)
            except:
                print('write_csv failed')

    def save(self, filename='xgb.pkl'):
        """
        dump this whole object to disk with joblib
        - filename (string): path of the file to write
        """
        joblib.dump(value=self, filename=filename)

    def load(self, model_file='xgb.pkl'):
        """
        read back an object previously written with joblib
        - model_file (string): path of the file to read
        returns whatever object the file contains; self is not modified
        NOTE: joblib.load unpickles the file, so only load files you trust
        """
        return joblib.load(model_file)

Example #8

Show file

File: TreeModel.py Project: pratvi123/kaggle-TaxiTripDuration

def XGB_Main(train, test):
    """Fit an XGBRegressor on log(trip_duration + 1) and predict the test set.

    The training rows are split 70/30 into a fitting and a validation part,
    the model is fitted with early stopping on the validation part, the test
    predictions are transformed back with exp(x) - 1, written through
    save_result() and the booster is saved next to them.

    train -- frame providing the feature columns and "trip_duration"
    test  -- frame providing the feature columns and "id"; it receives a
             "trip_duration" column holding the predictions

    NOTE(review): x_columns and save_result are not defined in this function;
    they are presumably module-level names -- confirm against the full file.
    NOTE(review): written with Python 2 print statements.
    """
    print "XGB_Main"
    train_y = train["trip_duration"].values
    # model the log of the target; undone with np.exp(...) - 1 after predicting
    train_y = np.log(train_y + 1)
    print "features:", x_columns
    print "feature size:", len(x_columns)
    train_x = train[x_columns].values
    test_x = test[x_columns].values

    start = time.time()
    Xtr, Xv, ytr, yv = train_test_split(train_x,
                                        train_y,
                                        test_size=0.3,
                                        random_state=2017)
    # The DMatrix objects and the watchlist are only referenced by the
    # commented-out xgb.train / xgb.cv calls in the code visible here.
    dtrain = xgb.DMatrix(Xtr, label=ytr)
    dvalid = xgb.DMatrix(Xv, label=yv)
    dtest = xgb.DMatrix(test_x)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Try different parameters! My favorite is random search :)
    lr = 0.05
    n_rounds = 5000
    early_stopping_rounds = 50
    # Native-API parameter dict; selected entries are forwarded to the
    # XGBRegressor constructor below and reused in the output file name.
    xgb_pars = {
        'min_child_weight': 100,
        'eta': lr,
        'colsample_bytree': 0.5,
        'max_depth': 10,
        'subsample': 0.85,
        'lambda': 0,
        'alpha': 0,
        'gamma': 0,
        'nthread': -1,
        'booster': 'gbtree',
        'silent': 1,
        'eval_metric': 'rmse',
        'objective': 'reg:linear'
    }

    # You could try to train with more epoch
    # model = xgb.train(xgb_pars, dtrain, n_rounds, watchlist, early_stopping_rounds=early_stopping_rounds,
    #                   maximize=False, verbose_eval=1)

    model = XGBRegressor(
        learning_rate=lr,
        n_estimators=n_rounds,
        max_depth=xgb_pars["max_depth"],
        min_child_weight=xgb_pars["min_child_weight"],
        gamma=xgb_pars["gamma"],  # minimum loss reduction required to split a node
        subsample=xgb_pars["subsample"],
        colsample_bytree=xgb_pars["colsample_bytree"],
        objective=xgb_pars["objective"],
        nthread=xgb_pars["nthread"],
        reg_lambda=xgb_pars["lambda"],  # L2 regularization
        reg_alpha=xgb_pars["alpha"],  # L1 regularization
        seed=2017)
    model.fit(Xtr,
              ytr,
              early_stopping_rounds=early_stopping_rounds,
              eval_metric=xgb_pars["eval_metric"],
              eval_set=[[Xv, yv]])
    print("Time taken by above cell is {}.".format(time.time() - start))
    # rmse measured on the log-transformed target, hence reported as RMSLE
    print('Modeling RMSLE %.5f' % model.best_score)
    # exit(1)

    # xgb.cv(xgb_pars, dtrain, num_boost_round=n_rounds)

    # grid seach sv
    # param_test1 = {
    #     'max_depth': np.arange(4, 22, 2),+
    #     'min_child_weight': np.arange(40, 100, 5)
    # }

    # train_model(model, train_x, train_y, cv=3, grid_search=True, re_fit=False, grid_params=param_test1)

    # predict with the tree count of the best early-stopping iteration
    predicts = model.predict(test_x, ntree_limit=model.best_ntree_limit)
    # back from log scale to the original target scale
    predicts = np.exp(predicts) - 1
    test["trip_duration"] = predicts
    # the file name records the hyper-parameters used for this run
    csv__format = "XGB_{}rounds_{}lr_{}f_{}weight_" \
                  "{}depth_{}cb_{}subsample_{}gamma_{}lambda_{}alpha.csv".format(n_rounds, lr, len(x_columns),
                                                                                 xgb_pars["min_child_weight"],
                                                                                 xgb_pars["max_depth"],
                                                                                 xgb_pars["colsample_bytree"],
                                                                                 xgb_pars["subsample"],
                                                                                 xgb_pars["gamma"], xgb_pars["lambda"],
                                                                                 xgb_pars["alpha"])
    save_path = "../result/" + csv__format

    save_result(test[["id", "trip_duration"]], save_path)

    # save model
    model_save_path = "../result/MODEL/" + csv__format
    # same file name as the predictions, with "csv" replaced by "model"
    model.booster().save_model(model_save_path.replace("csv", "model"))
    # feature importance
    feature_importance_dict = model.booster().get_fscore()
    print feature_importance_dict