# use a different random seed for each run so the ensemble gets a variety of models
seed = 72
np.random.seed(seed)

DATE = '0318'

utils.mkdir_p('../output/model/{}_{}/'.format(DATE, seed))
utils.mkdir_p('../output/sub/{}_{}/'.format(DATE, seed))

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

##################################
# loading data
##################################
train = utils.load_pred_feature('train', keep_all=False)

print('scale_pos_weight',
      1.0 * train.label.value_counts().iloc[0] / train.label.value_counts().iloc[1])

##################################
# pre-processing
##################################
train['final_time'] = pd.to_datetime(train.final_time)
train['initial_time'] = pd.to_datetime(train.initial_time)
print('pre-processing done')

#==============================================================================
# prepare training data
#==============================================================================
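# The ratio printed above is the value XGBoost expects for scale_pos_weight.
# A minimal sketch of feeding it into the booster parameters; the 'params'
# dict below is illustrative, not this script's actual configuration:
counts = train.label.value_counts()
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # upweight the minority (positive) class by the negative/positive ratio
    'scale_pos_weight': 1.0 * counts.iloc[0] / counts.iloc[1],
}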
          'subsample': 0.75,
          'silent': 1,
          'nthread': 27,
          'eval_metric': 'logloss',
          'objective': 'binary:logistic',
          'tree_method': 'hist'
          }  # parameters for the tree booster

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

#==============================================================================
# prepare
#==============================================================================
train = pd.concat([utils.load_pred_feature('trainW-0'),
                   utils.load_pred_feature('trainW-1'),
                   utils.load_pred_feature('trainW-2'),
                   ], ignore_index=True)

y_train = train['is_churn']
X_train = train.drop('is_churn', axis=1)
del train
gc.collect()

X_train.fillna(-1, inplace=True)

#==============================================================================
# SPLIT!
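# The SPLIT banner above is where a validation set would be carved out for
# early stopping. A sketch under that assumption; the split ratio and the
# dtrain/dvalid/watchlist names are illustrative, not the script's actual code:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=seed)

dtrain = xgb.DMatrix(X_tr, label=y_tr)
dvalid = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]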
from xgboost import XGBClassifier
from matplotlib import pyplot
import utils  # author's helper module for loading and caching features
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search is deprecated

seed = 72
np.random.seed(seed)

##########################################
# load dataset
##########################################
file_name = '../output/model/xgb_feature_tuning_seed_72.model'
train_0 = utils.load_pred_feature('trainW-0', keep_all=False,
                                  model_file_name=file_name, n_top_features=48)
train_1 = utils.load_pred_feature('trainW-1', keep_all=False,
                                  model_file_name=file_name, n_top_features=48)
train_2 = utils.load_pred_feature('trainW-2', keep_all=False,
                                  model_file_name=file_name, n_top_features=48)

# downsample the churned rows of the augmentation windows so their label
# distribution matches trainW-0, the training set provided by KKBox
per_churned_in_train_0 = train_0['is_churn'].mean()  # .ix is deprecated
n_churned = train_1[train_1.is_churn == 0].shape[0] * per_churned_in_train_0
print('per_churned_in_train_0', per_churned_in_train_0)
print('n_churned', int(n_churned))

train_1 = pd.concat([
    train_1[train_1.is_churn == 0],
    train_1[train_1.is_churn == 1].sample(n=int(n_churned), random_state=seed)
], ignore_index=True)
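# utils.load_pred_feature is the author's helper, so how n_top_features=48 is
# resolved is not shown here. A plausible sketch, assuming the saved file is a
# native XGBoost booster and features are ranked by gain:
import xgboost as xgb

booster = xgb.Booster(model_file=file_name)
gain = booster.get_score(importance_type='gain')
top_features = sorted(gain, key=gain.get, reverse=True)[:48]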
          'tree_method': 'hist'
          }  # parameters for the tree booster
# subsample and colsample_bytree are meant to control overfitting:
# they add randomness that makes training more robust to noise

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

#==============================================================================
# prepare
#==============================================================================
train = pd.concat(
    [
        utils.load_pred_feature('trainW-0'),
        # utils.load_pred_feature('trainW-1'),
        # utils.load_pred_feature('trainW-2'),
    ], ignore_index=True)

y_train = train['is_churn']
X_train = train.drop('is_churn', axis=1)
del train
gc.collect()

X_train.fillna(-1, inplace=True)

def ceate_feature_map(features):  # (sic) name kept as-is so existing callers still resolve
    # write an XGBoost feature-map file, one "index<TAB>name<TAB>type" row per
    # feature; 'q' marks a quantitative feature
    f = open('../output/xgb.fmap', 'w')
    for i, feat in enumerate(features):
        f.write('{0}\t{1}\tq\n'.format(i, feat))
    f.close()
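# The feature map pairs with XGBoost's importance reporting. A sketch of the
# intended usage after training; the throwaway 10-round booster below exists
# only to make the example self-contained:
import xgboost as xgb

ceate_feature_map(X_train.columns)
bst = xgb.train({'objective': 'binary:logistic'},
                xgb.DMatrix(X_train, label=y_train), num_boost_round=10)
importance = bst.get_fscore(fmap='../output/xgb.fmap')
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
print(importance[:20])  # top 20 features by number of splits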
import utils  # author's helper module for loading and caching features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from numpy import sort
from sklearn.feature_selection import SelectFromModel  # for feature selection

seed = 72
np.random.seed(seed)

##########################################
# load dataset
##########################################
train_0 = utils.load_pred_feature('trainW-0', keep_all=True)
train_1 = utils.load_pred_feature('trainW-1', keep_all=True)
train_2 = utils.load_pred_feature('trainW-2', keep_all=True)

# downsample the churned rows of the augmentation windows so their label
# distribution matches trainW-0, the training set provided by KKBox
per_churned_in_train_0 = train_0['is_churn'].mean()  # .ix is deprecated
n_churned = train_1[train_1.is_churn == 0].shape[0] * per_churned_in_train_0
print('per_churned_in_train_0', per_churned_in_train_0)
print('n_churned', int(n_churned))

train_1 = pd.concat([
    train_1[train_1.is_churn == 0],
    train_1[train_1.is_churn == 1].sample(
        n=int(n_churned), random_state=seed)
], ignore_index=True)
per_churned_in_train_1 = train_1['is_churn'].mean()
print('per_churned_in_train_1', per_churned_in_train_1)

# apply the same downsampling to trainW-2
train_2 = pd.concat([
    train_2[train_2.is_churn == 0],
    train_2[train_2.is_churn == 1].sample(
        n=int(train_2[train_2.is_churn == 0].shape[0] * per_churned_in_train_0),
        random_state=seed)
], ignore_index=True)
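# The imports above (sort, SelectFromModel) point at the standard XGBoost
# feature-selection loop: fit once, then retrain at increasing importance
# thresholds. A minimal sketch under that assumption; X/y and the in-sample
# logloss are illustrative (the real script would score a held-out window):
from xgboost import XGBClassifier

X = train_0.drop('is_churn', axis=1)
y = train_0['is_churn']
model = XGBClassifier(random_state=seed)
model.fit(X, y)
for thresh in sort(np.unique(model.feature_importances_)):
    # keep only features whose importance meets the current threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    X_sel = selection.transform(X)
    sel_model = XGBClassifier(random_state=seed)
    sel_model.fit(X_sel, y)
    pred = sel_model.predict_proba(X_sel)[:, 1]
    print('thresh=%.4f, n_features=%d, logloss=%.4f'
          % (thresh, X_sel.shape[1], log_loss(y, pred)))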
'''
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc
import sys
sys.path.append('/Users/yunruili/xgboost/python-package')
import xgboost as xgb
import utils
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

train = utils.load_pred_feature('trainW-0')
X = train.drop('is_churn', axis=1)
y = train['is_churn']

# cross-validation strategy
seed = 72
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# search grid: fixed parameters plus distributions to sample from
params_fixed = {
    'silent': 1,
    'objective': 'binary:logistic',
}
params_dist_grid = {
    'max_depth': [5, 6, 7, 8, 9, 10],
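# The distribution grid continues beyond this excerpt. Assuming it is closed,
# the pieces above would be wired together roughly like this; n_iter and the
# scoring choice are illustrative, not the script's actual settings:
search = RandomizedSearchCV(
    estimator=XGBClassifier(**params_fixed),
    param_distributions=params_dist_grid,
    n_iter=30,
    scoring='neg_log_loss',
    cv=cv,
    random_state=seed,
    verbose=1)
search.fit(X, y)
print('best params:', search.best_params_)
print('best CV logloss:', -search.best_score_)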