# use a different random seed to ensure variety among the trained models
seed = 72
np.random.seed(seed)

DATE = '0318'
utils.mkdir_p('../output/model/{}_{}/'.format(DATE, seed))
utils.mkdir_p('../output/sub/{}_{}/'.format(DATE, seed))

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

##################################
# loading data
##################################
train = utils.load_pred_feature('train', keep_all=False)

print(
    'scale_pos_weight', 1.0 * train.label.value_counts().iloc[0] /
    train.label.value_counts().iloc[1])
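
# ---- Sketch: not the author's code ------------------------------------------
# The ratio printed above (majority / minority class counts) is what XGBoost's
# 'scale_pos_weight' expects, so the rarer positive class gets up-weighted.
# Illustrative parameter dict only; it assumes the negative class is the
# majority, as in the print above.
neg = train.label.value_counts().iloc[0]
pos = train.label.value_counts().iloc[1]
params_sketch = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'scale_pos_weight': 1.0 * neg / pos,  # negatives / positives
}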

##################################
# pre-processing
##################################
train['final_time'] = pd.to_datetime(train.final_time)
train['initial_time'] = pd.to_datetime(train.initial_time)
print('pre-processing done')

#==============================================================================
# prepare training data
#==============================================================================
#==============================================================================
# Example 2
#==============================================================================
         'subsample': 0.75,
         'silent': 1,
         'nthread': 27,
         'eval_metric': 'logloss',
         'objective': 'binary:logistic',
         'tree_method': 'hist'
         }  # tree-booster parameters

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

#==============================================================================
# prepare
#==============================================================================
train = pd.concat([utils.load_pred_feature('trainW-0'),
                   utils.load_pred_feature('trainW-1'),
                   utils.load_pred_feature('trainW-2'),
                   ], ignore_index=True)

y_train = train['is_churn']
X_train = train.drop('is_churn', axis=1)
del train
gc.collect()


X_train.fillna(-1, inplace=True)
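
# ---- Sketch: not the author's code ------------------------------------------
# How X_train / y_train and a parameter dict like the (truncated) one above
# are typically combined via xgboost's native API. The dict here is an
# illustrative stand-in, not the author's full parameter set.
import xgboost as xgb

sketch_params = {'objective': 'binary:logistic', 'eval_metric': 'logloss',
                 'tree_method': 'hist', 'subsample': 0.75}
dtrain = xgb.DMatrix(X_train, label=y_train)
booster = xgb.train(sketch_params, dtrain, num_boost_round=100)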


#==============================================================================
# SPLIT!
from xgboost import XGBClassifier
from matplotlib import pyplot
import utils  # author's helper module for data handling
from sklearn.model_selection import GridSearchCV

seed = 72
np.random.seed(seed)

##########################################
#load dataset
##########################################

# load dataset
file_name = '../output/model/xgb_feature_tuning_seed_72.model'
train_0 = utils.load_pred_feature('trainW-0',
                                  keep_all=False,
                                  model_file_name=file_name,
                                  n_top_features=48)
train_1 = utils.load_pred_feature('trainW-1',
                                  keep_all=False,
                                  model_file_name=file_name,
                                  n_top_features=48)
train_2 = utils.load_pred_feature('trainW-2',
                                  keep_all=False,
                                  model_file_name=file_name,
                                  n_top_features=48)
# resample so the label distribution matches the KKBox-provided training set (trainW-0)
per_churned_in_train_0 = train_0['is_churn'].mean()  # churn rate in trainW-0
n_churned = train_1[train_1.is_churn == 0].shape[0] * per_churned_in_train_0
print('per_churned_in_train_0', per_churned_in_train_0)
print('n_churned', int(n_churned))
train_1 = pd.concat([
    train_1[train_1.is_churn == 0],
    train_1[train_1.is_churn == 1].sample(n=int(n_churned), random_state=seed),
], ignore_index=True)
#==============================================================================
# Example 4
#==============================================================================
    'tree_method': 'hist'
}  # tree-booster parameters

# subsample and colsample_bytree are used to control overfitting
# by adding randomness, which makes training more robust to noise

print("""#==== print param ======""")
print('DATE:', DATE)
print('seed:', seed)

#==============================================================================
# prepare
#==============================================================================
train = pd.concat(
    [
        utils.load_pred_feature('trainW-0'),
        # utils.load_pred_feature('trainW-1'),
        # utils.load_pred_feature('trainW-2'),
    ],
    ignore_index=True)

y_train = train['is_churn']
X_train = train.drop('is_churn', axis=1)
del train
gc.collect()

X_train.fillna(-1, inplace=True)
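
# ---- Sketch: not the author's code ------------------------------------------
# The effect of the subsample / colsample_bytree randomness mentioned above
# can be checked with xgboost's built-in cross-validation: watch the gap
# between train-logloss and test-logloss. Parameter values are illustrative.
import xgboost as xgb

cv_params = {'objective': 'binary:logistic', 'eval_metric': 'logloss',
             'tree_method': 'hist', 'subsample': 0.75, 'colsample_bytree': 0.8}
cv_results = xgb.cv(cv_params, xgb.DMatrix(X_train, label=y_train),
                    num_boost_round=200, nfold=5, seed=seed,
                    early_stopping_rounds=20, verbose_eval=50)
print(cv_results.tail())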


def ceate_feature_map(features):
    # write one '<index>\t<name>\tq' line per feature (standard xgboost fmap format)
    with open('../output/xgb.fmap', 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))
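
# ---- Sketch: not the author's code ------------------------------------------
# Typical use of the feature-map file written above: build it from the training
# columns, then ask a trained Booster for importances keyed by real feature
# names. `booster` below is an assumed, already-trained xgboost.Booster.
# ceate_feature_map(X_train.columns)
# importance = booster.get_fscore(fmap='../output/xgb.fmap')
# print(sorted(importance.items(), key=lambda kv: kv[1], reverse=True)[:20])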
#==============================================================================
# Example 5
#==============================================================================
import utils  # author's helper module for data handling
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from numpy import sort
from sklearn.feature_selection import SelectFromModel  # for feature selection

seed = 72
np.random.seed(seed)

##########################################
#load dataset
##########################################

# load dataset
train_0 = utils.load_pred_feature('trainW-0', keep_all=True)
train_1 = utils.load_pred_feature('trainW-1', keep_all=True)
train_2 = utils.load_pred_feature('trainW-2', keep_all=True)
# resample so the label distribution matches the KKBox-provided training set (trainW-0)
per_churned_in_train_0 = train_0['is_churn'].mean()  # churn rate in trainW-0
n_churned = train_1[train_1.is_churn == 0].shape[0] * per_churned_in_train_0
print('per_churned_in_train_0', per_churned_in_train_0)
print('n_churned', int(n_churned))
train_1 = pd.concat([
    train_1[train_1.is_churn == 0],
    train_1[train_1.is_churn == 1].sample(n=int(n_churned), random_state=seed),
], ignore_index=True)
per_churned_in_train_1 = train_1['is_churn'].mean()
print('per_churned_in_train_1', per_churned_in_train_1)
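
# ---- Sketch: not the author's code ------------------------------------------
# The same downsampling is repeated for train_2 just below; a small helper
# capturing the pattern above (behaviour inferred from this snippet only).
def match_churn_rate(df, churn_rate, random_state=seed):
    # keep all non-churners, sample churners so churners ~= churn_rate * non-churners
    non_churn = df[df.is_churn == 0]
    churn = df[df.is_churn == 1].sample(n=int(non_churn.shape[0] * churn_rate),
                                        random_state=random_state)
    return pd.concat([non_churn, churn], ignore_index=True)
# e.g. train_2 = match_churn_rate(train_2, per_churned_in_train_0)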
train_2 = pd.concat([

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import gc
import sys
sys.path.append('/Users/yunruili/xgboost/python-package')
import xgboost as xgb
import utils
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

train = utils.load_pred_feature('trainW-0')

X = train.drop('is_churn', axis=1)
y = train['is_churn']

# cross-validation strategy
seed = 72
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
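
# ---- Sketch: not the author's code ------------------------------------------
# How this CV splitter is typically combined with the fixed parameters and the
# distribution grid defined just below; n_iter and scoring here are
# illustrative assumptions.
# search = RandomizedSearchCV(
#     estimator=XGBClassifier(**params_fixed),
#     param_distributions=params_dist_grid,
#     n_iter=20, scoring='neg_log_loss', cv=cv, random_state=seed)
# search.fit(X, y)
# print(search.best_params_, search.best_score_)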
# grid
params_fixed = {
    'silent': 1,
    'objective': 'binary:logistic',
}

params_dist_grid = {
    'max_depth': [5, 6, 7, 8, 9, 10],