Example #1
def _tune_max_depth__num_leaves(params, d_train):
    ptr.print_log('Tuning max_depth and num_leaves ...')

    max_depth_list = list(range(4,9))
    num_leaves_list = list(range(30,121,10))
    
    max_auc = 0.0
    best_max_depth = max_depth_list[0]
    best_num_leaves = num_leaves_list[0]
    
    # NOTE: zip pairs the two lists position by position, so only the first five
    # num_leaves values are ever tried here (see the itertools.product sketch
    # after this function for a full grid search)
    for max_depth, num_leaves in zip(max_depth_list, num_leaves_list):
        # update params
        params['max_depth'] = max_depth
        params['num_leaves'] = num_leaves
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_depth: {}; num_leaves: {}; auc: {}; rounds: {}'.\
                      format(max_depth, num_leaves, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_depth = max_depth
            best_num_leaves = num_leaves

    ptr.print_log('best max_depth: {}'.format(best_max_depth))
    ptr.print_log('best num_leaves: {}'.format(best_num_leaves))
    ptr.print_log('max auc: {}'.format(max_auc))
    
    return best_max_depth, best_num_leaves
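
A note on the search pattern used throughout these tuners: zip walks the two parameter lists in lockstep, so each position yields exactly one trial and any surplus values in the longer list are skipped. If an exhaustive grid is wanted instead, itertools.product is the drop-in change. The sketch below is illustrative only, reuses the same (not shown) _run_cv helper, and is not part of the original snippet.

import itertools

def _tune_max_depth__num_leaves_grid(params, d_train):
    # Exhaustive variant: every (max_depth, num_leaves) pair is evaluated.
    max_depth_list = list(range(4, 9))
    num_leaves_list = list(range(30, 121, 10))

    max_auc = 0.0
    best_max_depth, best_num_leaves = max_depth_list[0], num_leaves_list[0]

    for max_depth, num_leaves in itertools.product(max_depth_list, num_leaves_list):
        params['max_depth'] = max_depth
        params['num_leaves'] = num_leaves
        auc, rounds = _run_cv(params, d_train)  # _run_cv is assumed, as in the snippets above
        if auc > max_auc:
            max_auc = auc
            best_max_depth, best_num_leaves = max_depth, num_leaves

    return best_max_depth, best_num_leaves
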
def build_model():
    ptr.print_log('STEP2: building model ...')

    global xgb_params
    global xgb_rounds

    global lgb_params
    global lgb_rounds

    # xgboost params
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': 0.005,
        'max_depth': 4,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'alpha': 2.4,
        'lambda': 14.0,
        'silent': 1
    }

    # lightgbm params
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.0025,
        'max_depth': 6,
        'num_leaves': 50,
        'min_data_in_leaf': 200,
        'max_bin': 50,
        'verbosity': 0
    }
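
One portability note, an assumption about newer library versions rather than part of the original code: recent xgboost releases deprecate the silent key in favour of verbosity, so running build_model against a current xgboost only needs a small tweak such as:

# For recent xgboost versions, where 'silent' is deprecated:
xgb_params.pop('silent', None)
xgb_params['verbosity'] = 0  # 0 = silent, 1 = warning, 2 = info, 3 = debug
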
Example #3
def _tune_subsample__colsample_bytree(params, d_train):
    ptr.print_log('Tuning subsample and colsample_bytree ...')

    subsample_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    colsample_bytree_list = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

    max_auc = 0.0
    best_subsample = subsample_list[0]
    best_colsample_bytree = colsample_bytree_list[0]

    for subsample, colsample_bytree in zip(subsample_list, colsample_bytree_list):
        # update params
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample_bytree

        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('subsample: {}; colsample_bytree: {}; auc: {}; rounds: {}'.\
              format(subsample, colsample_bytree, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_subsample = subsample
            best_colsample_bytree = colsample_bytree

    ptr.print_log('best subsample: {0}'.format(best_subsample))
    ptr.print_log('best colsample_bytree: {0}'.format(best_colsample_bytree))
    ptr.print_log('max auc: {0}'.format(max_auc))

    return best_subsample, best_colsample_bytree
Example #4
def tune(data_x, data_y):
    ptr.print_log('Tuning xgboost parameters ...')

    d_train = xgb.DMatrix(data_x, label=data_y)

    params = {'objective': 'binary:logistic', 'silent': 1}

    # tune eta
    best_eta = _tune_eta(params, d_train)
    params['eta'] = best_eta

    # tune max_depth and min_child_weight
    best_max_depth, best_min_child_weight = _tune_max_depth__min_child_weight(
        params, d_train)
    params['max_depth'] = best_max_depth
    params['min_child_weight'] = best_min_child_weight

    # tune subsample and colsample_bytree
    best_subsample, best_colsample_bytree = _tune_subsample__colsample_bytree(
        params, d_train)
    params['subsample'] = best_subsample
    params['colsample_bytree'] = best_colsample_bytree

    # tune alpha and lambda
    best_alpha, best_lambda = _tune_alpha_lambda(params, d_train)
    params['alpha'] = best_alpha
    params['lambda'] = best_lambda

    # end
    ptr.print_log('XGBOOST TUNER finished.')
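
The _run_cv helper that every tuner calls is not included in these examples. For the xgboost side it is presumably a thin wrapper around xgb.cv; a minimal sketch, assuming stratified folds, AUC forced through the metrics argument (the base params above set no eval_metric), and guessed values for the fold count, seed, and early stopping:

import xgboost as xgb

def _run_cv(params, d_train, num_boost_round=10000, nfold=5):
    # Cross-validate and return the best mean validation AUC and the round it was reached at.
    cv_result = xgb.cv(params,
                       d_train,
                       num_boost_round=num_boost_round,
                       nfold=nfold,
                       stratified=True,
                       metrics='auc',
                       early_stopping_rounds=50,
                       seed=0)
    # cv_result is a DataFrame; with early stopping it is truncated near the best iteration
    best_round = int(cv_result['test-auc-mean'].idxmax()) + 1
    best_auc = float(cv_result['test-auc-mean'].max())
    return best_auc, best_round
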
Example #5
def _tune_alpha_lambda(params, d_train):
    ptr.print_log('Tuning alpha and lambda ...')

    alpha_list = [0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8]
    lambda_list = [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]

    max_auc = 0.0
    best_alpha = alpha_list[0]
    best_lambda = lambda_list[0]

    for alpha, lambdaa in zip(alpha_list, lambda_list):
        # update params
        params['alpha'] = alpha
        params['lambda'] = lambdaa

        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('alpha: {}; lambda: {}; auc: {}; rounds: {}'.\
              format(alpha, lambdaa, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_alpha = alpha
            best_lambda = lambdaa

    ptr.print_log('best alpha: {0}'.format(best_alpha))
    ptr.print_log('best lambda: {0}'.format(best_lambda))
    ptr.print_log('max auc: {0}'.format(max_auc))

    return best_alpha, best_lambda
Example #6
def _tune_bagging_fraction__bagging_freq(params, d_train):
    ptr.print_log('Tuning bagging_fraction and bagging_freq ...')
    
    bagging_fraction_list = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    bagging_freq_list = list(range(0,51,10))
    
    max_auc = 0.0
    best_bagging_fraction = bagging_fraction_list[0]
    best_bagging_freq = bagging_freq_list[0]

    for bagging_fraction, bagging_freq in zip(bagging_fraction_list, bagging_freq_list):
        # update params
        params['bagging_fraction'] = bagging_fraction
        params['bagging_freq'] = bagging_freq
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('bagging_fraction: {}; bagging_freq: {}; auc: {}; rounds: {}'.\
                      format(bagging_fraction, bagging_freq, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_bagging_fraction = bagging_fraction
            best_bagging_freq = bagging_freq
    
    ptr.print_log('best bagging_fraction: {}'.format(best_bagging_fraction))
    ptr.print_log('best bagging_freq: {}'.format(best_bagging_freq))
    ptr.print_log('max auc: {}'.format(max_auc))    
    
    return best_bagging_fraction, best_bagging_freq
Example #7
def _tune_max_depth__min_child_weight(params, d_train):
    ptr.print_log('Tuning max_depth and min_child_weight ...')

    max_depth_list = list(range(5, 10))
    min_child_weight_list = list(range(1, 5))

    max_auc = 0.0
    best_max_depth = max_depth_list[0]
    best_min_child_weight = min_child_weight_list[0]

    for max_depth, min_child_weight in zip(max_depth_list,
                                           min_child_weight_list):
        # update params
        params['max_depth'] = max_depth
        params['min_child_weight'] = min_child_weight

        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_depth: {}; min_child_weight: {}; auc: {}; rounds: {}'.\
                      format(max_depth, min_child_weight, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_depth = max_depth
            best_min_child_weight = min_child_weight

    ptr.print_log('best max_depth: {0}'.format(best_max_depth))
    ptr.print_log('best min_child_weight: {0}'.format(best_min_child_weight))
    ptr.print_log('max auc: {0}'.format(max_auc))

    return best_max_depth, best_min_child_weight
Example #8
def _tune_min_data_in_leaf(params, d_train):
    ptr.print_log('Tuning min_data_in_leaf...')

    min_data_in_leaf_list = list(range(100,1001,100))
    
    max_auc = 0.0
    best_min_data_in_leaf = min_data_in_leaf_list[0]
    
    for min_data_in_leaf in min_data_in_leaf_list:
        # update params
        params['min_data_in_leaf'] = min_data_in_leaf
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('min_data_in_leaf: {}; auc: {}; rounds: {}'.\
                      format(min_data_in_leaf, auc, rounds))
        
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_min_data_in_leaf = min_data_in_leaf
        
    ptr.print_log('best min_data_in_leaf: {}'.format(best_min_data_in_leaf))
    ptr.print_log('max auc: {}'.format(max_auc))
    
    return best_min_data_in_leaf
Example #9
def _tune_eta(params, d_train):
    ptr.print_log('Tuning eta ...')

    eta_list = [0.2, 0.1, 0.05, 0.025, 0.005, 0.0025]

    max_auc = 0.0
    best_eta = eta_list[0]

    for eta in eta_list:
        # update params
        params['eta'] = eta

        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('eta: {}; auc: {}; rounds: {}'.format(eta, auc, rounds))

        # check auc
        if auc > max_auc:
            max_auc = auc
            best_eta = eta

    ptr.print_log('best eta: {0}'.format(best_eta))
    ptr.print_log('max auc: {0}'.format(max_auc))

    return best_eta
Example #10
def _tune_feature_fraction(params, d_train):
    ptr.print_log('Tuning feature_fraction ...')
    
    feature_fraction_list = [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]
    
    max_auc = 0.0
    best_feature_fraction = feature_fraction_list[0]

    for feature_fraction in feature_fraction_list:
        # update params
        params['feature_fraction'] = feature_fraction
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('feature_fraction: {}; auc: {}; rounds: {}'.\
                      format(feature_fraction, auc, rounds))
        
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_feature_fraction = feature_fraction
                    
    ptr.print_log('best feature_fraction: {}'.format(best_feature_fraction))
    ptr.print_log('max auc: {}'.format(max_auc))
    
    return best_feature_fraction
Example #11
def _tune_learning_rate(params, d_train):
    ptr.print_log('Tuning learning_rate ...')
    
    learning_rate_list = [0.2, 0.1, 0.05, 0.025, 0.005, 0.0025]
    max_auc = 0.0
    best_learning_rate = learning_rate_list[0]

    for learning_rate in learning_rate_list:
        # update params
        params['learning_rate'] = learning_rate
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('learning_rate: {}; auc: {}; rounds: {}'.\
                      format(learning_rate, auc, rounds))
        
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_learning_rate = learning_rate
        
    ptr.print_log('best learning_rate: {}'.format(best_learning_rate))
    ptr.print_log('max auc: {}'.format(max_auc))
        
    return best_learning_rate
Example #12
def _tune_max_bin(params, d_train):
    ptr.print_log('Tuning max_bin...')
    
    max_bin_list = list(range(50,301,50))
    
    max_auc = 0.0
    best_max_bin = max_bin_list[0]
    
    for max_bin in max_bin_list:
        # update params
        params['max_bin'] = max_bin
        
        # run cv
        auc, rounds = _run_cv(params, d_train)
        ptr.print_log('max_bin: {}; auc: {}; rounds: {}'.\
                      format(max_bin, auc, rounds))
        
        # check auc
        if auc > max_auc:
            max_auc = auc
            best_max_bin = max_bin
                    
    ptr.print_log('best max_bin: {}'.format(best_max_bin))
    ptr.print_log('max auc: {}'.format(max_auc))
    
    return best_max_bin
def generate_submission():
    ptr.print_log('STEP4: generating submission ...')

    submission = pd.read_csv(submission_path)

    XGB_WEIGHT_LIST = [1.0]  # , 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
    LGB_WEIGHT_LIST = [0.0]  # , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    for XGB_WEIGHT, LGB_WEIGHT in zip(XGB_WEIGHT_LIST, LGB_WEIGHT_LIST):
        submission['target'] = xgb_pred * XGB_WEIGHT + lgb_pred * LGB_WEIGHT
        submission.to_csv('sub{}_{}_{}.csv'.format(
            datetime.now().strftime('%Y%m%d_%H%M%S'), XGB_WEIGHT, LGB_WEIGHT),
                          index=False,
                          float_format='%.5f')
def _load_data():
    ptr.print_log('Loading data ...')

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    for c in train_df.select_dtypes(include=['float64']).columns:
        train_df[c] = train_df[c].astype(np.float32)
        test_df[c] = test_df[c].astype(np.float32)

    # skip the first two int64 columns (id and target); downcast the rest to int32
    for c in train_df.select_dtypes(include=['int64']).columns[2:]:
        train_df[c] = train_df[c].astype(np.int32)
        test_df[c] = test_df[c].astype(np.int32)

    print('train shape: ', train_df.shape)
    print('test shape : ', test_df.shape)

    return train_df, test_df
def process_data():
    ptr.print_log('STEP1: processing data ...')

    global data_x
    global data_y
    global test_x

    # load data
    train_df, test_df = _load_data()

    # fill NA

    # encode features

    # add features

    # remove outliers
    #_remove_outliers(train_df)

    # select and drop features
    #_select_drop_features(train_df)
    #_select_drop_features(test_df)

    # prepare train and valid data
    ptr.print_log('Preparing train and test data ...')

    data_y = train_df['target']
    data_x = train_df.drop(['id', 'target'], axis=1)
    test_x = test_df[data_x.columns]

    data_x = data_x.values
    data_y = data_y.values
    test_x = test_x.values

    ptr.print_log('train x shape: {}'.format(data_x.shape))
    ptr.print_log('train y shape: {}'.format(data_y.shape))
    ptr.print_log('test x shape : {}'.format(test_x.shape))

    # release
    del train_df
    del test_df
    gc.collect()
def _select_drop_features(df):
    ptr.print_log(
        'Selecting and dropping features according to feature importance ...')
    '''
    drop_features = ['ps_ind_10_bin',
                     'ps_ind_11_bin',
                     'ps_calc_16_bin',
                     'ps_calc_15_bin',
                     'ps_calc_20_bin',
                     'ps_calc_18_bin',
                     'ps_ind_13_bin',
                     'ps_ind_18_bin',
                     'ps_calc_19_bin',
                     'ps_calc_17_bin',
                     'ps_car_08_cat',
                     'ps_ind_09_bin',
                     'ps_car_02_cat',
                     'ps_ind_14']
    
    df.drop(drop_features, axis=1, inplace=True)
    '''
Example #17
def tune(data_x, data_y):
    ptr.print_log('\n')
    ptr.print_log('Tuning LIGHTGBM parameters ...')
    
    d_train = lgb.Dataset(data_x, label=data_y)
    
    # lightgbm params
    params = {
        'objective': 'binary',
    }
    
    # tune learning rate
    best_learning_rate = _tune_learning_rate(params, d_train)
    params['learning_rate'] = best_learning_rate
        
    # tune max_depth and num_leaves
    max_depth, num_leaves = _tune_max_depth__num_leaves(params, d_train)
    params['max_depth'] = max_depth
    params['num_leaves'] = num_leaves

    # tune min_data_in_leaf
    min_data_in_leaf = _tune_min_data_in_leaf(params, d_train)
    params['min_data_in_leaf'] = min_data_in_leaf
        
    # tune max_bin
    max_bin = _tune_max_bin(params, d_train)
    params['max_bin'] = max_bin
        
    # tune bagging_fraction and bagging_freq
    bagging_fraction, bagging_freq = _tune_bagging_fraction__bagging_freq(params, d_train)
    params['bagging_fraction'] = bagging_fraction
    params['bagging_freq'] = bagging_freq
    
    # tune feature_fraction
    feature_fraction = _tune_feature_fraction(params, d_train)    
    params['feature_fraction'] = feature_fraction
    
    # end
    ptr.print_log('LIGHTGBM TUNER finished.')
    ptr.print_log('\n')
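
As on the xgboost side, the _run_cv helper used by the LightGBM tuners is not shown. A possible sketch, assuming lgb.cv with stratified folds, AUC forced through the metrics argument (the base params in tune() above start without a metric), and guessed early-stopping and seed values; note that lightgbm >= 4.0 renames the result key to 'valid auc-mean' and moves early stopping into callbacks:

import lightgbm as lgb

def _run_cv(params, d_train, num_boost_round=10000, nfold=5):
    # Cross-validate and return the best mean validation AUC and the number of rounds.
    cv_result = lgb.cv(params,
                       d_train,
                       num_boost_round=num_boost_round,
                       nfold=nfold,
                       stratified=True,
                       metrics='auc',
                       early_stopping_rounds=100,
                       seed=0)
    auc_mean = cv_result['auc-mean']  # truncated at the best iteration by early stopping
    return auc_mean[-1], len(auc_mean)
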


################################################################################
## main
def main():
    process_data()
    if IS_PARAMS_TUNNING is False:
        build_model()
        train_predict()
        generate_submission()
    else:
        # xgboost parameters tuning
        xgboost_tuner.tune(data_x, data_y)

        # lightgbm parameters tuning
        lightgbm_tuner.tune(data_x, data_y)


################################################################################
if __name__ == "__main__":
    ptr.print_log('TRAINER')
    main()
    ptr.print_log('THE END.')
def _encode_features(df):
    ptr.print_log('Encoding features ...')
def _add_features(df):
    ptr.print_log('Adding features ...')
def _remove_outliers(df):
    ptr.print_log('Removing outliers ...')
def train_predict():
    ptr.print_log('STEP3: training ...')

    global xgb_pred
    global lgb_pred

    kfold = 5

    # xgboost
    xgb_pred = 0.0
    d_test = xgb.DMatrix(test_x)
    skf = StratifiedKFold(n_splits=kfold)

    for i, (train_index, valid_index) in enumerate(skf.split(data_x, data_y)):
        ptr.print_log('xgboost kfold: {}'.format(i + 1))

        if i == 3:  # best cv
            train_x, valid_x = data_x[train_index], data_x[valid_index]
            train_y, valid_y = data_y[train_index], data_y[valid_index]

            d_train = xgb.DMatrix(train_x, train_y)
            d_valid = xgb.DMatrix(valid_x, valid_y)

            evals = [(d_train, 'train'), (d_valid, 'valid')]
            evals_result = {}
            xgb_model = xgb.train(xgb_params,
                                  d_train,
                                  num_boost_round=10000,
                                  evals=evals,
                                  feval=_gini_xgb,
                                  evals_result=evals_result,
                                  maximize=True,
                                  early_stopping_rounds=50,
                                  verbose_eval=100)

            xgb_pred += xgb_model.predict(
                d_test, ntree_limit=xgb_model.best_ntree_limit)

            if False:  # flip to True to log per-round gini values
                result_train_gini = evals_result['train']
                result_valid_gini = evals_result['valid']
                for j in range(xgb_model.best_iteration + 1):
                    train_gini = result_train_gini['gini'][j]
                    valid_gini = result_valid_gini['gini'][j]
                    ptr.print_log(
                        'round, train_gini, valid_gini: {0:04}, {1:0.6}, {2:0.6}'
                        .format(j, train_gini, valid_gini), False)

    #xgb_pred = xgb_pred / kfold
    xgb_pred = xgb_pred  # only choose cv 3
    gc.collect()

    # lightgbm
    lgb_pred = 0.0
    skf = StratifiedKFold(n_splits=kfold)

    if False:  # the LightGBM folds are disabled here, so lgb_pred stays 0.0
        for i, (train_index,
                valid_index) in enumerate(skf.split(data_x, data_y)):
            ptr.print_log('lightgbm kfold: {}'.format(i + 1))

            train_x, valid_x = data_x[train_index], data_x[valid_index]
            train_y, valid_y = data_y[train_index], data_y[valid_index]

            d_train = lgb.Dataset(train_x, train_y)
            d_valid = lgb.Dataset(valid_x, valid_y)

            valid_sets = [d_train, d_valid]
            valid_names = ['train', 'valid']
            evals_result = {}

            lgb_model = lgb.train(lgb_params,
                                  d_train,
                                  num_boost_round=10000,
                                  valid_sets=valid_sets,
                                  valid_names=valid_names,
                                  feval=_gini_lgb,
                                  evals_result=evals_result,
                                  early_stopping_rounds=100,
                                  verbose_eval=100)

            lgb_pred += lgb_model.predict(
                test_x, num_iteration=lgb_model.best_iteration)

            result_train_gini = evals_result['train']
            result_valid_gini = evals_result['valid']
            for j in range(lgb_model.best_iteration + 1):
                train_gini = result_train_gini['gini'][j]
                valid_gini = result_valid_gini['gini'][j]
                ptr.print_log(
                    'round, train_gini, valid_gini: {0:04}, {1:0.6}, {2:0.6}'.
                    format(j, train_gini, valid_gini), False)

    lgb_pred = lgb_pred / kfold
    gc.collect()
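
train_predict passes _gini_xgb and _gini_lgb as custom evaluation functions, but neither is part of the listing. They are presumably the normalized Gini coefficient (2*AUC - 1) wrapped in the feval signatures the two libraries expect; a hedged sketch using scikit-learn:

from sklearn.metrics import roc_auc_score

def _gini_xgb(preds, d_train):
    # xgboost feval: returns (name, value); maximize=True is set in xgb.train above
    labels = d_train.get_label()
    return 'gini', 2.0 * roc_auc_score(labels, preds) - 1.0

def _gini_lgb(preds, d_train):
    # lightgbm feval: returns (name, value, is_higher_better)
    labels = d_train.get_label()
    return 'gini', 2.0 * roc_auc_score(labels, preds) - 1.0, True
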
def _fill_NA(df):
    ptr.print_log('Filling data ...')
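
Finally, ptr.print_log is also external to these snippets; it looks like a thin timestamped-logging wrapper. A minimal stand-in is sketched below. Only the module name ptr and the optional second argument (seen in the two-argument calls inside train_predict) come from the listing; the timestamp format, the log file name, and the meaning of that flag are assumptions.

# printer.py -- a possible stand-in for the `ptr` module
from datetime import datetime

def print_log(message, to_console=True):
    # Timestamped log line; the second flag is assumed to control console echo
    line = '{} {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
    if to_console:
        print(line)
    with open('train.log', 'a') as f:  # log file name is hypothetical
        f.write(line + '\n')
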