def get_xgboost_classifier(X_train, y_train, X_val, y_val, params=None, tag=""):
    param_grid = {'max_depth': [3, 5, 7],
                  'min_child_weight': [1, 3, 5],
                  'n_estimators': [50]}

    if params is None:
        xgb = XGBClassifier(learning_rate=0.2, objective='binary:logistic', seed=27)

        t = start("training xgboost ")
        # ShuffleSplit and GridSearchCV now live in sklearn.model_selection
        # (the old cross_validation / grid_search modules were removed).
        cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=123)
        clf = GridSearchCV(xgb, param_grid, cv=cv, n_jobs=1, scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
        report(t, nitems=10 * len(param_grid))

        print("Best score: {} with scorer {}".format(clf.best_score_, clf.scorer_))
        print("With parameters:")
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric='auc',
                verbose=False)

        if plot_cv_curves:
            train = clf.evals_result()['validation_0']['auc']
            val = clf.evals_result()['validation_1']['auc']
            plot_cv_curve(train, val, tag)

        if plot_feature_importance:
            plot_feature_importance(clf, tag)

    return clf
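# A brief usage sketch for the function above (hypothetical call; assumes the data is
# already split and that the helpers start/report/plot_cv_curve and the module-level
# flags plot_cv_curves / plot_feature_importance are defined elsewhere):
#
#   # grid-search mode: params=None triggers the GridSearchCV branch
#   clf = get_xgboost_classifier(X_train, y_train, X_val, y_val, params=None)
#
#   # fixed-parameter mode: params are passed straight to XGBClassifier
#   fixed = {'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 50,
#            'learning_rate': 0.2, 'objective': 'binary:logistic'}
#   clf = get_xgboost_classifier(X_train, y_train, X_val, y_val, params=fixed, tag="run1")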
# define X and y
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ClusterCentroids under-sampling
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_resample(X_train, y_train)  # fit_sample was renamed fit_resample in imbalanced-learn

# XGBoost
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1,
                       max_depth=6, gamma=0, subsample=1, max_delta_step=0,
                       colsample_bytree=1, reg_lambda=1, n_estimators=100,
                       seed=1000, scale_pos_weight=1000)
clf_XG.fit(os_X, os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc',
           verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specificity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specificity: ", specificity)
print("G score: ", math.sqrt(recall / specificity))
def plot_learning_curve_versus_tr_epoch(title='', ntrials=1, nfolds=10,
                                        save_csv=False, verbose=True, save_fig=False):
    X_df, Y_df = data_handler.load_XY()
    X = X_df.values
    Y = Y_df.values
    _ylabel = 'Mean AUROC'
    n_jobs = 4

    # cross-validation setup
    Ntrials = ntrials
    outer_nsplit = nfolds
    tot_count = Ntrials * outer_nsplit

    # results store (one row per trial/fold run, one column per boosting epoch)
    train_mat = np.zeros((tot_count, 500))
    test_mat = np.zeros((tot_count, 500))

    for i in range(Ntrials):
        init_time = time.time()
        print("trial = ", i)
        train_index = []
        test_index = []
        outer_cv = StratifiedKFold(n_splits=outer_nsplit, shuffle=True, random_state=i)
        for train_ind, test_ind in outer_cv.split(X, Y):
            train_index.append(train_ind.tolist())
            test_index.append(test_ind.tolist())

        for j in range(outer_nsplit):
            count = i * outer_nsplit + j
            print(str(count), " / ", str(tot_count))
            X_train = X[train_index[j]]
            Y_train = Y[train_index[j]]
            X_test = X[test_index[j]]
            Y_test = Y[test_index[j]]
            eval_sets = [(X_train, Y_train), (X_test, Y_test)]

            # silent= is deprecated in newer xgboost; verbosity is handled via fit(verbose=...)
            clf = XGBClassifier(objective="binary:logistic",
                                min_child_weight=1,
                                tree_method='exact',
                                n_jobs=4,
                                random_state=3,
                                seed=3,
                                learning_rate=0.01,
                                colsample_bylevel=0.9,
                                colsample_bytree=0.9,
                                n_estimators=500,
                                gamma=0.8,
                                max_depth=11,
                                reg_lambda=0.8,
                                subsample=0.4)
            clf.fit(X_train, Y_train, eval_metric=['auc'], eval_set=eval_sets, verbose=False)
            results = clf.evals_result()
            epochs = len(results['validation_0']['auc'])

            # record per-epoch results
            train_mat[count] = results['validation_0']['auc']
            test_mat[count] = results['validation_1']['auc']

            if verbose:
                print('Iter: %d, epochs: %d' % (count, epochs))
                print('training result: %.4f, testing result: %.4f' % (train_mat[count][499], test_mat[count][499]))
                print('total time: %.4f mins' % ((time.time() - init_time) / 60))

    # store raw per-epoch results
    epoch_lists = list(range(1, epochs + 1))
    train_results = pd.DataFrame(data=train_mat, columns=['epoch_' + str(i) for i in epoch_lists])
    test_results = pd.DataFrame(data=test_mat, columns=['epoch_' + str(i) for i in epoch_lists])
    if save_csv:
        data_handler.save_csv(train_results, title='mos2_learning_curve_train_raw')
        data_handler.save_csv(test_results, title='mos2_learning_curve_test_raw')
    print('end')

    # learning-curve statistics across all trial/fold runs
    train_scores_mean = np.mean(train_mat, axis=0)
    train_scores_std = np.std(train_mat, axis=0)
    test_scores_mean = np.mean(test_mat, axis=0)
    test_scores_std = np.std(test_mat, axis=0)

    tr_size_df = pd.Series(epoch_lists, name='training_epoch')
    tr_sc_m_df = pd.Series(train_scores_mean, name='training_score_mean')
    val_sc_m_df = pd.Series(test_scores_mean, name='val_score_mean')
    tr_sc_std_df = pd.Series(train_scores_std, name='training_score_std')
    val_sc_std_df = pd.Series(test_scores_std, name='val_score_std')
    if save_csv:
        res = pd.concat([tr_size_df, tr_sc_m_df, val_sc_m_df, tr_sc_std_df, val_sc_std_df], axis=1)
        data_handler.save_csv(data=res, title=title + '_learning_curve')

    # plotting
    _ylim = (0.5, 1.01)
    fig = plt.figure(figsize=(12, 12 / 1.618))
    ax1 = fig.add_subplot(111)
    ax1.set_ylim(_ylim)
    ax1.set_xlabel("Number of Training Epochs")
    ax1.set_ylabel(_ylabel)
    plt.grid(False)
    ax1.plot(tr_size_df, tr_sc_m_df, color="r", label="Training")
    ax1.plot(tr_size_df, val_sc_m_df, color="b", label="Validation")
    # optional error bars
    # ax1.errorbar(tr_size_df, tr_sc_m_df, yerr=tr_sc_std_df, color="r")
    # ax1.errorbar(tr_size_df, val_sc_m_df, yerr=val_sc_std_df)
    plt.setp(ax1.spines.values(), color='black')
    plt.legend(loc="lower right")
    plt.show()

    to_path = None
    if save_fig:
        to_path = data_handler.format_title(to_dir, title + '_learning_curve', '.png')
        fig.savefig(to_path, dpi=1000, bbox_inches="tight", pad_inches=0.1)
    return to_path
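# A brief usage sketch (hypothetical call; data_handler, to_dir and the plotting
# imports are assumed to be available at module level):
#
#   saved_path = plot_learning_curve_versus_tr_epoch(title='mos2', ntrials=1, nfolds=10,
#                                                    save_csv=True, verbose=True, save_fig=True)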
""" #%% # starting with 300 estimators to make a 1st plot, will keep all else at default. model = XGBClassifier(n_estimators=300) eval_set = [(X_train, y_train), (X_test, y_test)] eval_metric = ["merror", "rmse", "mlogloss", "auc"] model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True) train_merror = model.evals_result()['validation_0']['merror'] test_merror = model.evals_result()['validation_1']['merror'] merror_df = pd.DataFrame({ 'train': train_merror, 'test': test_merror, 'iteration': range(300) }) y_pred = model.predict(X_test) predictions = [round(value) for value in y_pred] # evaluate predictions accuracy = accuracy_score(y_test, predictions) print("Accuracy: %.2f%%" % (accuracy * 100.0)) #%% """
                     gamma=0,
                     subsample=.6,
                     colsample_bytree=.55,
                     objective='binary:logistic',
                     nthread=5,
                     scale_pos_weight=45,
                     seed=27,
                     n_jobs=5)

eval_set = [(x_train, y_train), (x_valid, y_valid)]
xgb1.fit(x_train, y_train, eval_set=eval_set, eval_metric=f1_eval)
pre_xgb = xgb1.predict(x_valid)
if best_fx[2] < f1_score(y_valid, pre_xgb):
    best_fx[0] = eta
    best_fx[1] = np.where(
        np.array(xgb1.evals_result()['validation_1']['f1_err']) ==
        min(xgb1.evals_result()['validation_1']['f1_err']))[0][0]
    best_fx[2] = f1_score(y_valid, pre_xgb)
    print(best_fx)
print('--' * 40)
print(best_fx)

################################
# Import libraries:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV  # performing grid search (sklearn.grid_search is deprecated)
from sklearn.model_selection import train_test_split
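# The snippet above passes a custom eval_metric callable named f1_eval but never defines
# it. A plausible sketch (an assumption, not the original author's code), consistent with
# the 'f1_err' key and the min() taken over it above: with the older xgboost sklearn API,
# a callable eval metric receives (predictions, DMatrix) and returns a (name, value) pair
# where lower is better.
def f1_eval(y_pred, dtrain):
    from sklearn.metrics import f1_score
    y_true = dtrain.get_label()
    # binary:logistic predictions are probabilities; threshold at 0.5
    return 'f1_err', 1.0 - f1_score(y_true, (y_pred > 0.5).astype(int))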
                       min_child_weight=1,
                       max_depth=6,
                       gamma=0,
                       subsample=1,
                       max_delta_step=0,
                       colsample_bytree=1,
                       reg_lambda=1,
                       n_estimators=100,
                       seed=1000,
                       scale_pos_weight=1000)
clf_XG.fit(os_X, os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc',
           verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specificity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specificity: ", specificity)
print("G score: ", math.sqrt(recall / specificity))
                    min_child_weight=1,
                    # defaults to 1; the minimum sum of instance Hessians (h) required in a leaf.
                    # For imbalanced 0-1 classification, if h is around 0.01, min_child_weight=1
                    # means a leaf must contain at least ~100 samples. This parameter strongly
                    # affects the result: the smaller it is, the easier the model overfits.
                    max_depth=6,          # tree depth; larger values overfit more easily
                    gamma=0,              # minimum loss reduction required to split a leaf further; larger is more conservative, typically 0.1-0.2
                    subsample=1,          # subsample ratio of the training instances
                    max_delta_step=0,     # maximum delta step allowed for each tree's weight estimate
                    colsample_bytree=1,   # column subsampling ratio when constructing each tree
                    reg_lambda=1,         # L2 regularisation on weights; larger values make the model less prone to overfitting
                    # reg_alpha=0,        # L1 regularisation term
                    # scale_pos_weight=1, # values > 0 balance positive/negative weights and help convergence on imbalanced classes
                    # objective='multi:softmax',  # multi-class objective; specifies the learning task and target
                    # num_class=10,       # number of classes, used together with multi:softmax
                    n_estimators=100,     # number of trees
                    seed=1000             # random seed
                    # eval_metric='auc'
                    )
clf.fit(X_train, y_train, eval_metric='auc')

# pass validation sets; verbose=False suppresses per-iteration output
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='auc',
        verbose=False)

# retrieve results on the eval sets
evals_result = clf.evals_result()

y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

# regression
# m_regress = xgb.XGBRegressor(n_estimators=1000, seed=0)
for i in range(n_iterations):
    # old API was StratifiedKFold(y, n_folds=...); the modern class takes n_splits and
    # yields indices from .split(X, y)
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True)
    j = 0
    for train_index, test_index in folds.split(X_train, y_train):
        print(str(i) + str(j))
        X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
        y_train2, y_test2 = y_train[train_index], y_train[test_index]
        X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)
        X_train2 = csr_matrix(X_train2.values)
        X_test2 = csr_matrix(X_test2.values)
        clf.fit(X_train2, y_train2,
                eval_set=[(X_test2, y_test2)],
                eval_metric='mlogloss',
                verbose=False)
        df['column' + str(i) + str(j)] = clf.evals_result()['validation_0']['mlogloss']
        df['column' + str(i) + str(j)] = df['column' + str(i) + str(j)].astype(float)
        j = j + 1

print('score', df.sum(axis=1).min() / (n_iterations * n_folds))
print('iteration', df.sum(axis=1).argmin() + 1)
# print(df.sum(axis=1) / (n_iterations * n_folds))
for i in df.sum(axis=1) / (n_iterations * n_folds):
    print(i)

if is_find_n == 1:
    X_train, X_test = feature_engineering(df_train, df_test, y_train)
    learning_rate, max_depth, ss, cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.1, 6, 0.7, 0.7, 0, 1, 1, 0
    # learning_rate, max_depth, ss, cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.1, 4, 0.8, 0.8, 0, 1, 1, 0
# **early_stopping_rounds — overfitting prevention: stop training early if the eval metric shows no improvement**

# In[72]:

y_pred1 = xgb1.predict(x_test)

# In[74]:

accuracy_score(y_t2, y_pred1)

# ## Plotting classification error and log loss against each iteration

# In[80]:

# retrieve performance metrics
results = xgb1.evals_result()
epochs = len(results['validation_0']['error'])
x = range(0, epochs)

# plot log loss
fig, ax = plt.subplots()
ax.plot(x, results['validation_0']['logloss'], label='Train')
ax.plot(x, results['validation_1']['logloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.xlabel('Epochs')
plt.title('XGBoost Log Loss')
plt.show()

# plot classification error
fig, ax = plt.subplots()
ax.plot(x, results['validation_0']['error'], label='Train')
ax.plot(x, results['validation_1']['error'], label='Test')
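# The heading above mentions early_stopping_rounds, but the visible code does not show it
# in use. A self-contained sketch on synthetic data (illustrative only, not the author's
# configuration), using the same fit-kwarg API as the rest of this document: training stops
# once the metric on the last eval set has not improved for early_stopping_rounds rounds,
# and best_iteration / best_score record the best round found.

# In[ ]:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

Xs, ys = make_classification(n_samples=1000, n_features=20, random_state=0)
Xs_tr, Xs_te, ys_tr, ys_te = train_test_split(Xs, ys, test_size=0.25, random_state=0)

es_model = XGBClassifier(n_estimators=1000, learning_rate=0.1)
es_model.fit(Xs_tr, ys_tr,
             eval_set=[(Xs_te, ys_te)],
             eval_metric='logloss',
             early_stopping_rounds=10,   # stop if no improvement for 10 consecutive rounds
             verbose=False)
print(es_model.best_iteration, es_model.best_score)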