scores = cross_val_score(model, X_train_final, y_train_final, cv=5)
print('***************************************Lasso (0.001)***************************************')
print('cross validation score', scores.mean())
lasso_model_0001_train_score = lasso_0001.score(X_train_final, y_train_final)
lasso_model_0001_test_score = lasso_0001.score(X_test_final, y_test_final)
y_pred = lasso_0001.predict(X_test_final)  # restored; y_pred was undefined in this excerpt
mse = mean_squared_error(y_test_final, y_pred)
rmse = np.sqrt(mse)  # np.math.sqrt is deprecated; np.sqrt works for scalars too
print('RMSE: {}'.format(rmse))
print('Train Score: {}'.format(lasso_model_0001_train_score))
print('Test Score: {}'.format(lasso_model_0001_test_score))

# ******************* Random Forest ************************
rfc = RandomForestClassifier()
run_model(rfc, 'Random Forest')

# ******************* Gradient Boost Classifier ************************
gbc = GradientBoostingClassifier()
run_model(gbc, 'Gradient Boost Classifier')

# ******************* XG Boost Classifier ************************
xgb = XGBClassifier()
run_model(xgb, 'XG Boost Classifier')

# After analyzing the evaluation metrics, we decided to export log_reg, gradient boost and xgboost classifier
joblib.dump(log_reg, 'log_reg_stars_model.joblib')
joblib.dump(gbc, 'gbc_stars_model.joblib')
joblib.dump(xgb, 'xgb_stars_model.joblib')
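# `run_model` is a helper not shown in this excerpt. A minimal sketch of what
# such a helper might do, assuming a classification train/test split is in
# scope (the split names X_train/X_test/y_train/y_test are placeholders, not
# the author's variables):
from sklearn.metrics import accuracy_score, classification_report

def run_model(estimator, label):
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_test)
    print('*** {} ***'.format(label))
    print('Accuracy: {:.3f}'.format(accuracy_score(y_test, preds)))
    print(classification_report(y_test, preds))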
tree = rf.estimators_[5]  # arbitrarily taking the 5th of the 10 trees that were generated
# Export the tree to a dot file
export_graphviz(tree, out_file='branch.dot', feature_names=feature_list)
# Use the dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('branch.dot')
# Write the graph to a png file, displaying the 5th tree as an image
graph.write_png('branch.png')

# XGBoost
clf = XGBClassifier(max_depth=3, n_jobs=4)
# Train the model and make probability predictions on the test data
probabilities = clf.fit(trainX, trainTargets).predict_proba(testX)
preds = clf.predict(testX)
print("\nConfusion Matrix of XGBoost is:-\n")
print(confusion(testTargets, preds))
cfmat(preds, 'XGBoost')  # Display confusion matrix
print("\nAUPRC score of XGBoost is:-\n")
print(average_precision_score(testTargets, probabilities[:, 1]))  # Display the AUPRC score
skplt.metrics.plot_precision_recall(testTargets, probabilities)  # Plot the precision-recall curve
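# `confusion` and `cfmat` are project helpers not shown here. A minimal sketch
# of what they might wrap, assuming a binary target (both implementations are
# assumptions for illustration):
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

def confusion(y_true, y_pred):
    return confusion_matrix(y_true, y_pred)

def cfmat(y_pred, title):
    cm = confusion_matrix(testTargets, y_pred)
    plt.imshow(cm, cmap='Blues')
    plt.title(title)
    plt.colorbar()
    plt.show()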
scores_4 = []
scores_5 = []
scores_6 = []
for i in range(100):
    print('###########Time############', i + 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, stratify=y, random_state=i)
    ss = StandardScaler()
    x_train_ss = ss.fit_transform(x_train)
    x_test_ss = ss.transform(x_test)
    model = XGBClassifier(learning_rate=0.01,
                          n_estimators=500,
                          max_depth=3,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.6,
                          colsample_bytree=1.0,
                          seed=1,
                          nthread=6)
    model.fit(x_train_ss, y_train)
    y_proba = model.predict_proba(x_test_ss)[:, 1]
    y_pred = model.predict(x_test_ss)
    auc = metrics.roc_auc_score(y_test, y_proba)
    acc = accuracy_score(y_test, y_pred)
    precision, recall, _thresholds = metrics.precision_recall_curve(y_test, y_proba)
    pr_auc = metrics.auc(recall, precision)
    mcc = matthews_corrcoef(y_test, y_pred)
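    # A minimal sketch of collecting the per-run metrics computed above,
    # assuming a single list accumulates them (`run_scores` is a hypothetical
    # name and would be initialized before the loop alongside scores_4/5/6):
    run_scores.append((auc, acc, pr_auc, mcc))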
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from xgboost.sklearn import XGBClassifier

clf = XGBClassifier(learning_rate=0.02,
                    n_estimators=400,
                    max_depth=7,
                    min_child_weight=1.5,
                    gamma=0.3,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    n_jobs=-1,
                    scale_pos_weight=1,
                    max_delta_step=5,
                    silent=False)
                    # num_class=3 removed: it only applies to multi-class
                    # objectives and conflicts with binary:logistic
fold_num = 5
df = pd.read_csv('test.csv')
df = df.fillna(-1)  # fillna returns a copy; the result must be assigned back
data_array = df.values[:, 1:]
kf = KFold(n_splits=fold_num)
count = 0
aucs = []
for train_index, test_index in kf.split(data_array):
    print("count = %d" % count)
    count += 1
    # the last column is assumed to hold the label (closing bracket restored
    # from the truncated source)
    clf.fit(data_array[list(train_index), :-1],
            data_array[list(train_index), -1])
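    # A minimal sketch of scoring this fold, assuming a binary label in the
    # last column (this completes the truncated loop body above):
    proba = clf.predict_proba(data_array[list(test_index), :-1])[:, 1]
    aucs.append(roc_auc_score(data_array[list(test_index), -1], proba))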
juiceboy = juiceboy.drop([u'Unnamed: 0', u'CustomerMD5Key', u'FirstDriverDrivingLicenseNumberY',
                          u'CarParkingTypeId', u'FirstDriverDrivingLicenceType',
                          u'CarDrivingEntitlement', u'CarTransmissionId', u'PolicyHolderResidencyArea',
                          u'car_flag_1', u'car_flag_4', u'car_flag_3', u'vendor_log_sales',
                          u'RatedDriverNumber', 'CarFuelld', 'AllDriversNbConviction'],
                         axis=1)

# In[ ]:

target = "labels"
xgbl = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
supplied_arr = juiceboy
predictors = [x for x in train.columns
              if x not in [target, 'Unnamed: 0', u'CustomerMD5Key']]
dtrain = train
alg = xgbl
useTrainCV = True
cv_folds = 5
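# The variables above (alg, dtrain, predictors, useTrainCV, cv_folds) follow
# the common "modelfit" tuning pattern. A minimal sketch of that routine,
# assuming xgb.cv picks n_estimators before the final fit (a sketch under
# those assumptions, not the author's helper):
import xgboost as xgb

if useTrainCV:
    xgb_param = alg.get_xgb_params()
    dmatrix = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
    cvresult = xgb.cv(xgb_param, dmatrix,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='auc',
                      early_stopping_rounds=50)
    alg.set_params(n_estimators=cvresult.shape[0])
alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')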
train_x, train_y = getTrain()
test_x, res = getTest()

# Start training the model
clf = XGBClassifier(
    silent=0,             # set to 1 to silence run-time output; best left at 0 (whether to print messages while boosting)
    # nthread=4,          # number of CPU threads; defaults to the maximum available
    learning_rate=0.3,    # analogous to a learning rate
    min_child_weight=1,   # default 1; minimum sum of instance weight (hessian) needed in a leaf.
                          # For imbalanced 0-1 classification, if h is around 0.01,
                          # min_child_weight=1 means a leaf needs roughly 100 samples.
                          # This parameter strongly affects results: smaller values overfit more easily.
    max_depth=6,          # depth of each tree; larger values overfit more easily
    gamma=0,              # minimum loss reduction to further partition a leaf node; larger is more conservative, typically 0.1-0.2
    subsample=1,          # subsample ratio of the training instances
    max_delta_step=0,     # maximum delta step we allow each tree's weight estimation to be
    colsample_bytree=1,   # column subsampling when building each tree
    reg_lambda=1,         # L2 regularization on weights; larger values make the model harder to overfit
    # reg_alpha=0,        # L1 regularization term
    # scale_pos_weight=1, # values > 0 help convergence with imbalanced classes by balancing positive/negative weights
    # objective='multi:softmax',  # learning task and objective, for multi-class problems
    # num_class=10,       # number of classes, used together with multi:softmax
    n_estimators=100,     # number of trees
    seed=1000             # random seed
    # eval_metric='auc'
)
clf.fit(train_x, train_y)

# Test data
y_pre = clf.predict(test_x)
print(y_pre)
# Get the scores
def optimize_max_depth_and_min_child_weight ( model, x, t ):

    # Local variables.
    indent = 3
    log ( 'Optimizing max depth and min child weight:', indent = indent )

    # Initialize search parameters.
    grid_resolution         = 3
    max_depth_min           = 1
    max_depth_max           = 9
    max_depth_stride        = grid_resolution
    min_child_weight_min    = 1
    min_child_weight_max    = 9
    min_child_weight_stride = grid_resolution

    # Configure grid search.
    parameter_search_1 = {
        'max_depth'        : list ( range ( max_depth_min, max_depth_max, max_depth_stride ) ),
        'min_child_weight' : list ( range ( min_child_weight_min, min_child_weight_max, min_child_weight_stride ) )
    }

    # Perform grid search.
    # Note: grid_scores_ and the iid argument were removed in newer
    # scikit-learn releases (use cv_results_ instead); this code targets an
    # older version.
    grid_search_1 = GridSearchCV (
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,
            seed             = Constant.Model.SEED ),
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False )
    grid_search_1.fit ( x, t )

    # Report results.
    for e in grid_search_1.grid_scores_:
        log ( str ( e ), indent = indent + 1 )
    min_child_weight = grid_search_1.best_params_ [ 'min_child_weight' ]
    max_depth        = grid_search_1.best_params_ [ 'max_depth' ]
    log ( 'Optimal min child weight = ' + str ( min_child_weight ), indent = indent + 1 )
    log ( 'Optimal max depth = ' + str ( max_depth ), indent = indent + 1 )
    log ( 'Best Score = ' + str ( grid_search_1.best_score_ ), indent = indent + 1 )

    # Update model using optimized parameters.
    model.set_params ( min_child_weight = min_child_weight )
    model.set_params ( max_depth = max_depth )

    # Return updated model.
    return model
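# `Constant` and `log` are project scaffolding not shown in this excerpt. A
# minimal sketch of what they might contain, in the same style; every value
# here is a hypothetical placeholder, not the author's configuration:
def log ( message, indent = 0 ):
    print ( ' ' * ( 4 * indent ) + message )

class Constant:
    class Model:
        LEARNING_RATE    = 0.1
        N_ESTIMATORS     = 100
        MAX_DEPTH        = 5
        MIN_CHILD_WEIGHT = 1
        GAMMA            = 0
        SUBSAMPLE        = 0.8
        COLSAMPLE_BYTREE = 0.8
        OBJECTIVE        = 'binary:logistic'
        SCALE_POS_WEIGHT = 1
        SEED             = 27
        class GridSearch:
            class Tree:
                SCORING = 'roc_auc'
                CV      = 5
                VERBOSE = 0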
# data1_new_train.to_csv("data_train.csv")
data1_new_test = data1_new.iloc[11017:, :]  # .ix is deprecated; use .iloc for positional slicing
X_train, X_test, Y_train, Y_test = train_test_split(data1_new_train,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

# Train the XGBoost model
# Set initial parameters
xgb_train1 = XGBClassifier(booster="gbtree",
                           learning_rate=0.02,
                           n_estimators=800,
                           max_depth=3,
                           min_child_weight=4,
                           gamma=0,
                           reg_alpha=10,
                           reg_lambda=10,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='binary:logistic',
                           nthread=8,
                           scale_pos_weight=4,
                           seed=27)
xgb_train1.fit(X_train, Y_train, eval_metric="auc")

from sklearn.feature_selection import SelectFromModel

model = SelectFromModel(xgb_train1, threshold=0.01, prefit=True)
X_new = model.transform(X_train)
X_new_test = model.transform(X_test)
xgb_train2 = XGBClassifier(booster="gbtree",
                           learning_rate=0.01,
                           n_estimators=1000,
                           max_depth=5,
np.random.seed(seed)
params_grid = {
    'max_depth': [1, 3, 5, 7, 9],
    'n_estimators': [10, 100, 1000],
    'learning_rate': np.linspace(1e-16, 1, 3),
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.3, 0.5]
}
params_fixed = {
    'learning_rate': 0.1,  # overridden by the grid values above during the search
    'silent': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    'nthread': 4,
    'scale_pos_weight': 1,
}
bst_grid = GridSearchCV(estimator=XGBClassifier(**params_fixed, seed=seed),
                        param_grid=params_grid,
                        cv=5,
                        scoring='accuracy')
train_labels = train_labels.astype(str)
bst_grid.fit(train, train_labels)
bst_grid.grid_scores_  # removed in scikit-learn 0.20; use cv_results_ on newer versions
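# On newer scikit-learn, the same report can be produced from cv_results_ and
# best_params_; a minimal equivalent sketch:
for mean, params in zip(bst_grid.cv_results_['mean_test_score'],
                        bst_grid.cv_results_['params']):
    print('{:.4f} with {}'.format(mean, params))
print('Best:', bst_grid.best_score_, bst_grid.best_params_)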
X = data_removed.iloc[:, :-1]
y = data_removed.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print("***********Decision Tree Classifier************")
print("Train Accuracy :", round(clf.score(X_train, y_train), 3))
print("Test Accuracy ", round(metrics.accuracy_score(y_test, clf_pred), 3))

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("*************Xgboost Classifier************")
print("Train Accuracy of xgb:", round(xgb.score(X_train, y_train), 3))
print("Test Accuracy ", round(metrics.accuracy_score(y_test, xgb_pred), 3))

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print("*************Random Forest Classifier************")
print("Train Accuracy of Random Forest C:", round(rfc.score(X_train, y_train), 3))
# Bug fix: score the random forest against y_test, not against the XGBoost predictions
print("Test Accuracy ", round(metrics.accuracy_score(y_test, rfc_pred), 3))

#%% Regularization
from sklearn.model_selection import GridSearchCV
import csv  # needed for the writer below; missing from the original imports
from xgboost.sklearn import XGBClassifier

# Parameter setting
param = {}
param['objective'] = 'binary:logistic'
param['gamma'] = 0
param['max_depth'] = 6
param['min_child_weight'] = 1
param['max_delta_step'] = 0
param['subsample'] = 1
param['colsample_bytree'] = 1
param['silent'] = 1
param['seed'] = 0
param['base_score'] = 0.5

xclas = XGBClassifier(**param)
xclas.fit(x, y)
result = xclas.predict(x_test)

text = open(fileName, "w+")
s = csv.writer(text, delimiter=',', lineterminator='\n')
s.writerow(["id", "label"])
for i in range(len(result)):
    s.writerow([i + 1, int(result[i])])
text.close()

# X_train_path = '/home/tappy/X_train'
# Y_train_path = '/home/tappy/Y_train'
# X_test_path = '/home/tappy/X_test'
# fileName = '/home/tappy/predictdata.csv'
# # X_train_path = 'C:/Users/TappyHsieh/Desktop/X_train'
def xgb_model_cv(param_set):
    mean_roc_cv = 0
    mean_F1_score_cv = 0
    mean_recall_score_cv = 0
    # Initialize the XGBClassifier with the dictionary parameters
    clf = XGBClassifier(max_depth=param_set['max_depth'],
                        learning_rate=param_set['learning_rate'],
                        n_estimators=param_set['n_estimators'],
                        silent=True,
                        objective='binary:logistic',
                        nthread=-1,
                        gamma=param_set['gamma'],
                        min_child_weight=param_set['min_child_weight'],
                        max_delta_step=0,
                        subsample=param_set['subsample'],
                        colsample_bytree=param_set['colsample_bytree'])
    for i in range(5):
        # Create the dataframes that contain all the features and labels
        train_df = pd.read_csv("subsets/train_cv_set3" + str(i) + ".csv")
        test_df = pd.read_csv("subsets/test_cv_set3" + str(i) + ".csv")
        train_df.sort_values('index', axis=0, inplace=True)
        train_df = train_df.set_index('index')
        train_label_df = train_df.pop('class')
        test_df.sort_values('index', axis=0, inplace=True)
        test_df = test_df.set_index('index')
        test_label_df = test_df.pop('class')

        # Train the classifier on the train set
        clf.fit(train_df, train_label_df)

        # Make predictions on the train set
        train_pred = clf.predict(train_df)
        train_predprob = clf.predict_proba(train_df)[:, 1]

        # Make predictions on the test set
        test_pred = clf.predict(test_df)
        test_predprob = clf.predict_proba(test_df)[:, 1]

        # Display the metrics scores on the train set
        print("ROC score", roc_auc_score(train_label_df, train_pred, average='macro'))
        print("ROC score proba", roc_auc_score(train_label_df, train_predprob, average='macro'))
        print("F1 score for training set: {:.4f}.".format(f1_score(train_label_df, train_pred, pos_label=1.0)))
        print("Recall score for training set: {:.4f}.".format(recall_score(train_label_df, train_pred, pos_label=1.0, average='binary')))

        # Display the metrics scores on the test set
        roc_score_cv = roc_auc_score(test_label_df, test_pred, average='macro')
        F1_score_cv = f1_score(test_label_df, test_pred, pos_label=1.0)
        recall_score_cv = recall_score(test_label_df, test_pred, pos_label=1.0, average='binary')
        print("ROC score cv", roc_score_cv)
        print("ROC score proba cv", roc_auc_score(test_label_df, test_predprob, average='macro'))
        print("F1 score for cv set: {:.4f}.".format(F1_score_cv))
        print("Recall score for cv set: {:.4f}.".format(recall_score_cv))

        # Accumulate the average metrics scores over the 5-fold cv tests
        mean_roc_cv += roc_score_cv / 5
        mean_F1_score_cv += F1_score_cv / 5
        mean_recall_score_cv += recall_score_cv / 5

    # Display the average metrics scores over the 5-fold cv tests
    print("mean_roc_cv", mean_roc_cv)
    print("mean_F1_score_cv", mean_F1_score_cv)
    print("mean_recall_score_cv", mean_recall_score_cv)
    score = [mean_roc_cv, mean_F1_score_cv, mean_recall_score_cv]
    return score
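# A minimal usage sketch for the helper above; the hyperparameter values are
# illustrative assumptions, not a tuned configuration:
param_set = {
    'max_depth': 5,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'gamma': 0,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
mean_roc, mean_f1, mean_recall = xgb_model_cv(param_set)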
X = train
# test = test.as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=125)
# X_train = X_train.as_matrix()
# X_test = X_test.as_matrix()
# y_train = y_train.as_matrix()
# y_test = y_test.as_matrix()

# --------------------- Creating model
xgbclassifier = XGBClassifier(n_estimators=100,
                              nthread=-1,
                              silent=False,
                              seed=125,
                              learning_rate=0.2)
# xgbmodel = xgbclassifier.fit(X_train, y_train)
xgbmodel = xgbclassifier.fit(X, Y)
# pred = xgbmodel.predict(test)
xgbmodel.score(X_test, y_test)
pred = xgbmodel.predict_proba(test)[:, 1]

# --------------------- Writing results
pred_df['click'] = pred
file_name = "Predictions\\prediction_" + str(datetime.datetime.now().date()) + "_" + \
    str(datetime.datetime.now().strftime("%H%M%S")) + ".csv"
# Bug fix: use the timestamped file_name built above instead of a hardcoded path
pred_df.to_csv(path_or_buf=file_name, index=False)
test = np.loadtxt("./stack1_test.txt")
target = pd.read_csv('./target.csv', index_col=0)
nfold = 10
outcome = target['status_group']

## Classifiers
# XGB
xgb_classifier = XGBClassifier(max_depth=7,
                               learning_rate=0.02,
                               n_estimators=200,
                               gamma=0.08,
                               min_child_weight=3,
                               subsample=0.5,
                               colsample_bytree=0.9,
                               reg_alpha=0.2,
                               objective='multi:softmax')
# Logistic regression
log_regr_classifier = LogisticRegression(C=10**(3), max_iter=800)

classifiers = [xgb_classifier, log_regr_classifier]

print("Start Stacking Models")
log_loss_eval_rec = np.zeros((nfold, len(classifiers)))
acc_eval_rec = np.zeros((nfold, len(classifiers)))
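# A minimal sketch of the fold loop the record arrays above are sized for,
# assuming stratified folds over the level-1 training features; the name
# `stack1_train` is an assumption mirroring stack1_test.txt:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=0)
for fold, (tr_idx, va_idx) in enumerate(skf.split(stack1_train, outcome)):
    for j, clf in enumerate(classifiers):
        clf.fit(stack1_train[tr_idx], outcome.iloc[tr_idx])
        proba = clf.predict_proba(stack1_train[va_idx])
        log_loss_eval_rec[fold, j] = log_loss(outcome.iloc[va_idx], proba)
        acc_eval_rec[fold, j] = accuracy_score(outcome.iloc[va_idx],
                                               clf.predict(stack1_train[va_idx]))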
df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
df_all = df_all.drop([f], axis=1)
df_all = pd.concat((df_all, df_all_dummy), axis=1)

# Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# Classifier
xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.25,
                    n_estimators=43,
                    objective='multi:softprob',
                    subsample=0.6,
                    colsample_bytree=0.6,
                    seed=0)
# xgb = gbc(n_estimators=1, learning_rate=.15, max_depth=)
# xgb = knn(n_neighbors=50)
# xgb = etc(n_estimators=60, n_jobs=2)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# Taking the 5 classes with the highest probabilities
ids = []   # list of ids
cts = []   # list of countries
pred = []  # prediction probabilities
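# A minimal sketch of filling the three lists above with the top-5 predicted
# classes per test row, assuming `id_test` holds the test ids (the name
# `id_test` is an assumption):
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    top5 = np.argsort(y_pred[i])[::-1][:5]
    cts += le.inverse_transform(top5).tolist()
    pred += y_pred[i][top5].tolist()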
def __init__(self):
    self.model = XGBClassifier(max_depth=3, n_jobs=4, random_state=42)
    self.name = 'XGB'
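# A minimal sketch of the wrapper class this constructor likely belongs to,
# assuming plain fit/predict delegation (the class name and methods are
# hypothetical):
class XGBWrapper:
    def __init__(self):
        self.model = XGBClassifier(max_depth=3, n_jobs=4, random_state=42)
        self.name = 'XGB'

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)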
def mainLoop(df, data_dir, tissue_dir):
    # Note: the directory arguments are immediately overwritten with fixed paths
    data_dir = Path("data")
    tissue_dir = Path("tissue-specific")
    results = []
    algs = []
    TISSUE_list = df['SMTS'].unique()
    for TISSUE in TISSUE_list:
        cpm, cdat = loadData(TISSUE, data_dir, tissue_dir)
        # print(cpm.shape)
        # print(cdat.shape)
        all_age = df.loc[(df['SMTS'] == TISSUE), 'AGE']
        # print(all_age.shape)
        cpm_train, cpm_test, y_train, y_test = splitData(cpm, all_age)
        c_train, c_test, y_train, y_test = splitData(cdat, all_age)
        # y_train.map({'20-29': 0, '30-39': 1, '40-49': 2, '50-59': 3, '60-69': 4, '70-79': 5})
        # y_test.map({'20-29': 0, '30-39': 1, '40-49': 2, '50-59': 3, '60-69': 4, '70-79': 5})
        y_train.replace({"20-29": 0, "30-39": 1, "40-49": 2,
                         "50-59": 3, "60-69": 4, "70-79": 5}, inplace=True)
        y_test.replace({"20-29": 0, "30-39": 1, "40-49": 2,
                        "50-59": 3, "60-69": 4, "70-79": 5}, inplace=True)
        # print(y_train.shape)
        # print(cpm_train.shape)
        # print(c_train.shape)
        keep = simpleExpressionFilter(c_train, 10)
        cpm_train = cpm_train.loc[:, keep]
        cpm_test = cpm_test.loc[:, keep]
        selector = sklearn.feature_selection.VarianceThreshold(threshold=.1)
        selector.fit(cpm_train)
        keep = selector.get_support(indices=True)
        cpm_train = cpm_train.iloc[:, keep]
        cpm_test = cpm_test.iloc[:, keep]
        xgb1 = XGBClassifier(learning_rate=0.1,
                             n_estimators=1000,
                             max_depth=5,
                             min_child_weight=1,
                             gamma=0,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             objective='multi:softprob',
                             nthread=30,
                             scale_pos_weight=1,
                             seed=1234)
        xgb1 = predOneTissue(xgb1, cpm_train, y_train)
        y_preds = xgb1.predict(cpm_test)
        score = scoreOneTissue(y_test, y_preds)
        # print(y_test, y_preds)
        results.append(score)
        algs.append(xgb1)
    results = pd.Series(results, index=TISSUE_list)
    return (algs, results)
# Disabling other pipes because we don't need them and it'll speed up this part a bit
with nlp.disable_pipes():
    docs = list(nlp.pipe(tweets))

doc_vectors = np.array([doc.vector for doc in docs])
print("doc vectors shape=", doc_vectors.shape)

X_train, X_test, y_train, y_test = train_test_split(doc_vectors,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=1,
                                                    stratify=labels)
xgb_model = None
xgb_model = XGBClassifier(n_estimators=100,
                          scale_pos_weight=SCALE_FACTOR,
                          objective='binary:logistic',
                          colsample_bytree=0.5,  # `colsample` is not a valid XGBoost parameter; removed
                          eta=0.1,
                          max_depth=8,
                          min_child_weight=6,
                          subsample=0.9)
print("Training xgb model....")
xgb_model.fit(X_train, y_train)
print("score=", xgb_model.score(X_test, y_test))  # score is a method and must be called
preds = xgb_model.predict(X_test)
print("f1 score=", f1_score(y_test, preds))  # y_true comes first
# we get an f1 score of around 0.64
def optimize_regularization_parameters ( model, x, t ):

    # Local variables.
    indent = 3
    log ( 'Optimizing regularization parameters:', indent = indent )

    # Initialize grid search parameters.
    reg_alpha_locus               = 5.0
    reg_alpha_min_index           = -3
    reg_alpha_max_index           = 3
    reg_alpha_index_stride_scale  = 0.5
    reg_alpha_index_range         = range ( reg_alpha_min_index, reg_alpha_max_index + 1, 1 )
    reg_alpha_search_domain       = [ reg_alpha_locus + ( x * reg_alpha_index_stride_scale ) for x in reg_alpha_index_range ]
    reg_lambda_locus              = 1.0
    reg_lambda_min_index          = -5
    reg_lambda_max_index          = 1
    reg_lambda_index_stride_scale = 0.1
    reg_lambda_index_range        = range ( reg_lambda_min_index, reg_lambda_max_index + 1, 1 )
    reg_lambda_search_domain      = [ reg_lambda_locus + ( x * reg_lambda_index_stride_scale ) for x in reg_lambda_index_range ]

    # Configure grid search.
    parameter_search_1 = {
        'reg_alpha'  : reg_alpha_search_domain,
        'reg_lambda' : reg_lambda_search_domain
    }

    # Perform grid search.
    grid_search_1 = GridSearchCV (
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,
            seed             = Constant.Model.SEED ),
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False )
    grid_search_1.fit ( x, t )

    # Report results.
    for e in grid_search_1.grid_scores_:
        log ( str ( e ), indent = indent + 1 )
    reg_alpha  = grid_search_1.best_params_ [ 'reg_alpha' ]
    reg_lambda = grid_search_1.best_params_ [ 'reg_lambda' ]
    log ( 'Optimal regularization alpha = ' + str ( reg_alpha ), indent = indent + 1 )
    log ( 'Optimal regularization lambda = ' + str ( reg_lambda ), indent = indent + 1 )
    log ( 'Best Score = ' + str ( grid_search_1.best_score_ ), indent = indent + 1 )

    # Update model using optimized parameters.
    model.set_params ( reg_alpha = reg_alpha )
    model.set_params ( reg_lambda = reg_lambda )

    # Return updated model.
    return model
print('Accuracy: %.4g' % metrics.accuracy_score(df['label'], pred))
print('AUC (train): %f' % metrics.roc_auc_score(df['label'], predprob))
feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature importances')
plt.ylabel('fm_score')

# %%
# 4.2. Tuning parameters
predictors = [x for x in df.columns if x not in ['label']]
clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    scale_pos_weight=1)
modelfit(clf, df2, predictors, cv_folds=10)

# %%
clf.set_params(n_estimators=193)

# %%
param1 = {'max_depth': range(3, 12, 2), 'min_child_weight': range(1, 6, 2)}
gs1 = GridSearchCV(estimator=clf,
                   param_grid=param1,
                   scoring='f1_weighted',
                   n_jobs=-1,
def optimize_subsample_and_colsample_bytree ( model, x, t ):

    # Local variables.
    indent = 3
    log ( 'Optimizing sub-sample and column sample by tree:', indent = indent )

    # Initialize search parameters.
    subsample_min           = 0.1
    subsample_max           = 0.4
    subsample_stride        = 0.01
    colsample_bytree_min    = 0.05
    colsample_bytree_max    = 0.1
    colsample_bytree_stride = 0.01

    # Configure grid search.
    parameter_search_1 = {
        'subsample'        : np.arange ( subsample_min, subsample_max, subsample_stride ),
        'colsample_bytree' : np.arange ( colsample_bytree_min, colsample_bytree_max, colsample_bytree_stride )
    }

    # Perform grid search.
    grid_search_1 = GridSearchCV (
        estimator = XGBClassifier (
            learning_rate    = Constant.Model.LEARNING_RATE,
            n_estimators     = Constant.Model.N_ESTIMATORS,
            max_depth        = Constant.Model.MAX_DEPTH,
            min_child_weight = Constant.Model.MIN_CHILD_WEIGHT,
            gamma            = Constant.Model.GAMMA,
            subsample        = Constant.Model.SUBSAMPLE,
            colsample_bytree = Constant.Model.COLSAMPLE_BYTREE,
            objective        = Constant.Model.OBJECTIVE,
            scale_pos_weight = Constant.Model.SCALE_POS_WEIGHT,
            seed             = Constant.Model.SEED ),
        param_grid = parameter_search_1,
        scoring    = Constant.Model.GridSearch.Tree.SCORING,
        cv         = Constant.Model.GridSearch.Tree.CV,
        verbose    = Constant.Model.GridSearch.Tree.VERBOSE,
        n_jobs     = 1,
        iid        = False )
    grid_search_1.fit ( x, t )

    # Report results.
    if False:  # per-candidate reporting disabled
        for e in grid_search_1.grid_scores_:
            log ( str ( e ), indent = indent + 1 )
    subsample        = grid_search_1.best_params_ [ 'subsample' ]
    colsample_bytree = grid_search_1.best_params_ [ 'colsample_bytree' ]
    log ( 'Optimal subsample = ' + str ( subsample ), indent = indent + 1 )
    log ( 'Optimal colsample_bytree = ' + str ( colsample_bytree ), indent = indent + 1 )
    log ( 'Best Score = ' + str ( grid_search_1.best_score_ ), indent = indent + 1 )

    # Update model using optimized parameters.
    model.set_params ( subsample = subsample )
    model.set_params ( colsample_bytree = colsample_bytree )

    # Return updated model.
    return model
split_size = size_1 // 4  # integer division so the slice bounds below stay integers
print("split_size: " + str(split_size))
# note: the "+ 1" lower bounds skip one row between consecutive chunks
df_All_1_a = df_All_1.iloc[0:split_size]
df_All_1_b = df_All_1.iloc[(split_size + 1):2 * split_size]
df_All_1_c = df_All_1.iloc[(2 * split_size + 1):3 * split_size]
df_All_1_d = df_All_1.iloc[(3 * split_size + 1):]
#####################################################
#####################################################
print(df_All_1_a.shape[0])
df_All_train = pd.concat([df_All_0, df_All_1_a], axis=0)
df_All_train = shuffle(df_All_train)
X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
y_train = df_All_train["label"]
clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    gamma=0.01,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='binary:logistic',
                    reg_alpha=0.1,
                    reg_lambda=0.1,
                    seed=27)
clf = clf.fit(X_train, y_train)
X_test = df_All_test.drop(["certid", "label"], axis=1, inplace=False)
pred = clf.predict(X_test).T
cerid_arr = np.array(df_All_test["certid"]).T
cerid_arr = np.vstack((cerid_arr, pred))
name_a = "test_a_" + str(i) + ".csv"
np.savetxt(name_a, cerid_arr.T, delimiter=',', fmt="%s")
###################################################################
print(df_All_1_b.shape[0])
df_All_train = pd.concat([df_All_0, df_All_1_b], axis=0)
df_All_train = shuffle(df_All_train)
X_train = df_All_train.drop(["certid", "label"], axis=1, inplace=False)
y_train = df_All_train["label"]
clf_dt = clf_dt.fit(X_train, y_train)
clf_dt_file = open("clf_dt_adaboost_mdc1.pkl", "wb")
pickle.dump(clf_dt, clf_dt_file)
clf_dt_file.close()
test_accuracy = checkAccuracy(clf_dt)
print("Test Accuracy for AdaBoost Classifier: ", test_accuracy)

# from sklearn.ensemble import GradientBoostingRegressor
# clf_dt = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1)
# print(clf_dt)
# test_accuracy = checkAccuracy(clf_dt)
# print("Test Accuracy for Gradient Boosting Classifier: ", test_accuracy)

from xgboost.sklearn import XGBClassifier

clf_dt = XGBClassifier(n_estimators=1000)
print(clf_dt)
# Fit clf to the training data
clf_dt = clf_dt.fit(X_train, y_train)
clf_dt_file = open("clf_dt_xgb_mdc1.pkl", "wb")
pickle.dump(clf_dt, clf_dt_file)
clf_dt_file.close()
test_accuracy = checkAccuracy(clf_dt)
print("Test Accuracy for XGB Classifier: ", test_accuracy)

# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# space = {
#     'n_estimators': hp.quniform('n_estimators', 5, 10, 1),
#     'learning_rate': hp.quniform('learning_rate', 0.025, 0.1, 0.025),
#     'max_depth': hp.quniform('max_depth', 1, 13, 1),
#     'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
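# `checkAccuracy` is a project helper not shown here. A minimal sketch of what
# it likely does, assuming a held-out X_test/y_test pair is in scope
# (a hypothetical implementation):
from sklearn.metrics import accuracy_score

def checkAccuracy(estimator):
    return accuracy_score(y_test, estimator.predict(X_test))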
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# Note: OneHotEncoder's categorical_features argument was removed in
# scikit-learn 0.22; newer code uses ColumnTransformer (see the sketch below).
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]  # drop one dummy column to avoid the dummy-variable trap

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting XGBoost to the Training set
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(accuracies.mean())
print(accuracies.std())
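# A minimal sketch of the same one-hot step on current scikit-learn, assuming
# the label-encoded matrix before one-hot encoding is available as `X_raw`
# (a hypothetical name for illustration):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(drop='first'), [1])],  # drop='first' replaces the manual X[:, 1:]
    remainder='passthrough')
X_encoded = ct.fit_transform(X_raw)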
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.539663321713
exported_pipeline = XGBClassifier(learning_rate=0.1, max_depth=1, n_estimators=120, silent=1.0)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
""" #%% #%% #Feature selection """ xgb_fea = XGBClassifier( learning_rate =0.3, max_depth=4, min_child_weight=1, subsample=0.8, gamma=0, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27) XGBFEA(xgb_fea, x,y,2000) """ #%% #The best estimator xgbcv = XGBClassifier(learning_rate =0.05, n_estimators=1475, max_depth=4,reg_alpha=0.01, min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=8, seed=27) xgbcv=xgbcv.fit(x_train,y_train) y_pred=xgbcv.predict(x_val) print("accuracy_score:"+str(accuracy_score(y_val,y_pred))) print(confusion_matrix(y_val,y_pred)) print(classification_report(y_val,y_pred,digits=4)) print(y_pred.sum()/len(y_pred)) #%% """ Extract 20 important features """ imp_features=['TARGET','SK_ID_CURR','EXT_SOURCE_3', 'DAYS_BIRTH', 'AMT_CREDIT', 'EXT_SOURCE_2', 'AMT_ANNUITY', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_LAST_PHONE_CHANGE', 'AMT_INCOME_TOTAL' ,'REGION_POPULATION_RELATIVE', 'YEARS_BEGINEXPLUATATION_AVG', 'HOUR_APPR_PROCESS_START', 'FLOORSMAX_AVG', 'AMT_REQ_CREDIT_BUREAU_YEAR' ,'OBS_30_CNT_SOCIAL_CIRCLE','CODE_GENDER' , 'DEF_30_CNT_SOCIAL_CIRCLE','CODE_GENDER','NAME_FAMILY_STATUS','NAME_CONTRACT_TYPE'] Imxytrain=PTrain_o.loc[:,imp_features].reindex()
def perform_tuning(X_train, Y_train):
    print("Tuning max_depth ...")
    max_depth_search_space = np.linspace(4, 14, num=6, dtype=int)  # np.int is deprecated; use the builtin int
    best_score_list = []
    for max_depth in max_depth_search_space:
        estimator = XGBClassifier(max_depth=max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=5,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_max_depth = max_depth_search_space[best_score_index]
    print("The optimal max_depth is {:d}.".format(optimal_max_depth))

    print("Tuning subsample ...")
    subsample_search_space = np.linspace(0.6, 1, num=5)
    best_score_list = []
    for subsample in subsample_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=5,
                                  subsample=subsample,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_subsample = subsample_search_space[best_score_index]
    print("The optimal subsample is {:f}.".format(optimal_subsample))

    print("Tuning min_child_weight ...")
    min_child_weight_search_space = np.linspace(1, 9, num=5)
    best_score_list = []
    for min_child_weight in min_child_weight_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=min_child_weight,
                                  subsample=optimal_subsample,
                                  colsample_bytree=0.8,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_min_child_weight = min_child_weight_search_space[best_score_index]
    print("The optimal min_child_weight is {:f}.".format(optimal_min_child_weight))

    print("Tuning colsample_bytree ...")
    colsample_bytree_search_space = np.linspace(0.6, 1, num=5)
    best_score_list = []
    for colsample_bytree in colsample_bytree_search_space:
        estimator = XGBClassifier(max_depth=optimal_max_depth,
                                  learning_rate=0.1,
                                  n_estimators=1000000,
                                  min_child_weight=optimal_min_child_weight,
                                  subsample=optimal_subsample,
                                  colsample_bytree=colsample_bytree,
                                  objective=OBJECTIVE)
        _, best_score = evaluate_estimator(estimator, X_train, Y_train)
        best_score_list.append(best_score)
    best_score_index = GET_BEST_SCORE_INDEX(best_score_list)
    optimal_colsample_bytree = colsample_bytree_search_space[best_score_index]
    print("The optimal colsample_bytree is {:f}.".format(optimal_colsample_bytree))

    optimal_parameters = [optimal_max_depth, optimal_min_child_weight,
                          optimal_subsample, optimal_colsample_bytree]
    print("The optimal parameters are as follows:")
    print(optimal_parameters)
    return optimal_parameters
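# `evaluate_estimator` and GET_BEST_SCORE_INDEX are not shown in this excerpt.
# A minimal sketch of what they might do, assuming early stopping is what
# justifies the huge n_estimators above, and an xgboost version whose fit()
# still accepts early_stopping_rounds (both implementations are assumptions):
from sklearn.model_selection import train_test_split

def evaluate_estimator(estimator, X, Y):
    X_tr, X_va, Y_tr, Y_va = train_test_split(X, Y, test_size=0.2, random_state=0)
    estimator.fit(X_tr, Y_tr, eval_set=[(X_va, Y_va)],
                  early_stopping_rounds=100, verbose=False)
    return estimator, estimator.best_score

GET_BEST_SCORE_INDEX = np.argmin  # or np.argmax, depending on the metric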
print(target_vector.shape)

categorical_df = df[['Weekday', 'DepartmentDescription']]
categorical_df = categorical_df.fillna('-1')
# Bug fix: fill each column from its own mean rather than assigning the whole frame
df['Upc'] = df['Upc'].fillna(df['Upc'].mean())
df['FinelineNumber'] = df['FinelineNumber'].fillna(df['FinelineNumber'].mean())
df = df.drop(['Weekday', 'DepartmentDescription'], axis=1)
df = df.fillna(0)

# Apply LabelEncoder
encoder = LabelEncoder()
temp_df = categorical_df.apply(encoder.fit_transform)

# Combine data
df1 = df[['VisitNumber']].values
df2 = df[['Upc']].values
df3 = df[['FinelineNumber']].values
# prints = [print(x.shape) for x in [df1, df2, df3]]
print(df)
print(temp_df.shape)
final_train_data = np.concatenate((df1, df2, df3, temp_df), axis=1)

# Train the model
model = XGBClassifier(silent=False)
print("Training the model....")
model.fit(final_train_data, target_vector)
print("Dumping model....")
f = open('walmart.pkl', 'wb+')
pickle.dump(model, f)
f.close()  # close the handle so the pickle is fully flushed to disk
x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.95,
                                                      test_size=0.05,
                                                      shuffle=False)

# In[75]:

x_test.shape, x_train.shape, test.shape, train.shape

# In[76]:

gbr = XGBClassifier(missing=np.nan,
                    learning_rate=0.15,
                    gamma=1,
                    colsample_bytree=0.8)

# In[77]:

gbr.fit(X, y)

# In[78]:

pred_cv = gbr.predict(x_valid)
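# A minimal sketch of scoring the held-out 5% slice above, assuming accuracy
# is the metric of interest:
from sklearn.metrics import accuracy_score
print("validation accuracy:", accuracy_score(y_valid, pred_cv))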
def create_and_save_model(X, y):
    # Note: early_stopping_rounds and show_progress are not XGBClassifier
    # constructor arguments in most releases (and early stopping would require
    # an eval_set at fit time), so they are dropped here.
    clf1 = XGBClassifier(learning_rate=0.1,
                         n_estimators=1000,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=0.2,
                         seed=27,
                         reg_alpha=0.4,
                         reg_lambda=1)
    clf2 = AdaBoostClassifier(n_estimators=150)

    # Initialize the base classifier
    base_cls = DecisionTreeClassifier()
    # Number of base classifiers
    num_trees = 200
    # Bagging classifier
    clf3 = BaggingClassifier(base_estimator=base_cls,
                             n_estimators=num_trees,
                             random_state=8,
                             n_jobs=-1)
    clf4 = RandomForestClassifier(bootstrap=True,
                                  class_weight={0: 2.5, 1: 1},
                                  criterion='entropy',
                                  max_depth=60,
                                  max_features="auto",
                                  max_leaf_nodes=50,
                                  min_impurity_decrease=0.0,
                                  min_impurity_split=None,
                                  min_samples_leaf=5,
                                  min_samples_split=6,
                                  min_weight_fraction_leaf=0.0,
                                  n_estimators=200,
                                  n_jobs=-1,
                                  oob_score=True,
                                  random_state=10,
                                  verbose=1,
                                  warm_start=False)
    params = {
        'n_estimators': 200,
        'max_depth': 20,
        'subsample': 0.6,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'loss': 'exponential',
        'max_features': 'auto',
        'verbose': 1
    }  # 'ccp_alpha': 0.04
    clf5 = GradientBoostingClassifier(**params)

    estimators = [('xgb', clf1), ('abc', clf2), ('bc', clf3), ('rf', clf4), ('gbc', clf5)]
    stack_estimator = XGBClassifier(learning_rate=0.1,
                                    n_estimators=300,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8,
                                    objective='binary:logistic',
                                    nthread=40,
                                    scale_pos_weight=1,
                                    seed=27,
                                    reg_alpha=0,
                                    reg_lambda=1)
    model = StackingClassifier(estimators=estimators,
                               final_estimator=stack_estimator,
                               n_jobs=-1,
                               cv=5,
                               verbose=1)
    model.fit(X, y)
    file_name = 'model_final.pkl'
    joblib.dump(model, file_name)
    return file_name
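# A minimal usage sketch for the saved artifact; `X_new` is a placeholder for
# new feature rows with the same columns used in training:
model = joblib.load('model_final.pkl')
predictions = model.predict(X_new)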