def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost classifier and print a report on a 30% sample.

    The function relies on names that are defined outside of it:
    ``params`` (keyword arguments for ``XGBClassifier``; its
    ``n_estimators`` entry is overwritten with the cross-validated value),
    ``data`` (a frame with a ``LABEL`` column), ``xgtrain`` (the DMatrix
    handed to ``xgb.cv``) and ``X`` / ``y`` (the data the model is fitted
    on).

    Args:
        useTrainCV: When true, choose ``n_estimators`` with ``xgb.cv`` and
            early stopping before fitting.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    alg = XGBClassifier(**params)

    # The report below is computed on a random 30% sample of `data`.
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']

    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        # n_estimators is a wrapper-level setting; recent xgboost releases
        # leave it out of get_xgb_params(), so read it from get_params().
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        best_rounds = cvresult.shape[0]
        print(best_rounds)
        alg.set_params(n_estimators=best_rounds)
        params['n_estimators'] = best_rounds
        print("best tree size is {}".format(best_rounds))

    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')

    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))

    # AUC is a ranking metric: score it with the positive-class probability,
    # not with the thresholded class labels.
    y_prob = alg.predict_proba(pX)[:, 1]
    print('auc:', metrics.roc_auc_score(py, y_prob))

    train_report = metrics.classification_report(py, y_pred)
    print(train_report)

    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
def Create_Model(X_train, X_test, y_train, y_test, learning_rate,
                 n_estimators, max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):
    """Build an XGBoost classifier, cross-validate it, and fit it.

    The classifier is configured from the hyper-parameter arguments, run
    through a 5-fold ``xgb.cv`` using ``eval_metric``, given the number of
    rounds reported by that run, and fitted on the training data.
    ``X_test`` and ``y_test`` are accepted but not used.

    Returns:
        The fitted ``XGBClassifier``.
    """
    hyper_params = {
        'learning_rate': learning_rate,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'objective': 'binary:logistic',
        'nthread': 4,
        'seed': 12,
    }
    classifier = XGBClassifier(**hyper_params)

    booster_params = classifier.get_xgb_params()
    train_matrix = xgb.DMatrix(X_train.values, label=y_train.values)
    cv_table = xgb.cv(
        booster_params,
        train_matrix,
        num_boost_round=classifier.get_params()['n_estimators'],
        nfold=5,
        metrics=eval_metric,
    )

    # One row per boosting round in the CV table.
    classifier.set_params(n_estimators=cv_table.shape[0])
    classifier.fit(X_train, y_train)
    return classifier
def xgb_model(x1, y1):
    """Fit the XGBoost model and return permutation importances.

    Splits the data 70/30, optionally rebalances the training part
    (module-level ``subsample`` / ``smote`` flags), picks the number of
    boosting rounds with 5-fold ``xgb.cv`` and early stopping, fits the
    classifier and computes importances on the held-out part.

    Returns:
        A tuple ``(imp, feats)``: the importance table with its index
        reset, and the list of features whose importance is at least
        0.0001.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)

    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight rescale: negatives per positive in the training labels.
    n_negative = np.sum(y_train["psych_hosp"].values == 0)
    n_positive = np.sum(y_train["psych_hosp"].values == 1)
    ratio = float(n_negative / n_positive)

    classifier_settings = dict(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )
    xgb1 = XGBClassifier(**classifier_settings)

    train_matrix = xgb.DMatrix(
        X_train[columns].values, label=y_train["psych_hosp"].values
    )
    cv_table = xgb.cv(
        xgb1.get_xgb_params(),
        train_matrix,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cv_table.shape[0])

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    imp = importances(xgb1, X_test, y_test).reset_index()  # permutation
    kept = imp[imp["Importance"] >= 0.0001]
    feats = list(kept["Feature"])
    return imp, feats
def cv(self, cache=False):
    """Cross-validate the XGBoost settings and pickle the result table.

    Args:
        cache: When false, 300000 rows are fetched with ``get_db_data``
            (last column is the label, the rest are features) and also
            dumped to ``catarse.txt.all`` in svmlight format. When true,
            the training matrix is loaded from
            ``catarse_recommender/common/catarse.txt.all`` instead.

    Side effects:
        Prints the CV table and writes it with ``pickle`` to
        ``catarse_recommender/common/cv_result.obj``.
    """
    if not cache:
        rows = get_db_data(300000)
        features = rows[:, :-1]
        ys = rows[:, -1]
        try:
            dump_svmlight_file(features, ys, 'catarse.txt.all')
        except Exception as inst:
            # Best effort: a failed dump must not stop the CV run.
            print(inst)
        dtrain = xgb.DMatrix(features, label=ys)
    else:
        # load file from text file, also binary buffer generated by xgboost
        dtrain = xgb.DMatrix('catarse_recommender/common/catarse.txt.all')

    xgb1 = XGBClassifier(learning_rate=0.01,
                         n_estimators=800,
                         max_depth=4,
                         nthread=8,
                         objective='binary:logistic',
                         seed=27)
    xgb_param = xgb1.get_xgb_params()
    cvresult = xgb.cv(xgb_param,
                      dtrain,
                      num_boost_round=xgb1.get_params()['n_estimators'],
                      nfold=5,
                      metrics=['logloss', 'error'],
                      early_stopping_rounds=20,
                      stratified=True,
                      shuffle=True)
    print(cvresult)

    # Context manager so the handle is flushed and closed even if the
    # dump raises.
    with open(b"catarse_recommender/common/cv_result.obj", "wb") as filehandler:
        pickle.dump(cvresult, filehandler)
n_estimators=100, # objective="gblinear", max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective="multi:softprob", # n_classes=12, nthread=4, # scale_pos_weight=1, seed=27) modelfit(xgb1, y, predictors) #预测 dtrain = xgb.DMatrix(predictors, y) params = xgb1.get_xgb_params() params['num_class'] = 12 model = xgb.train(dtrain=dtrain, params=params) dtest = xgb.DMatrix(Xtest) pred = pd.DataFrame(model.predict(dtest), index=gatest.index, columns=targetencoder.classes_) #Step 2: Tune max_depth and min_child_weight param_test1 = { 'max_depth': [7, 9, 10], #10 12 'min_child_weight': [5, 7, 9] #9 15 } gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
# `x`, `train` and `test` are bound by surrounding code that is not part of
# this excerpt; column `x` is removed from both frames.
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID', 'TARGET'], axis=1).values

# Despite its name, y_test holds the test-set IDs used for the submission.
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6815,
    colsample_bytree=0.701,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

# Pick the number of boosting rounds with 5-fold CV and early stopping.
# `verbose_eval` is the current name of the former `show_progress` keyword.
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(),
                  xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics=['auc'],
                  early_stopping_rounds=50,
                  verbose_eval=False)
xgb1.set_params(n_estimators=cvresult.shape[0])

xgb1.fit(X_train, y_train, eval_metric='auc')

# Positive-class probability for every test row, written as the submission.
output = xgb1.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({"ID": y_test, "TARGET": output})
submission.to_csv("submission.csv", index=False)
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    """Fit an XGBoost classifier and return positive-class test scores.

    ``train`` / ``labels`` are split 80/20; the 20% part is used to print
    a hold-out ROC AUC. All training, validation and test predictions use
    the same ``features`` columns, so cross-validation, fitting and
    prediction see an identical feature set.

    Args:
        train: Training frame containing at least the ``features`` columns.
        labels: Binary target aligned with ``train``.
        test: Frame to score; must contain the ``features`` columns.
        features: Column names used as model inputs.
        useTrainCV: When true, choose ``n_estimators`` with ``xgb.cv`` and
            early stopping; otherwise the configured 1000 rounds are used.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        Array with the predicted probability of the positive class for
        each row of ``test``.
    """
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(
        train, labels, test_size=test_percent, random_state=23)

    if useTrainCV:
        xgb_param = model.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train[features], y_train)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=model.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print("n_estimators=")
        print(cvresult.shape[0])
        model.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the same columns the CV run was tuned on.
    model.fit(X_train[features], y_train)

    # Hold-out predictions
    proba = model.predict_proba(X_test[features])
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    # get_booster() is the accessor used by the other snippets in this
    # file; booster() is the retired spelling.
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

    # Test predictions
    test_proba = model.predict_proba(test[features])
    test_preds = test_proba[:, 1]
    return test_preds
colsample_bytree=0.75, min_child_weight=2, eta=0.025, gamma=0, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27) isCV = True cv_folds = 3 early_stopping_rounds = 10 predictors = [x for x in train_df.columns if x not in [target, IDcol]] if isCV: xgb_param = xgb.get_xgb_params() xgtrain = xgb.DMatrix(train_df[predictors].values, label=train_df[target].values) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb.get_params()['n_estimators'], nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True) xgb.set_params(n_estimators=cvresult.shape[0]) xgb.fit(train_df[predictors], train_df['acc_now_delinq'], eval_metric='auc') dtrain_predictions = xgb.predict(train_df[predictors])
# do_cell: train a classifier for one task tuple and predict the most likely
# place_ids for its test rows.
#
# `task` is indexed as (df_train, df_test, x_start, y_start).
#
# Steps visible in the code:
#   1. Filter df_train by place_id frequency, driven by the module-level
#      settings n_places / n_places_th / n_places_percentage: keep the
#      n_places most frequent ids; or ids seen at least n_places_th times; or
#      raise the count threshold (starting at 2) while the kept share of rows
#      stays above n_places_percentage, then step back by one.
#   2. Drop place_id from df_test if present, label-encode the training
#      place_ids, and build the X / y / X_predict arrays.
#   3. When the module-level `xgb` equals 1 build an XGBClassifier (with
#      n_estimators either searched for or taken from n_estimators_fixed),
#      otherwise a RandomForestClassifier(n_estimators=300); optionally
#      compute a score on a 20% hold-out (train_test == 1) or with KFold.
#   4. Refit on all filtered rows and take the n_topx highest-probability
#      classes per test row, mapped back to place_ids.
#
# Returns [score, row_ids, labels_predict].
#
# NOTE(review): the original indentation is not visible in this excerpt, so
#   the exact nesting of the branches above should be confirmed.
# NOTE(review): `abc += 1` uses a name that is never assigned in the visible
#   code - presumably it raises when that branch runs; confirm intent.
# NOTE(review): `xgb` is compared with 1, so it appears to be a flag here
#   rather than the xgboost module - verify against the module globals.
# NOTE(review): `clf.booster()` and `KFold(len(y), n_folds=...)` look like
#   older xgboost / scikit-learn call styles - confirm against the pinned
#   library versions.
# NOTE(review): in the random-forest branch `le.transform` is applied to
#   values that already came from `le.fit_transform` - confirm this is
#   intended.
def do_cell(task): df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3] #print('do_cell', df_train.shape, df_test.shape, x_start, y_start) #train n_places_th_local = n_places_th n_places_local = n_places if n_places != 0: tmp = df_train.shape[0] value_counts = df_train.place_id.value_counts()[0:n_places] df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns] n_places_th_local = value_counts.values[n_places - 1] percentage = df_train.shape[0]/tmp elif n_places_th != 0: value_counts = df_train.place_id.value_counts() n_places_local = value_counts[value_counts >= n_places_th_local].count() mask = value_counts[df_train.place_id.values] >= n_places_th_local percentage = mask.value_counts()[True]/df_train.shape[0] df_train = df_train.loc[mask.values] else: n_places_th_local = 2 value_counts = df_train.place_id.value_counts() n_places_local = value_counts[value_counts >= n_places_th_local].count() mask = value_counts[df_train.place_id.values] >= n_places_th_local percentage = mask.value_counts()[True]/df_train.shape[0] while percentage > n_places_percentage: n_places_th_local += 1 n_places_local = value_counts[value_counts >= n_places_th_local].count() mask = value_counts[df_train.place_id.values] >= n_places_th_local percentage = mask.value_counts()[True]/df_train.shape[0] n_places_th_local -= 1 n_places_local = value_counts[value_counts >= n_places_th_local].count() mask = value_counts[df_train.place_id.values] >= n_places_th_local percentage = mask.value_counts()[True]/df_train.shape[0] df_train = df_train.loc[mask.values] #print(x_start, y_start, n_places_local, n_places_th_local, percentage) #test row_ids = df_test.index if 'place_id' in df_test.columns: df_test = df_test.drop(['place_id'], axis=1) le = LabelEncoder() y = le.fit_transform(df_train.place_id.values) X = df_train.drop(['place_id'], axis=1).values X_predict = df_test.values score = 0 n_estimators = 0 if xgb == 1: if 
xgb_calculate_n_estimators == True: clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha) if train_test == 1: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False) score = round(1 - clf.booster().best_score, 6) n_estimators = clf.booster().best_ntree_limit else: abc += 1 xgb_options = clf.get_xgb_params() xgb_options['num_class'] = n_places + 1 train_dmatrix = DMatrix(X, label=y) #some of the classes have less than n_folds, cannot use stratified KFold #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True) folds = KFold(len(y), n_folds=n_folds, shuffle=True) cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score) n_estimators = cv_results.shape[0] score = round(1 - cv_results.values[-1][0], 6) std = round(cv_results.values[-1][1], 6) else: n_estimators = n_estimators_fixed clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha) else: clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1) if rf_calculate_score == True: if train_test == 1: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) y_train2 = le.transform(y_train) y_test2 = le.transform(y_test) clf.fit(X_train, y_train2) y_predict = clf.predict_proba(X_test) scores_local = [] for i in range(X_test.shape[0]): score = calculate_score_per_row(y_predict[i], y_test2[i]) 
scores_local.append(score) score = np.array(scores_local).mean() else: #some of the classes have less than n_folds, cannot use stratified KFold #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True) folds = KFold(len(y), n_folds=n_folds, shuffle=True) scores_cv = [] for train, test in folds: X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] y_train2 = le.transform(y_train) y_test2 = le.transform(y_test) clf.fit(X_train, y_train2) y_predict = clf.predict_proba(X_test) scores_local = [] for i in range(X_test.shape[0]): score = calculate_score_per_row(y_predict[i], y_test2[i]) scores_local.append(score) score = np.array(scores_local).mean() print(' ', x_start, y_start, score) scores_cv.append(score) score = np.array(scores_cv).mean() #if few_cells == 1 or grid_search == 1: # return [score, None, None] clf.fit(X, y) y_predict = clf.predict_proba(X_predict) ##1 labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx]) print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage) return [score, row_ids, labels_predict]
# fit_xgboost: score every parameter row of `param_table` with repeated
# stratified k-fold CV and narrow `param_grid` to the best row.
#
# Parameters (as used in the visible code):
#   param_grid        dict of parameter name -> candidates; each entry is
#                     replaced by a one-element list with the best value.
#   param_table       DataFrame with one parameter combination per row; the
#                     columns Score, Score_Std and Score_Weighted are written
#                     into it (and n_estimators when find_n_estimator is set).
#   train             DataFrame holding the ID, feature and target columns.
#   col_type          dict with the keys 'target', 'features' and 'ID'.
#   find_n_estimator  when true, n_estimators is first chosen with xgb.cv
#                     (metric 'auc', early stopping after 50 rounds).
#   cv_iterations     number of repeated StratifiedKFold runs; the run index
#                     is used as the fold random_state.
#   cv_folds          folds per run (also used for xgb.cv).
#   nthread, seed     forwarded into the XGBClassifier parameters.
#   verbose           when 1, each scored row is printed.
#
# Scoring: every fold contributes a dev-fold ROC AUC (used for Score_Std);
# dev predictions are averaged per ID over all runs and the AUC of that
# average becomes Score. Score_Weighted = Score - 0.1 * Score_Std selects the
# best row.
#
# Returns (param_grid, pred_return), where pred_return maps 'Pred_<index>' to
# the per-ID averaged prediction of each parameter row.
#
# NOTE(review): the original indentation is not visible in this excerpt; in
#   particular it is unclear which `if` the final `else:` in the reporting
#   loop belongs to - confirm against the original file.
# NOTE(review): `start_time` is assigned but not read in the visible code.
# NOTE(review): `pred.at[dev_index, 'Pred'] = pred_dev` passes the positional
#   fold indices from `skf.split` to `.at`; presumably this assumes a default
#   RangeIndex on `train` - verify, `.at` is normally a scalar accessor.
# NOTE(review): the final AUC pairs `train.sort_values(ID)[target]` with the
#   groupby(ID) means, which presumably assumes one row per ID - confirm.
def fit_xgboost(param_grid, param_table, train, col_type, find_n_estimator=False, cv_iterations=5, cv_folds=5, nthread=3, seed=1, verbose=0): target = col_type['target'] features = col_type['features'] ID = col_type['ID'] start_time = strftime("%Y-%m-%d %H-%M", gmtime()) pred_return = {} for params in param_table.itertuples(index=True, name='NamedTuple'): params = params._asdict() index = params['Index'] params.pop('Index') # remove "Index" from params params['objective'] = 'binary:logistic' params['nthread'] = nthread params['random_state'] = seed params['seed'] = seed params['silent'] = True xgb_model = XGBClassifier() xgb_model.set_params(**params) if find_n_estimator: xgb_train = xgb.DMatrix(train[features], label=train[target]) cv_result = xgb.cv( xgb_model.get_xgb_params(), xgb_train, num_boost_round=int(params['n_estimators']), nfold=cv_folds, metrics='auc', early_stopping_rounds=50, seed=seed) best_n_estimator = cv_result.shape[0] param_table.at[index, 'n_estimators'] = best_n_estimator xgb_model.set_params(n_estimators=best_n_estimator) scores = [] pred_all = [] for cv_index in range(cv_iterations): pred = train.loc[:, [ID]] # get only the ID column # k-fold cross validation skf = StratifiedKFold(n_splits=cv_folds, random_state=cv_index, shuffle=True) for train_index, dev_index in skf.split(train[features].values, train[target].values): X_train = train[features].iloc[train_index].values y_train = train[target].iloc[train_index].values X_dev = train[features].iloc[dev_index].values y_dev = train[target].iloc[dev_index].values # Fit the algorithm on train folds xgb_model.fit(X_train, y_train, eval_metric='auc') # Predict on dev fold pred_dev = xgb_model.predict_proba(X_dev)[:, 1] pred.at[dev_index, 'Pred'] = pred_dev # Compute the score score = metrics.roc_auc_score(y_dev, pred_dev) scores.append(score) if len(pred_all) == 0: pred_all = pred else: pred_all = pd.concat([pred_all, pred], axis=0) pred_mean = pred_all.groupby(ID)['Pred'].mean() # avg 
predict_proba for each ID score = metrics.roc_auc_score(train.sort_values(ID)[target].values, pred_mean) # use avg pred to compute auc score pred_return['Pred_' + str(index)] = pred_mean # store the pred result for use in stacking param_table.at[index, 'Score'] = score param_table.at[index, 'Score_Std'] = np.std(scores) if verbose == 1: print('{} : {}'.format(index, param_table.iloc[index, :])) param_table["Score_Weighted"] = param_table["Score"] - 0.1 * param_table["Score_Std"] # update_param_grid best_param_index = param_table["Score_Weighted"].idxmax() print("Param_grid size: {}".format(param_table.shape[0])) print("Current Score: {}, Score_Std: {}".format(param_table.loc[best_param_index, "Score"], param_table.loc[best_param_index, "Score_Std"])) print("--------------------------") for param in param_grid: best_param = param_table.loc[best_param_index, param] if isinstance(param_grid[param], list): if len(param_grid[param]) > 1 or (len(param_grid[param]) == 1 and param_grid[param][0] != best_param): print("{}: tuned to {}".format(param, best_param)) else: print("{}: tuned to {}".format(param, best_param)) param_grid[param] = [best_param] return param_grid, pred_return
# Grid search over the L2 regularisation strength (reg_lambda) of `model`.
# The reg_alpha variant is kept for reference:
# reg_alpha = [1e-5, 1e-2, 0.1, 1]
# num_fits = len(reg_alpha)*5
# param_grid = dict(reg_alpha=reg_alpha)
reg_lambda = [1e-5, 1e-2, 0.1, 1]
num_fits = len(reg_lambda) * 5
param_grid = {'reg_lambda': reg_lambda}

kfold = StratifiedKFold(n_splits=5, shuffle=True)
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring="neg_log_loss",
    cv=kfold,
    verbose=num_fits,
)

start = time.time()
grid_result = grid_search.fit(X_train, y_train)

# Best candidate first, then the mean/std score of every candidate.
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
end = time.time()

print("parameters: {}".format(model.get_xgb_params()))
elapsed_minutes = (end - start) / 60
print("\nTotal time : {:.2f} {}".format(elapsed_minutes, "minutes"))
def xgb_model2(x1, y1, ft):
    """Fit, evaluate and plot the XGBoost model restricted to features *ft*.

    The data is split 70/30 and the training part is optionally rebalanced
    (module-level ``subsample`` / ``smote`` flags). The number of boosting
    rounds comes from 5-fold ``xgb.cv`` with early stopping, after which
    the model is scored with 10-fold cross-validation on the training part
    and evaluated on the held-out part. Test AUC and the ROC legend are
    computed from predicted probabilities, so the reported area matches the
    plotted curve.

    Side effects:
        Prints the evaluation report, saves the feature-dependence heatmap,
        the importance plot and the ROC figure under ``output/`` (file
        names end with the module-level ``outfile``) and shows the ROC
        figure.

    Args:
        x1: Feature frame; only the columns in ``ft`` are used.
        y1: Target frame with a ``psych_hosp`` column.
        ft: Feature names to keep.

    Returns:
        A tuple ``(imp, feats)``: the permutation-importance table with its
        index reset, and the list of features with negative importance.
    """
    ## Remove features that negatively impact the model - Used after xgb_mode2 is already run once
    ##Copy results from XGB2_FEATS into 'unwanted'
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i',
    #             'wbc_disease', 'proc_124', 'acute_bronch', 'hemorrhoid',
    #             'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract',
    #             'biliary_tract', 'other_bone_disease', 'med_antifungal',
    #             'spondylosis', 'secndry_malig', 'other_joint',
    #             'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform',
    #             'mood', 'nonmalig_breast', 'schizo', 'suicide',
    #             'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")
    x1 = x1.loc[:, ft]

    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)

    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight Rescale
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    # Choose the number of boosting rounds with CV and early stopping.
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values,
                          label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])

    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)

    # Hard labels feed the confusion matrix / classification report;
    # probabilities feed every AUC-related number and the ROC curve.
    xgb_predict = xgb1.predict(X_test)
    xgb_proba = xgb1.predict_proba(X_test)[:, 1]
    xgb_roc_auc = roc_auc_score(y_test, xgb_proba)

    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(xgb_roc_auc, "\n")

    imp = importances(xgb1, X_test, y_test)  # permutation
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()

    # Features whose permutation importance is negative.
    imp_ = imp[imp["Importance"] < 0.00000]
    feats = list(imp_["Feature"])

    fpr, tpr, _ = roc_curve(y_test, xgb_proba)
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()

    return imp, feats
}, { 'gamma': np.linspace(0, 0.5, 5) }, { 'n_estimators': list(range(500, 3000, 500)), }, { 'subsample': np.linspace(0, 1, 5), 'colsample_bytree': np.linspace(0, 1, 5) }, { 'reg_alpha': [0, 1e-10, 1e-5, 1e-2] }, { 'learning_rate': [0.01] # set learning rate low }, { 'n_estimators': list(range(100, 5000, 500)) } ] for i, param_space in enumerate(param_spaces): print('pct spaces tuned: ' + str(1. * i / len(param_space))) model = cv_grid(param_space, model, X, y) tuned_params_dict = model.get_xgb_params() alsDataManager.save_dict_as_json( output_dict=tuned_params_dict, output_path='xgboost_tuned_params.txt')
save_model(mod_name=model_name, alg=model, cv_res=cv_results) else: # Load the model model, cv_results = load_model(mod_name=model_name) # Print parameters display_param(mod_name=model_name, cv_res=cv_results, alg=model, grid_cv=False) # Make predictions train_pred, train_prob = predict_results(alg=model, d_train_x=train_x, d_train_y=train_y) # Retrieve parameters n_estimators = model.get_xgb_params()["n_estimators"] model_list[model_name] = [model, train_pred, train_prob] print("Test 1 Over") # ======================== Step 2.2 : Test 2 ======================== print("=>=>=> Launching test 2") model_name = "xgboost_2" test = False if test: # Define the model xgb_params = { "learning_rate": 0.1, "n_estimators": n_estimators,
# set_parameters: run the repeated XGBoost sampling experiment for one data
# set directory and write its result files.
#
# Parameters (as used in the visible code):
#   set_name    directory name; params.yaml is read from
#               working_dir/<set_name>/ and all outputs are written under it.
#   golden_set  converted with str_to_bool; when true, 10 randomly sampled
#               rows with Exo == 1 and MaxPMass > gas_giant_mass are
#               relabelled Exo = 0 before training, and their resulting
#               probabilities are written to a goldenSetProbabilities CSV.
#   input_file  CSV read with pd.read_csv; the code uses its HIP, Exo, Multi
#               and MaxPMass columns plus the feature columns listed in the
#               yaml file.
#
# Flow: for each of N_iterations the code samples N_samples rows with
# Exo == 1 and MaxPMass > gas_giant_mass and N_samples rows with Exo == 0,
# chooses n_estimators with xgb.cv (metric 'auc', early stopping), fits an
# XGBClassifier, accumulates the training AUC / precision / confusion matrix
# and feature scores, and adds the predictions for every row outside the
# training sample to the 'Sampled' and 'Predicted' columns. Afterwards
# Prob = Predicted / Sampled, the rows with Prob > .90 and Exo == 0 are
# written to a time-stamped CSV, and the accumulated frames/arrays are
# pickled or saved with np.save. No return statement is visible.
#
# NOTE(review): the original indentation is not visible in this excerpt, so
#   the extent of the `if golden:` blocks and of the iteration loop should
#   be confirmed against the original file.
# NOTE(review): `stream = open(full_path, 'r')` is not closed in the visible
#   code.
# NOTE(review): `changed2` is computed from `df` right after
#   `df = df2.copy()` and is not read afterwards in the visible code.
# NOTE(review): `df.drop(['HIP'], 1)` / `df_train.drop(['Exo'], 1)` pass the
#   axis positionally and `changeddf.append(...)` uses DataFrame.append;
#   both look like older pandas call styles - confirm against the pinned
#   pandas version. Likewise `astype(np.number)`.
# NOTE(review): `nan` is used as a bare name; presumably it is imported
#   elsewhere in the file - verify.
# NOTE(review): `feat_imp_train_normal` is reassigned on every iteration
#   from `feat_imp_train` plus the current `normalized_features`, so only
#   the last iteration's normalised row appears to survive - confirm intent.
# NOTE(review): inside the loop 'Prob' is filled from `alg.predict` (class
#   labels), and it is overwritten with Predicted / Sampled after the loop -
#   confirm that `probabilities_total` is meant to hold labels.
def set_parameters(set_name, golden_set, input_file): golden = str_to_bool(golden_set) #------------------------------------------------------------------------- #read in the directory that is being run data_dir = set_name #read in the parameters file and load it full_path = os.path.join(working_dir, "{0}".format(data_dir), 'params.yaml') stream = open(full_path, 'r') parameters = yaml.load(stream, Loader=yaml.FullLoader) #read in Hypatia data as pandas dataframe (2D structure), drop HIP numbers df = pd.read_csv(input_file) set_number = set_name #------------------------------------------------------------------------- if golden: df2 = df.copy() df2.loc[df2[(df2['Exo'] == 1) & (df2['MaxPMass'] > parameters['gas_giant_mass'])]. sample(10, random_state=np.random.RandomState()).index, 'Exo'] = 0 yy = df2.loc[df2['Exo'] == 0].index zz = df.loc[df['Exo'] == 0].index changed = [ind for ind in yy if not ind in zz] changedhips = [df['HIP'][ind] for ind in changed] df = df2.copy() yy2 = df2.loc[df2['Exo'] == 0].index zz2 = df.loc[df['Exo'] == 0].index changed2 = [ind for ind in yy2 if not ind in zz2] #------------------------------------------------------------------------- df.index = df['HIP'] df['Exo'] = df['Exo'].astype('category') #category = limited possibilities df['Multi'] = df['Multi'].astype('category') df['MaxPMass'] = df['MaxPMass'].astype(np.number) df['Sampled'] = np.zeros((df.shape[0])) df['Predicted'] = np.zeros((df.shape[0])) df = df.drop(['HIP'], 1) # Print a bunch of stuff in terminal print('Parameters used in simulation:') print('------------------------------') print('') for key in parameters.keys(): print('{0} = {1}'.format(key, parameters[key])) cv_folds = parameters['cv_folds'] early_stopping_rounds = parameters['early_stopping_rounds'] N_iterations = parameters['N_iterations'] N_samples = parameters['N_samples'] gas_giant_mass = parameters['gas_giant_mass'] features = parameters['features'] relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 
'Predicted'] #Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml if (parameters['dropnans']): df = df[relevant_columns].dropna() print('Number of samples used in simulation: {0}'.format(df.shape[0])) print('') #Define the confusion matrix and other arrays cfm = np.zeros((2, 2)) auc_score_train = [] precision_score_train = [] feat_imp_train = pd.DataFrame(columns=features) probabilities_total = pd.DataFrame(index=df.index) print('iteration \t estimators') print('---------------------------') #---------------------------XGBOOST LOOP---------------------------------------------- # Loop for all of the iterations (defined in yaml) for iteration in range(0, N_iterations): #dataframe of 200 random hosts with giant planets df_iter_with_exo = df[(df['Exo'] == 1) & (df['MaxPMass'] > gas_giant_mass)].sample( N_samples, random_state=np.random.RandomState()) #dataframe of 200 random non hosts df_iter_none_exo = df[df['Exo'] == 0].sample( N_samples, random_state=np.random.RandomState()) # make a new dataframe of the 400 star subset df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0) # make a dataframe of those stars NOT in the training set (to predict on) df_predict = df[~df.index.isin(df_train.index)] # The train dataframe with everything but the Exo column X = df_train.drop(['Exo'], 1) # The Exo column (and hips) Y = df_train.Exo # Note: Using gbtree booster alg = XGBClassifier( learning_rate= 0.1, #def=0.3, prevents overfitting and makes feature weight conservative n_estimators=1000, #number of boosted trees to fit max_depth=6, #def=6, max depth of tree/complexity min_child_weight= 1, #def=1, min weight needed to continue leaf partitioning gamma= 0, #def=0, minimum loss reduction required to make partition on a leaf subsample=0.8, #def=1, subsample ratio of the training set colsample_bytree= 0.8, #def=1, subsample ratio of columns when making each tree objective= 'binary:logistic', #def=linear, logistic regression for 
binary classification, output probability nthread= 1, #originall = 8, but issue on laptop...def=max, number of parallel threads used to run xgboost scale_pos_weight=1, #def=1, balance positive and neg weights seed=27) #def=0, random number seed #get input parameters of algorithm xgb_param = alg.get_xgb_params() #construct training set matrix xgtrain = xgb.DMatrix(X[features].values, label=Y) #cross validation (CV) of xgboost to avoid overfitting cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds) alg.set_params(n_estimators=cvresult.shape[0]) print(iteration, '\t \t', cvresult.shape[0]) alg.fit(X[features], Y, eval_metric='auc') dtrain_predictions = alg.predict(X[features]) dtrain_predprob = alg.predict_proba(X[features])[:, 1] feat_imp = alg.get_booster().get_fscore() # See how the algorithm performs on the Exo data auc_score = metrics.roc_auc_score(Y, dtrain_predprob) precision_score = metrics.precision_score(Y, dtrain_predictions) metric_score = metrics.confusion_matrix(Y, dtrain_predictions) # Weighting function to ignore the null values normalized_features = pd.DataFrame( (1 - df_train[features].isnull().sum() / df_train[features].count()) * pd.Series(alg.get_booster().get_fscore()), columns=[iteration]).T #calculate the confusion matrix feat_imp_train = pd.concat([ feat_imp_train, pd.DataFrame(feat_imp, columns=features, index=[iteration]) ]) feat_imp_train_normal = pd.concat( [feat_imp_train, normalized_features]) auc_score_train.append(auc_score) precision_score_train.append(precision_score) cfm += metric_score df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index)) df.loc[df_predict.index, 'Predicted'] += alg.predict(df_predict[features]) df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features]) values = df['Prob'] probabilities_total = pd.concat( [probabilities_total, pd.Series(values, name=str(iteration))], axis=1) if 
(not iteration % 10): probabilities_total.to_pickle( '{0}/probabilities_total.pkl'.format(data_dir)) #------------------------------------------------------------------------- # Calculate the confusion matrix cfm /= N_iterations cfm[0] /= cfm[0].sum() cfm[1] /= cfm[1].sum() # Print confusion matrix print(np.round(cfm, 3)) df['Prob'] = df['Predicted'] / df['Sampled'] ###########-------------------Output List of Planets------------------------######### #Find the stars with >90% probability of hosting a planet, with the Sampled, Predicted, and Prob columns planets = df[(df.Prob > .90) & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']] print('Number of most probable planet hosts: {0}'.format(planets.shape[0])) #Sort the stars with predicted planets and save that file planetprobs = planets.sort_values(by='Prob', ascending=False) name = data_dir + '/figures/planet_probabilities' + str( datetime.today().strftime('-%h%d-%H%M')) + '.csv' #name = data_dir+'/figures/planet_probabilities.csv' outfile = open(name, 'w') planetprobs.to_csv(outfile) outfile.close() #Create a second list with all stars in Hypatia and the probabilities planets2 = df[(df.Prob > .0) & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']] if golden: #if 10 stars were randomly taken out changeddf = pd.DataFrame([]) #make empty dataframe for star in changedhips: #loop over the 10 known planets hosts (defined at top) changeddf = changeddf.append(planets2.loc[planets2.index == star]) if planets2.loc[ planets2.index == star].empty: #catch for when a known planet host was cut (bc of abunds) temp = pd.Series([nan, nan, nan], index=['Sampled', 'Predicted', 'Prob']) temp.name = star changeddf = changeddf.append( temp) #append blank file (with star name as index) #Save golden set as a separate file with the date and time as a tag filename = '{0}/figures/goldenSetProbabilities' + str( datetime.today().strftime('-%h%d-%H%M')) + '.csv' changeddf.to_csv(filename.format(set_number), na_rep=" ") #Save the file with all of 
the probabilities planetprobs2 = planets2.sort_values(by='Prob', ascending=False) name2 = data_dir + '/figures/planet_probabilitiesAll' + str( datetime.today().strftime('-%h%d-%H%M')) + '.csv' #name2 = data_dir+'/figures/planet_probabilitiesAll.csv' outfile2 = open(name2, 'w') planetprobs2.to_csv(outfile2) outfile2.close() ###########------------------------Save Files------------------------########## print('Saving data files') #Save files feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir)) feat_imp_train_normal.to_pickle( '{0}/features_train_normal.pkl'.format(data_dir)) probabilities_total.to_pickle( '{0}/probabilities_total.pkl'.format(data_dir)) df.to_pickle('{0}/df_info_all.pkl'.format(data_dir)) np.save('{0}/auc_score_train.npy'.format(data_dir), np.array(auc_score_train)) np.save('{0}/precision_score_train.npy'.format(data_dir), np.array(precision_score_train)) np.save('{0}/cfm.npy'.format(data_dir), cfm) print('Simulation completed successfully.') if golden: print("Changed indices and HIP numbers:") print(changed) print(changedhips)
# auc: Area under the curve
# Three-class softmax model; its booster parameters drive a stratified
# 5-fold CV that tracks the multiclass error rate ('merror').
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
    num_class=3,
)

xgb_param = model.get_xgb_params()
xgtrain = xgb.DMatrix(X, y)

cv_settings = {
    'num_boost_round': 1000,  # model.get_params()['n_estimators']
    'nfold': 5,
    'metrics': 'merror',
    'early_stopping_rounds': 50,
    'stratified': True,
}
cvresult = xgb.cv(xgb_param, xgtrain, **cv_settings)

print('\ntraining error')
print(cvresult['train-merror-mean'])
print('\nvalidation error')
max_depth=10, max_delta_step=2, random_state=42) param_xgb = {'colsample_bytree': [0.7, 0.8, 0.9, 1], 'subsample': [0.9, 1]} xgbc = model_fit(xgbc, xtrain, ytrain, param_xgb, False) xgbc.fit(xtrain, ytrain) bestpred_xgb = print_feature_importance(xgbc) ypred_xgb = Test_Set_Report(xgbc, xtest, ytest) # xgb.cv is used to get the actual number of n_estimators required based on the learning rate, # it uses early_stopping_rounds to get the optimal value xdtrain = xgb.DMatrix(xtrain, label=ytrain) cvresult_xgb = xgb.cv(xgbc.get_xgb_params(), xdtrain, nfold=5, num_boost_round=5000, metrics='auc', early_stopping_rounds=50) ''' Storing the predicted results along with the actual prediction for each phone number in a csv file''' FinalResult_Python = pd.concat([FinalResult_Python, ypred_xgb], axis=1) FinalResult_Python.rename(columns={0: 'Predicted Churn'}, inplace=True) FinalResult_Python['Predicted Churn'] = FinalResult_Python[ 'Predicted Churn'].map({ 0: 'False', 1: 'True' })
'wifi_infos' ] ] #自动调参第一步,确定最优评估器个数。 xgb0 = XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, max_depth=max_depth, min_child_weight=min_child_weight, subsample=subsample, colsample_bytree=colsample_bytree, objective='multi:softmax', scale_pos_weight=1, seed=0) xgb_param = xgb0.get_xgb_params() xgb_param['num_class'] = num_class xgtrain = xgb.DMatrix(df_train[feature], label=df_train['label']) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb0.get_params()['n_estimators'], nfold=5, metrics='merror', early_stopping_rounds=20, verbose_eval=True) n_estimators = cvresult.shape[0] print("n_estimators : %s" % n_estimators) #自动调参第二步,确定最优max_depth,min_child_weight。 modelfit1(df_train, feature) #自动调参第三步,确定最优subsample,colsample_bytree。
# Test matrix: the original columns plus six extra count columns.
test_shape = (X_test_o.shape[0], X_test_o.shape[1] + 6)
X_test = np.zeros(test_shape, np.int64)
y_train, y_test = y_train_o, y_test_o

# Tally, for every value in column 0 of the training rows, how often each
# label occurs.
for idx in range(X_train_o.shape[0]):
    key = X_train_o[idx, 0]
    error_total[key, y_train_o[idx]] += 1

l = X_train_o.shape[1]

# Training rows get the tallies appended; each row's own label is then
# subtracted from its tally so the row does not count itself.
for idx in range(X_train.shape[0]):
    key = X_train_o[idx, 0]
    X_train[idx] = np.append(X_train_o[idx], error_total[key])
    X_train[idx, l + y_train_o[idx]] -= 1

# Test rows get the tallies appended unchanged.
for idx in range(X_test.shape[0]):
    key = X_test_o[idx, 0]
    X_test[idx] = np.append(X_test_o[idx], error_total[key])

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)
# dtrain = xgb.DMatrix(X_train[tr],y_train[tr])
# dvalid = xgb.DMatrix(X_train[va],y_train[va])
watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]

bst = xgb.train(
    params=xgbc.get_xgb_params(),
    dtrain=dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=50,
)

# Predict with the best iteration found by early stopping and accumulate
# the result into the rows selected by `va`.
fold_pred = bst.predict(xgb.DMatrix(X_test),
                        ntree_limit=bst.best_ntree_limit)
y_train_pred_prod[va] += fold_pred
print(bst.best_ntree_limit)
# y_test_pred_prod += bst.predict(xgb.DMatrix(X_test), ntree_limit=bst.best_ntree_limit)
#print('the roc_auc_score for train:',metrics.accuracy_score(y_test,np.argmax(y_test_pred_prod,axis=1)))

# NOTE(review): the label says roc_auc_score, but the value printed is
# accuracy_score of the arg-max class.
train_accuracy = metrics.accuracy_score(
    y, np.argmax(y_train_pred_prod, axis=1))
print('the roc_auc_score for train:', train_accuracy)
print(time.clock() - start)
# Standardise the features: the scaler is fitted on the training split only
# and the same transform is then applied to the test split.
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)
print('train.shape', x_train.shape, 'test.shape', x_test.shape)

# Baseline classifier; n_estimators=500 is only the upper bound for the
# cross-validated search below.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=1,
)
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(x_train_ss, label=y_train)

# 5-fold CV with early stopping; the number of rows in the result is used
# as the tuned number of boosting rounds.
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=xgb1.get_params()['n_estimators'],
    nfold=5,
    metrics='auc',
    early_stopping_rounds=50,
    verbose_eval=10,
)
print('n_estimators', cvresult.shape[0])

# Select the test metric by column name. Positional column 0 is not
# guaranteed to be the test AUC (the train/test column order of the frame
# returned by xgb.cv is not part of its contract), which would make the
# 'test-auc:' label wrong.
print('test-auc:', cvresult['test-auc-mean'].iloc[-1])

xgb1.set_params(n_estimators=cvresult.shape[0])
print('model', xgb1)
#n_estimators 137
# Five-class softmax classifier; n_estimators is an upper bound that the
# cross-validation below trims down.
clf = XGBClassifier(
    n_estimators=5000,           # total number of boosting iterations
    max_depth=4,                 # tree depth
    min_child_weight=1,          # ...
    subsample=0.65,
    colsample_bytree=0.7,
    learning_rate=0.01,
    objective='multi:softmax',   # loss to minimise: multi-class, predicts the class label
    num_class=5,                 # number of classes
    gamma=0,                     # penalty parameter
    reg_alpha=0.05,
    reg_lambda=0.05,
    nthread=4,
    seed=27,
)
xgb_param = clf.get_xgb_params()
xgtrain = xgb.DMatrix(X, label=y)

# Stratified 5-fold CV on multi-class log-loss with early stopping.
cvresult = xgb.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['mlogloss'],
    early_stopping_rounds=50,
    seed=1301,
)
best_n_trees = cvresult.shape[0]
print('Best number of trees = {}'.format(best_n_trees))

# Set clf's parameters to the ones matching the best number of trees.
clf.set_params(n_estimators=best_n_trees, use_label_encoder=False)
clf.fit(X, y, eval_metric='merror')

dtest_x = xgb.DMatrix(X1)
pre = clf.predict(X1)