def load_architecture():
    """Assemble the soft-voting AdaBoost + XGBoost ensemble from stored parameters.

    The parameter file names are read from ``logger.config_dict`` (keys
    ``'BEST_ADA_L'`` and ``'BEST_XGB_L'``), resolved through
    ``logger.get_model_file(name, "large")`` and parsed as JSON. Each
    parameter dict is applied to a freshly constructed estimator.

    Returns:
        An unfitted ``VotingClassifier`` combining both estimators with
        equal weights and soft voting.
    """

    def _read_best_params(config_key, log_template):
        # Look up the file name, announce it, then parse the JSON payload.
        params_filename = logger.config_dict[config_key]
        logger.log(log_template.format(params_filename))
        with open(logger.get_model_file(params_filename, "large")) as handle:
            return json.load(handle)

    ada_params = _read_best_params(
        'BEST_ADA_L', "Loading params for ADA from {} ...")
    ada_model = AdaBoostClassifier(DecisionTreeClassifier())
    ada_model.set_params(**ada_params)

    xgb_params = _read_best_params(
        'BEST_XGB_L', "Loading params for XGB from {} ...")
    xgb_model = XGBClassifier()
    xgb_model.set_params(**xgb_params)

    voters = [('ADA', ada_model), ('XGB', xgb_model)]
    comb_model = VotingClassifier(estimators=voters,
                                  voting='soft',
                                  weights=[0.5, 0.5],
                                  n_jobs=-1)
    logger.log("Finish loading best architecture {}".format(comb_model))
    return comb_model
def xgmethod(X,Y):
    """Train an XGBoost classifier on a 70/30 split and return its test accuracy.

    The number of boosting rounds is selected with ``xgb.cv`` (10 stratified
    folds, AUC metric, at most 500 rounds, early stopping after 50 rounds).
    The classifier is then refit on the training split with that many trees
    and scored on the held-out split.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        Y: Labels aligned with ``X``.

    Returns:
        Accuracy of the rounded predictions on the held-out 30%.
    """
    # split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Fit the scaler on the training split only and apply it to both splits.
    # The previous code called transform() and threw the result away, so the
    # scaler never had any effect.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # XGtrain matrix
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=100,
                          objective='binary:logistic')
    xgb_param = model.get_xgb_params()

    print ('Start cross validation')
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10,
                      metrics=['auc'], early_stopping_rounds=50,
                      stratified=True, seed=1301)
    # One row per boosting round that survived early stopping.
    best_rounds = cvresult.shape[0]
    print('Best number of trees = {}'.format(best_rounds))
    model.set_params(n_estimators=best_rounds)

    print('Fit on the trainingsdata')
    model.fit(X_train, y_train, eval_metric='auc')
    pred = model.predict(X_test, ntree_limit=best_rounds)

    # make predictions for test data
    predictions = [round(value) for value in pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Fit an ``XGBClassifier`` and return its accuracy on the test split.

    Args:
        X_train, y_train: Data the classifier is fitted on.
        X_test, y_test: Data the accuracy is measured on.
        **kwargs: Estimator parameters forwarded to ``set_params``.

    Returns:
        ``accuracy_score(y_test, predictions)``.
    """
    classifier = XGBClassifier(random_state=9)
    classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted)
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit a fixed-configuration XGBClassifier, print test metrics and plot importances.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first and the number of
    rows in its result replaces ``n_estimators`` before the final fit.

    Args:
        X_train, y_train: Training data; ``.values`` and ``X_train.columns``
            are accessed, so pandas objects are expected.
        X_test, y_test: Evaluation data.
        useTrainCV: Whether to tune the tree count with ``xgb.cv``.
        cv_folds: Number of folds handed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience handed to ``xgb.cv``.

    Returns:
        ``(cvresult, alg)``. ``cvresult`` is ``None`` when ``useTrainCV`` is
        false (the previous code raised ``UnboundLocalError`` in that case).
    """
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6,
                        colsample_bytree=1.0, objective='binary:logistic',
                        nthread=4, scale_pos_weight=1, seed=27)

    # Bound up front so the return statement is valid on both paths.
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        # One row per retained boosting round.
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistic")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score (test set): %f" % metrics.f1_score(y_test, predictions))

    # Feature importances, most important first.
    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(
        by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
def return_classifier(classifier, classifier_params):
    """Return an estimator selected by name, configured with the given parameters.

    Args:
        classifier: One of ``'XGBoost'``, ``'LogisticRegression'``,
            ``'KNeighborsClassifier'``, ``'RandomForest'``, ``'DecisionTree'``,
            ``'AdaBoost'`` or ``'LinearSVC'``.
        classifier_params: Dict of estimator parameters. It is copied, never
            mutated. For ``'LinearSVC'`` it must contain ``'cv_generator'``,
            which is split off and returned separately.

    Returns:
        ``(clf, cv_generator)``. ``cv_generator`` is ``None`` for every
        classifier except ``'LinearSVC'``.

    Raises:
        ValueError: If ``classifier`` is not a supported name.
        KeyError: If ``'LinearSVC'`` is requested without ``'cv_generator'``.
    """

    def _make_xgboost():
        # Imported lazily so xgboost is only required when actually requested.
        from xgboost import XGBClassifier
        return XGBClassifier()

    # Lambdas keep the sklearn module lookups lazy, as in the if/elif chain.
    factories = {
        'XGBoost': _make_xgboost,
        'LogisticRegression': lambda: linear_model.LogisticRegression(),
        'KNeighborsClassifier': lambda: neighbors.KNeighborsClassifier(),
        'RandomForest': lambda: ensemble.RandomForestClassifier(),
        'DecisionTree': lambda: tree.DecisionTreeClassifier(),
        'AdaBoost': lambda: ensemble.AdaBoostClassifier(),
        'LinearSVC': lambda: svm.LinearSVC(),
    }
    if classifier not in factories:
        raise ValueError('Unknown classifier %r; expected one of %s'
                         % (classifier, sorted(factories)))

    cp = classifier_params.copy()
    if classifier in ('LogisticRegression', 'KNeighborsClassifier', 'RandomForest'):
        cp['n_jobs'] = -1

    cv_generator = None
    if classifier == 'LinearSVC':
        # Not an estimator parameter: hand it back to the caller instead.
        cv_generator = cp.pop('cv_generator')
    elif classifier == 'KNeighborsClassifier':
        # KNeighborsClassifier has no random_state; tolerate its absence.
        cp.pop('random_state', None)

    clf = factories[classifier]()
    clf.set_params(**cp)
    return clf, cv_generator
def get_xgb_feature_importance_plot(best_param_, experiment_, png_folder, png_fname, score_threshold=0.8):
    """Fit an XGBClassifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: Estimator parameters. A ``'model_type'`` entry, if
            present, is removed from this dict in place before use.
        experiment_: Object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: Accepted but not used; the output directory is built from
            ``Config.get_string('data.path')`` plus ``'graph'``.
        png_fname: File name of the image to write.
        score_threshold: Quantile of the positive scores used as the cut-off
            when more than 20 features have a score.

    Returns:
        ``True`` after the figure has been saved and closed.
    """
    # 1. fit the model
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    # pop() with a default replaces the former bare try/except around `del`.
    best_param_.pop('model_type', None)
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    # Newer xgboost exposes get_booster(); older releases only had booster().
    booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
    index2feature = booster.get_fscore()
    # Materialise the dict views so DataFrame receives plain sequences.
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        # Keep only features at or above the requested quantile of the
        # strictly positive scores.
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        where_str = 'score >= %f' % (cutoff)
        fis = fis.query(where_str)

    # 2. plot the feature importance
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    # 3. save
    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()
    return True
def xgbclf(params, X_train, y_train, X_test, y_test):
    """Train an XGBClassifier with early stopping, refit, and print test diagnostics.

    A first fit uses both splits as evaluation sets (AUC, 100 rounds of
    patience). ``n_estimators`` is then set to ``best_ntree_limit`` and the
    model is refit on the training data alone. A confusion matrix, a ratio
    derived from its first column, a classification report and the accuracy
    are printed, and ``get_roc`` is called with the positive-class
    probabilities.

    Returns:
        The refitted model.
    """
    watchlist = [(X_train, y_train), (X_test, y_test)]
    model = XGBClassifier(**params)
    model = model.fit(X_train, y_train,
                      eval_set=watchlist,
                      eval_metric='auc',
                      early_stopping_rounds=100,
                      verbose=100)

    # Refit with exactly the tree count selected by early stopping.
    model.set_params(n_estimators=model.best_ntree_limit)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test, ntree_limit=model.best_ntree_limit)

    abclf_cm = confusion_matrix(y_test, y_pred)
    print(abclf_cm)
    top_left = abclf_cm[0][0]
    bottom_left = abclf_cm[1][0]
    print(top_left / (top_left + bottom_left))

    print(classification_report(y_test, y_pred))
    print('\n')
    print("Model Final Generalization Accuracy: %.6f" % accuracy_score(y_test, y_pred))

    positive_proba = model.predict_proba(
        X_test, ntree_limit=model.best_ntree_limit)[:, 1]
    get_roc(y_test, positive_proba)
    return model
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Fit an ``XGBClassifier(seed=9)`` and return its accuracy on the test split.

    Any keyword arguments are applied through ``set_params`` before fitting;
    with no keyword arguments the estimator keeps its constructor defaults.
    """
    classifier = XGBClassifier(seed=9)
    if kwargs:
        classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(predicted, y_test)
def xgboost(X_train, X_test, y_train, y_test, **kwargs):
    """Fit an ``XGBClassifier(seed=9)`` configured by ``kwargs`` and score it.

    Returns:
        Accuracy of the fitted classifier on ``(X_test, y_test)``.
    """
    classifier = XGBClassifier(seed=9)
    classifier.set_params(**kwargs)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted)
def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    """Train a binary XGBoost model and wrap its booster in ``XGBClassifierSKLWrapper``.

    Args:
        X: Feature matrix; converted with ``np.asarray``.
        y: Labels; converted with ``np.asarray`` and flattened. Must contain
            exactly the values 0 and 1.
        params: Estimator parameters. ``'binary_threshold'`` (default ``.5``)
            is extracted via ``_parse_param_and_delete`` and ``'n_jobs'``
            defaults to ``min(max_cpu_count(), 24)``.
        save_path: If given, the wrapped model is pickled there.
        save_path_booster: If given, the wrapper's booster is pickled there.

    Returns:
        The ``XGBClassifierSKLWrapper`` around the trained booster.

    Raises:
        NotImplementedError: If ``y`` is not made of exactly the labels 0 and 1.
    """
    # Local import: only this function needs tempfile.
    import tempfile

    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(params, 'binary_threshold', .5)
    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(params, name='n_jobs', default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()
    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False, )
    model = clone(model)
    model.set_params(**params)
    logging.info('Training...')
    model.fit(
        X,
        y,
        verbose=True,
    )

    # Save and re-load (feature-agnostic model). The round-trip file lives in
    # a private temporary directory, so it is removed even if saving or
    # loading raises, and never lands in the working directory.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file = os.path.join(temp_dir, 'model.bin')
        model.get_booster().save_model(temp_file)
        booster = Booster(model_file=temp_file)

    if binary_threshold == 'auto':
        # Use the sorted training score whose rank equals the number of
        # negative labels as the decision threshold.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]
        logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster, features=X.shape[1], threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
class Hyperopt_xbc:
    """Hyperopt (TPE) search over XGBClassifier parameters, scored by 10-fold ROC AUC."""

    def __init__(self, X, y, seed):
        """Store the data and define the search space.

        Args:
            X: Feature matrix used for fitting and cross-validation.
            y: Labels aligned with ``X``.
            seed: Seed for the ``RandomState`` handed to ``fmin``.
        """
        self.name = 'XGBoost'
        self.name_short = 'XBC'
        self.X = X
        self.y = y
        self.seed = seed
        self.clf = None
        self.best_acc = 0
        self.space = {
            'objective': 'binary:logistic',
            # hp.choice gets explicit lists rather than range objects.
            'max_depth': hp.choice('max_depth', list(range(5, 30, 1))),
            'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
            'n_estimators': hp.choice('n_estimators', list(range(10, 500, 10))),
            'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart']),
            'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
            'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
            'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)
        }
        self.max_evals = 50

    def train_test(self, params):
        """Fit a classifier with ``params`` and return its mean 10-fold ROC AUC."""
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        self.clf = XGBClassifier(**params)
        self.clf.fit(self.X, self.y)
        return cross_val_score(self.clf, self.X, self.y, scoring='roc_auc',
                               cv=10).mean()

    def f(self, params):
        """Hyperopt objective: negated CV score, tracking the best score seen."""
        acc = self.train_test(params)
        if acc > self.best_acc:
            self.best_acc = acc
        return {'loss': -acc, 'status': STATUS_OK}

    def best(self):
        """Run the search and return the classifier fitted with the best parameters.

        Returns:
            ``(clf, name, name_short, best_params, best_acc)``.
        """
        trials = Trials()
        best = fmin(self.f, self.space, algo=tpe.suggest,
                    max_evals=self.max_evals,
                    rstate=np.random.RandomState(self.seed),
                    trials=trials)
        # fmin reports hp.choice entries as option *indices*; space_eval maps
        # them back to real values. The old code pushed the raw indices into
        # set_params, e.g. max_depth=0 or booster=2.
        best_params = space_eval(self.space, best)
        # self.clf still holds the last trial's fit, so rebuild and refit to
        # make the returned model match the returned parameters.
        self.clf = XGBClassifier(**best_params)
        self.clf.fit(self.X, self.y)
        return self.clf, self.name, self.name_short, best_params, self.best_acc
def training(self):
    """
    Training is done at each max_depth loop. XGBoost's cv is used to find the
    optimum number of trees (estimators) at each depth, up to 1000 trees. Once
    the training result doesn't improve for 50 rounds, cross-validation stops.
    The number of rounds kept by cv is used to fit the model on the train set
    again, and accuracy and mean squared error are then measured on the test
    set. The model with the highest accuracy is stored in ``self.best_xgb``.

    Reads ``self.Xtrain``, ``self.ytrain``, ``self.Xtest``, ``self.ytest`` and
    ``self._metric``; returns ``None``.
    """
    max_depth = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    best_depth = 0
    best_estimator = 0
    # Start below any reachable accuracy so the first model is always
    # recorded. Starting at 0 left min_mse and self.best_xgb unset when no
    # model scored above 0, which crashed the final print.
    max_score = float('-inf')
    min_mse = None

    # The training matrix does not depend on the depth: build it once.
    xgtrain = xgboost.DMatrix(self.Xtrain.values, label=self.ytrain.values)

    for md in max_depth:
        model = XGBClassifier(learning_rate=0.3, n_estimators=1000, max_depth=md,
                              min_child_weight=1, gamma=1, subsample=1,
                              colsample_bytree=0.1, reg_lambda=0, reg_alpha=1,
                              random_state=42)
        xgb_param = model.get_xgb_params()
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=1000,
                              early_stopping_rounds=50, nfold=8, metrics='auc',
                              stratified=True, shuffle=True, seed=42,
                              verbose_eval=False)
        n = cvresult.shape[0]
        # NOTE(review): columns 0 and 1 are read positionally; confirm they
        # are the intended mean/std columns for the installed xgboost.
        print("There are {} trees in the XGB model. CV-mean: {:.4f}, CV-std: {:.4f}.".format(
            n, cvresult.iloc[n - 1, 0], cvresult.iloc[n - 1, 1]))

        model.set_params(n_estimators=n)
        model.fit(self.Xtrain, self.ytrain, eval_metric=self._metric,
                  eval_set=[(self.Xtrain, self.ytrain), (self.Xtest, self.ytest)],
                  verbose=False)
        y_pred = model.predict(self.Xtest)
        score = accuracy_score(self.ytest, y_pred)
        mse = mean_squared_error(self.ytest, y_pred)
        if score > max_score:
            max_score = score
            min_mse = mse
            best_depth = md
            best_estimator = n
            self.best_xgb = model
        print("Accuracy score: " + str(round(score, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
        print("Mean square error: " + str(round(mse, 4)) + " at depth: " + str(md) + " and estimator " + str(n))

    print("Best score: " + str(round(max_score, 4)) + " Best MSE: " + str(round(min_mse, 4)) + " at depth: " + str(
        best_depth) + " and estimator of " + str(best_estimator))
def generateXGBoostPrediction(train, test):
    """Fit a sigmoid-calibrated XGBoost model and predict ``reordered`` for ``test``.

    Args:
        train: DataFrame holding the feature columns and the ``'reordered'``
            label column.
        test: DataFrame holding the same feature columns.

    Returns:
        DataFrame with columns ``user_id``, ``product_id`` and ``predy``
        (the predicted label for each test row).
    """
    print('\n##################\nXGBoost\n##################')
    feature_columns = [
        'orderfrequency', 'dayfrequency', 'days_without_product_order',
        'department_id', 'aisle_id', 'eval_days_since_prior_order',
        'numproductorders', 'totaluserorders', 'user_id', 'product_id'
    ]
    booster_params = {
        'objective': 'binary:logistic',
        'max_depth': 4,
        'n_estimators': 80,
        'learning_rate': 0.1,
    }

    train_features = train[feature_columns]
    test_features = test[feature_columns]
    train_labels = train['reordered']

    base_model = XGBClassifier()
    base_model.set_params(**booster_params)

    # Calibrate the classifier with Platt scaling over 5 folds.
    calibrated = CalibratedClassifierCV(base_model, method='sigmoid', cv=5)
    calibrated.fit(train_features, train_labels)
    predicted = calibrated.predict(test_features)
    print('Predict counter : %s' % (Counter(predicted)))

    result = pd.DataFrame(columns=('user_id', 'product_id', 'predy'))
    result['user_id'] = test_features['user_id']
    result['product_id'] = test_features['product_id']
    result['predy'] = predicted
    return result
def final_xgb(X_train, y_train, X_test, y_test, scale_pos_weight, best_params, analysis):
    """Fit the final XGBoost model and save its error / AUC learning curves.

    Args:
        X_train, y_train: Training data; also the first evaluation set.
        X_test, y_test: Second evaluation set.
        scale_pos_weight: Forwarded to the classifier.
        best_params: Keyword arguments for ``XGBClassifier``.
        analysis: Label inserted into the saved figure's file name.

    Returns:
        The fitted classifier. The figure is written to
        ``fig_dir + '/<analysis>_final_xgb_model.png'`` (``fig_dir`` is a
        module-level name).
    """
    xgb = XGBClassifier(**best_params)
    # The estimator parameter is `n_jobs`; the previous `njobs` spelling is
    # not a recognised parameter, so the thread count was never applied.
    xgb.set_params(n_jobs=4,
                   random_state=0,
                   objective='binary:logistic',
                   scale_pos_weight=scale_pos_weight)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["error", "auc"]
    xgb.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=0)

    # validation_0 is the training set, validation_1 the test set (eval_set order).
    results = xgb.evals_result()
    fig1, axes1 = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)

    axes1[0].plot(results['validation_0']['error'], label='Train Error')
    axes1[0].plot(results['validation_1']['error'], label='Validation Error')
    axes1[0].set_title("Final XGBoost Error")
    axes1[0].set_xlabel("Iteration")
    axes1[0].set_ylabel("Error")
    axes1[0].legend()

    axes1[1].plot(results['validation_0']['auc'], label='Train AUC-ROC')
    axes1[1].plot(results['validation_1']['auc'], label='Validation AUC-ROC')
    axes1[1].set_title("Final XGBoost AUC-ROC")
    axes1[1].set_xlabel("Iteration")
    axes1[1].set_ylabel("AUC")
    axes1[1].legend()

    fig1.tight_layout()
    fig1.savefig(fig_dir + '/{}_final_xgb_model.png'.format(analysis),
                 format='png', dpi=300, transparent=False)
    return xgb
def opt_BDT(input, output, params, show, names):
    """Tune the tree count of an XGBClassifier with ``xgb.cv`` and report test metrics.

    The data is split 80/20. ``xgb.cv`` (5 folds, AUC, early stopping after
    30 rounds) selects ``n_estimators``, the model is refit on the training
    split, and accuracy and AUC on the test split are printed.

    Args:
        input: Feature matrix.
        output: Labels aligned with ``input``.
        params: Keyword arguments for ``XGBClassifier``.
        show: If true, produce the separation, ROC and importance plots.
        names: Feature names assigned to the booster before plotting.

    Returns:
        None.
    """
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()

    X_train, X_test, y_train, y_test = train_test_split(
        input, output, test_size=0.2, random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    # One row per retained boosting round.
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")

    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)
    # The old format string ended in a bare '%', which makes the % operator
    # raise "ValueError: incomplete format".
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))

    if show:
        # `channel` and `selection` are module-level names.
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test, **kwargs):
    """Fit an XGBClassifier with early stopping and return class probabilities.

    ``self.params`` (when not ``None``) is applied to the estimator. Both the
    training and validation sets are passed as evaluation sets, with 100
    rounds of early-stopping patience. The fitted estimator is stored on
    ``self.clf``. ``**kwargs`` is accepted but not used.

    Returns:
        ``(valid_predict, test_predict)`` from ``predict_proba``.
    """
    model = XGBClassifier()
    if self.params is not None:
        model.set_params(**self.params)

    watchlist = [(X_train, y_train), (X_valid, y_valid)]
    self.clf = model.fit(X_train,
                         y_train,
                         eval_set=watchlist,
                         eval_metric=None,
                         verbose=100,
                         early_stopping_rounds=100)

    valid_predict = model.predict_proba(X_valid)
    test_predict = model.predict_proba(X_test)
    return valid_predict, test_predict
class ClassificationLearner:
    """Thin wrapper around ``XGBClassifier`` that holds out 10% of the data when fitting."""

    def __init__(self, **kwargs):
        """Create the underlying ``XGBClassifier`` with ``kwargs``."""
        self.estimator = XGBClassifier(**kwargs)
        self.fit_info = None

    # noinspection PyPep8Naming
    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=invalid-name
    def fit(self, X, y):
        """Fit the estimator with early stopping and record train/test AUC.

        A 10% hold-out split is used as the early-stopping evaluation set.
        For inputs with fewer than 10000 rows, ``search_parameters`` is run
        first and its result applied to the estimator.

        Returns:
            ``self``; ``self.fit_info`` holds the formatted AUC summary.
        """
        # If there is no evaluation data, split some.
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=42)

        small_dataset = X.shape[0] < 10000
        if small_dataset:
            tuned = search_parameters(self.estimator, x_train, y_train)
            self.estimator.set_params(**tuned)

        self.estimator.fit(x_train,
                           y_train,
                           eval_set=[(x_test, y_test)],
                           early_stopping_rounds=10,
                           verbose=False)

        # AUC on the positive-class probability, train split first.
        train_auc, test_auc = [
            sklearn.metrics.roc_auc_score(labels,
                                          self.predict_proba(features)[:, 1])
            for features, labels in ((x_train, y_train), (x_test, y_test))
        ]
        self.fit_info = 'Train/Test AUC: {:.2f}/{:.2f}'.format(
            train_auc, test_auc)
        return self

    def predict_proba(self, x):
        """Delegate to the underlying estimator's ``predict_proba``."""
        return self.estimator.predict_proba(x)

    def predict(self, x):
        """Delegate to the underlying estimator's ``predict``."""
        return self.estimator.predict(x)
def train_evaluate(training_dataset_path, validation_dataset_path, max_depth, n_estimators, output_dir):
    """Train an XGBClassifier on the combined datasets and copy the pickle to ``output_dir``.

    The training and validation CSVs are concatenated and used together for
    fitting. ``workclass`` and ``occupation`` are one-hot encoded, the
    ``income_bracket`` label is mapped to 0 for ``' <=50K'`` and 1 otherwise,
    and all features are standardised.

    Args:
        training_dataset_path: CSV path readable by ``pd.read_csv``.
        validation_dataset_path: CSV path readable by ``pd.read_csv``.
        max_depth: Tree depth; converted with ``int``.
        n_estimators: Number of trees; converted with ``int``.
        output_dir: Destination prefix handed to ``gsutil cp``; a timestamped
            sub-directory is appended.

    Raises:
        subprocess.CalledProcessError: If the ``gsutil`` copy fails.
    """
    df_train = pd.read_csv(training_dataset_path)
    df_validation = pd.read_csv(validation_dataset_path)
    df = pd.concat([df_train, df_validation])

    categorical_features = ['workclass', 'occupation']
    target = 'income_bracket'

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=categorical_features)

    # Change label to 0 if <=50K, 1 if >50K
    df[target] = df[target].apply(lambda x: 0 if x == ' <=50K' else 1)

    # Split features and labels into 2 different vars
    X_train = df.loc[:, df.columns != target]
    y_train = np.array(df[target])

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    grid = {'max_depth': int(max_depth), 'n_estimators': int(n_estimators)}
    model = XGBClassifier()
    model.set_params(**grid)
    model.fit(X_train, y_train)

    model_filename = 'xgb_model.pkl'
    # Close (and flush) the file before gsutil reads it; the old code left
    # the handle open.
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)

    EXPORT_PATH = os.path.join(
        output_dir, datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    gcs_model_path = '{}/{}'.format(EXPORT_PATH, model_filename)
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path])
    print('Saved model in: {}'.format(gcs_model_path))
def return_model_assessment(self, args):
    """Fit an XGBClassifier from a positional parameter vector and return ``1 - test F1``.

    ``args`` is read by position, in the order of the hyper-parameter names
    listed below. The fitted model and both F1 scores are appended to
    ``self.models``, ``self.train_scores`` and ``self.test_scores``.

    Returns:
        ``1 - test_score``.
    """
    hyper_param_names = (
        'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate',
        'max_delta_step', 'max_depth', 'min_child_weight', 'n_estimators',
        'reg_alpha', 'reg_lambda', 'subsample',
    )
    # Pair each name with the value at the same position in `args`.
    params = {name: args[position]
              for position, name in enumerate(hyper_param_names)}

    model = XGBClassifier(random_state=self.seed, seed=self.seed)
    model.set_params(**params)
    self.models.append(model.fit(self.X_train, self.y_train, sample_weight=None))

    predicted_train = model.predict(self.X_train)
    predicted_test = model.predict(self.X_test)
    train_score = f1_score(predicted_train, self.y_train)
    test_score = f1_score(predicted_test, self.y_test)

    self.train_scores.append(train_score)
    self.test_scores.append(test_score)
    return 1 - test_score
def get_default_xgb_model(df):
    """Fit an XGBClassifier with the project's default parameters.

    The features and labels come from
    ``hs.get_final_data(df, hs.get_data_transformer())``.

    Returns:
        The fitted classifier.
    """
    features, labels = hs.get_final_data(df, hs.get_data_transformer())
    default_params = dict(
        nthread=1,
        objective='binary:logistic',
        learning_rate=0.01,
        max_depth=8,
        min_child_weight=3,
        silent=1,
        subsample=0.8,
        colsample_bytree=0.5,
        n_estimators=1000,
        missing=-999,
        seed=1337,
    )
    classifier = XGBClassifier(verbosity=0)
    classifier.set_params(**default_params)
    classifier.fit(features, labels)
    return classifier
def modelXGBClassifier(self, trial: optuna.trial.Trial):
    """Build an unfitted XGBClassifier from Optuna suggestions plus ``self.params``.

    The classifier starts from explicit baseline values, then receives the
    trial's suggestions; entries in ``self.params`` override suggestions with
    the same name.

    Returns:
        The configured, unfitted classifier.
    """
    suggested = {
        'max_depth': trial.suggest_int("max_depth", 2, 2**4),
        'learning_rate': trial.suggest_discrete_uniform(
            'learning_rate', 0.001, 1, 0.001),
        'n_estimators': trial.suggest_int("n_estimators", 2, 2**10, log=True),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1),
        'min_child_weight': trial.suggest_loguniform(
            'min_child_weight', 1e-8, 2**10),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.1, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10),
    }

    baseline = dict(
        max_depth=3,
        learning_rate=0.1,
        n_estimators=100,
        silent=True,
        objective="binary:logistic",
        booster='gbtree',
        n_jobs=1,
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        base_score=0.5,
        random_state=0,
        missing=None,
    )
    clf = XGBClassifier(**baseline)

    # self.params is applied last so that it wins over the suggestions.
    overrides = dict(suggested)
    overrides.update(self.params)
    clf.set_params(**overrides)
    return clf
def log_xgboost(params, train_X, train_Y, test_X, test_Y):
    """Train an XGBClassifier inside an MLflow run and log params, plots, metrics and model.

    Args:
        params: Estimator parameters; each is logged and applied with
            ``set_params``.
        train_X, train_Y: Training data; ``train_Y.values.ravel()`` is used
            as the label vector.
        test_X, test_Y: Evaluation data, handled the same way.

    Returns:
        ``(model, predictions, acc, loss)``: the fitted model, its class
        predictions on ``test_X``, the accuracy, and the log loss computed
        from the predicted class probabilities.
    """
    # Local import: only this function needs os.
    import os

    train_labels = train_Y.values.ravel()
    test_labels = test_Y.values.ravel()

    with mlflow.start_run():
        for k, v in params.items():
            mlflow.log_param(k, v)
        mlflow.set_tag("state", "dev")

        xgc = XGBClassifier(objective="binary:logistic")
        xgc.set_params(**params)
        model = xgc.fit(train_X, train_labels,
                        eval_set=[(train_X, train_labels),
                                  (test_X, test_labels)],
                        eval_metric=['error', 'logloss'],
                        verbose=0)

        predictions = model.predict(test_X)
        acc = accuracy_score(test_labels, predictions)
        # log_loss is defined on probabilities. Feeding it hard class labels,
        # as before, gives a clipped value that does not reflect confidence.
        probabilities = model.predict_proba(test_X)
        loss = log_loss(test_labels, probabilities)

        ## Plots
        # The artifacts are staged under temp/; create it if it is missing.
        os.makedirs("temp", exist_ok=True)

        error_plot = plot_learning(model, "error")
        error_plot.savefig("temp/error_plot.png")
        mlflow.log_artifact("temp/error_plot.png")

        loss_plot = plot_learning(model, "logloss")
        loss_plot.savefig("temp/logloss.png")
        mlflow.log_artifact("temp/logloss.png")

        conf_mat = confusion_matrix(test_labels, predictions)
        conf_mat_plot = sns.heatmap(conf_mat, annot=True, fmt='g')
        conf_mat_plot.figure.savefig("temp/confmat.png")
        mlflow.log_artifact("temp/confmat.png")

        mlflow.log_metrics({'log_loss': loss, 'accuracy': acc})
        mlflow.xgboost.log_model(model, "model")
        print(f"Model trained with parameters: {params}")
        return model, predictions, acc, loss
logger.info('test col: %s' % (add_col)) for train_idx, test_idx in list(cv)[:1]: train_omit_idx = numpy.intersect1d(train_idx, omit_idx) logger.info('ommit size: %s %s' % (train_idx.shape[0], len(train_omit_idx))) ans = [] insample_ans = [] for i in ['']: # logger.info('model: %s' % i) cols = data.columns.values # [col for col in feature_column if 'L%s' % i in col] logger.info('model xg: %s' % i) model = XGBClassifier(seed=0) #model = RandomForestClassifier(n_jobs=-1, random_state=0) gc.collect() model.set_params(**params) model.fit(data.ix[train_idx, cols], target[train_idx]) ans = model.predict_proba(data.ix[test_idx, cols])[:, 1] insample_ans = model.predict_proba(data.ix[train_idx, cols])[:, 1] logger.info('train_end') """ if all_ans is None: all_ans = ans all_target = target[test_idx] all_ids = ids.ix[test_idx].values else: all_ans = numpy.r_[all_ans, ans] all_target = numpy.r_[all_target, target[test_idx]]
subsample=0.9, colsample_bytree=0.7, objective='multi:softprob', scale_pos_weight=1, seed=0, ) xgb_enc = OneHotEncoder(handle_unknown='ignore') xgb_enc.fit(X) # since I am working mostly on categorical features estimate_nround = False if estimate_nround: logger.info('estimating the n_estimators...') best_n_rounds = estimate_xgb_nround(xgb_model, X, y) logger.info('complete estimating the n_estimators') xgb_model.set_params(n_estimators=best_n_rounds) xgb_model.fit(xgb_enc.transform(X), y) plot_importance_matrix(xgb_model, csv_path) sys.exit() # start tunning param_grid = { ### step 1 ### # 'max_depth': [3, 5, 7, 9], # 'min_child_weight': [1, 3, 5] ### best parameter for round 1: max_depth = 5, min_child_weight = 1 ### ### step 2 ### # 'max_depth': [4, 5, 6], # 'min_child_weight': [1, 2] ### best parameter for round 2: max_depth = 5, min_child_weight = 1 ### ### step 3 ###
objective= 'multi:softprob', max_depth = 7, gamma= .2) # use the xgb interface xgb_param = clf.get_xgb_params() xgb_param['num_class'] = 5 xgb_param['eval_metric'] = 'mlogloss' Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan) cvresult = xgb.cv(xgb_param, Xg_train, num_boost_round = clf.get_params()['n_estimators'], nfold = 5, show_progress = True, early_stopping_rounds = 100) clf.set_params(n_estimators=cvresult.shape[0]) clf.fit(X_train, y_train) best_outcome_params = clf.get_params() best_outcome_score = cvresult.min() try: # predict the outcome probabilities y_pred = grid.predict_proba(X_test) except: # predict the outcome probabilities y_pred = clf.predict_proba(X_test) # Create a data frame column_names = possible_outcomes[:] idx = pd.Int64Index(np.arange(1,11457, dtype='int64'))
nfold=5, metrics='auc', early_stopping_rounds=50, seed=42 ) cvresult.head() cvresult.shape xgb_best_param = {'n_estimators': cvresult.shape[0]} xgb_best_param # best n_estimators value to be used in the stack model # update xgb with the optimal n_estimators xgb.set_params(**xgb_best_param) # #### 2. Tune max_depth and min_child_weight parameter_grid = { 'max_depth': np.arange(2, 4), 'min_child_weight': np.arange(1, 4) } grid_xgb = GridSearchCV(xgb, parameter_grid, cv=cv_splitter, n_jobs=-1) grid_xgb.fit(X_1, y) grid_xgb.best_params_ xgb_best_param.update(grid_xgb.best_params_) xgb_best_param # best parameter values to be used in the stack model
no_test=False) var_kin, var_geo = data.variables_list() skf = StratifiedKFold(n_splits=n, shuffle=True) params = { 'learning_rate': 0.05, 'n_estimators': 100, 'max_depth': 4, 'subsample': 0.5, 'n_jobs': 4, 'min_child_weight': 15 } train_params = {'early_stopping_rounds': 10, 'verbose': 0} xgb = XGBClassifier(**params) xgb.set_params(**train_params) xgb_kin = clf.Classifier(model=xgb, cv=skf, variables=var_kin, model_name='XGBoost', var_name='kinetic', fig_name='xgb', train_params=train_params) xgb_kin.fit(train) xgb_kin.check_ks_and_cvm(train, check_agreement=check_agreement, check_correlation=check_correlation) xgb_kin.predict(data=test) params = { 'learning_rate': 0.05,
# Encode the sampled training frame and the test frame with the project's
# LabelEncoding helper.
dsample = LabelEncoding(dsample)
dtest = LabelEncoding(test)

# 'AB_NICU' is the target column; every other column is used as a feature.
X_train = dsample.drop('AB_NICU', axis=1)
y_train = dsample['AB_NICU']
X_test = dtest.drop('AB_NICU', axis=1)
y_test = dtest['AB_NICU']
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

#XGBoost initial fit
# Baseline with default parameters; the error is reported as 1 - score().
xgb = XGBClassifier()
xgb.set_params(random_state=0)
xgb.fit(X_train, y_train)
print("The training error is: %.5f" % (1 - xgb.score(X_train, y_train)))
print("The test error is: %.5f" % (1 - xgb.score(X_test, y_test)))

# Commented out IPython magic to ensure Python compatibility.
# set the parameter grid
xgb_param_grid ={'learning_rate': [0.01,0.05,0.1], 'max_depth': [3,4,5,6], 'min_child_weight': [4,5,6], 'n_estimators': [100,200,300,400]}
#grid search
# 5-fold search scored on accuracy; train scores are retained in the results.
grid_search_xgb = GridSearchCV(xgb, xgb_param_grid, scoring='accuracy', cv= 5, n_jobs=-1, return_train_score = True)
# NOTE(review): the fit call below is commented out together with its %time
# magic, so the grid search is never run in this fragment — confirm intent.
# %time grid_search_xgb.fit(X_train, y_train)
max_score = -100 best_thresh = None pg = list(ParameterGrid(all_params)) for i in range(data.shape[1]): thresh, score = mcc_optimize(data[:, i], target) logger.info('model:%s, thresh: %s, total score: %s, max_score: %s' % (i, thresh, score, max_score)) for i, params in enumerate(pg): logger.info('%s/%s param: %s' % (i + 1, len(pg), params)) pred_proba_all = [] y_true = [] for train_idx, test_idx in cv: model = XGBClassifier(seed=0) #model = LogisticRegression(n_jobs=-1, class_weight='balanced') model.set_params(**params) model.fit(data[train_idx], target[train_idx], eval_metric=evalmcc_xgb_min, verbose=False) #pred_proba = data[test_idx, -1] pred_proba = model.predict_proba(data[test_idx])[:, 1] pred_proba_all = numpy.r_[pred_proba_all, pred_proba] y_true = numpy.r_[y_true, target[test_idx]] score = roc_auc_score(target[test_idx], pred_proba) #logger.info(' score: %s' % score) #thresh, score = mcc_scoring(model, data[test_idx], target[test_idx]) list_score.append(score) #logger.info(' thresh: %s' % thresh)
def rvs(self, random_state): return random_state.choice(self.support) def search(param_dict, cv_obj, X, y, n_iter=1_000, skeleton=None, scoring='neg_log_loss', **kwargs): if skeleton is None: skeleton = XGBClassifier(n_jobs=1, random_state=SEED) if 'early_stopping_rounds' in kwargs: skeleton.set_params(n_estimators=1_000) dist = {k: Uniform(v) for k, v in param_dict.items()} optim = RandomizedSearchCV( estimator=skeleton, param_distributions=dist, n_iter=n_iter, scoring=scoring, cv=cv_obj, return_train_score=True, verbose=1, n_jobs=4, random_state=SEED, )
Fitting the final XGBoost with parameters found on grid_cv. Use all training data. Test on test data. ####################################################################### """ params = best_params # params = {'colsample_bytree': 0.6, # 'learning_rate': 0.01, # 'max_depth': 3, # 'n_estimators': 250, # 'subsample': 1.0} xgb = XGBClassifier(**params) xgb.set_params(silent=True, verbosity=0, njobs=4, random_state=0, objective='binary:logistic', scale_pos_weight=scale_pos_weight) eval_set = [(X_train, y_train), (X_test, y_test)] eval_metric = ["error", "auc"] xgb.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=False) results = xgb.evals_result() fig1, ax1 = plt.subplots() ax1.plot(results['validation_0']['error'], label='Train Error')
def get_model(PARAMS):
    """Return an unfitted ``XGBClassifier`` configured with ``PARAMS``.

    Args:
        PARAMS: Mapping of estimator parameter names to values, applied
            through ``set_params``.
    """
    classifier = XGBClassifier()
    classifier.set_params(**PARAMS)
    return classifier