def test_deprecate_position_arg():
    from sklearn.datasets import load_digits
    X, y = load_digits(return_X_y=True, n_class=2)
    w = y

    with pytest.warns(FutureWarning):
        xgb.XGBRegressor(3, learning_rate=0.1)
    model = xgb.XGBRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBClassifier(1, use_label_encoder=False)
    model = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBRanker('rank:ndcg', learning_rate=0.1)
    model = xgb.XGBRanker(n_estimators=1)
    group = np.repeat(1, X.shape[0])
    with pytest.warns(FutureWarning):
        model.fit(X, y, group)

    with pytest.warns(FutureWarning):
        xgb.XGBRFRegressor(1, learning_rate=0.1)
    model = xgb.XGBRFRegressor(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)

    with pytest.warns(FutureWarning):
        xgb.XGBRFClassifier(1, use_label_encoder=True)
    model = xgb.XGBRFClassifier(n_estimators=1)
    with pytest.warns(FutureWarning):
        model.fit(X, y, w)
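# A minimal sketch (reusing X, y, w from the test above, not part of it):
# passing everything by keyword is the style that avoids the FutureWarning
# the test asserts on.
model = xgb.XGBRegressor(n_estimators=3, learning_rate=0.1)
model.fit(X, y, sample_weight=w)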
def run_training(pred_df, fold):
    train_df = pred_df[pred_df.kfold != fold].reset_index(drop=True)
    valid_df = pred_df[pred_df.kfold == fold].reset_index(drop=True)

    xtrain = train_df[["lr_pred", "lr_cnt_pred", "rf_svd_pred", "gnb_pred"]].values
    xvalid = valid_df[["lr_pred", "lr_cnt_pred", "rf_svd_pred", "gnb_pred"]].values

    clf = xgb.XGBRFClassifier(use_label_encoder=False,
                              base_score=0.5,
                              colsample_bylevel=1,
                              colsample_bytree=1,
                              gamma=0,
                              learning_rate=0.1,
                              max_delta_step=0,
                              max_depth=10,
                              min_child_weight=1,
                              missing=None,
                              n_estimators=100,
                              nthread=-1,
                              objective='binary:logistic',
                              eval_metric='logloss')
    clf.fit(xtrain, train_df.is_duplicate.values)
    preds = clf.predict_proba(xvalid)[:, 1]
    auc = metrics.roc_auc_score(valid_df.is_duplicate.values, preds)
    print(f"{fold}, {auc}")
    valid_df.loc[:, "xgb_pred"] = preds
    return valid_df
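# Assumed driver for the fold routine above (5 folds is a guess; adjust to the
# actual range of the kfold column):
oof = pd.concat([run_training(pred_df, f) for f in range(5)]).reset_index(drop=True)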
def XGB(train, target, test, rf=True):
    # Pick the classifier that matches the label: the random forest variant
    # for rf=True, plain boosted trees otherwise.
    if rf:
        prtstr = "XGBRF Score"
        classifier = xgb.XGBRFClassifier()
    else:
        prtstr = "XGB Score"
        classifier = xgb.XGBClassifier()
    classifier.fit(train, target)
    print(prtstr, classifier.score(train, target))
    prediction = classifier.predict_proba(test)[:, 1]
    return prediction
def choose_ml(classifier_name, train_x, train_y, test_x, test_y):
    if classifier_name == "lr":
        print("logistic regression")
        model = LogisticRegression(solver='liblinear', max_iter=1000)
    elif classifier_name == "svm":
        print("support vector machine")
        params_grid = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }, {
            'kernel': ['linear'],
            'C': [1, 10, 100, 1000]
        }]
        model = GridSearchCV(SVC(), params_grid, cv=5)
    elif classifier_name == "dt":
        print("decision tree")
        model = tree.DecisionTreeClassifier()
    elif classifier_name == "rf":
        print("random forest")
        model = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42)
    elif classifier_name == "ann":
        print("artificial neural network")
        model = MLPClassifier(activation='logistic',
                              hidden_layer_sizes=(train_x.shape[1], train_x.shape[1] + 1, 2),
                              max_iter=500)
    elif classifier_name == "nb":
        print("naive bayes")
        model = GaussianNB()
    elif classifier_name == "knn":
        print("k nearest neighbours")
        model = KNeighborsClassifier()
    elif classifier_name == "xgb":
        print("xgboost")
        model = xgb.XGBRFClassifier()
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    f1score = metrics.f1_score(test_y, pred_y)
    return accuracy, f1score
def xgboost_classifier(x_train_tf, train_df, x_test_tfidf):
    """
    Classify data with an XGBoost random forest classifier.
    :param x_train_tf: training data represented as a count vector
    :param train_df: the training data
    :param x_test_tfidf: test data represented as a count vector
    :return: predicted labels and positive-class probabilities
    """
    model = xgb.XGBRFClassifier()
    model.fit(x_train_tf, train_df.label)
    predictions = model.predict(x_test_tfidf)
    predictions_proba = model.predict_proba(x_test_tfidf)[:, 1]
    return predictions, predictions_proba
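# Hypothetical usage, assuming a CountVectorizer named `vectorizer` and
# DataFrames with a `text` column (these names are illustrative, not from
# the source):
x_train_tf = vectorizer.fit_transform(train_df.text)
x_test_tfidf = vectorizer.transform(test_df.text)
preds, proba = xgboost_classifier(x_train_tf, train_df, x_test_tfidf)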
def select_model_train_with_valid(train_data, train_label, test_data, test_label):
    random_state = 0
    plt.figure(figsize=(12, 8))
    plt.subplots_adjust(wspace=0.7, hspace=0.5)
    y = train_label.reshape(1, -1)[0]
    y_test = test_label.reshape(1, -1)[0]
    for i in range(len(train_data)):
        X = train_data[i]
        X_test = test_data[i]
        classifiers = []
        classifiers.append(LogisticRegression(random_state=random_state))
        classifiers.append(KNeighborsClassifier())
        classifiers.append(DecisionTreeClassifier(random_state=random_state))
        classifiers.append(SVC(probability=True, random_state=random_state))
        classifiers.append(RandomForestClassifier(random_state=random_state))
        # classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), random_state=random_state))
        classifiers.append(GradientBoostingClassifier(random_state=random_state))
        classifiers.append(ExtraTreesClassifier(random_state=random_state))
        classifiers.append(lgb.LGBMClassifier(random_state=random_state))
        classifiers.append(xgb.XGBRFClassifier(random_state=random_state))
        acc = []
        for classifier in classifiers:
            clf = fit_model(classifier, X, y)
            y_pred = clf.predict(X_test)
            acc.append(accuracy_score(y_test, y_pred))
        indexes = [
            'Logistic', 'KNeighbors', 'DecisionTree', 'SVC', 'RandomForest',
            'GradientBoosting', 'ExtraTrees', 'lightgbm', 'xgbRF'
        ]
        titles = [
            "PSE-PP", "PSE-AAC", "PSE-PSSM", "AVB-PP", "AVB-AAC", "AVB-PSSM",
            "DWT-PP", "DWT-AAC", "DWT-PSSM"
        ]
        data = pd.DataFrame(acc, columns=['Acc'], index=indexes)
        print(titles[i])
        print(data)
        print()
        p = plt.subplot(3, 3, (i + 1))
        p.set_xlim([0, 1])
        p.set_title(titles[i])
        # p.set_ylabel('Model')
        g = sns.barplot(x=data['Acc'], y=data.index, data=data)
    # plt.savefig("./imgs/model_select.jpg")
    plt.show()
def trainBDT(X, y, X_val, y_val, param, min_background):
    # Train trees
    evallist = [(X, y), (X_val, y_val)]
    model = xgb.XGBRFClassifier(**param)
    model.fit(X, y.ravel(), eval_set=evallist, verbose=True)

    # Get significance data
    ypred = model.predict(X_val)
    predictions = [round(value) for value in ypred]
    accuracy = accuracy_score(y_val, predictions)
    print("The training accuracy is: {}".format(accuracy))
    conf_matrix = confusion_matrix(y_val, predictions)
    print("The confusion matrix: {}".format(conf_matrix))
    print("The precision is: {}".format(precision_score(y_val, predictions)))
    plot_BDTScore(X_val.copy(), y_val.copy(), model, min_background)
    return model, predictions
def post_train_maxfeat_rf(
    config,
    train_dataloader,
    val_dataloader,
):
    ## Get and preprocess training data to maxfeat
    if is_rank0:
        print("### Get and preprocess training data ###")
    maxfeat_list, pred_list = to_maxfeat_feature(train_dataloader, is_rank0)

    # Gather training data to rank 0
    if hvd is not None:
        maxfeat_list = maxfeat_list.tolist()
        pred_list = pred_list.tolist()
        all_maxfeat_list = MPI.COMM_WORLD.gather(maxfeat_list, root=0)
        all_pred_list = MPI.COMM_WORLD.gather(pred_list, root=0)
        if is_rank0:
            all_maxfeat_list = np.concatenate(np.array(all_maxfeat_list), axis=0)
            all_pred_list = np.concatenate(np.array(all_pred_list), axis=0)
            maxfeat_list = all_maxfeat_list
            pred_list = all_pred_list

    ## Train the post_train model
    if is_rank0:
        print("### Training the post-train model ###")
        post_train_model = xgb.XGBRFClassifier()
        post_train_model.fit(maxfeat_list, pred_list)
        accuracy = post_train_model.score(maxfeat_list, pred_list)
        print("Train Accuracy: {}".format(accuracy))
        with open(config["POST_TRAIN_MODEL_PATH"], "wb") as f:
            pickle.dump(post_train_model, f)
        print("Post-train model saved at {}.".format(
            config["POST_TRAIN_MODEL_PATH"]))
def _init_model(self, **kwargs):
    self._model_name = MODEL_XGB_RF
    self._model = xgb.XGBRFClassifier(**kwargs)
# XGBoost (tree method "hist")
regression(xgboost.XGBRegressor(**XGBOOST_HIST_PARAMS), test_fraction=0.2),
classification(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS), test_fraction=0.2),
classification_binary(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS), test_fraction=0.2),
# XGBoost (LINEAR)
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_LINEAR)),
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),
classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS_LINEAR)),
# XGBoost (RF)
regression(xgboost.XGBRFRegressor(**XGBOOST_PARAMS_RF)),
classification(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),
classification_binary(xgboost.XGBRFClassifier(**XGBOOST_PARAMS_RF)),
# XGBoost (Boosted Random Forests)
regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_BOOSTED_RF)),
classification(xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),
classification_binary(
    xgboost.XGBClassifier(**XGBOOST_PARAMS_BOOSTED_RF)),
# XGBoost (Large Trees)
regression_random(xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)),
classification_random(xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
classification_binary_random(
    xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)),
# XGBoost (Huge Trees)
#,(vali_x,vali_y)
model.fit(train_x1,
          train_y1,
          eval_set=[(train_x1, train_y1), (test_x1, test_y1)],
          eval_metric='auc')
test['prob'] = test['prob'] + model.predict_proba(test_x)[:, 1]
test['prob'] = test['prob'] / 10
test.rename(columns={'seller_id': 'merchant_id'}, inplace=True)
test[['user_id', 'merchant_id', 'prob']].to_csv('result1.csv', index=False)

# Build a stacking model; this raised the score by 0.001
train_x = train[features]
train_y = train['label']
test_x = test[features]
clf1 = xgb.XGBRFClassifier(learning_rate=0.01, n_estimators=1500, random_state=2019)
clf2 = lgb.LGBMClassifier(learning_rate=0.01, n_estimators=1500, random_state=2019)
dtc = DecisionTreeClassifier()
sclf = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=dtc)
for clf, label in zip([clf1, clf2, sclf], ['xgb', 'lgb', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, train_x, train_y, cv=10,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
########################### xgboost ############################################
##### boosted tree
# xgb = xgboost.XGBClassifier(max_depth=8, scale_pos_weight=9, n_estimators=1000)  ## n_estimators=1000, learning_rate=0.05
# xgb.fit(X=train.values.astype(np.float32), y=np.squeeze(train_labels.astype(np.float32)), early_stopping_rounds=20,
#         eval_set=[(val.values.astype(np.float32), val_labels.astype(np.float32))], verbose=True)
#
# # make predictions
# predxgb = xgb.predict(test.values.astype(np.float32))
# xgb_conf_mat = metrics.confusion_matrix(test_labels.astype(np.float32), predxgb)  ## tree methods tend to have higher false negative rates than ANN
# print(xgb_conf_mat / np.expand_dims(np.sum(xgb_conf_mat, axis=1), axis=1))
# xgb.feature_importances_

#### random forest
xgbrf = xgboost.XGBRFClassifier(max_depth=8, scale_pos_weight=9, n_estimators=100)
# xgbrf = xgboost.XGBRFClassifier(scale_pos_weight=9)
xgbrf.fit(X=train.values.astype(np.float32),
          y=np.squeeze(train_labels.astype(np.float32)),
          early_stopping_rounds=20,
          eval_set=[(val.values.astype(np.float32), val_labels.astype(np.float32))],
          verbose=True)
predxgbrf = xgbrf.predict(test.values.astype(np.float32))
xgbrf_conf_mat = metrics.confusion_matrix(
    test_labels.astype(np.float32), predxgbrf
)  ## tree methods tend to have higher false negative rates than ANN
print(xgbrf_conf_mat / np.expand_dims(np.sum(xgbrf_conf_mat, axis=1), axis=1))
# xgbrf.feature_importances_
# DataFrame.append was removed in pandas 2.0; build the frame with pd.concat instead.
df = pd.concat(
    [
        df,
        gen_df(vcf_reader_nn, flabel_nn),
        gen_df(vcf_reader_ns, flabel_ns),
        gen_df(vcf_reader_gn, flabel_gn),
        gen_df(vcf_reader_gs, flabel_gs),
        gen_df(vcf_reader_lp, flabel_lp),
    ],
    ignore_index=True,
)
# print(df)

print('Running classifier...')

# Fitting
X = df[['PRECISE', 'CIPOS', 'CIEND', 'RE', 'SVLEN', 'MAPQ', 'DEPTHPVAL']]
y = df['LABEL']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)
clf = xgb.XGBRFClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, pos_label="True"))
print("Recall:", metrics.recall_score(y_test, y_pred, pos_label="True"))
print("F1 score:", metrics.f1_score(y_test, y_pred, pos_label="True"))

callset = [i for i, x in enumerate(y_pred) if x == 'True']
print('Callset:', len(callset))
trueset = [i for i, x in enumerate(y_test) if x == 'True']
print('Trueset:', len(trueset))
intersect = [value for value in callset if value in trueset]
print('Intersect:', len(intersect))
    def fit(self, X, y):
        self.model = LassoLarsIC(criterion='aic').fit(X, y)
        return self

    def transform(self, X):
        return np.asarray(X)[:, abs(self.model.coef_) > 0]


scale_pos_weight = Counter(y_train)[0] / Counter(y_train)[1]
clf = xgb.XGBRFClassifier(objective='binary:logistic',
                          scale_pos_weight=scale_pos_weight,
                          learning_rate=0.01,
                          n_estimators=5000,
                          max_depth=10,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.3,
                          colsample_bytree=0.3,
                          reg_alpha=0.014,
                          nthread=4,
                          seed=27)
PL = Pipeline(
    steps=[('PreProcessor', StandardScaler()), ('PCA', PCA()),
           ('EmbeddedSelector', LASSOJorn()),
           ('clf', CalibratedClassifierCV(base_estimator=clf, method='sigmoid'))])
# tss = TimeSeriesSplit(n_splits=3)
# optimizer = GridSearchCV(PL, parameters, cv=tss, n_jobs=-1, verbose=10, scoring='roc_auc')
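# A minimal assumed usage of the pipeline above (X_train/y_train/X_test are
# placeholders, not defined in the source):
PL.fit(X_train, y_train)
proba = PL.predict_proba(X_test)[:, 1]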
groups = pd.DataFrame(list_groups, columns=['Severity'])
groups = groups.set_index(clin.index)
clin = clin.join(groups)

# scaling data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(prot)
X = scaler.transform(prot)
X = pd.DataFrame(X)
X.columns = prot.columns
X = X.set_index(prot.index)

groups['Severity_2'] = groups['Severity'].replace({
    "Control": 0, "Low": 0, "Moderate": 0, "Severe": 1, "Critical": 1})
y = groups['Severity_2']

# setting up xgboost; the sklearn wrapper exposes the number of rounds as
# n_estimators, not num_boosting_rounds
gbm_param_grid = {'learning_rate': [0.15, 0.20, 0.25, 0.30],
                  'n_estimators': [10, 15, 20, 25, 30],
                  'subsample': [0.2, 0.3, 0.5, 0.8, 0.9],
                  'colsample_bytree': [0.2, 0.25, 0.30, 0.35],
                  'max_depth': [2, 3, 5]}
gbm = xgb.XGBRFClassifier()
grid_acc = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                        scoring="accuracy", cv=3)
grid_acc.fit(X, y)
print("Best parameters found are", grid_acc.best_params_)
print("Best accuracy score found", grid_acc.best_score_)
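# Assumed follow-up: GridSearchCV refits the best estimator by default, so it
# can be reused directly for predictions.
best_model = grid_acc.best_estimator_
severity_pred = best_model.predict(X)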
def get_model():
    """Return model of specified type."""
    model_type = config["adu"]["model"]
    if model_type == "SVC":
        model = SVC()
    elif model_type == "LogisticRegression":
        model = LogisticRegression()
    elif model_type == "RandomForest":
        model = RandomForestClassifier()
    elif model_type == "AdaBoost":
        model = AdaBoostClassifier()
    elif model_type == "XGBoost":
        model = xgb.XGBClassifier()
    elif model_type == "XGBRF":
        model = xgb.XGBRFClassifier()
    elif model_type == "AutoML":
        model = autosklearn.classification.AutoSklearnClassifier(
            resampling_strategy="cv",
            resampling_strategy_arguments={"folds": 10},
            n_jobs=80,
            ensemble_memory_limit=10240,
            ml_memory_limit=30720,
        )
    elif model_type == "Stacking":
        cv_split = StratifiedShuffleSplit(n_splits=config["adu"]["n_splits"],
                                          test_size=0.33)
        rf_param_grid = Grid["RandomForest"]
        log_param_grid = Grid["LogisticRegression"]
        svc_param_grid = Grid["SVC"]
        ada_param_grid = Grid["AdaBoost"]
        xgb_param_grid = Grid["XGBoost"]
        xgbrf_param_grid = Grid["XGBRF"]
        train_method = config["adu"]["train_method"]
        estimator_dict = dict()
        if train_method == "GridSearch":
            estimator_dict["rf"] = GridSearchCV(
                RandomForestClassifier(),
                param_grid=rf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["log"] = GridSearchCV(LogisticRegression(),
                                                 param_grid=log_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["svc"] = GridSearchCV(SVC(),
                                                 param_grid=svc_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["ada"] = GridSearchCV(AdaBoostClassifier(),
                                                 param_grid=ada_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
            estimator_dict["xgbrf"] = GridSearchCV(
                xgb.XGBRFClassifier(),
                param_grid=xgbrf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgb"] = GridSearchCV(xgb.XGBClassifier(),
                                                 param_grid=xgb_param_grid,
                                                 cv=cv_split,
                                                 refit=True)
        elif train_method == "RandomSearch":
            estimator_dict["rf"] = RandomizedSearchCV(
                RandomForestClassifier(),
                param_distributions=rf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["log"] = RandomizedSearchCV(
                LogisticRegression(),
                param_distributions=log_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["svc"] = RandomizedSearchCV(
                SVC(), param_distributions=svc_param_grid, cv=cv_split,
                refit=True)
            estimator_dict["ada"] = RandomizedSearchCV(
                AdaBoostClassifier(),
                param_distributions=ada_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgbrf"] = RandomizedSearchCV(
                xgb.XGBRFClassifier(),
                param_distributions=xgbrf_param_grid,
                cv=cv_split,
                refit=True,
            )
            estimator_dict["xgb"] = RandomizedSearchCV(
                xgb.XGBClassifier(),
                param_distributions=xgb_param_grid,
                cv=cv_split,
                refit=True,
            )
        stacks = config["adu"]["stacking"]["estimator_stack"]
        final_est = estimator_dict[config["adu"]["stacking"]["final_estimator"]]
        passth = config["adu"]["stacking"]["passthrough"]
        single_layer = []
        for i, m in enumerate(stacks):
            if isinstance(m, list):
                sublayer = [(mo + str(i + j), estimator_dict[mo])
                            for j, mo in enumerate(m)]
                layer = StackingClassifier(
                    estimators=sublayer,
                    final_estimator=final_est,
                    n_jobs=-1,
                    passthrough=passth,
                    verbose=0,
                )
                final_est = layer
            else:
                single_layer.append((m + str(i), estimator_dict[m]))
        if len(single_layer) > 0:
            model = StackingClassifier(
                estimators=single_layer,
                final_estimator=final_est,
                n_jobs=-1,
                passthrough=passth,
                verbose=0,
            )
        else:
            model = layer
    else:
        print("Invalid model option")
        exit(1)
    return model
from sklearn import ensemble
import xgboost as xgb
from sklearn import linear_model

# ML MODELS
MODELS = {
    "randomforest": ensemble.RandomForestClassifier(
        n_estimators=200, n_jobs=-1, verbose=2),
    "extratrees": ensemble.ExtraTreesClassifier(
        n_estimators=200, n_jobs=-1, verbose=2),
    "xgboost": xgb.XGBRFClassifier(
        verbosity=2, max_depth=4, n_estimators=200, n_jobs=-1),
    "logreg": linear_model.LogisticRegression(n_jobs=-1)
}

# deep learning models:
DL_MODELS = {}
train_x = df.iloc[train_ind, :]
train_y = labels[train_ind, :]
val_x = df.iloc[val_ind, :]
val_y = labels[val_ind, :]
test_x = df.iloc[test_ind, :]
test_y = labels[test_ind, :]

xgb = xgboost.XGBRFClassifier(
    learning_rate=0.05,
    max_depth=8,
    scale_pos_weight=10,
    n_estimators=100,
    n_jobs=8,
    nthread=-1,
    subsample=.6,
    verbosity=1,
    colsample_bylevel=.9,
    colsample_bynode=.9,
    colsample_bytree=.9,
    gamma=1,
    base_score=.5,
    min_child_weight=1,
    max_delta_step=10)  ## n_estimators=1000, learning_rate=0.05
xgb.fit(X=train_x.values.astype(np.float32),
        y=np.squeeze(train_y.astype(np.float32)),
        early_stopping_rounds=50,
        eval_set=[(val_x.values.astype(np.float32), val_y.astype(np.float32))],
        verbose=True)

# make predictions
predxgb = xgb.predict(test_x.values.astype(np.float32))
    train.drop(TARGET_COL, axis=1).columns.get_loc('apache_4a_hospital_death_prob'))
print(
    train.drop(TARGET_COL, axis=1).columns.get_loc('apache_4a_icu_death_prob'))
print(
    train.drop(TARGET_COL, axis=1).columns.get_loc('apache_hospital_minus_apache_icu'))
print(
    train.drop(TARGET_COL, axis=1).columns.get_loc('apache_icu_div_apache_hospital'))
print(train.drop(TARGET_COL, axis=1).columns.get_loc('age'))
print(train.drop(TARGET_COL, axis=1).columns.get_loc('ventilated_apache'))

models = [[
    xgboost.XGBRFClassifier(n_estimators=300,
                            max_depth=50,
                            tree_method="gpu_hist",
                            verbose=10),
    xgboost.XGBRFClassifier(n_estimators=2000,
                            max_depth=12,
                            tree_method="gpu_hist",
                            n_jobs=1),
    SelectFromModel(
        CatBoostClassifier(iterations=2200,
                           depth=10,
                           objective="Logloss",
                           nan_mode="Max",
                           verbose=1000,
                           task_type="GPU")),
    SelectFromModel(
        xgboost.XGBClassifier(n_estimators=3000,
                              eta=0.02,
columns_in_train = df_train.columns
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     make_column_selector(dtype_include=object)),
    remainder='passthrough')
X = ct.fit_transform(df_train)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print(X.shape, len(X), len(y))

# Model selection
print("\n")
print("#" * 30)
print('Model selection:')
clf = xgb.XGBRFClassifier()
clf.fit(X, y)

# Model application
print("\n")
print("#" * 30)
print('Model Application with missing values:')
df_test = pd.read_csv("./data/bank_marketing_test.csv")
df_test = df_test.astype(object).replace(to_replace={"unknown": np.nan})
scores = pd.Series(index=df_test.index, dtype=np.float64)
vld_index = df_test.index[df_test[cat_columns].notnull().all(axis=1)]
df_test_val = df_test.dropna(subset=cat_columns, inplace=False)
assert len(vld_index) == len(df_test_val)
df_test_val = df_test_val[columns_in_train]
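# Assumed continuation (a sketch, not from the source): encode the complete
# test rows with the fitted transformer and store their positive-class scores.
X_test_val = ct.transform(df_test_val)
scores.loc[vld_index] = clf.predict_proba(X_test_val)[:, 1]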
                     groups=uuid_groups)
dump(clf.get_params(), "params_separated_lr.joblib")
param_dict = load("params_separated_lr.joblib")
print(param_dict)
clf.fit(X_train_clean, y_train)
y_pred = clf.predict(X_test_clean)
print(
    "Balanced accuracy LR: ",
    balanced_accuracy_score(y_test.T, y_pred, average="macro", zero_default=0))

rf_clf = xgb.XGBRFClassifier(max_depth=12,
                             n_estimators=200,
                             tree_method="gpu_hist",
                             objective="binary:logistic")
clf = FlexOneVsRestClassifier(rf_clf, n_estimators=y_train.shape[1])
bounds = {"max_depth": (8, 15), "colsample_bynode": (0.5, 0.9)}
clf.tune_hyperparams(X=X_train,
                     y=y_train,
                     bounds=bounds,
                     metric=single_balanced_accuracy_score,
                     init_points=6,
                     n_iter=9,
                     int_params=["max_depth"],
                     groups=uuid_groups)
dump(clf.get_params(), "params_separated_rf.joblib")
param_dict = load("params_separated_rf.joblib")
                                     ('GNB', model4)],
                         voting='soft')
eclf3 = VotingClassifier(estimators=[('dt', model1), ('lr', model3),
                                     ('GNB', model4)],
                         voting='soft')
eclf4 = VotingClassifier(estimators=[('knn', model2), ('lr', model3),
                                     ('GNB', model4)],
                         voting='soft')

# XGBoost
xgb_model1 = xgb.XGBClassifier(objective="binary:logistic",
                               seed=42,
                               learning_rate=0.01)
xgb_model2 = xgb.XGBRFClassifier(n_estimators=100,
                                 subsample=0.9,
                                 colsample_bynode=0.2)
gb_model = GradientBoostingClassifier(random_state=34)
catboost_model = CatBoostClassifier()

# Stacking different models and using logistic regression as a meta classifier
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = LogisticRegression(random_state=1, max_iter=300)
lr = lr_model
sclf1 = StackingClassifier(classifiers=[clf1, clf2, clf3],
                           use_probas=True,
def test_evaluation_metric():
    from sklearn.datasets import load_diabetes, load_digits
    from sklearn.metrics import mean_absolute_error
    X, y = load_diabetes(return_X_y=True)
    n_estimators = 16

    with tm.captured_output() as (out, err):
        reg = xgb.XGBRegressor(
            tree_method="hist",
            eval_metric=mean_absolute_error,
            n_estimators=n_estimators,
        )
        reg.fit(X, y, eval_set=[(X, y)])
        lines = out.getvalue().strip().split('\n')

    assert len(lines) == n_estimators
    for line in lines:
        assert line.find("mean_absolute_error") != -1

    def metric(predt: np.ndarray, Xy: xgb.DMatrix):
        y = Xy.get_label()
        return "m", np.abs(predt - y).sum()

    with pytest.warns(UserWarning):
        reg = xgb.XGBRegressor(
            tree_method="hist",
            n_estimators=1,
        )
        reg.fit(X, y, eval_set=[(X, y)], eval_metric=metric)

    def merror(y_true: np.ndarray, predt: np.ndarray):
        n_samples = y_true.shape[0]
        assert n_samples == predt.size
        errors = np.zeros(y_true.shape[0])
        errors[y_true != predt] = 1.0
        return np.sum(errors) / n_samples

    X, y = load_digits(n_class=10, return_X_y=True)

    clf = xgb.XGBClassifier(use_label_encoder=False,
                            tree_method="hist",
                            eval_metric=merror,
                            n_estimators=16,
                            objective="multi:softmax")
    clf.fit(X, y, eval_set=[(X, y)])
    custom = clf.evals_result()

    clf = xgb.XGBClassifier(use_label_encoder=False,
                            tree_method="hist",
                            eval_metric="merror",
                            n_estimators=16,
                            objective="multi:softmax")
    clf.fit(X, y, eval_set=[(X, y)])
    internal = clf.evals_result()

    np.testing.assert_allclose(custom["validation_0"]["merror"],
                               internal["validation_0"]["merror"],
                               atol=1e-6)

    clf = xgb.XGBRFClassifier(
        use_label_encoder=False,
        tree_method="hist",
        n_estimators=16,
        objective=tm.softprob_obj(10),
        eval_metric=merror,
    )
    with pytest.raises(AssertionError):
        # shape check inside the `merror` function
        clf.fit(X, y, eval_set=[(X, y)])
from sklearn import ensemble
import xgboost as xgb
from sklearn import linear_model

MODELS = {
    "randomforest_classifier":
    ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2),
    "randomforest_regressor":
    ensemble.RandomForestRegressor(n_estimators=200, n_jobs=-1, verbose=2),
    "xgb_classifier":
    xgb.XGBRFClassifier(
        learning_rate=1,
        subsample=0.9,
    ),
    "xgb_regressor":
    xgb.XGBRFRegressor(learning_rate=1, subsample=0.9),
    "logistic_regressor":
    linear_model.LogisticRegression(
        penalty='elasticnet',
        l1_ratio=0.5,  # required with penalty='elasticnet'; 0.5 is an assumed midpoint
        fit_intercept=True,
        class_weight='balanced',
        random_state=42,
        solver='saga',
        verbose=2,
        n_jobs=-1,
    )
    # TODO: add more models here
}
# xgb.XGBRFClassifier(learning_rate=1,)
# For this case, we want objective='binary:hinge'
# Relevant parameters:
# n_estimators = number of trees in the random forest to fit (RF only)
# max_depth = maximum tree depth for base learners
# learning_rate = (float) "eta" in xgb, boosted learning rate
# objective = learning task and objective, or custom objective function
# booster = 'gbtree', 'gblinear', or 'dart'
# tree_method = ??? (leave as default for now)
# n_threads = number of threads to use
# gamma = (float) minimum loss reduction required to make a further partition
# min_child_weight = minimum sum of instance weight needed in a child
# missing = value to treat as missing
# num_parallel_tree = used for random forest

xgb_rf_model = xgb.XGBRFClassifier(objective="binary:hinge", missing=-1)
xgb_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [4, 6, 8, 10, 12],
    "learning_rate": [0.2, 0.3, 0.5, 0.75, 1, 1.25, 1.5, 2],
    "num_parallel_tree": [50, 100, 200],
    "gamma": [0, 0.1, 0.25],
    "min_child_weight": [0.5, 1, 2]
}
xgb_grid_rf_clf = model_selection.GridSearchCV(xgb_rf_model, xgb_param_grid, n_jobs=4)
# xgb_grid_rf_clf = xgb_rf_model
xgb_grid_rf_clf.fit(xgb_x_train, xgb_y_train)
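# Note (not from the source): XGBRFClassifier runs a single boosting round and
# maps n_estimators to num_parallel_tree internally, so searching over both,
# as the grid above does, is redundant. A leaner assumed grid:
xgb_param_grid_slim = {
    "n_estimators": [50, 100, 200],
    "max_depth": [4, 8, 12],
}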
def test_xgb_base_module(root_client: sy.VirtualMachineClient) -> None:
    sy.load("xgboost")
    sy.load("numpy")

    # third party
    import numpy as np
    import xgboost as xgb

    xgb_remote = root_client.xgboost

    # import xgboost as xgb
    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([0, 0, 1, 1])

    param = {"eta": 0.3, "max_depth": 3, "num_class": 3}
    steps = 20

    D_train = xgb.DMatrix(X, label=y)
    model = xgb.train(param, D_train, steps)
    preds = model.predict(D_train)

    D_train = xgb_remote.DMatrix(X, label=y)
    model = xgb_remote.train(param, D_train, steps)
    preds_remote = model.predict(D_train).get()

    classifier = xgb_remote.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )
    classifier.fit(X, y)
    y_pred_classifier_remote = classifier.predict(X).get()

    classifier = xgb.XGBClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )
    classifier.fit(X, y)
    y_pred_classifier = classifier.predict(X)

    classifier = xgb_remote.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )
    classifier.fit(X, y)
    y_pred_classifier_rf_remote = classifier.predict(X).get()

    classifier = xgb.XGBRFClassifier(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3, use_label_encoder=False
    )
    classifier.fit(X, y)
    y_pred_classifier_rf = classifier.predict(X)

    regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor = regressor.predict(X)

    regressor = xgb_remote.XGBRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_remote = regressor.predict(X).get()

    regressor = xgb.XGBRFRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
    regressor.fit(X, y)
    y_pred_regressor_rf = regressor.predict(X)

    regressor = xgb_remote.XGBRFRegressor(
        n_estimators=100, reg_lambda=1, gamma=0, max_depth=3
    )
    regressor.fit(X, y)
    y_pred_regressor_rf_remote = regressor.predict(X).get()

    assert np.array_equal(y_pred_classifier_rf, y_pred_classifier_rf_remote)
    assert np.array_equal(y_pred_regressor_rf, y_pred_regressor_rf_remote)
    assert np.array_equal(y_pred_regressor, y_pred_regressor_remote)
    assert np.array_equal(y_pred_classifier, y_pred_classifier_remote)
    assert np.array_equal(preds_remote, preds)
def run_training(fold_):
    total_roc = []
    total_conf = []

    t0 = time.time()
    # df = pd.read_csv("../input/embedded_train_tiny_folds.csv")
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    # print("tg\n", df.target.value_counts())
    t1 = time.time()
    print("time to read file", t1 - t0)

    print(f"fold: {fold_}")
    t0 = time.time()
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    # print("train shape\n", train_df.shape)
    # print("test shape\n", test_df.shape)

    # features
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)

    # Standard scaler
    # sc = StandardScaler()
    # sc.fit(xtrain)
    # xtrain = sc.transform(xtrain)
    # xtest = sc.transform(xtest)

    # target: first make it binary
    train_df.target = train_df.target.apply(lambda x: 'open' if x == 'open' else 'closed')
    test_df.target = test_df.target.apply(lambda x: 'open' if x == 'open' else 'closed')

    # Encode labels
    le = preprocessing.LabelEncoder()
    le.fit(train_df.target)
    # print(le.classes_)
    ytrain = le.transform(train_df.target)
    ytest = le.transform(test_df.target)

    print("now do SMOTE")
    # define pipeline
    # over = RandomOverSampler(sampling_strategy=0.032, random_state=0)
    over = SMOTE(sampling_strategy=0.8, n_jobs=-1)
    under = RandomUnderSampler(sampling_strategy=0.9)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    # transform the dataset
    X_res, y_res = pipeline.fit_resample(xtrain, ytrain)
    # X_res, y_res = xtrain, ytrain
    print("Before sampling %s" % Counter(ytrain))
    print('Resampled dataset shape %s' % Counter(y_res))

    # model
    model = xgb.XGBRFClassifier(use_label_encoder=False,
                                scale_pos_weight=0.9,
                                n_estimators=70,
                                max_depth=6,
                                n_jobs=-1,
                                subsample=0.4,
                                num_parallel_tree=20,
                                eval_metric='logloss',
                                tree_method='auto',
                                objective='reg:logistic',
                                gamma=.1,
                                min_child_weight=6,
                                booster='dart',
                                eta=0.8)

    # fit the model on training data
    model.fit(X_res, y_res)

    # make predictions
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]
    # print('preds shape', preds_proba.shape)
    t1 = time.time()
    print('time to fit model:', t1 - t0)

    accuracy_score = np.sum(preds == ytest) / len(ytest)
    # log_loss = metrics.log_loss(train_df.OpenStatus, preds)
    # print(f"Fold:{fold_}")
    # print(f"Accuracy={accuracy_score}")
    conf_m = confusion_matrix(ytest, preds)
    # print('Confusion matrix\n', conf_m)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    t = [fold_, roc_score]
    total_conf.append(conf_m)
    total_roc.append(t)
    test_df.loc[:, "xgb_pred_n"] = preds_proba
    print('Confusion matrix\n', confusion_matrix(ytest, preds))
    return test_df[["id", "target", "kfold", "xgb_pred_n"]], np.mean(total_roc, axis=0)[1]
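# Assumed driver for the fold routine above (a fold count of 5 is a guess):
frames, aucs = [], []
for f in range(5):
    oof_df, auc = run_training(f)
    frames.append(oof_df)
    aucs.append(auc)
oof = pd.concat(frames).reset_index(drop=True)
print("mean ROC AUC:", np.mean(aucs))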