class XGBTransformer(ModelTransformer):
    def __init__(self, config_model_parameters, columns):
        super().__init__(config_model_parameters, columns)
        self.model = XGBRegressor(**self.model_parameters)
        self.name = "XGBoost"

    def features_importance(self, n=20):
        # Pull raw split counts from the underlying Booster and normalize
        # them so the importances sum to 1.
        b = self.model.booster()
        fs = b.get_fscore()
        all_features = np.array([fs.get(f, 0.) for f in b.feature_names],
                                dtype=np.float32)
        importance = all_features / all_features.sum()
        logger.info("Feature importances")
        for k, (i, f) in enumerate(sorted(zip(importance, self.columns),
                                          reverse=True)):
            logger.info("%s -> %f", f, i)
            if k + 1 == n:  # stop after the top n features
                break
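# `booster()` and `get_fscore()` above are the pre-0.7 XGBoost accessors; recent
# releases expose the same Booster through `get_booster()` and its importances
# through `get_score()`. A minimal sketch of the equivalent extraction on a
# current XGBoost (the names `model` and `columns` are stand-ins here, not
# taken from the class above):

import numpy as np
from xgboost import XGBRegressor


def top_features(model, columns, n=20):
    booster = model.get_booster()                     # replaces model.booster()
    fs = booster.get_score(importance_type="weight")  # replaces get_fscore()
    weights = np.array([fs.get(f, 0.0) for f in booster.feature_names],
                       dtype=np.float32)
    importance = weights / weights.sum()
    for imp, name in sorted(zip(importance, columns), reverse=True)[:n]:
        print(f"{name} -> {imp:f}")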
def test_tree_ensemble_regressor_xgboost(self):
    this = os.path.dirname(__file__)
    data_train = pandas.read_csv(
        os.path.join(this, "xgboost.model.xgb.n4.d3.train.txt"), header=None)
    X = data_train.iloc[:, 1:].values
    y = data_train.iloc[:, 0].values

    params = dict(n_estimators=4, max_depth=3)
    model = XGBRegressor(**params).fit(X, y)
    # coremltools calls `model.booster()`; restore it from `get_booster`.
    # See https://github.com/apple/coremltools/issues/51.
    model.booster = model.get_booster
    model_coreml = convert_xgb_to_coreml(model)
    model_onnx = convert_cml(model_coreml)
    assert model_onnx is not None
    if sys.version_info[0] >= 3:
        # Python 2.7 raises TypeError: can't pickle instancemethod objects.
        dump_data_and_model(X.astype(numpy.float32), model, model_onnx,
                            basename="CmlXGBoostRegressor-OneOff-Reshape",
                            allow_failure=True)
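# The `model.booster = model.get_booster` line above works around XGBoost's
# rename of `booster()` to `get_booster()`, which older converters such as
# coremltools still invoke as a method. A hedged, reusable form of the same
# shim (the helper name is ours, not from the test suite):

def ensure_booster_method(model):
    # Restore a callable `booster` attribute when only `get_booster` exists.
    if not hasattr(model, "booster") and hasattr(model, "get_booster"):
        model.booster = model.get_booster
    return model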
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
        #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
        #                                                 basinhopping=True,

        """Recorded grid-search scores, all with params {'n_estimators': 700,
        'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6,
        'min_child_weight': 240}; the best score equaled the mean in each run:

            folds    mean     std
            2 / 5    0.65531  0.00333
            3 / 5    0.65474  0.00308
            4 / 5    0.65490  0.00302
            2 / 10   0.65688  0.00725
            3 / 10   0.65705  0.00714
            4 / 10   0.65643  0.00715
            5 / 10   0.65630  0.00699
        """
        from sklearn.cross_validation import StratifiedKFold
        kf = StratifiedKFold(y, n_folds=2)
        print(kf)
        params = []
        for itrain, itest in kf:
            ytrain = y[itrain]
            Xtrain = X.iloc[list(itrain)]
            ytest = y[itest]
            Xtest = X.iloc[list(itest)]

            self.xgb = XGBRegressor(
                objective=self.objective,
                learning_rate=self.learning_rate,
                min_child_weight=self.min_child_weight,
                subsample=self.subsample,
                colsample_bytree=self.colsample_bytree,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                nthread=self.nthread,
                missing=0.0,
                seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            te_y_hat = self.xgb.predict(
                Xtest, ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off.fit(te_y_hat, ytest)
            print("Offsets:", self.off.params)
            params += [list(self.off.params)]

        from numpy import array
        self.off.params = array(params).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)
        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)
class PrudentialRegressorFO(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(
            objective=self.objective,
            learning_rate=self.learning_rate,
            min_child_weight=self.min_child_weight,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            nthread=self.nthread,
            missing=0.0,
            seed=self.seed)

        from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
        self.off = FullDigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            #basinhopping=True,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)

        self.xgb.fit(X, y)
        tr_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(tr_y_hat, y))
        self.off.fit(tr_y_hat, y)
        print("Offsets:", self.off.params)
        return self

    def predict(self, X):
        from numpy import clip
        te_y_hat = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(te_y_hat), 1, 8)
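# Both Prudential estimators follow the scikit-learn fit/predict protocol, so a
# minimal usage sketch (assuming the project-local OptimizedOffsetRegressor
# module, the NegQWKappaScorer above, a pandas DataFrame X, and integer labels
# y in 1..8) would be:

reg = PrudentialRegressorFO(n_estimators=100)  # smaller model for a quick run
reg.fit(X, y)                                  # fits XGB, then the offsets
y_hat = reg.predict(X)                         # predictions clipped to 1..8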
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     scale_pos_weight=1,
                     gamma=gamma,
                     reg_alpha=reg_alpha,
                     learning_rate=learning_rate)
pid_list = data_reader.pid_check(training_data)
model.fit(training_data, training_label)
training_output, predict_output = (model.predict(training_data),
                                   model.predict(predict_data))
data_writer.data_write(folder_name, training_output, predict_output, silent=True)

score = model.booster().get_fscore()
mapper = {'f{0}'.format(i): v for i, v in enumerate(names)}
mapped = {mapper[k]: v for k, v in score.items()}
fig, ax = plt.subplots(1, 1, figsize=(7, 25))
xgb.plot_importance(mapped, ax=ax)
#plt.show()
plt.savefig("graph.png")

all_output = 0
rem = []
for i in range(12):
    training_data, training_label, predict_data, predict_label = \
        data_reader.test_data_read(i)
    training_data = np.delete(training_data, 124, 1)
    predict_data = np.delete(predict_data, 124, 1)
    training_data = np.delete(training_data, 118, 1)
# y_train=train_df["Y"] y_train = np.log1p(train_df["Y"]) print("---1----") train_df = train_df.drop(["Y"], axis=1) print("---11----") # quantity = [attr for attr in train_df.columns if train_df.dtypes[attr] != 'object'] # 数值变量集合 train_df = train_df[quantity] # X_train = Imputer().fit_transform(train_df) X_train = train_df X_train = X_train.fillna(0) print(np.isnan(X_train).any()) print("---111----") xgb1 = XGBRegressor() xgb1.fit(X_train, y_train) print("---2----") feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False) def sort_weight_values(X, y, predictors): selector = SelectKBest(f_regression, k=5) selector.fit(X, y) scores = -np.log10(selector.pvalues_) dt = pd.DataFrame() dt["predictors"] = predictors dt["scores"] = scores dt = dt.sort_values(by='scores', axis=0, ascending=False) dt.to_csv( '/Users/jianjun.yue/PycharmGItHub/data/智能制造质量预测/特征重要性_SelectKBest.csv', header=False, index=False) print("------------------------------------------------")
clf_xgb = XGBRegressor(max_depth=3, n_estimators=1000)
clf_gbm = GBMRegressor(exec_path=path_to_exec,
                       num_iterations=1000,
                       learning_rate=0.01,
                       num_leaves=255,
                       min_data_in_leaf=1,
                       early_stopping_round=20,
                       verbose=False)

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)

# Train the two models.
clf_gbm.fit(x_train, y_train, test_data=[(x_test, y_test)])
clf_xgb.fit(x_train, y_train,
            eval_set=[(x_test, y_test)],
            eval_metric='rmse',
            early_stopping_rounds=20,
            verbose=False)

print("xgboost: feature importance")
dic_fi = clf_xgb.booster().get_fscore()
# get_fscore() keys are 'f0', 'f1', ...; map them back to column names.
xgb_fi = [(feature_names[int(k[1:])], dic_fi[k]) for k in dic_fi]
xgb_fi = sorted(xgb_fi, key=lambda x: x[1], reverse=True)
print(xgb_fi)

print("lightgbm: feature importance")
gbm_fi = clf_gbm.feature_importance(feature_names)
print(gbm_fi)
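# To put the two rankings side by side, one option is an outer merge on the
# feature name (a sketch assuming pandas and that `gbm_fi`, like `xgb_fi`, is a
# list of (feature, importance) pairs):

import pandas as pd

fi = pd.merge(pd.DataFrame(xgb_fi, columns=["feature", "xgb"]),
              pd.DataFrame(gbm_fi, columns=["feature", "gbm"]),
              on="feature", how="outer").fillna(0)
print(fi.sort_values("xgb", ascending=False))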
# Regression model
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=0)

scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# XGBRFRegressor??????
model = XGBRegressor()
model.fit(x_train, y_train)
model.booster().get_score()

thres_holds = np.sort(model.feature_importances_)
print(thres_holds)

model1 = MultiOutputRegressor(XGBRegressor())
model1.fit(x_train, y_train)

score = model.score(x_test, y_test)
print(f"r2 : {score}")

'''feature engineering'''
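# The sorted `thres_holds` array above is usually the prelude to importance-
# based feature selection. A minimal sketch of that loop with scikit-learn's
# SelectFromModel (reusing the fitted `model` and the scaled splits above):

from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor

for thresh in thres_holds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    sub_model = XGBRegressor()
    sub_model.fit(select_x_train, y_train)
    print(f"thresh={thresh:.4f}, n={select_x_train.shape[1]}, "
          f"r2={sub_model.score(select_x_test, y_test):.4f}")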
test = test.drop(["单核细胞%"], axis=1) # X = X.drop(["淋巴细胞%"], axis=1) # X = X.drop(["乙肝e抗原"], axis=1) # X = X.drop(["乙肝表面抗体"], axis=1) Y = data["血糖"] Y = np.log1p(Y) clf = XGBRegressor() print("---111----") kfold = KFold(n_splits=5, random_state=7) test_score = np.sqrt( -cross_val_score(clf, X, Y, cv=kfold, scoring='neg_mean_squared_error')) print("------test_score--------") print(test_score) print(np.mean(test_score)) print("---2----") clf.fit(X, Y) FeatureImportances = pd.Series( clf.booster().get_fscore()).sort_values(ascending=False) print(FeatureImportances) print("---3----") pred = np.expm1(clf.predict(test)) pred_df = pd.DataFrame() pred_df["pred"] = pred pred_df.to_csv( '/Users/jianjun.yue/PycharmGItHub/data/人工智能辅助糖尿病遗传风险预测/sub_0107_XG_去掉负效果特征.csv', header=False, index=False, float_format='%.3f')