def test_deprecate_position_arg():
    """Check that deprecated positional arguments raise ``FutureWarning``.

    For every estimator two things are verified: building it with a
    positional first argument warns, and calling ``fit`` with a third
    positional argument (the weights, or the group sizes for the ranker)
    warns as well.
    """
    from sklearn.datasets import load_digits

    X, y = load_digits(return_X_y=True, n_class=2)
    w = y

    def expect_fit_warning(estimator, third_positional):
        # The third argument is handed over positionally on purpose.
        with pytest.warns(FutureWarning):
            estimator.fit(X, y, third_positional)

    # Regressor
    with pytest.warns(FutureWarning):
        xgb.XGBRegressor(3, learning_rate=0.1)
    expect_fit_warning(xgb.XGBRegressor(n_estimators=1), w)

    # Classifier
    with pytest.warns(FutureWarning):
        xgb.XGBClassifier(1, use_label_encoder=False)
    expect_fit_warning(
        xgb.XGBClassifier(n_estimators=1, use_label_encoder=False), w)

    # Ranker: the third positional argument is the group array.
    with pytest.warns(FutureWarning):
        xgb.XGBRanker('rank:ndcg', learning_rate=0.1)
    ranker = xgb.XGBRanker(n_estimators=1)
    group = np.repeat(1, X.shape[0])
    expect_fit_warning(ranker, group)

    # Random-forest regressor
    with pytest.warns(FutureWarning):
        xgb.XGBRFRegressor(1, learning_rate=0.1)
    expect_fit_warning(xgb.XGBRFRegressor(n_estimators=1), w)

    # Random-forest classifier
    with pytest.warns(FutureWarning):
        xgb.XGBRFClassifier(1, use_label_encoder=True)
    expect_fit_warning(xgb.XGBRFClassifier(n_estimators=1), w)
def __init__(self, param=None, blend_type='multiply', scale=True,
             test_model=None, control_model=None, ranking=True):
    """Store the blending options and build any model that was not supplied.

    Args:
        param: keyword arguments for ``xgboost.XGBRanker``. When ``None`` a
            default ranking configuration is used. When given, a ``'verbose'``
            entry set to ``True`` is added if it is missing. The caller's
            dictionary is copied first and is never modified.
        blend_type: stored as ``self.blend_type``.
        scale: stored as ``self.scale``.
        test_model: model stored as ``self.test_model``; a new
            ``XGBRanker(**param)`` is created when ``None``.
        control_model: model stored as ``self.control_model``; a new
            ``XGBRanker(**param)`` is created when ``None``.
        ranking: stored as ``self.ranking``.
    """
    if param is None:
        param = {
            'n_jobs': -1,
            'n_estimators': 10,
            'eval_metric': ['ndcg', 'map'],
            'objective': 'rank:ndcg',
            'verbose': True
        }
    else:
        # Copy before adding the default so the caller's dict stays untouched.
        param = dict(param)
        param.setdefault('verbose', True)
    self.param = param

    if test_model is None:
        test_model = xgboost.XGBRanker(**param)
    self.test_model = test_model

    if control_model is None:
        control_model = xgboost.XGBRanker(**param)
    self.control_model = control_model

    self.blend_type = blend_type
    self.scale = scale
    self.ranking = ranking
def __init__(self, mode='local', learning_rate=0.3, min_child_weight=1,
             n_estimators=300, max_depth=3, subsample=1, colsample_bytree=1,
             reg_lambda=1.0, reg_alpha=0):
    """Set up the 'Stacking' recommender around an ``XGBRanker``.

    ``max_depth`` and ``n_estimators`` are rounded up with ``math.ceil``
    before being handed to the ranker. The module-level ``_best_so_far`` and
    ``_group_t`` are reset on every construction.
    """
    global _best_so_far, _group_t, _kind

    name = 'Stacking'
    cluster = 'no_cluster'
    super(Stacking, self).__init__(mode, cluster, name)

    self.current_directory = Path(__file__).absolute().parent
    self.data_directory = self.current_directory.joinpath(
        '..', '..', 'submissions')
    self.mode = mode

    ranker_kwargs = dict(
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        max_depth=math.ceil(max_depth),
        n_estimators=math.ceil(n_estimators),
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        n_jobs=-1,
        objective='rank:ndcg',
    )
    self.xg = xgb.XGBRanker(**ranker_kwargs)

    self.fixed_params_dict = dict([
        ('mode', mode),
        ('cluster', cluster),
        ('ask_to_load', False),
        ('min_child_weight', 1),
        ('subsample', 1),
        ('colsample_bytree', 1),
    ])

    # Reset the module-level state.
    _best_so_far = 0
    _group_t = []
def loadRankingModel(dataset_name, folder_path):
    """Return an ``XGBRanker`` loaded from the saved model of a dataset.

    The model is read from
    ``<folder_path>/<dataset_name>_xgboost_rank_model.bin``.
    """
    ranker_kwargs = {
        'objective': 'rank:map',
        'learning_rate': 0.1,
        'gamma': 1.0,
        'max_depth': 6,
        'n_estimators': 4,
    }
    ranker = xgb.XGBRanker(**ranker_kwargs)
    model_file = '{}/{}_xgboost_rank_model.bin'.format(
        folder_path, dataset_name)
    ranker.load_model(model_file)
    return ranker
def score_ranking(self):
    """
    Build the train/test dataframes, fit an XGBoost ranker and print scores.

    The train dataframe is built through ``self.df_builder`` and stored in
    ``self.df_train``; the test dataframe is stored in ``self.df_test``.
    One group of ``self.cutoff`` rows is declared per distinct training user.
    The first 100 predicted scores are printed.

    :return: None
    """
    # CREATING DATAFRAMES FOR XGBOOST
    print(">>> Preparing the two DataFrames...")

    # Train dataframe
    self.df_train = self.df_builder.build_base_dataframe(
        users=self.df_user_id_col, items=self.df_item_id_col)
    self.df_builder.build_whole_dataframe(self.df_train)

    # Test dataframe
    self.df_test = self.df_builder.retrieve_test_dataframe()

    # BUILD TRAIN GROUPS: one group of `cutoff` rows per distinct user.
    # NOTE(review): assumes every user contributes exactly `cutoff`
    # consecutive rows to df_train - confirm against the dataframe builder.
    train_user_ids = list(self.df_train.loc[:, 'user_id'].values)
    train_group = [self.cutoff] * len(set(train_user_ids))

    # DROP USELESS COLUMNS OF DF_TRAIN AND DF_TEST
    # A list (not a set) keeps the labels in a deterministic order.
    id_columns = ['user_id', 'item_id']
    train_dropped = self.df_train.drop(labels=id_columns, axis=1)
    test_dropped = self.df_test.drop(labels=id_columns, axis=1)

    print(">>> DataFrames well formed and ready to be used!")

    # XGB RANKER AT WORK
    print(">>> Fitting of the XGB model...")
    xgb_ranker = xgb.XGBRanker()
    # NOTE(review): the user ids are used as ranking labels here - verify
    # that this is intended.
    # `group` is passed by keyword: positional use is deprecated in xgboost.
    xgb_ranker.fit(train_dropped, train_user_ids, group=train_group)
    print(">>> Fitting completed!")

    print(">>> Predicting the scores...")
    predictions = xgb_ranker.predict(test_dropped)

    print(">>> DONE")
    print(predictions[0:100])
def train(data):
    """Fit an NDCG ranker on the pickled training set and rank ``data``.

    The training set is read from ``../Data/training_set_VU_DM.pkl``. Each
    ``srch_id`` group of ``data`` is scored and its ``srch_id`` / ``prop_id``
    pairs are appended, best first, to ``../Predictions/XGB_ndcg.csv``.
    """
    raw = pd.read_pickle("../Data/training_set_VU_DM.pkl")

    # Number of rows per search, ordered by search id.
    groups = raw.srch_id.value_counts(sort=False).sort_index()

    # Columns with too many NaN values are dropped, the rest is zero-filled.
    min_non_nan = len(raw) * .8
    raw = raw.dropna(thresh=min_non_nan, axis=1).fillna(0)

    # A booking weighs more than a click.
    y_train = (raw.click_bool + 4 * raw.booking_bool).values

    # Columns that must not be used as features.
    to_remove = [
        'srch_id', 'date_time', 'position', 'click_bool', 'booking_bool'
    ]
    X_train = raw.drop(to_remove, axis=1).values

    ranker_kwargs = dict(
        tree_method='gpu_hist',
        booster='gbtree',
        objective='rank:ndcg',
        random_state=42,
        learning_rate=0.05,
        colsample_bytree=0.9,
        eta=0.05,
        max_depth=6,
        n_estimators=110,
        subsample=0.75,
    )
    model = xgb.XGBRanker(**ranker_kwargs)
    model.fit(X_train, y_train, group=groups, verbose=True)

    n_queries = data.srch_id.nunique()
    print("Predicting...")
    for i, (query, query_group) in enumerate(data.groupby('srch_id')):
        features = query_group.drop('srch_id', axis=1).values
        scores = model.predict(features)

        # Highest score first.
        order = scores.argsort()[::-1]
        ids = query_group.reset_index()[['srch_id', 'prop_id']]
        result = ids.reindex(order)

        # Only the very first chunk writes the header.
        result.to_csv("../Predictions/XGB_ndcg.csv",
                      mode='a',
                      header=(i == 0),
                      index=False)

        if not i % 100:
            print(f"Predicted {i}/{n_queries}")
def save_model_to_local_file(booster, model_params, meta, filename):
    """Save ``booster`` to ``filename`` and export it as ``<filename>.pmml``.

    A scikit-learn wrapper matching the objective is created, the booster is
    saved with the metadata that wrapper needs, the wrapper reloads the file
    and is exported through ``sklearn2pmml``. ``meta`` is written with
    ``save_model_metadata`` to ``model_meta.json``.

    Args:
        booster: trained booster; its ``scikit_learn`` attribute is set
            while saving and cleared again afterwards.
        model_params: mapping holding ``"objective"`` and, for ``multi:``
            objectives, ``"num_class"``.
        meta: metadata passed to ``save_model_metadata``.
        filename: path of the saved booster and prefix of the PMML file.

    Raises:
        ValueError: if the objective is missing or is not a ``binary:``,
            ``multi:``, ``reg:`` or ``rank:`` objective.
        AssertionError: if a ``multi:`` objective has no positive
            ``num_class``.
    """
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    try:
        from xgboost.compat import XGBoostLabelEncoder
    except ImportError:
        # xgboost==0.82.0 does not have XGBoostLabelEncoder
        # in xgboost.compat.py
        from xgboost.sklearn import XGBLabelEncoder as XGBoostLabelEncoder

    objective = model_params.get("objective")
    if not isinstance(objective, str):
        # A missing objective is reported like any other unsupported one
        # instead of failing with AttributeError on ``startswith``.
        raise ValueError(
            "Not supported objective {} for saving PMML".format(objective))

    bst_meta = dict()
    if objective.startswith(("binary:", "multi:")):
        if objective.startswith("binary:"):
            num_class = 2
        else:
            num_class = model_params.get("num_class")
            assert num_class is not None and num_class > 0, \
                "num_class should not be None"

        # To fake a trained XGBClassifier, there must be "_le", "classes_",
        # inside XGBClassifier. See here:
        # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
        model = xgb.XGBClassifier()
        label_encoder = XGBoostLabelEncoder()
        label_encoder.fit(list(range(num_class)))
        model._le = label_encoder
        model.classes_ = model._le.classes_

        bst_meta["_le"] = {"classes_": model.classes_.tolist()}
        bst_meta["classes_"] = model.classes_.tolist()
    elif objective.startswith("reg:"):
        model = xgb.XGBRegressor()
    elif objective.startswith("rank:"):
        model = xgb.XGBRanker()
    else:
        raise ValueError(
            "Not supported objective {} for saving PMML".format(objective))

    model_type = type(model).__name__
    bst_meta["type"] = model_type

    # Meta data is needed for saving sklearn pipeline. See here:
    # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
    booster.set_attr(scikit_learn=json.dumps(bst_meta))
    booster.save_model(filename)
    save_model_metadata("model_meta.json", meta)
    # Clear the temporary attribute from the in-memory booster again.
    booster.set_attr(scikit_learn=None)
    model.load_model(filename)

    pipeline = PMMLPipeline([(model_type, model)])
    sklearn2pmml(pipeline, "{}.pmml".format(filename))
def run_scikit_model_check(name, path):
    """Load the model stored at ``path`` and check it against its kind.

    The kind is read from substrings of ``name`` ('reg', 'cls', 'ltr',
    'logitraw', 'logit'), tested in that order. For each kind the number of
    dumped trees and the training objective are asserted. Names containing
    '0.90' get the older expectations. An unknown kind fails the check.
    """

    def objective_of(config):
        return config['learner']['learner_train_param']['objective']

    def tree_count(estimator):
        return len(estimator.get_booster().get_dump())

    def config_of(estimator):
        return json.loads(estimator.get_booster().save_config())

    is_090 = '0.90' in name

    if 'reg' in name:
        reg = xgboost.XGBRegressor()
        reg.load_model(path)
        config = config_of(reg)
        if is_090:
            assert objective_of(config) == 'reg:linear'
        else:
            assert objective_of(config) == 'reg:squarederror'
        assert (tree_count(reg) == gm.kRounds * gm.kForests)
        run_model_param_check(config)
    elif 'cls' in name:
        cls = xgboost.XGBClassifier()
        cls.load_model(path)
        if not is_090:
            assert len(cls.classes_) == gm.kClasses
            assert len(cls._le.classes_) == gm.kClasses
            assert cls.n_classes_ == gm.kClasses
        assert (tree_count(cls) ==
                gm.kRounds * gm.kForests * gm.kClasses), path
        config = config_of(cls)
        assert objective_of(config) == 'multi:softprob', path
        run_model_param_check(config)
    elif 'ltr' in name:
        ltr = xgboost.XGBRanker()
        ltr.load_model(path)
        assert (tree_count(ltr) == gm.kRounds * gm.kForests)
        config = config_of(ltr)
        assert objective_of(config) == 'rank:ndcg'
        run_model_param_check(config)
    elif 'logitraw' in name:
        # Must be tested before 'logit', which is a prefix of 'logitraw'.
        logit = xgboost.XGBClassifier()
        logit.load_model(path)
        assert (tree_count(logit) == gm.kRounds * gm.kForests)
        config = config_of(logit)
        assert objective_of(config) == 'binary:logitraw'
    elif 'logit' in name:
        logit = xgboost.XGBClassifier()
        logit.load_model(path)
        assert (tree_count(logit) == gm.kRounds * gm.kForests)
        config = config_of(logit)
        assert objective_of(config) == 'binary:logistic'
    else:
        assert False
def run_pr_auc_ltr(self, tree_method):
    """Fit a ranker with ``tree_method`` and require a final aucpr >= 0.99.

    The ranker is trained and evaluated on the same generated binary
    dataset, split into three groups of 32, 32 and 64 rows.
    """
    from sklearn.datasets import make_classification

    X, y = make_classification(128, 4, n_classes=2, random_state=1994)
    groups = np.array([32, 32, 64])

    ranker = xgb.XGBRanker(tree_method=tree_method, n_estimators=16)
    fit_kwargs = {
        'group': groups,
        'eval_set': [(X, y)],
        'eval_group': [groups],
        'eval_metric': "aucpr",
    }
    ranker.fit(X, y, **fit_kwargs)

    history = ranker.evals_result()["validation_0"]["aucpr"]
    final_score = history[-1]
    assert final_score >= 0.99
def _run_xgb_ranker_converter(self, num_classes, extra_config=None):
    """Convert rankers of several depths to torch and compare predictions.

    For each ``max_depth`` an ``XGBRanker`` is fitted on seeded random
    float32 data (a single group holding every row), converted with
    ``hummingbird.ml.convert`` and its predictions are compared with the
    original model's.

    Args:
        num_classes: exclusive upper bound of the random integer labels.
        extra_config: optional dictionary forwarded to the converter. A new
            empty dictionary is created on every call when omitted, so no
            state is shared between calls through the default value.
    """
    if extra_config is None:
        extra_config = {}

    warnings.filterwarnings("ignore")
    for max_depth in [1, 3, 8, 10, 12]:
        model = xgb.XGBRanker(n_estimators=10, max_depth=max_depth)

        # Re-seeding gives every depth the same data.
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        # All rows belong to one group.
        model.fit(X, y, group=[X.shape[0]])

        torch_model = hummingbird.ml.convert(model,
                                             "torch",
                                             X,
                                             extra_config=extra_config)
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(model.predict(X),
                                   torch_model.predict(X),
                                   rtol=1e-06,
                                   atol=1e-06)
def test_float64_xgb_ranker_converter(self):
    """Converted rankers must match the originals on float64 input.

    The data is left as produced by ``np.random.rand`` (no float32 cast) and
    only the first row is given to the converter as sample input.
    """
    warnings.filterwarnings("ignore")
    num_classes = 3
    tolerance = dict(rtol=1e-06, atol=1e-06)

    for depth in (1, 3, 8, 10, 12):
        ranker = xgb.XGBRanker(n_estimators=10, max_depth=depth)

        # Same seed on every pass, hence identical data for each depth.
        np.random.seed(0)
        X = np.random.rand(100, 200)
        y = np.random.randint(num_classes, size=100)

        n_rows = X.shape[0]
        ranker.fit(X, y, group=[n_rows])

        sample = X[0:1]
        torch_model = hummingbird.ml.convert(ranker, "torch", sample)
        self.assertIsNotNone(torch_model)

        expected = ranker.predict(X)
        actual = torch_model.predict(X)
        np.testing.assert_allclose(expected, actual, **tolerance)
def __init__(self, mode='local', learning_rate=0.3, min_child_weight=1,
             n_estimators=300, max_depth=3, subsample=1, colsample_bytree=1,
             reg_lambda=1.0, reg_alpha=0):
    """Set up the 'gbdt_hybrid' recommender around an ``XGBRanker``.

    Besides the ranker, the full dataframe and the local/full target indices
    are loaded from the ``data`` module. ``max_depth`` and ``n_estimators``
    are rounded up with ``math.ceil``.
    """
    name = 'gbdt_hybrid'
    cluster = 'no_cluster'
    super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

    self.current_directory = Path(__file__).absolute().parent
    self.data_directory = self.current_directory.joinpath(
        '..', '..', 'submissions/hybrid')
    self.mode = mode

    # Data needed for both running modes.
    self.full = data.full_df()
    self.local_target_indices = data.target_indices(
        mode='local', cluster='no_cluster')
    self.full_target_indices = data.target_indices(
        mode='full', cluster='no_cluster')

    # Kept from the original code; neither path is used further down.
    directory = self.data_directory.joinpath('local')
    full_dir = self.data_directory.joinpath('full')

    ranker_kwargs = {
        'learning_rate': learning_rate,
        'min_child_weight': min_child_weight,
        'max_depth': math.ceil(max_depth),
        'n_estimators': math.ceil(n_estimators),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'n_jobs': -1,
        'objective': 'rank:ndcg',
    }
    self.xg = xgb.XGBRanker(**ranker_kwargs)

    self.cv_path = self.data_directory.joinpath('cross_validation')
def rank_XGBoost(X, y, X_test, group_size, df_test):
    """Makes XGBoost ranking

    Fits an NDCG ranker on ``X`` / ``y``, scores ``X_test`` and ranks the
    scores inside every ``srch_id`` of ``df_test`` (rank 1 = highest score).

    Args:
        X: training features.
        y: training labels.
        X_test: features to score; one row per row of ``df_test``, in the
            same order.
        group_size: group sizes handed to ``XGBRanker.fit`` as ``group``.
        df_test: dataframe with at least ``srch_id`` and ``prop_id``.

    Returns:
        A dataframe with the columns ``srch_id``, ``prop_id``,
        ``xgb_preds`` and ``xgb_rank``, indexed like ``df_test``.
    """
    xgb_rank = xgb.XGBRanker(objective='rank:ndcg')
    # `group` is passed by keyword: positional use is deprecated in xgboost.
    xgb_rank.fit(X, y, group=group_size)

    preds = xgb_rank.predict(X_test)

    # make submission
    # The predictions take the index of df_test; with a fresh RangeIndex the
    # column-wise concat would misalign whenever df_test is not indexed
    # 0..n-1 (for example after filtering).
    xgb_dataset = pd.DataFrame(preds,
                               columns=['xgb_preds'],
                               index=df_test.index)
    xgb_merged = pd.concat([df_test[['srch_id', 'prop_id']], xgb_dataset],
                           axis=1,
                           sort=False)
    ranks = xgb_merged.groupby('srch_id')['xgb_preds'].rank(ascending=False)
    xgb_ranking = xgb_merged.join(ranks.astype(int).rename('xgb_rank'))
    return xgb_ranking
def model_trainer(train_set, val_set, MODEL='LGB', SUFFIX='val', SAVE=False):
    """Train model

    Args:
        train_set: mapping with the training features under ``'X'``, the
            labels under ``'y'`` and, for ``'XGBRank'``, the groups under
            ``'g'``.
        val_set: mapping with the same keys for validation; only read when
            ``SUFFIX == 'val'``.
        MODEL: ``'LGB'``, ``'XGB'`` or ``'XGBRank'``.
        SUFFIX: ``'val'`` trains with a validation set and early stopping;
            any other value trains on the training set only.
        SAVE: when true, dump the model with joblib (see the note in the
            code about which models are saved).

    Returns:
        The trained model.

    Raises:
        ValueError: if ``MODEL`` is not one of the supported names.
    """
    logger.info(f'Training {MODEL} in {SUFFIX} mode...')

    # get hyperparameters
    params = hypara_dispatcher(MODEL)
    with_validation = SUFFIX == 'val'

    # fit
    if MODEL == 'LGB':
        # train data
        dtrain_set = lgb.Dataset(train_set['X'].values,
                                 train_set['y'].values,
                                 feature_name=features)
        if with_validation:
            # val data
            dval_set = lgb.Dataset(val_set['X'].values,
                                   val_set['y'].values,
                                   feature_name=features)
            model = lgb.train(params,
                              dtrain_set,
                              valid_sets=[dtrain_set, dval_set],
                              early_stopping_rounds=100,
                              verbose_eval=100)
        else:
            model = lgb.train(params, dtrain_set)
    elif MODEL == 'XGB':
        model = xgb.XGBRegressor(**params)
        if with_validation:
            model.fit(
                train_set['X'],
                train_set['y'],
                eval_set=[(val_set['X'], val_set['y'])],
                verbose=500,
                early_stopping_rounds=100,
            )
        else:
            model.fit(
                train_set['X'],
                train_set['y'],
                verbose=500,
            )
    elif MODEL == 'XGBRank':
        model = xgb.XGBRanker(**params)
        if with_validation:
            model.fit(
                train_set['X'],
                train_set['y'],
                eval_set=[(val_set['X'], val_set['y'])],
                group=train_set['g'],
                eval_group=[val_set['g']],
                verbose=100,
                early_stopping_rounds=100,
            )
        else:
            model.fit(
                train_set['X'],
                train_set['y'],
                group=train_set['g'],
                verbose=100,
            )
    else:
        # Without this branch `model` stayed unbound and the function failed
        # with an UnboundLocalError at the return statement.
        raise ValueError(
            f"Unsupported MODEL {MODEL!r}; expected 'LGB', 'XGB' or 'XGBRank'")

    # save model
    if SAVE:
        # NOTE(review): only names ending in 'B' ('LGB', 'XGB') are saved, so
        # an 'XGBRank' model is never written - confirm this is intended.
        if MODEL[-1] == 'B':
            # save via joblib
            joblib.dump(model,
                        f'{OUTPUT_DIR}/{target}_{MODEL}_model_{SUFFIX}.pkl')
            logger.info(f'{MODEL} {SUFFIX}_model for {target} saved!')

    return model
def GridSearch(feature, label, group, metrics, scoring=0.5, cv=5, cv_num=3,
               metrics_min=True, speedy=True, speedy_param=(20000, 0.3),
               gpu=False):
    """XGBRanker model params search use GridSearch method.

    The parameters are tuned group by group: every combination inside a
    group is evaluated, the best combination seen so far is kept and becomes
    the starting point of the next group.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        group: label group; split and indexed together with ``label``.
        metrics: model metrics function, called as ``metrics(y_true, y_pred)``.
        scoring: metrics error opt base line value.
        cv: cross validation fold.
        cv_num: minimum cross validation fold.
        metrics_min: metrics value whether the smaller the better.
        speedy: whether use speedy method.
        speedy_param: if use speedy method, test_size will be set,
                      test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        gpu: whether use gpu.
    Returns:
        a best XGBRanker model params dict. If no combination beats
        ``scoring``, the initial parameters are returned.
    Raises:
        NotImplementedError: if ``gpu`` is true.
    """
    if gpu:
        # Raising a plain string is itself a TypeError in Python 3.
        raise NotImplementedError("XGBRanker is not supported currently.")

    start = time.time()
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    tree_method = 'gpu_hist' if gpu else 'auto'
    n_job = 1 if gpu else int(np.ceil(cpu_count()*0.9))

    weight_dict = Counter(label)
    if len(weight_dict) == 2:
        weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
    else:
        # NOTE(review): after the inversion the keys are counts and the values
        # are labels, so this divides two labels - confirm this is intended.
        weight_dict = {j: i for i, j in weight_dict.items()}
        weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))

    params = {'learning_rate': 0.1, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 1,
              'reg_alpha': 0, 'reg_lambda': 1, 'gamma': 0,
              'subsample': 0.8, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8,
              'max_delta_step': 0, 'scale_pos_weight': weight,
              'n_jobs': n_job, 'random_state': 27, 'objective': 'rank:pairwise',
              'tree_method': tree_method}
    cv_params = {'param1': {'n_estimators': list(range(100, 850, 50))},
                 'param2': {'max_depth': [3, 4, 5, 6, 7],
                            'min_child_weight': [1, 2, 3, 4, 5]},
                 'param3': {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]},
                 'param4': {'subsample': [0.6, 0.7, 0.8, 0.9],
                            'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
                            'colsample_bylevel': [0.6, 0.7, 0.8, 0.9]},
                 'param5': {'reg_alpha': [0.05, 0.1, 1, 2, 3],
                            'reg_lambda': [0.05, 0.1, 1, 2, 3]},
                 'param6': {'max_delta_step': list(np.linspace(0, 10, 11, dtype='int')),
                            'scale_pos_weight': list(np.linspace(1, weight, weight, dtype='int'))},
                 'param7': {'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2]}}

    # Start from the initial parameters so that a result always exists, even
    # when no combination improves on the `scoring` base line.
    best_params = params.copy()

    for cv_param in cv_params.values():
        cv_param_name = list(cv_param)
        cv_param_value = [cv_param[i] for i in cv_param_name]
        for value in itertools.product(*cv_param_value):
            params.update(dict(zip(cv_param_name, value)))
            model = xgb.XGBRanker(**params)
            score = []
            if speedy:
                for _ in range(cv_num):
                    X_train, X_test, y_train, y_test, g_train, g_test = train_test_split(
                        feature, label, group, test_size=test_size, stratify=label,
                        random_state=np.random.choice(range(100), 1)[0])
                    # `group` by keyword: positional use is deprecated.
                    model.fit(X_train, y_train, group=g_train)
                    cv_pred = model.predict(X_test)
                    score.append(metrics(y_test.values, cv_pred))
            else:
                skf = StratifiedKFold(n_splits=cv, shuffle=True,
                                      random_state=np.random.choice(range(100), 1)[0])
                for n, (train_index, test_index) in enumerate(skf.split(feature, label)):
                    if n == cv_num:
                        break
                    model.fit(feature.loc[train_index], label[train_index],
                              group=group[train_index])
                    cv_pred = model.predict(feature.loc[test_index])
                    score.append(metrics(label[test_index].values, cv_pred))
            cv_score = round(np.mean(score), 4)
            improved = cv_score < scoring if metrics_min else cv_score > scoring
            if improved:
                scoring = cv_score
                best_params = params.copy()
        # The next parameter group is tuned on top of the best values so far.
        params = best_params.copy()
        sys.stdout.write("XGBRanker grid search run time {} min, best score: {}, best param:{}\r".format(
            divmod((time.time()-start), 60)[0], scoring, best_params))
        sys.stdout.flush()
    print("XGBRanker param finetuning with grid search run time: %d min %.2f s" % divmod((time.time() - start), 60))
    return best_params
_XGB_SEARCH_RULE = (
    "------------------------------------------------------------------------"
)


def _xgb_cv_rounds(params, d_train, folds, xgboost_metric):
    """Run ``xgboost.cv`` with early stopping; return (rounds kept, last score)."""
    cv_results = xgboost.cv(params,
                            d_train,
                            num_boost_round=10000,
                            early_stopping_rounds=50,
                            folds=folds,
                            metrics=xgboost_metric,
                            show_stdv=False,
                            verbose_eval=None,
                            as_pandas=False)
    # Third key of the result; presumably the mean of the metric on the test
    # folds - TODO confirm against the xgboost.cv output layout.
    rmetric_name = list(cv_results.keys())[2]
    scores = cv_results[rmetric_name]
    return len(scores), scores[-1]


def _xgb_grid_step(model, param_test, X, y, sklearn_metric, folds,
                   found_label):
    """Grid-search one parameter group and store the best values on ``model``."""
    search = GridSearchCV(estimator=model,
                          param_grid=param_test,
                          scoring=sklearn_metric,
                          n_jobs=1,
                          cv=folds,
                          verbose=True)
    search.fit(X, y)
    print(found_label + str(search.best_params_))
    print("Best score achived: " + str(search.best_score_))
    print(_XGB_SEARCH_RULE)
    # Carry the winner over so that the next step starts from it.
    model.set_params(**search.best_params_)


def Model_Search_XGboost_cv(X,
                            y,
                            model='binary',
                            folds=5,
                            sklearn_metric=None,
                            xgboost_metric=None,
                            step_wise_start_at=0,
                            final_learning_rate=0.01,
                            use_optuna=False,
                            direction='minimize',
                            n_trials=25,
                            load_study_from=None,
                            save_study_as=None,
                            n_jobs=4):
    """Search hyper-parameters for an xgboost model with cross validation.

    Two strategies are available. With ``use_optuna`` an optuna study
    samples all parameters together. Otherwise the parameters are tuned step
    by step with ``GridSearchCV``; each step keeps the best value found and
    the next step starts from it. In both cases the number of boosting
    rounds is finally chosen with ``xgboost.cv`` and early stopping.

    Args:
        X: features.
        y: labels.
        model: ``'binary'``, ``'multiclass'``, ``'regression'`` or
            ``'ranking'`` to build a default estimator, or an xgboost
            scikit-learn estimator to tune (its parameters are modified).
        folds: number of folds, or a ready cross-validation splitter.
        sklearn_metric: scikit-learn scoring name for the grid search;
            chosen from the estimator type when ``None``.
        xgboost_metric: xgboost metric for ``xgboost.cv``; chosen from the
            estimator type when ``None``.
        step_wise_start_at: first step to run in step-wise mode (0 = number
            of rounds, 1 = max_depth, 2 = min_child_weight, 3 = gamma,
            4 = subsample, 5 = colsample_bytree, 6 = alpha, 7 = lambda).
        final_learning_rate: learning rate set after the step-wise search,
            before the final selection of the number of rounds.
        use_optuna: use the optuna strategy instead of the step-wise one.
        direction: optimisation direction of a newly created optuna study.
        n_trials: number of optuna trials.
        load_study_from: path of a joblib-dumped study to continue.
        save_study_as: path the study is dumped to after every trial.
        n_jobs: passed to the default estimators as ``nthread``.

    Returns:
        The estimator with the selected parameters set. It is not fitted by
        this function.

    Raises:
        SystemExit: for an unknown ``model`` name, or when a metric cannot
            be derived from the estimator type.
    """
    # model
    if isinstance(model, str):
        common = dict(random_state=42,
                      seed=42,
                      feature_fraction_seed=42,
                      use_label_encoder=False,
                      nthread=n_jobs)
        if model == 'binary':
            model = xgboost.XGBClassifier(objective='binary:logistic',
                                          eval_metric='logloss',
                                          **common)
        elif model == 'multiclass':
            n_classes = len(np.unique(y))
            model = xgboost.XGBClassifier(objective='multi:softmax',
                                          eval_metric='mlogloss',
                                          num_class=n_classes,
                                          **common)
        elif model == 'regression':
            model = xgboost.XGBRegressor(objective='reg:squarederror',
                                         eval_metric='rmse',
                                         **common)
        elif model == 'ranking':
            model = xgboost.XGBRanker(objective='rank:map',
                                      eval_metric='map',
                                      **common)
        else:
            sys.exit('Error: Unkown model type.')

    # sklearn_metric
    if sklearn_metric is None:
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        if isinstance(model, xgboost.XGBClassifier):
            sklearn_metric = 'neg_log_loss'
        elif isinstance(model, xgboost.XGBRegressor):
            sklearn_metric = 'neg_root_mean_squared_error'
        elif isinstance(model, xgboost.XGBRanker):
            # The scorer is named 'average_precision';
            # 'average_precision_score' is the function, not a scoring name.
            sklearn_metric = 'average_precision'
        else:
            sys.exit('Error: Sklearn score metric needs to be provided.')

    # xgboost_metric
    if xgboost_metric is None:
        # https://xgboost.readthedocs.io/en/latest/parameter.html
        if isinstance(model, xgboost.XGBClassifier):
            n_classes = len(np.unique(y))
            xgboost_metric = 'mlogloss' if n_classes > 2 else 'logloss'
        elif isinstance(model, xgboost.XGBRegressor):
            xgboost_metric = 'rmse'
        elif isinstance(model, xgboost.XGBRanker):
            xgboost_metric = 'map'
        else:
            sys.exit('Error: Xgboost score metric needs to be provided.')

    # folds
    if isinstance(folds, int):
        if isinstance(model, xgboost.XGBClassifier):
            folds = StratifiedKFold(n_splits=folds,
                                    shuffle=True,
                                    random_state=42)
        else:
            folds = KFold(n_splits=folds, shuffle=True, random_state=42)

    # Set fixed params
    fixed_params = {
        'verbosity': 0,
        'silent': 1,
        'n_estimators': 10000,
    }
    model.set_params(**fixed_params)

    # dataset for .cv
    d_train = xgboost.DMatrix(X, label=y)

    if use_optuna:
        print("Searching for a Xgboost model with optuna \n")

        params = model.get_params()

        if load_study_from is not None:
            study = joblib.load(load_study_from)
        else:
            study = optuna.create_study(
                direction=direction,
                pruner=optuna.pruners.SuccessiveHalvingPruner())

        def objetive(trial):
            params.update({
                "learning_rate":
                trial.suggest_uniform("learning_rate", 0.005, 0.1),
                "max_depth":
                trial.suggest_int("max_depth", 4, 12),
                "min_child_weight":
                trial.suggest_int("min_child_weight", 1, 50),
                "gamma":
                trial.suggest_loguniform("gamma", 1e-4, 1e4),
                "subsample":
                trial.suggest_loguniform("subsample", 0.4, 1),
                "colsample_bytree":
                trial.suggest_loguniform("colsample_bytree", 0.4, 1),
                "alpha":
                trial.suggest_loguniform("alpha", 1e-8, 1e4),
                "lambda":
                trial.suggest_loguniform("lambda", 1e-8, 1e4),
            })

            n_rounds, score = _xgb_cv_rounds(params, d_train, folds,
                                             xgboost_metric)
            print("Num_boost_round: " + str(n_rounds))

            # Save after every trial so an interrupted search can continue.
            if save_study_as is not None:
                joblib.dump(study, save_study_as)

            return score

        study.optimize(objetive, n_trials=n_trials, n_jobs=1)

        print(_XGB_SEARCH_RULE)
        print("Best parameters found: " + str(study.best_params))
        print("Best score achived: " + str(study.best_value))
        print(_XGB_SEARCH_RULE)

        model.set_params(**study.best_params)

        # num_boost_round optimization
        best_boost_round, best_score_achived = _xgb_cv_rounds(
            model.get_params(), d_train, folds, xgboost_metric)
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achived: " + str(best_score_achived))
        print(_XGB_SEARCH_RULE)
        model.set_params(n_estimators=best_boost_round)

    else:
        print("Searching for a Xgboost model with the step wise method \n")

        if step_wise_start_at <= 0:
            # num_boost_round optimization
            best_boost_round, best_score_achived = _xgb_cv_rounds(
                model.get_params(), d_train, folds, xgboost_metric)
            print(_XGB_SEARCH_RULE)
            print("Best num_boost_round: " + str(best_boost_round))
            print("Best score achived: " + str(best_score_achived))
            print(_XGB_SEARCH_RULE)
            model.set_params(n_estimators=best_boost_round)

        # Param search: (first step that still runs it, message, grid).
        steps = [
            (1, "Best params encountered: ", {
                'max_depth': range(2, 11, 1)
            }),
            (2, "Best params encountered: ", {
                'min_child_weight':
                [0, 1, 2, 3, 5, 7, 10, 12, 15, 25, 50, 75, 100]
            }),
            (3, "Best parameters found: ", {
                'gamma': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4,
                    7, 10, 40, 70, 100
                ]
            }),
            (4, "Best parameters found: ", {
                'subsample': [
                    0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85,
                    0.9, 0.95, 1
                ]
            }),
            (5, "Best parameters found: ", {
                'colsample_bytree':
                [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
            }),
            (6, "Best parameters found: ", {
                'alpha': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4,
                    7, 10, 40, 70, 100, 200
                ]
            }),
            (7, "Best parameters found: ", {
                'lambda': [
                    0, 0.0001, 0.001, 0.01, 0.04, 0.07, 0.1, 0.4, 0.7, 1, 4,
                    7, 10, 40, 70, 100, 200
                ]
            }),
        ]
        for step, found_label, param_test in steps:
            if step_wise_start_at <= step:
                _xgb_grid_step(model, param_test, X, y, sklearn_metric,
                               folds, found_label)

        # Set final learning rate
        model.set_params(learning_rate=final_learning_rate)

        # num_boost_round optimization
        best_boost_round, best_score_achived = _xgb_cv_rounds(
            model.get_params(), d_train, folds, xgboost_metric)
        print(_XGB_SEARCH_RULE)
        print("Best num_boost_round: " + str(best_boost_round))
        print("Best score achived: " + str(best_score_achived))
        print(_XGB_SEARCH_RULE)
        model.set_params(n_estimators=best_boost_round)

    return model
train = base_expanded_df(alpha=0.2, beta=0.05, isValidation=True, save=True) test = base_expanded_df(alpha=0.2, beta=0.05, isValidation=False, save=True) #train = pd.read_csv("dataset/expanded/base_expanded_train.csv") #test = pd.read_csv("dataset/expanded/base_expanded_test.csv") train = adding_features(train, isValidation=True) test = adding_features(test, isValidation=False) train.to_csv('dataset/expanded/train_complete.csv', index=False) test.to_csv('dataset/expanded/test_complete.csv', index=False) train = pd.read_csv("train_complete.csv") test = pd.read_csv("test_complete.csv") group = train.groupby('queried_record_id').size().values ranker = xgb.XGBRanker() ranker.fit(train.drop(['queried_record_id', 'target', 'linked_id_idx'], axis=1), train['target'], group=group) predictions = ranker.predict( test.drop(['queried_record_id', 'linked_id_idx'], axis=1)) test['predictions'] = predictions df_predictions = test[[ 'queried_record_id', 'predicted_record_id', 'predicted_record_id_record', 'predictions' ]] rec_pred = [] for (r, p, l, record_id) in zip(df_predictions.predicted_record_id,
# 并且需要给定每个query_id下样本的数量 groups = train_data.groupby('id').size().to_frame( 'size')['size'].to_numpy() # 计算每个query_id 下的样品数量 test_data = df.iloc[X_test_inds] # We need to keep the id for later predictions X_test = test_data.loc[:, ~test_data.columns.isin(['rank'])] y_test = test_data.loc[:, test_data.columns.isin(['rank'])] # 然后我们就可以建模了,可以用XGBRanker训练排序模型,在这个场景下,我们无法自定义objective,也无法自定义mertic了. model = xgb.XGBRanker( tree_method='gpu_hist', booster='gbtree', objective='rank:pairwise', # rank:ndcg rank:map random_state=42, learning_rate=0.1, colsample_bytree=0.9, eta=0.05, max_depth=6, n_estimators=110, subsample=0.75) model.fit(X_train, y_train, group=groups, verbose=True) # 训练完后我们就可以进行预估,因为预估方法并不会输入groups,所以我们需要做一些特殊处理: def predict(model, df): return model.predict(df.loc[:, ~df.columns.isin(['id'])]) predictions = (data.groupby('id').apply(lambda x: predict(model, x))) """
def __init__(self, mode, cluster='no_cluster', kind='kind1',
             ask_to_load=True, class_weights=False, learning_rate=0.1658,
             min_child_weight=0.5644, n_estimators=100, max_depth=11,
             subsample=1, colsample_bytree=1, reg_lambda=65.0,
             reg_alpha=50.0, max_delta_step=2, scale_pos_weight=20,
             gamma=0.01, weights_position=False, log_weights=False):
    """Set up the 'final_stacking' recommender around a pairwise ``XGBRanker``.

    ``max_depth``, ``n_estimators``, ``max_delta_step`` and
    ``scale_pos_weight`` are rounded up with ``math.ceil`` before being
    handed to the ranker. The fixed parameters and the hyper-parameter
    ranges used for tuning are stored on the instance as well.
    """
    name = 'final_stacking'
    super(XGBoostWrapper, self).__init__(name=name, mode=mode,
                                         cluster=cluster)

    # Options only stored here.
    self.class_weights = class_weights
    self.weights_position = weights_position
    self.log_weights = log_weights
    self.kind = kind
    self.ask_to_load = ask_to_load

    ranker_kwargs = {
        'learning_rate': learning_rate,
        'min_child_weight': min_child_weight,
        'max_depth': math.ceil(max_depth),
        'n_estimators': math.ceil(n_estimators),
        'gamma': gamma,
        'max_delta_step': math.ceil(max_delta_step),
        'scale_pos_weight': math.ceil(scale_pos_weight),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'n_jobs': -1,
        'objective': 'rank:pairwise',
    }
    self.xg = xgb.XGBRanker(**ranker_kwargs)

    self.fixed_params_dict = dict(
        mode=mode,
        cluster=cluster,
        kind=kind,
        ask_to_load=False,
        min_child_weight=1,
        subsample=1,
        colsample_bytree=1,
        max_delta_step=0,
        scale_pos_weight=1,
        gamma=0,
    )

    # create hyperparameters dictionary: name -> (lower bound, upper bound)
    self.hyperparameters_dict = dict(
        learning_rate=(0.01, 0.3),
        max_depth=(3, 7),
        n_estimators=(700, 1200),
        reg_lambda=(0, 0.5),
        reg_alpha=(0, 0.5),
    )
def _xgbranker_version_tuple(version):
    """Return the leading numeric components of a version string as ints."""
    import re
    return tuple(int(part) for part in re.findall(r'\d+', str(version))[:3])


def _xgbranker_save_best(model, best_params, save_model_dir):
    """Pickle ``model`` and dump ``best_params`` as JSON into ``save_model_dir``."""
    with open(os.path.join(save_model_dir, "xgb_model.pkl"), "wb") as model_file:
        pickle.dump(model, model_file)
    with open(os.path.join(save_model_dir, "xgb_params.json"), 'w') as f:
        json.dump(best_params, f)


def RandomSearch(feature, label, group, metrics, iter_num=1000, scoring=0.5, cv=5, cv_num=3,
                 metrics_min=True, speedy=True, speedy_param=(20000, 0.3), gpu=False,
                 save_model_dir=None):
    """XGBRanker model params search use RandomSearch method.

    Args:
        feature: pandas dataframe, model's feature.
        label: pandas series, model's label.
        group: label group.
        metrics: model metrics function, called as ``metrics(y_true, y_pred)``.
        iter_num: number of random parameter draws to evaluate.
        scoring: metrics error opt base line value; a draw is only kept when
            it beats this value (and every later best).
        cv: cross validation fold.
        cv_num: minimum cross validation fold.
        metrics_min: metrics value whether the smaller the better.
        speedy: whether use speedy method.
        speedy_param: if use speedy method, test_size will be set,
            test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2).
        gpu: whether use gpu.
        save_model_dir: save model folder.
    Returns:
        a best XGBRanker model params dict (empty if no draw beat ``scoring``).
    Raises:
        NotImplementedError: if ``gpu`` is true.
        AssertionError: if the installed xgboost is older than
            ``__xgboost_version__``.
    """
    # Fail fast, before importing xgboost or touching the data.
    if gpu:
        raise NotImplementedError("XGBRanker is not supported currently.")

    import xgboost as xgb
    # Compare numerically: as plain strings '1.10.0' would sort below '1.9.0'.
    assert _xgbranker_version_tuple(xgb.__version__) >= _xgbranker_version_tuple(__xgboost_version__), \
        f'xgboost version should be >={__xgboost_version__}.'
    start = time.time()
    best_params = {}
    if speedy:
        test_size = 1-round(min(speedy_param[0], feature.shape[0]*speedy_param[1])/feature.shape[0], 2)
    # ``gpu`` is always false past the guard above; the conditionals are kept
    # so that lifting the guard restores the GPU settings.
    tree_method = ['gpu_hist'] if gpu else ['auto', 'exact', 'approx', 'hist']
    n_job = 1 if gpu else int(np.ceil(cpu_count()*0.9))

    # Candidate value for ``scale_pos_weight`` derived from the label counts.
    # NOTE(review): the non-binary branch inverts the mapping to count -> label
    # and divides labels, not counts -- confirm this is intended.
    weight_dict = Counter(label)
    if len(weight_dict) == 2:
        weight = int(np.ceil(weight_dict[min(weight_dict)]/weight_dict[max(weight_dict)]))
    else:
        weight_dict = {j: i for i, j in weight_dict.items()}
        weight = int(np.ceil(weight_dict[max(weight_dict)]/weight_dict[min(weight_dict)]))

    sample_fractions = [0.5, 0.6, 0.7, 0.8, 0.9, 1.]
    penalty_grid = np.concatenate([np.linspace(0, 1, 101), np.linspace(2, 100, 99)]).round(2)

    # Registration order is unchanged from the original search space.
    hp = HyperParametersRandom()
    hp.Float('learning_rate', 0.01, 0.1)
    hp.Int('n_estimators', 100, 850)
    hp.Choice('max_depth', [3, 4, 5, 6, 7])
    hp.Choice('min_child_weight', [1, 2, 3, 4, 5, 6, 7])
    hp.Choice('max_delta_step', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    hp.Choice('reg_alpha', penalty_grid.copy())
    hp.Choice('reg_lambda', penalty_grid.copy())
    for fraction_param in ('subsample', 'colsample_bytree', 'colsample_bylevel', 'colsample_bynode'):
        hp.Choice(fraction_param, list(sample_fractions))
    hp.Choice('gamma', penalty_grid.copy())
    hp.Choice('scale_pos_weight', [1, weight])
    hp.Choice('n_jobs', [n_job])
    hp.Choice('random_state', [27])
    hp.Choice('objective', ['rank:pairwise'])
    hp.Choice('booster', ['gbtree'])
    hp.Choice('tree_method', tree_method)
    hp.Choice('importance_type', ["gain", "weight", "cover", "total_gain", "total_cover"])

    for i in range(1, iter_num+1):
        hp.update()
        model = xgb.XGBRanker(**hp.params)
        score = []
        # NOTE(review): ``group`` is split/indexed per sample here, while
        # XGBRanker's ``group`` argument is documented as per-query sizes --
        # confirm what callers pass.
        if speedy:
            for _ in range(cv_num):
                X_train, X_test, y_train, y_test, g_train, _g_test = train_test_split(
                    feature, label, group, test_size=test_size, stratify=label,
                    random_state=np.random.choice(range(100), 1)[0])
                # Keyword form: positional ``group`` is deprecated in XGBRanker.fit.
                model.fit(X_train, y_train, group=g_train)
                cv_pred = model.predict(X_test)
                score.append(metrics(y_test.values, cv_pred))
        else:
            skf = StratifiedKFold(n_splits=cv, shuffle=True,
                                  random_state=np.random.choice(range(100), 1)[0])
            for n, (train_index, test_index) in enumerate(skf.split(feature, label)):
                if n == cv_num:
                    break
                model.fit(feature.loc[train_index], label[train_index], group=group[train_index])
                cv_pred = model.predict(feature.loc[test_index])
                score.append(metrics(label[test_index].values, cv_pred))
        cv_score = round(np.mean(score), 4)

        improved = cv_score < scoring if metrics_min else cv_score > scoring
        if improved:
            scoring = cv_score
            # Snapshot the draw that produced this score.
            best_params = dict(hp.params)
            if save_model_dir is not None:
                _xgbranker_save_best(model, best_params, save_model_dir)

        sys.stdout.write("XGBRanker random search percent: {}%, run time {} min, best score: {}, best param:{}\r".format(
            round(i/iter_num*100, 2), divmod((time.time()-start), 60)[0], scoring, best_params))
        sys.stdout.flush()
    print("XGBRanker param finetuning with random search run time: %d min %.2f s" % divmod((time.time() - start), 60))
    return best_params
# Load the prepared data and print a quick sanity report: per-column
# min/max, the first rows, and every row that contains a null.
dftrain = pd.read_csv('dataprep.csv', index_col=0)
print(dftrain.min())
print(dftrain.max())
print(dftrain.head())
print(dftrain[pd.isnull(dftrain).any(axis=1)])


def groupsize(df):
    """Return the number of rows per ``srch_id``, ordered by ``srch_id``.

    The result is a Series indexed by ``srch_id`` and named ``'srch_id'``.
    The name is set explicitly because ``value_counts`` names its result
    ``'count'`` from pandas 2.0 on, which made attribute access by
    ``.srch_id`` on the intermediate frame fail.
    """
    return df.srch_id.value_counts().sort_index().rename('srch_id')


params = {'objective': 'rank:ndcg'}
xgb_rank = xgb.XGBRanker(**params)

# Features are everything but the two outcome columns; the relevance label
# is 1 for a click plus 4 for a booking.
x_train = dftrain.drop(['click_bool', 'booking_bool'], axis=1)
y_train = np.array(
    dftrain['click_bool']) + 4 * np.array(dftrain['booking_bool'])

# Positional row boundaries of the train / validation / test partitions.
TRAIN_END = 4000003
VAL_END = 4500022
TEST_END = 4958347

x_test = x_train.iloc[VAL_END:TEST_END].copy()
y_test = y_train[VAL_END:TEST_END].copy()
x_val = x_train.iloc[TRAIN_END:VAL_END].copy()
y_val = y_train[TRAIN_END:VAL_END].copy()
x_train = x_train.iloc[0:TRAIN_END]
y_train = y_train[0:TRAIN_END]

resultTest = x_test[['srch_id', 'prop_id']].copy()
# Reuse the positional test labels so they always line up with the rows of
# ``x_test``; a label-based ``.loc[a:b]`` slice includes its end point and
# only matches the positional slice by coincidence of the data size.
resultTest['click_bool'] = y_test.copy()
import xgboost as xgb
import pandas as pd

# Smoke script: fit an XGBRanker on the first 100 iris rows and print the
# resulting feature importances.
df = pd.read_csv('test/data/iris/iris.csv')
X = df.drop(columns=['Species'])
y = df['Species']
# y = y.replace(2, 1)

X_train = X[:100]
y_train = y[:100]
X_test = X[100:]
y_test = y[100:]

# Two query groups covering the 100 training rows (20 + 80).
group_train = [20, 80]

model = xgb.XGBRanker()
# Pass ``group`` by keyword: the positional form is deprecated and emits a
# FutureWarning.
model.fit(X_train, y_train, group=group_train)
# print(model.predict(X_test))
print(model.feature_importances_)