def log_focal_loss_obj(preds: np.ndarray, dtrain: xgb.DMatrix):
    labels = dtrain.get_label()
    kRows, kClasses = preds.shape
    if dtrain.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = dtrain.get_weight()
    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)
    for r in range(kRows):
        target = int(labels[r])
        assert 0 <= target < kClasses
        p = softmax(preds[r, :])
        grad_r, hess_r = focal_logloss_derivative_gamma2(p, target)
        grad[r] = grad_r * weights[r]
        hess[r] = hess_r * weights[r]
    # Right now (XGBoost 1.0.0), reshaping is necessary.
    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))
    return grad, hess
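# A minimal usage sketch for the custom objective above (hedged: `X`, `y`,
# `softmax`, and `focal_logloss_derivative_gamma2` are assumed to exist in
# scope, and the parameter values, including the class count, are
# illustrative). Custom multiclass objectives are passed to xgb.train through
# the `obj` argument; `num_class` still has to be set in the booster params.
dtrain_focal = xgb.DMatrix(X, label=y)
booster = xgb.train({'num_class': 3, 'disable_default_eval_metric': 1},
                    dtrain_focal, num_boost_round=10, obj=log_focal_loss_obj)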
def predict_xgboost_answers(xgb_model):
    # Write the trained model's predictions on the test set as
    # ((codex, article), probability) pairs.
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"), "r",
              encoding="utf-8") as f:
        for line in f:
            group_test.append(int(line.strip()))
    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)
    prediction_answer = []
    for i, p in enumerate(pred):
        prediction_answer.append(
            (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def apply(self, X, ntree_limit=0):
    """Return the predicted leaf of every tree for each sample.

    Parameters
    ----------
    X : array_like, shape=[n_samples, n_features]
        Input features matrix.
    ntree_limit : int
        Limit number of trees in the prediction; defaults to 0 (use all trees).

    Returns
    -------
    X_leaves : array_like, shape=[n_samples, n_trees]
        For each datapoint x in X and for each tree, return the index of the
        leaf x ends up in. Leaves are numbered within
        ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
    """
    sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)
    test_dmatrix = DMatrix(X_features, missing=self.missing)
    test_dmatrix.set_group(sizes)
    X_leaves = self.get_booster().predict(test_dmatrix,
                                          pred_leaf=True,
                                          ntree_limit=ntree_limit)
    revert_group_indices = np.arange(
        len(group_indices))[group_indices.argsort()]
    X_leaves = X_leaves[revert_group_indices, :]
    return X_leaves
def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    '''Root mean squared log error metric.'''
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))
def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    price = dtrain.get_weight()
    return ((((np.log1p(price) * (y - predt) + predt + 1) / predt + 1)
             + (np.log1p(price) - 1) * np.log1p(predt) - np.log1p(y))
            / (np.log1p(price) * (y - predt) + predt + 1)**2)
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient for squared log error.'''
    y = dtrain.get_label()
    price = dtrain.get_weight()
    # Using the log here works best; perhaps apply log to the whole
    # denominator of the equation.
    return (np.log1p(predt) - np.log1p(y)) / (
        (predt + 1) + (np.log1p(price) * (y - predt)))
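# A minimal sketch of how the gradient/hessian pair above is typically wired
# into a custom objective for xgb.train (hedged: assumes the DMatrix carries
# the `price` values as weights, as both functions expect; the commented-out
# params and `dtrain` handle are illustrative).
def squared_log_obj(predt: np.ndarray, dtrain: xgb.DMatrix):
    # xgboost expects the per-sample gradient and hessian of the loss.
    return gradient(predt, dtrain), hessian(predt, dtrain)

# booster = xgb.train({'tree_method': 'hist'}, dtrain,
#                     num_boost_round=100, obj=squared_log_obj)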
def cv(X_t, y_t, X_test, y_test):
    '''
    Uses DMatrix, the optimized data structure offered by xgboost.
    Tests the xgboost train/cv APIs, to be used for hyperparameter tuning.
    '''
    dtrain = DMatrix(X_t, label=y_t)
    dtest = DMatrix(X_test, label=y_test)
    params = {"objective": "binary:logistic",
              'max_depth': 9,
              'min_child_weight': 2,
              'subsample': 0.8,
              'eta': 0.1,
              'alpha': 0.2,
              'lambda': 0.2,
              'eval_metric': 'auc'}
    num_boost_round = 313  # model.best_iteration + 1
    # fit
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                      evals=[(dtest, 'Test')])
    return model
def predict_xgboost_answers(xgb_model):
    # Write the trained model's predictions on the test set as
    # ((codex, article), probability) pairs.
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"), "r",
              encoding="utf-8") as f:
        for line in f:
            group_test.append(int(line.strip()))
    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)
    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    prediction_answer = []
    for p, doc_id in zip(
            pred, list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)):
        prediction_answer.append((doc_id, p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def predict(self, X, output_margin=False, ntree_limit=0):
    sizes, _, X_features, _, _ = _preprare_data_in_groups(X)
    test_dmatrix = DMatrix(X_features, missing=self.missing)
    test_dmatrix.set_group(sizes)
    rank_values = self.get_booster().predict(test_dmatrix,
                                             output_margin=output_margin,
                                             ntree_limit=ntree_limit)
    return rank_values
def predict(self, X, group=None, output_margin=False, ntree_limit=0):
    if group is None:
        # Treat all samples as a single group when none is given.
        group = [X.shape[0]]
    test_dmatrix = DMatrix(X, missing=self.missing)
    test_dmatrix.set_group(group)
    rank_values = self.booster().predict(test_dmatrix,
                                         output_margin=output_margin,
                                         ntree_limit=ntree_limit)
    return rank_values
def best_num_round(params, dall: xgb.DMatrix, cv_splits, verbose=True):
    params = _complete_params(params)
    train_idx, test_idx = cv_splits[-1]
    dtrain = dall.slice(train_idx)
    dtest = dall.slice(test_idx)
    bst = xgb.train(params, dtrain,
                    early_stopping_rounds=50,
                    num_boost_round=500,
                    evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
                    feval=_xgb_feval,
                    verbose_eval=verbose)
    return bst.best_ntree_limit
def get_pairs_rank_score(loaded_model, text_list):
    test_group_list, test_data_list, test_target_list = test_data_generation(
        text_list)
    xgbTest = DMatrix(np.asmatrix(test_data_list), label=test_target_list)
    xgbTest.set_group(test_group_list)
    results = loaded_model.predict(xgbTest)
    return results
def to_dmatrix(data, labels=None):
    if isinstance(data, (pd.DataFrame, pd.Series)):
        if labels is not None and isinstance(labels, (pd.DataFrame, pd.Series)):
            return DMatrix(data.values, labels.values)
        return DMatrix(data.values)
    if labels is not None:
        return DMatrix(data, labels)
    return DMatrix(data)
def test_dmatrix_creator(self):
    # This function acts as a pseudo-itertools.chain()
    def row_tup_iter(data):
        pdf = pd.DataFrame(data)
        yield pdf

    # Standard testing DMatrix creation
    expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    expected_labels = np.array([1, 0] * 100)
    expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )
    # You can't compare DMatrix objects directly, so the only way is to
    # predict on the two separate DMatrices using the same classifier and
    # make sure the outputs are equal.
    model = XGBClassifier()
    model.fit(expected_features, expected_labels)
    expected_preds = model.get_booster().predict(expected_dmatrix)
    output_preds = model.get_booster().predict(output_dmatrix)
    self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

    # DMatrix creation with weights
    expected_weight = np.array([0.2, 0.8] * 100)
    expected_dmatrix = DMatrix(data=expected_features,
                               label=expected_labels,
                               weight=expected_weight)

    data["weight"] = [0.2, 0.8] * 100
    output_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )

    model.fit(expected_features, expected_labels,
              sample_weight=expected_weight)
    expected_preds = model.get_booster().predict(expected_dmatrix)
    output_preds = model.get_booster().predict(output_dmatrix)
    self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
def main():
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(
        p.train.xgboost_test_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_train = x_train.todense()
    x_train = np.concatenate([
        x_train,
        x_train[:, -2] / x_train[:, 2],
        x_train[:, -1] / x_train[:, 4],
    ], 1)
    x_test = x_test.todense()
    x_test = np.concatenate([
        x_test,
        x_test[:, -2] / x_test[:, 2],
        x_test[:, -1] / x_test[:, 4],
    ], 1)
    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
    test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6,
    }
    xgb_model = xgb.train(params, train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])
    xgb_train_str = items_to_str(
        _.omit(params, 'objective', 'eval_metric').items(),
        sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
    '''
    Makes n folds in input data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        X data to be trained
    y_train : pandas.DataFrame
        y data to be trained
    nfold : int
        Number of folds in CV.
    param : dict
        Booster params
    evals : list
        Evaluation metrics to be watched in CV.
    features : list
        Features selected to be trained

    Returns
    -------
    ret : list
        List of CVPack objects containing the training and testing DMatrix
        and the list of parameters and metrics to use for every fold
    wt_list : list
        List of weights for each fold. This is the size of each fold
    '''
    if not features:
        features = X_train.columns
    out_idset, wt_list = bin_fold(X_train, nfold)
    in_idset = [
        np.concatenate([out_idset[i] for i in range(nfold) if k != i])
        for k in range(nfold)
    ]
    evals = list(evals)
    ret = []
    for k in range(nfold):
        # Perform the slicing using the indexes determined above.
        x_train_snip = X_train.loc[in_idset[k]][features]
        y_train_snip = X_train.loc[in_idset[k]]['encoded_target']
        x_test_snip = X_train.loc[out_idset[k]][features]
        y_test_snip = X_train.loc[out_idset[k]]['encoded_target']
        dtrain = DMatrix(x_train_snip, label=y_train_snip)
        dtest = DMatrix(x_test_snip, label=y_test_snip)
        tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret, wt_list
def _convert_partition_data_to_dmatrix(
    partition_data_iter,
    has_weight,
    has_validation,
    has_base_margin,
    dmatrix_kwargs=None,
):
    # pylint: disable=too-many-locals, unbalanced-tuple-unpacking
    dmatrix_kwargs = dmatrix_kwargs or {}
    # If we are not using external storage, we use the standard method of
    # parsing data.
    train_val_data = _prepare_train_val_data(partition_data_iter, has_weight,
                                             has_validation, has_base_margin)
    if has_validation:
        (
            train_x,
            train_y,
            train_w,
            train_b_m,
            val_x,
            val_y,
            val_w,
            val_b_m,
        ) = train_val_data
        training_dmatrix = DMatrix(
            data=train_x,
            label=train_y,
            weight=train_w,
            base_margin=train_b_m,
            **dmatrix_kwargs,
        )
        val_dmatrix = DMatrix(
            data=val_x,
            label=val_y,
            weight=val_w,
            base_margin=val_b_m,
            **dmatrix_kwargs,
        )
        return training_dmatrix, val_dmatrix

    train_x, train_y, train_w, train_b_m = train_val_data
    training_dmatrix = DMatrix(
        data=train_x,
        label=train_y,
        weight=train_w,
        base_margin=train_b_m,
        **dmatrix_kwargs,
    )
    return training_dmatrix
def test_external_storage(self):
    # Instantiating base data (features, labels)
    features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    labels = np.array([1, 0] * 100)
    normal_dmatrix = DMatrix(features, labels)
    test_dmatrix = DMatrix(features)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }

    # Creating the dmatrix based on storage
    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )

    # Testing without weights
    normal_booster = worker_train({}, normal_dmatrix)
    storage_booster = worker_train({}, storage_dmatrix)
    normal_preds = normal_booster.predict(test_dmatrix)
    storage_preds = storage_booster.predict(test_dmatrix)
    self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
    shutil.rmtree(temporary_path)

    # Testing weights
    weights = np.array([0.2, 0.8] * 100)
    normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)

    data["weight"] = [0.2, 0.8] * 100
    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )

    normal_booster = worker_train({}, normal_dmatrix)
    storage_booster = worker_train({}, storage_dmatrix)
    normal_preds = normal_booster.predict(test_dmatrix)
    storage_preds = storage_booster.predict(test_dmatrix)
    self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
    shutil.rmtree(temporary_path)
def train(self, train_x, train_y):
    """Train an xgboost_lr model

    Arguments:
        train_x {[type]} -- [description]
        train_y {[type]} -- [description]
    """
    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    xgb_eval_result = self.xgb_clf.evals_result()
    print("train eval result: ", xgb_eval_result)

    train_x_mat = DMatrix(train_x)
    # Get the boosted tree leaf info.
    train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                            pred_leaf=True)
    print(train_xgb_pred_mat)
    train_lr_feature_mat = self.one_hot_encoder.fit_transform(
        train_xgb_pred_mat)
    print('train_mat:', train_lr_feature_mat.shape)
    print('train_mat array:', train_lr_feature_mat.toarray())

    # Fit the logistic regression on the one-hot encoded leaf indices.
    self.lr_clf.fit(train_lr_feature_mat, train_y)
    self.init = True
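# A hedged sketch of the matching inference step for the GBDT+LR pipeline
# above (`predict_proba_sketch` is a hypothetical helper, not part of the
# original class): map rows to leaf indices with the trained booster, one-hot
# encode them with the already-fitted encoder, and score with the logistic
# regression.
def predict_proba_sketch(model, test_x):
    test_x_mat = DMatrix(test_x)
    leaf_ids = model.xgb_clf.get_booster().predict(test_x_mat, pred_leaf=True)
    lr_features = model.one_hot_encoder.transform(leaf_ids)
    return model.lr_clf.predict_proba(lr_features)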
def test_xgboost_booster_classifier_reg(self):
    x, y = make_classification(n_classes=2, n_features=5, n_samples=100,
                               random_state=42, n_informative=3)
    y = y.astype(np.float32) + 0.567
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
                                                   random_state=42)

    data = DMatrix(x_train, label=y_train)
    model = train({'objective': 'reg:squarederror',
                   'n_estimators': 3,
                   'min_child_samples': 1}, data)
    model_onnx = convert_xgboost(
        model, 'tree-based classifier',
        [('input', FloatTensorType([None, x.shape[1]]))])
    dump_data_and_model(
        x_test.astype(np.float32), model, model_onnx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBBoosterReg")
def __classify(self, path):
    files = [self.parse_pe(path)]
    df = pd.DataFrame(files)
    df = df.drop(['sha256', 'size'], axis=1)

    # Explode the nested section and import records into flat frames.
    sections = df['sections'].apply(pd.Series).stack() \
        .reset_index(level=1, drop=True).apply(pd.Series)
    imports = df['import'].apply(pd.Series).stack() \
        .reset_index(level=1, drop=True).apply(pd.Series)
    imports = imports.reset_index().set_index(['index', 'dll'])
    imports = imports['symbols'].apply(pd.Series).stack() \
        .reset_index(level=2, drop=True) \
        .to_frame('import').reset_index().set_index('index')

    join = sections.join(imports).fillna(0)
    join['SectionName'] = join['SectionName'].astype('str')
    join['dll'] = join['dll'].astype('str')
    join['import'] = join['import'].astype('str')

    string_columns = ['SectionName', 'dll', 'import']
    matrix = self.ohe.transform(join[string_columns])
    index = join.index
    rows = []
    for i in index.unique():
        select = index.slice_indexer(start=i, end=i)
        rows.append(csr_matrix(matrix[select].sum(axis=0)))
    join_encoded = pd.DataFrame(data={'matrix': rows})

    df = df.drop(['sections', 'import'], axis=1)
    df = df.join(join_encoded)
    X = df.apply(
        lambda x: hstack((x.drop('matrix').astype('int64').values,
                          x['matrix'])).T,
        axis=1)
    X = hstack(X.values).T
    X = X.todok().toarray()
    return self.booster.predict(DMatrix(X))[0]
def get_xgb_dmatrix(tup):
    from xgboost import DMatrix
    data, label, weight, missing, feature_names, feature_types = tup
    return DMatrix(data, label=label, missing=missing, weight=weight,
                   feature_names=feature_names, feature_types=feature_types,
                   nthread=-1)
def predict(self, smiles, get_features=get_fp, use_tqdm=False):
    canonical_smiles = []
    invalid_smiles = []
    if use_tqdm:
        pbar = tqdm(range(len(smiles)))
    else:
        pbar = range(len(smiles))
    for i in pbar:
        sm = smiles[i]
        if use_tqdm:
            pbar.set_description("Calculating predictions...")
        try:
            sm = Chem.MolToSmiles(Chem.MolFromSmiles(sm, sanitize=False))
            if len(sm) == 0:
                invalid_smiles.append(sm)
            else:
                canonical_smiles.append(sm)
        except Exception:
            invalid_smiles.append(sm)
    if len(canonical_smiles) == 0:
        return canonical_smiles, [], invalid_smiles
    prediction = []
    x, _, _ = get_features(canonical_smiles, sanitize=False)
    x = DMatrix(x)
    for i in range(len(self.models)):
        y_pred = self.models[i].predict(x)
        if self.transformer is not None:
            y_pred = self.transformer.inverse_transform(y_pred)
        prediction.append(y_pred)
    # Average the per-model predictions of the ensemble.
    prediction = np.array(prediction)
    prediction = np.mean(prediction, axis=0)
    return canonical_smiles, prediction, invalid_smiles
def fair_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''FairXGB error metric.'''
    # predt is the prediction array.
    # Find the right protected group vector.
    if len(predt) == len(protected_train):
        protected_feature = np.array(protected_train.copy())
    elif len(predt) == len(protected_full):
        protected_feature = np.array(protected_full.copy())
    elif len(predt) == len(protected_valid):
        protected_feature = np.array(protected_valid.copy())
    else:
        protected_feature = 0
    y = dtrain.get_label()
    answer = -y * np.log(sigmoid(predt)) \
        - (1 - y) * np.log(1 - sigmoid(predt))
    answer += mu * (protected_feature * np.log(sigmoid(predt))
                    + (1 - protected_feature) * np.log(1 - sigmoid(predt)))
    return 'Fair_Metric', float(np.sum(answer) / len(answer))
def trainModel(self, train_x, train_y):
    # Train an xgboost model.
    self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
    self.xgb_clf.fit(train_x, train_y,
                     eval_metric=self.xgb_eval_metric,
                     eval_set=[(train_x, train_y)])
    xgb_eval_result = self.xgb_clf.evals_result()
    print('XGB_train eval_result:', xgb_eval_result)

    train_x_mat = DMatrix(train_x)
    train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                            pred_leaf=True)
    self.one_hot_encoder = OneHotEncoder()
    train_lr_feature_mat = self.one_hot_encoder.fit_transform(
        train_xgb_pred_mat)
    print('train_mat:', train_lr_feature_mat.shape)

    # Train an LR model on the one-hot encoded leaf indices.
    self.lr_clf = LR()
    self.lr_clf.fit(train_lr_feature_mat, train_y)
    self.init_flag = True

    pickle.dump(self.xgb_clf, open(self.xgb_model_name, 'wb'), True)
    pickle.dump(self.lr_clf, open(self.lr_model_name, 'wb'), True)
    pickle.dump(self.one_hot_encoder,
                open(self.one_hot_encoder_model_name, 'wb'), True)
    print('Train xgboost and lr model done')
def rmsle(predt: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    """Root mean squared log error metric."""
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return "my_rmsle", float(np.sqrt(np.sum(elements) / len(y))) + DEBUG_ERROR
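# A hedged usage sketch for a custom metric like rmsle above: it is passed to
# xgb.train via `feval` (newer releases name this `custom_metric`). `params`,
# `dtrain`, and `dtest` are assumed to exist in scope; values illustrative.
# booster = xgb.train(params, dtrain, num_boost_round=10,
#                     evals=[(dtest, 'test')], feval=rmsle)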
def _prediction_feature_weights(xgb, X, feature_names, xgb_feature_names):
    """ For each target, return score and numpy array with feature weights
    on this prediction, following an idea from
    http://blog.datadive.net/interpreting-random-forests/
    """
    # XGBClassifier does not have a pred_leaf argument, so use the booster.
    booster = xgb.booster()  # type: Booster
    leaf_ids, = booster.predict(DMatrix(X, missing=xgb.missing),
                                pred_leaf=True)
    xgb_feature_names = {f: i for i, f in enumerate(xgb_feature_names)}
    tree_dumps = booster.get_dump(with_stats=True)
    assert len(tree_dumps) == len(leaf_ids)
    target_feature_weights = partial(_target_feature_weights,
                                     feature_names=feature_names,
                                     xgb_feature_names=xgb_feature_names)
    n_targets = _xgb_n_targets(xgb)
    if n_targets > 1:
        # For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
        # so we need to split them.
        scores_weights = [
            target_feature_weights(
                leaf_ids[target_idx::n_targets],
                tree_dumps[target_idx::n_targets],
            ) for target_idx in range(n_targets)
        ]
    else:
        scores_weights = [target_feature_weights(leaf_ids, tree_dumps)]
    return scores_weights
def to_xgboost(self, **kwargs):
    from xgboost import DMatrix
    dmatrix = DMatrix(self.to_numpy(**kwargs))
    ## TODO: Uncomment when XGB observation models are implemented
    # dmatrix.observation_model = self.observation_model(backend="xgboost",
    #                                                    loss="mse")
    return dmatrix
def predict(self, x, **kwargs):
    """
    Perform prediction for a batch of inputs.

    :param x: Test set.
    :type x: `np.ndarray`
    :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
    :rtype: `np.ndarray`
    """
    from xgboost import Booster, XGBClassifier
    from art.utils import to_categorical

    # Apply preprocessing
    x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

    if isinstance(self._model, Booster):
        from xgboost import DMatrix
        train_data = DMatrix(x_preprocessed, label=None)
        predictions = self._model.predict(train_data)
        y_prediction = np.asarray([line for line in predictions])
        if len(y_prediction.shape) == 1:
            y_prediction = to_categorical(labels=y_prediction,
                                          nb_classes=self.nb_classes())
    elif isinstance(self._model, XGBClassifier):
        y_prediction = self._model.predict_proba(x_preprocessed)

    # Apply postprocessing
    y_prediction = self._apply_postprocessing(preds=y_prediction, fit=False)

    return y_prediction
def update(self, Xtrain, ytrain, Xval, yval, scoring, n_iterations):
    dtrain = DMatrix(data=Xtrain, label=ytrain)
    early_stop_callback = early_stop()
    if not self.env['earlier_stop']:
        for i in range(n_iterations - self.model.n_estimators):
            # Note: this is a getter, but the internal booster in
            # XGBClassifier is also updated; add a unit test to guard
            # against future changes.
            self.model.get_booster().update(
                dtrain, iteration=self.model.n_estimators)
            self.model.n_estimators += 1
            score = scoring(self, Xval, yval)
            if score > self.env['best_score']:
                self.env['best_score'] = score
                self.env['best_iteration'] = self.model.n_estimators
            try:
                early_stop_callback(env=self.env, score=score,
                                    iteration=self.model.n_estimators)
            except EarlyStopException:
                print('Update stopped earlier! @ {} instead of {}'.format(
                    self.model.n_estimators, n_iterations))
                self.env['earlier_stop'] = True
                break
##################################################################
## DMatrix
## Generate the training dataset:
## 2 groups * 3 rows per group = 6 samples, with 2 features each.
n_group = 2
n_choice = 3
dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2])
print(dtrain.shape)  # (6, 2)
# numpy.random.choice(a, size=None, replace=True, p=None)
dtarget = np.array([np.random.choice([0, 1, 2], 3, False)
                    for i in range(n_group)]).flatten()
print(dtarget)  # e.g. [1 0 2 1 0 2]
# dgroup lists, front to back, how many samples each group has; this assumes
# the groups are contiguous in the sample matrix: [3, 3] means that of the 6
# samples the first 3 belong to the first group and the last 3 to the second.
dgroup = np.array([n_choice for i in range(n_group)]).flatten()
print(dgroup)  # [3 3]

# Attach the group sizes to the training data -- very important here!
xgbTrain = DMatrix(dtrain, label=dtarget)
xgbTrain.set_group(dgroup)

# Generate eval data.
dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2])
print(dtrain_eval.shape)  # (6, 2)
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)
xgbTrain_eval.set_group(dgroup)
evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]

# Train the model.
# Adding the evals parameter with xgb_rank_params1 raises an error; the cause
# has not been found yet.
# rankModel = train(xgb_rank_params1, xgbTrain, num_boost_round=10)
rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=20,
                  evals=evallist)

# Test dataset.
dtest = np.random.uniform(0, 100, [n_group * n_choice, 2])
print(dtest.shape)  # (6, 2)
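# A hedged continuation sketch: wrap the test samples in a DMatrix with the
# same group sizes and score them with the trained ranker (this mirrors the
# predict pattern used in the other snippets here).
xgbTest = DMatrix(dtest)
xgbTest.set_group(dgroup)
print(rankModel.predict(xgbTest))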
group_train = []
with open("mq2008.train.group", "r") as f:
    for line in f:
        group_train.append(int(line.strip()))

group_valid = []
with open("mq2008.vali.group", "r") as f:
    for line in f:
        group_valid.append(int(line.strip()))

group_test = []
with open("mq2008.test.group", "r") as f:
    for line in f:
        group_test.append(int(line.strip()))

train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)
train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0,
          'min_child_weight': 0.1, 'max_depth': 6}
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                      evals=[(valid_dmatrix, 'validation')])
pred = xgb_model.predict(test_dmatrix)
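# A hedged sketch of consuming the flat ranking predictions above: per-query
# slices can be recovered from the group sizes, e.g. to order documents
# within each query (assumes `pred` and `group_test` from the lines above).
offsets = np.cumsum([0] + group_test)
per_query_order = [np.argsort(-pred[offsets[i]:offsets[i + 1]])
                   for i in range(len(group_test))]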