def setup_class(cls):
    """
    Prepare the shared ranking fixtures used by the tests.

    Loads the nine train/test/valid arrays returned by ``tm.get_mq2008``,
    wraps each feature/label pair in a ``DMatrix``, attaches per-query group
    sizes, and stores the raw query IDs and training parameters on ``cls``.
    """
    cls.dpath = 'demo/rank/'
    splits = tm.get_mq2008(cls.dpath)
    (x_train, y_train, qid_train,
     x_test, y_test, qid_test,
     x_valid, y_valid, qid_valid) = splits

    # build one DMatrix per split
    cls.dtrain = xgboost.DMatrix(x_train, y_train)
    cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
    cls.dtest = xgboost.DMatrix(x_test, y_test)

    # a group size is the length of each run of consecutive equal query IDs
    matrix_and_qids = (
        (cls.dtrain, qid_train),
        (cls.dtest, qid_test),
        (cls.dvalid, qid_valid),
    )
    for dmatrix, qids in matrix_and_qids:
        group_sizes = [
            sum(1 for _ in run) for _qid, run in itertools.groupby(qids)
        ]
        dmatrix.set_group(group_sizes)

    # keep the raw query IDs around so tests can refer to them
    cls.qid_train = qid_train
    cls.qid_test = qid_test
    cls.qid_valid = qid_valid

    # parameters passed to model training
    cls.params = {
        'objective': 'rank:pairwise',
        'booster': 'gbtree',
        'eval_metric': ['ndcg'],
    }
def test_dask_ranking(client: "Client") -> None:
    """Fit a ``DaskXGBRanker`` on the arrays from ``tm.get_mq2008`` and check it.

    Every array returned by the loader is converted to a dask array; CSR
    matrices are densified first. The ranker is trained with NDCG as the
    evaluation metric and early stopping, then the test asserts the number of
    input features (46) and that the best score exceeds 0.98.
    """
    arrays = []
    for part in tm.get_mq2008("demo/rank/"):
        if isinstance(part, scipy.sparse.csr_matrix):
            # Order matters here: entries comparing equal to zero in the sparse
            # matrix are tagged with inf, the matrix is densified, zeros that
            # remain in the dense array become NaN, and finally the inf tags
            # are turned back into 0.
            # NOTE(review): presumably this separates real zeros from missing
            # entries -- confirm against the loader's output.
            part[part == 0] = np.inf
            part = part.toarray()
            part[part == 0] = np.nan
            part[np.isinf(part)] = 0
        arrays.append(da.from_array(part))

    (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    ) = arrays

    # query IDs are cast to unsigned 32-bit integers before being passed on
    qid_train, qid_valid, qid_test = (
        qids.astype(np.uint32) for qids in (qid_train, qid_valid, qid_test)
    )

    ranker = xgb.dask.DaskXGBRanker(n_estimators=2500)
    ranker.fit(
        x_train,
        y_train,
        qid=qid_train,
        eval_set=[(x_test, y_test), (x_train, y_train)],
        eval_qid=[qid_test, qid_train],
        eval_metric=["ndcg"],
        verbose=True,
        early_stopping_rounds=10,
    )

    assert ranker.n_features_in_ == 46
    assert ranker.best_score > 0.98