def __init__(self, train_qset, valid_qset, test_qset, ranker_params, fit_params,
             click_model, total_number_of_clicked_queries=10000,
             learn_from_random=False):
    """Set up an offline ranker and a click-trained ranker over the query sets.

    Args:
        train_qset / valid_qset / test_qset: query-set objects for each split.
        ranker_params: kwargs forwarded to ``gbm.LGBMRanker``.
        fit_params: kwargs forwarded to the fit calls.
        click_model: click simulator used to generate click training data.
        total_number_of_clicked_queries: budget passed to ``click_fit``.
        learn_from_random: when True, skip offline training (no offline model).
    """
    # Encode the key hyper-parameters into the experiment name.
    self.name = self.name + '-' + 'n_estimators-%d-learning_rate%.2f' % \
        (ranker_params['n_estimators'], ranker_params['learning_rate'])
    self.fit_params = fit_params
    self.click_model = click_model
    self.offline_train_qset = train_qset
    self.offline_valid_qset = valid_qset
    self.offline_test_qset = test_qset
    self.offline_qset = {
        'train': train_qset,
        'test': test_qset,
        'valid': valid_qset
    }
    self.ranker_params = ranker_params
    # Fixed: ``self.fit_params`` was previously assigned a second time here,
    # a redundant duplicate of the assignment above.
    if learn_from_random:
        # A random ranker is represented by the absence of a trained model.
        self.offline_ranker = None
    else:
        self.offline_ranker = gbm.LGBMRanker(**self.ranker_params)
        self.offline_fit()
    self.click_ranker = gbm.LGBMRanker(**self.ranker_params)
    self.click_fit(total_number_of_clicked_queries)
def testLocalRanker(self):
    """End-to-end check of the distributed LGBMRanker wrapper.

    Covers plain fitting, fitting with sample weights, and wrapping a
    locally trained lightgbm model, verifying prediction shape and dtype.
    """
    X, y = self.X, self.y
    # Scale float labels into small integer relevance grades.
    y = (y * 10).astype(mt.int32)
    ranker = LGBMRanker(n_estimators=2)
    # A single query group spanning the whole dataset.
    ranker.fit(X, y, group=[X.shape[0]], verbose=True)
    prediction = ranker.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    self.assertIsInstance(prediction, mt.Tensor)
    result = prediction.fetch()
    self.assertEqual(prediction.dtype, result.dtype)

    # test weight
    weight = mt.random.rand(X.shape[0])
    ranker = LGBMRanker(verbosity=1, n_estimators=2)
    ranker.fit(X, y, group=[X.shape[0]], sample_weight=weight)
    prediction = ranker.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    result = prediction.fetch()
    self.assertEqual(prediction.dtype, result.dtype)

    # test local model: materialize data, train raw lightgbm, wrap and predict.
    X_np = X.execute(session=self.session).fetch(session=self.session)
    y_np = y.execute(session=self.session).fetch(session=self.session)
    raw_ranker = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
    raw_ranker.fit(X_np, y_np, group=[X.shape[0]])
    prediction = LGBMRanker(raw_ranker).predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
def test_lambdarank(self):
    """Smoke-test LGBMRanker on the bundled lambdarank example data."""
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def _example(name):
        # Resolve a file under the shipped lambdarank example directory.
        return os.path.join(base_dir, '../../examples/lambdarank/' + name)

    X_train, y_train = load_svmlight_file(_example('rank.train'))
    X_test, y_test = load_svmlight_file(_example('rank.test'))
    q_train = np.loadtxt(_example('rank.train.query'))
    q_test = np.loadtxt(_example('rank.test.query'))
    ranker = lgb.LGBMRanker()
    ranker.fit(X_train, y_train, group=q_train,
               eval_set=[(X_test, y_test)], eval_group=[q_test],
               eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
               callbacks=[
                   lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)
               ])
def train(self, tabular_path: str, join_result_path: str, model_path: str, model_weights_path=None, histogram_path=None) -> None: """ Train a classification model for spatial join cost estimator, then save the trained model to file """ # Extract train and test data, but only use train data X_train, y_train = datasets.load_data(tabular_path, RankingModel.TARGET, RankingModel.DROP_COLUMNS) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1) query_train = [X_train.shape[0]] query_val = [X_val.shape[0]] gbm = lgb.LGBMRanker() model = gbm.fit(X_train, y_train, group=query_train, eval_set=[(X_val, y_val)], eval_group=[query_val], eval_at=[1, 2], early_stopping_rounds=50) # Fit and save the model # model = self.rnk_model.fit(X_train, y_train) pickle.dump(model, open(model_path, 'wb'))
def _run_lgbm_ranker_converter(self, num_classes, extra_config={}, label_gain=None):
    """Convert LGBMRankers of several depths to torch and compare predictions."""
    warnings.filterwarnings("ignore")
    for depth in [1, 3, 8, 10, 12, None]:
        ranker = lgb.LGBMRanker(n_estimators=10, max_depth=depth, label_gain=label_gain)
        # Fixed seed keeps the synthetic dataset identical for every depth.
        np.random.seed(0)
        features = np.array(np.random.rand(100, 200), dtype=np.float32)
        labels = np.random.randint(num_classes, size=100)
        # All 100 rows form a single query group.
        ranker.fit(features, labels, group=[features.shape[0]],
                   eval_set=[(features, labels)], eval_group=[features.shape[0]])
        converted = hummingbird.ml.convert(ranker, "torch", extra_config=extra_config)
        self.assertIsNotNone(converted)
        np.testing.assert_allclose(ranker.predict(features), converted.predict(features),
                                   rtol=1e-06, atol=1e-06)
def update_ranker(self, ranker_params, fit_params):
    """Train a fresh ranker on the click-derived training data.

    Uses the data accumulated by ``generate_training_data_from_clicks``; when
    ``fit_params`` requests early stopping, a validation set is sampled from
    the held-out queries.

    Args:
        ranker_params: kwargs for ``gbm.LGBMRanker``.
        fit_params: kwargs for ``LGBMRanker.fit``; presence of
            'early_stopping_rounds' triggers validation-set construction.

    Returns:
        The fitted ranker.

    Raises:
        ValueError: if no click training data has been generated yet.
    """
    if self.observed_training_data:
        # Each observed_training_data entry appears to be
        # (query_indices, labels, group_sizes) — TODO confirm against producer.
        train_indices = [inds for otd in self.observed_training_data for inds in otd[0]]
        train_features = np.concatenate([self.train_qset.feature_vectors[inds]
                                         for inds in train_indices])
        train_labels = np.concatenate([otd[1] for otd in self.observed_training_data])
        train_query_group = np.concatenate([otd[2] for otd in self.observed_training_data])
    else:
        raise ValueError('OnlineLTR.generate_training_data_from_clicks()'
                         'should be called before OnlineLTR.update_ranker().')
    ranker = gbm.LGBMRanker(**ranker_params)
    if 'early_stopping_rounds' in fit_params:
        # Match the validation sample size to the most recent click batch.
        num_queries = len(self.observed_training_data[-1][0])
        valid_query_ids = self.sample_query_ids(num_queries, data='valid')
        valid_labels = np.concatenate([self.valid_qset[qid].relevance_scores
                                       for qid in valid_query_ids])
        valid_features = self.valid_qset[valid_query_ids].feature_vectors
        valid_query_group = [self.valid_qset[qid].document_count()
                             for qid in valid_query_ids]
        ranker.fit(X=train_features, y=train_labels, group=train_query_group,
                   eval_set=[(valid_features, valid_labels)],
                   eval_group=[valid_query_group], **fit_params)
    else:
        ranker.fit(X=train_features, y=train_labels, group=train_query_group,
                   **fit_params)
    return ranker
def test_local_ranker(setup):
    """Exercise LGBMRanker fitting, sample weighting, and wrapping a raw model."""
    labels = (y_raw * 10).astype(mt.int32)
    model = LGBMRanker(n_estimators=2)
    # One query group covering every row.
    model.fit(X_raw, labels, group=[X_raw.shape[0]], verbose=True)
    pred = model.predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)
    assert isinstance(pred, mt.Tensor)
    fetched = pred.fetch()
    assert pred.dtype == fetched.dtype

    # test weight
    sample_w = mt.random.rand(X_raw.shape[0])
    model = LGBMRanker(verbosity=1, n_estimators=2)
    model.fit(X_raw, labels, group=[X_raw.shape[0]], sample_weight=sample_w)
    pred = model.predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)
    fetched = pred.fetch()
    assert pred.dtype == fetched.dtype

    # test local model
    X_np = X_raw.execute().fetch()
    y_np = labels.execute().fetch()
    raw_model = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
    raw_model.fit(X_np, y_np, group=[X_raw.shape[0]])
    pred = LGBMRanker(raw_model).predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)
def testLocalRanker(self):
    """Check LGBMRanker fit/predict, sample weights, and predicting with a raw model.

    NOTE(review): locals were previously named ``regressor``/``classifier``
    even though every model here is a ranker; renamed for clarity (no
    behavioral change).
    """
    X, y = self.X, self.y
    # Scale float labels into small integer relevance grades.
    y = (y * 10).astype(mt.int32)
    ranker = LGBMRanker(n_estimators=2)
    # A single query group spanning the whole dataset.
    ranker.fit(X, y, group=[X.shape[0]], verbose=True)
    prediction = ranker.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    self.assertIsInstance(prediction, mt.Tensor)
    result = prediction.fetch()
    self.assertEqual(prediction.dtype, result.dtype)

    # test weight
    weight = mt.random.rand(X.shape[0])
    ranker = LGBMRanker(verbosity=1, n_estimators=2)
    ranker.fit(X, y, group=[X.shape[0]], sample_weight=weight)
    prediction = ranker.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    result = prediction.fetch()
    self.assertEqual(prediction.dtype, result.dtype)

    # test local model: train raw lightgbm, predict through the module helper.
    raw_ranker = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
    raw_ranker.fit(X, y, group=[X.shape[0]])
    prediction = predict(raw_ranker, X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
def lgb_main(train_final_df, val_final_df=None):
    """Fit an LGBMRanker on per-user groups; also evaluates when mode == 'offline'."""
    print('ranker begin....')
    # Rows must be ordered by user so the per-query group sizes line up.
    train_final_df.sort_values(by=['user_id'], inplace=True)
    train_groups = train_final_df.groupby(['user_id'],
                                          as_index=False).count()["label"].values
    if mode == 'offline':
        val_final_df = val_final_df.sort_values(by=['user_id'])
        val_groups = val_final_df.groupby(['user_id'],
                                          as_index=False).count()["label"].values
    ranker = lgb.LGBMRanker(boosting_type='gbdt',
                            num_leaves=31,
                            reg_alpha=0.0,
                            reg_lambda=1,
                            max_depth=-1,
                            n_estimators=300,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            subsample_freq=1,
                            learning_rate=0.01,
                            min_child_weight=50,
                            random_state=2018,
                            n_jobs=-1)
    # 300epoch, best, 0.882898, dense_feat + hist_cnt_sim_feat user_interest_dense_feat
    if mode == 'offline':
        ranker.fit(train_final_df[lgb_cols], train_final_df['label'],
                   group=train_groups,
                   eval_set=[(val_final_df[lgb_cols], val_final_df['label'])],
                   eval_group=[val_groups],
                   eval_at=[50],
                   eval_metric=['auc', ],
                   early_stopping_rounds=50, )
    else:
        ranker.fit(train_final_df[lgb_cols], train_final_df['label'],
                   group=train_groups)
    print('train done...')
    return ranker
def test_lightgbm_ranking():
    """Train a LightGBM ranker on shap's ranking dataset and validate SHAP values."""
    try:
        import lightgbm
    except ImportError:
        # Fixed: a bare ``except:`` previously swallowed every exception
        # (including KeyboardInterrupt/SystemExit); only a missing package
        # should cause the skip.
        print("Skipping test_lightgbm_ranking!")
        return
    import shap
    import numpy as np

    # train lightgbm ranker model
    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()
    model = lightgbm.LGBMRanker()
    model.fit(x_train, y_train, group=q_train,
              eval_set=[(x_test, y_test)], eval_group=[q_test],
              eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
              callbacks=[
                  lightgbm.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)
              ])
    _validate_shap_values(model, x_test)
def train(tmp_dir, output_model, num_leaves=16, max_depth=-1, learning_rate=0.1,
          n_estimators=100, min_child_samples=5, feature_name='auto', task='SA'):
    """Fit an LGBMRanker from svmlight files in ``tmp_dir`` and save its booster."""
    features_path = os.path.join(tmp_dir, f"train_{task}.csv")
    groups_path = os.path.join(tmp_dir, f"train_{task}_size.csv")
    X_train, y_train = load_svmlight_file(features_path)
    print('Training in prog...')
    ranker = lgb.LGBMRanker(boosting_type='gbdt',
                            num_leaves=num_leaves,
                            max_depth=max_depth,
                            learning_rate=learning_rate,
                            n_estimators=n_estimators,
                            min_child_samples=min_child_samples)
    # Per-query group sizes come from the companion *_size.csv file.
    ranker.fit(X_train, y_train, group=np.loadtxt(groups_path),
               feature_name=feature_name)
    ranker.booster_.save_model(output_model)
    print(f'Model saved at {output_model}')
def test_lambdarank(self):
    """Train on the lambdarank example with early stopping and check NDCG quality."""
    here = os.path.dirname(os.path.realpath(__file__))

    def _example(name):
        # Resolve a file under the shipped lambdarank example directory.
        return os.path.join(here, '../../examples/lambdarank/' + name)

    X_train, y_train = load_svmlight_file(_example('rank.train'))
    X_test, y_test = load_svmlight_file(_example('rank.test'))
    q_train = np.loadtxt(_example('rank.train.query'))
    q_test = np.loadtxt(_example('rank.test.query'))
    ranker = lgb.LGBMRanker()
    ranker.fit(X_train, y_train, group=q_train,
               eval_set=[(X_test, y_test)], eval_group=[q_test],
               eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
               callbacks=[
                   lgb.reset_parameter(
                       learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
               ])
    # Early stopping should trigger well before the default iteration cap.
    self.assertLessEqual(ranker.best_iteration_, 25)
    self.assertGreater(ranker.best_score_['valid_0']['ndcg@1'], 0.6333)
    self.assertGreater(ranker.best_score_['valid_0']['ndcg@3'], 0.6048)
def fit_evaluate(self, df_train: pd.DataFrame, df_val: pd.DataFrame,
                 verbose=False, features=None, **kwargs) -> EvaluationResult:
    """Fit an LGBMRanker on ``df_train``, early-stopping on ``df_val``, then evaluate.

    Both frames are accessed through a ``.ext`` accessor that exposes X, y and
    per-query group sizes — presumably a project-registered pandas accessor;
    verify against its definition.
    """
    X_train = df_train.ext.X
    y_train = df_train.ext.y
    groups_train = df_train.ext.group_sizes
    X_val = df_val.ext.X
    y_val = df_val.ext.y
    groups_val = df_val.ext.group_sizes
    # Sanity checks: group sizes must exactly partition the rows of each split.
    assert len(X_train) == len(y_train) == sum(
        groups_train
    ), f"Sizes are not equal - Xt: {len(X_train)}, yt {len(y_train)}, Gt:{ sum(groups_train)}"
    assert len(X_val) == len(y_val) == sum(
        groups_val
    ), f"Sizes are not equal - Xv: {len(X_val)}, yv {len(y_val)}, Gv:{sum(groups_val)}"
    gbm = lgb.LGBMRanker(boosting_type="gbdt", class_weight="balanced",
                         n_estimators=200)
    gbm.fit(X_train, y_train,
            group=groups_train,
            eval_set=[(X_val, y_val)],
            eval_group=[groups_val],
            early_stopping_rounds=10,
            feature_name=features,
            verbose=verbose)
    # Keep the fitted model on the instance, then score the validation frame.
    self.model = gbm
    return self.evaluate(df_val)
def __init__(self, mode='train',
             model_path=os.path.join(Config.root_path, 'model/ranking/lightgbm')):
    """Build ranking features and either train a new LGBMRanker or load a saved one.

    Args:
        mode: 'train' generates features, fits and saves a model;
            any other value loads the persisted model from ``model_path``.
        model_path: location of the persisted ranker.
    """
    self.ts = TextSimilarity()
    self.matchingNN = MatchNN()
    # ``params`` is a module-level hyper-parameter dict defined elsewhere.
    self.ranker = lgb.LGBMRanker(**params)
    self.train_data = pd.read_csv(os.path.join(Config.root_path,
                                               'data/ranking/train.tsv'),
                                  sep='\t', header=0, quoting=csv.QUOTE_NONE)
    self.dev_data = pd.read_csv(os.path.join(Config.root_path,
                                             'data/ranking/dev.tsv'),
                                sep='\t', header=0, quoting=csv.QUOTE_NONE)
    if mode == 'train':
        logging.info('Training mode')
        self.train_data = self.generate_feature(self.train_data, 'train')
        logging.info("train_data columns: {}".format(
            self.train_data.columns))
        logging.info("train_data shape: {}".format(self.train_data.shape))
        logging.info("train_data: {}".format(self.train_data[:5]))
        self.dev_data = self.generate_feature(self.dev_data, 'dev')
        logging.info("dev_data shape: {}".format(self.dev_data.shape))
        # trainer() fits and returns the ranker; save() persists it.
        self.ranker = self.trainer()
        self.save(self.ranker, model_path)
    else:
        self.ranker = joblib.load(model_path)
def fit(self, df):
    """Train the lightGBM model.

    Builds impression-level features, derives a binary click target, and fits
    an LGBMRanker with one query group per displayed impression list.
    """
    df_impressions = fb.build_features(df)

    # Target column, item that was clicked (1 when the impressed item matches
    # the referenced/clicked item).
    f.print_time("target column")
    df_impressions.loc[:, "is_clicked"] = (
        df_impressions["referenced_item"] ==
        df_impressions["impressed_item"]).astype(int)

    features = [
        "position",
        "prices",
        "interaction_count",
        "is_last_interacted",
    ]

    # Bring to format suitable for lightGBM
    f.print_time("lightGBM format")
    X = df_impressions[features]
    y = df_impressions.is_clicked
    # One group size per (user, session, timestamp, step) impression list.
    q = (df_impressions.groupby(
        ["user_id", "session_id", "timestamp",
         "step"]).size().reset_index(name="query_length").query_length)

    # Training the actual model
    f.print_time("training lightGBM model")
    self.gbm = lgb.LGBMRanker()
    self.gbm.fit(X, y, group=q, verbose=True)
def main(args):
    """Train an LGBMRanker from svmlight files per the parsed config and save it.

    Loads (cached) train/valid data, optionally normalizes features, fits with
    early stopping while recording eval history, then pickles the model.
    """
    config = Config.from_parseargs(args)
    prelude(config)
    logging.info("Start...")
    logging.info(config)
    cache = SvmLightCache(config.cache_name)
    logging.info("Loading data...")
    X, y, qid = cache.load_svmlight_file(args.train, query_id=True)
    X_val, y_val, qid_val = cache.load_svmlight_file(args.valid, query_id=True)
    scaler = None
    if config.normalize:
        # Fit the scaler on training data only; re-apply it to validation data.
        scaler = get_scaler(config.normalize)
        normalize(scaler, X, is_train=True)
        normalize(scaler, X_val, is_train=False)
    model = lgb.LGBMRanker(
        objective=config.objective,
        boosting_type=config.boosting_type,
        n_estimators=config.trees,
        num_leaves=config.leaves,
        learning_rate=config.learning_rate,
        colsample_bytree=config.colsample_bytree,
        max_position=config.max_position,
        subsample_for_bin=config.subsample_for_bin,
        min_data_in_leaf=config.min_data_in_leaf,
        min_sum_hessian_in_leaf=config.min_sum_hessian_in_leaf,
        sigmoid=config.sigmoid,
        subsample=config.subsample,
        subsample_freq=config.subsample_freq,
        lambda_l1=0.,
        lambda_l2=0.,
        lambdamart_norm=False,
        max_depth=-1,
        n_jobs=44,
        silent=config.silent)
    logging.info(model)
    # Capture per-iteration eval results via the record_evaluation callback.
    record_evals = {}
    record_cb = lgb.record_evaluation(record_evals)
    model.fit(X, y,
              group=group_counts(qid),
              eval_names=['train', 'valid'],
              eval_set=[(X, y), (X_val, y_val)],
              eval_group=[group_counts(qid), group_counts(qid_val)],
              eval_metric=config.eval_metric,
              eval_at=config.eval_at,
              early_stopping_rounds=config.early_stopping_rounds,
              callbacks=[record_cb])
    # Stash the scaler and eval history on the model so they persist together.
    model._scaler = scaler
    model._record_evals = record_evals
    logging.info("Best iteration {}...".format(model.best_iteration_))
    logging.info("Best score {}...".format(model.best_score_))
    logging.info("Num features {}...".format(model.n_features_))
    modelpath = Path(config.model_dir) / "{}.pkl".format(config.name)
    logging.info("Save model to {}...".format(modelpath))
    joblib.dump(model, modelpath)
def lgb_rank():
    """
    Trains the LGB Ranker and saves it to the data directory.
    """
    #print(0.8*TRAIN_LINES)
    import lightgbm as lgb
    import numpy as np
    params = {
        'num_iterations': 2000,
        'learning_rate': 0.0025,
        'bagging_fraction': 0.25,
        # Fixed: 'feature_fraction' previously appeared TWICE in this dict
        # (0.8 then 1); Python silently keeps only the last value, so the
        # effective setting was 1. The shadowed duplicate is removed.
        'feature_fraction': 1,
        'bagging_freq': 1,
        "boosting": 'gbdt',
        'early_stopping_round': 100,
        'is_unbalance': 'true',  # replaced with scale_pos_weight argument
        'max_depth': 3,  # -1 means no limit
    }
    X, Y, M = get_lgb_data(avoid_overfit=True)
    lgbc = lgb.LGBMRanker(**params)
    # Split on record id 330530: earlier rows train, later rows validate.
    X_train, X_val = X[M <= 330530, :], X[M > 330530, :]
    Y_train, Y_val = np.ravel(Y[M <= 330530, :]), np.ravel(Y[M > 330530, :])
    M_train, M_val = M[M <= 330530], M[M > 330530]

    def get_successive_sizes(M):
        # Convert a sorted query-id vector into consecutive group sizes
        # (the format LightGBM's `group=` parameter expects).
        M = np.array(M)
        u, unique_ids = np.unique(M, return_index=True)
        unique_ids = np.sort(unique_ids)
        unique_ids = list(unique_ids)
        unique_ids.append(M.shape[0])
        for i in range(len(unique_ids) - 1):
            unique_ids[i] = unique_ids[i + 1] - unique_ids[i]
        unique_ids.pop()
        return unique_ids

    M_train, M_val = [
        np.asarray(get_successive_sizes(x)) for x in [M_train, M_val]
    ]
    bst = lgbc.fit(X_train, Y_train, group=M_train,
                   eval_group=[M_val], eval_set=[(X_val, Y_val)], verbose=2)
    pred_train = bst.predict(X_train, group=M_train)
    pred_val = bst.predict(X_val, group=M_val)
    # NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
    # scikit-learn; migrating to `import joblib` requires the standalone
    # package, so only flagging it here.
    from sklearn.externals import joblib
    # save model
    joblib.dump(lgbc, path.join(DATA_DIR, 'model', 'lgb.pkl'))
def test_lambdarank(self):
    """Minimal lambdarank fit: train data only, evaluated at NDCG@1."""
    example_dir = '../../examples/lambdarank/'
    X_train, y_train = load_svmlight_file(example_dir + 'rank.train')
    X_test, y_test = load_svmlight_file(example_dir + 'rank.test')
    q_train = np.loadtxt(example_dir + 'rank.train.query')
    lgb_model = lgb.LGBMRanker().fit(X_train, y_train,
                                     group=q_train,
                                     eval_at=[1])
def test_lambdarank(self):
    """Cross-check sklearn-API ranker predictions against the native path."""
    loader = FileLoader('../../examples/lambdarank', 'rank')
    X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
    X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
    group_train = loader.load_field('.train.query')
    lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
    ranker = lgb.LGBMRanker(**loader.params)
    ranker.fit(X_train, y_train, group=group_train)
    sk_pred = ranker.predict(X_test)
    loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
def __init__(self, mode, cluster, dataset_name, params_dict):
    """Initialize the lightGBM recommender: resolve paths, load data, build the ranker.

    Args:
        mode: run mode forwarded to the base recommender.
        cluster: cluster identifier forwarded to the base recommender.
        dataset_name: name of the preprocessed dataset to load.
        params_dict: kwargs forwarded to ``lgb.LGBMRanker``.
    """
    self.dataset_name = dataset_name
    super(lightGBM, self).__init__(name=f'lightGBM_{dataset_name}',
                                   mode=mode, cluster=cluster)
    # The base __init__ must run first: self.cluster / self.mode used below
    # are presumably set there — verify against the base class.
    self._BASE_PATH = f'dataset/preprocessed/lightGBM/{self.cluster}/{self.mode}/{self.dataset_name}'
    self._load_data()
    self.params_dict = params_dict
    # Per-run evaluation results, filled in later.
    self.eval_res = {}
    self.model = lgb.LGBMRanker(**self.params_dict)
def __init__(self, train_qset, valid_qset, test_qset, ranker_params, fit_params):
    """Store datasets and hyper-parameters, then fit the ranker immediately."""
    # Tag the instance name with the key hyper-parameters.
    suffix = 'n_estimators-%d-learning_rate%.2f' % (
        ranker_params['n_estimators'], ranker_params['learning_rate'])
    self.name = self.name + '-' + suffix
    self.train_qset = train_qset
    self.valid_qset = valid_qset
    self.test_qset = test_qset
    self.ranker_params = ranker_params
    self.fit_params = fit_params
    self.ranker = gbm.LGBMRanker(**self.ranker_params)
    self.fit()
def test_xendcg():
    """Run the xendcg example through the sklearn API and cross-check outputs."""
    loader = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
    X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
    X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
    group_train = loader.load_field('.train.query')
    lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
    ranker = lgb.LGBMRanker(**loader.params)
    ranker.fit(X_train, y_train, group=group_train)
    sk_pred = ranker.predict(X_test)
    loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
    loader.file_load_check(lgb_train, '.train')
def train(tmp_dir, output_model):
    """Train an LGBMRanker from svmlight data in ``tmp_dir`` and save its booster."""
    features_path = os.path.join(tmp_dir, "train_mt.csv")
    groups_path = os.path.join(tmp_dir, "train_mt_size.csv")
    X_train, y_train = load_svmlight_file(features_path)
    ranker = lgb.LGBMRanker(boosting_type='gbdt',
                            num_leaves=16,
                            max_depth=-1,
                            learning_rate=0.1,
                            n_estimators=100,
                            min_child_samples=5)
    # Query group sizes live in the companion *_size.csv file.
    ranker.fit(X_train, y_train, group=np.loadtxt(groups_path))
    ranker.booster_.save_model(output_model)
def test_lambdarank(self):
    """Fit on the lambdarank example with an eval set and a decaying learning rate."""
    example_dir = '../../examples/lambdarank/'
    X_train, y_train = load_svmlight_file(example_dir + 'rank.train')
    X_test, y_test = load_svmlight_file(example_dir + 'rank.test')
    q_train = np.loadtxt(example_dir + 'rank.train.query')
    q_test = np.loadtxt(example_dir + 'rank.test.query')
    lgb_model = lgb.LGBMRanker().fit(
        X_train, y_train,
        group=q_train,
        eval_set=[(X_test, y_test)],
        eval_group=[q_test],
        eval_at=[1],
        verbose=False,
        callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
def test_lambdarank():
    """Run the lambdarank example end to end with force_col_wise enabled."""
    loader = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
    X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
    X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
    group_train = loader.load_field('.train.query')
    lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
    # Copy the stock params so we can force column-wise histogram building.
    params = {**loader.params, 'force_col_wise': True}
    ranker = lgb.LGBMRanker(**params)
    ranker.fit(X_train, y_train, group=group_train)
    sk_pred = ranker.predict(X_test)
    loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
    loader.file_load_check(lgb_train, '.train')
def test_xendcg(self):
    """Train rank_xendcg with early stopping and check NDCG score thresholds."""
    here = os.path.dirname(os.path.realpath(__file__))

    def _example(name):
        # Resolve a file under the shipped xendcg example directory.
        return os.path.join(here, '../../examples/xendcg/' + name)

    X_train, y_train = load_svmlight_file(_example('rank.train'))
    X_test, y_test = load_svmlight_file(_example('rank.test'))
    q_train = np.loadtxt(_example('rank.train.query'))
    q_test = np.loadtxt(_example('rank.test.query'))
    ranker = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg',
                            random_state=5, n_jobs=1)
    ranker.fit(X_train, y_train,
               group=q_train,
               eval_set=[(X_test, y_test)],
               eval_group=[q_test],
               eval_at=[1, 3],
               early_stopping_rounds=10,
               verbose=False,
               eval_metric='ndcg',
               callbacks=[
                   lgb.reset_parameter(
                       learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
               ])
    self.assertLessEqual(ranker.best_iteration_, 24)
    self.assertGreater(ranker.best_score_['valid_0']['ndcg@1'], 0.6579)
    self.assertGreater(ranker.best_score_['valid_0']['ndcg@3'], 0.6421)
def trainer(self):
    """Train the internal LGBMRanker on generated features with early stopping."""
    logging.info('Training lightgbm model.')
    self.gbm = lgb.LGBMRanker(metric='auc')
    # Everything except the raw text and label columns is a model feature.
    columns = [i for i in self.data.columns
               if i not in ['question1', 'question2', 'label']]
    # Fixed: the keyword was previously misspelled ``test_soze``, which makes
    # train_test_split raise TypeError (unexpected keyword argument).
    X_train, X_test, y_train, y_test = train_test_split(self.data[columns],
                                                        self.data['label'],
                                                        test_size=0.3,
                                                        random_state=42)
    # Treat each split as a single ranking query group.
    query_train = [X_train.shape[0]]
    query_val = [X_test.shape[0]]
    self.gbm.fit(X_train, y_train,
                 group=query_train,
                 eval_set=[(X_test, y_test)],
                 eval_group=[query_val],
                 eval_at=[5, 10, 20],
                 early_stopping_rounds=50)
def test_ranker(output, client, listen_port, group): X, y, w, g, dX, dy, dw, dg = _create_ranking_data( output=output, group=group ) # rebalance small dask.array dataset for better performance. if output == 'array': dX = dX.persist() dy = dy.persist() dw = dw.persist() dg = dg.persist() _ = wait([dX, dy, dw, dg]) client.rebalance() # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210. params = { "random_state": 42, "n_estimators": 50, "num_leaves": 20, "min_child_samples": 1 } dask_ranker = lgb.DaskLGBMRanker( client=client, time_out=5, local_listen_port=listen_port, tree_learner_type='data_parallel', **params ) dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) rnkvec_dask = dask_ranker.predict(dX) rnkvec_dask = rnkvec_dask.compute() rnkvec_dask_local = dask_ranker.to_local().predict(X) local_ranker = lgb.LGBMRanker(**params) local_ranker.fit(X, y, sample_weight=w, group=g) rnkvec_local = local_ranker.predict(X) # distributed ranker should be able to rank decently well and should # have high rank correlation with scores from serial ranker. dcor = spearmanr(rnkvec_dask, y).correlation assert dcor > 0.6 assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8 assert_eq(rnkvec_dask, rnkvec_dask_local) client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def main():
    """Train an LGBMRanker on svmlight rank data and report rank correlation.

    Loads train/test splits plus per-query group sizes, fits the ranker, and
    prints the Spearman correlation between true and predicted test scores.
    """
    x_train, y_train, x_test, y_test = load_svmlight_files(
        ['data/rank.train', 'data/rank.test'])
    train_query = pd.read_csv('data/rank.train.query',
                              header=None).values.flatten()
    model = lgbm.LGBMRanker(num_leaves=50, n_estimators=200, random_state=42)
    print(model)
    # Fixed: the metric name was misspelled 'ndgc'; LightGBM's NDCG metric
    # is spelled 'ndcg'.
    model.fit(x_train, y_train, group=train_query,
              eval_metric='ndcg', eval_at=[1, 3, 5])
    preds = model.predict(x_test)
    print(spearmanr(y_test, preds))
    print('DONE')
def cross_validate(param=dict(n_estimators=1000,
                              metric="map",
                              colsample_bytree=0.2,
                              max_depth=7,
                              importance_type="gain"),
                   n_folds=5,
                   target="satisfied"):
    """K-fold (by user) cross-validation of an LGBMRanker.

    Splits users (not rows) into folds, trains a ranker per fold with early
    stopping, averages test predictions across folds, and prints the mean
    ndcg@1 over the fold validation sets.

    Returns:
        (models, test_pred): the per-fold fitted rankers and the averaged
        test-set predictions.

    NOTE(review): ``param`` is a mutable default argument; it is not mutated
    here, but callers should prefer passing their own dict.
    """
    train_users = big_table["user_id"].unique()
    folds = KFold(n_folds, shuffle=True, random_state=42)
    models = []
    test_pred = np.zeros(test_big_table.shape[0])
    scores = []
    for idx, (train_idx, valid_idx) in enumerate(folds.split(train_users)):
        t_user = train_users[train_idx]
        v_user = train_users[valid_idx]
        train_data = big_table[big_table["user_id"].isin(t_user)]
        valid_data = big_table[big_table["user_id"].isin(v_user)]
        # Per-user group sizes required by the ranking objective.
        train_group = train_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        valid_group = valid_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        # NOTE(review): test_group is computed but never used below.
        test_group = test_big_table.groupby(
            "user_id", as_index=False).count()["jd_no"].values
        result = feature_select(target, train_data, valid_data, test_big_table)
        t_x, t_y = result[0]
        v_x, v_y = result[1]
        test_x, _ = result[2]
        model = lgb.LGBMRanker(**param)
        print("Fold", idx, "-" * 30)
        model.fit(
            t_x, t_y,
            group=train_group,
            eval_set=[(t_x, t_y), (v_x, v_y)],
            eval_group=[train_group, valid_group],
            early_stopping_rounds=100,
            verbose=10,
            # Constant learning-rate callback (overrides any lr in ``param``).
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.01)])
        models.append(model)
        # Average this fold's test predictions into the ensemble estimate.
        test_pred += model.predict(test_x) / n_folds
        scores.append(model.best_score_["valid_1"]["ndcg@1"])
    print("mean score", np.mean(scores))
    return models, test_pred