Beispiel #1
0
 def __init__(self,
              train_qset,
              valid_qset,
              test_qset,
              ranker_params,
              fit_params,
              click_model,
              total_number_of_clicked_queries=10000,
              learn_from_random=False):
     """Set up the offline- and click-trained LGBM rankers.

     Args:
         train_qset, valid_qset, test_qset: query-set splits.
         ranker_params: kwargs for the ``gbm.LGBMRanker`` constructor
             (must contain 'n_estimators' and 'learning_rate').
         fit_params: extra kwargs forwarded to ``fit`` by the fitters.
         click_model: click model used to simulate clicks.
         total_number_of_clicked_queries: budget passed to ``click_fit``.
         learn_from_random: if True, skip fitting the offline ranker and
             start from no (random) offline model.
     """
     # Encode the key hyper-parameters into the experiment name.
     self.name = self.name + '-' + 'n_estimators-%d-learning_rate%.2f' % \
                 (ranker_params['n_estimators'], ranker_params['learning_rate'])
     self.click_model = click_model
     self.offline_train_qset = train_qset
     self.offline_valid_qset = valid_qset
     self.offline_test_qset = test_qset
     self.offline_qset = {
         'train': train_qset,
         'test': test_qset,
         'valid': valid_qset
     }
     self.ranker_params = ranker_params
     # BUG FIX: fit_params was previously assigned twice; keep one assignment.
     self.fit_params = fit_params
     if learn_from_random:
         self.offline_ranker = None
     else:
         self.offline_ranker = gbm.LGBMRanker(**self.ranker_params)
         self.offline_fit()
     self.click_ranker = gbm.LGBMRanker(**self.ranker_params)
     self.click_fit(total_number_of_clicked_queries)
Beispiel #2
0
    def testLocalRanker(self):
        """LGBMRanker on mars tensors: plain fit, weighted fit, wrapped local model."""
        features = self.X
        labels = (self.y * 10).astype(mt.int32)

        dist_ranker = LGBMRanker(n_estimators=2)
        dist_ranker.fit(features, labels, group=[features.shape[0]], verbose=True)
        scores = dist_ranker.predict(features)

        self.assertEqual(scores.ndim, 1)
        self.assertEqual(scores.shape[0], len(self.X))

        self.assertIsInstance(scores, mt.Tensor)
        fetched = scores.fetch()
        self.assertEqual(scores.dtype, fetched.dtype)

        # Fitting with per-sample weights must behave the same way.
        sample_w = mt.random.rand(features.shape[0])
        dist_ranker = LGBMRanker(verbosity=1, n_estimators=2)
        dist_ranker.fit(features, labels, group=[features.shape[0]],
                        sample_weight=sample_w)
        scores = dist_ranker.predict(features)

        self.assertEqual(scores.ndim, 1)
        self.assertEqual(scores.shape[0], len(self.X))
        fetched = scores.fetch()
        self.assertEqual(scores.dtype, fetched.dtype)

        # A ranker trained with plain lightgbm can be wrapped for prediction.
        local_X = features.execute(session=self.session).fetch(session=self.session)
        local_y = labels.execute(session=self.session).fetch(session=self.session)
        local_ranker = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
        local_ranker.fit(local_X, local_y, group=[features.shape[0]])
        scores = LGBMRanker(local_ranker).predict(features)

        self.assertEqual(scores.ndim, 1)
        self.assertEqual(scores.shape[0], len(self.X))
Beispiel #3
0
 def test_lambdarank(self):
     """LambdaRank fit with eval set, early stopping and an LR-decay callback."""
     base_dir = os.path.dirname(os.path.realpath(__file__))
     rank_dir = os.path.join(base_dir, '../../examples/lambdarank')
     X_train, y_train = load_svmlight_file(os.path.join(rank_dir, 'rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(rank_dir, 'rank.test'))
     q_train = np.loadtxt(os.path.join(rank_dir, 'rank.train.query'))
     q_test = np.loadtxt(os.path.join(rank_dir, 'rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train, y_train,
             group=q_train,
             eval_set=[(X_test, y_test)],
             eval_group=[q_test],
             eval_at=[1, 3],
             early_stopping_rounds=5,
             verbose=False,
             callbacks=[lgb.reset_parameter(
                 learning_rate=lambda x: 0.95**x * 0.1)])
Beispiel #4
0
    def train(self,
              tabular_path: str,
              join_result_path: str,
              model_path: str,
              model_weights_path=None,
              histogram_path=None) -> None:
        """Train a LightGBM ranking model for the spatial-join cost estimator.

        Loads the tabular training data, holds out 20% for early stopping,
        fits an LGBMRanker (each split treated as one query group) and
        pickles the fitted model to ``model_path``.

        NOTE(review): ``join_result_path``, ``model_weights_path`` and
        ``histogram_path`` are accepted but unused here — presumably part of
        a shared trainer interface; confirm before removing.
        """
        # Extract train and test data, but only use train data
        X_train, y_train = datasets.load_data(tabular_path,
                                              RankingModel.TARGET,
                                              RankingModel.DROP_COLUMNS)
        X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=1)

        # Each split forms a single ranking query group.
        query_train = [X_train.shape[0]]
        query_val = [X_val.shape[0]]

        gbm = lgb.LGBMRanker()
        model = gbm.fit(X_train,
                        y_train,
                        group=query_train,
                        eval_set=[(X_val, y_val)],
                        eval_group=[query_val],
                        eval_at=[1, 2],
                        early_stopping_rounds=50)

        # Persist the fitted ranker. BUG FIX: the file handle was previously
        # never closed (pickle.dump(model, open(...))) — use a context manager.
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
Beispiel #5
0
    def _run_lgbm_ranker_converter(self,
                                   num_classes,
                                   extra_config=None,
                                   label_gain=None):
        """Train small LGBMRanker models at several depths and verify the
        hummingbird torch conversion predicts identically.

        Args:
            num_classes: number of distinct relevance labels to draw.
            extra_config: optional dict forwarded to ``hummingbird.ml.convert``.
                BUG FIX: the default was a mutable ``{}`` shared across calls;
                a fresh dict is now created when the argument is omitted.
            label_gain: optional label-gain list passed to LGBMRanker.
        """
        if extra_config is None:
            extra_config = {}
        warnings.filterwarnings("ignore")
        for max_depth in [1, 3, 8, 10, 12, None]:
            model = lgb.LGBMRanker(n_estimators=10,
                                   max_depth=max_depth,
                                   label_gain=label_gain)
            np.random.seed(0)
            X = np.random.rand(100, 200)
            X = np.array(X, dtype=np.float32)
            y = np.random.randint(num_classes, size=100)

            # The whole sample forms a single query group.
            model.fit(X,
                      y,
                      group=[X.shape[0]],
                      eval_set=[(X, y)],
                      eval_group=[X.shape[0]])

            torch_model = hummingbird.ml.convert(model,
                                                 "torch",
                                                 extra_config=extra_config)
            self.assertIsNotNone(torch_model)
            np.testing.assert_allclose(model.predict(X),
                                       torch_model.predict(X),
                                       rtol=1e-06,
                                       atol=1e-06)
Beispiel #6
0
  def update_ranker(self, ranker_params, fit_params):
    """Fit a fresh LGBMRanker on the click-derived training data.

    Uses the batches accumulated in ``self.observed_training_data``
    (produced by ``generate_training_data_from_clicks``).  When
    ``fit_params`` requests early stopping, a validation set is sampled
    from the held-out validation queries.

    Args:
      ranker_params: kwargs for the ``gbm.LGBMRanker`` constructor.
      fit_params: extra kwargs forwarded to ``ranker.fit``.

    Returns:
      The fitted ranker.

    Raises:
      ValueError: if no click training data has been generated yet.
    """
    if self.observed_training_data:
      # Each batch otd is (indices, labels, query_group); flatten the
      # per-batch document indices, labels and group sizes into flat arrays.
      train_indices = [inds for otd in self.observed_training_data
                       for inds in otd[0]]
      train_features = np.concatenate([self.train_qset.feature_vectors[inds]
                                       for inds in train_indices])
      train_labels = np.concatenate([otd[1]
                                     for otd in self.observed_training_data])
      train_query_group = np.concatenate([otd[2]
                                       for otd in self.observed_training_data])
    else:
      raise ValueError('OnlineLTR.generate_training_data_from_clicks()'
        'should be called before OnlineLTR.update_ranker().')

    ranker = gbm.LGBMRanker(**ranker_params)
    if 'early_stopping_rounds' in fit_params:
      # Early stopping requires an eval set: sample as many validation
      # queries as the latest click batch contained.
      num_queries = len(self.observed_training_data[-1][0])
      valid_query_ids = self.sample_query_ids(num_queries, data='valid')
      valid_labels = np.concatenate([self.valid_qset[qid].relevance_scores
                                     for qid in valid_query_ids])
      valid_features = self.valid_qset[valid_query_ids].feature_vectors
      valid_query_group = [self.valid_qset[qid].document_count() for qid in valid_query_ids]
      ranker.fit(X=train_features, y=train_labels, group=train_query_group,
                 eval_set=[(valid_features, valid_labels)], eval_group=[valid_query_group],
                 **fit_params)
    else:
      ranker.fit(X=train_features, y=train_labels, group=train_query_group,
                 **fit_params)
    return ranker
Beispiel #7
0
def test_local_ranker(setup):
    """LGBMRanker on mars tensors: plain fit, weighted fit, wrapped local model."""
    labels = (y_raw * 10).astype(mt.int32)
    model = LGBMRanker(n_estimators=2)
    model.fit(X_raw, labels, group=[X_raw.shape[0]], verbose=True)
    pred = model.predict(X_raw)

    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)

    assert isinstance(pred, mt.Tensor)
    fetched = pred.fetch()
    assert pred.dtype == fetched.dtype

    # Fitting with per-sample weights must also work.
    sample_w = mt.random.rand(X_raw.shape[0])
    model = LGBMRanker(verbosity=1, n_estimators=2)
    model.fit(X_raw, labels, group=[X_raw.shape[0]], sample_weight=sample_w)
    pred = model.predict(X_raw)

    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)
    fetched = pred.fetch()
    assert pred.dtype == fetched.dtype

    # A vanilla lightgbm model can be wrapped and used for prediction.
    local_X = X_raw.execute().fetch()
    local_y = labels.execute().fetch()
    vanilla = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
    vanilla.fit(local_X, local_y, group=[X_raw.shape[0]])
    pred = LGBMRanker(vanilla).predict(X_raw)

    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)
Beispiel #8
0
    def testLocalRanker(self):
        """LGBMRanker fit/predict, weighted fit, and the standalone predict helper."""
        data = self.X
        target = (self.y * 10).astype(mt.int32)

        ranker = LGBMRanker(n_estimators=2)
        ranker.fit(data, target, group=[data.shape[0]], verbose=True)
        pred = ranker.predict(data)

        self.assertEqual(pred.ndim, 1)
        self.assertEqual(pred.shape[0], len(self.X))

        self.assertIsInstance(pred, mt.Tensor)
        fetched = pred.fetch()
        self.assertEqual(pred.dtype, fetched.dtype)

        # Weighted training.
        weights = mt.random.rand(data.shape[0])
        ranker = LGBMRanker(verbosity=1, n_estimators=2)
        ranker.fit(data, target, group=[data.shape[0]], sample_weight=weights)
        pred = ranker.predict(data)

        self.assertEqual(pred.ndim, 1)
        self.assertEqual(pred.shape[0], len(self.X))
        fetched = pred.fetch()
        self.assertEqual(pred.dtype, fetched.dtype)

        # Predicting through the helper with a model trained by plain lightgbm.
        local_model = lightgbm.LGBMRanker(verbosity=1, n_estimators=2)
        local_model.fit(data, target, group=[data.shape[0]])
        pred = predict(local_model, data)

        self.assertEqual(pred.ndim, 1)
        self.assertEqual(pred.shape[0], len(self.X))
def lgb_main(train_final_df, val_final_df=None):
    """Train an LGBMRanker on per-user candidate lists.

    NOTE(review): relies on module-level ``mode`` and ``lgb_cols``; in
    'offline' mode ``val_final_df`` is required for early stopping.
    One ranking query per ``user_id`` is assumed.
    """
    print('ranker begin....')
    # Rows must be ordered by user so `group` sizes line up with the rows.
    # NOTE(review): this sorts the caller's dataframe in place.
    train_final_df.sort_values(by=['user_id'], inplace=True)
    # Candidate rows per user == LightGBM query-group sizes.
    g_train = train_final_df.groupby(['user_id'], as_index=False).count()["label"].values

    if mode == 'offline':
        val_final_df = val_final_df.sort_values(by=['user_id'])
        g_val = val_final_df.groupby(['user_id'], as_index=False).count()["label"].values

    lgb_ranker = lgb.LGBMRanker(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=300,
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.01, min_child_weight=50, random_state=2018,
        n_jobs=-1)  # 300epoch, best, 0.882898, dense_feat  + hist_cnt_sim_feat user_interest_dense_feat

    if mode == 'offline':
        # Offline mode: validate on the held-out split (NDCG@50 + AUC).
        lgb_ranker.fit(train_final_df[lgb_cols], train_final_df['label'], group=g_train,
                       eval_set=[(val_final_df[lgb_cols], val_final_df['label'])], eval_group=[g_val],
                       eval_at=[50], eval_metric=['auc', ],
                       early_stopping_rounds=50, )
    else:
        lgb_ranker.fit(train_final_df[lgb_cols], train_final_df['label'], group=g_train)

    print('train done...')
    return lgb_ranker
Beispiel #10
0
def test_lightgbm_ranking():
    """Train a LightGBM ranker on the shap rank dataset and validate its
    SHAP values; skips (returns early) when lightgbm is not installed.
    """
    try:
        import lightgbm
    except ImportError:
        # BUG FIX: a bare `except:` also swallowed unrelated errors such as
        # KeyboardInterrupt/SystemExit; catch only the import failure.
        print("Skipping test_lightgbm_ranking!")
        return
    import shap

    # train lightgbm ranker model
    x_train, y_train, x_test, y_test, q_train, q_test = shap.datasets.rank()
    model = lightgbm.LGBMRanker()
    model.fit(
        x_train,
        y_train,
        group=q_train,
        eval_set=[(x_test, y_test)],
        eval_group=[q_test],
        eval_at=[1, 3],
        early_stopping_rounds=5,
        verbose=False,
        callbacks=[
            lightgbm.reset_parameter(learning_rate=lambda x: 0.95**x * 0.1)
        ])
    _validate_shap_values(model, x_test)
Beispiel #11
0
def train(tmp_dir,
          output_model,
          num_leaves=16,
          max_depth=-1,
          learning_rate=0.1,
          n_estimators=100,
          min_child_samples=5,
          feature_name='auto',
          task='SA'):
    """Fit an LGBMRanker on the svmlight data for `task` and save the booster."""
    train_file = os.path.join(tmp_dir, f"train_{task}.csv")
    train_size = os.path.join(tmp_dir, f"train_{task}_size.csv")
    X_train, y_train = load_svmlight_file(train_file)
    print('Training in prog...')
    ranker = lgb.LGBMRanker(boosting_type='gbdt',
                            num_leaves=num_leaves,
                            max_depth=max_depth,
                            learning_rate=learning_rate,
                            n_estimators=n_estimators,
                            min_child_samples=min_child_samples)
    group_sizes = np.loadtxt(train_size)
    ranker.fit(X_train, y_train,
               group=group_sizes,
               feature_name=feature_name)
    ranker.booster_.save_model(output_model)
    print(f'Model saved at {output_model}')
Beispiel #12
0
 def test_lambdarank(self):
     """LambdaRank converges quickly with a decaying learning rate."""
     data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             '../../examples/lambdarank')
     X_train, y_train = load_svmlight_file(os.path.join(data_dir, 'rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(data_dir, 'rank.test'))
     q_train = np.loadtxt(os.path.join(data_dir, 'rank.train.query'))
     q_test = np.loadtxt(os.path.join(data_dir, 'rank.test.query'))
     gbm = lgb.LGBMRanker()
     gbm.fit(X_train, y_train,
             group=q_train,
             eval_set=[(X_test, y_test)],
             eval_group=[q_test],
             eval_at=[1, 3],
             early_stopping_rounds=10,
             verbose=False,
             callbacks=[lgb.reset_parameter(
                 learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))])
     # Early stopping should trigger well before 25 trees while still
     # reaching reasonable NDCG on the validation queries.
     self.assertLessEqual(gbm.best_iteration_, 25)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6333)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6048)
    def fit_evaluate(self,
                     df_train: pd.DataFrame,
                     df_val: pd.DataFrame,
                     verbose=False,
                     features=None,
                     **kwargs) -> EvaluationResult:
        """Fit an LGBMRanker on ``df_train`` with early stopping on ``df_val``.

        Feature matrices, labels and query-group sizes are taken from the
        dataframes through the project's ``.ext`` accessor.

        Args:
            df_train: training dataframe (exposes X, y, group_sizes via ``.ext``).
            df_val: validation dataframe used for early stopping and the
                final evaluation.
            verbose: forwarded to ``LGBMRanker.fit``.
            features: optional feature names forwarded as ``feature_name``.
            **kwargs: accepted for interface compatibility; unused here.

        Returns:
            The ``EvaluationResult`` produced by ``self.evaluate(df_val)``.
        """
        X_train = df_train.ext.X
        y_train = df_train.ext.y
        groups_train = df_train.ext.group_sizes
        X_val = df_val.ext.X
        y_val = df_val.ext.y
        groups_val = df_val.ext.group_sizes

        # Sanity-check that the group sizes exactly partition each split.
        assert len(X_train) == len(y_train) == sum(
            groups_train
        ), f"Sizes are not equal - Xt: {len(X_train)}, yt {len(y_train)}, Gt:{ sum(groups_train)}"
        assert len(X_val) == len(y_val) == sum(
            groups_val
        ), f"Sizes are not equal - Xv: {len(X_val)}, yv {len(y_val)}, Gv:{sum(groups_val)}"

        gbm = lgb.LGBMRanker(boosting_type="gbdt",
                             class_weight="balanced",
                             n_estimators=200)
        gbm.fit(X_train,
                y_train,
                group=groups_train,
                eval_set=[(X_val, y_val)],
                eval_group=[groups_val],
                early_stopping_rounds=10,
                feature_name=features,
                verbose=verbose)

        self.model = gbm

        return self.evaluate(df_val)
Beispiel #14
0
    def __init__(self,
                 mode='train',
                 model_path=os.path.join(Config.root_path,
                                         'model/ranking/lightgbm')):
        """Build or load the LightGBM ranking model.

        In ``'train'`` mode, features are generated for the train/dev data,
        the ranker is trained via ``self.trainer()`` and saved to
        ``model_path``; in any other mode a previously saved model is
        loaded from ``model_path`` instead.

        NOTE(review): ``params`` passed to LGBMRanker comes from module
        scope — confirm it is defined before this class is instantiated.
        """
        self.ts = TextSimilarity()
        self.matchingNN = MatchNN()
        self.ranker = lgb.LGBMRanker(**params)
        # Raw train/dev data: tab-separated, header row, quoting disabled.
        self.train_data = pd.read_csv(os.path.join(Config.root_path,
                                                   'data/ranking/train.tsv'),
                                      sep='\t',
                                      header=0,
                                      quoting=csv.QUOTE_NONE)
        self.dev_data = pd.read_csv(os.path.join(Config.root_path,
                                                 'data/ranking/dev.tsv'),
                                    sep='\t',
                                    header=0,
                                    quoting=csv.QUOTE_NONE)

        if mode == 'train':
            logging.info('Training mode')
            self.train_data = self.generate_feature(self.train_data, 'train')
            logging.info("train_data columns: {}".format(
                self.train_data.columns))
            logging.info("train_data shape: {}".format(self.train_data.shape))
            logging.info("train_data: {}".format(self.train_data[:5]))
            self.dev_data = self.generate_feature(self.dev_data, 'dev')
            logging.info("dev_data shape: {}".format(self.dev_data.shape))
            self.ranker = self.trainer()
            self.save(self.ranker, model_path)

        else:
            self.ranker = joblib.load(model_path)
Beispiel #15
0
    def fit(self, df):
        """Train the lightGBM ranking model on impression data.

        Builds features from the raw dataframe, derives the binary click
        target, groups impressions into queries (one query per
        user/session/timestamp/step) and fits ``self.gbm``.

        Args:
            df: raw interactions dataframe consumed by ``fb.build_features``.
        """

        df_impressions = fb.build_features(df)

        # Target column: 1 when the impressed item is the one that was clicked.
        f.print_time("target column")
        df_impressions.loc[:, "is_clicked"] = (
            df_impressions["referenced_item"] ==
            df_impressions["impressed_item"]).astype(int)

        features = [
            "position",
            "prices",
            "interaction_count",
            "is_last_interacted",
        ]

        # Bring to format suitable for lightGBM
        f.print_time("lightGBM format")
        X = df_impressions[features]
        y = df_impressions.is_clicked

        # Query-group sizes: number of impressed items shown in each
        # user/session/timestamp/step combination.
        q = (df_impressions.groupby(
            ["user_id", "session_id", "timestamp",
             "step"]).size().reset_index(name="query_length").query_length)

        # Training the actual model
        f.print_time("training lightGBM model")
        self.gbm = lgb.LGBMRanker()
        self.gbm.fit(X, y, group=q, verbose=True)
Beispiel #16
0
def main(args):
    """Train an LGBMRanker from svmlight files given parsed CLI args.

    Loads train/valid data through a cache, optionally normalizes
    features, fits with early stopping while recording per-iteration eval
    metrics, and dumps the model (with scaler and eval history attached)
    to the configured model directory.
    """
    config = Config.from_parseargs(args)
    prelude(config)
    logging.info("Start...")
    logging.info(config)
    cache = SvmLightCache(config.cache_name)

    logging.info("Loading data...")
    X, y, qid = cache.load_svmlight_file(args.train, query_id=True)
    X_val, y_val, qid_val = cache.load_svmlight_file(args.valid, query_id=True)

    scaler = None
    if config.normalize:
        # Fit the scaler on the training data only; apply to validation.
        scaler = get_scaler(config.normalize)
        normalize(scaler, X, is_train=True)
        normalize(scaler, X_val, is_train=False)

    model = lgb.LGBMRanker(
        objective=config.objective,
        boosting_type=config.boosting_type,
        n_estimators=config.trees,
        num_leaves=config.leaves,
        learning_rate=config.learning_rate,
        colsample_bytree=config.colsample_bytree,
        max_position=config.max_position,
        subsample_for_bin=config.subsample_for_bin,
        min_data_in_leaf=config.min_data_in_leaf,
        min_sum_hessian_in_leaf=config.min_sum_hessian_in_leaf,
        sigmoid=config.sigmoid,
        subsample=config.subsample,
        subsample_freq=config.subsample_freq,
        lambda_l1=0.,
        lambda_l2=0.,
        lambdamart_norm=False,
        max_depth=-1,
        n_jobs=44,
        silent=config.silent)
    logging.info(model)
    # Capture per-iteration eval metrics for later inspection.
    record_evals = {}
    record_cb = lgb.record_evaluation(record_evals)
    model.fit(X,
              y,
              group=group_counts(qid),
              eval_names=['train', 'valid'],
              eval_set=[(X, y), (X_val, y_val)],
              eval_group=[group_counts(qid),
                          group_counts(qid_val)],
              eval_metric=config.eval_metric,
              eval_at=config.eval_at,
              early_stopping_rounds=config.early_stopping_rounds,
              callbacks=[record_cb])
    # Stash preprocessing + history on the model so they travel with it.
    model._scaler = scaler
    model._record_evals = record_evals
    logging.info("Best iteration {}...".format(model.best_iteration_))
    logging.info("Best score {}...".format(model.best_score_))
    logging.info("Num features {}...".format(model.n_features_))
    modelpath = Path(config.model_dir) / "{}.pkl".format(config.name)
    logging.info("Save model to {}...".format(modelpath))
    joblib.dump(model, modelpath)
Beispiel #17
0
def lgb_rank():
    """
        Trains the LGB Ranker and saves it to the data directory.

    """
    import lightgbm as lgb
    import numpy as np
    params = {
        'num_iterations': 2000,
        'learning_rate': 0.0025,
        'bagging_fraction': 0.25,
        # BUG FIX: 'feature_fraction' was listed twice (0.8, then 1); the
        # second entry silently shadowed the first, so the effective value
        # (1) is kept as a single explicit entry.
        'feature_fraction': 1,
        'bagging_freq': 1,
        "boosting": 'gbdt',
        'early_stopping_round': 100,
        'is_unbalance': 'true',  # replaced with scale_pos_weight argument
        'max_depth': 3,  # -1 means no limit
    }
    X, Y, M = get_lgb_data(avoid_overfit=True)

    lgbc = lgb.LGBMRanker(**params)

    # Split train/validation on a fixed threshold over the query ids in M.
    X_train, X_val = X[M <= 330530, :], X[M > 330530, :]
    Y_train, Y_val = np.ravel(Y[M <= 330530, :]), np.ravel(Y[M > 330530, :])

    M_train, M_val = M[M <= 330530], M[M > 330530]

    def get_successive_sizes(M):
        """Turn a sorted vector of query ids into per-query group sizes."""
        M = np.array(M)
        u, unique_ids = np.unique(M, return_index=True)
        unique_ids = np.sort(unique_ids)
        unique_ids = list(unique_ids)
        unique_ids.append(M.shape[0])
        for i in range(len(unique_ids) - 1):
            unique_ids[i] = unique_ids[i + 1] - unique_ids[i]
        unique_ids.pop()
        return unique_ids

    M_train, M_val = [
        np.asarray(get_successive_sizes(x)) for x in [M_train, M_val]
    ]

    bst = lgbc.fit(X_train,
                   Y_train,
                   group=M_train,
                   eval_group=[M_val],
                   eval_set=[(X_val, Y_val)],
                   verbose=2)

    # NOTE(review): these predictions are unused — presumably left for
    # debugging; confirm before removing.
    pred_train = bst.predict(X_train, group=M_train)
    pred_val = bst.predict(X_val, group=M_val)

    # NOTE(review): sklearn.externals.joblib was removed in scikit-learn
    # 0.23; prefer `import joblib` directly once the dependency is present.
    from sklearn.externals import joblib
    # save model
    joblib.dump(lgbc, path.join(DATA_DIR, 'model', 'lgb.pkl'))
Beispiel #18
0
 def test_lambdarank(self):
     """Minimal LambdaRank fit on the bundled example data."""
     X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
     X_test, y_test = load_svmlight_file('../../examples/lambdarank/rank.test')
     q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
     ranker = lgb.LGBMRanker()
     lgb_model = ranker.fit(X_train, y_train, group=q_train, eval_at=[1])
 def test_lambdarank(self):
     """sklearn-API predictions should pass FileLoader's native-training check."""
     loader = FileLoader('../../examples/lambdarank', 'rank')
     X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
     X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
     group_train = loader.load_field('.train.query')
     lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
     ranker = lgb.LGBMRanker(**loader.params)
     ranker.fit(X_train, y_train, group=group_train)
     sk_pred = ranker.predict(X_test)
     loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
Beispiel #20
0
 def __init__(self, mode, cluster, dataset_name, params_dict):
     """Create a lightGBM recommender bound to one preprocessed dataset.

     Args:
         mode: run mode forwarded to the base class (also part of the data path).
         cluster: cluster identifier used to locate the preprocessed data.
         dataset_name: which preprocessed lightGBM dataset to load.
         params_dict: kwargs for the ``lgb.LGBMRanker`` constructor.
     """
     self.dataset_name = dataset_name
     super(lightGBM, self).__init__(name=f'lightGBM_{dataset_name}',
                                    mode=mode,
                                    cluster=cluster)
     # Root of the preprocessed dataset for this cluster/mode/dataset.
     self._BASE_PATH = f'dataset/preprocessed/lightGBM/{self.cluster}/{self.mode}/{self.dataset_name}'
     self._load_data()
     self.params_dict = params_dict
     self.eval_res = {}  # filled with evaluation results during training
     self.model = lgb.LGBMRanker(**self.params_dict)
Beispiel #21
0
 def __init__(self, train_qset, valid_qset, test_qset, ranker_params,
              fit_params):
     """Store query sets and params, then fit the ranker immediately."""
     # Append the key hyper-parameters to the experiment name.
     suffix = 'n_estimators-%d-learning_rate%.2f' % (
         ranker_params['n_estimators'], ranker_params['learning_rate'])
     self.name = self.name + '-' + suffix
     self.train_qset = train_qset
     self.valid_qset = valid_qset
     self.test_qset = test_qset
     self.ranker_params = ranker_params
     self.fit_params = fit_params
     self.ranker = gbm.LGBMRanker(**self.ranker_params)
     self.fit()
Beispiel #22
0
def test_xendcg():
    """XE-NDCG: sklearn-wrapper predictions agree with native training."""
    loader = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
    X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
    X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
    group_train = loader.load_field('.train.query')
    lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
    ranker = lgb.LGBMRanker(**loader.params)
    ranker.fit(X_train, y_train, group=group_train)
    sk_pred = ranker.predict(X_test)
    loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
    loader.file_load_check(lgb_train, '.train')
Beispiel #23
0
def train(tmp_dir, output_model):
    """Fit a small LGBMRanker on svmlight data under tmp_dir and save the booster."""
    train_file = os.path.join(tmp_dir, "train_mt.csv")
    train_size = os.path.join(tmp_dir, "train_mt_size.csv")
    X_train, y_train = load_svmlight_file(train_file)
    group_sizes = np.loadtxt(train_size)
    ranker = lgb.LGBMRanker(boosting_type='gbdt',
                            num_leaves=16,
                            max_depth=-1,
                            learning_rate=0.1,
                            n_estimators=100,
                            min_child_samples=5)
    ranker.fit(X_train, y_train, group=group_sizes)
    ranker.booster_.save_model(output_model)
Beispiel #24
0
 def test_lambdarank(self):
     """Short LambdaRank run with an eval set and a learning-rate decay callback."""
     X_train, y_train = load_svmlight_file('../../examples/lambdarank/rank.train')
     X_test, y_test = load_svmlight_file('../../examples/lambdarank/rank.test')
     q_train = np.loadtxt('../../examples/lambdarank/rank.train.query')
     q_test = np.loadtxt('../../examples/lambdarank/rank.test.query')
     lr_decay = lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)
     lgb_model = lgb.LGBMRanker().fit(
         X_train,
         y_train,
         group=q_train,
         eval_set=[(X_test, y_test)],
         eval_group=[q_test],
         eval_at=[1],
         verbose=False,
         callbacks=[lr_decay])
Beispiel #25
0
def test_lambdarank():
    """LambdaRank with force_col_wise: wrapper predictions match native training."""
    loader = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
    X_train, y_train, _ = loader.load_dataset('.train', is_sparse=True)
    X_test, _, X_test_fn = loader.load_dataset('.test', is_sparse=True)
    group_train = loader.load_field('.train.query')
    lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
    # Copy the loader params so the shared dict is not mutated.
    params = dict(loader.params, force_col_wise=True)
    ranker = lgb.LGBMRanker(**params)
    ranker.fit(X_train, y_train, group=group_train)
    sk_pred = ranker.predict(X_test)
    loader.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
    loader.file_load_check(lgb_train, '.train')
Beispiel #26
0
 def test_xendcg(self):
     """XE-NDCG objective reaches good NDCG with early stopping."""
     dir_path = os.path.dirname(os.path.realpath(__file__))
     xendcg_dir = os.path.join(dir_path, '../../examples/xendcg')
     X_train, y_train = load_svmlight_file(os.path.join(xendcg_dir, 'rank.train'))
     X_test, y_test = load_svmlight_file(os.path.join(xendcg_dir, 'rank.test'))
     q_train = np.loadtxt(os.path.join(xendcg_dir, 'rank.train.query'))
     q_test = np.loadtxt(os.path.join(xendcg_dir, 'rank.test.query'))
     gbm = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg',
                          random_state=5, n_jobs=1)
     lr_decay = lgb.reset_parameter(
         learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
     gbm.fit(X_train, y_train, group=q_train,
             eval_set=[(X_test, y_test)], eval_group=[q_test],
             eval_at=[1, 3], early_stopping_rounds=10, verbose=False,
             eval_metric='ndcg',
             callbacks=[lr_decay])
     self.assertLessEqual(gbm.best_iteration_, 24)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@1'], 0.6579)
     self.assertGreater(gbm.best_score_['valid_0']['ndcg@3'], 0.6421)
Beispiel #27
0
 def trainer(self):
     """Train the lightgbm ranking model on the question-pair features.

     Splits ``self.data`` 70/30 (excluding the raw text and label
     columns), treats each split as a single query group, and fits
     ``self.gbm`` with early stopping on the held-out part.
     """
     logging.info('Training lightgbm model.')
     self.gbm = lgb.LGBMRanker(metric='auc')
     columns = [i for i in self.data.columns if i not in ['question1', 'question2', 'label']]
     # BUG FIX: `test_soze` was a typo for `test_size` and raised a
     # TypeError whenever this method ran.
     X_train, X_test, y_train, y_test = train_test_split(
         self.data[columns], self.data['label'], test_size=0.3, random_state=42)
     query_train = [X_train.shape[0]]
     query_val = [X_test.shape[0]]
     self.gbm.fit(X_train,
                  y_train,
                  group=query_train,
                  eval_set=[(X_test, y_test)],
                  eval_group=[query_val],
                  eval_at=[5, 10, 20],
                  early_stopping_rounds=50)
Beispiel #28
0
def test_ranker(output, client, listen_port, group):
    """DaskLGBMRanker should rank similarly to a serial LGBMRanker.

    Trains a data-parallel dask ranker and a local ranker on the same
    data, then checks rank correlation between their score vectors and
    that the dask model's local conversion predicts identically.
    """
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    # rebalance small dask.array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    # Predictions from the dask model converted to a plain local model.
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def main():
    """Train an LGBMRanker on the example rank data and report the Spearman
    rank correlation between predictions and true test labels.
    """
    x_train, y_train, x_test, y_test = load_svmlight_files(
        ['data/rank.train', 'data/rank.test'])
    # One entry per query: the number of consecutive rows belonging to it.
    train_query = pd.read_csv('data/rank.train.query',
                              header=None).values.flatten()

    model = lgbm.LGBMRanker(num_leaves=50, n_estimators=200, random_state=42)
    print(model)
    # BUG FIX: the metric name was misspelled 'ndgc'; LightGBM only
    # recognizes 'ndcg', so the typo broke metric evaluation.
    model.fit(x_train,
              y_train,
              group=train_query,
              eval_metric='ndcg',
              eval_at=[1, 3, 5])
    preds = model.predict(x_test)

    print(spearmanr(y_test, preds))
    print('DONE')
Beispiel #30
0
def cross_validate(param=None,
                   n_folds=5,
                   target="satisfied"):
    """K-fold (split by user) cross-validation of an LGBMRanker.

    Args:
        param: LGBMRanker constructor kwargs.  BUG FIX: the default used
            to be a mutable dict literal shared across calls; ``None`` now
            stands for the tuned MAP configuration, built fresh each call.
        n_folds: number of KFold splits over the unique users.
        target: label column forwarded to ``feature_select``.

    Returns:
        (models, test_pred): per-fold fitted models and the averaged
        test-set predictions.
    """
    if param is None:
        param = dict(n_estimators=1000,
                     metric="map",
                     colsample_bytree=0.2,
                     max_depth=7,
                     importance_type="gain")
    train_users = big_table["user_id"].unique()
    folds = KFold(n_folds, shuffle=True, random_state=42)
    models = []
    test_pred = np.zeros(test_big_table.shape[0])
    scores = []
    for idx, (train_idx, valid_idx) in enumerate(folds.split(train_users)):
        t_user = train_users[train_idx]
        v_user = train_users[valid_idx]
        train_data = big_table[big_table["user_id"].isin(t_user)]
        valid_data = big_table[big_table["user_id"].isin(v_user)]
        # Query-group sizes: one ranking group per user in each split.
        train_group = train_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        valid_group = valid_data.groupby(
            "user_id", as_index=False).count()["satisfied"].values
        # NOTE(review): test_group is computed but never used below.
        test_group = test_big_table.groupby(
            "user_id", as_index=False).count()["jd_no"].values

        result = feature_select(target, train_data, valid_data, test_big_table)
        t_x, t_y = result[0]
        v_x, v_y = result[1]
        test_x, _ = result[2]
        model = lgb.LGBMRanker(**param)
        print("Fold", idx, "-" * 30)
        model.fit(
            t_x,
            t_y,
            group=train_group,
            eval_set=[(t_x, t_y), (v_x, v_y)],
            eval_group=[train_group, valid_group],
            early_stopping_rounds=100,
            verbose=10,
            callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.01)])
        models.append(model)
        # Average the per-fold predictions over the shared test set.
        test_pred += model.predict(test_x) / n_folds
        scores.append(model.best_score_["valid_1"]["ndcg@1"])
    print("mean score", np.mean(scores))
    return models, test_pred