Example #1
def main():
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path,
                                                  query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_train = x_train.todense()
    x_train = np.concatenate([
        x_train, x_train[:, -2] / x_train[:, 2], x_train[:, -1] / x_train[:, 4]
    ], 1)
    x_test = x_test.todense()
    x_test = np.concatenate(
        [x_test, x_test[:, -2] / x_test[:, 2], x_test[:, -1] / x_test[:, 4]],
        1)
    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
    test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])
    xgb_train_str = items_to_str(_.omit(params, 'objective',
                                        'eval_metric').items(),
                                 sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
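
# set_group expects one size per query, in document order; the groupby expression
# above turns consecutive identical qids into exactly those sizes. A minimal
# standalone illustration with made-up qids:
from itertools import groupby
qids = [7, 7, 7, 9, 9]
sizes = [len(list(g)) for _, g in groupby(qids)]
print(sizes)  # [3, 2]; note that sum(sizes) == len(qids)
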
def predict_xgboost_answers(xgb_model):
    # write the trained model's predictions on the test set as ((codex, article), probability)
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))
    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)
    prediction_answer = []
    for i, p in enumerate(pred):
        prediction_answer.append(
            (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)
    predictions = [str(answer) for answer in prediction_answer]
    with open(predict_file, 'w+', encoding="utf-8") as f:
        f.write('\n'.join(predictions))
Example #3
    def apply(self, X, ntree_limit=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        ntree_limit : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
            For each datapoint x in X and for each tree, return the index of the
            leaf x ends up in. Leaves are numbered within
            ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
        """
        sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)
        test_dmatrix = DMatrix(X_features, missing=self.missing)
        test_dmatrix.set_group(sizes)
        X_leaves = self.get_booster().predict(test_dmatrix,
                                              pred_leaf=True,
                                              ntree_limit=ntree_limit)
        revert_group_indices = np.arange(
            len(group_indices))[group_indices.argsort()]
        X_leaves = X_leaves[revert_group_indices, :]
        return X_leaves
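
# `_preprare_data_in_groups` itself is not shown in these examples. A minimal sketch of the
# behaviour assumed above (and spelled out in Example #22's docstring: the first feature is
# a group indicator): sort the rows so each group is contiguous, and return the per-group
# sizes plus the sort order so predictions can be mapped back to the original row order, as
# `apply` does. Dense 2-D input is assumed for simplicity; some examples use a method
# variant with an extra return value.
import numpy as np

def _preprare_data_in_groups(X, y=None, sample_weight=None):
    group_col = np.asarray(X[:, 0]).ravel()        # assumed: column 0 holds the group id
    order = np.argsort(group_col, kind="stable")   # row order that makes groups contiguous
    _, sizes = np.unique(group_col[order], return_counts=True)
    X_features = X[order][:, 1:]                   # features without the group column
    y_sorted = None if y is None else np.asarray(y)[order]
    w_sorted = None if sample_weight is None else np.asarray(sample_weight)[order]
    return sizes, order, X_features, y_sorted, w_sorted
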
Example #4
def predict_xgboost_answers(xgb_model):
    # write the trained model's predictions on the test set as ((codex, article), probability)
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)

    pred = xgb_model.predict(test_dmatrix)
    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    prediction_answer = []
    for p, doc_id in zip(
            pred,
            list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)):
        prediction_answer.append((doc_id, p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)
    predictions = [str(answer) for answer in prediction_answer]
    with open(predict_file, 'w+', encoding="utf-8") as f:
        f.write('\n'.join(predictions))
 def predict(self, X, output_margin=False, ntree_limit=0):
     sizes, _, X_features, _, _ = _preprare_data_in_groups(X)
     test_dmatrix = DMatrix(X_features, missing=self.missing)
     test_dmatrix.set_group(sizes)
     rank_values = self.get_booster().predict(test_dmatrix,
                                              output_margin=output_margin,
                                              ntree_limit=ntree_limit)
     return rank_values
Example #6
 def predict(self, X, group=None, output_margin=False, ntree_limit=0):
     if group is None:
         group = [X.shape[0]]
     test_dmatrix = DMatrix(X, missing=self.missing)
     test_dmatrix.set_group(group)
     rank_values = self.booster().predict(test_dmatrix,
                                          output_margin=output_margin,
                                          ntree_limit=ntree_limit)
     return rank_values
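
# Usage sketch (assumed names): with group=None the whole of X is scored as one query;
# when X stacks several queries, pass their sizes explicitly, e.g.
#     scores = ranker.predict(x_test)                      # one group of len(x_test) rows
#     scores = ranker.predict(x_test, group=[10, 10, 5])   # three queries of 10, 10 and 5 rows
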
Example #7
def get_pairs_rank_score(loaded_model, text_list):
    test_group_list, test_data_list, test_target_list = test_data_generation(
        text_list)
    # print(test_group_list, '\n*******test_group_list************')
    # print(test_data_list, '\n*********test_data_list**********')
    # print(test_target_list, '\n*********test_target_list**********')
    xgbTest = DMatrix(np.asmatrix(test_data_list), label=test_target_list)
    xgbTest.set_group(test_group_list)
    results = loaded_model.predict(xgbTest)
    return results
Example #8
def train(model_file):
    #  This script demonstrates how to do ranking with xgboost.train
    x_train, y_train = load_svmlight_file("mq2008.train")
    x_valid, y_valid = load_svmlight_file("mq2008.vali")
    x_test, y_test = load_svmlight_file("mq2008.test")

    group_train = []
    with open("mq2008.train.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_train.append(int(line.split("\n")[0]))

    group_valid = []
    with open("mq2008.vali.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_valid.append(int(line.split("\n")[0]))

    group_test = []
    with open("mq2008.test.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)

    params = {
        'objective': 'rank:pairwise',
        'eta': 0.01,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 8
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)
    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)
    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('feature_importance.png', dpi=800, format='png')
Example #9
 def predict(self, X, output_margin=False, ntree_limit=0):
     '''
     X (array_like) – The dmatrix storing the input.
     output_margin (bool) – Whether to output the raw untransformed margin value.
     ntree_limit (int) – Limit number of trees in the prediction; defaults to 0 (use all trees).
     '''
     sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)
     test_dmatrix = DMatrix(X_features, missing=self.missing)
     test_dmatrix.set_group(sizes)
     rank_values = self.get_booster().predict(test_dmatrix,
                                              output_margin=output_margin,
                                              ntree_limit=ntree_limit)
     revert_group_indices = np.arange(
         len(group_indices))[group_indices.argsort()]
     rank_values = rank_values[revert_group_indices]
     return rank_values
Example #10
    def predict(self, X, group=None, output_margin=False, ntree_limit=0):
        unsort = (group is not None)
        if group is None:
            group = [X.shape[0]]
        else:
            group = np.asarray(group)
            idx = np.argsort(group)
            X = X[idx, :]
            group = group[idx]
            unique, counts = np.unique(group, return_counts=True)
            group = counts[np.argsort(unique)]

        test_dmatrix = DMatrix(X, missing=self.missing)
        test_dmatrix.set_group(group)
        rank_values = self.get_booster().predict(test_dmatrix,
                                                 output_margin=output_margin,
                                                 ntree_limit=ntree_limit)
        if unsort:
            rank_values = rank_values[np.argsort(idx)]
        return rank_values
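
# Usage sketch (assumed names): unlike the other predict variants, `group` here is a
# per-row array of group labels; the method sorts the rows by label, converts the labels
# into group sizes for set_group, and restores the original score order at the end.
#     labels = np.array([2, 1, 2, 1, 1])          # one group label per row of X
#     scores = ranker.predict(X, group=labels)    # scores returned in the original row order
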
Example #11
def train_ranking():
    train_group_list, train_data_list, train_target_list = data_generation({})
    test_group_list, test_data_list, test_target_list = train_group_list, train_data_list, train_target_list
    eval_group_list, eval_data_list, eval_target_list = train_group_list, train_data_list, train_target_list

    xgbTrain = DMatrix(np.asmatrix(train_data_list), label=train_target_list)
    xgbTrain.set_group(train_group_list)

    xgbEval = DMatrix(np.asmatrix(eval_data_list), label=eval_target_list)
    xgbEval.set_group(eval_group_list)
    evallist = [(xgbTrain, 'train'), (xgbEval, 'eval')]

    rankModel = train(xgb_rank_params2,
                      xgbTrain,
                      num_boost_round=50,
                      evals=evallist)
    rankModel.save_model('xgb.model')
    loaded_model = xgb.Booster(model_file='xgb.model')
    xgbTest = DMatrix(np.asmatrix(test_data_list), label=test_target_list)
    xgbTest.set_group(test_group_list)
    results = loaded_model.predict(xgbTest)

    with open('results.txt', mode='w', encoding='utf-8') as f:
        for item in results:
            f.write(str(item) + '\n')
def train():
    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)

    params = {
        'objective': 'rank:pairwise',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    with open(curr_dir + './data/data_model/pairwise_origin_version.model',
              'wb') as f:
        pickle.dump(xgb_model, f, pickle.HIGHEST_PROTOCOL)
    return 1
def predict_norm(request, features=None, iifh=None):
    new_request = Request(request, "", "").create_dict()

    xgb_model = joblib.load("final_xgb_model.sav")

    create_feature_files_for_all_requests([new_request], "files/", features)
    features = pd.read_pickle(f"{PATH_TO_FILES}/0.pickle")

    x_test = features.drop(['is_rel', '7'], axis=1)
    group_test = [CNT_ARTICLES]

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)

    corpus = SimpleCorp.load("codexes_corp_articles", os.path.join(PATH_TO_FILES, "corp"))
    art_names = SimpleCorp.load('codexes_corp_art_names', f'{PATH_TO_FILES}/corp')

    prediction_answer = []
    for p, doc_id in zip(pred, list(corpus.corpus.keys())):
        prediction_answer.append((doc_id, p))

    prediction_answer.sort(key=lambda x: x[1], reverse=True)

    valid_answers = []
    for i, res in enumerate(prediction_answer[:5]):
        doc_id = res[0]
        cod = name_codexes[int(doc_id[0])]
        answer = f"<p>Cтатья {doc_id[1]}. {art_names.get_doc(doc_id)} // {cod[0]}{cod[1:].lower()}.</p>"
        print(answer)
        if iifh is not None:
            snippet = iifh.hightlight_words(request, doc_id)
            answer += f"<p style='padding-left:50px; border-left: 3px gray;'>{snippet}</p>"
            print(snippet)
        valid_answers.append(Markup(answer))
    return valid_answers
Example #14
def main():
    #  Import training data
    x_train, y_train, qid_train = load_svmlight_file("hn.train", query_id=True)
    x_valid, y_valid, qid_valid = load_svmlight_file("hn.vali", query_id=True)
    x_test, y_test, qid_test = load_svmlight_file("hn.test", query_id=True)

    group_train = group_qid(qid_train)
    group_valid = group_qid(qid_valid)
    group_test = group_qid(qid_test)

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)
    test_dmatrix.set_group(group_test)

    # Train Xgboost with basic parameters
    params = {'objective': 'rank:pairwise', 'eta': 0.1,
              # 'gamma': 1.0,
              # 'min_child_weight': 0.1,
              'max_depth': 3}
    params['eval_metric'] = ['ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10']
    xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)

    data_predict = regroup_results(group_test, pred, y_test)

    # Testing random sample
    # Simple debug function that print algolia results and predictions
    def print_random_sample(line):
        prevsum = sum(group_test[:line])
        print('Algolia clicks are: {}'.format(y_test[prevsum:prevsum + group_test[line]]))
        print('Predictions are: {}'.format(pred[prevsum:prevsum + group_test[line]]))
        print('Xgboost clicks are: {}'.format(data_predict[line]))
    print_random_sample(1)

    print('> Mean reciprocal rank is : {}'.format(
        mean_reciprocal_rank(data_predict)))
    print('> Mean average position is : {}'.format(
        mean_average_precision(data_predict)))

    # nDCG
    for i in [1, 3, 5, 10]:
        ndcg_ = []
        for query in data_predict:
            ndcg_.append(ndcg_at_k(query, i))
        print('> nDCG@{} is : {}'.format(i, pd.Series(ndcg_).mean()))
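
# `group_qid` is not defined in this example; a minimal sketch, assuming it counts
# consecutive identical query ids (the same idea as the groupby expression in Example #1):
from itertools import groupby

def group_qid(qids):
    return [len(list(g)) for _, g in groupby(qids)]
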
Example #15
def train(model_file):
    trans_data(data_path)
    #  This script demonstrates how to do ranking with xgboost.train
    x_train, y_train = load_svmlight_file(data_path + TASK + ".train")
    x_valid, y_valid = load_svmlight_file(data_path + TASK + ".valid")
    x_test, y_test = load_svmlight_file(data_path + TASK + ".test")
    print("train data shape: [%d, %d]" % (x_train.shape[0], x_train.shape[1]))

    group_train = load_group_data(data_path + TASK + ".train.group")
    group_valid = load_group_data(data_path + TASK + ".valid.group")
    group_test = load_group_data(data_path + TASK + ".test.group")

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test, y_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)
    test_dmatrix.set_group(group_test)

    params = {'booster':'gbtree', 'objective': 'rank:pairwise', 'eta': 0.01, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 2, \
              'eval_metric':'ndcg@1'}     # ndcg@1, logloss
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(train_dmatrix, 'train'),
                                 (valid_dmatrix, 'valid'),
                                 (test_dmatrix, 'test')])
    pred = xgb_model.predict(test_dmatrix)
    print("save model to %s" % (model_file))
    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)
    joblib.dump(xgb_model, 'rank_model/xgb_clf.m')
    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('rank_model/feature_importance.png', dpi=800, format='png')
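
# `load_group_data` is not shown here; a minimal sketch, assuming each .group file holds
# one group size per line (the same format read manually in Example #8):
def load_group_data(group_file):
    with open(group_file, "r", encoding="utf8") as f:
        return [int(line.strip()) for line in f if line.strip()]
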
##################################################################
## DMatrix
## generate training dataset
# 2 groups * 3 samples per group = 6 samples in total, with 2 features each
n_group = 2
n_choice = 3
dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain.shape)  # (6, 2)
# numpy.random.choice(a, size=None, replace=True, p=None)
dtarget = np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(); print(dtarget)  # [1 0 2 1 0 2]
# dgroup lists how many samples each group has, from first to last; the groups must be stored contiguously, so [3, 3] means the first 3 of the 6 samples form group one and the last 3 form group two
dgroup = np.array([n_choice for i in range(n_group)]).flatten(); print(dgroup)  # [3 3]

# assemble the training DMatrix with its group sizes, very important here!
xgbTrain = DMatrix(dtrain, label=dtarget)
xgbTrain.set_group(dgroup)

# generate eval data
dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain_eval.shape)  # (6, 2)
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)
xgbTrain_eval.set_group(dgroup)
evallist  = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]

# train model
# passing the evals argument together with xgb_rank_params1 raises an error; the cause has not been found yet
# rankModel = train(xgb_rank_params1, xgbTrain, num_boost_round=10)
rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=20, evals=evallist)

# test dataset
dtest = np.random.uniform(0, 100, [n_group*n_choice, 2]); print(dtest.shape)  # (6, 2)
dtestgroup = np.array([n_choice for i in range(n_group)]).flatten(); print(dtestgroup)  # [3 3]
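
# A possible continuation (assumed, mirroring Examples #7 and #11): wrap the test samples
# and their group sizes in a DMatrix and score them with the trained ranker.
xgbTest = DMatrix(dtest)
xgbTest.set_group(dtestgroup)
print(rankModel.predict(xgbTest))  # one relevance score per test sample
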
Example #17
class xgbLtr:
    def __init__(self):
        self.train_file = DATA_PATH + TASK + ".train"
        self.valid_file = DATA_PATH + TASK + ".valid"
        self.test_file = DATA_PATH + TASK + ".test"
        self.model_path = "rank_model/"
        self.model_name = TASK + "_xgb1.model"

    def load_data(self):
        x_train, y_train = load_svmlight_file(self.train_file)
        x_valid, y_valid = load_svmlight_file(self.valid_file)
        x_test, y_test = load_svmlight_file(self.test_file)
        print("train data shape: [%d, %d]" %
              (x_train.shape[0], x_train.shape[1]))

        group_train = load_group_data(DATA_PATH + TASK + ".train.group")
        group_valid = load_group_data(DATA_PATH + TASK + ".valid.group")
        group_test = load_group_data(DATA_PATH + TASK + ".test.group")

        self.train_dmatrix = DMatrix(x_train, y_train)
        self.valid_dmatrix = DMatrix(x_valid, y_valid)
        self.test_dmatrix = DMatrix(x_test, y_test)

        self.train_dmatrix.set_group(group_train)
        self.valid_dmatrix.set_group(group_valid)
        self.test_dmatrix.set_group(group_test)

    def train(self):
        params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eta': 0.01,
            'gamma': 1.0,
            'min_child_weight': 0.1,
            'max_depth': 2,
            'eval_metric': 'ndcg@1'
        }  # ndcg@1, logloss
        xgb_model = xgb.train(params,
                              self.train_dmatrix,
                              num_boost_round=4,
                              evals=[(self.train_dmatrix, 'train'),
                                     (self.valid_dmatrix, 'valid'),
                                     (self.test_dmatrix, 'test')])
        pred = xgb_model.predict(self.test_dmatrix)
        print("save model to %s" % (self.model_path))
        xgb_model.dump_model(self.model_path + self.model_name + ".txt")
        xgb_model.save_model(self.model_path + self.model_name)
        joblib.dump(xgb_model, self.model_path + '/xgb_clf.m')
        # save figures
        plt.clf()
        xgb.plot_importance(xgb_model)
        plt.savefig(self.model_path + '/feature_importance.png',
                    dpi=800,
                    format='png')

    def plotXgboostTree(self):
        xgb_model = xgb.Booster(model_file=self.model_path + self.model_name)
        xgbclf = joblib.load(self.model_path + '/xgb_clf.m')
        #plt.clf();    xgb.plot_tree(xgbclf, num_trees=0, fmap='./xgb.fmap');    plt.savefig('xgb_tree.png', dpi=800, format='png'); exit(0)
        for i in range(4):
            #plt.clf()
            xgb.plot_tree(xgb_model,
                          num_trees=i,
                          fmap='./get_jdcv_data/feature.fmap')
            fig = plt.gcf()
            fig.set_size_inches(150, 100)
            fig.savefig('xgb_tree_' + str(i) + '.png')
            #plt.savefig('xgb_tree_' + str(i) + '.png', dpi=800, format='png')
            a = 1
        pass
Example #18
group_train = []
with open("mq2008.train.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_train.append(int(line.split("\n")[0]))

group_valid = []
with open("mq2008.vali.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))

group_test = []
with open("mq2008.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))

train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)

train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0,
               'min_child_weight': 0.1, 'max_depth': 6}
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                           evals=[(valid_dmatrix, 'validation')])
pred = xgb_model.predict(test_dmatrix)
Example #19
print('X_train.shape', X_train.shape)
print('X_val.shape', X_val.shape)
print('X_test.shape', X_test.shape)
print('y_train.shape', y_train.shape)
print('y_val.shape', y_val.shape)
print(set(X_train.columns) - set(X_test.columns))
print(set(X_val.columns) - set(X_test.columns))
print(set(X_train.columns) - set(X_val.columns))

#%% Prepare matrices
print('Creating Dmatrices')
train_dmatrix = DMatrix(X_train.values, y_train)
valid_dmatrix = DMatrix(X_val.values, y_val)
test_dmatrix = DMatrix(X_test.values)

train_dmatrix.set_group(train_group_sizes)
valid_dmatrix.set_group(val_group_sizes)
test_dmatrix.set_group(test_group_sizes)

del X_train
del X_val
del X_test

#%% XGBOOST MODEL: POSITION-BASED
#XGB model using efficient data structure Dmatrix
params = {
    'max_depth': 3,
    'min_child_weight': 10,
    'learning_rate': 0.3,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
Example #20
print("data load done!!!")

xgb_rank_params2 = {
    'bst:max_depth': 5,  # tree depth; deeper trees overfit more easily
    'bst:eta': 0.1,  # similar to a learning rate
    'silent': 0,  # 1 suppresses run-time messages; 0 is preferable
    'objective': 'rank:pairwise',
    'nthread': 8,  # number of CPU threads
    'eval_metric': 'ndcg@10-',
    'metric': 'ndcg@10-'
}

train_group_list, train_data_list, train_target_list = data_generation(
    train_dict)
xgbTrain = DMatrix(train_data_list, label=train_target_list)
xgbTrain.set_group(train_group_list)

eval_group_list, eval_data_list, eval_target_list = data_generation(eval_dict)
xgbEval = DMatrix(eval_data_list, label=eval_target_list)
xgbEval.set_group(eval_group_list)

# get evallist
evallist = [(xgbTrain, 'train'), (xgbEval, 'eval')]

# train
rankModel = train(xgb_rank_params2,
                  xgbTrain,
                  num_boost_round=5,
                  evals=evallist)

# get predict
Example #21
class xgbLtr:
    def __init__(self):
        self.train_file = DATA_PATH + TASK + ".train"
        self.valid_file = DATA_PATH + TASK + ".valid"
        self.test_file = DATA_PATH + TASK + ".test"
        self.model_path = conf.xgb_rank_model
        self.model_name = TASK + "_xgb.model"

    def load_data(self):
        print("train data file: %s" % (DATA_PATH))
        trans_data(DATA_PATH)
        x_train, y_train = load_svmlight_file(self.train_file)
        x_valid, y_valid = load_svmlight_file(self.valid_file)
        x_test, y_test = load_svmlight_file(self.test_file)
        #random.shuffle(y_train); random.shuffle(y_valid); random.shuffle(y_test)
        print("train data shape: [%d, %d]" %
              (x_train.shape[0], x_train.shape[1]))

        group_train = load_group_data(DATA_PATH + TASK + ".train.group")
        group_valid = load_group_data(DATA_PATH + TASK + ".valid.group")
        group_test = load_group_data(DATA_PATH + TASK + ".test.group")

        self.train_dmatrix = DMatrix(x_train, y_train)
        self.valid_dmatrix = DMatrix(x_valid, y_valid)
        self.test_dmatrix = DMatrix(x_test, y_test)

        self.train_dmatrix.set_group(group_train)
        self.valid_dmatrix.set_group(group_valid)
        self.test_dmatrix.set_group(group_test)

    def train(self):
        extra_pam = {}
        extra_pam = {'verbosity':0, 'validate_parameters': True, 'subsample':0.1, 'lambda': 0.6, 'alpha': 0.8,  \
                     'early_stopping_rounds':1}
        params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eta': 1e-3,
            'gamma': 10.0,
            'min_child_weight': 0.1,
            'max_depth': 6,
            'eval_metric': ['logloss']
        }  # ndcg@1, logloss,auc
        params.update(extra_pam)
        xgb_model = xgb.train(
            params,
            self.train_dmatrix,
            num_boost_round=100,  #evals=[(self.valid_dmatrix, 'valid')])
            evals=[(self.train_dmatrix, 'train'),
                   (self.valid_dmatrix, 'valid'), (self.test_dmatrix, 'test')])
        pred = xgb_model.predict(self.valid_dmatrix)
        print("save model to %s" % (self.model_path))
        xgb_model.dump_model(self.model_path + self.model_name + ".txt")
        xgb_model.save_model(self.model_path + self.model_name)
        joblib.dump(xgb_model, self.model_path + '/xgb_clf.m')
        # save figures
        plt.clf()
        xgb.plot_importance(xgb_model)
        plt.savefig(self.model_path + '/feature_importance.png',
                    dpi=800,
                    format='png')

    def plotXgboostTree(self):
        xgb_model = xgb.Booster(model_file=self.model_path + self.model_name)
        xgbclf = joblib.load(self.model_path + '/xgb_clf.m')
        #plt.clf();    xgb.plot_tree(xgbclf, num_trees=0, fmap='./xgb.fmap');    plt.savefig('xgb_tree.png', dpi=800, format='png'); exit(0)
        for i in range(4):
            #plt.clf()
            xgb.plot_tree(xgb_model,
                          num_trees=i,
                          fmap='./get_jdcv_data/feature.fmap')
            fig = plt.gcf()
            fig.set_size_inches(150, 100)
            fig.savefig('xgb_tree_' + str(i) + '.png')
            #plt.savefig('xgb_tree_' + str(i) + '.png', dpi=800, format='png')
            a = 1
        pass

    def predict(self, vec):
        print("xgb model file: %s" % (conf.xgb_rank_model))
        self.xgb_model = xgb.Booster(model_file=conf.xgb_rank_model +
                                     self.model_name)
        feature_vector = [0] * 30
        for ele in vec.split()[2:]:
            k, v = ele.split(":")
            try:
                val = int(v)
            except:
                val = float(v)
            feature_vector[int(k) - 1] = val
            a = 1
        feature = np.array(feature_vector)
        feature_csr = sparse.csr_matrix(feature)
        input = DMatrix(feature_csr)
        score = self.xgb_model.predict(input)[0]
        return score

    def test(self, fea_num=24, topk=1, path=conf.rank_data_file + "valid.txt"):
        xgb_dict = parse_xgb_dict(conf.xgb_rank_model + self.model_name +
                                  ".txt")

        def cal_score():
            pass

        xgb_model = xgb.Booster(model_file=conf.xgb_rank_model +
                                self.model_name)
        group_data = {}
        print("test file: %s\ttree number: %d" % (path, len(xgb_dict)))
        text = [
            line.strip().split()
            for line in open(path, encoding="utf8").readlines()
        ]
        for line in text:
            if line[1] not in group_data: group_data[line[1]] = []
            group_data[line[1]].append(line)
        group_data = {k: v for k, v in group_data.items() if len(v) > 1}
        ndcgs = []  #np.zeros(len(group_data))
        #for i, (_, datas) in enumerate(tqdm(group_data.items(), total=len(group_data))):
        for i, (_, datas) in enumerate(group_data.items()):
            score_label = []
            for ele in datas:
                feature_vector = [0] * fea_num
                label = int(ele[0])
                for e in ele[2:]:
                    k, v = e.split(":")
                    try:
                        val = int(v)
                    except:
                        val = float(v)
                    feature_vector[int(k) - 1] = val
                feature = np.array(feature_vector)
                feature_csr = sparse.csr_matrix(feature)
                input = DMatrix(feature_csr)
                score = xgb_model.predict(input)[0]  # score via xgboost's built-in predict
                #score = predict_proba(xgb_dict, feature)        # score via the predictor parsed from the dumped .txt model file
                score_label.append((score, label))
            sorted_score_label = sorted(score_label,
                                        key=lambda d: d[0],
                                        reverse=True)
            label_list = [label for score, label in sorted_score_label]
            dcg, idcg, ndcg = cal_ndcg(label_list, topk)
            if len(set(label_list)) <= 1: continue
            ndcgs.append(ndcg)  #[i] = ndcg
            print([(round(k, 3), v) for k, v in sorted_score_label],
                  round(ndcg, 3))
        ndcgs_mean = np.mean(np.array(ndcgs))  #np.mean(ndcgs)
        print("topk: %d\tndcgs mean: %.3f" % (topk, ndcgs_mean))
        pass
Example #22
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            eval_metric=None,
            early_stopping_rounds=None,
            verbose=True,
            xgb_model=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix with the first feature containing a group indicator
        y : array_like
            Labels
        sample_weight : array_like
            instance weights
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for
            early-stopping
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue training.
            Requires at least one item in evals.  If there's more than one,
            will use the last. Returns the model from the last iteration
            (not the best one). If early stopping occurs, the model will
            have three additional fields: bst.best_score, bst.best_iteration
            and bst.best_ntree_limit.
            (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
            and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        xgb_model : str
            file name of stored xgb model or 'Booster' instance Xgb model to be
            loaded before training (allows training continuation).
        """

        X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True)
        group_sizes, _, X_features, y, sample_weight = _preprare_data_in_groups(
            X, y, sample_weight)
        params = self.get_xgb_params()
        evals_result = {}
        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                params.update({'eval_metric': eval_metric})

        if sample_weight is not None:
            train_dmatrix = DMatrix(X_features,
                                    label=y,
                                    weight=sample_weight,
                                    missing=self.missing)
        else:
            train_dmatrix = DMatrix(X_features, label=y, missing=self.missing)
        train_dmatrix.set_group(group_sizes)

        self._Booster = train(params,
                              train_dmatrix,
                              self.n_estimators,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result,
                              obj=None,
                              feval=feval,
                              verbose_eval=verbose,
                              xgb_model=xgb_model)

        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[
                    val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_score = self._Booster.best_score
            self.best_iteration = self._Booster.best_iteration
            self.best_ntree_limit = self._Booster.best_ntree_limit

        return self
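
# Usage sketch (assumed names): X carries the group id in its first column, as stated in
# the docstring above, so a single 2-D array is enough to describe the groups.
#     ranker.fit(X_train_with_group, y_train, eval_metric='ndcg@10')
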
Example #23
def cv_eval_lambdaMART_in_XGBoost(para_dict=None):
    # common parameters across different models
    debug, dataset, dir_data, model = para_dict['debug'], para_dict[
        'dataset'], para_dict['dir_data'], para_dict['model']
    min_docs, min_rele, cutoffs = para_dict['min_docs'], para_dict[
        'min_rele'], para_dict['cutoffs']
    do_validation, validation_k, do_log = para_dict[
        'do_validation'], para_dict['validation_k'], para_dict['do_log']
    eta, gamma, min_child_weight, max_depth, tree_method = para_dict[
        'eta'], para_dict['gamma'], para_dict['min_child_weight'], para_dict[
            'max_depth'], para_dict['tree_method']

    if debug:
        fold_num = 2
    else:
        fold_num = 5

    model_output = update_output_setting(para_dict=para_dict)
    if do_log:  # open log file
        sys.stdout = open(model_output + 'log.txt', "w")

    time_begin = datetime.datetime.now()  # timing
    l2r_cv_avg_scores = np.zeros(len(cutoffs))  # fold average
    for fold_k in range(1, fold_num + 1):
        print(
            '\nFold-',
            fold_k)  # fold-wise data preparation plus certain light filtering

        dir_fold_k = dir_data + 'Fold' + str(fold_k) + '/'
        ori_file_train, ori_file_vali, ori_file_test = dir_fold_k + 'train.txt', dir_fold_k + 'vali.txt', dir_fold_k + 'test.txt'

        file_train_data, file_train_group = load_data_xgboost(
            ori_file_train,
            min_docs=min_docs,
            min_rele=min_rele,
            dataset=dataset)
        file_vali_data, file_vali_group = load_data_xgboost(ori_file_vali,
                                                            min_docs=min_docs,
                                                            min_rele=min_rele,
                                                            dataset=dataset)
        file_test_data, file_test_group = load_data_xgboost(ori_file_test,
                                                            min_docs=min_docs,
                                                            min_rele=min_rele,
                                                            dataset=dataset)

        x_train, y_train = load_svmlight_file(file_train_data)
        group_train = load_group_data(file_train_group)
        train_dmatrix = DMatrix(x_train, y_train)
        train_dmatrix.set_group(group_train)

        if do_validation:
            x_valid, y_valid = load_svmlight_file(file_vali_data)
            group_valid = load_group_data(file_vali_group)
            valid_dmatrix = DMatrix(x_valid, y_valid)
            valid_dmatrix.set_group(group_valid)

        x_test, y_test = load_svmlight_file(file_test_data)
        group_test = load_group_data(file_test_group)
        test_dmatrix = DMatrix(x_test)
        """ possible settings of params """
        # params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}

        # ndcg
        # params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
        #params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6, 'eval_metric': 'ndcg@10'}

        params = {
            'objective': 'rank:ndcg',
            'eta': eta,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'max_depth': max_depth,
            'eval_metric': 'ndcg@10-',
            'tree_method': tree_method
        }  # if idealDCG=0, then 0

        # map
        # params = {'objective': 'rank:map', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}

        if do_validation:
            fold_xgb_model = xgb.train(params,
                                       train_dmatrix,
                                       num_boost_round=500,
                                       evals=[(valid_dmatrix, 'validation')])
        else:
            fold_xgb_model = xgb.train(params,
                                       train_dmatrix,
                                       num_boost_round=500)

        fold_checkpoint = '-'.join(['Fold', str(fold_k)])  # buffer model
        save_dir = model_output + fold_checkpoint + '/'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        with open(save_dir + '_'.join(['fold', str(fold_k), 'model']) + '.dat',
                  'wb') as model_file:
            pickle.dump(fold_xgb_model, model_file)

        pred = fold_xgb_model.predict(test_dmatrix)  # fold-wise performance
        fold_avg_ndcg_at_ks = cal_nDCG_at_ks(all_std_labels=y_test,
                                             all_preds=pred,
                                             group=group_test,
                                             ks=cutoffs)
        performance_list = [model + ' Fold-' + str(fold_k)]
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(
                co, fold_avg_ndcg_at_ks[i]))
        performance_str = '\t'.join(performance_list)
        print('\n\t', performance_str)

        l2r_cv_avg_scores = np.add(
            l2r_cv_avg_scores,
            fold_avg_ndcg_at_ks)  # sum for later cv-performance

    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n")

    print()  # begin to print either cv or average performance
    l2r_cv_avg_scores = np.divide(l2r_cv_avg_scores, fold_num)
    if do_validation:
        eval_prefix = str(fold_num) + '-fold cross validation scores:'
    else:
        eval_prefix = str(fold_num) + '-fold average scores:'

    print(model, eval_prefix,
          to_output_str(list_scores=l2r_cv_avg_scores, list_cutoffs=cutoffs))

    return l2r_cv_avg_scores
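
# Usage sketch: `para_dict` must provide at least the keys unpacked at the top of the
# function; the values below are placeholders, not tuned recommendations.
para_dict = {
    'debug': False, 'dataset': 'MQ2008', 'dir_data': '/path/to/MQ2008/',
    'model': 'LambdaMART', 'min_docs': 10, 'min_rele': 1, 'cutoffs': [1, 3, 5, 10],
    'do_validation': True, 'validation_k': 10, 'do_log': False,
    'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6,
    'tree_method': 'exact',
}
# l2r_cv_avg_scores = cv_eval_lambdaMART_in_XGBoost(para_dict=para_dict)
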
Example #24
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))
group_test = []
with open("data_lambdaMART\\qac.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))

# create the train/validation/test DMatrix objects required by xgboost's lambdaMART implementation
train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)

# set the groups for the training and validation sets
train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

# LambdaMART parameters
params = {'objective': 'rank:pairwise', 'n_estimators': 300, 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1,
          'max_depth': 6}

# create lambdaMart with the aforementioned parameters
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4, evals=[(valid_dmatrix, 'validation')])

# run the prediction process on the test set
predictions = xgb_model.predict(test_dmatrix)

# plot the importance of the features in the training and validation sets
xgb.plot_importance(xgb_model)
 def _dmat_init(group, **params):
     ret = DMatrix(**params)
     ret.set_group(group)
     return ret
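
 # Usage sketch (assumed names, mirroring how Example #27 builds its evaluation DMatrices):
 #     valid_dmatrix = _dmat_init(group_valid, data=x_valid, label=y_valid)
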
Example #26
    def fit(self,
            X,
            y,
            group=None,
            eval_metric=None,
            sample_weight=None,
            early_stopping_rounds=None,
            verbose=True):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        group : list, optional
            Group size list. All of X and y is treated as a single group when group is not provided; ranking is only meaningful within each group.
        sample_weight : array_like
            instance weights
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for
            early-stopping
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue training.
            Requires at least one item in evals.  If there's more than one,
            will use the last. Returns the model from the last iteration
            (not the best one). If early stopping occurs, the model will
            have three additional fields: bst.best_score, bst.best_iteration
            and bst.best_ntree_limit.
            (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
            and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        xgb_model : str
            file name of stored xgb model or 'Booster' instance Xgb model to be
            loaded before training (allows training continuation).
        """
        if group is None:
            group = [X.shape[0]]

        params = self.get_xgb_params()

        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            # Use the default value. Is it really not used?
            params["objective"] = "rank:pairwise"
        else:
            obj = None

        evals_result = {}
        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                params.update({'eval_metric': eval_metric})

        if sample_weight is not None:
            train_dmatrix = DMatrix(X,
                                    label=y,
                                    weight=sample_weight,
                                    missing=self.missing)
        else:
            train_dmatrix = DMatrix(X, label=y, missing=self.missing)
        train_dmatrix.set_group(group)

        self.objective = params["objective"]

        self._Booster = train(params,
                              train_dmatrix,
                              self.n_estimators,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result,
                              obj=obj,
                              feval=feval,
                              verbose_eval=verbose,
                              xgb_model=None)

        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[
                    val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_score = self._Booster.best_score
            self.best_iteration = self._Booster.best_iteration
            self.best_ntree_limit = self._Booster.best_ntree_limit

        return self
Example #27
    # 'nthread': 4,
    'eval_metric': 'ndcg'
}
# generate training dataset
# 2 groups * 3 samples per group = 6 samples in total, with 2 features each
n_group = 2
n_choice = 3
dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2])
# numpy.random.choice(a, size=None, replace=True, p=None)
dtarget = np.array(
    [np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten()
# dgroup lists how many samples each group has, from first to last; the groups must be stored contiguously, so [3, 3] means the first 3 of the 6 samples form group one and the last 3 form group two
dgroup = np.array([n_choice for i in range(n_group)]).flatten()
# assemble the training DMatrix with its group sizes, very important here!
xgbTrain = DMatrix(dtrain, label=dtarget)
xgbTrain.set_group(dgroup)
# generate eval data
dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2])
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)
xgbTrain_eval.set_group(dgroup)
evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]
# train model
# passing the evals argument together with xgb_rank_params1 raises an error; the cause has not been found yet
# rankModel = train(xgb_rank_params1,xgbTrain,num_boost_round=10)
rankModel = train(xgb_rank_params2,
                  xgbTrain,
                  num_boost_round=20,
                  evals=evallist)
# test dataset
dtest = np.random.uniform(0, 100, [n_group * n_choice, 2])
dtestgroup = np.array([n_choice for i in range(n_group)]).flatten()
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            eval_metric=None,
            early_stopping_rounds=None,
            verbose=True,
            xgb_model=None,
            callbacks=None,
            learning_rates=None):
        """
        Fit the gradient boosting model
        Parameters
        ----------
        X : array_like
            Feature matrix with the first feature containing a group indicator
        y : array_like
            Labels
        sample_weight : array_like
            instance weights
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for
            early-stopping
        eval_metric : str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            DMatrix object such that you may need to call the get_label
            method. It must return a str, value pair where the str is a name
            for the evaluation and value is the value of the evaluation
            function. This objective is always minimized.
        early_stopping_rounds : int
            Activates early stopping. Validation error needs to decrease at
            least every <early_stopping_rounds> round(s) to continue training.
            Requires at least one item in evals.  If there's more than one,
            will use the last. Returns the model from the last iteration
            (not the best one). If early stopping occurs, the model will
            have three additional fields: bst.best_score, bst.best_iteration
            and bst.best_ntree_limit.
            (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
            and/or num_class appears in the parameters)
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            metric measured on the validation set to stderr.
        learning_rates: list or function (deprecated - use callback API instead)
            List of learning rate for each boosting round
            or a customized function that calculates eta in terms of
            current number of round and the total number of boosting round (e.g. yields
            learning rate decay)
        xgb_model : file name of stored xgb model or 'Booster' instance
            Xgb model to be loaded before training (allows training continuation).
        callbacks : list of callback functions
            List of callback functions that are applied at end of each iteration.
            It is possible to use predefined callbacks by using xgb.callback module.
            Example: [xgb.callback.reset_learning_rate(custom_rates)]
        """

        #X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True)

        sizes, _, X_features, y, _, _ = self._preprare_data_in_groups(X, y)

        params = self.get_xgb_params()

        if callable(self.objective):
            obj = _objective_decorator(self.objective)
            # Dummy, Not used when custom objective is given
            params["objective"] = "binary:logistic"
        else:
            obj = None

        evals_result = {}
        feval = eval_metric if callable(eval_metric) else None
        if eval_metric is not None:
            if callable(eval_metric):
                eval_metric = None
            else:
                params.update({'eval_metric': eval_metric})

        if sample_weight is not None:
            train_dmatrix = DMatrix(X_features,
                                    label=y,
                                    weight=sample_weight,
                                    missing=self.missing)
        else:
            train_dmatrix = DMatrix(X_features, label=y, missing=self.missing)

        train_dmatrix.set_group(sizes)

        def _dmat_init(group, **params):
            ret = DMatrix(**params)
            ret.set_group(group)
            return ret

        eval_group = []
        neval_set = []
        if eval_set:
            for i in range(len(eval_set)):
                seval_group, _, X_features, y, _, _ = self._preprare_data_in_groups(
                    eval_set[i][0], eval_set[i][1])
                eval_group.append(seval_group)
                neval_set.append([X_features, y])
        if neval_set != []:
            sample_weight_eval_set = [None] * len(eval_set)
            evals = [
                _dmat_init(eval_group[i],
                           data=neval_set[i][0],
                           label=neval_set[i][1],
                           missing=self.missing,
                           weight=sample_weight_eval_set[i],
                           nthread=self.n_jobs) for i in range(len(neval_set))
            ]
            nevals = len(evals)
            eval_names = ["eval_{}".format(i) for i in range(nevals)]
            evals = list(zip(evals, eval_names))
        else:
            evals = ()

        self._Booster = train(params,
                              train_dmatrix,
                              self.n_estimators,
                              evals=evals,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result,
                              obj=obj,
                              feval=feval,
                              verbose_eval=verbose,
                              xgb_model=xgb_model,
                              learning_rates=learning_rates,
                              callbacks=callbacks)

        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[
                    val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_score = self._Booster.best_score
            self.best_iteration = self._Booster.best_iteration
            self.best_ntree_limit = self._Booster.best_ntree_limit

        return self
Example #29
def main_xgb(fold_offset):
    all_races_rank_regression = []
    all_races_query_regression = []
    all_races_target_regression = []
    get_race_gets(all_races_train, all_races_rank_regression,
                  all_races_query_regression, all_races_target_regression)
    all_races_rank_regression = np.array(all_races_rank_regression)
    all_races_query_regression = np.array(all_races_query_regression)
    all_races_target_regression = np.array(all_races_target_regression)

    if use_history:
        categorical_feature = [0, 1, 2, 3, 4] + list(range(6, 21))
    else:
        categorical_feature = [0, 1, 2, 3, 4]
    categorical_dim = [
        int(np.max(all_races_rank_regression[:, c]))
        for c in categorical_feature
    ]

    del all_races_rank_regression, all_races_query_regression, all_races_target_regression

    def get_matrix(mat):
        shape = list(mat.shape)
        shape[1] += int(np.sum(categorical_dim)) - len(categorical_feature)
        matrix = np.zeros(tuple(shape))
        cur_dim = 0
        cur_ind = 0
        while cur_dim < shape[1]:
            if cur_ind in categorical_feature:
                dim = categorical_dim[categorical_feature.index(cur_ind)]
                for z in range(shape[0]):
                    matrix[z, cur_dim + int(mat[z, cur_ind])] = 1
                cur_dim += dim
            else:
                matrix[:, cur_dim] = mat[:, cur_ind]
                cur_dim += 1
            cur_ind += 1
        return matrix

    if len(test_src) > 0:
        all_races_rank_test_x = get_matrix(all_races_rank_test)
    if len(in_data) != 0 and len(in_meta) != 0:
        predict_races_target_x = get_matrix(predict_races_target)

    for fold_id, (train_index, test_index) in enumerate(
            KFold(n_splits=10).split(all_races_train)):
        all_races_train_train = all_races_train[train_index]
        all_races_train_valid = all_races_train[test_index]
        all_races_rank_train_train = []
        all_races_query_train_train = []
        all_races_target_train_train = []
        all_races_rank_train_valid = []
        all_races_query_train_valid = []
        all_races_target_train_valid = []
        get_race_gets(all_races_train_train, all_races_rank_train_train,
                      all_races_query_train_train,
                      all_races_target_train_train)
        get_race_gets(all_races_train_valid, all_races_rank_train_valid,
                      all_races_query_train_valid,
                      all_races_target_train_valid)
        all_races_rank_train_train = get_matrix(
            np.array(all_races_rank_train_train))
        all_races_query_train_train = np.array(all_races_query_train_train)
        all_races_target_train_train = np.array(all_races_target_train_train)
        all_races_rank_train_valid = get_matrix(
            np.array(all_races_rank_train_valid))
        all_races_query_train_valid = np.array(all_races_query_train_valid)
        all_races_target_train_valid = np.array(all_races_target_train_valid)

        xgb_params = {
            'objective': 'rank:pairwise',
            'eta': 0.1,
            'gamma': 0.0001,
            'min_child_weight': 0.1,
            'max_depth': 6
        }
        xgtrain = DMatrix(all_races_rank_train_train,
                          all_races_target_train_train)
        xgtrain.set_group(all_races_query_train_train)
        xgvalid = DMatrix(all_races_rank_train_valid,
                          all_races_target_train_valid)
        xgvalid.set_group(all_races_query_train_valid)

        del all_races_train_train, all_races_train_valid, all_races_rank_train_train, all_races_target_train_train, all_races_query_train_train, all_races_rank_train_valid, all_races_target_train_valid, all_races_query_train_valid

        xgb_clf = xgb.train(xgb_params,
                            xgtrain,
                            num_boost_round=10,
                            evals=[(xgvalid, 'validation')])
        del xgtrain, xgvalid

        if len(test_src) > 0:
            dst = norm_racedata(
                xgb_clf.predict(DMatrix(all_races_rank_test_x)),
                all_races_query_test)
            for dst_ind in range(len(dst)):
                test_validation_regression[dst_ind][fold_offset +
                                                    fold_id] = dst[dst_ind]
            cur_pos = 0
        if len(in_data) != 0 and len(in_meta) != 0:
            dst = norm_racedata(
                xgb_clf.predict(DMatrix(predict_races_target_x)),
                [len(predict_races_target_x)])
            for dst_ind in range(len(dst)):
                predict_validation_regression[dst_ind][fold_offset +
                                                       fold_id] = dst[dst_ind]
 def _dmat_init(data, labels, **params):
     sizes, _, X_features, y, _ = _preprare_data_in_groups(data, labels)
     ret = DMatrix(X_features, y, **params)
     ret.set_group(sizes)
     return ret
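
# Usage sketch (assumed names): this variant regroups each (X, y) pair itself, so an
# evaluation DMatrix can be built directly from a raw feature matrix and its labels:
#     valid_dmatrix = _dmat_init(X_valid_with_group, y_valid, missing=np.nan)
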