Example #1
def log_focal_loss_obj(preds: np.ndarray, dtrain: xgb.DMatrix):
    labels = dtrain.get_label()
    kRows, kClasses = preds.shape

    if dtrain.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = dtrain.get_weight()

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    for r in range(kRows):
        target = int(labels[r])
        assert 0 <= target < kClasses
        p = softmax(preds[r, :])
        grad_r, hess_r = focal_logloss_derivative_gamma2(p, target)
        grad[r] = grad_r * weights[r]
        hess[r] = hess_r * weights[r]

    # Right now (XGBoost 1.0.0), reshaping is necessary
    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))

    return grad, hess
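A custom multiclass objective like this is plugged into xgb.train through the obj argument. A minimal usage sketch, not from the original source; dtrain and the parameter values are assumptions:

# Sketch (assumed setup): dtrain is a DMatrix whose labels are integer
# class ids; num_class must match the kClasses seen by the objective.
params = {'num_class': 3, 'disable_default_eval_metric': 1}
booster = xgb.train(params, dtrain, num_boost_round=10,
                    obj=log_focal_loss_obj)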
Example #2
def predict_xgboost_answers(xgb_model):
    # Write the trained model's predictions on the test set as
    # ((codex, article), probability) pairs.
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))
    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)
    prediction_answer = []
    for i, p in enumerate(pred):
        prediction_answer.append(
            (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    # mode 'w' truncates, so no need to remove the file first
    with open(predict_file, 'w', encoding="utf-8") as f:
        predictions = [str(p) for p in prediction_answer]
        f.write('\n'.join(predictions))
Example #3
    def apply(self, X, ntree_limit=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        ntree_limit : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
            For each datapoint x in X and for each tree, return the index of the
            leaf x ends up in. Leaves are numbered within
            ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
        """
        sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)
        test_dmatrix = DMatrix(X_features, missing=self.missing)
        test_dmatrix.set_group(sizes)
        X_leaves = self.get_booster().predict(test_dmatrix,
                                              pred_leaf=True,
                                              ntree_limit=ntree_limit)
        revert_group_indices = np.arange(
            len(group_indices))[group_indices.argsort()]
        X_leaves = X_leaves[revert_group_indices, :]
        return X_leaves
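The leaf indices returned by apply() are commonly one-hot encoded into features for a downstream linear model, as the xgboost-plus-LR code in Example #19 below does. A hypothetical sketch; ranker and X are assumptions:

# Sketch: turn per-tree leaf ids into sparse one-hot features.
from sklearn.preprocessing import OneHotEncoder

X_leaves = ranker.apply(X)  # shape [n_samples, n_trees]
leaf_features = OneHotEncoder().fit_transform(X_leaves)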
Example #4
def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    '''Root mean squared log error metric.'''
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))
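A metric with this signature is handed to xgb.train as feval (newer XGBoost releases name the argument custom_metric). A minimal sketch, assuming dtrain and dtest DMatrix objects already exist:

booster = xgb.train({'objective': 'reg:squaredlogerror'},
                    dtrain,
                    num_boost_round=10,
                    evals=[(dtest, 'Test')],
                    feval=rmsle)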
Example #5
def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    price = dtrain.get_weight()
    return (((np.log1p(price) * (y - predt) + predt + 1) / predt + 1) +
            (np.log1p(price) - 1) * np.log1p(predt) -
            np.log1p(y)) / (np.log1p(price) * (y - predt) + predt + 1)**2
Example #6
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient of the squared log error.'''
    y = dtrain.get_label()
    price = dtrain.get_weight()
    return (np.log1p(predt) - np.log1p(y)) / (
        (predt + 1) + (np.log1p(price) * (y - predt))
    )  # the log variant worked best; maybe apply log to the whole bottom part of the equation
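Examples #5 and #6 are the two halves of a custom objective: xgb.train expects a single callable returning (grad, hess). A minimal combining sketch, assuming both helpers and a weighted dtrain are in scope (the name weighted_sle_objective is made up here):

def weighted_sle_objective(predt: np.ndarray, dtrain: xgb.DMatrix):
    # Assemble the custom objective from the two helpers above.
    return gradient(predt, dtrain), hessian(predt, dtrain)

booster = xgb.train({'tree_method': 'hist'}, dtrain,
                    num_boost_round=10, obj=weighted_sle_objective)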
Example #7
def cv(X_t, y_t, X_test, y_test):
    '''
    Uses DMatrix, the optimized data structure offered by xgboost.
    Exercises the new train/cv APIs in xgboost, to be used for
    hyperparameter tuning.
    '''
    dtrain = DMatrix(X_t, label=y_t)
    dtest = DMatrix(X_test, label=y_test)
    params = {"objective": "binary:logistic",
              'max_depth': 9,
              'min_child_weight': 2,
              'subsample': 0.8,
              'eta': 0.1,
              'alpha': 0.2,
              'lambda': 0.2,
              'eval_metric': 'auc'}
    num_boost_round = 313  # model.best_iteration + 1
    # fit
    model = xgb.train(params,
                      dtrain,
                      num_boost_round=num_boost_round,
                      evals=[(dtest, 'Test')])

    return model
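The docstring mentions cv; the companion call for tuning num_boost_round would be xgb.cv with the same params. A sketch under assumed settings (nfold and seed are illustrative):

cv_results = xgb.cv(params, dtrain,
                    num_boost_round=500,
                    nfold=5,
                    early_stopping_rounds=50,
                    seed=42)
best_num_boost_round = len(cv_results)  # one row per surviving round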
Example #8
def predict_xgboost_answers(xgb_model):
    # Write the trained model's predictions on the test set as
    # ((codex, article), probability) pairs.
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)

    pred = xgb_model.predict(test_dmatrix)
    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    prediction_answer = []
    for p, doc_id in zip(
            pred,
            list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)):
        prediction_answer.append((doc_id, p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    # mode 'w' truncates, so no need to remove the file first
    with open(predict_file, 'w', encoding="utf-8") as f:
        predictions = [str(p) for p in prediction_answer]
        f.write('\n'.join(predictions))
Example #9
    def predict(self, X, output_margin=False, ntree_limit=0):
        sizes, _, X_features, _, _ = _preprare_data_in_groups(X)
        test_dmatrix = DMatrix(X_features, missing=self.missing)
        test_dmatrix.set_group(sizes)
        rank_values = self.get_booster().predict(test_dmatrix,
                                                 output_margin=output_margin,
                                                 ntree_limit=ntree_limit)
        return rank_values
Example #10
    def predict(self, X, group=None, output_margin=False, ntree_limit=0):
        if group is None:
            # treat the whole input as a single query group
            group = [X.shape[0]]
        test_dmatrix = DMatrix(X, missing=self.missing)
        test_dmatrix.set_group(group)
        rank_values = self.booster().predict(test_dmatrix,
                                             output_margin=output_margin,
                                             ntree_limit=ntree_limit)
        return rank_values
Example #11
def best_num_round(params, dall: xgb.DMatrix, cv_splits, verbose=True):
    params = _complete_params(params)
    train_idx, test_idx = cv_splits[-1]
    dtrain = dall.slice(train_idx)
    dtest = dall.slice(test_idx)
    bst = xgb.train(params, dtrain, early_stopping_rounds=50,
                    num_boost_round=500, evals=[(dtrain, 'dtrain'),
                                                (dtest, 'dtest')],
                    feval=_xgb_feval, verbose_eval=verbose)
    return bst.best_ntree_limit
Example #12
def get_pairs_rank_score(loaded_model, text_list):
    test_group_list, test_data_list, test_target_list = test_data_generation(
        text_list)
    xgbTest = DMatrix(np.asmatrix(test_data_list), label=test_target_list)
    xgbTest.set_group(test_group_list)
    results = loaded_model.predict(xgbTest)
    return results
Example #13
def to_dmatrix(data, labels=None):
    if isinstance(data, (pd.DataFrame, pd.Series)):
        if labels is not None and isinstance(labels, (pd.DataFrame, pd.Series)):
            return DMatrix(data.values, labels.values)
        else:
            return DMatrix(data.values)
    else:
        if labels is not None:
            return DMatrix(data, labels)
        else:
            return DMatrix(data)
Example #14
    def test_dmatrix_creator(self):

        # This function acts as a pseudo-itertools.chain()
        def row_tup_iter(data):
            pdf = pd.DataFrame(data)
            yield pdf

        # Standard testing DMatrix creation
        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        expected_labels = np.array([1, 0] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )
        # You can't compare DMatrix objects directly, so the only way is to
        # predict on the two separate DMatrices using the same classifier and
        # make sure the outputs are equal
        model = XGBClassifier()
        model.fit(expected_features, expected_labels)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

        # DMatrix creation with weights
        expected_weight = np.array([0.2, 0.8] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels,
                                   weight=expected_weight)

        data["weight"] = [0.2, 0.8] * 100
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        model.fit(expected_features,
                  expected_labels,
                  sample_weight=expected_weight)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
Example #15
def main():
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path,
                                                  query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_train = x_train.todense()
    x_train = np.concatenate([
        x_train, x_train[:, -2] / x_train[:, 2], x_train[:, -1] / x_train[:, 4]
    ], 1)
    x_test = x_test.todense()
    x_test = np.concatenate(
        [x_test, x_test[:, -2] / x_test[:, 2], x_test[:, -1] / x_test[:, 4]],
        1)
    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
    test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])
    xgb_train_str = items_to_str(_.omit(params, 'objective',
                                        'eval_metric').items(),
                                 sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
Example #16
def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
    '''
    Makes n folds in input data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        X data to be trained on.
    y_train : pandas.DataFrame
        y data to be trained on.
    nfold : int
        Number of folds in CV.
    param : dict
        Booster params.
    evals : list
        Evaluation metrics to be watched in CV.
    features : list
        Features selected for training.

    Returns
    -------
    ret : list
        List of CVPack objects, one per fold, each holding the training and
        testing DMatrix plus the parameters and metrics to use.
    wt_list : list
        List of weights for each fold (the size of each fold).
    '''
    if not features:
        features = X_train.columns
    out_idset, wt_list = bin_fold(X_train, nfold)
    in_idset = [
        np.concatenate([out_idset[i] for i in range(nfold) if k != i])
        for k in range(nfold)
    ]
    evals = list(evals)
    ret = []
    for k in range(nfold):
        # perform the slicing using the indexes determined by the above methods
        x_train_snip = X_train.loc[in_idset[k]][features]
        y_train_snip = X_train.loc[in_idset[k]]['encoded_target']
        x_test_snip = X_train.loc[out_idset[k]][features]
        y_test_snip = X_train.loc[out_idset[k]]['encoded_target']
        dtrain = DMatrix(x_train_snip, label=y_train_snip)
        dtest = DMatrix(x_test_snip, label=y_test_snip)
        tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret, wt_list
Example #17
def _convert_partition_data_to_dmatrix(
    partition_data_iter,
    has_weight,
    has_validation,
    has_base_margin,
    dmatrix_kwargs=None,
):
    # pylint: disable=too-many-locals, unbalanced-tuple-unpacking
    dmatrix_kwargs = dmatrix_kwargs or {}
    # if we are not using external storage, we use the standard method of parsing data.
    train_val_data = _prepare_train_val_data(partition_data_iter, has_weight,
                                             has_validation, has_base_margin)
    if has_validation:
        (
            train_x,
            train_y,
            train_w,
            train_b_m,
            val_x,
            val_y,
            val_w,
            val_b_m,
        ) = train_val_data
        training_dmatrix = DMatrix(
            data=train_x,
            label=train_y,
            weight=train_w,
            base_margin=train_b_m,
            **dmatrix_kwargs,
        )
        val_dmatrix = DMatrix(
            data=val_x,
            label=val_y,
            weight=val_w,
            base_margin=val_b_m,
            **dmatrix_kwargs,
        )
        return training_dmatrix, val_dmatrix

    train_x, train_y, train_w, train_b_m = train_val_data
    training_dmatrix = DMatrix(
        data=train_x,
        label=train_y,
        weight=train_w,
        base_margin=train_b_m,
        **dmatrix_kwargs,
    )
    return training_dmatrix
Example #18
    def test_external_storage(self):
        # Instantiating base data (features, labels)
        features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        labels = np.array([1, 0] * 100)
        normal_dmatrix = DMatrix(features, labels)
        test_dmatrix = DMatrix(features)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }

        # Creating the dmatrix based on storage
        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )

        # Testing without weights
        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)

        # Testing weights
        weights = np.array([0.2, 0.8] * 100)
        normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
        data["weight"] = [0.2, 0.8] * 100

        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)
Example #19
    def train(self, train_x, train_y):
        """Train a xgboost_lr model.

        Arguments:
            train_x -- training features
            train_y -- training labels
        """
        self.xgb_clf.fit(train_x,
                         train_y,
                         eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])
        xgb_eval_result = self.xgb_clf.evals_result()
        print("train eval result: ", xgb_eval_result)

        train_x_mat = DMatrix(train_x)
        # get boost tree leaf info
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)
        print(train_xgb_pred_mat)

        train_lr_feature_mat = self.one_hot_encoder.fit_transform(
            train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)
        print('train_mat array:', train_lr_feature_mat.toarray())

        # lr
        self.lr_clf.fit(train_lr_feature_mat, train_y)
        self.init = True
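The source only shows training; a hypothetical companion scoring method (an assumption, mirroring the transforms above) would reuse the fitted booster, encoder, and LR model. In practice the encoder may need handle_unknown='ignore' for leaves unseen at fit time:

    def predict_proba(self, test_x):
        # Hypothetical: GBDT leaf ids -> one-hot features -> LR probabilities.
        test_mat = DMatrix(test_x)
        leaves = self.xgb_clf.get_booster().predict(test_mat, pred_leaf=True)
        lr_features = self.one_hot_encoder.transform(leaves)
        return self.lr_clf.predict_proba(lr_features)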
Example #20
    def test_xgboost_booster_classifier_reg(self):
        x, y = make_classification(n_classes=2,
                                   n_features=5,
                                   n_samples=100,
                                   random_state=42,
                                   n_informative=3)
        y = y.astype(np.float32) + 0.567
        x_train, x_test, y_train, _ = train_test_split(x,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)

        data = DMatrix(x_train, label=y_train)
        model = train(
            {
                'objective': 'reg:squarederror',
                'n_estimators': 3,
                'min_child_samples': 1
            }, data)
        model_onnx = convert_xgboost(
            model, 'tree-based classifier',
            [('input', FloatTensorType([None, x.shape[1]]))])
        dump_data_and_model(
            x_test.astype(np.float32),
            model,
            model_onnx,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBBoosterReg")
Example #21
    def __classify(self, path):
        files = [self.parse_pe(path)]
        df = pd.DataFrame(files)
        df = df.drop(['sha256', 'size'], axis=1)
        sections = df['sections'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)

        imports = df['import'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
        imports = imports.reset_index().set_index(['index', 'dll'])
        imports = imports['symbols'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('import').reset_index().set_index('index')

        join = sections.join(imports).fillna(0)

        join['SectionName'] = join['SectionName'].astype('str')
        join['dll'] = join['dll'].astype('str')
        join['import'] = join['import'].astype('str')

        string_columns = ['SectionName', 'dll', 'import']
        matrix = self.ohe.transform(join[string_columns])

        index = join.index
        rows = []
        for i in index.unique():
            select = index.slice_indexer(start=i, end=i)
            rows.append(csr_matrix(matrix[select].sum(axis=0)))

        join_encoded = pd.DataFrame(data={'matrix':rows})

        df = df.drop(['sections', 'import'], axis=1)
        df = df.join(join_encoded)

        X = df.apply(lambda x: hstack((x.drop('matrix').astype('int64').values, x['matrix'])).T, axis=1)
        X = hstack(X.values).T
        X = X.todok().toarray()
        return self.booster.predict(DMatrix(X))[0]
Example #22
    def get_xgb_dmatrix(tup):
        from xgboost import DMatrix

        data, label, weight, missing, feature_names, feature_types = tup
        return DMatrix(data, label=label, missing=missing, weight=weight,
                       feature_names=feature_names, feature_types=feature_types,
                       nthread=-1)
Example #23
    def predict(self, smiles, get_features=get_fp, use_tqdm=False):
        canonical_smiles = []
        invalid_smiles = []
        if use_tqdm:
            pbar = tqdm(range(len(smiles)))
        else:
            pbar = range(len(smiles))
        for i in pbar:
            sm = smiles[i]
            if use_tqdm:
                pbar.set_description("Calculating predictions...")
            try:
                sm = Chem.MolToSmiles(Chem.MolFromSmiles(sm, sanitize=False))
                if len(sm) == 0:
                    invalid_smiles.append(sm)
                else:
                    canonical_smiles.append(sm)
            except Exception:
                invalid_smiles.append(sm)
        if len(canonical_smiles) == 0:
            return canonical_smiles, [], invalid_smiles
        prediction = []
        x, _, _ = get_features(canonical_smiles, sanitize=False)
        x = DMatrix(x)
        for i in range(len(self.models)):
            y_pred = self.models[i].predict(x)
            if self.transformer is not None:
                y_pred = self.transformer.inverse_transform(y_pred)
            prediction.append(y_pred)
        prediction = np.array(prediction)
        prediction = np.mean(prediction, axis=0)
        return canonical_smiles, prediction, invalid_smiles
Example #24
        def fair_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
            ''' FairXGB Error Metric'''
            # predt is the prediction array

            # Find the right protected group vector
            if len(predt) == len(protected_train):
                protected_feature = np.array(protected_train.copy())

            elif len(predt) == len(protected_full):
                protected_feature = np.array(protected_full.copy())

            elif len(predt) == len(protected_valid):
                protected_feature = np.array(protected_valid.copy())

            else:
                protected_feature = 0

            y = dtrain.get_label()

            answer = -y * np.log(
                sigmoid(predt)) - (1 - y) * np.log(1 - sigmoid(predt))

            answer += mu * (
                protected_feature * np.log(sigmoid(predt)) +
                (1 - protected_feature) * np.log(1 - sigmoid(predt)))

            return 'Fair_Metric', float(np.sum(answer) / len(answer))
Example #25
    def trainModel(self, train_x, train_y):
        # train a xgboost model
        self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
        self.xgb_clf.fit(train_x,
                         train_y,
                         eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])

        xgb_eval_result = self.xgb_clf.evals_result()
        print('XGB_train eval_result:', xgb_eval_result)

        train_x_mat = DMatrix(train_x)
        # get boosted tree leaf indices
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)

        self.one_hot_encoder = OneHotEncoder()
        train_lr_feature_mat = self.one_hot_encoder.fit_transform(
            train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)
        # train a LR model
        self.lr_clf = LR()
        self.lr_clf.fit(train_lr_feature_mat, train_y)

        self.init_flag = True

        with open(self.xgb_model_name, 'wb') as f:
            pickle.dump(self.xgb_clf, f, True)
        with open(self.lr_model_name, 'wb') as f:
            pickle.dump(self.lr_clf, f, True)
        with open(self.one_hot_encoder_model_name, 'wb') as f:
            pickle.dump(self.one_hot_encoder, f, True)

        print('Train xgboost and lr model done')
Example #26
def rmsle(predt: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    """ Root mean squared log error metric.
    """
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return "my_rmsle", float(np.sqrt(np.sum(elements) / len(y))) + DEBUG_ERROR
Example #27
def _prediction_feature_weights(xgb, X, feature_names, xgb_feature_names):
    """ For each target, return score and numpy array with feature weights
    on this prediction, following an idea from
    http://blog.datadive.net/interpreting-random-forests/
    """
    # XGBClassifier does not have pred_leaf argument, so use booster
    booster = xgb.booster()  # type: Booster
    leaf_ids, = booster.predict(DMatrix(X, missing=xgb.missing),
                                pred_leaf=True)
    xgb_feature_names = {f: i for i, f in enumerate(xgb_feature_names)}
    tree_dumps = booster.get_dump(with_stats=True)
    assert len(tree_dumps) == len(leaf_ids)

    target_feature_weights = partial(_target_feature_weights,
                                     feature_names=feature_names,
                                     xgb_feature_names=xgb_feature_names)
    n_targets = _xgb_n_targets(xgb)
    if n_targets > 1:
        # For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
        # so we need to split them.
        scores_weights = [
            target_feature_weights(
                leaf_ids[target_idx::n_targets],
                tree_dumps[target_idx::n_targets],
            ) for target_idx in range(n_targets)
        ]
    else:
        scores_weights = [target_feature_weights(leaf_ids, tree_dumps)]
    return scores_weights
Example #28
    def to_xgboost(self, **kwargs):
        from xgboost import DMatrix

        dmatrix = DMatrix(self.to_numpy(**kwargs))
        ## TODO: Uncomment when XGB observation models are implemented
        # dmatrix.observation_model = self.observation_model(backend="xgboost", loss="mse")
        return dmatrix
Example #29
    def predict(self, x, **kwargs):
        """
        Perform prediction for a batch of inputs.

        :param x: Test set.
        :type x: `np.ndarray`
        :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
        :rtype: `np.ndarray`
        """
        from xgboost import Booster, XGBClassifier
        from art.utils import to_categorical

        # Apply preprocessing
        x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

        if isinstance(self._model, Booster):
            from xgboost import DMatrix

            train_data = DMatrix(x_preprocessed, label=None)
            predictions = self._model.predict(train_data)
            y_prediction = np.asarray([line for line in predictions])
            if len(y_prediction.shape) == 1:
                y_prediction = to_categorical(labels=y_prediction,
                                              nb_classes=self.nb_classes())
        elif isinstance(self._model, XGBClassifier):
            y_prediction = self._model.predict_proba(x_preprocessed)

        # Apply postprocessing
        y_prediction = self._apply_postprocessing(preds=y_prediction,
                                                  fit=False)

        return y_prediction
Example #30
    def update(self, Xtrain, ytrain, Xval, yval, scoring, n_iterations):
        dtrain = DMatrix(data=Xtrain, label=ytrain)

        early_stop_callback = early_stop()

        if not (self.env['earlier_stop']):
            for i in range(n_iterations - self.model.n_estimators):
                # note: get_booster() returns a reference, so this update also
                # mutates the internal booster held by XGBClassifier;
                # add a unit test to guard this behavior against future updates
                self.model.get_booster().update(
                    dtrain, iteration=self.model.n_estimators)
                self.model.n_estimators += 1

                score = scoring(self, Xval, yval)

                if score > self.env['best_score']:
                    self.env['best_score'] = score
                    self.env['best_iteration'] = self.model.n_estimators
                try:
                    early_stop_callback(env=self.env,
                                        score=score,
                                        iteration=self.model.n_estimators)
                except EarlyStopException:
                    print('Update Stopped Earlier! @ {} instead of {}'.format(
                        self.model.n_estimators, n_iterations))
                    self.env['earlier_stop'] = True
                    break
Example #31

##################################################################
## DMatrix
## generate training dataset
# 2 groups * 3 rows per group = 6 samples, each with 2 features
n_group = 2
n_choice = 3
dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain.shape)  # (6, 2)
# numpy.random.choice(a, size=None, replace=True, p=None)
dtarget = np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(); print(dtarget)  # [1 0 2 1 0 2]
# dgroup gives the sample count of each consecutive group; samples must be
# contiguous by group, so [3, 3] means the first 3 of the 6 samples form
# group one and the last 3 form group two
dgroup = np.array([n_choice for i in range(n_group)]).flatten(); print(dgroup)  # [3 3]

# attach the group sizes to the train data -- very important here!
xgbTrain = DMatrix(dtrain, label=dtarget)
xgbTrain.set_group(dgroup)

# generate eval data
dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain_eval.shape)  # (6, 2)
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)
xgbTrain_eval.set_group(dgroup)
evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]

# train model
# passing evals together with xgb_rank_params1 raises an error; cause not found yet
# rankModel = train(xgb_rank_params1, xgbTrain, num_boost_round=10)
rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=20, evals=evallist)

# test dataset
dtest = np.random.uniform(0, 100, [n_group*n_choice, 2]); print(dtest.shape)  # (6, 2)
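The walkthrough stops after generating dtest; a minimal completion sketch (not in the source) wraps it the same way as the training data and scores it:

xgbTest = DMatrix(dtest)
xgbTest.set_group(dgroup)
print(rankModel.predict(xgbTest))  # one relevance score per sample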
Example #32
group_train = []
with open("mq2008.train.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_train.append(int(line.split("\n")[0]))

group_valid = []
with open("mq2008.vali.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))

group_test = []
with open("mq2008.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))

train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)

train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)

params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0,
          'min_child_weight': 0.1, 'max_depth': 6}
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                      evals=[(valid_dmatrix, 'validation')])
pred = xgb_model.predict(test_dmatrix)
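x_train, y_train and the other feature arrays are assumed to already exist in this snippet; a plausible loader sketch (the svmlight file names are inferred from the .group files above, so treat them as assumptions):

from sklearn.datasets import load_svmlight_file

x_train, y_train = load_svmlight_file("mq2008.train")
x_valid, y_valid = load_svmlight_file("mq2008.vali")
x_test, y_test = load_svmlight_file("mq2008.test")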