Example #1
0
def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    '''Root mean squared log error metric.

    Args:
        predt: Predictions to score.  The array is left unmodified.
        dtrain: Matrix whose ``get_label()`` supplies the targets.

    Returns:
        The pair ``('PyRMSLE', value)`` with ``value`` a Python float.
    '''
    y = dtrain.get_label()
    # log1p is undefined at and below -1, so move such predictions just above
    # it.  The clamp is applied to a new array (the caller's buffer is not
    # written to) and uses `<=` so that a prediction of exactly -1 is covered.
    clamped = np.where(predt <= -1, -1 + 1e-6, predt)
    elements = np.power(np.log1p(y) - np.log1p(clamped), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))
Example #2
0
def log_focal_loss_obj(preds: np.ndarray, dtrain: xgb.DMatrix):
    '''Custom multi-class objective built on a focal log-loss derivative.

    For every row the raw scores are passed through ``softmax`` and then to
    ``focal_logloss_derivative_gamma2`` (both defined elsewhere), and the
    returned per-class gradient/hessian are scaled by the row weight.

    Args:
        preds: Two-dimensional array of raw scores, one row per sample and
            one column per class.
        dtrain: Matrix supplying the labels and, optionally, sample weights.

    Returns:
        ``(grad, hess)``, each reshaped to ``(rows * classes, 1)``.

    Raises:
        AssertionError: If a label is not in ``[0, classes)``.
    '''
    labels = dtrain.get_label()
    kRows, kClasses = preds.shape

    if dtrain.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = dtrain.get_weight()

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    for r in range(kRows):
        target = int(labels[r])
        # The previous check joined the two bounds with `or`, which is true
        # for every integer; a valid class index must satisfy both bounds.
        assert 0 <= target < kClasses, (
            f"label {target} is outside [0, {kClasses})")
        p = softmax(preds[r, :])
        grad_r, hess_r = focal_logloss_derivative_gamma2(p, target)
        grad[r] = grad_r * weights[r]
        hess[r] = hess_r * weights[r]

    # Right now (XGBoost 1.0.0), reshaping is necessary
    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))

    return grad, hess
Example #3
0
def rmsle(predt: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    """Root mean squared log error metric, shifted by ``DEBUG_ERROR``.

    Args:
        predt: Predictions to score.  The array is left unmodified.
        dtrain: Matrix whose ``get_label()`` supplies the targets.

    Returns:
        The pair ``("my_rmsle", value)`` where ``value`` is the RMSLE plus
        the module-level ``DEBUG_ERROR`` constant (defined elsewhere).
    """
    y = dtrain.get_label()
    # log1p is undefined at and below -1, so move such predictions just above
    # it.  The clamp is applied to a new array (the caller's buffer is not
    # written to) and uses `<=` so that a prediction of exactly -1 is covered.
    clamped = np.where(predt <= -1, -1 + 1e-6, predt)
    elements = np.power(np.log1p(y) - np.log1p(clamped), 2)
    return "my_rmsle", float(np.sqrt(np.sum(elements) / len(y))) + DEBUG_ERROR
        def fair_metric(predt: np.ndarray, dtrain: xgb.DMatrix):
            '''FairXGB error metric.

            Log loss of ``sigmoid(predt)`` against the labels, plus ``mu``
            times a log-likelihood term of the same probabilities against the
            protected-group vector.  ``sigmoid``, ``mu`` and the
            ``protected_*`` vectors come from the enclosing scope.

            Returns the pair ``('Fair_Metric', mean_value)``.
            '''
            # Find the right protected group vector: the data set being
            # scored is identified only by the number of predictions.
            if len(predt) == len(protected_train):
                protected_feature = np.array(protected_train.copy())

            elif len(predt) == len(protected_full):
                protected_feature = np.array(protected_full.copy())

            elif len(predt) == len(protected_valid):
                protected_feature = np.array(protected_valid.copy())

            else:
                # No vector of matching length: the protected term reduces to
                # mu * log(1 - p).
                protected_feature = 0

            y = dtrain.get_label()

            # Evaluate the sigmoid and both logs once; each was previously
            # recomputed for every term below.
            prob = sigmoid(predt)
            log_prob = np.log(prob)
            log_not_prob = np.log(1 - prob)

            answer = -y * log_prob - (1 - y) * log_not_prob

            answer += mu * (
                protected_feature * log_prob +
                (1 - protected_feature) * log_not_prob)

            return 'Fair_Metric', float(np.sum(answer) / len(answer))
Example #5
0
def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for the weighted squared log error.

    The sample weights stored on ``dtrain`` enter through their ``log1p``.
    '''
    label = dtrain.get_label()
    log_weight = np.log1p(dtrain.get_weight())

    # This term shows up once inside the numerator and, squared, as the
    # denominator of the final expression.
    shared = log_weight * (label - predt) + predt + 1

    # NOTE(review): `shared / predt + 1` divides by predt and then adds 1;
    # confirm that `shared / (predt + 1)` was not what the derivation meant.
    numerator = ((shared / predt + 1) +
                 (log_weight - 1) * np.log1p(predt) -
                 np.log1p(label))
    return numerator / shared**2
Example #6
0
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient for the weighted squared log error.

    The sample weights stored on ``dtrain`` enter through their ``log1p``.
    '''
    label = dtrain.get_label()
    weight = dtrain.get_weight()

    log_residual = np.log1p(predt) - np.log1p(label)
    # Author's note: taking the log of the weight worked best so far; applying
    # a log to the whole denominator is still an open idea.
    denominator = (predt + 1) + (np.log1p(weight) * (label - predt))
    return log_residual / denominator
 def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
     r'''Root mean squared log error metric.

     :math:`\sqrt{\frac{1}{N}\sum_i [log(pred_i + 1) - log(label_i + 1)]^2}`

     The docstring is a raw string so that ``\f`` in ``\frac`` is not read as
     a form-feed escape.  ``predt`` is left unmodified.
     '''
     y = dtrain.get_label()
     # log1p is undefined at and below -1, so move such predictions just above
     # it.  The clamp is applied to a new array (the caller's buffer is not
     # written to) and uses `<=` so that a prediction of exactly -1 is covered.
     clamped = np.where(predt <= -1, -1 + 1e-6, predt)
     elements = np.power(np.log1p(y) - np.log1p(clamped), 2)
     return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))
    def loss(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
        r'''Mean squared error between ``exp(predt)`` and the labels.

        :math:`\frac{1}{N}\sum_i [exp(pred_i) - label_i]^2`

        The predictions are exponentiated before being compared with the
        labels.  Returns the pair ``('PoissonReg', value)``.
        '''
        y = dtrain.get_label()
        residual = np.exp(predt) - y
        return 'PoissonReg', float(np.mean(residual**2))
Example #9
0
def eval_error_metric(predt, dtrain: xgb.DMatrix):
    """Sum of per-sample errors at a 0.5 decision threshold.

    A prediction above 0.5 contributes ``1 - label``; a prediction at or
    below 0.5 contributes ``label``.  Returns ``('CustomErr', total)``.
    """
    label = dtrain.get_label()

    above = predt > 0.5
    at_or_below = predt <= 0.5

    # Entries matched by neither mask (e.g. NaN predictions) stay at zero.
    per_sample = np.zeros(predt.shape)
    per_sample[above] = 1 - label[above]
    per_sample[at_or_below] = label[at_or_below]

    return 'CustomErr', np.sum(per_sample)
def neg_rsqaure_score(predt: np.ndarray, dmat: xgb.DMatrix) -> Tuple[str, float]:
    """
    Negated R-squared score metric.

    The sign is flipped so that a better fit gives a smaller value.

    :param predt: predicted values
    :param dmat: DMatrix holding the true labels
    :return: the metric name and the negated R-squared score
    """
    labels = dmat.get_label()
    score = r2_score(y_true=labels, y_pred=predt)
    return 'NegrSquare', -score
Example #11
0
def pearson_corr(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''Pearson correlation between the predictions and the labels.

    Args:
        predt: Predicted values.
        dtrain: Matrix whose ``get_label()`` supplies the true values.

    Returns:
        The pair ``('PearsonCorr', r)``.  ``r`` is NaN when either the labels
        or the predictions are constant, because the denominator is then 0.
    '''
    y = dtrain.get_label()
    y_dev = y - np.mean(y)
    # Each series must be centred on its own mean; the labels were previously
    # centred on the prediction mean in the second denominator term.
    predt_dev = predt - np.mean(predt)

    covariance = (y_dev * predt_dev).sum()
    # Sum the squares first and take one square root of the product.  Taking
    # the root element-wise before summing yields absolute deviations instead.
    y_ss = (y_dev ** 2).sum()
    predt_ss = (predt_dev ** 2).sum()

    metric = covariance / np.sqrt(y_ss * predt_ss)
    return 'PearsonCorr', metric
Example #12
0
def eval_error_metric(predt, dtrain: xgb.DMatrix):
    """Evaluation metric for xgb.train.

    Sums per-sample errors at a 0.5 decision threshold: a prediction above
    0.5 contributes ``1 - label``, any other contributes ``label``.  An empty
    prediction array yields ``("CustomErr", 0)``.
    """
    label = dtrain.get_label()

    # Nothing to score.
    if predt.size == 0:
        return "CustomErr", 0

    above = predt > 0.5
    at_or_below = predt <= 0.5

    # Entries matched by neither mask (e.g. NaN predictions) stay at zero.
    per_sample = np.zeros(predt.shape)
    per_sample[above] = 1 - label[above]
    per_sample[at_or_below] = label[at_or_below]

    return 'CustomErr', np.sum(per_sample)
Example #13
0
    def custom_metrics(y_pred: list, data: xgb.DMatrix) -> list:
        """Evaluate precision, recall, FPR and AUPRC for a set of predictions.

        All numbers come from the ``calc_*`` helpers defined elsewhere; this
        function only wires the labels and predictions through them.

        Returns a list of ``(name, value)`` pairs in the order
        precision, recall, fpr, auprc.
        """
        labels = data.get_label()
        auprc = calc_auprc(labels, y_pred)

        # The four counts, in the argument order the rate helpers expect.
        counts = (
            calc_tp(labels, y_pred),
            calc_tn(labels, y_pred),
            calc_fp(labels, y_pred),
            calc_fn(labels, y_pred),
        )

        return [
            ('precision', calc_precision(*counts)),
            ('recall', calc_recall(*counts)),
            ('fpr', calc_fpr(*counts)),
            ('auprc', auprc),
        ]
        def custom_loss(prediction: np.ndarray, dtrain: xgb.DMatrix, ERA_len,
                        CESM_len):
            """
            Loss function combining log loss on the labelled ERAI samples with a constraint term on the
            unlabelled CESM samples; returns the gradient and hessian used for training. The constraint
            ("SME") pulls the mean CESM prediction towards the mean ERAI label.
            @param prediction: Raw prediction values for the stacked ERAI and CESMp samples
            @type prediction: np.ndarray
            @param dtrain: Training set (features and labels)
            @type dtrain: xgb.DMatrix
            @param ERA_len: Number of ERAI samples in dtrain
            @type ERA_len: int
            @param CESM_len: Number of CESMp samples in dtrain (after the ERAI samples)
            @type CESM_len: int
            @return: Gradient and hessian of prediction for ERAI and CESM samples
            @rtype: np.ndarray, np.ndarray
            """

            # Only the leading ERAI rows carry labels that are used
            era_labels = dtrain.get_label()[:ERA_len]

            # Squash the raw scores, then split them into the two groups
            prob = self.sigmoid(prediction)
            era_prob, cesm_prob = prob[:ERA_len], prob[ERA_len:]

            # Distance between the mean CESM prediction and the mean ERAI label
            mean_gap = np.mean(cesm_prob) - np.mean(era_labels)

            # Factor shared by the SME gradient and hessian
            sme_scale = 2 * self.lambda_ / CESM_len * cesm_prob * (
                1 - cesm_prob)

            # Full gradient: log loss part first, SME part after it
            grad = np.zeros(len(prob))
            grad[:ERA_len] = era_prob - era_labels
            grad[ERA_len:] = sme_scale * mean_gap

            # Full hessian, laid out the same way
            hess = np.zeros(len(prob))
            hess[:ERA_len] = era_prob * (1.0 - era_prob)
            hess[ERA_len:] = sme_scale * (
                (1 - cesm_prob) * mean_gap - cesm_prob * mean_gap +
                cesm_prob * (1 - cesm_prob) / CESM_len)

            return grad, hess
Example #15
0
def xgb_mape(preds: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    """
    Mean absolute percentage error metric for evaluation in xgboost.

    Samples whose label is zero are excluded. If every label is zero the
    result is NaN.

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data

    Returns:
        Tuple of error name (str) and error (float)
    """
    labels = dtrain.get_label()
    mask = labels != 0
    # Drop zero labels before dividing so no division by zero is evaluated,
    # and divide by the magnitude of the label so that negative labels do not
    # produce negative "errors" that cancel positive ones.
    pct_error = np.fabs(labels - preds)[mask] / np.fabs(labels[mask])
    return "mape", float(pct_error.mean())
Example #16
0
def merror(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''Multi-class error rate: share of rows whose arg-max class is wrong.

    Relies on the module-level ``kRows`` and ``kClasses`` for the expected
    shape.  Returns the pair ``('PyMError', rate)``.
    '''
    y = dtrain.get_label()
    # Like custom objective, the predt is untransformed leaf weight
    assert predt.shape == (kRows, kClasses)

    # Predicted class of a row = position of its largest score.
    predicted = np.zeros(kRows)
    for row_index, scores in enumerate(predt):
        predicted[row_index] = np.argmax(scores)

    assert y.shape == predicted.shape

    mistakes = np.where(y != predicted, 1.0, 0.0)
    return 'PyMError', np.sum(mistakes) / kRows
    def objective(trial: optuna.trial.Trial, dtrain: xgb.DMatrix,
                  dvalid: xgb.DMatrix) -> float:
        """Optuna objective: train one XGBoost model and score it.

        Hyperparameters are sampled from ``trial``, a booster is trained on
        ``dtrain`` with ``xgb.train`` and its rounded predictions on
        ``dvalid`` are scored with ``accuracy_score``.

        Args:
            trial: Trial object used to sample the hyperparameters.
            dtrain: Training data.
            dvalid: Validation data; its labels are the ground truth.

        Returns:
            Accuracy on the validation data.
        """

        # define the parameters to be tested
        param = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "booster": trial.suggest_categorical(
                "booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }

        # tree-based boosters get the tree-shape and learning-rate parameters
        if param["booster"] in ("gbtree", "dart"):
            param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            param["grow_policy"] = trial.suggest_categorical(
                "grow_policy", ["depthwise", "lossguide"])

        # dart additionally gets its dropout parameters
        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical(
                "sample_type", ["uniform", "weighted"])
            param["normalize_type"] = trial.suggest_categorical(
                "normalize_type", ["tree", "forest"])
            param["rate_drop"] = trial.suggest_float(
                "rate_drop", 1e-8, 1.0, log=True)
            param["skip_drop"] = trial.suggest_float(
                "skip_drop", 1e-8, 1.0, log=True)

        # fit a model to the training dataset using the sampled values
        xgboost_mod = xgb.train(param, dtrain)
        valid_preds = xgboost_mod.predict(dvalid)
        # round the predicted probabilities to the nearest integer label
        valid_pred_labels = np.rint(valid_preds)
        accuracy = accuracy_score(dvalid.get_label(), valid_pred_labels)

        return accuracy
Example #18
0
def xgb_mape_exp(preds: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    """
    Mean absolute percentage error metric for evaluation in xgboost.
    NOTE: This exponentiates the predictions before comparing them with the
    labels; the labels themselves are used as stored.

    Samples whose label is zero are excluded. If every label is zero the
    result is NaN.

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data

    Returns:
        Tuple of error name (str) and error (float)
    """
    labels = dtrain.get_label()
    mask = labels != 0
    # Drop zero labels before dividing so no division by zero is evaluated,
    # and divide by the magnitude of the label so that negative labels do not
    # produce negative "errors" that cancel positive ones.
    pct_error = np.fabs(labels - np.exp(preds))[mask] / np.fabs(labels[mask])
    return "mape_exp", float(pct_error.mean())
Example #19
0
def xgb_log_cosh(preds: np.ndarray,
                 dtrain: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    """
    Log-Cosh objective for xgboost.

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data

    Returns:
        Gradient and hessian arrays for log-cosh, one entry per sample
    """
    x = preds - dtrain.get_label()
    grad = np.tanh(x)  # pylint: disable=assignment-from-no-return
    # cosh overflows to inf for large residuals; the hessian then correctly
    # becomes 0, so only the overflow warning is silenced here.
    with np.errstate(over="ignore"):
        hess = 1 / np.cosh(x)**2
    return grad, hess
Example #20
0
def xgb_pr_auc(preds: np.ndarray, lgb_train: DMatrix) -> Tuple[str, float]:
    """
    Area under the precision-recall curve of our predictions, for xgboost.

    Args:
        preds: Array of predictions
        lgb_train: DMatrix of data; its labels are the ground truth

    Returns:
        Tuple of metric name (str) and the precision-recall AUC (float)
    """
    precision, recall, _ = precision_recall_curve(lgb_train.get_label(), preds)
    # Recall is the x-axis of the curve, precision the y-axis.
    return "pr_auc", auc(recall, precision)
Example #21
0
 def fforma_loss(self, predt: np.ndarray,
                 dtrain: xgb.DMatrix) -> (str, float):
     '''
     Evaluation metric: total of the per-row weighted losses.

     Each row of ``predt`` is multiplied element-wise with the row of
     ``self.contribution_to_owa`` selected by that sample's integer label,
     and everything is summed.  ``predt`` is used as given; no softmax is
     applied here.
     '''
     labels = dtrain.get_label().astype(int)
     sample_count = len(labels)

     # One row of contributions per sample, picked by its label.
     contributions = self.contribution_to_owa[labels, :]
     per_row = (predt * contributions).sum(axis=1)
     per_row = per_row.reshape((sample_count, 1))

     return 'FFORMA-loss', per_row.sum()
Example #22
0
def xgb_mpse(preds: np.ndarray,
             dtrain: DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    """
    Mean-Squared Percentage Error objective for xgboost

    Samples whose label is zero get a gradient and hessian of zero.

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data

    Returns:
        Gradient and hessian arrays for mean squared percentage error
    """
    labels = dtrain.get_label()
    zero_label = labels == 0
    # Divisions by a zero label are expected here and are overwritten below,
    # so the floating-point warnings they trigger are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        grad = 2.0 / labels * (preds * 1.0 / labels - 1)
        hess = 2.0 / (labels**2)
    # A zero label with a zero prediction gives 0/0 = NaN rather than inf, so
    # testing for inf alone lets NaN through; mask on the label as well.
    grad = np.where(zero_label | np.isinf(grad), 0., grad)
    hess = np.where(zero_label | np.isinf(hess), 0., hess)
    return grad, hess
Example #23
0
def corr_xgb(
    time_id_fold,
    y_pred: np.array,
    dtrain: xgb.DMatrix,
) -> Tuple[str, float]:
    """
    Pearson correlation coefficient metric.

    Collects the time ids, predictions and labels into one DataFrame and
    hands it to ``calculate_corr`` (defined elsewhere), whose result is
    reported under the name ``'pearson_corr'``.
    """
    columns = {
        'time_id': time_id_fold,
        'y_pred': y_pred,
        'y_true': dtrain.get_label(),
    }
    return 'pearson_corr', calculate_corr(pd.DataFrame(columns))
Example #24
0
def xgb_fair(preds: np.ndarray, dtrain: DMatrix,
             c: float = 1.0) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fair loss objective for xgboost.

    loss = c**2 * (abs(x) / c - log(1 + abs(x) / c)),  x = preds - labels

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data
        c: Positive scale constant of the fair loss. Defaults to 1, the value
            that was previously hard-coded.

    Returns:
        Gradient and hessian arrays for fair loss, one entry per sample
    """
    x = preds - dtrain.get_label()
    den = abs(x) + c
    grad = c * x / den
    hess = c * c / den**2
    return grad, hess
Example #25
0
def merror(predt: np.ndarray, dtrain: xgb.DMatrix):
    '''Multi-class error rate: share of rows whose arg-max class is wrong.

    Relies on the module-level ``kRows`` and ``kClasses`` for the expected
    shape.  Returns the pair ``('PyMError', rate)``.
    '''
    y = dtrain.get_label()
    # Like custom objective, the predt is untransformed leaf weight when custom objective
    # is provided.

    # With the use of `custom_metric` parameter in train function, custom metric receives
    # raw input only when custom objective is also being used.  Otherwise custom metric
    # will receive transformed prediction.
    assert predt.shape == (kRows, kClasses)

    # Predicted class of a row = position of its largest score.
    predicted = np.zeros(kRows)
    for row_index, scores in enumerate(predt):
        predicted[row_index] = np.argmax(scores)

    assert y.shape == predicted.shape

    mistakes = np.where(y != predicted, 1.0, 0.0)
    return 'PyMError', np.sum(mistakes) / kRows
Example #26
0
    def calibrate(
        self,
        data: xgb.DMatrix,
        response: Optional[Union[np.ndarray, pd.Series]] = None,
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Calibrate conformal intervals so that prediction intervals can
        vary by row.

        Validates ``data``, resolves the response and then delegates to the
        parent class's ``calibrate``. According to the original notes, the
        parent records how often each leaf node is visited across the data
        (``_calibrate_leaf_node_counts``) and sets the default interval
        (``_calibrate_interval``), which is later scaled by the inverse of
        the nonconformity function when making predictions.

        Parameters
        ----------
        data : xgb.DMatrix
            Dataset to use to set baselines.

        response : np.ndarray, pd.Series or None, default = None
            The response values for the records in data. If None, the
            response is read from ``data`` with ``get_label``.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        """

        check_type(data, [xgb.DMatrix], "data")

        # Fall back to the labels stored on the DMatrix. The cast does nothing
        # at runtime; it only stops mypy complaining about get_label.
        resolved_response = (
            cast(xgb.DMatrix, data).get_label() if response is None else response
        )

        super().calibrate(data=data, response=resolved_response, alpha=alpha)
Example #27
0
def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
    '''Loss function.  Computing the gradient and approximated hessian (diagonal).
    Reimplements the `multi:softprob` inside XGBoost.

    Relies on the module-level ``kRows``/``kClasses`` and on ``softmax``
    (defined elsewhere).  Returns ``(grad, hess)``, each reshaped to
    ``(kRows * kClasses, 1)``.  Raises ``AssertionError`` when the prediction
    shape is unexpected or a label is outside ``[0, kClasses)``.
    '''
    labels = data.get_label()
    if data.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = data.get_weight()

    # The prediction is of shape (rows, classes), each element in a row
    # represents a raw prediction (leaf weight, hasn't gone through softmax
    # yet).  In XGBoost 1.0.0, the prediction is transformed by a softmax
    # function, fixed in later versions.
    assert predt.shape == (kRows, kClasses)

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    # Lower bound for the hessian entries.
    eps = 1e-6

    # compute the gradient and hessian, slow iterations in Python, only
    # suitable for demo.  Also the one in native XGBoost core is more robust to
    # numeric overflow as we don't do anything to mitigate the `exp` in
    # `softmax` here.
    for r in range(predt.shape[0]):
        target = labels[r]
        # Checked once per row.  The previous check joined the two bounds with
        # `or`, which holds for every number; both bounds must hold.
        assert 0 <= target < kClasses
        p = softmax(predt[r, :])
        for c in range(predt.shape[1]):
            g = p[c] - 1.0 if c == target else p[c]
            g = g * weights[r]
            h = max((2.0 * p[c] * (1.0 - p[c]) * weights[r]).item(), eps)
            grad[r, c] = g
            hess[r, c] = h

    # Right now (XGBoost 1.0.0), reshaping is necessary
    grad = grad.reshape((kRows * kClasses, 1))
    hess = hess.reshape((kRows * kClasses, 1))
    return grad, hess
Example #28
0
 def error_softmax_obj(self, predt: np.ndarray,
                       dtrain: xgb.DMatrix) -> (np.ndarray, np.ndarray):
     '''
     Custom objective: gradient and hessian of the weighted loss.

     Every sample's integer label selects a row of
     ``self.contribution_to_owa``.  ``predt`` is used as given; no softmax is
     applied here.  Both results are returned as column vectors.
     '''
     labels = dtrain.get_label().astype(int)
     sample_count = len(labels)

     # One row of contributions per sample, picked by its label.
     contributions = self.contribution_to_owa[labels, :]

     # Per-sample weighted average of the contributions, kept as a column so
     # it broadcasts across the columns below.
     weighted_avg = (predt * contributions).sum(axis=1)
     weighted_avg = weighted_avg.reshape((sample_count, 1))

     grad = predt * (contributions - weighted_avg)
     hess = contributions * predt * (1.0 - predt) - grad * predt

     return grad.reshape(-1, 1), hess.reshape(-1, 1)
Example #29
0
def xgb_huber_approx(preds: np.ndarray, dtrain: DMatrix,
                     h: float = 1.0) -> Tuple[np.ndarray, np.ndarray]:
    """
    Huber loss (approximation) objective for xgboost.

    loss = h**2 * (sqrt(1 + (d / h)**2) - 1),  d = preds - labels

    Args:
        preds: Array of predictions
        dtrain: DMatrix of data
        h: Scale in the ``(d / h)**2`` term. Defaults to 1, the value that
            was previously hard-coded.

    Returns:
        Gradient and hessian arrays for huber loss, one entry per sample
    """
    d = preds - dtrain.get_label()
    scale = 1 + (d / h)**2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess
        def gradient(predt: np.ndarray, dtrain: xgb.DMatrix):
            '''Fair XGBoost gradient.

            Log-loss gradient ``sigmoid(predt) - y`` plus ``mu`` times
            ``protected_feature - sigmoid(predt)``.  ``sigmoid``, ``mu`` and
            the ``protected_*`` vectors come from the enclosing scope.
            '''
            # Find the right protected group vector: the data set being
            # processed is identified only by the number of predictions.
            if len(predt) == len(protected_train):
                protected_feature = np.array(protected_train.copy())

            elif len(predt) == len(protected_full):
                protected_feature = np.array(protected_full.copy())

            elif len(predt) == len(protected_valid):
                protected_feature = np.array(protected_valid.copy())

            else:
                # No vector of matching length: the protected term reduces to
                # -mu * sigmoid(predt).
                protected_feature = 0

            y = dtrain.get_label()

            # Evaluate the sigmoid once; it was previously computed twice.
            prob = sigmoid(predt)

            answer = prob - y
            answer += mu * (protected_feature - prob)

            return answer