Esempio n. 1
0
def auc_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Normarlized Area under ROC curve (AUC).

    Return Gini index = 2*AUC-1 for  binary classification problems.
    Should work for a vector of binary 0/1 (or -1/1)"solution" and any discriminant values
    for the predictions. If solution and prediction are not vectors, the AUC
    of the columns of the matrices are computed and averaged (with no weight).
    The same for all classification problems (in fact it treats well only the
    binary and multilabel classification problems).
    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem' %
                                 prediction.shape[1])
            elif prediction.shape[1] == 2:
                # Prediction will be copied into a new binary array - no copy
                prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(
                    solution.reshape((-1, 1)), prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()
    else:
        raise NotImplementedError('auc_metric does not support task type %s' %
                                  task)

    solution, prediction = normalize_array(solution, prediction.copy())

    label_num = solution.shape[1]
    auc = np.empty(label_num)
    for k in range(label_num):
        r_ = scipy.stats.rankdata(prediction[:, k])
        s_ = solution[:, k]
        if sum(s_) == 0:
            print('WARNING: no positive class example in class {}'.format(k +
                                                                          1))
        npos = np.sum(s_ == 1)
        nneg = np.sum(s_ < 1)
        auc[k] = (np.sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos)
    auc[~np.isfinite(auc)] = 0
    return 2 * np.mean(auc) - 1
def auc_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Normarlized Area under ROC curve (AUC).

    Return Gini index = 2*AUC-1 for  binary classification problems.
    Should work for a vector of binary 0/1 (or -1/1)"solution" and any discriminant values
    for the predictions. If solution and prediction are not vectors, the AUC
    of the columns of the matrices are computed and averaged (with no weight).
    The same for all classification problems (in fact it treats well only the
    binary and multilabel classification problems).
    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = solution[:, 1]
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem' % prediction.shape[1])
            # Prediction will be copied into a new binary array - no copy
            prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(solution.reshape((-1, 1)),
                                                      prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()
    else:
        raise NotImplementedError('auc_metric does not support task type %s'
                                  % task)

    solution, prediction = normalize_array(solution, prediction.copy())

    label_num = solution.shape[1]
    auc = np.empty(label_num)
    for k in range(label_num):
        r_ = scipy.stats.rankdata(prediction[:, k])
        s_ = solution[:, k]
        if sum(s_) == 0:
            print(
                'WARNING: no positive class example in class {}'.format(k + 1))
        npos = np.sum(s_ == 1)
        nneg = np.sum(s_ < 1)
        auc[k] = (np.sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos)
    auc[~np.isfinite(auc)] = 0
    return 2 * np.mean(auc) - 1
Esempio n. 3
0
def pac_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Probabilistic Accuracy based on log_loss metric.

    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.
    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem' %
                                 prediction.shape[1])
            # Prediction will be copied into a new binary array - no copy
            prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(
                    solution.reshape((-1, 1)), prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()
    else:
        raise NotImplementedError('auc_metric does not support task type %s' %
                                  task)
    solution, prediction = normalize_array(solution, prediction.copy())

    [sample_num, label_num] = solution.shape
    if label_num == 1:
        task = BINARY_CLASSIFICATION
    eps = 1e-7
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * np.sum(solution, axis=0, dtype=float)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    the_log_loss = log_loss(solution, prediction, task)

    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = np.mean(np.exp(-the_log_loss))
    base_pac = np.mean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    score = (pac - base_pac) / sp.maximum(eps, (1 - base_pac))
    return score
def pac_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Probabilistic Accuracy based on log_loss metric.

    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array.
    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = solution[:, 1]
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem' % prediction.shape[1])
            # Prediction will be copied into a new binary array - no copy
            prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(solution.reshape((-1, 1)),
                                                      prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()
    else:
        raise NotImplementedError('auc_metric does not support task type %s'
                                  % task)
    solution, prediction = normalize_array(solution, prediction.copy())

    [sample_num, label_num] = solution.shape
    if label_num == 1:
        task = BINARY_CLASSIFICATION
    eps = 1e-7
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * np.sum(solution, axis=0, dtype=float)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    the_log_loss = log_loss(solution, prediction, task)

    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = np.mean(np.exp(-the_log_loss))
    base_pac = np.mean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    score = (pac - base_pac) / sp.maximum(eps, (1 - base_pac))
    return score