def auc_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Normalized Area Under the ROC Curve (AUC).

    Returns the Gini index = 2*AUC-1 for binary classification problems.
    Works for a binary 0/1 (or -1/1) "solution" vector and any discriminant
    values for the predictions. If solution and prediction are not vectors,
    the AUC of the columns of the matrices are computed and averaged (with no
    weight). The same applies to all classification problems (although in
    fact it only handles binary and multilabel classification properly).

    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem'
                                 % prediction.shape[1])
            elif prediction.shape[1] == 2:
                # Keep only the probability of the positive class
                prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(
                    solution.reshape((-1, 1)), prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)

    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()

    else:
        raise NotImplementedError('auc_metric does not support task type %s'
                                  % task)

    solution, prediction = normalize_array(solution, prediction.copy())

    label_num = solution.shape[1]
    auc = np.empty(label_num)
    for k in range(label_num):
        r_ = scipy.stats.rankdata(prediction[:, k])
        s_ = solution[:, k]
        if np.sum(s_) == 0:
            print('WARNING: no positive class example in class {}'
                  .format(k + 1))
        npos = np.sum(s_ == 1)
        nneg = np.sum(s_ < 1)
        # Wilcoxon-Mann-Whitney statistic: rank-sum of the positive examples
        auc[k] = (np.sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos)
    # Columns lacking one of the two classes yield nan/inf; score them as
    # random (AUC contribution 0 after normalization)
    auc[~np.isfinite(auc)] = 0
    return 2 * np.mean(auc) - 1
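

# Minimal usage sketch (not part of the original module): exercises auc_metric
# on a toy binary problem. It assumes np, BINARY_CLASSIFICATION and the
# module's helper functions are in scope, as auc_metric itself requires, and
# that normalize_array preserves the ranking of the predictions.
def _auc_metric_example():
    solution = np.array([0, 0, 1, 1])
    prediction = np.array([[0.9, 0.1],
                           [0.6, 0.4],
                           [0.65, 0.35],
                           [0.2, 0.8]])
    # One positive/negative pair is mis-ranked, so the AUC should be 0.75 and
    # the returned Gini index 2*AUC-1 should be about 0.5 (1.0 would be a
    # perfect ranking, roughly 0.0 a random one).
    return auc_metric(solution, prediction, task=BINARY_CLASSIFICATION)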
def pac_metric(solution, prediction, task=BINARY_CLASSIFICATION):
    """
    Probabilistic Accuracy based on the log_loss metric.

    We assume the solution is in {0, 1} and the prediction in [0, 1].
    Otherwise, run normalize_array.

    :param solution:
    :param prediction:
    :param task:
    :return:
    """
    if task == BINARY_CLASSIFICATION:
        if len(solution.shape) == 1:
            # Solution won't be touched - no copy
            solution = solution.reshape((-1, 1))
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
        else:
            raise ValueError('Solution.shape %s' % solution.shape)
        solution = solution.copy()

        if len(prediction.shape) == 2:
            if prediction.shape[1] > 2:
                raise ValueError('A prediction array with probability values '
                                 'for %d classes is not a binary '
                                 'classification problem'
                                 % prediction.shape[1])
            elif prediction.shape[1] == 2:
                # Keep only the probability of the positive class
                prediction = prediction[:, 1].reshape((-1, 1))
        else:
            raise ValueError('Invalid prediction shape %s' % prediction.shape)

    elif task == MULTICLASS_CLASSIFICATION:
        if len(solution.shape) == 1:
            solution = create_multiclass_solution(solution, prediction)
        elif len(solution.shape) == 2:
            if solution.shape[1] > 1:
                raise ValueError('Solution array must only contain one class '
                                 'label, but contains %d' % solution.shape[1])
            else:
                solution = create_multiclass_solution(
                    solution.reshape((-1, 1)), prediction)
        else:
            raise ValueError('Solution.shape %s' % solution.shape)

    elif task == MULTILABEL_CLASSIFICATION:
        solution = solution.copy()

    else:
        raise NotImplementedError('pac_metric does not support task type %s'
                                  % task)

    solution, prediction = normalize_array(solution, prediction.copy())

    sample_num, label_num = solution.shape
    if label_num == 1:
        task = BINARY_CLASSIFICATION

    eps = 1e-7
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * np.sum(solution, axis=0, dtype=float)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    the_log_loss = log_loss(solution, prediction, task)

    # Exponentiate to turn the loss into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is a non-linear operation.
    pac = np.mean(np.exp(-the_log_loss))
    base_pac = np.mean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random (prior) predictions, 1 for perfect predictions
    score = (pac - base_pac) / np.maximum(eps, (1 - base_pac))
    return score
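

# Minimal usage sketch (not part of the original module): scores pac_metric on
# the same toy binary problem as the auc_metric example above. It assumes np,
# BINARY_CLASSIFICATION, normalize_array, prior_log_loss and log_loss are in
# scope at module level, as pac_metric itself requires.
def _pac_metric_example():
    solution = np.array([0, 0, 1, 1])
    prediction = np.array([[0.9, 0.1],
                           [0.6, 0.4],
                           [0.35, 0.65],
                           [0.2, 0.8]])
    # A score near 0 corresponds to always predicting the class prior, 1 to a
    # perfect probabilistic prediction; confident, mostly correct
    # probabilities like these should land somewhere in between.
    return pac_metric(solution, prediction, task=BINARY_CLASSIFICATION)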