def cindex(Y, P): """Concordance, aka pairwise ranking accuracy. Computes the relative fraction of concordant pairs, that is, Y[i] > Y[j] and P[i] > P[j] (ties with P[i]=P[j] are assumed to be broken randomly). Equivalent to area under ROC curve, if Y[i] belong to {-1, 1}. An O(n*log(n)) implementation, based on order statistic tree computations. Parameters ---------- Y : {array-like}, shape = [n_samples] or [n_samples, n_labels] Correct labels, can be any real numbers. P : {array-like}, shape = [n_samples] or [n_samples, n_labels] Predicted labels, can be any real numbers. Returns ------- concordance index : float number between 0 and 1, around 0.5 means random performance """ Y = array_tools.as_2d_array(Y) P = array_tools.as_2d_array(P) if not Y.shape == P.shape: raise UndefinedPerformance("Y and P must be of same shape") perfs = cindex_multitask(Y, P) perfs = np.array(perfs) perfs = perfs[np.invert(np.isnan(perfs))] if len(perfs) == 0: raise UndefinedPerformance( "No pairs, all the instances have the same output") return np.mean(perfs)
class NfoldCV(object):

    def __init__(self, learner, measure, folds):
        self.rls = learner
        if measure is None:
            self.measure = sqerror
        else:
            self.measure = measure
        self.folds = folds

    def cv(self, regparam):
        rls = self.rls
        folds = self.folds
        measure = self.measure
        rls.solve(regparam)
        Y = rls.Y
        performances = []
        P_all = []
        for fold in folds:
            P = rls.holdout(fold)
            P_all.append(P)
            try:
                performance = measure(Y[fold], P)
                performances.append(performance)
            except UndefinedPerformance:
                #folds with undefined performance are skipped
                pass
        #performance = measure_utilities.aggregate(performances)
        if len(performances) > 0:
            performance = np.mean(performances)
        else:
            raise UndefinedPerformance("Performance undefined for all folds")
        return performance, P_all
def accuracy(Y, P):
    """Binary classification accuracy.

    A performance measure for binary classification problems.
    Returns the fraction of correct class predictions. P[i] > 0 is
    considered a positive class prediction and P[i] < 0 a negative one.
    P[i] == 0 is interpreted as the classifier abstaining from making a
    decision, which incurs 0.5 errors (in contrast to 0 errors for a
    correct and 1 error for an incorrect prediction).

    If 2-dimensional arrays are supplied as arguments, then accuracy
    is separately computed for each column, after which the accuracies
    are averaged.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct labels, must belong to set {-1,1}
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted labels, can be any real numbers.

    Returns
    -------
    accuracy : float
        number between 0 and 1
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(accuracy_multitask(Y, P))
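#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([1, 1, -1, -1])
#    >>> P = np.array([0.8, -0.2, -0.5, 0.0])
#    >>> accuracy(Y, P)  #correct, wrong, correct, abstained (0.5 credit)
#    0.625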
def fscore(Y, P):
    """F1-score.

    A performance measure for binary classification problems.
    F1 = 2*(Precision*Recall)/(Precision+Recall)

    If 2-dimensional arrays are supplied as arguments, then the
    macro-averaged F-score is computed over the columns.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct labels, must belong to set {-1,1}
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted labels, can be any real numbers. P[i] > 0 is treated
        as a positive, and P[i] <= 0 as a negative class prediction.

    Returns
    -------
    fscore : float
        number between 0 and 1
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(fscore_multitask(Y, P))
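#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([1, 1, 1, -1, -1])
#    >>> P = np.array([0.9, -0.1, 0.4, 0.2, -0.3])
#    >>> fscore(Y, P)  #TP=2, FN=1, FP=1 -> precision = recall = 2/3
#    0.6666...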
class LQOCV(object):

    def __init__(self, learner, measure):
        self.rls = learner
        self.measure = measure

    def cv(self, regparam):
        rls = self.rls
        measure = self.measure
        rls.solve(regparam)
        Y = rls.Y
        performances = []
        folds = rls.qids
        for fold in folds:
            P = rls.computeHO(fold)
            try:
                performance = measure(Y[fold], P)
                performances.append(performance)
            except UndefinedPerformance:
                #queries with undefined performance are skipped
                pass
        #performance = measure_utilities.aggregate(performances)
        if len(performances) > 0:
            performance = np.mean(performances)
        else:
            raise UndefinedPerformance("Performance undefined for all folds")
        return performance
def sqmprank(Y, P):
    """Squared magnitude preserving ranking error.

    A performance measure for ranking problems. Computes the sum of
    (Y[i]-Y[j]-P[i]+P[j])**2 over all index pairs, normalized by the
    number of pairs. For query-structured data, one would typically
    want to compute the error separately for each query, and average.

    If 2-dimensional arrays are supplied as arguments, then the error
    is separately computed for each column, after which the errors are
    averaged.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct utility values, can be any real numbers
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted utility values, can be any real numbers.

    Returns
    -------
    error : float
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(sqmprank_multitask(Y, P))
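#A minimal usage sketch (hypothetical, assumes numpy as np). The measure
#penalizes pairwise differences Y[i]-Y[j] that the predictions fail to
#preserve:
#
#    >>> Y = np.array([1.0, 2.0, 3.0])
#    >>> P = np.array([1.0, 1.0, 3.0])
#    >>> sqmprank(Y, P)  #only the pairs involving index 1 are penalized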
def cindex_singletask(Y, P):
    Y = np.array(Y).T[0]
    P = np.array(P).T[0]
    correct = Y.astype(np.float64)
    predictions = P.astype(np.float64)
    assert len(correct) == len(predictions)
    C = np.array(correct).reshape(len(correct),)
    C.sort()
    pairs = 0
    c_ties = 0
    for i in range(1, len(C)):
        if C[i] != C[i - 1]:
            c_ties = 0
        else:
            c_ties += 1
        #this example forms a pair with each previous example that has a lower value
        pairs += i - c_ties
    if pairs == 0:
        raise UndefinedPerformance("No pairs, all the instances have the same output")
    correct = np.array(correct).reshape(correct.shape[0],)
    predictions = np.array(predictions).reshape(predictions.shape[0],)
    s = swapped.count_swapped(correct, predictions)
    disagreement = float(s) / float(pairs)
    return 1. - disagreement
def sqerror(Y, P):
    """Mean squared error.

    A performance measure for regression problems. Computes the sum of
    (Y[i]-P[i])**2 over all indices i, normalized by the number of
    instances.

    If 2-dimensional arrays are supplied as arguments, then the error
    is separately computed for each column, after which the errors are
    averaged.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_tasks]
        Correct utility values, can be any real numbers
    P : {array-like}, shape = [n_samples] or [n_samples, n_tasks]
        Predicted utility values, can be any real numbers.

    Returns
    -------
    error : float
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(sqerror_multitask(Y, P))
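#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([1.0, 2.0, 3.0])
#    >>> P = np.array([1.5, 2.0, 2.0])
#    >>> sqerror(Y, P)  #(0.25 + 0.0 + 1.0) / 3
#    0.4166...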
def ova_accuracy(Y, P):
    """One-vs-all classification accuracy for multi-class problems.

    Computes the accuracy for a one-versus-all decomposed classification
    problem. Each column in Y and P corresponds to one possible class
    label. On each row, exactly one column in Y is 1, all the rest must
    be -1. The prediction for the i'th example is computed by taking the
    argmax over the indices of row i in P.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_classes]
        Correct labels, must belong to set {-1,1}, with exactly
        one 1 on each row.
    P : {array-like}, shape = [n_samples] or [n_samples, n_classes]
        Predicted labels, can be any real numbers.

    Returns
    -------
    accuracy : float
        number between 0 and 1
    """
    Y = np.array(Y)
    P = np.array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    Y = np.argmax(Y, axis=1)
    P = np.argmax(P, axis=1)
    return np.mean(Y == P)
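#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([[1, -1, -1], [-1, 1, -1], [-1, -1, 1]])
#    >>> P = np.array([[0.9, 0.1, -0.2], [0.3, 0.8, -0.1], [0.5, 0.2, 0.1]])
#    >>> ova_accuracy(Y, P)  #row 2 is misclassified as class 0
#    0.6666...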
def cv_old(self, regparam):
    rls = self.rls
    rls.solve(regparam)
    Y = rls.Y
    aucs = []
    for k in range(Y.shape[1]):
        pairs_start_inds, pairs_end_inds = [], []
        for i in range(Y.shape[0] - 1):
            for j in range(i + 1, Y.shape[0]):
                if Y[i, k] > Y[j, k]:
                    pairs_start_inds.append(i)
                    pairs_end_inds.append(j)
                elif Y[i, k] < Y[j, k]:
                    pairs_start_inds.append(j)
                    pairs_end_inds.append(i)
        if len(pairs_start_inds) == 0:
            raise UndefinedPerformance("Leave-pair-out undefined, all labels same for output %d" % k)
        pred_start, pred_end = rls.leave_pair_out(np.array(pairs_start_inds), np.array(pairs_end_inds))
        pred_start = array_tools.as_2d_array(pred_start)
        pred_end = array_tools.as_2d_array(pred_end)
        auc = 0.
        for h in range(len(pred_start)):
            if pred_start[h, k] > pred_end[h, k]:
                auc += 1.
            elif pred_start[h, k] == pred_end[h, k]:
                auc += 0.5
        auc /= len(pairs_start_inds)
        aucs.append(auc)
    auc = np.mean(aucs)
    return auc, None
def auc(Y, P):
    """Area under the ROC curve (AUC).

    A performance measure for binary classification problems. Can be
    interpreted as an estimate of the probability that the classifier
    is able to discriminate between a randomly drawn positive and a
    randomly drawn negative training example. An O(n*log(n)) time
    implementation, with correction for tied predictions.

    If 2-dimensional arrays are supplied as arguments, then AUC is
    separately computed for each column, after which the AUCs are
    averaged.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct labels, must belong to set {-1,1}
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted labels, can be any real numbers.

    Returns
    -------
    auc : float
        number between 0 and 1
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(auc_multitask(Y, P))
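#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([1, 1, -1, -1])
#    >>> P = np.array([0.9, 0.3, 0.5, 0.1])
#    >>> auc(Y, P)  #3 of the 4 positive-negative pairs ranked correctly
#    0.75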
def accuracy_singletask(Y, P):
    assert Y.shape[0] == P.shape[0]
    if not np.all((Y == 1) + (Y == -1)):
        raise UndefinedPerformance("binary classification accuracy accepts as Y-values only 1 and -1")
    vlen = float(Y.shape[0])
    #sign(Y*P) is 1 for a correct prediction, -1 for an incorrect one,
    #and 0 for an abstained (P == 0) prediction, so shifting by 1 and
    #halving yields credits of 1, 0 and 0.5 respectively
    perf = np.sum(np.sign(np.multiply(Y, P)) + 1.) / (2 * vlen)
    return perf
def cindex(Y, P):
    Y = array_tools.as_labelmatrix(Y)
    P = array_tools.as_labelmatrix(P)
    perfs = cindex_multitask(Y, P)
    perfs = np.array(perfs)
    perfs = perfs[np.invert(np.isnan(perfs))]
    if len(perfs) == 0:
        raise UndefinedPerformance("No pairs, all the instances have the same output")
    return np.mean(perfs)
def accuracy_multitask(Y, P):
    Y = np.mat(Y)
    P = np.mat(P)
    if not np.all((Y == 1) + (Y == -1)):
        raise UndefinedPerformance("binary classification accuracy accepts as Y-values only 1 and -1")
    vlen = float(Y.shape[0])
    performances = np.sum(np.sign(np.multiply(Y, P)) + 1., axis=0) / (2 * vlen)
    performances = np.array(performances)[0]
    return performances
def cv(self, regparam):
    rls = self.rls
    rls.solve(regparam)
    Y = rls.Y
    #Union of all pairs for which predictions are needed
    all_pairs = set([])
    for k in range(Y.shape[1]):
        pairs = []
        for i in range(Y.shape[0] - 1):
            for j in range(i + 1, Y.shape[0]):
                if Y[i, k] != Y[j, k]:
                    pairs.append((i, j))
        #If all labels for some column are the same, ranking accuracy is undefined
        if len(pairs) == 0:
            raise UndefinedPerformance("Leave-pair-out undefined, all labels same for output %d" % k)
        all_pairs.update(pairs)
    all_start_inds = [x[0] for x in all_pairs]
    all_end_inds = [x[1] for x in all_pairs]
    #Compute leave-pair-out predictions for all pairs
    all_start_inds = np.array(all_start_inds)
    all_end_inds = np.array(all_end_inds)
    pred_start, pred_end = rls.leave_pair_out(all_start_inds, all_end_inds)
    pred_start = array_tools.as_2d_array(pred_start)
    pred_end = array_tools.as_2d_array(pred_end)
    pair_dict = dict(zip(all_pairs, range(pred_start.shape[0])))
    aucs = []
    #compute auc / ranking accuracy for each column of Y separately
    for k in range(Y.shape[1]):
        comparisons = []
        #1 if the true and predicted orders agree, 0 if they disagree,
        #0.5 if the predictions are tied
        for i in range(Y.shape[0] - 1):
            for j in range(i + 1, Y.shape[0]):
                if Y[i, k] > Y[j, k]:
                    ind = pair_dict[(i, j)]
                    if pred_start[ind, k] > pred_end[ind, k]:
                        comparisons.append(1.)
                    elif pred_start[ind, k] == pred_end[ind, k]:
                        comparisons.append(0.5)
                    else:
                        comparisons.append(0.)
                elif Y[i, k] < Y[j, k]:
                    ind = pair_dict[(i, j)]
                    if pred_start[ind, k] < pred_end[ind, k]:
                        comparisons.append(1.)
                    elif pred_start[ind, k] == pred_end[ind, k]:
                        comparisons.append(0.5)
                    else:
                        comparisons.append(0.)
        auc = np.mean(comparisons)
        aucs.append(auc)
    #Take the mean of all columnwise aucs
    auc = np.mean(aucs)
    return auc, None
def auc_singletask(Y, P):
    #the implementation has O(n*log(n)) time complexity
    #P: predicted labels
    #Y: true labels, y_i \in {-1,1} for each y_i \in Y
    #
    if not np.all((Y == 1) + (Y == -1)):
        raise UndefinedPerformance("auc accepts as Y-values only 1 and -1")
    size = len(P)
    #form a list of prediction-label pairs
    I = np.argsort(P)
    Y = Y[I]
    P = P[I]
    poscount = 0.
    #The number of positive labels that have the same prediction
    #as the current P[i] value
    posties = 0.
    #Number of pairwise mistakes this far
    errors = 0.
    j = 0
    for i in range(size):
        #j points always to the next entry in P for which
        #P[j] > P[i]. In the end j will point outside of P
        if j == i:
            poscount += posties
            posties = 0.
            while j < size and P[i] == P[j]:
                if Y[j] == 1:
                    posties += 1
                j += 1
        if Y[i] == -1:
            #every pairwise inversion of a positive-negative pair
            #incurs one error, except for ties, which incur 0.5
            #errors
            errors += poscount + 0.5 * posties
    poscount += posties
    #the number of positive-negative pairs
    paircount = poscount * (size - poscount)
    #AUC is 1 - the fraction of pairwise errors
    if paircount == 0:
        raise UndefinedPerformance("AUC undefined if both classes not present")
    AUC = 1. - errors / paircount
    return AUC
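#Sanity-check sketch (hypothetical, assumes numpy as np): the result
#should agree with a naive O(n^2) count over positive-negative pairs.
#
#    >>> Y = np.array([1, 1, -1, -1])
#    >>> P = np.array([0.9, 0.3, 0.5, 0.1])
#    >>> auc_singletask(Y, P)
#    0.75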
def cv(self, regparam):
    rls = self.rls
    rls.solve(regparam)
    Y = rls.Y
    perfs = []
    #special handling for concordance index / AUC
    if self.measure.__name__ in ["cindex", "auc"]:
        for index in range(Y.shape[1]):
            pairs = []
            for i in range(Y.shape[0] - 1):
                for j in range(i + 1, Y.shape[0]):
                    if Y[i, index] > Y[j, index]:
                        pairs.append((i, j))
                    elif Y[i, index] < Y[j, index]:
                        pairs.append((j, i))
            if len(pairs) > 0:
                pred = rls.computePairwiseCV(pairs, index)
                auc = 0.
                for pair in pred:
                    if pair[0] > pair[1]:
                        auc += 1.
                    elif pair[0] == pair[1]:
                        auc += 0.5
                auc /= len(pred)
                perfs.append(auc)
        if len(perfs) > 0:
            performance = np.mean(perfs)
        else:
            raise UndefinedPerformance("Performance undefined for all folds")
        return performance
    else:
        #Horribly inefficient, but maybe OK for small data sets
        pairs = []
        for i in range(Y.shape[0]):
            for j in range(Y.shape[0]):
                pairs.append((i, j))
        for index in range(Y.shape[1]):
            preds = rls.computePairwiseCV(pairs, index)
            for i in range(len(pairs)):
                pair = pairs[i]
                pred = preds[i]
                perfs.append(self.measure(
                    np.array([Y[pair[0], index], Y[pair[1], index]]),
                    np.array(pred)))
        perf = np.mean(perfs)
        return perf
def ova_accuracy(Y, P):
    """One-vs-all classification accuracy for multi-class problems.

    Computes the accuracy for a one-versus-all decomposed classification
    problem. Each column in Y and P corresponds to one possible class
    label. On each row, exactly one column in Y is 1, all the rest must
    be -1. The prediction for the i'th example is computed by taking the
    argmax over the indices of row i in P.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_classes]
        Correct labels, must belong to set {-1,1}, with exactly
        one 1 on each row.
    P : {array-like}, shape = [n_samples] or [n_samples, n_classes]
        Predicted labels, can be any real numbers.

    Returns
    -------
    accuracy : float
        number between 0 and 1
    """
    Y = np.array(Y)
    P = np.array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    correct = 0
    for i in range(Y.shape[0]):
        largest_pred = None
        predicted = None
        true = None
        for j in range(Y.shape[1]):
            if Y[i, j] == 1:
                true = j
            #test against None explicitly: "not largest_pred" would
            #misbehave when the largest prediction so far is 0.0
            if (largest_pred is None) or (P[i, j] > largest_pred):
                largest_pred = P[i, j]
                predicted = j
        if true == predicted:
            correct += 1
    perf = float(correct) / float(Y.shape[0])
    return perf
def cv(self, regparam):
    rls = self.rls
    measure = self.measure
    rls.solve(regparam)
    Y = rls.Y
    performances = []
    predictions = []
    folds = rls.qidlist
    for fold in folds:
        P = rls.holdout(fold)
        predictions.append(P)
        try:
            performance = measure(Y[fold], P)
            performances.append(performance)
        except UndefinedPerformance:
            pass
    if len(performances) > 0:
        performance = np.mean(performances)
    else:
        raise UndefinedPerformance("Performance undefined for all folds")
    return performance, predictions
def spearman(Y, P):
    """Spearman correlation.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct labels
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted labels

    Returns
    -------
    correlation : float
        number between -1 and 1
    """
    Y = array_tools.as_2d_array(Y)
    P = array_tools.as_2d_array(P)
    if not Y.shape == P.shape:
        raise UndefinedPerformance("Y and P must be of same shape")
    return np.mean(spearman_multitask(Y, P))
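#A minimal usage sketch (hypothetical, assumes numpy as np):
#
#    >>> Y = np.array([1.0, 2.0, 3.0, 4.0])
#    >>> P = np.array([1.2, 1.9, 4.1, 3.5])
#    >>> spearman(Y, P)  #the predicted ranking swaps the last two items
#    0.8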
def cindex_singletask_SLOW(Y, P):
    correct = Y
    predictions = P
    assert len(correct) == len(predictions)
    disagreement = 0.
    decisions = 0.
    for i in range(len(correct)):
        for j in range(len(correct)):
            if correct[i] > correct[j]:
                decisions += 1.
                if predictions[i] < predictions[j]:
                    disagreement += 1.
                elif predictions[i] == predictions[j]:
                    disagreement += 0.5
    #Disagreement error is not defined when all the true labels are
    #the same, since then there are no pairs to compare
    if decisions == 0:
        raise UndefinedPerformance("No pairs, all the instances have the same output")
    else:
        disagreement /= decisions
    return 1. - disagreement
def cv(self, regparam):
    rls = self.rls
    folds = self.folds
    measure = self.measure
    rls.solve(regparam)
    Y = rls.Y
    performances = []
    P_all = []
    for fold in folds:
        P = rls.holdout(fold)
        P_all.append(P)
        try:
            performance = measure(Y[fold], P)
            performances.append(performance)
        except UndefinedPerformance:
            #No warning printed, the fold is simply skipped
            pass
    #performance = measure_utilities.aggregate(performances)
    if len(performances) > 0:
        performance = np.mean(performances)
    else:
        raise UndefinedPerformance("Performance undefined for all folds")
    return performance, P_all
def disagreement(Y, P):
    """Disagreement error, also known as the pairwise ranking error.

    A performance measure for ranking problems. Computes the fraction
    of pairwise disagreements between the correct and predicted
    rankings. An O(n^2)-time implementation, which can be slow for
    large problems (a loglinear time implementation would be possible
    using search trees). For query-structured data, one would typically
    want to compute the disagreement separately for each query, and
    average.

    If 2-dimensional arrays are supplied as arguments, then the
    disagreement is separately computed for each column, after which
    the disagreements are averaged.

    Parameters
    ----------
    Y : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Correct utility values, can be any real numbers
    P : {array-like}, shape = [n_samples] or [n_samples, n_labels]
        Predicted utility values, can be any real numbers.

    Returns
    -------
    disagreement : float
        number between 0 and 1
    """
    Y = array_tools.as_labelmatrix(Y)
    P = array_tools.as_labelmatrix(P)
    perfs = disagreement_multitask(Y, P)
    perfs = np.array(perfs)
    perfs = perfs[np.invert(np.isnan(perfs))]
    if len(perfs) == 0:
        raise UndefinedPerformance("No pairs, all the instances have the same label")
    perf = np.mean(perfs)
    return perf
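#A minimal usage sketch (hypothetical, assumes numpy as np). This
#measure mirrors cindex: the two scores should sum to one for the
#same inputs:
#
#    >>> Y = np.array([1.0, 2.0, 3.0, 4.0])
#    >>> P = np.array([1.5, 1.0, 3.5, 4.0])
#    >>> disagreement(Y, P)  #one discordant pair out of six
#    0.1666...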
def fscore_singletask(Y, P):
    correct = Y
    predictions = P
    if not np.all((Y == 1) + (Y == -1)):
        raise UndefinedPerformance("fscore accepts as Y-values only 1 and -1")
    assert len(correct) == len(predictions)
    TP = 0
    FP = 0
    FN = 0
    for i in range(len(correct)):
        if correct[i] == 1:
            if predictions[i] > 0.:
                TP += 1
            else:
                FN += 1
        elif correct[i] == -1:
            if predictions[i] > 0.:
                FP += 1
            #true negatives are not needed for computing F1
        else:
            #unreachable, the labels were validated above
            assert False
    P = float(TP) / (TP + FP)
    R = float(TP) / (TP + FN)
    F = 2. * (P * R) / (P + R)
    return F