def holdout(self, indices): """Computes hold-out predictions for a trained RLS. Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. Should correspond to one query. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout query predictions Notes ----- Computational complexity of holdout: m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h=n_hsamples O(h^3 + lmh): basic case O(min(h^3 + lh^2, d^3 + ld^2) +ldh): Linear Kernel, d < m O(min(h^3 + lh^2, b^3 + lb^2) +lbh): Sparse approximation with basis vectors """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) == 0: raise IndexError( 'Hold-out predictions can not be computed for an empty hold-out set.' ) if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') hoqid = self.qids[indices[0]] for ind in indices: if not hoqid == self.qids[ind]: raise IndexError( 'All examples in the hold-out set must have the same qid.') indlen = len(indices) Qleft = self.multipleleft[indices] sqrtQho = np.multiply(Qleft, np.sqrt(self.neweigvals)) Qho = sqrtQho * sqrtQho.T Pho = np.mat(np.ones((len(indices), 1))) / np.sqrt(len(indices)) Yho = self.Y[indices] Dho = self.D[:, indices] LhoYho = np.multiply(Dho.T, Yho) - Pho * (Pho.T * Yho) RQY = Qleft * np.multiply(self.neweigvals.T, self.multipleright) - Qho * LhoYho sqrtRQRTLho = np.multiply(Dho.T, sqrtQho) - Pho * (Pho.T * sqrtQho) if sqrtQho.shape[0] <= sqrtQho.shape[1]: RQRTLho = sqrtQho * sqrtRQRTLho.T I = np.mat(np.identity(indlen)) return np.squeeze(np.array((I - RQRTLho).I * RQY)) else: RQRTLho = sqrtRQRTLho.T * sqrtQho I = np.mat(np.identity(sqrtQho.shape[1])) return np.squeeze( np.array(RQY + sqrtQho * ((I - RQRTLho).I * (sqrtRQRTLho.T * RQY))))
def holdout(self, indices): """Computes hold-out predictions Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. The list can not be empty. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout predictions Notes ----- Computational complexity of holdout: m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h=n_hsamples O(h^3 + lmh): basic case O(min(h^3 + lh^2, d^3 + ld^2) +ldh): Linear Kernel, d < m O(min(h^3 + lh^2, b^3 + lb^2) +lbh): Sparse approximation with basis vectors The fast holdout algorithm is based on results presented in [1,2]. However, the removal of basis vectors decribed in [2] is currently not implemented. References ---------- [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski. Fast n-Fold Cross-Validation for Regularized Least-Squares. Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence, 83-90, Otamedia Oy, 2006. [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg. Efficient cross-validation for kernelized least-squares regression with sparse basis expansions. Machine Learning, 87(3):381--407, June 2012. """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') bevals = multiply(self.evals, self.newevals) A = self.svecs[indices] right = self.svecsTY - A.T * self.Y[indices] #O(hrl) RQY = A * multiply(bevals.T, right) #O(hrl) B = multiply(bevals.T, A.T) if len(indices) <= A.shape[1]: #h < r I = mat(identity(len(indices))) result = la.inv(I - A * B) * RQY #O(h^3 + h^2 * l) else: #h > r I = mat(identity(A.shape[1])) result = RQY - A * (la.inv(B * A - I) * (B * RQY)) #O(r^3 + r^2 * l + h * r * l) return np.squeeze(np.array(result))
def holdout(self, indices): """Computes hold-out predictions Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. The list can not be empty. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout predictions Notes ----- Computational complexity of holdout: m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h=n_hsamples O(h^3 + lmh): basic case O(min(h^3 + lh^2, d^3 + ld^2) +ldh): Linear Kernel, d < m O(min(h^3 + lh^2, b^3 + lb^2) +lbh): Sparse approximation with basis vectors The fast holdout algorithm is based on results presented in [1,2]. However, the removal of basis vectors decribed in [2] is currently not implemented. References ---------- [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski. Fast n-Fold Cross-Validation for Regularized Least-Squares. Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence, 83-90, Otamedia Oy, 2006. [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg. Efficient cross-validation for kernelized least-squares regression with sparse basis expansions. Machine Learning, 87(3):381--407, June 2012. """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') bevals = multiply(self.evals, self.newevals) A = self.svecs[indices] right = self.svecsTY - A.T * self.Y[indices] #O(hrl) RQY = A * multiply(bevals.T, right) #O(hrl) B = multiply(bevals.T, A.T) if len(indices) <= A.shape[1]: #h < r I = mat(identity(len(indices))) result = la.inv(I - A * B) * RQY #O(h^3 + h^2 * l) else: #h > r I = mat(identity(A.shape[1])) result = RQY - A * (la.inv(B * A - I) * (B * RQY)) #O(r^3 + r^2 * l + h * r * l) return np.squeeze(np.array(result))
def holdout(self, indices): """Computes hold-out predictions for a trained RLS. Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. Should correspond to one query. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout query predictions Notes ----- Computational complexity of holdout: m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h=n_hsamples O(h^3 + lmh): basic case O(min(h^3 + lh^2, d^3 + ld^2) +ldh): Linear Kernel, d < m O(min(h^3 + lh^2, b^3 + lb^2) +lbh): Sparse approximation with basis vectors """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) == 0: raise IndexError('Hold-out predictions can not be computed for an empty hold-out set.') if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') hoqid = self.qids[indices[0]] for ind in indices: if not hoqid == self.qids[ind]: raise IndexError('All examples in the hold-out set must have the same qid.') indlen = len(indices) Qleft = self.multipleleft[indices] sqrtQho = np.multiply(Qleft, np.sqrt(self.neweigvals)) Qho = sqrtQho * sqrtQho.T Pho = np.mat(np.ones((len(indices),1))) / np.sqrt(len(indices)) Yho = self.Y[indices] Dho = self.D[:, indices] LhoYho = np.multiply(Dho.T, Yho) - Pho * (Pho.T * Yho) RQY = Qleft * np.multiply(self.neweigvals.T, self.multipleright) - Qho * LhoYho sqrtRQRTLho = np.multiply(Dho.T, sqrtQho) - Pho * (Pho.T * sqrtQho) if sqrtQho.shape[0] <= sqrtQho.shape[1]: RQRTLho = sqrtQho * sqrtRQRTLho.T I = np.mat(np.identity(indlen)) return np.squeeze(np.array((I - RQRTLho).I * RQY)) else: RQRTLho = sqrtRQRTLho.T * sqrtQho I = np.mat(np.identity(sqrtQho.shape[1])) return np.squeeze(np.array(RQY + sqrtQho * ((I - RQRTLho).I * (sqrtRQRTLho.T * RQY))))
def leave_pair_out(self, pairs_start_inds, pairs_end_inds):
    """Computes leave-pair-out predictions

    Parameters
    ----------
    pairs_start_inds : list of indices, shape = [n_pairs]
        list of indices from range [0, n_samples-1]
    pairs_end_inds : list of indices, shape = [n_pairs]
        list of indices from range [0, n_samples-1]

    Returns
    -------
    P1 : array, shape = [n_pairs, n_labels]
        holdout predictions for pairs_start_inds
    P2 : array, shape = [n_pairs, n_labels]
        holdout predictions for pairs_end_inds

    Notes
    -----
    Computes the leave-pair-out cross-validation predictions, where each
    (i,j) pair with i = pairs_start_inds[k] and j = pairs_end_inds[k] is
    left out in turn.

    When estimating area under ROC curve with leave-pair-out, one should
    leave out all positive-negative pairs, while for estimating the general
    ranking error one should leave out all pairs with different labels.

    Computational complexity of leave-pair-out with most pairs left out:
    m = n_samples, d = n_features, l = n_labels, b = n_bvectors

    O(lm^2 + m^3): basic case

    O(lm^2 + dm^2): Linear Kernel, d < m

    O(lm^2 + bm^2): Sparse approximation with basis vectors

    The algorithm is an adaptation of the method published originally in [1].
    The use of leave-pair-out cross-validation for AUC estimation has been
    analyzed in [2].

    References
    ----------
    [1] Tapio Pahikkala, Antti Airola, Jorma Boberg, and Tapio Salakoski.
    Exact and efficient leave-pair-out cross-validation for ranking RLS.
    In Proceedings of the 2nd International and Interdisciplinary Conference
    on Adaptive Knowledge Representation and Reasoning (AKRR'08), pages 1-8,
    Espoo, Finland, 2008.

    [2] Antti Airola, Tapio Pahikkala, Willem Waegeman, Bernard De Baets,
    and Tapio Salakoski.
    An experimental comparison of cross-validation techniques for estimating
    the area under the ROC curve.
    Computational Statistics & Data Analysis, 55(4):1828--1844, April 2011.
    """
    pairs_start_inds = array_tools.as_index_list(pairs_start_inds, self.Y.shape[0])
    pairs_end_inds = array_tools.as_index_list(pairs_end_inds, self.Y.shape[0])
    pairslen = len(pairs_start_inds)
    if len(pairs_start_inds) != len(pairs_end_inds):
        raise Exception("Incorrect arguments: lengths of pairs_start_inds and pairs_end_inds do not match")
    bevals = multiply(self.evals, self.newevals)
    svecsbevals = multiply(self.svecs, bevals)
    # Diagonal of the hat matrix, needed by the pairwise update routine.
    hatmatrixdiagonal = np.squeeze(np.array(np.sum(np.multiply(self.svecs, svecsbevals), axis=1)))
    svecsbevalssvecsTY = svecsbevals * self.svecsTY
    results_first = np.zeros((pairslen, self.Y.shape[1]))
    results_second = np.zeros((pairslen, self.Y.shape[1]))
    # Compiled helper that sweeps over the pairs, filling the result buffers.
    _rls.leave_pair_out(pairslen,
                        pairs_start_inds,
                        pairs_end_inds,
                        self.Y.shape[1],
                        self.Y,
                        self.svecs,
                        np.atleast_1d(np.squeeze(np.array(bevals))),
                        self.svecs.shape[1],
                        hatmatrixdiagonal,
                        svecsbevalssvecsTY,
                        results_first,
                        results_second)
    return np.squeeze(results_first), np.squeeze(results_second)
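# Illustrative sketch: leave-pair-out AUC estimation with the method above,
# assuming this excerpt is rlscore.learner.RLS from the RLScore package.
# The binary labels are synthetic; as the docstring recommends for AUC, all
# positive-negative pairs are left out.
import numpy as np
from rlscore.learner import RLS

X = np.random.randn(100, 10)
Y = np.where(np.random.randn(100) > 0, 1., -1.)   # binary +1/-1 labels
learner = RLS(X, Y, regparam=1.0)
pos = np.where(Y > 0)[0]
neg = np.where(Y < 0)[0]
starts, ends = np.meshgrid(pos, neg)              # all positive-negative pairs
starts, ends = starts.ravel(), ends.ravel()
P1, P2 = learner.leave_pair_out(starts, ends)
# LPO estimate of AUC: fraction of pairs ranked correctly, ties count half.
auc = np.mean(P1 > P2) + 0.5 * np.mean(P1 == P2)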
def holdout(self, indices): """Computes hold-out predictions for a trained RankRLS Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. The list can not be empty. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout predictions Notes ----- The algorithm is a modification of the ones published in [1,2] for the regular RLS method. References ---------- [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski. Fast n-Fold Cross-Validation for Regularized Least-Squares. Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence, 83-90, Otamedia Oy, 2006. [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg. Efficient cross-validation for kernelized least-squares regression with sparse basis expansions. Machine Learning, 87(3):381--407, June 2012. """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') Y = self.Y m = self.size evals, V = self.evals, self.svecs #results = [] C = np.mat(np.zeros((self.size, 3), dtype = np.float64)) onevec = np.mat(np.ones((self.size, 1), dtype = np.float64)) for i in range(self.size): C[i, 0] = 1. VTY = V.T * Y VTC = V.T * onevec CTY = onevec.T * Y holen = len(indices) newevals = multiply(evals, 1. / ((m - holen) * evals + self.regparam)) R = np.mat(np.zeros((holen, holen + 1), dtype = np.float64)) for i in range(len(indices)): R[i, 0] = -1. R[i, i + 1] = sqrt(self.size - float(holen)) Vho = V[indices] Vhov = multiply(Vho, newevals) Ghoho = Vhov * Vho.T GCho = Vhov * VTC GBho = Ghoho * R for i in range(len(indices)): GBho[i, 0] += GCho[i, 0] CTGC = multiply(VTC.T, newevals) * VTC RTGCho = R.T * GCho BTGB = R.T * Ghoho * R for i in range(len(indices) + 1): BTGB[i, 0] += RTGCho[i, 0] BTGB[0, i] += RTGCho[i, 0] BTGB[0, 0] += CTGC[0, 0] BTY = R.T * Y[indices] BTY[0] = BTY[0] + CTY[0] GDYho = Vhov * (self.size - holen) * VTY GLYho = GDYho - GBho * BTY CTGDY = multiply(VTC.T, newevals) * (self.size - holen) * VTY BTGLY = R.T * GDYho - BTGB * BTY BTGLY[0] = BTGLY[0] + CTGDY[0] F = GLYho - GBho * la.inv(-mat(eye(holen + 1)) + BTGB) * BTGLY #results.append(F) #return results F = np.squeeze(np.array(F)) return F
def leave_pair_out(self, pairs_start_inds, pairs_end_inds):
    """Computes leave-pair-out predictions for a trained RankRLS.

    Parameters
    ----------
    pairs_start_inds : list of indices, shape = [n_pairs]
        list of indices from range [0, n_samples-1]
    pairs_end_inds : list of indices, shape = [n_pairs]
        list of indices from range [0, n_samples-1]

    Returns
    -------
    P1 : array, shape = [n_pairs]
        holdout predictions for pairs_start_inds
    P2 : array, shape = [n_pairs]
        holdout predictions for pairs_end_inds

    Notes
    -----
    Computes the leave-pair-out cross-validation predictions, where each
    (i,j) pair with i = pairs_start_inds[k] and j = pairs_end_inds[k] is
    left out in turn.

    When estimating area under ROC curve with leave-pair-out, one should
    leave out all positive-negative pairs, while for estimating the general
    ranking error one should leave out all pairs with different labels.

    Computational complexity of leave-pair-out with most pairs left out:
    m = n_samples, d = n_features, l = n_labels, b = n_bvectors

    O(lm^2 + m^3): basic case

    O(lm^2 + dm^2): Linear Kernel, d < m

    O(lm^2 + bm^2): Sparse approximation with basis vectors

    The leave-pair-out cross-validation algorithm is described in [1,2]. The
    use of leave-pair-out cross-validation for AUC estimation has been
    analyzed in [3].

    References
    ----------
    [1] Tapio Pahikkala, Evgeni Tsivtsivadze, Antti Airola, Jouni Jarvinen,
    and Jorma Boberg.
    An efficient algorithm for learning to rank from preference graphs.
    Machine Learning, 75(1):129-165, 2009.

    [2] Tapio Pahikkala, Antti Airola, Jorma Boberg, and Tapio Salakoski.
    Exact and efficient leave-pair-out cross-validation for ranking RLS.
    In Proceedings of the 2nd International and Interdisciplinary Conference
    on Adaptive Knowledge Representation and Reasoning (AKRR'08), pages 1-8,
    Espoo, Finland, 2008.

    [3] Antti Airola, Tapio Pahikkala, Willem Waegeman, Bernard De Baets,
    and Tapio Salakoski.
    An experimental comparison of cross-validation techniques for estimating
    the area under the ROC curve.
    Computational Statistics & Data Analysis, 55(4):1828--1844, April 2011.
    """
    pairs_start_inds = array_tools.as_index_list(pairs_start_inds, self.Y.shape[0])
    pairs_end_inds = array_tools.as_index_list(pairs_end_inds, self.Y.shape[0])
    evals, svecs = self.evals, self.svecs
    m = self.size
    Y = self.Y
    # Eigenvalues modified for an (m - 2)-sized training set, i.e. with
    # each pair removed in turn.
    modevals = np.squeeze(np.array(np.multiply(evals, 1. / ((m - 2.) * evals + self.regparam))))
    GDY = (self.size - 2.) * (svecs * np.multiply(np.mat(modevals).T, (svecs.T * Y)))
    GC = np.squeeze(np.array(svecs * np.multiply(np.mat(modevals).T, np.sum(svecs.T, axis=1))))
    CTGC = np.sum(GC)
    pairslen = len(pairs_start_inds)
    # Pre-allocated work buffers for the compiled pairwise routine.
    sm2Gdiag = np.zeros((self.Y.shape[0]))
    BTY = np.zeros((self.Y.shape))
    sqrtsm2GDY = np.zeros((self.Y.shape))
    BTGBBTY = np.zeros((self.Y.shape))
    results_first = np.zeros((pairslen, self.Y.shape[1]))
    results_second = np.zeros((pairslen, self.Y.shape[1]))
    _global_rankrls.leave_pair_out(pairslen,
                                   self.Y.shape[0],
                                   pairs_start_inds,
                                   pairs_end_inds,
                                   self.Y.shape[1],
                                   Y,
                                   svecs,
                                   modevals,
                                   svecs.shape[1],
                                   np.zeros((self.Y.shape[0])),
                                   np.squeeze(np.array(GC)),
                                   sm2Gdiag,
                                   CTGC,
                                   GDY,
                                   BTY,
                                   sqrtsm2GDY,
                                   BTGBBTY,
                                   np.array(np.sum(Y, axis=0))[0],    # CTY
                                   np.array(np.sum(GDY, axis=0))[0],  # CTGDY
                                   results_first,
                                   results_second)
    return np.squeeze(results_first), np.squeeze(results_second)
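# Illustrative sketch: leave-pair-out estimate of the pairwise ranking error
# with the method above, assuming this excerpt is
# rlscore.learner.GlobalRankRLS from the RLScore package. Synthetic data; as
# the docstring recommends, all pairs with differing labels are left out.
import numpy as np
from rlscore.learner import GlobalRankRLS

X = np.random.randn(60, 8)
Y = np.random.randn(60)
learner = GlobalRankRLS(X, Y, regparam=1.0)
ii, jj = np.where(Y[:, None] > Y[None, :])      # all pairs with Y[i] > Y[j]
P1, P2 = learner.leave_pair_out(ii, jj)
# Fraction of pairs misordered by the hold-out predictions, ties count half.
ranking_error = np.mean(P1 < P2) + 0.5 * np.mean(P1 == P2)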
def holdout(self, indices): """Computes hold-out predictions for a trained RankRLS Parameters ---------- indices : list of indices, shape = [n_hsamples] list of indices of training examples belonging to the set for which the hold-out predictions are calculated. The list can not be empty. Returns ------- F : array, shape = [n_hsamples, n_labels] holdout predictions Notes ----- The algorithm is a modification of the ones published in [1,2] for the regular RLS method. References ---------- [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski. Fast n-Fold Cross-Validation for Regularized Least-Squares. Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence, 83-90, Otamedia Oy, 2006. [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg. Efficient cross-validation for kernelized least-squares regression with sparse basis expansions. Machine Learning, 87(3):381--407, June 2012. """ indices = array_tools.as_index_list(indices, self.Y.shape[0]) if len(indices) != len(np.unique(indices)): raise IndexError('Hold-out can have each index only once.') Y = self.Y m = self.size evals, V = self.evals, self.svecs #results = [] C = np.mat(np.zeros((self.size, 3), dtype=np.float64)) onevec = np.mat(np.ones((self.size, 1), dtype=np.float64)) for i in range(self.size): C[i, 0] = 1. VTY = V.T * Y VTC = V.T * onevec CTY = onevec.T * Y holen = len(indices) newevals = multiply(evals, 1. / ((m - holen) * evals + self.regparam)) R = np.mat(np.zeros((holen, holen + 1), dtype=np.float64)) for i in range(len(indices)): R[i, 0] = -1. R[i, i + 1] = sqrt(self.size - float(holen)) Vho = V[indices] Vhov = multiply(Vho, newevals) Ghoho = Vhov * Vho.T GCho = Vhov * VTC GBho = Ghoho * R for i in range(len(indices)): GBho[i, 0] += GCho[i, 0] CTGC = multiply(VTC.T, newevals) * VTC RTGCho = R.T * GCho BTGB = R.T * Ghoho * R for i in range(len(indices) + 1): BTGB[i, 0] += RTGCho[i, 0] BTGB[0, i] += RTGCho[i, 0] BTGB[0, 0] += CTGC[0, 0] BTY = R.T * Y[indices] #BTY[0, 0] += CTY[0, 0] BTY[0] = BTY[0] + CTY[0] GDYho = Vhov * (self.size - holen) * VTY GLYho = GDYho - GBho * BTY CTGDY = multiply(VTC.T, newevals) * (self.size - holen) * VTY BTGLY = R.T * GDYho - BTGB * BTY #BTGLY[0, 0] += CTGDY[0, 0] BTGLY[0] = BTGLY[0] + CTGDY[0] F = GLYho - GBho * la.inv(-mat(eye(holen + 1)) + BTGB) * BTGLY #results.append(F) #return results F = np.squeeze(np.array(F)) return F