Example #1
    def holdout(self, indices):
        """Computes hold-out predictions for a trained RLS.
        
        Parameters
        ----------
        indices : list of indices, shape = [n_hsamples]
            Indices of the training examples for which the hold-out predictions
            are calculated. All indices must belong to the same query.

        Returns
        -------
        F : array, shape = [n_hsamples, n_labels]
            holdout query predictions
            
        Notes
        -----
        
        Computational complexity of holdout:
        m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h = n_hsamples

        O(h^3 + lmh): basic case

        O(min(h^3 + lh^2, d^3 + ld^2) + ldh): Linear Kernel, d < m

        O(min(h^3 + lh^2, b^3 + lb^2) + lbh): Sparse approximation with basis vectors
        """
        indices = array_tools.as_index_list(indices, self.Y.shape[0])
        if len(indices) == 0:
            raise IndexError(
                'Hold-out predictions cannot be computed for an empty hold-out set.'
            )
        if len(indices) != len(np.unique(indices)):
            raise IndexError('The hold-out set can contain each index only once.')
        hoqid = self.qids[indices[0]]
        for ind in indices:
            if not hoqid == self.qids[ind]:
                raise IndexError(
                    'All examples in the hold-out set must have the same qid.')

        indlen = len(indices)
        Qleft = self.multipleleft[indices]
        sqrtQho = np.multiply(Qleft, np.sqrt(self.neweigvals))
        Qho = sqrtQho * sqrtQho.T
        Pho = np.mat(np.ones((len(indices), 1))) / np.sqrt(len(indices))
        Yho = self.Y[indices]
        Dho = self.D[:, indices]
        LhoYho = np.multiply(Dho.T, Yho) - Pho * (Pho.T * Yho)
        RQY = Qleft * np.multiply(self.neweigvals.T,
                                  self.multipleright) - Qho * LhoYho
        sqrtRQRTLho = np.multiply(Dho.T, sqrtQho) - Pho * (Pho.T * sqrtQho)
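        # Invert whichever square system is smaller: the h x h hold-out block in
        # the if-branch, or the equivalent r x r system obtained via the matrix
        # inversion lemma in the else-branch (r = number of eigenvectors).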
        if sqrtQho.shape[0] <= sqrtQho.shape[1]:
            RQRTLho = sqrtQho * sqrtRQRTLho.T
            I = np.mat(np.identity(indlen))
            return np.squeeze(np.array((I - RQRTLho).I * RQY))
        else:
            RQRTLho = sqrtRQRTLho.T * sqrtQho
            I = np.mat(np.identity(sqrtQho.shape[1]))
            return np.squeeze(
                np.array(RQY + sqrtQho * ((I - RQRTLho).I *
                                          (sqrtRQRTLho.T * RQY))))
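
A minimal usage sketch for this query-structured hold-out. The learner construction assumes an RLScore-style QueryRankRLS class trained with per-example query ids; the import path and constructor signature are assumptions, not part of the example above:

    import numpy as np
    from rlscore.learner import QueryRankRLS  # assumed import path

    np.random.seed(0)
    X = np.random.randn(100, 10)           # 100 training examples, 10 features
    Y = np.random.randn(100)               # scores to be ranked within queries
    qids = np.repeat(np.arange(10), 10)    # 10 queries, 10 examples each

    learner = QueryRankRLS(X, Y, qids=qids, regparam=1.0)
    # Hold out the whole first query; all held-out indices must share one qid,
    # matching the check at the start of holdout().
    hold_out = np.where(qids == 0)[0]
    F = learner.holdout(hold_out)          # predictions for the held-out query
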
Example #2
    def holdout(self, indices):
        """Computes hold-out predictions
        
        Parameters
        ----------
        indices : list of indices, shape = [n_hsamples]
            Indices of the training examples for which the hold-out predictions
            are calculated. The list cannot be empty.

        Returns
        -------
        F : array, shape = [n_hsamples, n_labels]
            holdout predictions
            
        Notes
        -----
        
        Computational complexity of holdout:
        m = n_samples, d = n_features, l = n_labels, b = n_bvectors, h = n_hsamples

        O(h^3 + lmh): basic case

        O(min(h^3 + lh^2, d^3 + ld^2) + ldh): Linear Kernel, d < m

        O(min(h^3 + lh^2, b^3 + lb^2) + lbh): Sparse approximation with basis vectors

        The fast holdout algorithm is based on results presented in [1,2]. However,
        the removal of basis vectors described in [2] is currently not implemented.
            
        References
        ----------
        
        [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski.
        Fast n-Fold Cross-Validation for Regularized Least-Squares.
        Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence,
        83-90, Otamedia Oy, 2006.
        
        [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg.
        Efficient cross-validation for kernelized least-squares regression with sparse basis expansions.
        Machine Learning, 87(3):381--407, June 2012.
        """
        indices = array_tools.as_index_list(indices, self.Y.shape[0])

        if len(indices) != len(np.unique(indices)):
            raise IndexError('The hold-out set can contain each index only once.')

        bevals = multiply(self.evals, self.newevals)
        A = self.svecs[indices]
        right = self.svecsTY - A.T * self.Y[indices]  #O(hrl)
        RQY = A * multiply(bevals.T, right)  #O(hrl)
        B = multiply(bevals.T, A.T)
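        # Pick the cheaper of two equivalent inversions: an h x h system when the
        # hold-out set is small, an r x r system (matrix inversion lemma) otherwise.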
        if len(indices) <= A.shape[1]:  #h <= r
            I = mat(identity(len(indices)))
            result = la.inv(I - A * B) * RQY  #O(h^3 + h^2 * l)
        else:  #h > r
            I = mat(identity(A.shape[1]))
            result = RQY - A * (la.inv(B * A - I) *
                                (B * RQY))  #O(r^3 + r^2 * l + h * r * l)
        return np.squeeze(np.array(result))
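
The hold-out predictions allow fast K-fold cross-validation without retraining. A minimal sketch, assuming the class above is exposed as rlscore.learner.RLS (the import path and constructor are assumptions):

    import numpy as np
    from rlscore.learner import RLS  # assumed import path

    np.random.seed(0)
    X = np.random.randn(60, 5)
    Y = np.random.randn(60)

    learner = RLS(X, Y, regparam=1.0)       # train once on the full data
    folds = np.array_split(np.arange(60), 5)
    errors = []
    for fold in folds:
        F = learner.holdout(fold)           # hold-out predictions for this fold
        errors.append(np.mean((F - Y[fold]) ** 2))
    print("5-fold CV MSE: %f" % np.mean(errors))
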
Example #3
    def leave_pair_out(self, pairs_start_inds, pairs_end_inds):
        """Computes leave-pair-out predictions
        
        Parameters
        ----------
        pairs_start_inds : list of indices, shape = [n_pairs]
            list of indices from range [0, n_samples-1]
        pairs_end_inds : list of indices, shape = [n_pairs]
            list of indices from range [0, n_samples-1]
        
        Returns
        -------
        P1 : array, shape = [n_pairs, n_labels]
            holdout predictions for pairs_start_inds
        P2 : array, shape = [n_pairs, n_labels]
            holdout predictions for pairs_end_inds
            
        Notes
        -----
    
        Computes the leave-pair-out cross-validation predictions, where each (i,j) pair with
        i = pairs_start_inds[k] and j = pairs_end_inds[k] is left out in turn.

        When estimating the area under the ROC curve with leave-pair-out, one should leave
        out all positive-negative pairs, while for estimating the general ranking error one
        should leave out all pairs with different labels.

        Computational complexity of leave-pair-out with most pairs left out:
        m = n_samples, d = n_features, l = n_labels, b = n_bvectors

        O(lm^2 + m^3): basic case

        O(lm^2 + dm^2): Linear Kernel, d < m

        O(lm^2 + bm^2): Sparse approximation with basis vectors
        
        The algorithm is an adaptation of the method published originally in [1]. The use of
        leave-pair-out cross-validation for AUC estimation has been analyzed in [2].

        References
        ---------- 
        
        [1] Tapio Pahikkala, Antti Airola, Jorma Boberg, and Tapio Salakoski.
        Exact and efficient leave-pair-out cross-validation for ranking RLS.
        In Proceedings of the 2nd International and Interdisciplinary Conference
        on Adaptive Knowledge Representation and Reasoning (AKRR'08), pages 1-8,
        Espoo, Finland, 2008.
        
        [2] Antti Airola, Tapio Pahikkala, Willem Waegeman, Bernard De Baets, and Tapio Salakoski.
        An experimental comparison of cross-validation techniques for estimating the area under the ROC curve.
        Computational Statistics & Data Analysis, 55(4):1828--1844, April 2011.
        """
        
        pairs_start_inds = array_tools.as_index_list(pairs_start_inds, self.Y.shape[0])
        pairs_end_inds = array_tools.as_index_list(pairs_end_inds, self.Y.shape[0])
        pairslen = len(pairs_start_inds)
        if len(pairs_start_inds) != len(pairs_end_inds):
            raise Exception("Incorrect arguments: lengths of pairs_start_inds and pairs_end_inds do not match")
        
        bevals = multiply(self.evals, self.newevals)
        svecsbevals = multiply(self.svecs, bevals)
        hatmatrixdiagonal = np.squeeze(np.array(np.sum(np.multiply(self.svecs, svecsbevals), axis = 1)))
        svecsbevalssvecsTY = svecsbevals * self.svecsTY
        results_first = np.zeros((pairslen, self.Y.shape[1]))
        results_second = np.zeros((pairslen, self.Y.shape[1]))
        _rls.leave_pair_out(pairslen,
                            pairs_start_inds,
                            pairs_end_inds,
                            self.Y.shape[1],
                            self.Y,
                            self.svecs,
                            np.atleast_1d(np.squeeze(np.array(bevals))),
                            self.svecs.shape[1],
                            hatmatrixdiagonal,
                            svecsbevalssvecsTY,
                            results_first,
                            results_second)
        return np.squeeze(results_first), np.squeeze(results_second)
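
A sketch of the AUC use case from the Notes: leave out every positive-negative pair and count how often the positive member is ranked higher. It assumes the method shown above is exposed on an RLScore-style rlscore.learner.RLS class:

    import numpy as np
    from rlscore.learner import RLS  # assumed import path

    np.random.seed(0)
    X = np.random.randn(40, 5)
    Y = np.where(np.random.randn(40) > 0, 1., -1.)   # binary labels in {-1, +1}

    learner = RLS(X, Y, regparam=1.0)
    pos = np.where(Y > 0)[0]
    neg = np.where(Y < 0)[0]
    starts = np.repeat(pos, len(neg))       # all positive-negative index pairs
    ends = np.tile(neg, len(pos))
    P1, P2 = learner.leave_pair_out(starts, ends)
    # Leave-pair-out AUC: fraction of pairs ranked correctly, ties count half.
    auc = np.mean((P1 > P2) + 0.5 * (P1 == P2))
    print("LPO AUC estimate: %f" % auc)
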
Example #4
    def holdout(self, indices):
        """Computes hold-out predictions for a trained RankRLS
        
        Parameters
        ----------
        indices : list of indices, shape = [n_hsamples]
            Indices of the training examples for which the hold-out predictions
            are calculated. The list cannot be empty.

        Returns
        -------
        F : array, shape = [n_hsamples, n_labels]
            holdout predictions
            
        Notes
        -----
    
        The algorithm is a modification of the ones published in [1,2] for the regular RLS method.
        
        References
        ----------
        
        [1] Tapio Pahikkala, Jorma Boberg, and Tapio Salakoski.
        Fast n-Fold Cross-Validation for Regularized Least-Squares.
        Proceedings of the Ninth Scandinavian Conference on Artificial Intelligence,
        83-90, Otamedia Oy, 2006.
        
        [2] Tapio Pahikkala, Hanna Suominen, and Jorma Boberg.
        Efficient cross-validation for kernelized least-squares regression with sparse basis expansions.
        Machine Learning, 87(3):381--407, June 2012.     
        """
        
        indices = array_tools.as_index_list(indices, self.Y.shape[0])
        
        if len(indices) != len(np.unique(indices)):
            raise IndexError('The hold-out set can contain each index only once.')
        
        Y = self.Y
        m = self.size
        
        evals, V = self.evals, self.svecs
        
        onevec = np.mat(np.ones((self.size, 1), dtype = np.float64))

        VTY = V.T * Y
        VTC = V.T * onevec
        CTY = onevec.T * Y
        
        holen = len(indices)
        
        newevals = multiply(evals, 1. / ((m - holen) * evals + self.regparam))
        
        R = np.mat(np.zeros((holen, holen + 1), dtype = np.float64))
        for i in range(len(indices)):
            R[i, 0] = -1.
            R[i, i + 1] = sqrt(self.size - float(holen))
        
        Vho = V[indices]
        Vhov = multiply(Vho, newevals)
        Ghoho = Vhov * Vho.T
        GCho = Vhov * VTC
        GBho = Ghoho * R
        for i in range(len(indices)):
            GBho[i, 0] += GCho[i, 0]
        
        CTGC = multiply(VTC.T, newevals) * VTC
        RTGCho = R.T * GCho
        
        BTGB = R.T * Ghoho * R
        for i in range(len(indices) + 1):
            BTGB[i, 0] += RTGCho[i, 0]
            BTGB[0, i] += RTGCho[i, 0]
        BTGB[0, 0] += CTGC[0, 0]
        
        BTY = R.T * Y[indices]
        BTY[0] = BTY[0] + CTY[0]
        
        GDYho = Vhov * (self.size - holen) * VTY
        GLYho = GDYho - GBho * BTY
        
        CTGDY = multiply(VTC.T, newevals) * (self.size - holen) * VTY
        BTGLY = R.T * GDYho - BTGB * BTY
        BTGLY[0] = BTGLY[0] + CTGDY[0]
        
        F = GLYho - GBho * la.inv(-mat(eye(holen + 1)) + BTGB) * BTGLY
        
        return np.squeeze(np.array(F))
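
A short usage sketch for this hold-out method; the GlobalRankRLS learner name and constructor follow RLScore conventions and are assumptions here:

    import numpy as np
    from rlscore.learner import GlobalRankRLS  # assumed import path

    np.random.seed(0)
    X = np.random.randn(50, 8)
    Y = np.random.randn(50)

    learner = GlobalRankRLS(X, Y, regparam=1.0)
    hold_out = [0, 1, 2, 3, 4]             # any non-empty set of distinct indices
    F = learner.holdout(hold_out)          # hold-out predictions, shape (5,)
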
Example #5
    def leave_pair_out(self, pairs_start_inds, pairs_end_inds):
        """Computes leave-pair-out predictions for a trained RankRLS.
        
        Parameters
        ----------
        pairs_start_inds : list of indices, shape = [n_pairs]
            list of indices from range [0, n_samples-1]
        pairs_end_inds : list of indices, shape = [n_pairs]
            list of indices from range [0, n_samples-1]
        
        Returns
        -------
        P1 : array, shape = [n_pairs]
            holdout predictions for pairs_start_inds
        P2 : array, shape = [n_pairs]
            holdout predictions for pairs_end_inds
            
        Notes
        -----
    
        Computes the leave-pair-out cross-validation predictions, where each (i,j) pair with
        i = pairs_start_inds[k] and j = pairs_end_inds[k] is left out in turn.

        When estimating the area under the ROC curve with leave-pair-out, one should leave
        out all positive-negative pairs, while for estimating the general ranking error one
        should leave out all pairs with different labels.

        Computational complexity of leave-pair-out with most pairs left out:
        m = n_samples, d = n_features, l = n_labels, b = n_bvectors

        O(lm^2 + m^3): basic case

        O(lm^2 + dm^2): Linear Kernel, d < m

        O(lm^2 + bm^2): Sparse approximation with basis vectors

        The leave-pair-out cross-validation algorithm is described in [1,2]. The use of
        leave-pair-out cross-validation for AUC estimation has been analyzed in [3].

        References
        ----------

        [1] Tapio Pahikkala, Evgeni Tsivtsivadze, Antti Airola, Jouni Jarvinen, and Jorma Boberg.
        An efficient algorithm for learning to rank from preference graphs.
        Machine Learning, 75(1):129-165, 2009.
    
        [2] Tapio Pahikkala, Antti Airola, Jorma Boberg, and Tapio Salakoski.
        Exact and efficient leave-pair-out cross-validation for ranking RLS.
        In Proceedings of the 2nd International and Interdisciplinary Conference
        on Adaptive Knowledge Representation and Reasoning (AKRR'08), pages 1-8,
        Espoo, Finland, 2008.

        [3] Antti Airola, Tapio Pahikkala, Willem Waegeman, Bernard De Baets, and Tapio Salakoski.
        An experimental comparison of cross-validation techniques for estimating the area under the ROC curve.
        Computational Statistics & Data Analysis, 55(4):1828--1844, April 2011.
        """
        pairs_start_inds = array_tools.as_index_list(pairs_start_inds, self.Y.shape[0])
        pairs_end_inds = array_tools.as_index_list(pairs_end_inds, self.Y.shape[0])
        if len(pairs_start_inds) != len(pairs_end_inds):
            raise Exception("Incorrect arguments: lengths of pairs_start_inds and pairs_end_inds do not match")
        
        evals, svecs = self.evals, self.svecs
        m = self.size
        
        Y = self.Y
        
        modevals = np.squeeze(np.array(np.multiply(evals, 1. / ((m - 2.) * evals + self.regparam))))
        GDY = (self.size - 2.) * (svecs * np.multiply(np.mat(modevals).T, (svecs.T * Y)))
        GC = np.squeeze(np.array(svecs * np.multiply(np.mat(modevals).T, np.sum(svecs.T, axis = 1))))
        CTGC = np.sum(GC)
        
        pairslen = len(pairs_start_inds)
        sm2Gdiag = np.zeros((self.Y.shape[0]))
        BTY = np.zeros((self.Y.shape))
        sqrtsm2GDY = np.zeros((self.Y.shape))
        BTGBBTY = np.zeros((self.Y.shape))
        results_first = np.zeros((pairslen, self.Y.shape[1]))
        results_second = np.zeros((pairslen, self.Y.shape[1]))
        
        _global_rankrls.leave_pair_out(pairslen,
                                       self.Y.shape[0],
                                       pairs_start_inds,
                                       pairs_end_inds,
                                       self.Y.shape[1],
                                       Y,
                                       svecs,
                                       modevals,
                                       svecs.shape[1],
                                       np.zeros((self.Y.shape[0])),
                                       np.squeeze(np.array(GC)),
                                       sm2Gdiag,
                                       CTGC,
                                       GDY,
                                       BTY,
                                       sqrtsm2GDY,
                                       BTGBBTY,
                                       np.array(np.sum(Y, axis=0))[0],    # CTY
                                       np.array(np.sum(GDY, axis=0))[0],  # CTGDY
                                       results_first,
                                       results_second)
        
        return np.squeeze(results_first), np.squeeze(results_second)
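
As a usage sketch, the two prediction arrays can be turned into a pairwise ranking accuracy over all label-discordant pairs, matching the general ranking error mentioned in the Notes (GlobalRankRLS is again an assumed RLScore-style learner name):

    import numpy as np
    from rlscore.learner import GlobalRankRLS  # assumed import path

    np.random.seed(0)
    X = np.random.randn(30, 6)
    Y = np.random.randn(30)

    learner = GlobalRankRLS(X, Y, regparam=1.0)
    # All (i, j) pairs with Y[i] > Y[j], each left out in turn.
    ii, jj = np.where(Y[:, None] > Y[None, :])
    P1, P2 = learner.leave_pair_out(ii, jj)
    accuracy = np.mean(P1 > P2)
    print("LPO pairwise ranking accuracy: %f" % accuracy)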