Code example #1
File: leapUtils.py Project: MicrosoftGenomics/LEAP
def computeCovar(bed, shrinkMethod, fitIndividuals):
	eigen = dict([])

	if (shrinkMethod in ['lw', 'oas', 'l1', 'cv']):
		import sklearn.covariance as cov
		t0 = time.time()
		print 'Estimating shrunk covariance using', shrinkMethod, 'estimator...'
				
		if (shrinkMethod == 'lw'): covEstimator = cov.LedoitWolf(assume_centered=True, block_size = 5*bed.val.shape[0])
		elif (shrinkMethod == 'oas'): covEstimator = cov.OAS(assume_centered=True)
		elif (shrinkMethod == 'l1'): covEstimator = cov.GraphLassoCV(assume_centered=True, verbose=True)
		elif (shrinkMethod == 'cv'):
			shrunkEstimator = cov.ShrunkCovariance(assume_centered=True)
			param_grid = {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]}			
			covEstimator = sklearn.grid_search.GridSearchCV(shrunkEstimator, param_grid)		
		else: raise Exception('unknown covariance regularizer')
		
		covEstimator.fit(bed.val[fitIndividuals, :].T)
		if (shrinkMethod == 'l1'):
			alpha = covEstimator.alpha_
			print 'l1 alpha chosen:', alpha
			covEstimator2 = cov.GraphLasso(alpha=alpha, assume_centered=True, verbose=True)
		else:
			if (shrinkMethod == 'cv'): shrinkEstimator = covEstimator.best_params_['shrinkage']
			else: shrinkEstimator = covEstimator.shrinkage_
			print 'shrinkage estimator:', shrinkEstimator
			covEstimator2 = cov.ShrunkCovariance(shrinkage=shrinkEstimator, assume_centered=True)
		covEstimator2.fit(bed.val.T)
		XXT = covEstimator2.covariance_ * bed.val.shape[1]
		print 'Done in %0.2f'%(time.time()-t0), 'seconds'
			
	else:
		print 'Computing kinship matrix...'	
		t0 = time.time()
		XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1))
		print 'Done in %0.2f'%(time.time()-t0), 'seconds'		
		try: shrinkParam = float(shrinkMethod)
		except: shrinkParam = -1
		if (shrinkMethod == 'mylw'):
			XXT_fit = XXT[np.ix_(fitIndividuals, fitIndividuals)]
			sE2R = (np.sum(XXT_fit**2) - np.sum(np.diag(XXT_fit)**2)) / (bed.val.shape[1]**2)
			#temp = (bed.val**2).dot((bed.val.T)**2)
			temp = symmetrize(blas.dsyrk(1.0, bed.val[fitIndividuals, :]**2, lower=1))
			sER2 = (temp.sum() - np.diag(temp).sum()) / bed.val.shape[1]
			shrinkParam = (sER2 - sE2R) / (sE2R * (bed.val.shape[1]-1))		
		if (shrinkParam > 0):
			print 'shrinkage estimator:', 1-shrinkParam
			XXT = (1-shrinkParam)*XXT + bed.val.shape[1]*shrinkParam*np.eye(XXT.shape[0])
	
	return XXT
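The symmetrize helper used throughout these snippets is defined elsewhere in leapUtils; it is needed because dsyrk writes only the requested triangle of its output (in practice the untouched triangle comes back zero-filled when no c is supplied). A minimal, self-contained sketch of the kinship pattern above; the symmetrize body here is a guess at what the project helper does, and the random matrix stands in for bed.val:

import numpy as np
from scipy.linalg import blas

def symmetrize(a):
    # mirror the filled lower triangle into the upper one
    return np.tril(a) + np.tril(a, -1).T

X = np.random.randn(100, 1000)  # individuals x SNPs
XXT = symmetrize(blas.dsyrk(1.0, X, lower=1))
assert np.allclose(XXT, X.dot(X.T))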
Code example #2
File: kernels.py Project: sspeng/MKLMM
	def __init__(self, X):
		Kernel.__init__(self)		
		self.X_scaled = X / X.shape[1]		
		if (X.shape[1] >= X.shape[0]): self.XXT = gpUtils.symmetrize(blas.dsyrk(1.0/X.shape[1], X, lower=0))
		else:
			self.XXT = None
			self.X = X
Code example #3
File: dsyrk.py Project: bdatdo0601/projectAri
import time

from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas


def run_dsyrk(N, l):

    A = randn(N, N).astype('float64', order='F')
    C = zeros((N, N), dtype='float64', order='F')

    start = time.time()
    for i in range(0, l):
        blas.dsyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()

    timediff = (end - start)
    mflops = (N * N * N) * l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
Code example #4
def tdot_blas(mat, out=None):
    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
    if (mat.dtype != 'float64') or (len(mat.shape) != 2):
        return np.dot(mat, mat.T)
    nn = mat.shape[0]
    if out is None:
        out = np.zeros((nn, nn))
    else:
        assert (out.dtype == 'float64')
        assert (out.shape == (nn, nn))
        # FIXME: should allow non-contiguous out, and copy output into it:
        assert (8 in out.strides)
        # zeroing needed because of dumb way I copy across triangular answer
        out[:] = 0.0

    # Call to DSYRK from BLAS
    mat = np.asfortranarray(mat)
    out = blas.dsyrk(alpha=1.0,
                     a=mat,
                     beta=0.0,
                     c=out,
                     overwrite_c=1,
                     trans=0,
                     lower=0)

    symmetrify(out, upper=True)
    return np.ascontiguousarray(out)
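A quick consistency check for tdot_blas; this assumes np and blas are imported at module level as in the surrounding file, and that the symmetrify helper mirrors the upper triangle into the lower one (both assumptions, since only this function is shown):

A = np.random.randn(500, 2000)
K = tdot_blas(A)
assert K.flags['C_CONTIGUOUS']
assert np.allclose(K, A.dot(A.T))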
Code example #5
File: cholesky.py Project: ucerd/apps
def syrk(A, B, MKLProc):
    from scipy.linalg.blas import dsyrk
    import os
    os.environ['MKL_NUM_THREADS'] = str(MKLProc)
    alpha = -1.0
    beta = 1.0
    B = dsyrk(alpha, A, c=B, beta=beta, lower=True)
    return B
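With alpha = -1.0 and beta = 1.0 this call is the symmetric rank-k update B := B - A @ A.T applied to the trailing block of a blocked Cholesky factorization. A small sketch of the same update in isolation (random data, illustrative only; dsyrk leaves the non-requested triangle of the result untouched, so only the lower triangle is compared):

import numpy as np
from scipy.linalg.blas import dsyrk

A = np.random.randn(4, 3)
B = 10.0 * np.eye(4)
B2 = dsyrk(-1.0, A, c=B, beta=1.0, lower=True)
ref = 10.0 * np.eye(4) - A.dot(A.T)
assert np.allclose(np.tril(B2), np.tril(ref))  # only the lower triangle is valid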
Code example #6
    def create_standard_kinship(self):
        """
        compute kinship matrix ( X * X.transpose() ) / (number of sites)
        returns a matrix of dimensions n x n
        """
        logging.info("Creating a standard kinship...")
        X = self.data
        if not self.standardize:
            X = tools.standardize(X, axis=0)

        # compute kinship matrix ( X * X.transpose() ) / (number of sites)
        return tools.symmetrize(blas.dsyrk(1.0, X, lower=1)) / X.shape[1]
Code example #7
def _XXT(XT):
    """
	[Added 30/9/2018]
	Computes X @ XT much faster than naive X @ XT.
	Notice X @ XT is symmetric, hence instead of doing the
	full matrix multiplication X @ XT which takes O(pn^2) time,
	compute only the upper triangular which takes slightly
	less time and memory.
	"""
    if XT.dtype == float64:
        return dsyrk(1, XT, trans=1).T
    return ssyrk(1, XT, trans=1).T
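With trans=1, dsyrk fills the upper triangle of XT.T @ XT, so the trailing .T above returns a matrix whose lower triangle holds X @ XT; the caller still has to mirror it into a full symmetric matrix. A hedged sketch of that completion step (the _XXT_full wrapper is illustrative, not part of the source project):

import numpy as np
from scipy.linalg.blas import dsyrk

def _XXT_full(XT):
    K = dsyrk(1.0, XT, trans=1).T  # lower triangle of X @ XT, where X = XT.T
    return np.tril(K) + np.tril(K, -1).T

XT = np.random.randn(50, 200)  # p x n layout, matching the argument name
assert np.allclose(_XXT_full(XT), XT.T.dot(XT))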
Code example #8
File: leapUtils.py Project: MicrosoftGenomics/LEAP
def findRelated(bed, cutoff):
	print 'Computing kinship matrix...'
	t0 = time.time()	
	XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1) / bed.val.shape[1])
	print 'Done in %0.2f'%(time.time()-t0), 'seconds'

	#Find related individuals
	removeSet = set(np.sort(vc.VertexCut().work(XXT, cutoff))) #These are the indexes of the IIDs to remove		
	print 'Marking', len(removeSet), 'individuals to be removed due to high relatedness'
	
	#keepArr = np.array([(1 if iid in keepSet else 0) for iid in bed.iid], dtype=bool)	
	keepArr = np.ones(bed.iid.shape[0], dtype=bool)
	for i in removeSet: keepArr[i] = False	
	return keepArr
Code example #9
def findRelated(bed, cutoff):
    print 'Computing kinship matrix...'
    t0 = time.time()
    XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1) / bed.val.shape[1])
    print 'Done in %0.2f' % (time.time() - t0), 'seconds'

    #Find related individuals
    removeSet = set(np.sort(vc.VertexCut().work(
        XXT, cutoff)))  #These are the indexes of the IIDs to remove
    print 'Marking', len(
        removeSet), 'individuals to be removed due to high relatedness'

    #keepArr = np.array([(1 if iid in keepSet else 0) for iid in bed.iid], dtype=bool)
    keepArr = np.ones(bed.iid.shape[0], dtype=bool)
    for i in removeSet:
        keepArr[i] = False
    return keepArr
Code example #10
def eigenDecompose(bed, outFile=None):

	bed = leapUtils._fixupBed(bed)

	#Compute kinship matrix
	t0 = time.time()
	print 'Computing kinship matrix...'	
	XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) / bed.val.shape[1]
	print 'Done in %0.2f'%(time.time()-t0), 'seconds'

	#Compute eigendecomposition
	S,U = leapUtils.eigenDecompose(XXT)
	if (outFile is not None): np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)	
	eigen = dict([])
	eigen['XXT'] = XXT
	eigen['arr_0'] = U
	eigen['arr_1'] = S
	return eigen
Code example #11
File: eigenDecompose.py Project: cthorball/LEAP
def eigenDecompose(bed, outFile):

    bed = leapUtils._fixupBed(bed)

    # Compute kinship matrix
    t0 = time.time()
    print "Computing kinship matrix..."
    XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) / bed.val.shape[1]
    print "Done in %0.2f" % (time.time() - t0), "seconds"

    # Compute eigendecomposition
    S, U = leapUtils.eigenDecompose(XXT)
    if outFile is not None:
        np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)
    eigen = dict([])
    eigen["XXT"] = XXT
    eigen["arr_0"] = U
    eigen["arr_1"] = S
    return eigen
Code example #12
def eigenDecompose(bed, outFile=None):

    bed = leapUtils._fixupBed(bed)

    #Compute kinship matrix
    t0 = time.time()
    print 'Computing kinship matrix...'
    XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val,
                                          lower=1)) / bed.val.shape[1]
    print 'Done in %0.2f' % (time.time() - t0), 'seconds'

    #Compute eigendecomposition
    S, U = leapUtils.eigenDecompose(XXT)
    if (outFile is not None):
        np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)
    eigen = dict([])
    eigen['XXT'] = XXT
    eigen['arr_0'] = U
    eigen['arr_1'] = S
    return eigen
Code example #13
File: linalg.py Project: Mitan/CS4266-GP-project
def tdot_blas(mat, out=None):
    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
    if (mat.dtype != 'float64') or (len(mat.shape) != 2):
        return np.dot(mat, mat.T)
    nn = mat.shape[0]
    if out is None:
        out = np.zeros((nn, nn))
    else:
        assert(out.dtype == 'float64')
        assert(out.shape == (nn, nn))
        # FIXME: should allow non-contiguous out, and copy output into it:
        assert(8 in out.strides)
        # zeroing needed because of dumb way I copy across triangular answer
        out[:] = 0.0

    # Call to DSYRK from BLAS
    mat = np.asfortranarray(mat)
    out = blas.dsyrk(alpha=1.0, a=mat, beta=0.0, c=out, overwrite_c=1,
                     trans=0, lower=0)

    symmetrify(out, upper=True)
    return np.ascontiguousarray(out)
Code example #14
def eigenDecompose(bed, kinshipFile=None, outFile=None, ignore_neig=False):

    if (kinshipFile is None):
        #Compute kinship matrix
        bed = leapUtils._fixupBed(bed)
        t0 = time.time()
        print('Computing kinship matrix...')
        XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val,
                                              lower=1)) / bed.val.shape[1]
        print('Done in %0.2f' % (time.time() - t0), 'seconds')
    else:
        XXT = np.loadtxt(kinshipFile)

    #Compute eigendecomposition
    S, U = leapUtils.eigenDecompose(XXT, ignore_neig)
    if (outFile is not None):
        np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)
    eigen = dict([])
    eigen['XXT'] = XXT
    eigen['arr_0'] = U
    eigen['arr_1'] = S
    return eigen
Code example #15
def removeTopPCs(X, numRemovePCs):
    t0 = time.time()
    X_mean = X.mean(axis=0)
    X -= X_mean
    XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
    s, U = la.eigh(XXT)
    if (np.min(s) < -1e-4): raise Exception('Negative eigenvalues found')
    s[s < 0] = 0
    ind = np.argsort(s)[::-1]
    U = U[:, ind]
    s = s[ind]
    s = np.sqrt(s)

    #remove null PCs
    ind = (s > 1e-6)
    U = U[:, ind]
    s = s[ind]

    V = X.T.dot(U / s)
    #print 'max diff:', np.max(((U*s).dot(V.T) - X)**2)
    X = (U[:, numRemovePCs:] * s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
    X += X_mean

    return X
Code example #16
File: gpUtils.py Project: omerwe/MKLMM
def removeTopPCs(X, numRemovePCs):	
	t0 = time.time()
	X_mean = X.mean(axis=0)
	X -= X_mean
	XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
	s,U = la.eigh(XXT)
	if (np.min(s) < -1e-4): raise Exception('Negative eigenvalues found')
	s[s<0]=0
	ind = np.argsort(s)[::-1]
	U = U[:, ind]
	s = s[ind]
	s = np.sqrt(s)
		
	#remove null PCs
	ind = (s>1e-6)
	U = U[:, ind]
	s = s[ind]
	
	V = X.T.dot(U/s)	
	#print 'max diff:', np.max(((U*s).dot(V.T) - X)**2)
	X = (U[:, numRemovePCs:]*s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
	X += X_mean
	
	return X
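removeTopPCs is an SVD in disguise: if X = U S V^T after centering, then X X^T = U S^2 U^T, so eigh on the small n x n Gram matrix yields U and the squared singular values, V is recovered as X^T U / s, and keeping only columns numRemovePCs onward rebuilds X without its leading principal components. A sketch checking the reconstruction against a direct SVD, assuming the module context above (symmetrize, blas, la, np) is in scope; the copy is needed because removeTopPCs modifies its argument in place:

import numpy as np

X = np.random.randn(50, 300)
X_mean = X.mean(axis=0)
U, s, Vt = np.linalg.svd(X - X_mean, full_matrices=False)
expected = U[:, 2:].dot(np.diag(s[2:])).dot(Vt[2:]) + X_mean
assert np.allclose(removeTopPCs(X.copy(), 2), expected)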
Code example #17
File: leapUtils.py Project: Joannavonberg/LEAP
def computeCovar(bed, shrinkMethod, fitIndividuals):
    eigen = dict([])

    if (shrinkMethod in ['lw', 'oas', 'l1', 'cv']):
        import sklearn.covariance as cov
        t0 = time.time()
        print 'Estimating shrunk covariance using', shrinkMethod, 'estimator...'

        if (shrinkMethod == 'lw'):
            covEstimator = cov.LedoitWolf(assume_centered=True,
                                          block_size=5 * bed.val.shape[0])
        elif (shrinkMethod == 'oas'):
            covEstimator = cov.OAS(assume_centered=True)
        elif (shrinkMethod == 'l1'):
            covEstimator = cov.GraphLassoCV(assume_centered=True, verbose=True)
        elif (shrinkMethod == 'cv'):
            shrunkEstimator = cov.ShrunkCovariance(assume_centered=True)
            param_grid = {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]}
            covEstimator = sklearn.grid_search.GridSearchCV(
                shrunkEstimator, param_grid)
        else:
            raise Exception('unknown covariance regularizer')

        covEstimator.fit(bed.val[fitIndividuals, :].T)
        if (shrinkMethod == 'l1'):
            alpha = covEstimator.alpha_
            print 'l1 alpha chosen:', alpha
            covEstimator2 = cov.GraphLasso(alpha=alpha,
                                           assume_centered=True,
                                           verbose=True)
        else:
            if (shrinkMethod == 'cv'):
                shrinkEstimator = covEstimator.best_params_['shrinkage']
            else:
                shrinkEstimator = covEstimator.shrinkage_
            print 'shrinkage estimator:', shrinkEstimator
            covEstimator2 = cov.ShrunkCovariance(shrinkage=shrinkEstimator,
                                                 assume_centered=True)
        covEstimator2.fit(bed.val.T)
        XXT = covEstimator2.covariance_ * bed.val.shape[1]
        print 'Done in %0.2f' % (time.time() - t0), 'seconds'

    else:
        print 'Computing kinship matrix...'
        t0 = time.time()
        XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1))
        print 'Done in %0.2f' % (time.time() - t0), 'seconds'
        try:
            shrinkParam = float(shrinkMethod)
        except:
            shrinkParam = -1
        if (shrinkMethod == 'mylw'):
            XXT_fit = XXT[np.ix_(fitIndividuals, fitIndividuals)]
            sE2R = (np.sum(XXT_fit**2) -
                    np.sum(np.diag(XXT_fit)**2)) / (bed.val.shape[1]**2)
            #temp = (bed.val**2).dot((bed.val.T)**2)
            temp = symmetrize(
                blas.dsyrk(1.0, bed.val[fitIndividuals, :]**2, lower=1))
            sER2 = (temp.sum() - np.diag(temp).sum()) / bed.val.shape[1]
            shrinkParam = (sER2 - sE2R) / (sE2R * (bed.val.shape[1] - 1))
        if (shrinkParam > 0):
            print 'shrinkage estimator:', 1 - shrinkParam
            XXT = (1 - shrinkParam) * XXT + bed.val.shape[
                1] * shrinkParam * np.eye(XXT.shape[0])

    return XXT
Code example #18
File: hdda.py Project: mfauvel/HDDA
    def fit(self, X, y=None):
        """Estimate the model parameters with the EM algorithm

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        Returns
        -------
        self
        """

        # Initialization
        n, d = X.shape
        self.n = n
        self.d = d
        LL = []
        ITER = 0

        X = check_array(X, copy=False, order='C', dtype=sp.float64)

        # Compute constant
        self.cst = self.d*sp.log(2*sp.pi)

        # Set minimum clusters size
        # Rule of thumb for minimal size pi = 1:
        # one mean vector (d) + one eigenvalues/vectors (1 + d)
        # + noise term (1) ~ 2(d+1)
        if self.population is None:
            self.population = 2*(self.d+1)

        if self.population > self.n/self.C:
            print("Number of classes to high w.r.t the number of samples:"
                  "C should be deacreased")
            return - 2

        # Initialization of the clustering
        if self.C == 1:
            self.T = sp.ones((self.n, 1))
        else:
            if self.init == 'kmeans':
                label = KMeans(n_clusters=self.C,
                               n_init=1, n_jobs=-1,
                               random_state=self.random_state).fit(X).labels_
                label += 1  # Label starts at one
            elif self.init == 'random':
                sp.random.seed(self.random_state)
                label = sp.random.randint(1, high=self.C+1, size=n)
            elif self.init == 'user':
                if self.C != y.max():
                    print("The number of class does not"
                          "match between self.C and y")
                label = y
            else:
                print("Initialization should be kmeans or random or user")
                return - 2  # Bad init values

            # Convert label to membership
            self.T = sp.zeros((self.n, self.C))
            self.T[sp.arange(self.n), label-1] = 1

        # Compute the whole covariance matrix and its eigenvalues if needed
        if self.model in ('M2', 'M4', 'M6', 'M8'):
            X_ = (X - sp.mean(X, axis=0))
            # Use dsyrk to take advantage of the symmetric product
            # X^{t}X or XX^{t}
            # Transpose to put in fortran order
            if self.n >= self.d:
                W = dsyrk(1.0/self.n, X_.T, trans=False)
            else:
                W = dsyrk(1.0/self.n, X_.T, trans=True)
            del X_

            # Compute intrinsic dimension on the whole data set
            L = linalg.eigh(W, eigvals_only=True, lower=False)
            idx = L.argsort()[::-1]
            L = L[idx]
            # Check for numerical errors
            L[L < EPS] = EPS
            self.dL = sp.absolute(sp.diff(L))
            self.dL /= self.dL.max()
            del W, L

        # Initialization of the parameter
        self.m_step(X)
        ll = self.e_step(X)
        LL.append(ll)

        # Main while loop
        while(ITER < self.itermax):
            # M step
            self.free()
            self.m_step(X)

            # E step
            ll = self.e_step(X)

            LL.append(ll)
            if (abs((LL[-1]-LL[-2])/LL[-2]) < self.tol) and \
               (self.C_[-2] == self.C_[-1]):
                break
            else:
                ITER += 1

        # Return the class membership and some parameters of the optimization
        self.LL = LL
        self.bic = - 2*LL[-1] + self.q*sp.log(self.n)
        self.aic = - 2*LL[-1] + 2*self.q
        # Add small constant to ICL to prevent numerical issues
        self.icl = self.bic - 2*sp.log(self.T.max(axis=1)+EPS).sum()
        self.niter = ITER + 1

        # Remove temporary variables
        self.T = None
        self.X = None
        return self
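The n >= d test above just picks whichever symmetric product is smaller: dsyrk(1.0/n, X_.T, trans=False) forms the d x d scatter matrix X_^T X_ / n, while trans=True forms the n x n Gram matrix X_ X_^T / n. The two share their nonzero eigenvalues, so the intrinsic-dimension scree test on L is unaffected by the choice. A small check of that equivalence (illustrative data, computed densely rather than via dsyrk for brevity):

import numpy as np
from scipy.linalg import eigh

n, d = 20, 100
Xc = np.random.randn(n, d)
Xc -= Xc.mean(axis=0)
L_gram = eigh(Xc.dot(Xc.T) / n, eigvals_only=True)[::-1]     # n x n
L_scatter = eigh(Xc.T.dot(Xc) / n, eigvals_only=True)[::-1]  # d x d
assert np.allclose(L_gram, L_scatter[:n], atol=1e-10)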
Code example #19
File: hdda.py Project: lopesm/STH_measures
    def fit_init(self, x, y):
        """This  function computes  the  empirical  estimators of  the  mean
        vector,  the convariance  matrix  and the  proportion of  each
        class.
        :param x: The sample matrix, is of size x \times d where n is the number of samples and d is the number of variables
        :param y: The vector of corresponding labels, is of size n \times 1 in the supervised case and n \times C in the unsupervised case
        :type x: float
        :type y: int
        """
        ## Get information from the data
        n, d = x.shape  # Number of samples and number of variables
        if y.ndim == 1:  # Number of classes
            C = int(y.max(0))
        else:
            C = y.shape[1]

        if n != y.shape[0]:
            print("size of x and y should match")
            exit()

        ## Compute constant
        self.cst = d * sp.log(2 * sp.pi)

        ## Compute the whole covariance matrix
        if self.model in ('M2', 'M4', 'M6', 'M8'):
            X = (x - sp.mean(x, axis=0))
            if n >= d:  # Use dsyrk to take advantage of the symmetric product X^{t}X or XX^{t}
                self.W = dsyrk(
                    1.0 / n, X.T,
                    trans=False)  # Transpose to put in fortran order
            else:
                self.W = dsyrk(1.0 / n, X.T,
                               trans=True)  # Transpose to put in fortran order
            X = None

        ## Learn the empirical estimators of the model for each class
        for c in xrange(C):
            if y.ndim == 1:  # Supervised case
                j = sp.where(y == (c + 1))[0]
                self.ni.append(j.size)
                self.prop.append(float(self.ni[c]) / n)
                self.mean.append(sp.mean(x[j, :], axis=0))
                X = (x[j, :] - self.mean[c])

            else:  # Unsupervised case
                self.ni.append(y[:, c].sum())
                self.prop.append(float(self.ni[c]) / n)
                self.mean.append(sp.average(x, weights=y[:, c], axis=0))
                X = (x - self.mean[c]) * sp.sqrt(y[:, c]).reshape(n, 1)

            if n >= d:  # Use dsyrk to take advantage of the symmetric product X^{t}X or XX^{t}
                cov = dsyrk(1.0 / (self.ni[c] - 1), X.T,
                            trans=False)  # Transpose to put in fortran order
            else:
                cov = dsyrk(1.0 / (self.ni[c] - 1), X.T,
                            trans=True)  # Transpose to put in fortran order
                self.X.append(X)

            X = None
            L, Q = linalg.eigh(
                cov, lower=False
            )  # Only the upper part of cov is initialized -> dsyrk
            idx = L.argsort()[::-1]
            L, Q = L[idx], Q[:, idx]
            L[L < EPS] = EPS  # Chek for numerical errors
            self.L.append(L)
            self.Q.append(Q)
            self.trace.append(cov.trace())
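Because dsyrk with the default lower=0 writes only the upper triangle, the eigh calls in fit_init must be told to read that triangle via lower=False; the default lower=True would read the untouched part and return wrong eigenvalues. A minimal demonstration of the pairing (illustrative data):

import numpy as np
from scipy.linalg import eigh
from scipy.linalg.blas import dsyrk

x = np.random.randn(10, 4)
xc = x - x.mean(axis=0)
cov = dsyrk(1.0 / (10 - 1), xc.T, trans=False)  # upper triangle of xc^T xc / (n-1)
vals = eigh(cov, eigvals_only=True, lower=False)
ref = eigh(np.cov(x, rowvar=False), eigvals_only=True)
assert np.allclose(vals, ref)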
Code example #20
File: hdda.py Project: mfauvel/HDDA
    def m_step(self, X):
        """M step of the algorithm

        This function computes the empirical estimators of the mean
        vector, the covariance matrix and the proportion of each
        class.

        """
        # Learn the model for each class
        C_ = self.C
        c_delete = []
        for c in xrange(self.C):
            ni = self.T[:, c].sum()
            # Check if empty
            if self.check_empty and \
               ni < self.population:
                C_ -= 1
                c_delete.append(c)
            else:
                self.ni.append(ni)
                self.prop.append(float(self.ni[-1])/self.n)
                self.mean.append(sp.dot(self.T[:, c].T, X)/self.ni[-1])
                X_ = (X-self.mean[-1])*(sp.sqrt(self.T[:, c])[:, sp.newaxis])

                # Use dsyrk to take advantage of the symmetric product
                if self.n >= self.d:
                    cov = dsyrk(1.0/(self.ni[-1]-1), X_.T, trans=False)
                else:
                    cov = dsyrk(1.0/(self.ni[-1]-1), X_.T, trans=True)
                    self.X.append(X_)
                X_ = None

                # Only the upper part of cov is initialized -> dsyrk
                L, Q = linalg.eigh(cov, lower=False)

                # Check for numerical errors
                L[L < EPS] = EPS
                if self.check_empty and (L.max() - L.min()) < EPS:
                    # In that case all eigenvalues are equal
                    # and this does not match the model
                    C_ -= 1
                    c_delete.append(c)
                    del self.ni[-1]
                    del self.prop[-1]
                    del self.mean[-1]
                    if self.n < self.d:
                        del self.X[-1]
                else:
                    idx = L.argsort()[::-1]
                    L, Q = L[idx], Q[:, idx]

                    self.L.append(L)
                    self.Q.append(Q)
                    self.trace.append(cov.trace())

        # Update T
        if c_delete:
            self.T = sp.delete(self.T, c_delete, axis=1)

        # Update the number of clusters
        self.C_.append(C_)
        self.C = C_

        # Estimation of the signal subspace for specific size subspace models
        if self.model in ('M1', 'M3', 'M5', 'M7'):
            for c in xrange(self.C):
                # Scree test
                dL, pi = sp.absolute(sp.diff(self.L[c])), 1
                dL /= dL.max()
                while sp.any(dL[pi:] > self.th):
                    pi += 1
                if (pi < (min(self.ni[c], self.d) - 1)) and (pi > 0):
                    self.pi.append(pi)
                else:
                    self.pi.append(1)
        elif self.model in ('M2', 'M4', 'M6', 'M8'):
            dL, p = self.dL, 1
            while sp.any(dL[p:] > self.th):
                p += 1
            min_dim = int(min(min(self.ni), self.d))
            # Check if (p >= ni-1 or d-1) and p > 0
            if p < (min_dim - 1):
                self.pi = [p for c in xrange(self.C)]
            else:
                self.pi = [max((min_dim-2), 1) for c in xrange(self.C)]
            del dL, p, idx

        # Estim signal part
        self.a = [sL[:sPI] for sL, sPI in zip(self.L, self.pi)]
        if self.model in ('M5', 'M6', 'M7', 'M8'):
            self.a = [sp.repeat(sA[:].mean(), sA.size) for sA in self.a]

        # Estim noise term
        if self.model in ('M1', 'M2', 'M5', 'M6'):
            # Noise free
            self.b = [(sT-sA.sum())/(self.d-sPI)
                      for sT, sA, sPI in zip(self.trace, self.a, self.pi)]
            # Check for very small value of b
            self.b = [b if b > EPS else EPS for b in self.b]

        elif self.model in ('M3', 'M4', 'M7', 'M8'):
            # Noise common
            denom = self.d - sp.sum([sPR*sPI
                                     for sPR, sPI in
                                     zip(self.prop, self.pi)])
            num = sp.sum([sPR*(sT-sA.sum())
                          for sPR, sT, sA in
                          zip(self.prop, self.trace, self.a)])

            # Check for very small values
            if num < EPS:
                self.b = [EPS for i in xrange(self.C)]
            elif denom < EPS:
                self.b = [1/EPS for i in xrange(self.C)]
            else:
                self.b = [num/denom for i in xrange(self.C)]

        # Compute the remaining parameters
        # Precompute logdet
        self.logdet = [(sp.log(sA).sum() + (self.d-sPI)*sp.log(sB))
                       for sA, sPI, sB in
                       zip(self.a, self.pi, self.b)]

        # Update the Q matrices
        if self.n >= self.d:
            self.Q = [sQ[:, :sPI]
                      for sQ, sPI in
                      zip(self.Q, self.pi)]
        else:
            self.Q = [sp.dot(sX.T, sQ[:, :sPI])/sp.sqrt(sL[:sPI])
                      for sX, sQ, sPI, sL in
                      zip(self.X, self.Q, self.pi, self.L)]

        # Compute the number of parameters of the model
        self.q = self.C*self.d + (self.C-1) + sum([sPI*(self.d-(sPI+1)/2)
                                                   for sPI in self.pi])
        # Number of noise subspaces
        if self.model in ('M1', 'M3', 'M5', 'M7'):
            self.q += self.C
        elif self.model in ('M2', 'M4', 'M6', 'M8'):
            self.q += 1
        # Size of signal subspaces
        if self.model in ('M1', 'M2'):
            self.q += sum(self.pi) + self.C
        elif self.model in ('M3', 'M4'):
            self.q += sum(self.pi) + 1
        elif self.model in ('M5', 'M6'):
            self.q += 2*self.C
        elif self.model in ('M7', 'M8'):
            self.q += self.C+1
Code example #21
File: hdda.py Project: artificyan/HDDA
    def m_step(self, X):
        """M step of the algorithm

        This function computes the empirical estimators of the mean
        vector, the covariance matrix and the proportion of each
        class.

        """
        # Learn the model for each class
        C_ = self.C
        c_delete = []
        for c in xrange(self.C):
            ni = self.T[:, c].sum()
            # Check if empty
            if self.check_empty and \
               ni < self.population:
                C_ -= 1
                c_delete.append(c)
            else:
                self.ni.append(ni)
                self.prop.append(float(self.ni[-1]) / self.n)
                self.mean.append(sp.dot(self.T[:, c].T, X) / self.ni[-1])
                X_ = (X - self.mean[-1]) * (sp.sqrt(self.T[:, c])[:,
                                                                  sp.newaxis])

                # Use dsyrk to take advantage of the symmetric product
                if self.n >= self.d:
                    cov = dsyrk(1.0 / (self.ni[-1] - 1), X_.T, trans=False)
                else:
                    cov = dsyrk(1.0 / (self.ni[-1] - 1), X_.T, trans=True)
                    self.X.append(X_)
                X_ = None

                # Only the upper part of cov is initialized -> dsyrk
                L, Q = linalg.eigh(cov, lower=False)

                # Check for numerical errors
                L[L < EPS] = EPS
                if self.check_empty and (L.max() - L.min()) < EPS:
                    # In that case all eigenvalues are equal
                    # and this does not match the model
                    C_ -= 1
                    c_delete.append(c)
                    del self.ni[-1]
                    del self.prop[-1]
                    del self.mean[-1]
                    if self.n < self.d:
                        del self.X[-1]
                else:
                    idx = L.argsort()[::-1]
                    L, Q = L[idx], Q[:, idx]

                    self.L.append(L)
                    self.Q.append(Q)
                    self.trace.append(cov.trace())

        # Update T
        if c_delete:
            self.T = sp.delete(self.T, c_delete, axis=1)

        # Update the number of clusters
        self.C_.append(C_)
        self.C = C_

        # Estimation of the signal subspace for specific size subspace models
        if self.model in ('M1', 'M3', 'M5', 'M7'):
            for c in xrange(self.C):
                # Scree test
                dL, pi = sp.absolute(sp.diff(self.L[c])), 1
                dL /= dL.max()
                while sp.any(dL[pi:] > self.th):
                    pi += 1
                if (pi < (min(self.ni[c], self.d) - 1)) and (pi > 0):
                    self.pi.append(pi)
                else:
                    self.pi.append(1)
        elif self.model in ('M2', 'M4', 'M6', 'M8'):
            dL, p = self.dL, 1
            while sp.any(dL[p:] > self.th):
                p += 1
            min_dim = int(min(min(self.ni), self.d))
            # Check if (p >= ni-1 or d-1) and p > 0
            if p < (min_dim - 1):
                self.pi = [p for c in xrange(self.C)]
            else:
                self.pi = [max((min_dim - 2), 1) for c in xrange(self.C)]
            del dL, p, idx

        # Estim signal part
        self.a = [sL[:sPI] for sL, sPI in zip(self.L, self.pi)]
        if self.model in ('M5', 'M6', 'M7', 'M8'):
            self.a = [sp.repeat(sA[:].mean(), sA.size) for sA in self.a]

        # Estim noise term
        if self.model in ('M1', 'M2', 'M5', 'M6'):
            # Noise free
            self.b = [(sT - sA.sum()) / (self.d - sPI)
                      for sT, sA, sPI in zip(self.trace, self.a, self.pi)]
            # Check for very small value of b
            self.b = [b if b > EPS else EPS for b in self.b]

        elif self.model in ('M3', 'M4', 'M7', 'M8'):
            # Noise common
            denom = self.d - sp.sum(
                [sPR * sPI for sPR, sPI in zip(self.prop, self.pi)])
            num = sp.sum([
                sPR * (sT - sA.sum())
                for sPR, sT, sA in zip(self.prop, self.trace, self.a)
            ])

            # Check for very small values
            if num < EPS:
                self.b = [EPS for i in xrange(self.C)]
            elif denom < EPS:
                self.b = [1 / EPS for i in xrange(self.C)]
            else:
                self.b = [num / denom for i in xrange(self.C)]

        # Compute the remaining parameters
        # Precompute logdet
        self.logdet = [(sp.log(sA).sum() + (self.d - sPI) * sp.log(sB))
                       for sA, sPI, sB in zip(self.a, self.pi, self.b)]

        # Update the Q matrices
        if self.n >= self.d:
            self.Q = [sQ[:, :sPI] for sQ, sPI in zip(self.Q, self.pi)]
        else:
            self.Q = [
                sp.dot(sX.T, sQ[:, :sPI]) / sp.sqrt(sL[:sPI])
                for sX, sQ, sPI, sL in zip(self.X, self.Q, self.pi, self.L)
            ]

        # Compute the number of parameters of the model
        self.q = self.C * self.d + (self.C - 1) + sum(
            [sPI * (self.d - (sPI + 1) / 2) for sPI in self.pi])
        # Number of noise subspaces
        if self.model in ('M1', 'M3', 'M5', 'M7'):
            self.q += self.C
        elif self.model in ('M2', 'M4', 'M6', 'M8'):
            self.q += 1
        # Size of signal subspaces
        if self.model in ('M1', 'M2'):
            self.q += sum(self.pi) + self.C
        elif self.model in ('M3', 'M4'):
            self.q += sum(self.pi) + 1
        elif self.model in ('M5', 'M6'):
            self.q += 2 * self.C
        elif self.model in ('M7', 'M8'):
            self.q += self.C + 1
Code example #22
File: hdda.py Project: artificyan/HDDA
    def fit(self, X, y=None):
        """Estimate the model parameters with the EM algorithm

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        Returns
        -------
        self
        """

        # Initialization
        n, d = X.shape
        self.n = n
        self.d = d
        LL = []
        ITER = 0

        X = check_array(X, copy=False, order='C', dtype=sp.float64)

        # Compute constant
        self.cst = self.d * sp.log(2 * sp.pi)

        # Set minimum clusters size
        # Rule of thumb for minimal size pi = 1:
        # one mean vector (d) + one eigenvalues/vectors (1 + d)
        # + noise term (1) ~ 2(d+1)
        if self.population is None:
            self.population = 2 * (self.d + 1)

        if self.population > self.n / self.C:
            print(
                "Number of classes to high w.r.t the number of samples:"
                "C should be deacreased")
            return -2

        # Initialization of the clustering
        if self.C == 1:
            self.T = sp.ones((self.n, 1))
        else:
            if self.init == 'kmeans':
                label = KMeans(n_clusters=self.C,
                               n_init=1,
                               n_jobs=-1,
                               random_state=self.random_state).fit(X).labels_
                label += 1  # Label starts at one
            elif self.init == 'random':
                sp.random.seed(self.random_state)
                label = sp.random.randint(1, high=self.C + 1, size=n)
            elif self.init == 'user':
                if self.C != y.max():
                    print(
                        "The number of class does not"
                        "match between self.C and y")
                label = y
            else:
                print("Initialization should be kmeans or random or user")
                return -2  # Bad init values

            # Convert label to membership
            self.T = sp.zeros((self.n, self.C))
            self.T[sp.arange(self.n), label - 1] = 1

        # Compute the whole covariance matrix and its eigenvalues if needed
        if self.model in ('M2', 'M4', 'M6', 'M8'):
            X_ = (X - sp.mean(X, axis=0))
            # Use dsyrk to take advantage of the symmetric product
            # X^{t}X or XX^{t}
            # Transpose to put in fortran order
            if self.n >= self.d:
                W = dsyrk(1.0 / self.n, X_.T, trans=False)
            else:
                W = dsyrk(1.0 / self.n, X_.T, trans=True)
            del X_

            # Compute intrinsic dimension on the whole data set
            L = linalg.eigh(W, eigvals_only=True, lower=False)
            idx = L.argsort()[::-1]
            L = L[idx]
            # Check for numerical errors
            L[L < EPS] = EPS
            self.dL = sp.absolute(sp.diff(L))
            self.dL /= self.dL.max()
            del W, L

        # Initialization of the parameter
        self.m_step(X)
        ll = self.e_step(X)
        LL.append(ll)

        # Main while loop
        while (ITER < self.itermax):
            # M step
            self.free()
            self.m_step(X)

            # E step
            ll = self.e_step(X)

            LL.append(ll)
            if (abs((LL[-1]-LL[-2])/LL[-2]) < self.tol) and \
               (self.C_[-2] == self.C_[-1]):
                break
            else:
                ITER += 1

        # Return the class membership and some parameters of the optimization
        self.LL = LL
        self.bic = -2 * LL[-1] + self.q * sp.log(self.n)
        self.aic = -2 * LL[-1] + 2 * self.q
        # Add small constant to ICL to prevent numerical issues
        self.icl = self.bic - 2 * sp.log(self.T.max(axis=1) + EPS).sum()
        self.niter = ITER + 1

        # Remove temporary variables
        self.T = None
        self.X = None
        return self