def divide(self, b, mode):
    if mode == 1:
        y = sci.lstsq(self.matrix, b)
    else:
        y = sci.lstsq(self.matrix.conj().T, b)
    return y[0]
def solveQ3(trainData, testData, columnLabel): target = trainingData[:,dataMEDVCol:dataMEDVCol+1] targetTest = testingData[:,dataMEDVCol:dataMEDVCol+1] Ones = np.ones((len(target),1)) # Fitting the parameters: theta = (X'*X)^-1*X'*y Xtrain = np.hstack((Ones, trainData.reshape(len(Ones),1))) mTheta = lstsq(Xtrain, target)[0] target_pred = dot(Xtrain, mTheta) t = target-target_pred msePred = sum((target-target_pred)**2)/len(target) meanTarget = sum(target)/len(target) varianceTarget = sum((target-meanTarget)**2)/len(target) FVU = msePred/varianceTarget Xtest = np.hstack((Ones, testData.reshape(len(Ones),1))) mThetaTest = lstsq(Xtest, targetTest)[0] # use theta from training set, not from testing set target_pred_test = dot(Xtest, mTheta) msePredTest = sum((targetTest-target_pred_test)**2)/len(targetTest) meanTargetTest = sum(targetTest)/len(targetTest) varianceTargetTest = sum((targetTest-meanTargetTest)**2)/len(targetTest) FVUTest = msePredTest/varianceTargetTest print '###',columnLabel,'###' print 'MSE training set:', msePred print 'MSE testing set:', msePredTest print 'R2 of testing set against theta from training set:', 1 - FVUTest,'\n'
def backgroundCorrectPSFWF(d):
    import numpy as np
    from scipy import linalg

    zf = d.shape[2] // 2

    # subtract a linear background in x
    Ax = np.vstack([np.ones(d.shape[0]), np.arange(d.shape[0])]).T
    bgxf = (d[0, :, zf] + d[-1, :, zf]) / 2
    gx = linalg.lstsq(Ax, bgxf)[0]
    d = d - np.dot(Ax, gx)[:, None, None]

    # do the same in y
    Ay = np.vstack([np.ones(d.shape[1]), np.arange(d.shape[1])]).T
    bgyf = (d[:, 0, zf] + d[:, -1, zf]) / 2
    gy = linalg.lstsq(Ay, bgyf)[0]
    d = d - np.dot(Ay, gy)[None, :, None]

    # estimate background on central slice as mean of rim pixels
    # bgr = (d[0,:,zf].mean() + d[-1,:,zf].mean() + d[:,0,zf].mean() + d[:,-1,zf].mean())/4

    # sum over all pixels (and hence mean) should be preserved over z (for widefield psf)
    dm = d.mean(1).mean(0)
    bg = dm - dm[zf]

    return np.maximum(d - bg[None, None, :], 0) + 1e-5
def partial_corr(C):
    """
    Returns the sample linear partial correlation coefficients between pairs of
    variables in C, controlling for the remaining variables in C.

    Parameters
    ----------
    C : array-like, shape (n, p)
        Array with the different variables. Each column of C is taken as a variable.

    Returns
    -------
    P : array-like, shape (p, p)
        P[i, j] contains the partial correlation of C[:, i] and C[:, j]
        controlling for the remaining variables in C.
    """
    C = np.asarray(C)
    p = C.shape[1]
    P_corr = np.zeros((p, p), dtype=float)
    for i in range(p):
        P_corr[i, i] = 1
        for j in range(i + 1, p):
            idx = np.ones(p, dtype=bool)
            idx[i] = False
            idx[j] = False
            beta_i = linalg.lstsq(C[:, idx], C[:, j])[0]
            beta_j = linalg.lstsq(C[:, idx], C[:, i])[0]

            res_j = C[:, j] - C[:, idx].dot(beta_i)
            res_i = C[:, i] - C[:, idx].dot(beta_j)

            corr = stats.pearsonr(res_i, res_j)[0]
            P_corr[i, j] = corr
            P_corr[j, i] = corr
    return P_corr
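# A minimal, self-contained usage sketch for the regression-based partial correlation
# above (not part of the original code): on synthetic data where x and y are both
# driven by a common confounder z, the partial correlation controlling for z should
# be much smaller in magnitude than the raw Pearson correlation.
import numpy as np
from scipy import linalg, stats

rng = np.random.RandomState(0)
n = 500
z = rng.randn(n)
x = 2.0 * z + 0.5 * rng.randn(n)
y = -1.5 * z + 0.5 * rng.randn(n)

C = np.column_stack([x, y, z])
raw_r = stats.pearsonr(x, y)[0]
partial_r = partial_corr(C)[0, 1]   # controls for z
print(raw_r, partial_r)             # |partial_r| << |raw_r| is expected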
def pcorr(C, k):
    val = list(C.columns.values)
    C[u'ones'] = np.ones(C.shape[0])
    C = np.asarray(C)
    p = C.shape[1]
    P_corr = np.zeros((p - 1, p - 1), dtype=float)
    idx = np.zeros(p, dtype=bool)
    for kk in k:
        idx[kk] = True
    idx[p - 1] = True
    for i in range(p - 1):
        P_corr[i, i] = 1
        for j in range(i + 1, p - 1):
            beta_i = linalg.lstsq(C[:, idx], C[:, i])[0]
            beta_j = linalg.lstsq(C[:, idx], C[:, j])[0]

            res_j = C[:, j] - C[:, idx].dot(beta_j)
            res_i = C[:, i] - C[:, idx].dot(beta_i)

            corr = stats.pearsonr(res_i, res_j)[0]
            P_corr[i, j] = corr
            P_corr[j, i] = corr
    p = pd.DataFrame(P_corr, index=val, columns=val)
    return p
def pcorParallel(X, Z, Y=None):
    """ computes the correlation matrix between X and Y conditioning on Z """
    if Y is None:
        return pcorParallelSym(X, Z)
    if Z is None:
        return corrParallel(X, Y)

    if Z.ndim == 1:
        Z = Z[SP.newaxis, :]

    X = X.T
    Y = Y.T
    Z = Z.T

    beta, _, _, _ = LA.lstsq(Z, Y)
    Yres = Y - SP.dot(Z, beta)

    beta, _, _, _ = LA.lstsq(Z, X)
    Xres = X - SP.dot(Z, beta)

    nSamples = Z.shape[0]
    nCovs = Z.shape[1]
    df = max(nSamples - 2 - nCovs, 0)

    return corrParallel(Xres.T, Yres.T, df=df)
def process(self, data_in, obs_vec): """ Generate function network model. :param data: Training data matrix :math:`\mathcal{X}\in\mathbb{R}^{d\\times n}` :type data: numpy array :param obs_vec: Observation vector :math:`y\in\mathbb{R}^{1 \\times n}` :type obs_vec: numpy array :return: none :rtype: none """ # check consistency of data obs_num = obs_vec.shape[1] data_num = data_in.shape[1] if obs_num != data_num: raise Exception("Number of samples for data and observations must be the same") else: # initialize variables self.data = data_in self.data_dim = data_in.shape[0] nsamp = data_num # peel off parameters ki = self.k_type bandwidth = ki.params[0] # compute regularized kernel matrix kmat = kernel(self.data, self.data, self.k_type) + (pow(self.noise,2))*eye(nsamp) # perform Cholesky factorization, and compute mean vector (for stable inverse computations) self.lmat = cholesky(kmat).transpose() self.mean_vec = lstsq(self.lmat, obs_vec) self.mean_vec = lstsq(self.lmat.transpose(), self.mean_vec)
def pcor(X, Y, Z):
    """ computes the correlation matrix of X and Y conditioning on Z """
    if X.ndim == 1:
        X = X[:, SP.newaxis]
    if Y.ndim == 1:
        Y = Y[:, SP.newaxis]
    if Z is None:
        return STATS.pearsonr(X, Y)
    if Z.ndim == 1:
        Z = Z[:, SP.newaxis]

    nSamples = X.shape[0]
    betaX, _, _, _ = LA.lstsq(Z, X)
    betaY, _, _, _ = LA.lstsq(Z, Y)
    Xres = X - SP.dot(Z, betaX)
    Yres = Y - SP.dot(Z, betaY)

    corr_cond = SP.corrcoef(Xres[:, 0], Yres[:, 0])[0, 1]
    dz = Z.shape[1]  # dimension of conditioning variable
    df = max(nSamples - dz - 2, 0)  # degrees of freedom

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        tstat = corr_cond / SP.sqrt(1.0 - corr_cond ** 2)  # calculate t statistic
        tstat = math.sqrt(df) * tstat
        pv_cond = 2 * t.cdf(-abs(tstat), df, loc=0, scale=1)  # calculate p value
    return corr_cond, pv_cond
def bilinear_least_squares(X, y, b0=None, n_iter=10, fit_intercept=True): """assumes X.shape = n_samples, n_matrices, h, wi and does linear regression as a sum of rank 1 matrices""" if X.ndim == 3: X = X[:, np.newaxis] n_samples, n_matrices, n_feat_a, n_feat_b = X.shape if b0 is None: b0 = np.ones((n_matrices, n_feat_b)) / n_feat_b b = b0.copy() if fit_intercept: X_mean, y_mean = X.mean(0), y.mean() X = X - X_mean y = y - y_mean for i in range(n_iter): a_estimation_matrix = np.einsum( "ijkl, jl -> ijk", X, b).reshape(n_samples, -1) a = lstsq(a_estimation_matrix, y)[0].reshape(n_matrices, n_feat_a) b_estimation_matrix = np.einsum( "ijkl, jk -> ijl", X, a).reshape(n_samples, -1) b = lstsq(b_estimation_matrix, y)[0].reshape(n_matrices, n_feat_b) if fit_intercept: intercept = y_mean - np.einsum("jkl, jk, jl", X_mean, a, b) return a, b, intercept return a, b
def divide(self, x, mode):
    if mode == 1:
        y = linalg.lstsq(self.diag, x)
    else:
        y = linalg.lstsq(self.diag.conj().T, x)
    return y[0]
def partial_corr(C): """ Partial Correlation in Python (clone of Matlab's partialcorr) from https://gist.github.com/fabianp/9396204419c7b638d38f This uses the linear regression approach to compute the partial correlation (might be slow for a huge number of variables). The algorithm is detailed here: http://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression Taking X and Y two variables of interest and Z the matrix with all the variable minus {X, Y}, the algorithm can be summarized as 1) perform a normal linear least-squares regression with X as the target and Z as the predictor 2) calculate the residuals in Step #1 3) perform a normal linear least-squares regression with Y as the target and Z as the predictor 4) calculate the residuals in Step #3 5) calculate the correlation coefficient between the residuals from Steps #2 and #4; The result is the partial correlation between X and Y while controlling for the effect of Z Returns the sample linear partial correlation coefficients between pairs of variables in C, controlling for the remaining variables in C. Parameters ---------- C : array-like, shape (n, p) Array with the different variables. Each column of C is taken as a variable Returns ------- P : array-like, shape (p, p) P[i, j] contains the partial correlation of C[:, i] and C[:, j] controlling for the remaining variables in C. """ C = np.asarray(C) p = C.shape[1] P_corr = np.zeros((p, p), dtype=np.float) for i in range(p): P_corr[i, i] = 1 for j in range(i + 1, p): idx = np.ones(p, dtype=np.bool) idx[i] = False idx[j] = False beta_i = linalg.lstsq(C[:, idx], C[:, j])[0] beta_j = linalg.lstsq(C[:, idx], C[:, i])[0] res_j = C[:, j] - C[:, idx].dot(beta_i) res_i = C[:, i] - C[:, idx].dot(beta_j) corr = stats.pearsonr(res_i, res_j)[0] P_corr[i, j] = corr P_corr[j, i] = corr return P_corr
def trialFunFit_constrained(self, s, arr, alphas, pairs, zerostart=False): deg = len(alphas) carr = np.concatenate((arr.real, arr.imag)) # construct matrix for extended fitting problem A = np.concatenate((1. / (s[:,None] + alphas[None,:]), \ arr[:,None] / (s[:,None] + alphas[None,:])), axis=1) # implement the constraint pairsnew = np.concatenate((pairs, pairs)) for i, p in enumerate(pairsnew): if p: x1 = A[:,i] + A[:,i+1] x2 = 1j * (A[:,i] - A[:,i+1]) A[:,i] = x1 A[:,i+1] = x2 A = np.concatenate((A.real, A.imag), axis=0) # find auxiliary residues c = la.lstsq(A, carr)[0][-len(alphas):] # find zeros of fitted auxiliary function a = np.diag(alphas) b = np.ones(deg) # implement similarity transform for i, p in enumerate(pairs): if p: a[i:i+2, i:i+2] = np.array([[alphas[i].real, alphas[i].imag], \ [-alphas[i].imag, alphas[i].real]]) b[i:i+2] = np.array([2,0]) H = a.real - np.dot(b[:,None], c[None,:]) alphanew = np.linalg.eig(H)[0] inds = np.argsort(alphanew) alphanew = alphanew[inds] # indicates where pairs of complex conjugate poles occur auxarr = np.abs((np.abs(alphanew[:-1]) - np.abs(alphanew[1:])) / np.abs(alphanew[:-1])) auxarr2 = np.abs(alphas.imag) > 1e-15 pairs = np.logical_and(np.concatenate((auxarr < 1e-15, np.zeros(1, dtype=bool))), auxarr2) # find residues Anew = 1. / (s[:,None] + alphanew[None,:]) for i, p in enumerate(pairs): if p: x1 = Anew[:,i] + Anew[:,i+1] x2 = 1j * (Anew[:,i] - Anew[:,i+1]) Anew[:,i] = x1 Anew[:,i+1] = x2 Anew = np.concatenate((Anew.real, Anew.imag), axis=0) if zerostart: # enforce K(t=0)=0 constraint row1 = np.ones(2*deg) for i, p in enumerate(pairs): if p: row1[i+1] = 0 Anew = np.concatenate((np.ones((1, deg), dtype=complex), Anew), axis=0) carr = np.concatenate((np.zeros(1, dtype=complex), carr)) cnew = la.lstsq(Anew, carr)[0] cnew = np.array(cnew, dtype=complex) # recast cnew to complex values for i, p in enumerate(pairs): if p: cnew[i:i+2] = np.array([cnew[i] + 1j * cnew[i+1], cnew[i] - 1j * cnew[i+1]]) return alphanew, cnew, pairs
def time_lstsq(self, dtype, size, lapack_driver):
    if lapack_driver == 'numpy':
        np.linalg.lstsq(self.A, self.b,
                        rcond=np.finfo(self.A.dtype).eps * 100)
    else:
        sl.lstsq(self.A, self.b, cond=None, overwrite_a=False,
                 overwrite_b=False, check_finite=False,
                 lapack_driver=lapack_driver)
def fit_values(self, s, x, damp=0.0):
    Phi = complete_polynomial(s.T, self.d).T
    self.Phi = Phi

    if damp == 0.0:
        self.coefs = np.ascontiguousarray(lstsq(Phi, x)[0])
    else:
        new_coefs = np.ascontiguousarray(lstsq(Phi, x)[0])
        self.coefs = (1 - damp) * new_coefs + damp * self.coefs
def _unscented_correct(cross_sigma, mu_pred, sigma2_pred, obs_mu_pred, obs_sigma2_pred, z): """Correct predicted state estimates with an observation Parameters ---------- cross_sigma : [n_dim_state, n_dim_obs] array cross-covariance between the state at time t given all observations from timesteps [0, t-1] and the observation at time t mu_pred : [n_dim_state] array mean of state at time t given observations from timesteps [0, t-1] sigma2_pred : [n_dim_state, n_dim_state] array square root of covariance of state at time t given observations from timesteps [0, t-1] obs_mu_pred : [n_dim_obs] array mean of observation at time t given observations from times [0, t-1] obs_sigma2_pred : [n_dim_obs] array square root of covariance of observation at time t given observations from times [0, t-1] z : [n_dim_obs] array observation at time t Returns ------- mu_filt : [n_dim_state] array mean of state at time t given observations from time steps [0, t] sigma2_filt : [n_dim_state, n_dim_state] array square root of covariance of state at time t given observations from time steps [0, t] """ n_dim_state = len(mu_pred) n_dim_obs = len(obs_mu_pred) if not np.any(ma.getmask(z)): ############################################## # Same as this, but more stable (supposedly) # ############################################## # K = cross_sigma.dot( # linalg.pinv( # obs_sigma2_pred.T.dot(obs_sigma2_pred) # ) # ) ############################################## # equivalent to this MATLAB code # K = (cross_sigma / obs_sigma2_pred.T) / obs_sigma2_pred K = linalg.lstsq(obs_sigma2_pred, cross_sigma.T)[0] K = linalg.lstsq(obs_sigma2_pred.T, K)[0] K = K.T # correct mu, sigma mu_filt = mu_pred + K.dot(z - obs_mu_pred) U = K.dot(obs_sigma2_pred) sigma2_filt = cholupdate(sigma2_pred, U.T, -1.0) else: # no corrections to be made mu_filt = mu_pred sigma2_filt = sigma2_pred return (mu_filt, sigma2_filt)
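# A small numeric check (not part of the original filter code) that the two chained
# lstsq calls above implement the MATLAB expression quoted in the comment,
# K = (cross_sigma / obs_sigma2_pred.T) / obs_sigma2_pred, i.e.
# K = cross_sigma @ inv(S.T) @ inv(S) for an invertible square S.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
n_state, n_obs = 4, 3
cross_sigma = rng.randn(n_state, n_obs)
S = rng.randn(n_obs, n_obs) + 3 * np.eye(n_obs)   # stand-in for obs_sigma2_pred

K = linalg.lstsq(S, cross_sigma.T)[0]
K = linalg.lstsq(S.T, K)[0]
K = K.T

K_ref = cross_sigma @ np.linalg.inv(S.T) @ np.linalg.inv(S)
print(np.allclose(K, K_ref))   # expected: True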
def solveQ5(trainData, testData, columnLabel): target = trainingData[:,dataMEDVCol:dataMEDVCol+1] targetTest = testingData[:,dataMEDVCol:dataMEDVCol+1] Ones = np.ones((len(target),1)) # Fitting the parameters: theta = (X'*X)^-1*X'*y Xtrain = np.hstack((Ones, trainData.reshape(len(Ones),1))) #firstCol = columnRM**2 #secondCol = columnLSTAT**2 #thirdCol = columnB**2 #fourthCol = columnZN**2 firstCol = trainData secondCol = trainData**2 thirdCol = trainData**3 fourthCol = trainData**4 Xtrain = np.hstack((Xtrain, firstCol.reshape(len(Xtrain),1))) Xtrain = np.hstack((Xtrain, secondCol.reshape(len(Xtrain),1))) Xtrain = np.hstack((Xtrain, thirdCol.reshape(len(Xtrain),1))) Xtrain = np.hstack((Xtrain, fourthCol.reshape(len(Xtrain),1))) mTheta = lstsq(Xtrain, target)[0] target_pred = dot(Xtrain, mTheta) msePred = sum((target-target_pred)**2)/len(target) meanTarget = sum(target)/len(target) varianceTarget = sum((target-meanTarget)**2)/len(target) FVU = msePred/varianceTarget Xtest = np.hstack((Ones, testData.reshape(len(Ones),1))) #firstCol = columnTestRM**2 #secondCol = columnTestLSTAT**2 #thirdCol = columnTestB**2 #fourthCol = columnTestZN**2 firstCol = testData secondCol = testData**2 thirdCol = testData**3 fourthCol = testData**4 Xtest = np.hstack((Xtest, firstCol.reshape(len(Xtest),1))) Xtest = np.hstack((Xtest, secondCol.reshape(len(Xtest),1))) Xtest = np.hstack((Xtest, thirdCol.reshape(len(Xtest),1))) Xtest = np.hstack((Xtest, fourthCol.reshape(len(Xtest),1))) mThetaTest = lstsq(Xtest, targetTest)[0] target_pred_test = dot(Xtest, mTheta) msePredTest = sum((targetTest-target_pred_test)**2)/len(targetTest) meanTargetTest = sum(targetTest)/len(targetTest) varianceTargetTest = sum((targetTest-meanTargetTest)**2)/len(targetTest) FVUTest = msePredTest/varianceTargetTest print '###',columnLabel,'###' print 'MSE training set:', msePred print 'MSE testing set:', msePredTest print 'R2 of testing set against theta from training set:', 1 - FVUTest,'\n'
def partial_corr(X,Y,Z): """ Partial Correlation in Python (clone of Matlab's partialcorr) But Returns only one partial correlation value. This uses the linear regression approach to compute the partial correlation (might be slow for a huge number of variables). The algorithm is detailed here: http://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression Taking X and Y two variables of interest and Z the matrix with all the variable minus {X, Y}, the algorithm can be summarized as 1) perform a normal linear least-squares regression with X as the target and Z as the predictor 2) calculate the residuals in Step #1 3) perform a normal linear least-squares regression with Y as the target and Z as the predictor 4) calculate the residuals in Step #3 5) calculate the correlation coefficient between the residuals from Steps #2 and #4; The result is the partial correlation between X and Y while controlling for the effect of Z Returns the sample linear partial correlation coefficient between X and Y controlling for Z. Parameters ---------- X : vector (length n) Y : vector (length n) Z : array-like, shape (n, p) where p are the variables to control for Returns ------- pcorr : float - partial correlation between X and Y controlling for Z Adapted from https://gist.github.com/fabianp/9396204419c7b638d38f to return one value instead of partial correlation matrix """ ## regress covariates on both X and Y beta_x = linalg.lstsq(Z, X)[0] beta_y = linalg.lstsq(Z, Y)[0] ## take residuals of above regression res_x = X - Z.dot(beta_x) res_y = Y - Z.dot(beta_y) ## correlate the residuals to get partial corr pcorr = stats.pearsonr(res_x, res_y)[0] ## return the partial correlation return pcorr
def train_(self, d): if type(self.heog) == str: self.heog = d.feat_lab[0].index(self.heog) if type(self.veog) == str: self.veog = d.feat_lab[0].index(self.veog) if type(self.reog) == str: self.reog = d.feat_lab[0].index(self.reog) self.eog = set([self.heog, self.veog, self.reog]) if self.eeg is None: self.eeg = set(range(d.nfeatures)) - self.eog else: self.eeg = set([d.feat_lab[0].index(ch) if type(ch) == str else ch for ch in self.eeg]) s = get_samplerate(d) # Extract EOG trials d_sliced = slice(d, self.mdict, (int(-1*s), int(1.5*s))) # Average the trials and baseline them d_erp = erp(d_sliced, enforce_equal_n=False) #d_erp = baseline(d_erp, (0, int(0.5*s))) d_erp = baseline(d_erp, (0, int(2.5*s))) # Concatenate blink trials and eye movement trials d_blink = concatenate_trials(d_erp[0]) d_movement = concatenate_trials(d_erp[1:]) # Calculate Bh and Bv v1 = np.vstack(( np.ones(d_movement.ninstances), d_movement.data[self.heog,:], d_movement.data[self.veog,:] )).T coeff1,_,_,_ = linalg.lstsq(v1,d_movement.data.T) self.Bh = coeff1[1,:] self.Bv = coeff1[2,:] # Remove HEOG and VEOG from the blink data corr1 = np.zeros(d_blink.data.T.shape) for channel in range(d_blink.nfeatures): corr1[:, channel] = d_blink.data[channel,:] - d_blink.data[self.heog,:]*self.Bh[channel] - d_blink.data[self.veog,:]*self.Bv[channel] # Calculate Br v2 = np.vstack(( np.ones(d_blink.ninstances), corr1[:,self.reog] )).T coeff2,_,_,_ = linalg.lstsq(v2, corr1) self.Br = coeff2[1,:]
def matrix_factor_ALS(matrix, dim, num_iters):
    # initialization of two factors: small random (between -1 and 1)
    # NOTE: scipy sparse linalg lstsq converts matrices to numpy arrays anyway
    # So we just initialize factors as numpy arrays to save the trouble
    factor1 = np.random.random((matrix.shape[0], dim))
    factor2 = np.random.random((dim, matrix.shape[0]))
    for iteration in range(num_iters):
        # solve 2 least squares problems
        # one fixing the second factor and solving for the first
        # then fix first factor and solve for the second
        factor1 = lstsq(factor2.transpose(), matrix.A)[0]
        factor2 = lstsq(factor1.transpose(), matrix.A)[0]
    return sp.csr_matrix(factor1), sp.csr_matrix(factor2)
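# An independent, minimal illustration of the alternating-least-squares idea used
# above (it does not call matrix_factor_ALS): approximate M (n x m) by W @ H with
# W (n x k) and H (k x m), alternating two ordinary least-squares solves via lstsq.
import numpy as np
from scipy.linalg import lstsq

rng = np.random.RandomState(0)
n, m, k = 30, 20, 3
M = rng.rand(n, k) @ rng.rand(k, m)          # exactly rank-k test matrix

W = rng.rand(n, k)
H = rng.rand(k, m)
for _ in range(20):
    # fix W, solve min_H ||W H - M||_F (each column of M is an independent problem)
    H = lstsq(W, M)[0]
    # fix H, solve min_W ||H.T W.T - M.T||_F, i.e. regress M.T on H.T
    W = lstsq(H.T, M.T)[0].T

print(np.linalg.norm(W @ H - M))             # residual should shrink toward zero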
def execute(self, bem):
    """ Compute potential unknown data (gradients for the free surface,
    and potentials for the other ones).
    @param bem Boundary Element Method instance.
    """
    [bem['Ap'], residues, rank, s] = la.lstsq(bem['A'], bem['B'])
    if(rank < bem['N']):
        FreeCAD.Console.PrintError("\t\t[Sim]: Solving velocity potentials.\n")
        FreeCAD.Console.PrintError("\t\t\tEffective rank of linear system matrix is %i (N = %i)\n" % (rank, bem['N']))
    [bem['Adp'], residues, rank, s] = la.lstsq(bem['A'], bem['dB'])
    if(rank < bem['N']):
        FreeCAD.Console.PrintError("\t\t[Sim]: Solving acceleration potentials.\n")
        FreeCAD.Console.PrintError("\t\t\tEffective rank of linear system matrix is %i (N = %i)\n" % (rank, bem['N']))
def trialFunFit(self, s, arr, alphas, pairs=None):
    # construct matrix for extended fitting problem
    A = np.concatenate((1. / (s[:, None] + alphas[None, :]),
                        arr[:, None] / (s[:, None] + alphas[None, :])), axis=1)
    # find auxiliary residues
    c = la.lstsq(A, arr)[0][-len(alphas):]
    # find zeros of fitted auxiliary function
    H = np.diag(alphas) - np.dot(np.ones((len(alphas), 1), dtype=complex), c[None, :])
    alphanew = np.linalg.eig(H)[0]
    # find real residues
    Anew = 1. / (s[:, None] + alphanew[None, :])
    cnew = la.lstsq(Anew, arr)[0]
    return alphanew, cnew, None
def partial_corr(C, verbose=0): """ Returns the sample linear partial correlation coefficients between pairs of variables in C, controlling for the remaining variables in C. Parameters ---------- C : array-like, shape (n, p) Array with the different variables. Each column of C is taken as a variable Returns ------- P : array-like, shape (p, p) P[i, j] contains the partial correlation of C[:, i] and C[:, j] controlling for the remaining variables in C. """ C = np.asarray(C) C = C[~np.isnan(C.sum(1))] p = C.shape[1] P_corr = np.zeros((p, p), dtype=np.float) if verbose >= 1: print("Looping over %d variables..." % p) for i in range(p): if i % 25 == 24: print("\tLoop %d of %d" % (i + 1, p)) P_corr[i, i] = 1 for j in range(i+1, p): idx = np.ones(p, dtype=np.bool) idx[i] = False idx[j] = False #beta_i = OLS(C[:, idx], C[:, j]).fit().params.squeeze() #beta_j = OLS(C[:, idx], C[:, i]).fit().params.squeeze() beta_i = linalg.lstsq(C[:, idx], C[:, j])[0] beta_j = linalg.lstsq(C[:, idx], C[:, i])[0] res_j = C[:, j] - C[:, idx].dot(beta_i) res_i = C[:, i] - C[:, idx].dot(beta_j) corr = stats.pearsonr(res_i, res_j)[0] P_corr[i, j] = corr P_corr[j, i] = corr if verbose >= 1: print("Done.") return P_corr
def test_for_CI(G, n1, n2, normalized_data, order, alpha): """Test if n1 and n2 are conditionally independent. If they are not return None, else return the conditional independence set. """ N = normalized_data.shape[1] ones = numpy.ones((N,1), dtype=float) n1_resp = normalized_data[n1,:] n1_neighbors = set(G.neighbors(n1)) n2_resp = normalized_data[n2,:] n2_neighbors = set(G.neighbors(n2)) common_neighbors = n1_neighbors.intersection(n2_neighbors) - set((n1, n2)) # if there aren't enough neighbors common to n1 and n2, return none if len(common_neighbors) < order: return None min_score = 1e100 best_p_val = None best_neighbors = None n_common_neighbors = 0 for covariates in combinations(common_neighbors, order): n_common_neighbors += 1 predictors = numpy.hstack( (ones, normalized_data[numpy.array(covariates),:].T)) # test if node is independent of neighbors given for some subset rv1, _, _, _ = lstsq(predictors, n1_resp) rv2, _, _, _ = lstsq(predictors, n2_resp) cor, pval = pearsonr(n1_resp - rv1.dot(predictors.T), n2_resp - rv2.dot(predictors.T)) if abs(cor) < min_score: min_score = abs(cor) best_neighbors = covariates best_p_val = pval # make the multiple testing correction /n_common_neighbors if best_p_val < alpha/n_common_neighbors: return None #score = math.sqrt(N-order-3)*0.5*math.log((1+cor)/(1-cor)) #print abs(score), norm.isf(alpha/(len(neighbors)*2)), cor, pval #if abs(score) < norm.isf(alpha/(len(neighbors)*2)): # make the multiple testing correction /n_common_neighbors if best_p_val < alpha/n_common_neighbors: return None else: return best_neighbors
def preProcess(u,y,NumDict): NumInputs = u.shape[0] NumOutputs = y.shape[0] NumRows = NumDict['Rows'] NumCols = NumDict['Columns'] NSig = NumDict['Dimension'] UPast,UFuture = getHankelMatrices(u,NumRows,NumCols) YPast,YFuture = getHankelMatrices(y,NumRows,NumCols) Data = np.vstack((UPast,UFuture,YPast)) L = la.lstsq(Data.T,YFuture.T)[0].T Z = np.dot(L,Data) DataShift = np.vstack((UPast,UFuture[NumInputs:],YPast)) LShift = la.lstsq(DataShift.T,YFuture[NumOutputs:].T)[0].T ZShift = np.dot(LShift,DataShift) L1 = L[:,:NumInputs*NumRows] L3 = L[:,2*NumInputs*NumRows:] LPast = np.hstack((L1,L3)) DataPast = np.vstack((UPast,YPast)) U, S, Vt = la.svd(np.dot(LPast,DataPast)) Sig = np.diag(S[:NSig]) SigRt = np.diag(np.sqrt(S[:NSig])) Gamma = np.dot(U[:,:NSig],SigRt) GammaLess = Gamma[:-NumOutputs] GammaPinv = la.pinv(Gamma) GammaLessPinv = la.pinv(GammaLess) GamShiftSolve = la.lstsq(GammaLess,ZShift)[0] GamSolve = la.lstsq(Gamma,Z)[0] GamData = np.vstack((GamSolve,UFuture)) GamYData = np.vstack((GamShiftSolve,YFuture[:NumOutputs])) # Should probably move to a better output structure # One that doesn't depent so heavily on ordering GammaDict = {'Data':GamData, 'DataLess':GammaLess, 'DataY':GamYData, 'Pinv': GammaPinv, 'LessPinv': GammaLessPinv} return GammaDict,S
def fgmres(self,rhs,tol=1e-6,restrt=None,maxiter=None,callback=None): if maxiter == None: maxiter = len(rhs) if restrt == None: restrt = 2*maxiter # implemented as in [Saad, 1993] # start x = zeros(len(rhs)) H = zeros((restrt+1, restrt)) V = zeros((len(rhs),restrt)) Z = zeros((len(rhs),restrt)) # Arnoldi process (with modified Gramm-Schmidt) res = 1. j = 0 r = rhs - self.point.matvec(x) beta = norm(r) V[:,0]=r/beta while j < maxiter and res > tol: Z[:,j] = self.point.psolve(V[:,j]) w = self.point.matvec(Z[:,j]) for i in range(j+1): H[i,j]=dot(w,V[:,i]) w = w - H[i,j]*V[:,i] H[j+1,j] = norm(w) V[:,j+1]=w/H[j+1,j] e = zeros(j+2) e[0]=1. y, res, rank, sing_val = lstsq(H[:j+2,:j+1],beta*e) j += 1 print "# GMRES| iteration :", j, "res: ", res/beta self.resid = r_[self.resid,res/beta] Zy = dot(Z[:,:j],y) x = x + Zy info = 1 return (x,info)
def _train(self, data): """Train the classifier using `data` (`Dataset`). """ if self.__implementation == "direct": # create matrices to solve with additional penalty term # determine the lambda matrix if self.__lm is None: # Not specified, so calculate based on .05*nfeatures Lambda = .05*data.nfeatures*np.eye(data.nfeatures) else: # use the provided penalty Lambda = self.__lm*np.eye(data.nfeatures) # add the penalty term a = np.concatenate( \ (np.concatenate((data.samples, np.ones((data.nsamples, 1))), 1), np.concatenate((Lambda, np.zeros((data.nfeatures, 1))), 1))) b = np.concatenate((data.sa[self.get_space()].value, np.zeros(data.nfeatures))) # perform the least sq regression and save the weights self.w = lstsq(a, b)[0] else: raise ValueError, "Unknown implementation '%s'" \ % self.__implementation
def process(self, data, obs_vec):
    """
    Solve optimization problem to get :math:`\\alpha`.

    :param phi_mat: Training data matrix :math:`\Phi\in\mathbb{R}^{d\\times n}`
    :type phi_mat: numpy array
    :param obs_vec: Observation vector :math:`y\in\mathbb{R}^{n}`
    :type obs_vec: numpy array
    :return: `alpha`: estimated state vector :math:`\\alpha\in\mathbb{R}^{d}`
    :rtype: numpy array
    """
    # check consistency of data
    obs_num = obs_vec.shape[0]
    data_num = data.shape[0]
    if obs_num == data_num:
        self.data = data

        # take into account both options
        if self.lam == 0:
            k_mat = kernel(data, data, self.k_type)
        else:
            dim = data.shape[1]
            k_mat = kernel(data, data, self.k_type) + self.lam * eye(dim)

        self.alpha = lstsq(k_mat, obs_vec.transpose())[0]
        return self.alpha
    else:
        print "ERROR: number of samples for data and observations must be the same"
def polynomialFit(x, y, order):
    X = np.array([[xi ** i for i in range(order + 1)] for xi in x])
    Y = np.array(y).reshape((-1, 1))
    # W = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y)
    W, _, _, _ = linalg.lstsq(X, Y)
    # print(W)
    return W
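# A small usage sketch for polynomialFit above (illustrative only): recover the
# coefficients of y = 1 + 2x + 3x^2 from noisy samples. Coefficients come back in
# increasing order of power, one per row, because of the Vandermonde layout above.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(42)
x = np.linspace(-1, 1, 50)
y = 1 + 2 * x + 3 * x ** 2 + 0.01 * rng.randn(x.size)

W = polynomialFit(x, y, order=2)
print(W.ravel())   # approximately [1, 2, 3]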
def test_multinomial_grad_hess():
    rng = np.random.RandomState(0)
    n_samples, n_features, n_classes = 100, 5, 3
    X = rng.randn(n_samples, n_features)
    w = rng.rand(n_classes, n_features)
    Y = np.zeros((n_samples, n_classes))
    ind = np.argmax(np.dot(X, w.T), axis=1)
    Y[range(0, n_samples), ind] = 1
    w = w.ravel()
    sample_weights = np.ones(X.shape[0])
    grad, hessp = _multinomial_grad_hess(w, X, Y, alpha=1.,
                                         sample_weight=sample_weights)
    # extract first column of hessian matrix
    vec = np.zeros(n_features * n_classes)
    vec[0] = 1
    hess_col = hessp(vec)

    # Estimate hessian using least squares as done in
    # test_logistic_grad_hess
    e = 1e-3
    d_x = np.linspace(-e, e, 30)
    d_grad = np.array([
        _multinomial_grad_hess(w + t * vec, X, Y, alpha=1.,
                               sample_weight=sample_weights)[0]
        for t in d_x
    ])
    d_grad -= d_grad.mean(axis=0)
    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
    assert_array_almost_equal(hess_col, approx_hess_col)
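# A standalone illustration of the trick used in the test above: estimate a
# Hessian-vector product by regressing centered gradient differences on the step
# sizes with lstsq. Here f(w) = 0.5 * w.T A w, so the Hessian is A and H @ v is
# known exactly (illustrative example only, not from the original test suite).
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
A = A @ A.T                      # symmetric positive semi-definite Hessian

def grad(w):
    return A @ w                 # gradient of 0.5 * w.T A w

w0 = rng.randn(4)
v = np.zeros(4)
v[0] = 1.0                       # probe direction (first Hessian column)

e = 1e-3
d_x = np.linspace(-e, e, 30)
d_grad = np.array([grad(w0 + t * v) for t in d_x])
d_grad -= d_grad.mean(axis=0)
approx_Hv = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()

print(np.allclose(approx_Hv, A @ v, atol=1e-6))   # expected: True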
def glm(event_matrix, Q, voxels, hrf_function=None, downsample=1,
        convolve=True):
    """Perform a GLM from an event matrix and return estimated HRFs
    and associated coefficients

    Q: basis
    """
    Q = np.asarray(Q)
    if Q.ndim == 1:
        Q = Q[:, None]
    if hrf_function is None:
        hrf_function = Q[:, 0]
    if convolve:
        glm_design = convolve_events(event_matrix, Q)[::downsample]
    else:
        glm_design = event_matrix
    n_basis = Q.shape[1]
    n_trials = glm_design.shape[1] // n_basis
    n_voxels = voxels.shape[-1]
    full_betas = linalg.lstsq(glm_design, voxels)[0]
    full_betas = full_betas.reshape(n_basis, n_trials, n_voxels, order='F')
    hrfs = full_betas.T.dot(Q.T)
    sign = np.sign((hrfs * hrf_function).sum(-1))
    hrfs = hrfs * sign[..., None]
    norm = hrfs.max(-1)
    hrfs /= norm[..., None]
    betas = norm * sign
    return hrfs.T, betas.T
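# A minimal standalone illustration (not from the original package) of the core step
# in glm() above: with a 2-D right-hand side, a single lstsq call fits every voxel
# column at once, so the solution holds one column of coefficients per voxel.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
n_scans, n_regressors, n_voxels = 120, 6, 10
glm_design = rng.randn(n_scans, n_regressors)
true_betas = rng.randn(n_regressors, n_voxels)
voxels = glm_design @ true_betas + 0.01 * rng.randn(n_scans, n_voxels)

full_betas = linalg.lstsq(glm_design, voxels)[0]   # shape (n_regressors, n_voxels)
print(np.allclose(full_betas, true_betas, atol=0.05))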
def QITE_step(H_, psi_, db, xv, check): import time nalpha = len(H_) dn_ = 1.0 if (xv is None): xv = [] for alpha in range(nalpha): (A, h, imp, gmp) = H_[alpha] nact = imp.shape[0] xv.append(np.zeros(nact)) for alpha in range(nalpha): # ----- target state t0 = time.time() delta_alpha, dnalpha_ = ExpmbH_alpha(H_, psi_, alpha, db) delta_alpha -= psi_.copy() dn_ *= dnalpha_ Xop = [] # ----- pauli action (A, h, imp, gmp) = H_[alpha] nact = imp.shape[0] # print('active:',imp) Pmu_psi = np.zeros(imp.shape, dtype=complex) for m in range(nact): Pmu_psi[m, :] = gmp[m, imp[m, :]] * psi_[imp[m, :]] t1 = time.time() # ----- set linear system Amat = np.dot(np.conj(Pmu_psi), Pmu_psi.T) # print('Amat:\n',Amat) Amat = 2.0 * np.real(Amat) t2 = time.time() bvec = np.dot(Pmu_psi, np.conj(delta_alpha)) bvec = -2.0 * np.imag(bvec) t3 = time.time() if (check): x = SciLA.lstsq(Amat, bvec)[0] else: zct = np.dot(bvec, Amat) def cost_fun(vct): return LA.norm(np.dot(Amat, vct) - bvec)**2 def J_cost_fun(vct): wct = np.dot(Amat, vct) wct = np.dot(Amat.T, wct) return 2.0 * (wct - zct) import scipy x = scipy.optimize.minimize(cost_fun, x0=xv[alpha], method='Newton-CG', jac=J_cost_fun, tol=1e-8).x xv[alpha] = x.copy() print('Pauli Operator') Xop.append((A, x, imp, gmp)) print_Hamiltonian(Xop) # print_Hamiltonian(xv) #print('\n wavefunction before\n',Pmu_psi) t4 = time.time() psi_ = Exp_ixP(x, psi_, imp, gmp) #print('\n wavefunction after\n', psi_,'\n') t5 = time.time() # print alpha,t5-t4,t4-t3,t3-t2,t2-t1,t1-t0 import sys sys.stdout.flush() # print('op:\n',xv) return psi_, dn_, xv, Xop
if trend_method == "corners":
    hull = ConvexHull(survey.srcField.rxList[0].locs[:, :2])

    # Extract only those points that make the ConvexHull
    pts = np.c_[survey.srcField.rxList[0].locs[hull.vertices, :2],
                survey.dobs[hull.vertices]]
else:
    # Extract all points
    pts = np.c_[survey.srcField.rxList[0].locs[:, :2], survey.dobs]

if trend_order == 0:
    data_trend = np.mean(pts[:, 2]) * np.ones(rxLoc[:, 0].shape)
    print('Removed data mean: {0:.6g}'.format(data_trend[0]))

elif trend_order == 1:
    # best-fit linear plane: z = C[0]*x + C[1]*y + C[2]
    A = np.c_[pts[:, 0], pts[:, 1], np.ones(pts.shape[0])]
    C, _, _, _ = lstsq(A, pts[:, 2])  # coefficients

    # evaluate the plane at all data locations
    data_trend = C[0] * rxLoc[:, 0] + C[1] * rxLoc[:, 1] + C[2]

elif trend_order == 2:
    # best-fit quadratic curve
    A = np.c_[np.ones(pts.shape[0]), pts[:, :2],
              np.prod(pts[:, :2], axis=1), pts[:, :2]**2]
    C, _, _, _ = lstsq(A, pts[:, 2])

    # evaluate at all data locations
    data_trend = np.dot(
        np.c_[np.ones(rxLoc[:, 0].shape),
              rxLoc[:, 0], rxLoc[:, 1],
              rxLoc[:, 0] * rxLoc[:, 1],
              rxLoc[:, 0]**2, rxLoc[:, 1]**2],
        C
    ).reshape(rxLoc[:, 0].shape)

survey.dobs -= data_trend
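# A standalone sketch of the order-1 trend fit above: fit a plane z = a*x + b*y + c
# to scattered points with lstsq and subtract it (illustrative data, not tied to the
# survey objects used above).
import numpy as np
from scipy.linalg import lstsq

rng = np.random.RandomState(1)
x, y = rng.rand(200), rng.rand(200)
z = 3.0 * x - 2.0 * y + 0.5 + 0.01 * rng.randn(200)

A = np.c_[x, y, np.ones_like(x)]
C, _, _, _ = lstsq(A, z)          # C ~ [3.0, -2.0, 0.5]
detrended = z - A.dot(C)
print(C, detrended.std())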
def mfa(X, hdim, C, maxiters, W=None, M=None, psi=None, pi=None, eps=1e-2): """Fit a Mixture of FA. _X_ is dataset in _rows_. _hdim_ is the latent dimension, the same for all _C_ classes. """ # pre calculation of some 'constants'. N, d = X.shape Ih = np.eye(hdim) ll_const = -d / 2. * np.log(2 * np.pi) X_sq = X**2 if W is None: W = np.random.randn(C, hdim, d) if M is None: tmp = np.random.permutation(N) M = X[tmp[:C]].copy() if psi is None: psi = 100 * np.var(X) * np.ones((C, d)) if pi is None: pi = np.ones(C) / C # pre allocating some helper memory E_z = np.zeros((C, N, hdim)) Cov_z = np.zeros((C, hdim, hdim)) # store loglikelihood ll = np.zeros((C, N)) last_ll = -np.inf loglike = [] for i in xrange(maxiters): for c in xrange(C): # W_c is hdim x d W_c = W[c] mu_c = M[c] # psi_c is D psi_c = psi[c] fac = W_c / psi_c # see Bishop, p. 93, eq. 2.117 cov_z = la.inv(Ih + np.dot(fac, W_c.T)) tmp = np.dot(X - mu_c, fac.T) # latent expectations E_z[c, :, :] = np.dot(tmp, cov_z) # latent _covariance_ Cov_z[c, :, :] = cov_z # loglikelihood # woodbury identity inv_cov_x = np.diag(1. / psi_c) - np.dot(fac.T, np.dot(cov_z, fac)) _, _det = np.linalg.slogdet(inv_cov_x) tmp = np.dot(X - mu_c, inv_cov_x) # integrating out latent z's -> again, Bishop, p. 93, eq. 2.115 ll[c, :] = np.log(pi[c]) + ll_const + 0.5 * _det - 0.5 * np.sum( tmp * (X - mu_c), axis=1) # posterior class distribution given data posteriors = norm_logprob(ll, axis=0) # loglikelihood over all datapoints ll_sum = np.sum(logsumexp(ll, axis=0)) loglike.append(ll_sum) if ll_sum - last_ll < eps: break last_ll = ll_sum for c in xrange(C): z = np.append(E_z[c, :, :], np.ones((N, 1)), axis=1) wz = posteriors[c][:, np.newaxis] * z wzX = np.dot(wz.T, X) wzz = np.dot(wz.T, z) N_c = posteriors[c].sum() wzz[:hdim, :hdim] += N_c * Cov_z[c, :, :] sol = la.lstsq(wzz, wzX)[0] M[c, :] = sol[hdim, :] W[c, :, :] = sol[:hdim, :] psi[c, :] = (np.dot(posteriors[c], X_sq) - np.sum(sol * wzX, axis=0)) / N_c psi[c, :] = np.maximum(psi[c, :], SMALL) pi[c] = N_c / N return W, M, psi, pi, loglike
def test_simple_exact(self):
    a = [[1, 20], [-30, 4]]
    for b in ([[1, 0], [0, 1]], [1, 0],
              [[2, 1], [-30, 4]]):
        x = lstsq(a, b)[0]
        assert_array_almost_equal(dot(a, x), b)
def test_simple_overdet_complex(self):
    a = [[1 + 2j, 2], [4, 5], [3, 4]]
    b = [1, 2 + 4j, 3]
    x, res, r, s = lstsq(a, b)
    assert_array_almost_equal(x, direct_lstsq(a, b, cmplx=1))
    assert_almost_equal(res, (abs(dot(a, x) - b)**2).sum(axis=0))
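# The helper direct_lstsq referenced above is not shown in this collection; a
# normal-equations version consistent with how it is used there might look like the
# sketch below (assumption: for complex problems the conjugate transpose is used).
# For a well-conditioned overdetermined system this agrees with lstsq.
import numpy as np
from scipy.linalg import solve, lstsq

def direct_lstsq_sketch(a, b, cmplx=0):
    a = np.asarray(a)
    b = np.asarray(b)
    at = a.T.conj() if cmplx else a.T
    return solve(at @ a, at @ b)      # (A^H A) x = A^H b

a = [[1 + 2j, 2], [4, 5], [3, 4]]
b = [1, 2 + 4j, 3]
print(np.allclose(lstsq(a, b)[0], direct_lstsq_sketch(a, b, cmplx=1)))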
def calc_risk_scores(bed_file, rs_id_map, phen_map, out_file=None, split_by_chrom=False, adjust_for_sex=False, adjust_for_covariates=False, adjust_for_pcs=False, non_zero_chromosomes=None): print 'Parsing PLINK bed file: %s' % bed_file num_individs = len(phen_map) assert num_individs > 0, 'No individuals found. Problems parsing the phenotype file?' if split_by_chrom: raw_effects_prs = sp.zeros(num_individs) pval_derived_effects_prs = sp.zeros(num_individs) for i in range(1, 23): if non_zero_chromosomes is None or i in non_zero_chromosomes: genotype_file = bed_file + '_%i_keep' % i if os.path.isfile(genotype_file + '.bed'): print 'Working on chromosome %d' % i prs_dict = get_prs(genotype_file, rs_id_map, phen_map) raw_effects_prs += prs_dict['raw_effects_prs'] pval_derived_effects_prs += prs_dict[ 'pval_derived_effects_prs'] else: print 'Skipping chromosome' else: prs_dict = get_prs(bed_file, rs_id_map, phen_map) raw_effects_prs = prs_dict['raw_effects_prs'] pval_derived_effects_prs = prs_dict['pval_derived_effects_prs'] true_phens = prs_dict['true_phens'] # Report prediction accuracy raw_eff_corr = sp.corrcoef(raw_effects_prs, prs_dict['true_phens'])[0, 1] raw_eff_r2 = raw_eff_corr**2 pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, prs_dict['true_phens'])[0, 1] pval_eff_r2 = pval_eff_corr**2 print 'Final raw effects PRS correlation: %0.4f' % raw_eff_corr print 'Final raw effects PRS r2: %0.4f' % raw_eff_r2 print 'Final weighted effects PRS correlation: %0.4f' % pval_eff_corr print 'Final weighted effects PRS r2: %0.4f' % pval_eff_r2 res_dict = {'pred_r2': pval_eff_r2} raw_effects_prs.shape = (len(raw_effects_prs), 1) pval_derived_effects_prs.shape = (len(pval_derived_effects_prs), 1) true_phens = sp.array(true_phens) true_phens.shape = (len(true_phens), 1) # Store covariate weights, slope, etc. 
weights_dict = {} # Store Adjusted predictions adj_pred_dict = {} # Direct effect Xs = sp.hstack([pval_derived_effects_prs, sp.ones((len(true_phens), 1))]) (betas, rss00, r, s) = linalg.lstsq(sp.ones((len(true_phens), 1)), true_phens) (betas, rss, r, s) = linalg.lstsq(Xs, true_phens) pred_r2 = 1 - rss / rss00 weights_dict['unadjusted'] = { 'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0] } # Adjust for sex if adjust_for_sex and 'sex' in prs_dict and len(prs_dict['sex']) > 0: sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, rss0, r, s) = linalg.lstsq(sp.hstack([sex, sp.ones((len(true_phens), 1))]), true_phens) (betas, rss, r, s) = linalg.lstsq( sp.hstack([raw_effects_prs, sex, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack( [pval_derived_effects_prs, sex, sp.ones((len(true_phens), 1))]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['sex_adj'] = { 'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0] } print 'Fitted effects (betas) for PRS, sex, and intercept on true phenotype:', betas adj_pred_dict['sex_adj'] = sp.dot(Xs, betas) pred_r2 = 1 - rss / rss0 print 'Sex adjusted prediction accuracy (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss / rss00 print 'Sex adjusted prediction + Sex (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss_pd / rss0 print 'Sex adjusted prediction accuracy (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print 'Sex adjusted prediction + Sex (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_adj_pred_r2+PC'] = pred_r2 # Adjust for PCs if adjust_for_pcs and 'pcs' in prs_dict and len(prs_dict['pcs']) > 0: pcs = prs_dict['pcs'] (betas, rss0, r, s) = linalg.lstsq(sp.hstack([pcs, sp.ones((len(true_phens), 1))]), true_phens) (betas, rss, r, s) = linalg.lstsq( sp.hstack([raw_effects_prs, pcs, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack( [pval_derived_effects_prs, sp.ones((len(true_phens), 1)), pcs]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['pc_adj'] = { 'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0], 'pcs': betas[2][0] } adj_pred_dict['pc_adj'] = sp.dot(Xs, betas) pred_r2 = 1 - rss / rss0 print 'PC adjusted prediction accuracy (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss / rss00 print 'PC adjusted prediction + PCs (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss_pd / rss0 print 'PC adjusted prediction accuracy (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print 'PC adjusted prediction + PCs (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_adj_pred_r2+PC'] = pred_r2 # Adjust for both PCs and Sex if adjust_for_sex and 'sex' in prs_dict and len(prs_dict['sex']) > 0: sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, 
rss0, r, s) = linalg.lstsq( sp.hstack([sex, pcs, sp.ones((len(true_phens), 1))]), true_phens) (betas, rss, r, s) = linalg.lstsq( sp.hstack( [raw_effects_prs, sex, pcs, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, sex, sp.ones((len(true_phens), 1)), pcs ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['sex_pc_adj'] = { 'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0], 'pcs': betas[3][0] } adj_pred_dict['sex_pc_adj'] = sp.dot(Xs, betas) pred_r2 = 1 - rss / rss0 print 'PCs+Sex adjusted prediction accuracy (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss / rss00 print 'PCs+Sex adjusted prediction and PCs+Sex (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss_pd / rss0 print 'PCs+Sex adjusted prediction accuracy (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_Sex_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print 'PCs+Sex adjusted prediction and PCs+Sex (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['PC_Sex_adj_pred_r2+PC_Sex'] = pred_r2 # Adjust for covariates if adjust_for_covariates and 'covariates' in prs_dict and len( prs_dict['covariates']) > 0: covariates = prs_dict['covariates'] (betas, rss0, r, s) = linalg.lstsq( sp.hstack([covariates, sp.ones((len(true_phens), 1))]), true_phens) (betas, rss, r, s) = linalg.lstsq( sp.hstack( [raw_effects_prs, covariates, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, covariates, sp.ones((len(true_phens), 1)) ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) adj_pred_dict['cov_adj'] = sp.dot(Xs, betas) pred_r2 = 1 - rss / rss0 print 'Cov adjusted prediction accuracy (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss / rss00 print 'Cov adjusted prediction + Cov (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss_pd / rss0 print 'Cov adjusted prediction accuracy (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['Cov_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print 'Cov adjusted prediction + Cov (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['Cov_adj_pred_r2+Cov'] = pred_r2 if adjust_for_pcs and 'pcs' in prs_dict and len( prs_dict['pcs']) and 'sex' in prs_dict and len( prs_dict['sex']) > 0: pcs = prs_dict['pcs'] sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, rss0, r, s) = linalg.lstsq( sp.hstack( [covariates, sex, pcs, sp.ones((len(true_phens), 1))]), true_phens) (betas, rss, r, s) = linalg.lstsq( sp.hstack([ raw_effects_prs, covariates, sex, pcs, sp.ones((len(true_phens), 1)) ]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, covariates, sex, pcs, sp.ones((len(true_phens), 1)) ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) adj_pred_dict['cov_sex_pc_adj'] = sp.dot(Xs, betas) pred_r2 = 1 - rss / rss0 print 'Cov+PCs+Sex adjusted prediction accuracy (R^2) for the whole genome PRS with 
raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss / rss00 print 'Cov+PCs+Sex adjusted prediction and PCs+Sex (R^2) for the whole genome PRS with raw effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) pred_r2 = 1 - rss_pd / rss0 print 'Cov+PCs+Sex adjusted prediction accuracy (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['Cov_PC_Sex_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print 'Cov+PCs+Sex adjusted prediction and PCs+Sex (R^2) for the whole genome PRS with weighted effects was: %0.4f (%0.6f)' % ( pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)) res_dict['Cov_PC_Sex_adj_pred_r2+Cov_PC_Sex'] = pred_r2 # Now calibration y_norm = (true_phens - sp.mean(true_phens)) / sp.std(true_phens) denominator = sp.dot(raw_effects_prs.T, raw_effects_prs) numerator = sp.dot(raw_effects_prs.T, y_norm) regression_slope = (numerator / denominator)[0][0] print 'The slope for predictions with raw effects is:', regression_slope denominator = sp.dot(pval_derived_effects_prs.T, pval_derived_effects_prs) numerator = sp.dot(pval_derived_effects_prs.T, y_norm) regression_slope = (numerator / denominator)[0][0] print 'The slope for predictions with weighted effects is:', regression_slope num_individs = len(prs_dict['pval_derived_effects_prs']) # Write PRS out to file. if out_file != None: with open(out_file, 'w') as f: out_str = 'IID, true_phens, raw_effects_prs, pval_derived_effects_prs' if 'sex' in prs_dict: out_str = out_str + ', sex' if 'pcs' in prs_dict: pcs_str = ', '.join([ 'PC%d' % (1 + pc_i) for pc_i in range(len(prs_dict['pcs'][0])) ]) out_str = out_str + ', ' + pcs_str out_str += '\n' f.write(out_str) for i in range(num_individs): out_str = '%s, %0.6e, %0.6e, %0.6e, ' % ( prs_dict['iids'][i], prs_dict['true_phens'][i], raw_effects_prs[i], pval_derived_effects_prs[i]) if 'sex' in prs_dict: out_str = out_str + '%d, ' % prs_dict['sex'][i] if 'pcs' in prs_dict: pcs_str = ', '.join(map(str, prs_dict['pcs'][i])) out_str = out_str + pcs_str out_str += '\n' f.write(out_str) if len(adj_pred_dict.keys()) > 0: with open(out_file + '.adj', 'w') as f: adj_prs_labels = adj_pred_dict.keys() out_str = 'IID, true_phens, raw_effects_prs, pval_derived_effects_prs, ' + \ ', '.join(adj_prs_labels) out_str += '\n' f.write(out_str) for i in range(num_individs): out_str = '%s, %0.6e, %0.6e, %0.6e' % ( prs_dict['iids'][i], prs_dict['true_phens'][i], raw_effects_prs[i], pval_derived_effects_prs[i]) for adj_prs in adj_prs_labels: out_str += ', %0.4f' % adj_pred_dict[adj_prs][i] out_str += '\n' f.write(out_str) if weights_dict != None: oh5f = h5py.File(out_file + '.weights.hdf5', 'w') for k1 in weights_dict.keys(): kg = oh5f.create_group(k1) for k2 in weights_dict[k1]: kg.create_dataset(k2, data=sp.array(weights_dict[k1][k2])) oh5f.close() return res_dict
def find_roots_2d(coef1, coef2, tol=1e-3): """ Find the common roots of two bivariate polynomials with coefficients specified by two 2D arrays. the variation along the first dimension (i.e., columns) is in the increasing order of y. the variation along the second dimension (i.e., rows) is in the increasing order of x. :param coef1: polynomial coefficients the first polynomial for the annihilation along rows :param coef2: polynomial coefficients the second polynomial for the annihilation along cols :return: """ coef1 /= np.max(np.abs(coef1)) coef2 /= np.max(np.abs(coef2)) log_tol = np.log10(tol) # assert coef_col.shape[0] >= coef_row.shape[0] and coef_row.shape[1] >= coef_col.shape[1] if coef1.shape[1] < coef2.shape[1]: # swap input coefficients coef1, coef2 = coef2, coef1 x, y = sympy.symbols('x, y') # build symbols # collect both polynomials as a function of x; y will be included in the coefficients poly1 = 0 poly2 = 0 max_row_degree_y, max_row_degree_x = np.array(coef1.shape) - 1 for x_count in range(max_row_degree_x + 1): for y_count in range(max_row_degree_y + 1): if np.abs(coef1[y_count, x_count]) > 1e-10: poly1 += coef1[y_count, x_count] * x ** (max_row_degree_x - x_count) * \ y ** (max_row_degree_y - y_count) else: coef1[y_count, x_count] = 0 max_col_degree_y, max_col_degree_x = np.array(coef2.shape) - 1 for x_count in range(max_col_degree_x + 1): for y_count in range(max_col_degree_y + 1): if np.abs(coef2[y_count, x_count]) > 1e-10: poly2 += coef2[y_count, x_count] * x ** (max_col_degree_x - x_count) * \ y ** (max_col_degree_y - y_count) else: coef2[y_count, x_count] = 0 poly1_x = sympy.Poly(poly1, x) poly2_x = sympy.Poly(poly2, x) K_x = max_row_degree_x # highest power of the first polynomial (in x) L_x = max_col_degree_x # highest power of the second polynomial (in x) if coef1.shape[0] == 1: # i.e., independent of variable y x_roots_all = np.roots(coef1.squeeze()) eval_poly2 = sympy.lambdify(x, poly2) x_roots = [] y_roots = [] for x_loop in x_roots_all: y_roots_loop = np.roots(np.array(sympy.Poly(eval_poly2(x_loop), y).all_coeffs(), dtype=complex)) y_roots.append(y_roots_loop) x_roots.append(np.tile(x_loop, y_roots_loop.size)) coef_validate = coef2 elif coef2.shape[1] == 1: # i.e., independent of variable x y_roots_all = np.roots(coef2.squeeze()) eval_poly1 = sympy.lambdify(y, poly1) x_roots = [] y_roots = [] for y_loop in y_roots_all: x_roots_loop = np.roots(np.array(sympy.Poly(eval_poly1(y_loop), x).all_coeffs(), dtype=complex)) x_roots.append(x_roots_loop) y_roots.append(np.tile(y_loop, x_roots_loop.size)) coef_validate = coef1 else: if L_x >= 1: toep1_r = np.hstack((poly1_x.all_coeffs()[::-1], np.zeros(L_x - 1))) toep1_r = np.concatenate((toep1_r, np.zeros(L_x + K_x - toep1_r.size))) toep1_c = np.concatenate(([poly1_x.all_coeffs()[-1]], np.zeros(L_x - 1))) else: # for the case with L_x == 0 toep1_r = np.zeros((0, L_x + K_x)) toep1_c = np.zeros((0, 0)) if K_x >= 1: toep2_r = np.hstack((poly2_x.all_coeffs()[::-1], np.zeros(K_x - 1))) toep2_r = np.concatenate((toep2_r, np.zeros(L_x + K_x - toep2_r.size))) toep2_c = np.concatenate(([poly2_x.all_coeffs()[-1]], np.zeros(K_x - 1))) else: # for the case with K_x == 0 toep2_r = np.zeros((0, L_x + K_x)) toep2_c = np.zeros((0, 0)) blk_mtx1 = linalg.toeplitz(toep1_c, toep1_r) blk_mtx2 = linalg.toeplitz(toep2_c, toep2_r) if blk_mtx1.size != 0 and blk_mtx2.size != 0: mtx = np.vstack((blk_mtx1, blk_mtx2)) elif blk_mtx1.size == 0 and blk_mtx2.size != 0: mtx = blk_mtx2 elif blk_mtx1.size != 0 and blk_mtx2.size == 0: mtx = blk_mtx1 else: mtx = 
np.zeros((0, 0)) max_y_degree1 = coef1.shape[0] - 1 max_y_degree2 = coef2.shape[0] - 1 max_poly_degree = np.int(max_y_degree1 * L_x + max_y_degree2 * K_x) num_samples = (max_poly_degree + 1) * 8 # <= 4 is the over-sampling factor used to determined the poly coef. # randomly generate y-values # y_vals = np.random.randn(num_samples, 1) + \ # 1j * np.random.randn(num_samples, 1) y_vals = np.exp(1j * 2 * np.pi / num_samples * np.arange(num_samples))[:, np.newaxis] y_powers = np.reshape(np.arange(max_poly_degree + 1)[::-1], (1, -1), order='F') Y = ne.evaluate('y_vals ** y_powers') # compute resultant, which is the determinant of mtx. # it is a polynomial in terms of variable y func_resultant = sympy.lambdify(y, sympy.Matrix(mtx)) det_As = np.array([linalg.det(np.array(func_resultant(y_roots_loop), dtype=complex)) for y_roots_loop in y_vals.squeeze()], dtype=complex) coef_resultant = linalg.lstsq(Y, det_As)[0] # trim out very small coefficients # eps = np.max(np.abs(coef_resultant)) * tol # coef_resultant[np.abs(coef_resultant) < eps] = 0 y_roots_all = np.roots(coef_resultant) # check if there're duplicated roots y_roots_all = eliminate_duplicate_roots(y_roots_all) # use the root values for y to find the root values for x # check if poly1_x or poly2_x are constant w.r.t. x if len(poly1_x.all_coeffs()) > 1: func_loop = sympy.lambdify(y, poly1_x.all_coeffs()) coef_validate = coef2 elif len(poly2_x.all_coeffs()) > 1: func_loop = sympy.lambdify(y, poly2_x.all_coeffs()) coef_validate = coef1 else: raise RuntimeError('Neither polynomials contain x') x_roots = [] y_roots = [] for loop in range(y_roots_all.size): y_roots_loop = y_roots_all[loop] x_roots_loop = np.roots(func_loop(y_roots_loop)) # check if there're duplicated roots x_roots_loop = eliminate_duplicate_roots(x_roots_loop) for roots_loop in x_roots_loop: x_roots.append(roots_loop) for roots_loop in np.tile(y_roots_loop, x_roots_loop.size): y_roots.append(roots_loop) x_roots, y_roots = np.array(x_roots).flatten('F'), np.array(y_roots).flatten('F') x_roots, y_roots = eliminate_duplicate_roots_2d(x_roots, y_roots) # validate based on the polynomial values of the other polynomila # that is not used in the last step to get the roots poly_val = np.log10(np.abs( check_error_2d(coef_validate / linalg.norm(coef_validate.flatten()), x_roots, y_roots))) # if the error is 2 orders larger than the smallest error, then we discard the root # print(poly_val) valid_idx = np.bitwise_or(poly_val < np.min(poly_val) + 2, poly_val < log_tol) x_roots = x_roots[valid_idx] y_roots = y_roots[valid_idx] ''' Further verification with the resultant w.r.t. 
y, which should also vanish at the common roots ''' poly1_y = sympy.Poly(poly1, y) poly2_y = sympy.Poly(poly2, y) K_y = max_row_degree_y # highest power of the first polynomial (in y) L_y = max_col_degree_y # highest power of the second polynomial (in y) if L_y >= 1: toep1_r = np.hstack((poly1_y.all_coeffs()[::-1], np.zeros(L_y - 1))) toep1_r = np.concatenate((toep1_r, np.zeros(L_y + K_y - toep1_r.size))) toep1_c = np.concatenate(([poly1_y.all_coeffs()[-1]], np.zeros(L_y - 1))) else: # for the case with L_y == 0 toep1_r = np.zeros((0, L_y + K_y)) toep1_c = np.zeros((0, 0)) if K_y >= 1: toep2_r = np.hstack((poly2_y.all_coeffs()[::-1], np.zeros(K_y - 1))) toep2_r = np.concatenate((toep2_r, np.zeros(L_y + K_y - toep2_r.size))) toep2_c = np.concatenate(([poly2_y.all_coeffs()[-1]], np.zeros(K_y - 1))) else: # for the case with K_y == 0 toep2_r = np.zeros((0, L_y + K_y)) toep2_c = np.zeros((0, 0)) blk_mtx1 = linalg.toeplitz(toep1_c, toep1_r) blk_mtx2 = linalg.toeplitz(toep2_c, toep2_r) if blk_mtx1.size != 0 and blk_mtx2.size != 0: mtx = np.vstack((blk_mtx1, blk_mtx2)) elif blk_mtx1.size == 0 and blk_mtx2.size != 0: mtx = blk_mtx2 elif blk_mtx1.size != 0 and blk_mtx2.size == 0: mtx = blk_mtx1 else: mtx = np.zeros((0, 0)) func_resultant_verify = sympy.lambdify((x, y), sympy.Matrix(mtx)) # evaluate the resultant w.r.t. y at the found roots. it should also vanish if # the pair is the common root res_y_val = np.zeros(x_roots.size, dtype=float) for loop in range(x_roots.size): res_y_val[loop] = \ np.abs(linalg.det( np.array( func_resultant_verify(x_roots[loop], y_roots[loop]), dtype=complex ))) log_res_y_val = np.log10(res_y_val) valid_idx = np.bitwise_or(log_res_y_val < np.min(log_res_y_val) + 2, log_res_y_val < log_tol) x_roots = x_roots[valid_idx] y_roots = y_roots[valid_idx] return x_roots, y_roots
def buildRectangleModel(self, recBounds, steepness=1): ''' Builds a softmax model in 2 dimensions with a rectangular interior class Inputs recBounds: A 2x2 list, with the coordinates of the lower left and upper right corners of the rectangle steepness: A scalar determining how steep the bounds between softmax classes are ''' B = np.matrix([ -1, 0, recBounds[0][0], 1, 0, -recBounds[1][0], 0, 1, -recBounds[1][1], 0, -1, recBounds[0][1] ]).T M = np.zeros(shape=(12, 15)) #Boundry: Left|Near rowSB = 0 classNum1 = 1 classNum2 = 0 for i in range(0, 3): M[3 * rowSB + i, 3 * classNum2 + i] = -1 M[3 * rowSB + i, 3 * classNum1 + i] = 1 #Boundry: Right|Near rowSB = 1 classNum1 = 2 classNum2 = 0 for i in range(0, 3): M[3 * rowSB + i, 3 * classNum2 + i] = -1 M[3 * rowSB + i, 3 * classNum1 + i] = 1 #Boundry: Up|Near rowSB = 2 classNum1 = 3 classNum2 = 0 for i in range(0, 3): M[3 * rowSB + i, 3 * classNum2 + i] = -1 M[3 * rowSB + i, 3 * classNum1 + i] = 1 #Boundry: Down|Near rowSB = 3 classNum1 = 4 classNum2 = 0 for i in range(0, 3): M[3 * rowSB + i, 3 * classNum2 + i] = -1 M[3 * rowSB + i, 3 * classNum1 + i] = 1 A = np.hstack((M, B)) # print(np.linalg.matrix_rank(A)) # print(np.linalg.matrix_rank(M)) Theta = linalg.lstsq(M, B)[0].tolist() weight = [] bias = [] for i in range(0, len(Theta) // 3): weight.append([Theta[3 * i][0], Theta[3 * i + 1][0]]) bias.append(Theta[3 * i + 2][0]) steep = steepness self.weights = (np.array(weight) * steep).tolist() self.bias = (np.array(bias) * steep).tolist() self.size = len(self.weights) self.alpha = 3 self.zeta_c = [0] * len(self.weights) for i in range(0, len(self.weights)): self.zeta_c[i] = random() * 10
def buildPointsModel(self, points, steepness=1): ''' Builds a 2D softmax model by constructing an interior class from the given points Inputs points: list of 2D points that construct a convex polygon steepness: A scalar determining how steep the bounds between softmax classes are ''' dims = 2 pointsx = [p[0] for p in points] pointsy = [p[1] for p in points] centroid = [sum(pointsx) / len(points), sum(pointsy) / len(points)] #for each point to the next, find the normal between them. B = [] for i in range(0, len(points)): p1 = points[i] if (i == len(points) - 1): p2 = points[0] else: p2 = points[i + 1] mid = [] for i in range(0, len(p1)): mid.append((p1[i] + p2[i]) / 2) H = np.matrix([[p1[0], p1[1], 1], [p2[0], p2[1], 1], [mid[0], mid[1], 1]]) Hnull = (self.nullspace(H)).tolist() distMed1 = self.distance(mid[0] + Hnull[0][0], mid[1] + Hnull[1][0], centroid[0], centroid[1]) distMed2 = self.distance(mid[0] - Hnull[0][0], mid[1] - Hnull[1][0], centroid[0], centroid[1]) if (distMed1 < distMed2): Hnull[0][0] = -Hnull[0][0] Hnull[1][0] = -Hnull[1][0] Hnull[2][0] = -Hnull[2][0] for j in Hnull: B.append(j[0]) B = np.matrix(B).T numClasses = len(points) + 1 boundries = [] for i in range(1, numClasses): boundries.append([i, 0]) M = np.zeros(shape=(len(boundries) * (dims + 1), numClasses * (dims + 1))) for j in range(0, len(boundries)): for i in range(0, dims + 1): M[(dims + 1) * j + i, (dims + 1) * boundries[j][1] + i] = -1 M[(dims + 1) * j + i, (dims + 1) * boundries[j][0] + i] = 1 A = np.hstack((M, B)) #print(np.linalg.matrix_rank(A)) #print(np.linalg.matrix_rank(M)) Theta = linalg.lstsq(M, B)[0].tolist() weight = [] bias = [] for i in range(0, len(Theta) // (dims + 1)): weight.append( [Theta[(dims + 1) * i][0], Theta[(dims + 1) * i + 1][0]]) bias.append(Theta[(dims + 1) * i + dims][0]) steep = steepness self.weights = (np.array(weight) * steep).tolist() self.bias = (np.array(bias) * steep).tolist() self.size = len(self.weights) self.alpha = 3 self.zeta_c = [0] * len(self.weights) for i in range(0, len(self.weights)): self.zeta_c[i] = random() * 10
import socket

# imports added so the aliases used below resolve; `np`, `la`, and `plt` are the
# aliases assumed by this script
import numpy as np
from scipy import linalg as la
import matplotlib.pyplot as plt

from bcgdata import read_bcg_data

x, ex, y, ey = read_bcg_data()
#
# pivot
#
ax = 14.5; x = x - ax
ay = 12.5; y = y - ay

A = np.array([x, np.ones(len(x))])
w = np.linalg.lstsq(A.T, y)[0]

A2 = np.array([np.exp(x), x**2, x, np.ones(len(x))])
c = la.lstsq(A2.T, y)[0]
#c,resid,rank,sigma = linalg.lstsq(A,y)

xf = np.linspace(-1, 1, 100)
yf = w[0]*xf + w[1]
yf2 = c[2]*xf + c[3]
yf3 = c[0]*np.exp(xf) + c[1]*xf**2 + c[2]*xf + c[3]

print "numpy slope=", w[0], " intercept=", w[1]
print "scipy slope=", c[2], " intercept=", c[3]
print "exp, square coeff=", c[0], c[1]   # coefficients of the exp(x) and x**2 terms
#
# plot results:
#
plt.plot(x, y, 'ro', xf, yf)
plt.plot(xf, yf3)
def sgt_dist(freqdist, **kwargs): """ Returns a Simple Good-Turing log-probability distribution. The returned log-probability distribution is based on the Good-Turing frequency estimation, as first developed by Alan Turing and I. J. Good and implemented in a more easily computable way by Gale and Sampson's (1995/2001 reprint) in the so-called "Simple Good-Turing". This implementation is based mostly in the one by "maxbane" (2011) (https://github.com/maxbane/simplegoodturing/blob/master/sgt.py), as well as in the original one in C by Geoffrey Sampson (1995; 2000; 2005; 2008) (https://www.grsampson.net/Resources.html), and in the one by Loper, Bird et al. (2001-2018, NLTK Project) (http://www.nltk.org/_modules/nltk/probability.html). Please note that due to minor differences in implementation intended to guarantee non-zero probabilities even in cases of expected underflow, as well as our relience on scipy's libraries for speed and our way of handling probabilities that are not computable when the assumptions of SGT are not met, most results will not exactly match those of the 'gold standard' of Gale and Sampson, even though the differences are never expected to be significative and are equally distributed across the samples. Parameters ---------- freqdist : dict Frequency distribution of samples (keys) and counts (values) from which the probability distribution will be calculated. p_value : float The p-value for calculating the confidence interval of the empirical Turing estimate, which guides the decision of using either the Turing estimate "x" or the loglinear smoothed "y". Defaults to 0.05, as per the reference implementation by Sampson, but consider that the authors, both in their paper and in the code following suggestions credited to private communication with Fan Yang, consider using a value of 0.1. allow_fail : bool A logic value informing if the function is allowed to fail, throwing RuntimeWarning exceptions, if the essential assumptions on the frequency distribution are not met, i.e., if the slope of the loglinear regression is > -1.0 or if an unobserved count is reached before we are able to cross the smoothing threshold. If set to False, the estimation might result in an unreliable probability distribution; defaults to True. default_p0 : float An optional value indicating the probability for unobserved samples ("p0") in cases where no samples with a single count are observed; if this value is not specified, "p0" will default to a Laplace estimation for the current frequency distribution. Please note that this is intended change from the reference implementation by Gale and Sampson. Returns ------- state_prob: dict A dictionary of sample to log-probabilities for all the samples in the frequency distribution. unobserved_prob: float The log-probability for samples not found in the frequency distribution. """ # Make sure the scientific libraries have been loaded, raising an # ImportError if not if not np: raise ImportError('The package `numpy` is needed by SGT.') if not linalg or not stats: raise ImportError('The package `scipy` is needed by SGT.') # Deal with additional arguments. default_p0 = kwargs.get('default_p0', None) p_value = kwargs.get('p_value', 0.05) allow_fail = kwargs.get('allow_fail', True) # Perform basic argument checking. _check_probdist_args(freqdist, default_p0=default_p0, p_value=p_value) # Calculate the confidence level from the p_value. confidence_level = stats.norm.ppf(1. - (p_value / 2.0)) # Remove all samples with `count` equal to zero. 
freqdist = { sample: count for sample, count in freqdist.items() if count > 0 } # Prepare vectors for frequencies (`r` in G&S) and frequencies of # frequencies (`Nr` in G&S). freqdist.values() is cast to a tuple because # we can't consume the iterable a single time. `freqs_keys` is sorted to # make vector computations faster later on (so we query lists and not # dictionaries). freqs = tuple(freqdist.values()) freqs_keys = sorted(set(freqs)) # r -> n (G&S) freqs_of_freqs = {c: freqs.count(c) for c in freqs_keys} # The papers and the implementations are not clear on how to calculate the # probability of unobserved states in case of missing single-count samples # (unless we just fail, of course); Gale and Sampson's C implementation # defaults to 0.0, which is not acceptable for our purposes. The solution # here offered is to either use an user-provided probability (but in this # case we are not necessarily defaulting to _UNOBS, and, in fact, the # function argument name is `default_p0` and not `unobs_prob`) or default # to a Lidstone smoothing with a gamma of 1.0 (i.e., using Laplace # smoothing constant). # TODO: Investigate and discuss other possible solutions, including # user-defined `gamma`, `bins`, and/or `N`. if 1 in freqs_keys: p0 = freqs_of_freqs[1] / sum(freqs) else: p0 = default_p0 or (1. / (sum(freqs) + 1)) # Compute Sampson's Z: for each count `j`, we set Z[j] to the linear # interpolation of {i, j, k}, where `i` is the greatest observed count less # than `j`, and `k` the smallest observed count greater than `j`. I = [0] + freqs_keys[:-1] K = freqs_keys[1:] + [2 * freqs_keys[-1] - I[-1]] Z = { j: 2 * freqs_of_freqs[j] / (k - i) for i, j, k in zip(I, freqs_keys, K) } # Compute a loglinear regression of Z[r] over r. We cast keys and values to # a list for the computation with `linalg.lstsq`. z_keys = list(Z.keys()) z_values = list(Z.values()) slope, intercept = \ linalg.lstsq(np.c_[np.log(z_keys), (1,)*len(z_keys)], np.log(z_values))[0] #print ('Regression: log(z) = %f*log(r) + %f' % (slope, intercept)) if slope > -1.0 and allow_fail: raise RuntimeWarning("In SGT, linear regression slope is > -1.0.") # Aapply Gale and Sampson's "simple" loglinear smoothing method. r_smoothed = {} use_y = False for r in freqs_keys: # `y` is the loglinear smoothing. y = float(r+1) * \ np.exp(slope*np.log(r+1) + intercept) / \ np.exp(slope*np.log(r) + intercept) # If we've already started using `y` as the estimate for `r`, then # continue doing so; also start doing so if no samples were observed # with count equal to `r+1` (following comments and variable names in # both Sampson's C implementation and in NLTK, we check at which # point we should `switch`) if r + 1 not in freqs_of_freqs: if not use_y: # An unobserved count was reached before we were able to cross # the smoothing threshold; this means that assumptions were # not met and the results will likely be off. if allow_fail: raise RuntimeWarning( "In SGT, unobserved count before smoothing threshold.") use_y = True # If we are using `y`, just copy its value to `r_smoothed`, otherwise # perform the actual calculation. if use_y: r_smoothed[r] = y else: # `estim` is the empirical Turing estimate for `r` (equivalent to # `x` in G&S) estim = (float(r + 1) * freqs_of_freqs[r + 1]) / freqs_of_freqs[r] Nr = float(freqs_of_freqs[r]) Nr1 = float(freqs_of_freqs[r + 1]) # `width` is the width of the confidence interval of the empirical # Turing estimate (for which Sampson uses 95% but suggests 90%), # when assuming independence. 
width = confidence_level * \ np.sqrt(float(r+1)**2 * (Nr1 / Nr**2) * (1. + (Nr1 / Nr))) # If the difference between `x` and `y` is more than `t`, then the # empirical Turing estimate `x` tends to be more accurate. # Otherwise, use the loglinear smoothed value `y`. if abs(estim - y) > width: r_smoothed[r] = estim else: use_y = True r_smoothed[r] = y # (Re)normalize and return the resulting smoothed probabilities, less the # estimated probability mass of unseen species; please note that we might # be unable to calculate some probabilities if the function was not allowed # to fail, mostly due to math domain errors. We default to `p0` in all such # cases. smooth_sum = sum( [freqs_of_freqs[r] * r_smooth for r, r_smooth in r_smoothed.items()]) # Build the probability distribution for the observed samples and for # unobserved ones. prob_unk = math.log(p0) probdist = {} for sample, count in freqdist.items(): prob = (1.0 - p0) * (r_smoothed[count] / smooth_sum) if prob == 0.0: probdist[sample] = math.log(p0) else: probdist[sample] = math.log(prob) return probdist, prob_unk
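# Minimal standalone sketch (synthetic counts, assumed values) of the loglinear
# step used above: regress log(Z_r) on log(r) with scipy.linalg.lstsq to obtain
# the slope and intercept that drive the "simple" smoothing.
import numpy as np
from scipy import linalg

r = np.array([1.0, 2.0, 3.0, 5.0, 10.0])       # observed counts (assumed)
z = np.array([120.0, 40.0, 15.0, 5.0, 1.2])    # averaged frequencies of frequencies (assumed)
design = np.c_[np.log(r), np.ones_like(r)]
slope, intercept = linalg.lstsq(design, np.log(z))[0]
print(slope, intercept)                        # slope should be < -1 for SGT's assumptions to hold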
def __arr_update_core(i, micro_matrix, rhs, solution, rcond, direction):
    """Update TT core for ARR

    Parameters
    ----------
    i: int
        core index
    micro_matrix: ndarray
        micro matrix for ith TT core
    rhs: ndarray
        right-hand side for ith TT core
    solution: instance of TT class
        approximated solution of the system of linear equations
    rcond: float
        cut-off ratio for singular values of the subproblems, parameter for SciPy's lstsq
    direction: string
        'forward' if first half sweep, 'backward' if second half sweep
    """

    # solve the micro system for the ith TT core
    solution.cores[i], _, _, _ = lin.lstsq(micro_matrix.T, rhs, cond=rcond,
                                           lapack_driver='gelss')

    # reshape solution and orthonormalization
    # ---------------------------------------

    # first half sweep
    if direction == 'forward':
        # decompose solution
        [q, _] = lin.qr(solution.cores[i].reshape(solution.ranks[i] * solution.row_dims[i],
                                                  solution.ranks[i + 1]),
                        overwrite_a=True, mode='economic', check_finite=False)

        # set new rank
        solution.ranks[i + 1] = q.shape[1]

        # save orthonormal part
        solution.cores[i] = q.reshape(solution.ranks[i], solution.row_dims[i], 1,
                                      solution.ranks[i + 1])

    # second half sweep
    if direction == 'backward':
        if i > 0:
            # decompose solution
            [_, q] = lin.rq(solution.cores[i].reshape(solution.ranks[i],
                                                      solution.row_dims[i] * solution.ranks[i + 1]),
                            overwrite_a=True, mode='economic', check_finite=False)

            # set new rank
            solution.ranks[i] = q.shape[0]

            # save orthonormal part
            solution.cores[i] = q.reshape(solution.ranks[i], solution.row_dims[i], 1,
                                          solution.ranks[i + 1])
        else:
            # last iteration step
            solution.cores[i] = solution.cores[i].reshape(solution.ranks[i], solution.row_dims[i], 1,
                                                          solution.ranks[i + 1])
def test_simple_underdet(self):
    a = [[1, 2, 3], [4, 5, 6]]
    b = [1, 2]
    x, res, r, s = lstsq(a, b)
    # XXX: need independent check
    assert_array_almost_equal(x, [-0.05555556, 0.11111111, 0.27777778])
def estimate_dem_error(ts0, A0, tbase, drop_date=None, phaseVelocity=False, num_step=0):
    """Estimate DEM error with least square optimization.
    Parameters: ts0 : 2D np.array in size of (numDate, numPixel),
                    original displacement time-series
                A0 : 2D np.array in size of (numDate, model_num),
                    design matrix in [A_geom, A_def]
                tbase : 2D np.array in size of (numDate, 1),
                    temporal baseline
                drop_date : 1D np.array in bool data type,
                    mark the date used in the estimation
                phaseVelocity : bool,
                    use phase history or phase velocity for minimization
                num_step : int,
                    number of step functions at the end of the deformation model
    Returns:    delta_z : 2D np.array in size of (1, numPixel),
                    estimated DEM residual
                ts_cor : 2D np.array in size of (numDate, numPixel),
                    corrected timeseries = tsOrig - delta_z_phase
                ts_res : 2D np.array in size of (numDate, numPixel),
                    residual timeseries = tsOrig - delta_z_phase - defModel
                step_def : 2D np.array in size of (num_step, numPixel) or None,
                    estimated step-function coefficients
    Example:    delta_z, ts_cor, ts_res, step_def = estimate_dem_error(ts, A, tbase, drop_date)
    """
    if len(ts0.shape) == 1:
        ts0 = ts0.reshape(-1, 1)
    if drop_date is None:
        drop_date = np.ones(ts0.shape[0], np.bool_)

    # Prepare design matrix A and observations ts for inversion
    A = A0[drop_date, :]
    ts = ts0[drop_date, :]
    if phaseVelocity:
        tbase = tbase[drop_date, :]
        A = np.diff(A, axis=0) / np.diff(tbase, axis=0)
        ts = np.diff(ts, axis=0) / np.diff(tbase, axis=0)

    # Inverse using L-2 norm to get unknown parameters X
    # X = [delta_z, constC, vel, acc, deltaAcc, ..., step1, step2, ...]
    # equivalent to X = np.dot(np.dot(np.linalg.inv(np.dot(A.T, A)), A.T), ts)
    #               X = np.dot(np.linalg.pinv(A), ts)
    X = linalg.lstsq(A, ts, cond=1e-15)[0]

    # Prepare outputs
    delta_z = X[0, :]
    ts_cor = ts0 - np.dot(A0[:, 0].reshape(-1, 1), delta_z.reshape(1, -1))
    ts_res = ts0 - np.dot(A0, X)

    step_def = None
    if num_step > 0:
        step_def = X[-1 * num_step:, :].reshape(num_step, -1)

    # for debug
    debug_mode = False
    if debug_mode:
        import matplotlib.pyplot as plt
        fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1, figsize=(8, 8))
        ts_all = np.hstack((ts0, ts_res, ts_cor))
        ymin = np.min(ts_all)
        ymax = np.max(ts_all)
        ax1.plot(ts0, '.')
        ax1.set_ylim((ymin, ymax))
        ax1.set_title('Original Timeseries')
        ax2.plot(ts_cor, '.')
        ax2.set_ylim((ymin, ymax))
        ax2.set_title('Corrected Timeseries')
        ax3.plot(ts_res, '.')
        ax3.set_ylim((ymin, ymax))
        ax3.set_title('Fitting Residual')
        ax4.plot(ts_cor - ts_res, '.')
        ax4.set_ylim((ymin, ymax))
        ax4.set_title('Fitted Deformation Model')
        plt.show()
    return delta_z, ts_cor, ts_res, step_def
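# Hypothetical call with synthetic data (shapes follow the docstring above; the
# column layout of A0 is only illustrative): one geometry column, an intercept,
# and a linear-velocity column. Assumes `np` and `linalg` are imported in the
# module that defines estimate_dem_error.
import numpy as np

num_date, num_pixel = 8, 3
tbase = np.arange(num_date, dtype=np.float32).reshape(-1, 1)
A0 = np.hstack([np.random.randn(num_date, 1),   # A_geom column (assumed)
                np.ones((num_date, 1)),         # constant term
                tbase])                         # velocity term
ts0 = np.random.randn(num_date, num_pixel)
delta_z, ts_cor, ts_res, step_def = estimate_dem_error(ts0, A0, tbase)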
def test_simple_overdet(self):
    a = [[1, 2], [4, 5], [3, 4]]
    b = [1, 2, 3]
    x, res, r, s = lstsq(a, b)
    assert_array_almost_equal(x, direct_lstsq(a, b))
    assert_almost_equal((abs(dot(a, x) - b)**2).sum(axis=0), res)
def fit(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples,n_features]
        Training data

    y : numpy array of shape [n_samples, n_targets]
        Target values. Will be cast to X's dtype if necessary

    sample_weight : numpy array of shape [n_samples]
        Individual weights for each sample

        .. versionadded:: 0.17
           parameter *sample_weight* support to LinearRegression.

    Returns
    -------
    self : returns an instance of self.
    """
    n_jobs_ = self.n_jobs
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    if sp.issparse(X):
        if y.ndim < 2:
            out = sparse_lsqr(X, y)
            self.coef_ = out[0]
            self._residues = out[3]
        else:
            # sparse_lstsq cannot handle y with shape (M, K)
            outs = Parallel(n_jobs=n_jobs_)(
                delayed(sparse_lsqr)(X, y[:, j].ravel())
                for j in range(y.shape[1]))
            # collect into lists before stacking; passing a bare generator to
            # np.vstack is not supported by recent NumPy versions
            self.coef_ = np.vstack([out[0] for out in outs])
            self._residues = np.vstack([out[3] for out in outs])
    else:
        self.coef_, self._residues, self.rank_, self.singular_ = \
            linalg.lstsq(X, y)
        self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
def calc_risk_scores(bimfile_name, rs_id_map, phen_map, K_bins=1, out_file=None, verbose=False, cv_10fold=True, weights_file=None, print_effects=False): num_individs = len(phen_map) assert num_individs > 0, 'No individuals found. Problems parsing the phenotype file?' #print K_bins if K_bins > 1: prs_dict_bins = {} bk = 1 while bk <= K_bins: prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk] = sp.zeros(num_individs) bk += 1 #print prs_dict_bins.keys() if bimfile_name is not None: raw_effects_prs = sp.zeros(num_individs) pval_derived_effects_prs = sp.zeros(num_individs) bimf1 = re.sub(r"\[1:22\]", "[0-9]", bimfile_name) bimf2 = re.sub(r"\[1:22\]", "[0-2][0-9]", bimfile_name) bimfile_list = glob.glob(bimf1 + ".bim") + glob.glob(bimf2 + ".bim") bimfile_list.sort(key=natural_keys) for bimfile in bimfile_list: genotype_file = re.sub(r".bim", "", bimfile) print 'Get PRS on file %s' % bimfile prs_dict = get_prs_bins(genotype_file, rs_id_map, phen_map=phen_map, K_bins=K_bins, verbose=verbose) raw_effects_prs += prs_dict['raw_effects_prs'] pval_derived_effects_prs += prs_dict['pval_derived_effects_prs'] if K_bins > 1: bk = 1 while bk <= K_bins: prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk] += prs_dict[ "pval_derived_effects_prs_bin_%d" % bk] bk += 1 true_phens = prs_dict['true_phens'] raw_eff_corr = sp.corrcoef(raw_effects_prs, prs_dict['true_phens'])[0, 1] raw_eff_r2 = raw_eff_corr**2 pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, prs_dict['true_phens'])[0, 1] pval_eff_r2 = pval_eff_corr**2 print 'Final raw effects PRS correlation: %0.4f' % raw_eff_corr print 'Final raw effects PRS r2: %0.4f' % raw_eff_r2 print 'Final LDpred-funct-inf PRS correlation: %0.4f' % pval_eff_corr print 'Final LDpred-funct-inf PRS r2: %0.4f' % pval_eff_r2 if K_bins == 1: print "Since the selected/calculated number of bins is 1, LDpred-funct equals to LDpred-funct-inf" cv_effects_dir = {} if K_bins > 1: X = sp.ones((num_individs, 1)) bk = 1 while bk <= K_bins: prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk].shape = (len( prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk]), 1) X = sp.hstack( [X, prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk]]) #print(bk) #print(X[0:5,]) bk += 1 true_phens = sp.array(true_phens) true_phens.shape = (len(true_phens), 1) (betas, rss0, r, s) = linalg.lstsq(X, true_phens) ### In sample fit Y_pred = sp.dot(X, betas) Y_pred.shape = (len(true_phens), ) # Report prediction accuracy bin_in_sample_eff_corr = sp.corrcoef(Y_pred, prs_dict['true_phens'])[0, 1] bin_eff_r2 = bin_in_sample_eff_corr**2 print 'Final in-sample LDpredfunct (%d bins) PRS correlation: %0.4f' % ( K_bins, bin_in_sample_eff_corr) print 'Final in-sample LDpredfunct (%d bins) PRS R2: %0.4f' % ( K_bins, bin_eff_r2) print 'Final in-sample LDpredfunct (%d bins) PRS adjusted-R2: %0.4f' % ( K_bins, 1 - (1 - bin_eff_r2) * (len(true_phens) - 1) / (len(true_phens) - K_bins - 1)) ### if cv_10fold: test_size = len(true_phens) cv_fold_size = int(test_size / 10) bound_cv_test = [] for k in range(10): bound_cv_test.append(k * cv_fold_size) bound_cv_test.append(test_size - 1) bin_eff_r2_arr = [] for cv_iter in range(10): Xtrain = sp.copy(X) Xtest = sp.copy(X) Ytrain = sp.copy(true_phens) Ytest = sp.copy(true_phens) Xtest = Xtest[bound_cv_test[cv_iter]:bound_cv_test[cv_iter + 1], ] Ytest = Ytest[bound_cv_test[cv_iter]:bound_cv_test[cv_iter + 1]] Xtrain = sp.delete( Xtrain, range(bound_cv_test[cv_iter], bound_cv_test[cv_iter + 1]), 0) Ytrain = sp.delete( Ytrain, range(bound_cv_test[cv_iter], bound_cv_test[cv_iter + 
1]), 0) (betas, rss0, r, s) = linalg.lstsq(Xtrain, Ytrain) Y_pred = sp.dot(Xtest, betas) Y_pred.shape = (len(Ytest), ) Ytest.shape = (len(Ytest), ) # Report prediction accuracy bin_in_sample_eff_corr = sp.corrcoef(Y_pred, Ytest)[0, 1] bin_eff_r2 = bin_in_sample_eff_corr**2 bin_eff_r2_arr.append(bin_eff_r2) if print_effects: cv_effects_dir["cv_%d" % cv_iter] = bin_eff_r2_arr parse_ldpred_res_bins_regularized( weights_file, weights_out_file=weights_file + "cv_%d.txt" % cv_iter, weights=betas, rs_id_map=rs_id_map) print 'Final 10-fold cross validation LDpredfunct (%d bins) PRS average R2 : %0.4f ' % ( K_bins, sp.mean(bin_eff_r2_arr)) res_dict = {'pred_r2': pval_eff_r2} raw_effects_prs.shape = (len(raw_effects_prs), 1) pval_derived_effects_prs.shape = (len(pval_derived_effects_prs), 1) true_phens = sp.array(true_phens) true_phens.shape = (len(true_phens), 1) # Store covariate weights, slope, etc. weights_dict = {} num_individs = len(prs_dict['pval_derived_effects_prs']) # Write PRS out to file. if out_file != None: with open(out_file, 'w') as f: out_str = 'IID, true_phens, raw_effects_prs, pval_derived_effects_prs' if K_bins > 1: Kbins_str = ",".join("Bin_%d" % (1 + bin_i) for bin_i in range(K_bins)) out_str = out_str + ', ' + Kbins_str out_str += '\n' f.write(out_str) for i in range(num_individs): out_str = '%s, %0.6e, %0.6e, %0.6e ' % ( prs_dict['iids'][i], prs_dict['true_phens'][i], raw_effects_prs[i], pval_derived_effects_prs[i]) bins_prs_ind_i = [] if K_bins > 1: bk = 1 while bk <= K_bins: bins_prs_ind_i.append( str(prs_dict_bins["pval_derived_effects_prs_bin_%d" % bk][i][0])) bk += 1 Kbins_str = ', '.join(map(str, bins_prs_ind_i)) out_str = out_str + ', ' + Kbins_str out_str += '\n' f.write(out_str) # if weights_dict != None: # oh5f = h5py.File(out_file + '.weights.hdf5', 'w') # for k1 in weights_dict.keys(): # kg = oh5f.create_group(k1) # for k2 in weights_dict[k1]: # kg.create_dataset(k2, data=sp.array(weights_dict[k1][k2])) # oh5f.close() return res_dict
def fit(self, X, y, sample_weight=None): """ Fit linear model. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary sample_weight : array-like of shape (n_samples,), default=None Individual weights for each sample .. versionadded:: 0.17 parameter *sample_weight* support to LinearRegression. Returns ------- self : returns an instance of self. """ n_jobs_ = self.n_jobs X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, copy=self.copy_X, sample_weight=sample_weight, return_mean=True) if sample_weight is not None: # Sample weight can be implemented via a simple rescaling. X, y = _rescale_data(X, y, sample_weight) if sp.issparse(X): X_offset_scale = X_offset / X_scale def matvec(b): return X.dot(b) - b.dot(X_offset_scale) def rmatvec(b): return X.T.dot(b) - X_offset_scale * np.sum(b) X_centered = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec) if y.ndim < 2: out = sparse_lsqr(X_centered, y) self.coef_ = out[0] self._residues = out[3] else: # sparse_lstsq cannot handle y with shape (M, K) outs = Parallel(n_jobs=n_jobs_)( delayed(sparse_lsqr)(X_centered, y[:, j].ravel()) for j in range(y.shape[1])) self.coef_ = np.vstack([out[0] for out in outs]) self._residues = np.vstack([out[3] for out in outs]) else: self.coef_, self._residues, self.rank_, self.singular_ = \ linalg.lstsq(X, y) self.coef_ = self.coef_.T if y.ndim == 1: self.coef_ = np.ravel(self.coef_) self._set_intercept(X_offset, y_offset, X_scale) return self
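# Hedged usage sketch: in the dense branch above the fit reduces to
# scipy.linalg.lstsq(X, y); the public scikit-learn call looks roughly like
# this (version-dependent details such as `normalize` aside).
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1.0, 1.0], [1.0, 2.0], [2.0, 2.0], [2.0, 3.0]])
y = np.dot(X, np.array([1.0, 2.0])) + 3.0
reg = LinearRegression().fit(X, y)
print(reg.coef_, reg.intercept_)   # approximately [1. 2.] and 3.0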
def calc_risk_scores(bed_file, rs_id_map, phen_map, out_file=None, split_by_chrom=False, adjust_for_sex=False, adjust_for_covariates=False, adjust_for_pcs=False, non_zero_chromosomes=None, only_score=False, verbose=False, summary_dict=None): if verbose: print('Parsing PLINK bed file: %s' % bed_file) if split_by_chrom: num_individs = len(phen_map) assert num_individs > 0, 'No individuals found. Problems parsing the phenotype file?' pval_derived_effects_prs = sp.zeros(num_individs) for i in range(1, 23): if non_zero_chromosomes is None or i in non_zero_chromosomes: genotype_file = bed_file + '_%i_keep' % i if os.path.isfile(genotype_file + '.bed'): if verbose: print('Working on chromosome %d' % i) prs_dict = get_prs(genotype_file, rs_id_map, phen_map, only_score=only_score, verbose=verbose) pval_derived_effects_prs += prs_dict[ 'pval_derived_effects_prs'] elif verbose: print('Skipping chromosome') else: prs_dict = get_prs(bed_file, rs_id_map, phen_map, only_score=only_score, verbose=verbose) num_individs = len(prs_dict['iids']) pval_derived_effects_prs = prs_dict['pval_derived_effects_prs'] if only_score: write_only_scores_file(out_file, prs_dict, pval_derived_effects_prs) res_dict = {} elif sp.std(prs_dict['true_phens']) == 0: if verbose: print('No variance left to explain in phenotype.') res_dict = {'pred_r2': 0} else: # Report prediction accuracy assert len( phen_map ) > 0, 'No individuals found. Problems parsing the phenotype file?' # Store covariate weights, slope, etc. weights_dict = {} # Store Adjusted predictions adj_pred_dict = {} #If there is no prediction, then output 0s. if sp.std(pval_derived_effects_prs) == 0: res_dict = {'pred_r2': 0} weights_dict['unadjusted'] = { 'Intercept': 0, 'ldpred_prs_effect': 0 } else: pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, prs_dict['true_phens'])[0, 1] pval_eff_r2 = pval_eff_corr**2 res_dict = {'pred_r2': pval_eff_r2} pval_derived_effects_prs.shape = (len(pval_derived_effects_prs), 1) true_phens = sp.array(prs_dict['true_phens']) true_phens.shape = (len(true_phens), 1) # Direct effect Xs = sp.hstack( [pval_derived_effects_prs, sp.ones((len(true_phens), 1))]) (betas, rss00, r, s) = linalg.lstsq(sp.ones((len(true_phens), 1)), true_phens) (betas, rss, r, s) = linalg.lstsq(Xs, true_phens) pred_r2 = 1 - rss / rss00 weights_dict['unadjusted'] = { 'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0] } if verbose: print('PRS correlation: %0.4f' % pval_eff_corr) print('Variance explained (Pearson R2) by PRS: %0.4f' % pred_r2) # Adjust for sex if adjust_for_sex and 'sex' in prs_dict and len( prs_dict['sex']) > 0: sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, rss0, r, s) = linalg.lstsq( sp.hstack([sex, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, sex, sp.ones((len(true_phens), 1)) ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['sex_adj'] = { 'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0] } if verbose: print( 'Fitted effects (betas) for PRS, sex, and intercept on true phenotype:', betas) adj_pred_dict['sex_prs'] = sp.dot(Xs, betas) pred_r2 = 1 - rss_pd / rss0 print( 'Variance explained (Pearson R2) by PRS adjusted for Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['Sex_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print( 'Variance explained (Pearson R2) by PRS + Sex : %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['Sex_adj_pred_r2+Sex'] = pred_r2 # Adjust 
for PCs if adjust_for_pcs and 'pcs' in prs_dict and len( prs_dict['pcs']) > 0: pcs = prs_dict['pcs'] (betas, rss0, r, s) = linalg.lstsq( sp.hstack([pcs, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, sp.ones((len(true_phens), 1)), pcs ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['pc_adj'] = { 'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0], 'pcs': betas[2][0] } adj_pred_dict['pc_prs'] = sp.dot(Xs, betas) pred_r2 = 1 - rss_pd / rss0 print( 'Variance explained (Pearson R2) by PRS adjusted for PCs: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['PC_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print( 'Variance explained (Pearson R2) by PRS + PCs: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['PC_adj_pred_r2+PC'] = pred_r2 # Adjust for both PCs and Sex if adjust_for_sex and 'sex' in prs_dict and len( prs_dict['sex']) > 0: sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, rss0, r, s) = linalg.lstsq( sp.hstack([sex, pcs, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, sex, sp.ones((len(true_phens), 1)), pcs ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) weights_dict['sex_pc_adj'] = { 'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0], 'pcs': betas[3][0] } adj_pred_dict['sex_pc_prs'] = sp.dot(Xs, betas) pred_r2 = 1 - rss_pd / rss0 print( 'Variance explained (Pearson R2) by PRS adjusted for PCs and Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['PC_Sex_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print( 'Variance explained (Pearson R2) by PRS+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['PC_Sex_adj_pred_r2+PC_Sex'] = pred_r2 # Adjust for covariates if adjust_for_covariates and 'covariates' in prs_dict and len( prs_dict['covariates']) > 0: covariates = prs_dict['covariates'] (betas, rss0, r, s) = linalg.lstsq( sp.hstack([covariates, sp.ones((len(true_phens), 1))]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, covariates, sp.ones((len(true_phens), 1)) ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) adj_pred_dict['cov_prs'] = sp.dot(Xs, betas) pred_r2 = 1 - rss_pd / rss0 print( 'Variance explained (Pearson R2) by PRS adjusted for Covariates: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['Cov_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print( 'Variance explained (Pearson R2) by PRS + Cov: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['Cov_adj_pred_r2+Cov'] = pred_r2 if adjust_for_pcs and 'pcs' in prs_dict and len( prs_dict['pcs']) and 'sex' in prs_dict and len( prs_dict['sex']) > 0: pcs = prs_dict['pcs'] sex = sp.array(prs_dict['sex']) sex.shape = (len(sex), 1) (betas, rss0, r, s) = linalg.lstsq( sp.hstack([ covariates, sex, pcs, sp.ones((len(true_phens), 1)) ]), true_phens) Xs = sp.hstack([ pval_derived_effects_prs, covariates, sex, pcs, sp.ones((len(true_phens), 1)) ]) (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens) adj_pred_dict['cov_sex_pc_prs'] = sp.dot(Xs, betas) pred_r2 = 1 - rss_pd / rss0 print( 'Variance explained (Pearson R2) by PRS adjusted for Cov+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs))) res_dict['Cov_PC_Sex_adj_pred_r2'] = pred_r2 pred_r2 = 1 - rss_pd / rss00 print( 'Variance explained (Pearson R2) by PRS+Cov+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - 
pred_r2) / sp.sqrt(num_individs))) res_dict['Cov_PC_Sex_adj_pred_r2+Cov_PC_Sex'] = pred_r2 # Now calibration y_norm = (true_phens - sp.mean(true_phens)) / sp.std(true_phens) denominator = sp.dot(pval_derived_effects_prs.T, pval_derived_effects_prs) numerator = sp.dot(pval_derived_effects_prs.T, y_norm) regression_slope = (numerator / denominator)[0][0] if verbose: print( 'The slope for predictions with weighted effects is: %0.4f' % regression_slope) num_individs = len(prs_dict['pval_derived_effects_prs']) # Write PRS out to file. if out_file != None: write_scores_file(out_file, prs_dict, pval_derived_effects_prs, adj_pred_dict, weights_dict=weights_dict, verbose=verbose) return res_dict
def andersonAcceleration(self): maximalDimensionOfKrylovSpace = 5 isMaximalDimensionOfKrylovSpaceReached = False dimensionOfKrylovSpace = 0 b = 1.0 #Start iteration G = self.x_0 i = 0 while i < self.numberOfIterations and not self.isInterrupted: if dimensionOfKrylovSpace < maximalDimensionOfKrylovSpace: dimensionOfKrylovSpace += 1 else: isMaximalDimensionOfKrylovSpaceReached = True G_old = G #save previuos state G_new = self.fixPointOperator(G) #np.newaxis makes 2D array which can be transposed. Mathematically, #this is like interpreting a vector in R^n as Matrix in R^{n x 1} G_new_AsColumnVector = G_new[np.newaxis].T #Save state G_new in Matrix K if not isMaximalDimensionOfKrylovSpaceReached: if dimensionOfKrylovSpace == 1: #init K K = G_new_AsColumnVector else: #addColumnToMatrixByExtension K = np.concatenate((K, G_new_AsColumnVector), axis=1) else: #addColumnToMatrixByShifting K = np.roll(K, -1, axis=1) K[:, -1] = G_new #save residuum r in Matrix D r = G_new - G_old r_AsColumnVector = r[np.newaxis].T #Is now 2D array if not isMaximalDimensionOfKrylovSpaceReached: if dimensionOfKrylovSpace == 1: #init D D = r_AsColumnVector else: #addColumnToMatrixByExtension D = np.concatenate((D, r_AsColumnVector), axis=1) else: #addColumnToMatrixByShifting D = np.roll(D, -1, axis=1) D[:, -1] = r if dimensionOfKrylovSpace == 1: G = G_new continue #Only calculate W if needed if not isMaximalDimensionOfKrylovSpaceReached: W = self.calculateW(dimensionOfKrylovSpace) #Start solving least square problem D_reduced = np.dot(D, W) #D_reduced = DW #solve argmin(a) ||D_reduced a + r || a_reduced, resid, rank, sigma = linalg.lstsq(D_reduced, -r) a_AsColumnVector = np.dot(W, a_reduced[np.newaxis].T) a_AsColumnVector[-1] += 1.0 G_AsColumnVector = np.dot( K, a_AsColumnVector) - (1.0 - b) * np.dot(D, a_AsColumnVector) G = G_AsColumnVector[:, 0] #We only check for convergence if K and D don't grow anymore if isMaximalDimensionOfKrylovSpaceReached: norm = np.linalg.norm(G_new) previousNorm = np.linalg.norm(G_old) relativeProgress = abs(previousNorm - norm) / (norm + np.finfo(float).eps) if relativeProgress < self.convergenceCriterion: #Defined in super class #pass print "Anderson converged after", i, "steps" break i += 1 #end while if i == self.numberOfIterations: print "Anderson did not converge after", self.numberOfIterations, "steps" self.derivePhysicalQuantitiesFromFixpoint(G)
def LMqr(fun, pars, args, tau = 1e-3, eps1 = 1e-8, eps2 = 1e-8, kmax = 100, verbose = False): from scipy.linalg import lstsq import scipy.linalg """Implementation of the Levenberg-Marquardt algorithm in pure Python. Instead of using the normal equations this version uses QR factorization for enhanced accuracy. Significantly slower (factor 2).""" p = pars f, J = fun(p, *args) A = inner(J,J) g = inner(J,f) I = eye(len(p)) k = 0; nu = 2 mu = tau * max(diag(A)) stop = norm(g, Inf) < eps1 while not stop and k < kmax: k += 1 if verbose: print("step %d: |f|: %9.3g mu: %g"%(k, norm(f), mu)) tic = time.time() A = inner(J, J) g = inner(J, f) d = solve( A + mu*I, -g) print ('XX', d, time.time() - tic) des = numpy.hstack((-f, numpy.zeros((len(p),)))) Des = numpy.vstack((numpy.transpose(J), numpy.sqrt(mu)*I)) tic = time.time() d0, resids, rank, s = lstsq(Des, des) print('d0', d0, time.time() - tic) tic = time.time() #q, r = scipy.linalg.qr(Des, econ = True, mode = 'qr') #d4 = solve(r, inner(numpy.transpose(q), des)) r = scipy.linalg.qr(Des, econ = True, mode = 'r') d4 = scipy.linalg.cho_solve( (r, False), -inner(J, f)) print('d4', d4, time.time() - tic) tic = time.time() q, r = scipy.linalg.qr(numpy.transpose(J), econ = True, mode = 'qr') d3 = solve( r + mu*numpy.linalg.inv(r.transpose()), -inner(numpy.transpose(q),f)) #d3 = scipy.linalg.cho_solve( (r + mu*numpy.linalg.inv(r.transpose()), False), # -inner(numpy.transpose(q),f)) print ('d3', d3, time.time() - tic) print (d - d0) print (d3 - d0) print (d4 - d0) if norm(d) < eps2*(norm(p) + eps2): stop = True reason = 'small step' break pnew = p + d fnew, Jnew = fun(pnew, *args) rho = (norm(f) - norm(fnew))/inner(d, mu*d - g) # /2???? if rho > 0: p = pnew #A = inner(Jnew, Jnew) #g = inner(Jnew, fnew) f = fnew J = Jnew if (norm(g, Inf) < eps1): # or norm(fnew) < eps3): stop = True reason = "small gradient" break mu = mu * max(1.0/3, 1 - (2*rho - 1)**3) nu = 2 else: mu = mu * nu nu = 2*nu else: reason = "max iter reached" if verbose: print (reason) return p
def N4SID(u,y,NumRows,NumCols,NSig,require_stable=False): """ A,B,C,D,Cov,Sigma = N4SID(u,y,NumRows,NumCols,n,require_stable=False) Let NumVals be the number of input and output values available In this case: u - NumInputs x NumVals array of inputs y - NumOutputs x NumVals array of outputs NumRows - Number of block rows in the past and future block Hankel matrices NumCols - Number of columns in the past and future block Hankel matrices n - desired state dimension. For the algorithm to work, you must have: NumVals >= 2*NumRows + NumCols - 1 Returns A,B,C,D - the state space realization from inputs to outputs Cov - the joint covariance of the process and measurement noise Sigma - the singular values of the oblique projection of row space of future outputs along row space of future inputs on the row space of past inputs and outputs. Examining Sigma can be used to determine the required state dimension require_stable - An optional boolean parameter. Default is False If False, the standard N4SID algorithm is used If True, the state matrix, A, will have spectral radius < 1. In order to run with require_stable=True, cvxpy must be installed. """ NumInputs = u.shape[0] NumOutputs = y.shape[0] NumDict = {'Inputs': NumInputs, 'Outputs': NumOutputs, 'Dimension':NSig, 'Rows':NumRows, 'Columns':NumCols} GammaDict,S = preProcess(u,y,NumDict) GamData = GammaDict['Data'] GamYData = GammaDict['DataY'] if not require_stable: K = la.lstsq(GamData.T,GamYData.T)[0].T else: Kvar = cvx.Variable(NSig+NumOutputs,NSig+NumInputs*NumRows) Avar = Kvar[:NSig,:NSig] Pvar = cvx.Semidef(NSig) LyapCheck = cvx.vstack(cvx.hstack(Pvar,Avar), cvx.hstack(Avar.T,np.eye(NSig))) Constraints = [LyapCheck>>0,Pvar << np.eye(NSig)] diffVar = GamYData - Kvar*GamData objFun = cvx.norm(diffVar,'fro') Objective = cvx.Minimize(objFun) Prob = cvx.Problem(Objective,Constraints) result = Prob.solve() K = Kvar.value AID,BID,CID,DID,CovID = postProcess(K,GammaDict,NumDict) return AID,BID,CID,DID,CovID,S
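# Hypothetical call with random data, assuming the helper functions preProcess
# and postProcess referenced above are available in the same module. Note the
# requirement NumVals >= 2*NumRows + NumCols - 1 from the docstring.
import numpy as np

num_vals, num_rows, num_cols, n_states = 220, 10, 180, 2
u = np.random.randn(1, num_vals)   # single input channel
y = np.random.randn(1, num_vals)   # single output channel
A, B, C, D, Cov, Sigma = N4SID(u, y, num_rows, num_cols, n_states)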
def _apply_rap_music(data, info, times, forward, noise_cov, n_dipoles=2, picks=None): """RAP-MUSIC for evoked data. Parameters ---------- data : array, shape (n_channels, n_times) Evoked data. info : dict Measurement info. times : array Times. forward : instance of Forward Forward operator. noise_cov : instance of Covariance The noise covariance. n_dipoles : int The number of dipoles to estimate. The default value is 2. picks : array-like of int | None Indices (in info) of data channels. If None, MEG and EEG data channels (without bad channels) will be used. Returns ------- dipoles : list of instances of Dipole The dipole fits. explained_data : array | None Data explained by the dipoles using a least square fitting with the selected active dipoles and their estimated orientation. Computed only if return_explained_data is True. """ is_free_ori, ch_names, proj, vertno, G = _prepare_beamformer_input( info, forward, label=None, picks=picks, pick_ori=None) gain = G.copy() # Handle whitening + data covariance whitener, _ = compute_whitener(noise_cov, info, picks) if info['projs']: whitener = np.dot(whitener, proj) # whiten the leadfield and the data G = np.dot(whitener, G) data = np.dot(whitener, data) eig_values, eig_vectors = linalg.eigh(np.dot(data, data.T)) phi_sig = eig_vectors[:, -n_dipoles:] n_orient = 3 if is_free_ori else 1 n_channels = G.shape[0] A = np.empty((n_channels, n_dipoles)) gain_dip = np.empty((n_channels, n_dipoles)) oris = np.empty((n_dipoles, 3)) poss = np.empty((n_dipoles, 3)) G_proj = G.copy() phi_sig_proj = phi_sig.copy() for k in range(n_dipoles): subcorr_max = -1. for i_source in range(G.shape[1] // n_orient): idx_k = slice(n_orient * i_source, n_orient * (i_source + 1)) Gk = G_proj[:, idx_k] if n_orient == 3: Gk = np.dot(Gk, forward['source_nn'][idx_k]) subcorr, ori = _compute_subcorr(Gk, phi_sig_proj) if subcorr > subcorr_max: subcorr_max = subcorr source_idx = i_source source_ori = ori if n_orient == 3 and source_ori[-1] < 0: # make sure ori is relative to surface ori source_ori *= -1 # XXX source_pos = forward['source_rr'][i_source] if n_orient == 1: source_ori = forward['source_nn'][i_source] idx_k = slice(n_orient * source_idx, n_orient * (source_idx + 1)) Ak = G[:, idx_k] if n_orient == 3: Ak = np.dot(Ak, np.dot(forward['source_nn'][idx_k], source_ori)) A[:, k] = Ak.ravel() gain_k = gain[:, idx_k] if n_orient == 3: gain_k = np.dot(gain_k, np.dot(forward['source_nn'][idx_k], source_ori)) gain_dip[:, k] = gain_k.ravel() oris[k] = source_ori poss[k] = source_pos logger.info("source %s found: p = %s" % (k + 1, source_idx)) if n_orient == 3: logger.info("ori = %s %s %s" % tuple(oris[k])) projection = _compute_proj(A[:, :k + 1]) G_proj = np.dot(projection, G) phi_sig_proj = np.dot(projection, phi_sig) sol = linalg.lstsq(A, data)[0] explained_data = np.dot(gain_dip, sol) residual = data - np.dot(whitener, explained_data) gof = 1. - np.sum(residual**2, axis=0) / np.sum(data**2, axis=0) return _make_dipoles(times, poss, oris, sol, gof), explained_data
def partial_corr(C):
    """
    Partial Correlation in Python (clone of Matlab's partialcorr)

    This uses the linear regression approach to compute the partial
    correlation (might be slow for a huge number of variables). The
    algorithm is detailed here:

        http://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression

    Taking X and Y two variables of interest and Z the matrix with all the
    variables minus {X, Y}, the algorithm can be summarized as

        1) perform a normal linear least-squares regression with X as the
           target and Z as the predictor
        2) calculate the residuals in Step #1
        3) perform a normal linear least-squares regression with Y as the
           target and Z as the predictor
        4) calculate the residuals in Step #3
        5) calculate the correlation coefficient between the residuals from
           Steps #2 and #4

    The result is the partial correlation between X and Y while controlling
    for the effect of Z.

    Date: Nov 2014
    Author: Fabian Pedregosa-Izquierdo, [email protected]
    Testing: Valentina Borghesani, [email protected]

    Returns the sample linear partial correlation coefficients between pairs
    of variables in C, controlling for the remaining variables in C.

    Parameters
    ----------
    C : array-like, shape (n, p)
        Array with the different variables. Each column of C is taken as a
        variable.

    Returns
    -------
    P : array-like, shape (p, p)
        P[i, j] contains the partial correlation of C[:, i] and C[:, j]
        controlling for the remaining variables in C.
    """
    C = np.asarray(C)
    p = C.shape[1]
    P_corr = np.zeros((p, p), dtype=float)   # np.float is deprecated; use the builtin
    for i in range(p):
        P_corr[i, i] = 1
        for j in range(i + 1, p):
            idx = np.ones(p, dtype=bool)     # np.bool is deprecated; use the builtin
            idx[i] = False
            idx[j] = False
            beta_i = linalg.lstsq(C[:, idx], C[:, j])[0]
            beta_j = linalg.lstsq(C[:, idx], C[:, i])[0]

            res_j = C[:, j] - C[:, idx].dot(beta_i)
            res_i = C[:, i] - C[:, idx].dot(beta_j)

            # corr = sp.pearsonr(res_i, res_j)[0]
            corr = sp.spearmanr(res_i, res_j, nan_policy='omit')[0]

            P_corr[i, j] = corr
            P_corr[j, i] = corr

    return P_corr
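# Minimal usage sketch with synthetic data: two columns driven by a common third
# column should show a near-zero partial correlation once that driver is
# controlled for, even though their plain correlation is close to 1.
import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(500)
x = z + 0.1 * rng.randn(500)
y = z + 0.1 * rng.randn(500)
C = np.column_stack([x, y, z])
print(partial_corr(C)[0, 1])        # close to 0
print(np.corrcoef(x, y)[0, 1])      # close to 1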
def lin_leastsq(model, points, vals, errs=None, fullOutput=False, **keywords):
    '''
    Performs linear least squares on a function & dataset
    @param model function to be fit to. Contains function basis(), which is an
        array of funcs that take points as arguments to form the design matrix.
        Or can be a list of said functions
    @param points[i][j] coordinates of data points, where points[i] are individual
        datapoints and points[i][j] are components of datapoint position
    @param vals value of data at each position in points
    @param errs error in vals at each position in points
    @param fullOutput selects how much info to return
    @returns params, {chisq, covar}, isConverged where:
        params is a list of best fit parameters
        chisq is chisq or equivalent Student-t distribution
        covar is the covariance matrix
        isConverged is a bool describing convergence (always true)
    '''
    log.log(12, 'Entering lin_leastsq')

    ###################
    def makeMatrixs(points, vals, errs):
        A = numpy.zeros((len(points), len(basis)))
        if errs is None:
            for i in range(len(points)):
                for j in range(len(basis)):
                    A[i, j] = basis[j](points[i])
            b = vals
        else:
            b = numpy.zeros((len(points), 1))   # was a bare `zeros`; qualify with numpy
            for i in range(len(points)):
                for j in range(len(basis)):
                    A[i, j] = (basis[j](points[i])) / errs[i]
                b[i] = vals[i] / errs[i]
        return A, b

    #####################
    def calcCovarMatrix(params):
        u, s, v = linalg.svd(A)
        covar = numpy.zeros((len(params), len(params)))
        for i in range(len(params)):
            for j in range(len(params)):
                for k in range(len(params)):
                    covar[i, j] += v[i, k] * v[j, k] / s[k]
                covar[j, i] = covar[i, j]
        return covar

    #####################
    def calcResiduals(params, x, y, errs):
        predicted = model(x, params)
        if errs is None:
            return y - predicted
        else:
            return numpy.divide(y - predicted, errs)

    #####################
    if hasattr(model, '__getitem__'):
        basis = model
    else:
        basis = model.basis()

    points, vals, errs = _prepData(points, vals, errs)
    A, b = makeMatrixs(points, vals, errs)
    (params, resids, rank, s) = linalg.lstsq(A, b)

    if fullOutput:
        covar = calcCovarMatrix(params)
        chisq = numpy.sum(calcResiduals(params, points, vals, errs)**2)
        log.log(12, 'Returning from lin_leastsq: fullOutput')
        return params, chisq, covar, True
    log.log(12, 'Returning from lin_leastsq')
    return params, True
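# Standalone sketch of the same idea without the private _prepData helper:
# build the design matrix from a list of basis functions and solve it directly
# with scipy.linalg.lstsq (the basis and data below are illustrative only).
import numpy as np
from scipy import linalg

basis = [lambda p: 1.0, lambda p: p[0], lambda p: p[0] ** 2]
points = [(x,) for x in np.linspace(-1.0, 1.0, 50)]
vals = np.array([2.0 - 0.5 * p[0] + 3.0 * p[0] ** 2 for p in points])

A = np.array([[f(p) for f in basis] for p in points])
params = linalg.lstsq(A, vals)[0]
print(params)   # approximately [2.0, -0.5, 3.0]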
def nnmf_sparse(V0, XYZ0, W0, B0, S0, tolfun=1e-4, miniter=10, maxiter=100, timeseries_mean=1.0, timepoints=None, verbosity=1): ''' cell detection via nonnegative matrix factorization with sparseness projection V0 = voxel_timeseries_valid XYZ0 = voxel_xyz_valid W0 = cell_weight_init_valid B0 = cell_neighborhood_valid S0 = cell_sparseness ''' import os import numpy as np from scipy import stats from scipy import linalg from skimage import measure from voluseg._tools.sparseness_projection import sparseness_projection os.environ['MKL_NUM_THREADS'] = '1' # CAUTION: variable is modified in-place to save memory V0 *= (timeseries_mean / V0.mean(1)[:, None]) # normalize voxel timeseries if not timepoints is None: V = V0[:, timepoints].astype(float) # copy input signal else: V = V0.astype(float) # copy input signal XYZ = XYZ0.astype(int) W = W0.astype(float) B = B0.astype(bool) S = S0.copy() # get dimensions n, t = V.shape n_, c = W.shape assert (n_ == n) H = np.zeros((c, t)) # zero timeseries array dnorm_prev = np.full(2, np.inf) # last two d-norms for ii in range(maxiter): # save current states H_ = H.copy() # Alternate least squares with regularization H = np.maximum(linalg.lstsq(W, V)[0], 0) H *= (timeseries_mean / H.mean(1)[:, None] ) # normalize component timeseries W = np.maximum(linalg.lstsq(V.T, H.T)[0], 0) W[np.logical_not(B)] = 0 # restrict component boundaries for ci in range(c): W_ci = W[B[:, ci], ci] if np.any(W_ci) and (S[ci] > 0): # get relative dimensions of component XYZ_ci = XYZ[B[:, ci]] - XYZ[B[:, ci]].min(0) # enforce component sparseness and percentile threshold W_ci = sparseness_projection(W_ci, S[ci], at_least_as_sparse=True) # retain largest connected component (mode) L_ci = np.zeros(np.ptp(XYZ_ci, 0) + 1, dtype=bool) L_ci[tuple(zip(*XYZ_ci))] = W_ci > 0 L_ci = measure.label(L_ci, connectivity=3) lci_mode = stats.mode(L_ci[L_ci > 0]).mode[0] W_ci[L_ci[tuple(zip(*XYZ_ci))] != lci_mode] = 0 W[B[:, ci], ci] = W_ci # Get norm of difference and check for convergence dnorm = np.sqrt(np.mean(np.square(V - W.dot(H)))) / timeseries_mean diffh = np.sqrt(np.mean(np.square(H - H_))) / timeseries_mean if ((dnorm_prev.max(0) - dnorm) < tolfun) & (diffh < tolfun): if (ii >= miniter): break dnorm_prev[1] = dnorm_prev[0] dnorm_prev[0] = dnorm if verbosity: print((ii, dnorm, diffh)) # Perform final regression on full input timeseries H = np.maximum(linalg.lstsq(W, V0)[0], 0) H *= (timeseries_mean / H.mean(1)[:, None] ) # normalize component timeseries return (W, H, dnorm)
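# Minimal sketch of the alternating-least-squares core used above, stripped of
# the sparseness projection and connected-component filtering (all shapes are
# assumed): update H from W, then W from H, clipping negatives to zero each time.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
V = np.abs(rng.randn(50, 30))      # voxels x timepoints
W = np.abs(rng.randn(50, 4))       # voxels x components
for _ in range(10):
    H = np.maximum(linalg.lstsq(W, V)[0], 0)        # components x timepoints
    W = np.maximum(linalg.lstsq(V.T, H.T)[0], 0)    # voxels x components
print(np.sqrt(np.mean(np.square(V - W.dot(H)))))    # reconstruction error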
def simcond(self, xo, method='approx', i_unknown=None): """ Simulate values conditionally on observed known values Parameters ---------- x : vector timeseries including missing data. (missing data must be NaN if i_unknown is not given) Assumption: The covariance of x is equal to self and have the same sample period. method : string defining method used in the conditional simulation. Options are: 'approximate': Condition only on the closest points. Quite fast 'exact' : Exact simulation. Slow for large data sets, may not return any result due to near singularity of the covariance matrix. i_unknown : integers indices to spurious or missing data in x Returns ------- sample : ndarray a random sample of the missing values conditioned on the observed data. mu, sigma : ndarray mean and standard deviation, respectively, of the missing values conditioned on the observed data. Notes ----- SIMCOND generates the missing values from x conditioned on the observed values assuming x comes from a multivariate Gaussian distribution with zero expectation and Auto Covariance function R. See also -------- CovData1D.sim TimeSeries.reconstruct, rndnormnd References ---------- Brodtkorb, P, Myrhaug, D, and Rue, H (2001) "Joint distribution of wave height and wave crest velocity from reconstructed data with application to ringing" Int. Journal of Offshore and Polar Engineering, Vol 11, No. 1, pp 23--32 Brodtkorb, P, Myrhaug, D, and Rue, H (1999) "Joint distribution of wave height and wave crest velocity from reconstructed data" in Proceedings of 9th ISOPE Conference, Vol III, pp 66-73 """ x = atleast_1d(xo).ravel() acf = self._get_acf() num_x = len(x) num_acf = len(acf) if i_unknown is not None: x[i_unknown] = nan i_unknown = flatnonzero(isnan(x)) num_unknown = len(i_unknown) mu1o = zeros((num_unknown,)) mu1o_std = zeros((num_unknown,)) sample = zeros((num_unknown,)) if num_unknown == 0: warnings.warn('No missing data, no point to continue.') return sample, mu1o, mu1o_std if num_unknown == num_x: warnings.warn('All data missing, returning sample from' + ' the apriori distribution.') mu1o_std = ones(num_unknown) * sqrt(acf[0]) return self.sim(ns=num_unknown, cases=1)[:, 1], mu1o, mu1o_std i_known = flatnonzero(1 - isnan(x)) if method.startswith('exac'): # exact but slow. 
It also may not return any result if num_acf > 0.3 * num_x: sigma = toeplitz(hstack((acf, zeros(num_x - num_acf)))) else: acf[0] = acf[0] * 1.00001 sigma = sptoeplitz(hstack((acf, zeros(num_x - num_acf)))) soo, so1, s11 = self._split_cov(sigma, i_known, i_unknown) if issparse(sigma): so1 = so1.todense() s11 = s11.todense() s1o_sooinv = spsolve(soo + soo.T, 2 * so1).T else: sooinv_so1, _res, _rank, _s = lstsq(soo + soo.T, 2 * so1, cond=1e-4) s1o_sooinv = sooinv_so1.T mu1o = s1o_sooinv.dot(x[i_known]) sigma1o = s11 - s1o_sooinv.dot(so1) if (diag(sigma1o) < 0).any(): raise ValueError('Failed to converge to a solution') mu1o_std = sqrt(diag(sigma1o)) sample[:] = rndnormnd(mu1o, sigma1o, cases=1).ravel() elif method.startswith('appr'): # approximating by only condition on the closest points num_sig = min(2 * num_acf, num_x) sigma = toeplitz(hstack((acf, zeros(num_sig - num_acf)))) overlap = int(num_sig / 4) # indices to the points used idx = r_[0:num_sig] + max(0, min(i_unknown[0] - overlap, num_x - num_sig)) mask_unknown = zeros(num_x, dtype=bool) # temporary storage of indices to missing points mask_unknown[i_unknown] = True t_unknown = where(mask_unknown[idx])[0] t_known = where(1 - mask_unknown[idx])[0] ns = len(t_unknown) # number of missing data in the interval num_restored = 0 # number of previously simulated points x2 = x.copy() while ns > 0: soo, so1, s11 = self._split_cov(sigma, t_known, t_unknown) if issparse(soo): so1 = so1.todense() s11 = s11.todense() s1o_sooinv = spsolve(soo + soo.T, 2 * so1).T else: sooinv_so1, _res, _rank, _s = lstsq(soo + soo.T, 2 * so1, cond=1e-4) s1o_sooinv = sooinv_so1.T sigma1o = s11 - s1o_sooinv.dot(so1) if (diag(sigma1o) < 0).any(): raise ValueError('Failed to converge to a solution') ix = slice((num_restored), (num_restored + ns)) # standard deviation of the expected surface mu1o_std[ix] = np.maximum(mu1o_std[ix], sqrt(diag(sigma1o))) # expected surface conditioned on the closest known # observations from x mu1o[ix] = s1o_sooinv.dot(x2[idx[t_known]]) # sample conditioned on the known observations from x mu1os = s1o_sooinv.dot(x[idx[t_known]]) sample[ix] = rndnormnd(mu1os, sigma1o, cases=1) if idx[-1] == num_x - 1: ns = 0 # no more points to simulate else: x2[idx[t_unknown]] = mu1o[ix] # expected surface x[idx[t_unknown]] = sample[ix] # sampled surface # removing indices to data which has been simulated mask_unknown[idx[:-overlap]] = False # data we want to simulate once more nw = sum(mask_unknown[idx[-overlap:]] is True) num_restored += ns - nw # update # points simulated so far idx = self._update_window(idx, i_unknown, num_x, num_acf, overlap, nw, num_restored) # find new interval with missing data t_unknown = flatnonzero(mask_unknown[idx]) t_known = flatnonzero(1 - mask_unknown[idx]) ns = len(t_unknown) # # missing data in the interval return sample, mu1o, mu1o_std
def test_check_finite(self):
    a = [[1, 20], [-30, 4]]
    for b in ([[1, 0], [0, 1]], [1, 0], [[2, 1], [-30, 4]]):
        x = lstsq(a, b, check_finite=False)[0]
        assert_array_almost_equal(dot(a, x), b)
def fit(self, source, destination, order=4, reg=1e-5, center=True, match='oct5', verbose=None): """Fit the warp from source points to destination points. Parameters ---------- source : array, shape (n_src, 3) The source points. destination : array, shape (n_dest, 3) The destination points. order : int Order of the spherical harmonic fit. reg : float Regularization of the TPS warp. center : bool If True, center the points by fitting a sphere to points that are in a reasonable region for head digitization. match : str The uniformly-spaced points to match on the two surfaces. Can be "ico#" or "oct#" where "#" is an integer. The default is "oct5". %(verbose)s Returns ------- inst : instance of SphericalSurfaceWarp The warping object (for chaining). """ from .bem import _fit_sphere from .source_space import _check_spacing match_rr = _check_spacing(match, verbose=False)[2]['rr'] logger.info('Computing TPS warp') src_center = dest_center = np.zeros(3) if center: logger.info(' Centering data') hsp = np.array( [p for p in source if not (p[2] < -1e-6 and p[1] > 1e-6)]) src_center = _fit_sphere(hsp, disp=False)[1] source = source - src_center hsp = np.array( [p for p in destination if not (p[2] < 0 and p[1] > 0)]) dest_center = _fit_sphere(hsp, disp=False)[1] destination = destination - dest_center logger.info(' Using centers %s -> %s' % (np.array_str( src_center, None, 3), np.array_str(dest_center, None, 3))) self._fit_params = dict(n_src=len(source), n_dest=len(destination), match=match, n_match=len(match_rr), order=order, reg=reg) assert source.shape[1] == destination.shape[1] == 3 self._destination = destination.copy() # 1. Compute spherical coordinates of source and destination points logger.info(' Converting to spherical coordinates') src_rad_az_pol = _cart_to_sph(source).T dest_rad_az_pol = _cart_to_sph(destination).T match_rad_az_pol = _cart_to_sph(match_rr).T del match_rr # 2. Compute spherical harmonic coefficients for all points logger.info(' Computing spherical harmonic approximation with ' 'order %s' % order) src_sph = _compute_sph_harm(order, *src_rad_az_pol[1:]) dest_sph = _compute_sph_harm(order, *dest_rad_az_pol[1:]) match_sph = _compute_sph_harm(order, *match_rad_az_pol[1:]) # 3. Fit spherical harmonics to both surfaces to smooth them src_coeffs = linalg.lstsq(src_sph, src_rad_az_pol[0])[0] dest_coeffs = linalg.lstsq(dest_sph, dest_rad_az_pol[0])[0] # 4. Smooth both surfaces using these coefficients, and evaluate at # the "shape" points logger.info(' Matching %d points (%s) on smoothed surfaces' % (len(match_sph), match)) src_rad_az_pol = match_rad_az_pol.copy() src_rad_az_pol[0] = np.abs(np.dot(match_sph, src_coeffs)) dest_rad_az_pol = match_rad_az_pol.copy() dest_rad_az_pol[0] = np.abs(np.dot(match_sph, dest_coeffs)) # 5. Convert matched points to Cartesion coordinates and put back source = _sph_to_cart(src_rad_az_pol.T) source += src_center destination = _sph_to_cart(dest_rad_az_pol.T) destination += dest_center # 6. Compute TPS warp of matched points from smoothed surfaces self._warp = _TPSWarp().fit(source, destination, reg) self._matched = np.array([source, destination]) logger.info('[done]') return self
def img_to_signals_maps(imgs, maps_img, mask_img=None):
    """Extract region signals from image.

    This function is applicable to regions defined by maps.

    Parameters
    ----------
    imgs: Niimg-like object
        See http://nilearn.github.io/manipulating_images/input_output.html
        Input images.

    maps_img: Niimg-like object
        See http://nilearn.github.io/manipulating_images/input_output.html
        regions definition as maps (array of weights).
        shape: imgs.shape + (region number, )

    mask_img: Niimg-like object
        See http://nilearn.github.io/manipulating_images/input_output.html
        mask to apply to regions before extracting signals. Every point
        outside the mask is considered as background (i.e. outside of any
        region).

    Returns
    -------
    region_signals: numpy.ndarray
        Signals extracted from each region.
        Shape is: (scans number, number of regions intersecting mask)

    labels: list
        maps_img[..., labels[n]] is the region that has been used to extract
        signal region_signals[:, n].

    See also
    --------
    nilearn.regions.img_to_signals_labels
    nilearn.regions.signals_to_img_maps
    """
    maps_img = _utils.check_niimg_4d(maps_img)
    imgs = _utils.check_niimg_4d(imgs)
    affine = imgs.affine
    shape = imgs.shape[:3]

    # Check shapes and affines.
    if maps_img.shape[:3] != shape:
        raise ValueError("maps_img and imgs shapes must be identical.")
    if abs(maps_img.affine - affine).max() > 1e-9:
        raise ValueError("maps_img and imgs affines must be identical")

    maps_data = _safe_get_data(maps_img, ensure_finite=True)

    if mask_img is not None:
        mask_img = _utils.check_niimg_3d(mask_img)
        if mask_img.shape != shape:
            raise ValueError("mask_img and imgs shapes must be identical.")
        if abs(mask_img.affine - affine).max() > 1e-9:
            raise ValueError("mask_img and imgs affines must be identical")
        maps_data, maps_mask, labels = \
            _trim_maps(maps_data,
                       _safe_get_data(mask_img, ensure_finite=True),
                       keep_empty=True)
        maps_mask = _utils.as_ndarray(maps_mask, dtype=bool)   # np.bool is deprecated; use the builtin
    else:
        maps_mask = np.ones(maps_data.shape[:3], dtype=bool)
        labels = np.arange(maps_data.shape[-1], dtype=int)     # np.int is deprecated; use the builtin

    data = _safe_get_data(imgs, ensure_finite=True)
    region_signals = linalg.lstsq(maps_data[maps_mask, :],
                                  data[maps_mask, :])[0].T

    return region_signals, list(labels)
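# Plain-array sketch of the final regression step above (shapes assumed):
# regress every scan onto the per-region map weights restricted to the mask,
# which is exactly the linalg.lstsq call that produces region_signals.
import numpy as np
from scipy import linalg

n_voxels, n_scans, n_regions = 200, 10, 3
maps_data = np.random.rand(n_voxels, n_regions)   # region weight maps inside the mask
data = np.random.rand(n_voxels, n_scans)          # image data inside the mask
region_signals = linalg.lstsq(maps_data, data)[0].T
print(region_signals.shape)                       # (n_scans, n_regions)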