def HolsteinPrimakoff(generation, lattice="cactus", periodic=True):
    """
    Given a generation we want to build the cactus out to, construct the
    matrix and return the eigenvalues. We do this in the semi-roundabout way
    proposed by Mucciolo, Castro Neto, and Chamon in PRB 69, 214424, so what
    is returned is the true eigenspectrum and an eigenvector matrix
    representing the rotations of a Bogoliubov-type transformation.
    """
    if lattice == "cactus":
        H = HusimiHamiltonian(generation, periodic)
    elif lattice == "triangle":
        H = TriangleHamiltonian(generation)
    else:
        raise ValueError("Options are 'cactus' and 'triangle'")
    l = H.shape[0] // 2
    K = H[:l, :l]
    L = H[:l, l:]
    squaredDiff = scipy.dot(K, K) - scipy.dot(L, L)
    commutator = scipy.dot(L, K) - scipy.dot(K, L)
    if scipy.sum(commutator) == 0.0:
        eigVals, eigVects = scipy.linalg.eigh(squaredDiff)
    else:
        eigVals, eigVects = scipy.linalg.eig(squaredDiff - commutator)
    # The 'real' is not a cheat -- zero eigenvalues could come out slightly
    # negative as a result of roundoff; this takes that into account.
    eigVals = scipy.real(scipy.sqrt(eigVals))
    return eigVals, eigVects
def cov_dvrpmllbb_to_vxyz_single(d, e_d, e_vr, pmll, pmbb, cov_pmllbb, l, b):
    """
    NAME:
       cov_dvrpmllbb_to_vxyz
    PURPOSE:
       propagate distance, radial velocity, and proper-motion uncertainties
       to Galactic coordinates for scalar inputs
    INPUT:
       d - distance [kpc, as/mas for plx]
       e_d - distance uncertainty [kpc, [as/mas] for plx]
       e_vr - line-of-sight velocity uncertainty [km/s]
       pmll - proper motion in l (*cos(b)) [ [as/mas]/yr ]
       pmbb - proper motion in b [ [as/mas]/yr ]
       cov_pmllbb - uncertainty covariance for proper motion
       l - Galactic longitude [rad]
       b - Galactic latitude [rad]
    OUTPUT:
       cov(vx,vy,vz) [3,3]
    HISTORY:
       2010-04-12 - Written - Bovy (NYU)
    """
    M = _K * sc.array([[pmll, d, 0.], [pmbb, 0., d]])
    cov_dpmllbb = sc.zeros((3, 3))
    cov_dpmllbb[0, 0] = e_d**2.
    cov_dpmllbb[1:3, 1:3] = cov_pmllbb
    cov_vlvb = sc.dot(M, sc.dot(cov_dpmllbb, M.T))
    cov_vrvlvb = sc.zeros((3, 3))
    cov_vrvlvb[0, 0] = e_vr**2.
    cov_vrvlvb[1:3, 1:3] = cov_vlvb
    R = sc.array([[m.cos(l) * m.cos(b), m.sin(l) * m.cos(b), m.sin(b)],
                  [-m.sin(l), m.cos(l), 0.],
                  [-m.cos(l) * m.sin(b), -m.sin(l) * m.sin(b), m.cos(b)]])
    return sc.dot(R.T, sc.dot(cov_vrvlvb, R))
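# A minimal usage sketch for the propagation above (assumptions: `sc` is an
# older SciPy that re-exports NumPy, `m` is the math module, and `_K` is the
# usual km/s per (kpc * mas/yr) conversion constant, roughly 4.74047):
import math as m
import numpy as sc
_K = 4.74047  # assumed conversion constant
cov_vxyz = cov_dvrpmllbb_to_vxyz_single(d=1.0, e_d=0.1, e_vr=2.0,
                                        pmll=5.0, pmbb=-3.0,
                                        cov_pmllbb=sc.diag([1.0, 1.0]),
                                        l=0.5, b=0.1)
print(cov_vxyz.shape)  # (3, 3) covariance of (vx, vy, vz)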
def get_stderr_fit(f, Xdata, popt, pcov):
    Y = f(Xdata, popt)
    listdY = []
    for i in range(len(popt)):
        p = popt[i]
        dp = abs(p) / 1e6 + 1e-20
        popt[i] += dp
        Yi = f(Xdata, popt)
        dY = (Yi - Y) / dp
        listdY.append(dY)
        popt[i] -= dp
    listdY = scipy.array(listdY)
    # listdY is the dY from the derivation; it is N x M
    # pcov is N x N
    left = scipy.dot(listdY.T, pcov)
    right = scipy.dot(left, listdY)
    sigma2y = right.diagonal()
    # sigma2y is the standard error of the fit as a function of X
    mean_sigma2y = scipy.mean(right.diagonal())
    M = Xdata.shape[0]
    N = len(popt)
    avg_stddev_data = scipy.sqrt(M * mean_sigma2y / N)
    sigmay = scipy.sqrt(sigma2y)
    return sigmay, avg_stddev_data
def dot_fromfeatures(features1, features2=None):
    if features2 is None:
        features2 = features1
    npoints1 = features1.shape[0]
    npoints2 = features2.shape[0]
    features1.shape = npoints1, -1
    features2.shape = npoints2, -1
    ndims = features1.shape[1]
    assert features2.shape[1] == ndims
    if ndims < DOT_MAX_NDIMS:
        out = sp.dot(features1, features2.T)
    else:
        out = sp.dot(features1[:, :DOT_MAX_NDIMS],
                     features2[:, :DOT_MAX_NDIMS].T)
        ndims_done = DOT_MAX_NDIMS
        while ndims_done < ndims:
            out += sp.dot(features1[:, ndims_done:ndims_done + DOT_MAX_NDIMS],
                          features2[:, ndims_done:ndims_done + DOT_MAX_NDIMS].T)
            ndims_done += DOT_MAX_NDIMS
    return out
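# A quick self-check of the blockwise Gram computation (assumptions: `sp` is
# an older SciPy that re-exports NumPy, and DOT_MAX_NDIMS is a module-level
# constant; the value 128 here is hypothetical). Accumulating column blocks
# must reproduce the one-shot dot product:
import numpy as np
DOT_MAX_NDIMS = 128  # hypothetical block width
X = np.random.rand(10, 300)
print(np.allclose(dot_fromfeatures(X), np.dot(X, X.T)))  # True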
def fgmres(self, rhs, tol=1e-6, restrt=None, maxiter=None, callback=None):
    if maxiter is None:
        maxiter = len(rhs)
    if restrt is None:
        restrt = 2 * maxiter
    # implemented as in [Saad, 1993]
    # start
    x = zeros(len(rhs))
    H = zeros((restrt + 1, restrt))
    V = zeros((len(rhs), restrt))
    Z = zeros((len(rhs), restrt))
    # Arnoldi process (with modified Gram-Schmidt)
    res = 1.
    j = 0
    r = rhs - self.point.matvec(x)
    beta = norm(r)
    V[:, 0] = r / beta
    while j < maxiter and res > tol:
        Z[:, j] = self.point.psolve(V[:, j])
        w = self.point.matvec(Z[:, j])
        for i in range(j + 1):
            H[i, j] = dot(w, V[:, i])
            w = w - H[i, j] * V[:, i]
        H[j + 1, j] = norm(w)
        V[:, j + 1] = w / H[j + 1, j]
        e = zeros(j + 2)
        e[0] = 1.
        y, res, rank, sing_val = lstsq(H[:j + 2, :j + 1], beta * e)
        j += 1
        print("# GMRES| iteration :", j, "res: ", res / beta)
        self.resid = r_[self.resid, res / beta]
    Zy = dot(Z[:, :j], y)
    x = x + Zy
    info = 1
    return (x, info)
def snr_maha(waveforms, invC, mu=None):
    """SNR from Mahalanobis distance (generalised Euclidean distance)

    Definition of signal to noise ratio (SNR) as derived from the
    Mahalanobis distance. For C=eye this is equivalent to snr_power.

    :type waveforms: ndarray
    :param waveforms: waveform data (signal), one per row
    :type invC: ndarray
    :param invC: inverse noise covariance matrix (a block Toeplitz matrix)
    :type mu: ndarray
    :param mu: mean correction. Usually we assume zero-mean waveforms,
        so if this is None it will be ignored.
        Default=None
    :returns: ndarray - SNR per waveform
    """
    # inits and checks
    n, dim = waveforms.shape
    if invC.shape[0] != dim or invC.shape[1] != dim:
        raise ValueError("dimension mismatch for waveforms and covariance")
    rval = sp.zeros(n)
    # correct for mu
    if mu is not None:
        if mu.shape != (dim,):
            raise ValueError("dimension mismatch for waveforms and mu")
        waveforms -= mu
    # compute
    for i in range(n):
        rval[i] = sp.dot(sp.dot(waveforms[i], invC), waveforms[i].T)
        rval[i] /= float(dim)
    return sp.sqrt(rval)
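# Sanity check, assuming `sp` is numpy-compatible: with invC the identity,
# the Mahalanobis SNR reduces to the RMS amplitude of each waveform (this is
# the snr_power equivalence mentioned in the docstring):
import numpy as np
w = np.random.randn(5, 32)
print(np.allclose(snr_maha(w.copy(), np.eye(32)),
                  np.sqrt((w ** 2).sum(axis=1) / 32)))  # True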
def calcInvFisher(sigma, invSigma=None, factorSigma=None):
    """ Efficiently compute the exact inverse of the FIM of a Gaussian.
    Returns a list of the diagonal blocks. """
    if invSigma is None:
        invSigma = inv(sigma)
    if factorSigma is None:
        factorSigma = cholesky(sigma)
    dim = sigma.shape[0]
    invF = [mat(1 / (invSigma[-1, -1] + factorSigma[-1, -1] ** -2))]
    invD = 1 / invSigma[-1, -1]
    for k in reversed(list(range(dim - 1))):
        v = invSigma[k + 1:, k]
        w = invSigma[k, k]
        wr = w + factorSigma[k, k] ** -2
        u = dot(invD, v)
        s = dot(v, u)
        q = 1 / (w - s)
        qr = 1 / (wr - s)
        t = -(1 + q * s) / w
        tr = -(1 + qr * s) / wr
        invF.append(blockCombine([[qr, tr * u],
                                  [mat(tr * u).T, invD + qr * outer(u, u)]]))
        invD = blockCombine([[q, t * u],
                             [mat(t * u).T, invD + q * outer(u, u)]])
    invF.append(sigma)
    invF.reverse()
    return invF
def K_grad_i_dot(self, M, i):
    if i < self.Cr.getNumberParams():
        R = sp.dot(self.W(), sp.dot(self.W_grad_i(i).T, M))
        R += sp.dot(self.W_grad_i(i), sp.dot(self.W().T, M))
    else:
        R = self.d_grad_i(i - self.Cr.getNumberParams())[:, sp.newaxis] * M
    return R
def rlsloo_ll1(V, D, Y, lambd):
    """
    Computes cs and the actual LOO errors for a single value of lambda (lambd).
    """
    n = V.shape[0]
    cl = Y.shape[1]
    inner = 1 / (D + lambd)
    inner = inner.conj()
    VtY = sp.dot(V.T, Y)
    VtY = VtY.conj()
    # Because the signs of D are flipped (scipy.linalg.eig returns
    # flipped signs for the complex part of the eigenvalues)
    in_dot = sp.ones((n, 1)) * inner
    ViD = V * in_dot
    cs = sp.dot(ViD, VtY)
    dGi = sp.sum(ViD * V, axis=1)
    looerrs = cs.ravel() / sp.real(dGi.ravel())
    looerrs = sp.real(looerrs)
    cs = sp.real(cs.transpose())
    return cs.ravel(), looerrs
def Xgen(X0, Z, PP, QQ, Xbar):
    """
    This function generates a history of X given a history of technology
    shocks (Z), a P matrix, a Q matrix, and an initial X (X0).
    Note X_t(tilde) = P*X_{t-1}(tilde) + Q*Z_t(tilde)
    and X_t = Xbar*e^{X_t(tilde)}.
    """
    num_endog = sp.shape(PP)[1]
    T = len(Z)  # sp.shape(Z)[0]
    X = sp.zeros((num_endog, T))
    X[:, 0] = X0
    for i in range(1, T):
        Zt = Z[i]
        Xt_1 = sp.zeros((num_endog, 1))
        for j in range(num_endog):
            Xt_1[j, 0] = X[j, i - 1]
        Xt = sp.dot(PP, Xt_1) + sp.dot(QQ, Zt)
        for k in range(num_endog):
            X[k, i] = Xt[k, 0]
    exponents = sp.exp(X)
    for p in range(T):
        for q in range(num_endog):
            X[q, p] = Xbar[0, q] * exponents[q, p]
    return X
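# A minimal usage sketch with hand-made matrices (assumptions: `sp` is an
# older SciPy re-exporting NumPy, and shocks are passed as (1, 1) column
# vectors so that sp.dot(QQ, Zt) keeps the (num_endog, 1) shape the inner
# loop expects):
import numpy as np
PP = np.array([[0.9, 0.0], [0.1, 0.8]])   # transition matrix P
QQ = np.array([[1.0], [0.5]])             # shock loading Q
Xbar = np.array([[1.0, 2.0]])             # steady state, one row
Z = [0.01 * np.random.randn(1, 1) for _ in range(10)]
X = Xgen(np.zeros(2), Z, PP, QQ, Xbar)
print(X.shape)  # (2, 10): one row per endogenous variable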
def Areml_K_grad_i(self, i):
    i = self.covar._actindex2index(i)
    R = sp.dot(self.WcCtildeLcA_o_WrRF(i).T, self.dWLW())
    R += R.T
    R += -self.ALcCtildeLcA_o_FRF(i)
    R += -sp.dot(self.dWLW().T, self.Cbar_o_Sr_dWLW(i))
    return R
def _LMLgrad_lik(self, hyperparams):
    """derivative of the likelihood parameters"""
    logtheta = hyperparams['covar']
    try:
        KV = self.get_covariances(hyperparams)
    except linalg.LinAlgError:
        LG.error("exception caught (%s)" % (str(hyperparams)))
        return 1E6
    # loop through all dimensions
    # logdet term:
    Kd = 2 * KV['Knoise']
    dldet = 0.5 * (Kd * KV['Si']).sum(axis=0)
    # quadratic term
    y_roti = KV['y_roti']
    dlquad = -0.5 * (y_roti * Kd * y_roti).sum(axis=0)
    if VERBOSE:
        dldet_ = SP.zeros([self.d])
        dlquad_ = SP.zeros([self.d])
        for d in range(self.d):
            _K = KV['K'] + SP.diag(KV['Knoise'][:, d])
            _Ki = SP.linalg.inv(_K)
            dldet_[d] = 0.5 * SP.dot(_Ki, SP.diag(Kd[:, d])).trace()
            dlquad_[d] = -0.5 * SP.dot(self.y[:, d],
                                       SP.dot(_Ki, SP.dot(SP.diag(Kd[:, d]),
                                                          SP.dot(_Ki, self.y[:, d]))))
        assert (SP.absolute(dldet - dldet_) < 1E-3).all(), 'outch'
        assert (SP.absolute(dlquad - dlquad_) < 1E-3).all(), 'outch'
    LMLgrad = dldet + dlquad
    RV = {'lik': LMLgrad}
    return RV
def _LML_covar(self, hyperparams):
    """
    log marginal likelihood contributions from covariance hyperparameters
    """
    try:
        KV = self.get_covariances(hyperparams)
    except linalg.LinAlgError:
        LG.error("exception caught (%s)" % (str(hyperparams)))
        return 1E6
    # all in one go
    # negative log marginal likelihood, see derivations
    lquad = 0.5 * (KV['y_rot'] * KV['Si'] * KV['y_rot']).sum()
    ldet = -0.5 * SP.log(KV['Si'][:, :]).sum()
    LML = 0.5 * self.n * self.d * SP.log(2 * SP.pi) + lquad + ldet
    if VERBOSE:
        # 1. slow and explicit way
        lmls_ = SP.zeros([self.d])
        for i in range(self.d):
            _y = self.y[:, i]
            sigma2 = SP.exp(2 * hyperparams['lik'])
            _K = KV['K'] + SP.diag(KV['Knoise'][:, i])
            _Ki = SP.linalg.inv(_K)
            lquad_ = 0.5 * SP.dot(_y, SP.dot(_Ki, _y))
            ldet_ = 0.5 * SP.log(SP.linalg.det(_K))
            lmls_[i] = 0.5 * self.n * SP.log(2 * SP.pi) + lquad_ + ldet_
        assert SP.absolute(lmls_.sum() - LML) < 1E-3, 'outch'
    return LML
def GP_sample_posterior(covar, X, logtheta, x, y, ns=1):
    """
    Sample from the posterior distribution of a GP

    x : [double]
        training inputs
    y : [double]
        training targets

    other : See :py:func:`gp_sample.GP_sample_prior`
    """
    KXx = covar.K(logtheta, x, X)
    KXX = covar.K(logtheta, X)
    Kxx = covar.K(logtheta, x)
    iKxx = SP.linalg.inv(Kxx + eye(Kxx.shape[0]) * 0.01)
    mu = SP.dot(KXx.T, SP.dot(iKxx, y)).reshape([-1, 1])
    cov = KXX - SP.dot(KXx.T, SP.dot(iKxx, KXx))
    L = SP.linalg.cholesky(cov).T
    Y = mu + SP.dot(L, random.randn(X.shape[0], ns))
    return Y
def _backwardImplementation(self, outerr, inerr, outbuf, inbuf):
    if self.onesigma:
        # algorithm for one global sigma for all mu's
        expln_params = expln(self.params)
        sumxsquared = dot(self.state, self.state)
        self._derivs += (
            sum((outbuf - inbuf) ** 2 - expln_params ** 2 * sumxsquared)
            / expln_params * explnPrime(self.params))

        inerr[:] = outbuf - inbuf

        if not self.autoalpha and sumxsquared != 0:
            inerr /= expln_params ** 2 * sumxsquared
            self._derivs /= expln_params ** 2 * sumxsquared
    else:
        # algorithm for separate sigma for each mu
        expln_params = expln(self.params).reshape(len(outbuf), len(self.state))
        explnPrime_params = explnPrime(self.params).reshape(len(outbuf), len(self.state))

        idx = 0
        for j in range(len(outbuf)):
            sigma_subst2 = dot(self.state ** 2, expln_params[j, :] ** 2)
            for i in range(len(self.state)):
                self._derivs[idx] = (
                    ((outbuf[j] - inbuf[j]) ** 2 - sigma_subst2)
                    / sigma_subst2 * self.state[i] ** 2
                    * expln_params[j, i] * explnPrime_params[j, i])
                if self.autoalpha and sigma_subst2 != 0:
                    self._derivs[idx] /= sigma_subst2
                idx += 1
            inerr[j] = outbuf[j] - inbuf[j]
            if not self.autoalpha and sigma_subst2 != 0:
                inerr[j] /= sigma_subst2
def update_step(self, input_signal=None, teaching_signal=None):
    """update the network with the given input and teaching output;
    input_signal and teaching_signal must be column vectors.
    Notice that input_signal is u(n+1) and output is output(n+1);
    this step makes state(n) -> state(n+1).
    state_history is a list of past states; every item is a row vector
    like (100,)"""
    if input_signal is not None:
        assert input_signal.shape == (self.input_unit_amount, 1)
    if teaching_signal is not None:
        assert teaching_signal.shape == (self.output_unit_amount, 1)
    if self.feedback_matrix is not None and self.input_matrix is not None:
        self.state = self.unit_type_ufunc(
            sp.dot(self.input_matrix, input_signal)
            + sp.dot(self.internal_matrix, self.state)
            + sp.dot(self.feedback_matrix, self.output))
        if teaching_signal is None:
            self.output = sp.dot(self.output_matrix,
                                 sp.append(input_signal.T, self.state.T).T)
        else:
            self.output = teaching_signal
    elif self.feedback_matrix is not None:
        self.state = self.unit_type_ufunc(
            sp.dot(self.internal_matrix, self.state)
            + sp.dot(self.feedback_matrix, self.output))
        if teaching_signal is None:
            self.output = sp.dot(self.output_matrix, self.state)
        else:
            self.output = teaching_signal
    else:
        self.state = self.unit_type_ufunc(
            sp.dot(self.input_matrix, input_signal)
            + sp.dot(self.internal_matrix, self.state))
    if input_signal is not None:
        self.state_history.append(sp.append(input_signal.T, self.state.T))
    else:
        self.state_history.append(self.state.reshape(-1))
    self.output_history.append(self.output)
def ar_fit(p_data, p_or_plist=None, selector='sbc'):
    """fits a (multivariate) AR (_A_uto_R_egressive) model to data

    :Parameters:
        p_data : ndarray
            Data with observations on the rows and variables on the columns
        p_or_plist : list
            List of model orders to select from. This list has to be
            continuous with a step size of 1, e.g. [10,11,12,13,14].
            Defaults to range(100).
        selector : str
            One of 'sbc' for the Schwarz Bayesian Criterion or 'fpe' for
            the log of Akaike's Final Prediction Error. This determines
            what metric is used to evaluate the best model order.
    """
    # checks and inits
    if p_or_plist is None:
        p_or_plist = list(range(100))
    if not isinstance(p_data, N.ndarray):
        raise ValueError('p_data is not an ndarray')
    data = p_data.copy()
    n, m = data.shape
    if selector not in ['sbc', 'fpe']:
        raise ValueError('selector has to be one of: "sbc" or "fpe"!')
    if not isinstance(p_or_plist, list):
        p_or_plist = [p_or_plist]
    p_max = max(p_or_plist)
    ne = n - p_max
    npmax = m * p_max
    if ne <= npmax:
        raise ValueError('time series too short!')
    R = _ar_model_qr(data, p_max)

    # model order selection
    if len(p_or_plist) > 1:
        sbc, fpe, ldp, np = _ar_model_select(R, m, ne, p_or_plist)
        if selector == 'sbc':
            crit = sbc
        elif selector == 'fpe':
            crit = fpe
    else:
        crit = N.zeros(1)
    p_opt = crit.argmin()
    np = m * p_opt

    # get lower right triangle of R
    #
    #     | R11  R12 |
    # R = |          |
    #     |  0   R22 |
    #
    R11 = R[:np, :np]
    R12 = R[:np, npmax:]
    R22 = R[np:, npmax:]

    # build the model
    A = N.dot(NL.inv(R11), R12).T
    C = N.dot(R22.T, R22) / (ne - np)

    # return
    del R, R11, R12, R22
    return A, C, crit
def learn(self, X, t, tol=0.01, amax=1e10):
    """Fit the model (learning)."""
    N = X.shape[0]
    a = sp.ones(N + 1)  # hyperparameter
    b = 1.0
    phi = sp.ones((N, N + 1))  # design matrix
    phi[:, 1:] = [[self._kernel(xi, xj) for xj in X] for xi in X]
    diff = 1
    while diff >= tol:
        sigma = spla.inv(sp.diag(a) + b * sp.dot(phi.T, phi))
        m = b * sp.dot(sigma, sp.dot(phi.T, t))
        gamma = sp.ones(N + 1) - a * sigma.diagonal()
        anew = gamma / (m * m)
        bnew = (N - gamma.sum()) / sp.square(spla.norm(t - sp.dot(phi, m)))
        anew[anew >= amax] = amax
        adiff, bdiff = anew - a, bnew - b
        diff = (adiff * adiff).sum() + bdiff * bdiff
        a, b = anew, bnew
        print(".", end="")
    self._a = a
    self._b = b
    self._X = X
    self._m = m
    self._sigma = sigma
    self._amax = amax
def solve_pressure_eigenproblem(self, mtx, eig_problem=None,
                                n_eigs=0, check=False):
    """G = B*AI*BT or B*AI*BT+D"""

    def get_slice(n_eigs, nn):
        if n_eigs > 0:
            ii = slice(0, n_eigs)
        elif n_eigs < 0:
            ii = slice(nn + n_eigs, nn)
        else:
            ii = slice(0, 0)
        return ii

    eig_problem = get_default(eig_problem, self.eig_problem)
    n_eigs = get_default(n_eigs, self.n_eigs)
    check = get_default(check, self.check)

    mtx_c, mtx_b, action_aibt = mtx['C'], mtx['B'], mtx['action_aibt']
    mtx_g = mtx_b * action_aibt.to_array()  # mtx_b must be sparse!
    if eig_problem == 'B*AI*BT+D':
        mtx_g += mtx['D'].toarray()

    mtx['G'] = mtx_g
    output(mtx_c.shape, mtx_g.shape)

    eigs, mtx_q = eig(mtx_c.toarray(), mtx_g, method='eig.sgscipy')

    if check:
        ee = nm.diag(sc.dot(mtx_q.T * mtx_c, mtx_q)).squeeze()
        oo = nm.diag(sc.dot(sc.dot(mtx_q.T, mtx_g), mtx_q)).squeeze()
        try:
            assert_(nm.allclose(ee, eigs))
            assert_(nm.allclose(oo, nm.ones_like(eigs)))
        except ValueError:
            debug()

    nn = mtx_c.shape[0]
    if isinstance(n_eigs, tuple):
        output('required number of eigenvalues: (%d, %d)' % n_eigs)
        if sum(n_eigs) < nn:
            ii0 = get_slice(n_eigs[0], nn)
            ii1 = get_slice(-n_eigs[1], nn)
            eigs = nm.concatenate((eigs[ii0], eigs[ii1]))
            mtx_q = nm.concatenate((mtx_q[:, ii0], mtx_q[:, ii1]), 1)
    else:
        output('required number of eigenvalues: %d' % n_eigs)
        if (n_eigs != 0) and (abs(n_eigs) < nn):
            ii = get_slice(n_eigs, nn)
            eigs = eigs[ii]
            mtx_q = mtx_q[:, ii]

    ## from sfepy.base.plotutils import pylab, iplot
    ## pylab.semilogy(eigs)
    ## pylab.figure(2)
    ## iplot(eigs)
    ## pylab.show()
    ## debug()

    out = Struct(eigs=eigs, mtx_q=mtx_q)
    return out
def multivariateNormalPdf(z, x, sigma):
    """ The pdf of a multivariate normal distribution (not in scipy).
    The sample z and the mean x should be 1-dim-arrays, and sigma a
    square 2-dim-array. """
    assert len(z.shape) == 1 and len(x.shape) == 1 and len(x) == len(z) \
        and sigma.shape == (len(x), len(z))
    tmp = -0.5 * dot(dot((z - x), inv(sigma)), (z - x))
    res = (1. / power(2.0 * pi, len(z) / 2.)) * (1. / sqrt(det(sigma))) * exp(tmp)
    return res
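# Cross-check against scipy.stats (assumption: the bare names dot, inv, det,
# power, sqrt, pi, exp above come from a star-import of numpy):
import numpy as np
from scipy import stats
z = np.array([0.3, -0.2])
x = np.zeros(2)
sigma = np.array([[1.0, 0.3], [0.3, 2.0]])
ref = stats.multivariate_normal(mean=x, cov=sigma).pdf(z)
print(np.isclose(multivariateNormalPdf(z, x, sigma), ref))  # True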
def get(self):
    B = dok_matrix((self.rows, self.d), dtype=float32)
    for ((row, col, val), p) in self.sampler.get(with_probabilities=True):
        B[row, col] += val / (p * self.nnz)
    # sparse matrix product; numpy's dot() does not multiply sparse matrices
    covariance = B.transpose().dot(B)
    (_, s, Vt) = svds(covariance, k=self.ell, maxiter=50,
                      return_singular_vectors=True)
    return dot(diag(sqrt(s[:self.ell])), Vt[:self.ell, :])
def dlsim(self, u, x0=None, Tl=0, Ts=0.001):
    """
    @summary: Simulate the motor for one input
    @param u: The control signal
    @param Ts: Sampling time (0.001 by default)
    @param x0: The initial conditions on the state vector (zero by default).
    @return: The system response
    """
    if x0 is not None:
        self.x0 = x0
    if self.x0 is None:
        self.x0 = zeros((5, 1))
    self.x0[4, 0] = Tl
    (self.Ad, self.Bd, self.Cd, self.Dd) = self.dss(self.x0, Ts)
    self.x0 = dot(self.Ad, self.x0) + dot(self.Bd, u)
    y_out = dot(self.Cd, self.x0)  # + dot(self.Dd, u)
    return (y_out, self.x0)
def fastsvd(M):
    """ Fast Singular Value Decomposition

    Inputs:
      M -- 2d numpy array

    Outputs:
      U,S,V -- see scipy.linalg.svd
    """
    h, w = M.shape
    # -- thin matrix
    if h >= w:
        # subspace of M'M
        U, S, V = N.linalg.svd(N.dot(M.T, M))
        U = N.dot(M, V.T)
        # normalize
        for i in range(w):
            S[i] = fastnorm(U[:, i])
            U[:, i] = U[:, i] / S[i]
    # -- fat matrix
    else:
        # subspace of MM'
        U, S, V = N.linalg.svd(N.dot(M, M.T))
        V = N.dot(U.T, M)
        # normalize
        for i in range(h):
            S[i] = fastnorm(V[i])
            V[i, :] = V[i] / S[i]
    return U, S, V
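# A quick reconstruction check (assumptions: the module-level name N is numpy
# and fastnorm is the Euclidean norm, as the code above implies):
import numpy as N
fastnorm = N.linalg.norm
M = N.random.rand(50, 8)               # exercises the thin-matrix branch
U, S, V = fastsvd(M)
print(N.allclose(N.dot(U * S, V), M))  # reconstruction should hold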
def get_stderr_fit(f, Xdata, popt, pcov):
    Y = f(Xdata, popt)
    listdY = []
    for i in range(len(popt)):
        p = popt[i]
        dp = abs(p) / 1e6 + 1e-20
        popt[i] += dp
        Yi = f(Xdata, popt)
        dY = (Yi - Y) / dp
        listdY.append(dY)
        popt[i] -= dp
    listdY = scipy.array(listdY)
    # listdY is an array with N rows and M columns,
    # N = len(popt), M = len(xdata[0])
    # pcov is an array with N rows and N columns
    left = scipy.dot(listdY.T, pcov)
    # left is an array of M rows and N columns
    right = scipy.dot(left, listdY)
    # right is an array of M rows and M columns
    sigma2y = right.diagonal()
    # sigma2y is the standard error of the fit as a function of X
    mean_sigma2y = scipy.mean(right.diagonal())
    M = Xdata.shape[1]
    print(M)
    N = len(popt)
    print(N)
    avg_stddev_data = scipy.sqrt(M * mean_sigma2y / N)
    # this is because if the experimental error is constant at sig_dat,
    # then mean_sigma2y = N/M * sig_dat**2
    sigmay = scipy.sqrt(sigma2y)
    return sigmay, avg_stddev_data
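# A usage sketch with a toy linear model (assumptions: `scipy` here is an
# older release that re-exports NumPy array functions, f takes the full
# parameter vector as its second argument as in the calls above, and Xdata is
# 2-D with data points along axis 1, since M is read from Xdata.shape[1]):
import numpy as np
from scipy.optimize import curve_fit

def f(Xdata, popt):
    a, b = popt
    return a * Xdata[0] + b  # one value per data point

Xdata = np.linspace(0., 1., 20).reshape(1, -1)
ydata = f(Xdata, [2.0, 1.0]) + 0.05 * np.random.randn(20)
popt, pcov = curve_fit(lambda x, a, b: a * x + b, Xdata[0], ydata)
sigmay, avg_stddev_data = get_stderr_fit(f, Xdata, list(popt), pcov)
print(sigmay.shape)  # one fit standard error per data point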
def _learnStep(self):
    """ Main part of the algorithm. """
    I = eye(self.numParameters)
    self._produceSamples()
    utilities = self.shapingFunction(self._currentEvaluations)
    utilities /= sum(utilities)  # make the utilities sum to 1
    if self.uniformBaseline:
        utilities -= 1. / self.batchSize
    samples = array([self._base2sample(s) for s in self._population])

    dCenter = dot(samples.T, utilities)
    covGradient = dot(array([outer(s, s) - I for s in samples]).T, utilities)
    covTrace = trace(covGradient)
    covGradient -= covTrace / self.numParameters * I
    dA = 0.5 * (self.scaleLearningRate * covTrace / self.numParameters * I
                + self.covLearningRate * covGradient)

    self._lastLogDetA = self._logDetA
    self._lastInvA = self._invA

    self._center += self.centerLearningRate * dot(self._A, dCenter)
    self._A = dot(self._A, expm2(dA))
    self._invA = dot(expm2(-dA), self._invA)
    self._logDetA += 0.5 * self.scaleLearningRate * covTrace
    if self.storeAllDistributions:
        self._allDistributions.append((self._center.copy(), self._A.copy()))
def dw(self):
    """Calculates the Durbin-Watson statistic"""
    de = diff(self.e, 1)
    dw = dot(de, de) / dot(self.e, self.e)
    return dw
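# The same statistic on a bare residual vector (a standalone sketch of the
# method above): values near 2 indicate little first-order autocorrelation.
import numpy as np
e = np.random.randn(200)              # white-noise residuals
de = np.diff(e, 1)
print(np.dot(de, de) / np.dot(e, e))  # approximately 2 for white noise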
def basex_core_transform(rawdata, M_vert, M_horz, Mc_vert, Mc_horz,
                         vert_left, horz_right, dr=1.0):
    """
    This is the internal function that does the actual BASEX transform.
    It requires that the matrices of basis set coefficients be passed.

    Parameters
    ----------
    rawdata : NxM numpy array
        the raw image.
    M_vert_etc. : Numpy arrays
        2D arrays given by the basis set calculation function
    dr : float
        pixel size. This only affects the absolute scaling of the output.

    Returns
    -------
    IM : NxM numpy array
        The Abel-transformed image, a slice of the 3D distribution
    """
    # Reconstructing image - This is where the magic happens
    Ci = scipy.dot(scipy.dot(vert_left, rawdata), horz_right)
    # previously: vert_left.dot(rawdata).dot(horz_right)

    # use a heuristic scaling factor to match the analytical Abel transform
    # For more info see https://github.com/PyAbel/PyAbel/issues/4
    MAGIC_NUMBER = 1.1122244156826457
    Ci *= MAGIC_NUMBER / dr

    IM = scipy.dot(scipy.dot(Mc_vert, Ci), Mc_horz.T)
    # Previously: Mc_vert.dot(Ci).dot(Mc_horz.T)

    # P = dot(dot(Mc,Ci),M.T)  # This calculates the projection,
    # which should recreate the original image
    return IM
def lsaTransform(self, dimensions=1):
    """ Calculate SVD of objects matrix: U . SIGMA . VT = MATRIX
    Reduce the dimension of sigma by the specified factor producing sigma'.
    Then dot product the matrices: U . SIGMA' . VT = MATRIX'
    """
    rows, cols = self.matrix.shape

    if dimensions <= rows:  # it's a valid reduction
        # sigma comes out as a list rather than a matrix
        u, sigma, vt = linalg.svd(self.matrix)

        # dimension reduction, build SIGMA'
        for index in range(rows - dimensions, rows):
            sigma[index] = 0
        # print(linalg.diagsvd(sigma, len(self.matrix), len(vt)))

        # reconstruct MATRIX'
        reconstructedMatrix = dot(
            dot(u, linalg.diagsvd(sigma, len(self.matrix), len(vt))), vt)

        # save transform
        self.matrix = reconstructedMatrix
    else:
        print("dimension reduction cannot be greater than %s" % rows)
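# The same mechanics as a standalone numpy sketch: zero out trailing singular
# values and reconstruct, which yields a low-rank approximation of the matrix.
import numpy as np
A = np.random.rand(6, 4)
u, s, vt = np.linalg.svd(A, full_matrices=False)
s[2:] = 0                          # keep only the two largest singular values
A_2 = np.dot(u * s, vt)            # rank-2 approximation of A
print(np.linalg.matrix_rank(A_2))  # 2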
def fit(self, X, y, useQR=True, addConstant=True):
    '''Solve y = Xb.

    Parameters
    ----------
    X : array, shape (M, N)
    y : array, shape (M,)
    useQR : boolean
        Whether or not to use QR decomposition to fit regression line.
    addConstant : boolean
        Whether or not to add a constant column to X
    '''
    if y.shape[0] != X.shape[0]:
        raise ValueError('incompatible dimensions')
    if addConstant:
        self.X = c_[ones(X.shape[0]), X]
    else:
        self.X = X
    self.y = y
    self.X_columns = getattr(X, 'columns', None)
    self.y_columns = getattr(y, 'columns', None)
    if useQR:
        # economic QR keeps R square (N x N) so the triangular solve works
        Q, R = scipy.linalg.qr(self.X, mode='economic')
        Qty = dot(Q.T, y)
        self.b = scipy.linalg.solve(R, Qty)
    else:
        self.inv_xx = inv(dot(self.X.T, self.X))
        xy = dot(self.X.T, self.y)
        self.b = dot(self.inv_xx, xy)
    self.computeStatistics()
def estimate(self):
    # estimating coefficients, and basic stats
    self.inv_xx = inv(dot(self.x.T, self.x))
    xy = dot(self.x.T, self.y)
    self.betas = dot(self.inv_xx, xy)          # estimate coefficients

    self.nobs = self.y.shape[0]                # number of observations
    self.ncoef = self.x.shape[1]               # number of coef.
    self.df_e = self.nobs - self.ncoef         # degrees of freedom, error
    self.df_r = self.ncoef - 1                 # degrees of freedom, regression

    self.e = self.y - dot(self.x, self.betas)  # residuals
    self.sse = dot(self.e, self.e) / self.df_e               # SSE
    self.se = sqrt(diagonal(self.sse * self.inv_xx))         # coef. standard errors
    self.t = self.betas / self.se                            # coef. t-statistics
    self.p = (1 - stats.t.cdf(abs(self.t), self.df_e)) * 2   # coef. p-values

    self.R2 = 1 - self.e.var() / self.y.var()                # model R-squared
    self.R2adj = 1 - (1 - self.R2) * ((self.nobs - 1) / (self.nobs - self.ncoef))  # adjusted R-squared

    self.F = (self.R2 / self.df_r) / ((1 - self.R2) / self.df_e)  # model F-statistic
    self.Fpv = 1 - stats.f.cdf(self.F, self.df_r, self.df_e)      # F-statistic p-value
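# A sanity check of the normal-equation betas against numpy's least squares
# (a sketch; x and y stand in for self.x and self.y above):
import numpy as np
x = np.column_stack([np.ones(30), np.random.rand(30, 2)])
y = x.dot(np.array([1.0, 2.0, -0.5])) + 0.01 * np.random.randn(30)
betas = np.linalg.inv(x.T.dot(x)).dot(x.T.dot(y))
print(np.allclose(betas, np.linalg.lstsq(x, y, rcond=None)[0]))  # True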
def simple_interaction_kronecker(snps, phenos, covs=None, Acovs=None,
                                 Asnps1=None, Asnps0=None, K1r=None, K1c=None,
                                 K2r=None, K2c=None, covar_type='lowrank_diag',
                                 rank=1, NumIntervalsDelta0=100,
                                 NumIntervalsDeltaAlt=0, searchDelta=False):
    """
    I-variate fixed effects interaction test for phenotype specific SNP effects

    Args:
        snps:   [N x S] SP.array of S SNPs for N individuals (test SNPs)
        phenos: [N x P] SP.array of P phenotypes for N individuals
        covs:   list of SP.arrays holding covariates.
                Each covs[i] has one corresponding Acovs[i]
        Acovs:  list of SP.arrays holding the phenotype design matrices for
                covariates. Each covs[i] has one corresponding Acovs[i].
        Asnps1: list of SP.arrays of I interaction variables to be tested for
                N individuals. Note that it is assumed that Asnps0 is already
                included. If not provided, the alternative model will be the
                independent model
        Asnps0: single SP.array of I0 interaction variables to be included in
                the background model when testing for interaction with Inters
        K1r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K1c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        covar_type: type of covariance to use. Default 'lowrank_diag'.
                Possible values are
                'freeform':     free form optimization,
                'fixed':        use a fixed matrix specified in covar_K0,
                'diag':         optimize a diagonal matrix,
                'lowrank':      optimize a low rank matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_id':   optimize a low rank matrix plus the weight of
                                a constant diagonal matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_diag': optimize a low rank matrix plus a free
                                diagonal matrix. The rank of the lowrank part
                                is specified in the variable rank,
                'block':        optimize the weight of a constant P x P block
                                matrix of ones,
                'block_id':     optimize the weight of a constant P x P block
                                matrix of ones plus the weight of a constant
                                diagonal matrix,
                'block_diag':   optimize the weight of a constant P x P block
                                matrix of ones plus a free diagonal matrix
        rank:   rank of a possible lowrank component (default 1)
        NumIntervalsDelta0:   number of steps for delta optimization on the
                null model (100)
        NumIntervalsDeltaAlt: number of steps for delta optimization on the
                alt. model (0 - no optimization)
        searchDelta: Carry out delta optimization on the alternative model?
                If yes, we use NumIntervalsDeltaAlt steps

    Returns:
        pv:     P-values of the interaction test
        pv0:    P-values of the null model
        pvAlt:  P-values of the alternative model
    """
    S = snps.shape[1]
    # 0. checks
    N = phenos.shape[0]
    P = phenos.shape[1]

    if K1r is None:
        K1r = SP.dot(snps, snps.T)
    else:
        assert K1r.shape[0] == N, 'K1r: dimensions mismatch'
        assert K1r.shape[1] == N, 'K1r: dimensions mismatch'

    if K2r is None:
        K2r = SP.eye(N)
    else:
        assert K2r.shape[0] == N, 'K2r: dimensions mismatch'
        assert K2r.shape[1] == N, 'K2r: dimensions mismatch'

    covs, Acovs = updateKronCovs(covs, Acovs, N, P)

    # Asnps can be several designs
    if Asnps0 is None:
        Asnps0 = [SP.ones([1, P])]
    if Asnps1 is None:
        Asnps1 = [SP.eye(P)]
    if type(Asnps0) != list:
        Asnps0 = [Asnps0]
    if type(Asnps1) != list:
        Asnps1 = [Asnps1]
    assert (len(Asnps0) == 1) and (len(Asnps1) > 0), \
        "need at least one SNP design matrix for null and alt model"

    # one row per column design matrix
    pv = SP.zeros((len(Asnps1), snps.shape[1]))
    lrt = SP.zeros((len(Asnps1), snps.shape[1]))
    pvAlt = SP.zeros((len(Asnps1), snps.shape[1]))
    lrtAlt = SP.zeros((len(Asnps1), snps.shape[1]))

    # 1. run GP model to infer suitable covariance structure
    if K1c is None or K2c is None:
        vc = estimateKronCovariances(phenos=phenos, K1r=K1r, K2r=K2r, K1c=K1c,
                                     K2c=K2c, covs=covs, Acovs=Acovs,
                                     covar_type=covar_type, rank=rank)
        K1c = vc.getEstTraitCovar(0)
        K2c = vc.getEstTraitCovar(1)
    else:
        assert K1c.shape[0] == P, 'K1c: dimensions mismatch'
        assert K1c.shape[1] == P, 'K1c: dimensions mismatch'
        assert K2c.shape[0] == P, 'K2c: dimensions mismatch'
        assert K2c.shape[1] == P, 'K2c: dimensions mismatch'

    # 2. run kroneckerLMM for null model
    lmm = limix.CKroneckerLMM()
    lmm.setK1r(K1r)
    lmm.setK1c(K1c)
    lmm.setK2r(K2r)
    lmm.setK2c(K2c)
    lmm.setSNPs(snps)
    # add covariates
    for ic in range(len(Acovs)):
        lmm.addCovariates(covs[ic], Acovs[ic])
    lmm.setPheno(phenos)

    # delta search on alt. model?
    if searchDelta:
        lmm.setNumIntervalsAlt(NumIntervalsDeltaAlt)
        lmm.setNumIntervals0_inter(NumIntervalsDeltaAlt)
    else:
        lmm.setNumIntervalsAlt(0)
        lmm.setNumIntervals0_inter(0)
    lmm.setNumIntervals0(NumIntervalsDelta0)

    # add SNP design
    lmm.setSNPcoldesign0_inter(Asnps0[0])
    for iA in range(len(Asnps1)):
        lmm.setSNPcoldesign(Asnps1[iA])
        lmm.process()
        pvAlt[iA, :] = lmm.getPv()[0]
        pv[iA, :] = lmm.getPv()[1]
        pv0 = lmm.getPv()[2]
    return pv, pv0, pvAlt
                   delimiter='\t'))).astype(float)
# remove snp label
X = X[:, :n_s]
n_f = X.shape[0]
for i in range(n_f):
    sd = (X[i]).std()
    if sd == 0:
        X[i] = X[i] - (X[i]).mean()
    else:
        X[i] = (X[i] - (X[i]).mean()) / sd
X = X.T
print(X)
print(X.shape)
K1 = 1.0 / n_f * SP.dot(X, X.T)
print(K1.shape)
K = K1
print(K)

parents = SP.array(list(csv.reader(open('parents.txt', 'r'),
                                   delimiter='\t'))).astype(int)
parents = parents[:4754, :]
idxm = list(range(1, 191))
SP.random.shuffle(idxm)
idxm = idxm[:5]
idxf = list(range(1, 26))
SP.random.shuffle(idxf)
idxf = idxf[:5]
train = []
def content_based():
    dir_name = os.path.dirname(__file__)
    f_path = os.path.join(dir_name, "anonymous-msweb.data")
    raw_data = pd.read_csv(f_path, header=None, skiprows=7)

    # creating user profile
    user_activity = raw_data.loc[raw_data[0] != "A"]
    user_activity.columns = ['category', 'value', 'vote', 'desc', 'url']
    # extract only the first two columns
    user_activity = user_activity[['category', 'value']]
    site_count = len(
        user_activity.loc[user_activity['category'] == "V"].value.unique())
    case_count = len(
        user_activity.loc[user_activity['category'] == "C"].value.unique())
    print(' case/rating count: {}, site_count: {}'.format(case_count, site_count))

    tmp = 0
    nextrow = False
    lastindex = user_activity.index[len(user_activity) - 1]
    for index, row in user_activity.iterrows():
        if index <= lastindex:
            if user_activity.loc[index, 'category'] == "C":
                # append two columns, userid and webid, to the
                # user_activity dataframe
                tmp = 0
                userid = user_activity.loc[index, 'value']
                tmp = userid
                nextrow = True
                # C records are always followed by V records
            elif user_activity.loc[index, 'category'] == "V" and nextrow == True:
                webid = user_activity.loc[index, 'value']
                user_activity.loc[index, 'webid'] = webid
                # retrieve userid from the previous C record,
                # temporarily stored in tmp
                user_activity.loc[index, 'userid'] = tmp
                if (index != lastindex
                        and user_activity.loc[index + 1, 'category'] == "C"):
                    # the last 'V' record for the previous C record
                    nextrow = False

    # only keep V records, which contain both webids and userids
    user_activity = user_activity[user_activity['category'] == "V"]
    # only keep columns userid and webid
    user_activity = user_activity[['userid', 'webid']]
    user_activity_sort = user_activity.sort_values('webid', ascending=True)

    user_activity['userid'].unique().shape[0]
    user_activity['webid'].unique().shape[0]
    plt.hist(user_activity['webid'])
    plt.show()

    sLength = len(user_activity_sort['webid'])
    # add a rating column, default value: 1
    user_activity_sort['rating'] = pd.Series(np.ones((sLength,)),
                                             index=user_activity.index)
    # create a pivot: index is userid, columns are the different webids,
    # value counts the occurrence of [userid, webid], 0 if none.
    rating_matrix = user_activity_sort.pivot(index='userid', columns='webid',
                                             values='rating').fillna(0)
    rating_matrix = rating_matrix.values

    # creating item profile
    items = raw_data.loc[raw_data[0] == "A"]
    items.columns = ['record', 'webid', 'vote', 'desc', 'url']
    items = items[['webid', 'desc']]
    items['webid'].unique().shape[0]
    items_rated = items[items['webid'].isin(user_activity['webid'].tolist())]
    items_rated_sorted = items_rated.sort_values('webid', ascending=True)

    # project items to vector space using tf-idf based on 'desc'
    v = TfidfVectorizer(stop_words="english", max_features=100,
                        ngram_range=(1, 3), sublinear_tf=True)
    # transform items desc to a doc-term matrix
    x = v.fit_transform(items_rated_sorted['desc'])
    v.get_feature_names()
    item_profile = x.todense()
    np.savetxt("tf_idf", x.todense())

    # dot product of rating matrix and item profile to get user_profile
    user_profile = dot(rating_matrix, item_profile) \
        / linalg.norm(rating_matrix) / linalg.norm(item_profile)

    # recommendations based on the similarity between user profile and
    # item profile
    similarityCalc = sklearn.metrics.pairwise.cosine_similarity(
        user_profile, item_profile, dense_output=True)
    final_pred = np.where(similarityCalc > 0.6, 1, 0)
    np.savetxt('pred', final_pred)
    final_pred.shape
def reduced_track_indices(coordinate_list, timesteps=None):
    # returns a list of indices of trackpoints that constitute the reduced track
    # takes a list of cartesian coordinate tuples
    m = len(coordinate_list)
    if m == 0:
        return []
    if timesteps is not None and len(timesteps) != len(coordinate_list):
        timesteps = None

    # number of dimensions
    d = len(coordinate_list[0])

    # remove identical entries (can speed up algorithm considerably)
    original_indices = [0]
    points = [{'p': coordinate_list[0], 'weight': 1}]
    if timesteps is not None:
        points[0]['t'] = timesteps[0]
    for i in range(1, m):
        if False in [coordinate_list[i - 1][j] == coordinate_list[i][j]
                     for j in range(d)]:
            original_indices.append(i)
            points.append({'p': coordinate_list[i], 'weight': 1})
            if timesteps is not None:
                points[-1]['t'] = timesteps[i]
        else:
            points[-1]['weight'] += 1
    n = len(points)

    # progress printing initialisations
    progress_printed = False
    progress = -1  # must be a number: compared against an integer below
    tprint = time.time()

    # execute Dijkstra-like algorithm on points
    points[0]['cost'] = 1.0
    points[0]['prev'] = -1

    for i2 in range(1, n):
        penalties = {}
        costmin = float('inf')
        for i1 in reversed(list(range(i2))):
            p1 = array(points[i1]['p'])
            p2 = array(points[i2]['p'])
            seglength = norm(p2 - p1)

            # estimate speed between p1 and p2
            if timesteps is not None:
                dt = (points[i2]['t'] - points[i1]['t']).total_seconds()
                v = seglength / max(0.1, dt)
            else:
                v = seglength / float(i2 - i1)  # assume 1s time spacing

            max_sep = options.max_sep0 + v * options.max_sep_time
            if options.max_dist >= 0:
                max_sep = min(max_sep, options.max_sep)

            if seglength >= max_sep and i1 != i2 - 1:
                # point separation is too far,
                # but always accept direct predecessor i1 = i2 - 1
                if seglength >= max_sep + options.max_dist:
                    # no chance to find a valid earlier predecessor point
                    break
                else:
                    continue

            if points[i1]['cost'] + 1.0 > costmin:
                # the possible predecessor i1 is already too bad.
                continue

            i1_i2_segment_valid = True
            lower_i1_possible = True
            distance_squaremax = 0.0
            distance_squaresum = 0.0
            distances_squared = []
            # iterate all medium points between i1 and i2
            for im in range(i1 + 1, i2):
                pm = array(points[im]['p'])
                d = distance(p1, pm, p2, options.ele_weight)
                if d <= options.max_dist:
                    d_sq = (d / options.max_dist) ** 2
                    distance_squaremax = max(distance_squaremax, d_sq)
                    distance_squaresum += points[im]['weight'] * d_sq
                    distances_squared.append(d_sq)
                else:
                    i1_i2_segment_valid = False

                    # check if connection to any further point i1 is impossible
                    d1 = pl.dot(p1 - p2, p1 - p2)
                    d2 = pl.dot(pm - p2, pm - p2)
                    dd = options.max_dist ** 2
                    d1d2 = pl.dot(p1 - p2, pm - p2)
                    # formula from cosines of point separation angle and
                    # cone-opening angles around points
                    if d1 > dd and d2 > dd and (d1d2 + dd) ** 2 < (d2 - dd) * (d1 - dd):
                        lower_i1_possible = False
                        break

            if not lower_i1_possible:
                break

            if i1_i2_segment_valid:
                if options.weighting == 'sqrdistmax':
                    penalties[i1] = distance_squaremax
                elif options.weighting == 'sqrdistsum':
                    penalties[i1] = distance_squaresum
                elif options.weighting == 'sqrlength':
                    penalties[i1] = (seglength / max_sep) ** 2
                elif options.weighting == 'mix':
                    penalties[i1] = (distance_squaremax * (1.0 + seglength / max_sep))
                elif options.weighting == 'exp':
                    penalties[i1] = 0.5 * sum([
                        0.5 ** i * d for i, d in enumerate(
                            sorted(distances_squared, reverse=True))])
                else:
                    penalties[i1] = 0.0

                # add a penalty for kinks
                if options.bend > 0.:
                    if points[i1]['prev'] != -1:
                        p0 = array(points[points[i1]['prev']]['p'])
                        v0 = p1 - p0
                        v1 = p2 - p1
                        if norm(v0) > 0. and norm(v1) > 0.:
                            v0 /= norm(v0)
                            v1 /= norm(v1)
                            kink = (1.0 - dot(v0, v1)) / 2.0
                            penalties[i1] += options.bend * kink

        # find best predecessor
        imin = None
        costmin = float('inf')
        for prev, penalty in penalties.items():
            # cost function is sum of points used (1.0) plus penalties
            cost = points[prev]['cost'] + 1.0 + penalty
            if cost < costmin:
                imin = prev
                costmin = cost
        points[i2]['cost'] = costmin
        points[i2]['prev'] = imin

        # print progress
        if options.verbose == 1 and (100 * i2) // n > progress and time.time() >= tprint + 1:
            tprint = time.time()
            progress = (100 * i2) // n
            print('\r', progress, '% of', n, 'points', end='')
            stdout.flush()
            progress_printed = True

    if progress_printed:
        print('\r', end='')

    # trace route backwards to collect final points
    final_pnums = []
    i = n - 1
    while i >= 0:
        final_pnums = [i] + final_pnums
        i = points[i]['prev']

    return [original_indices[i] for i in final_pnums]
def simple_interaction_kronecker_deprecated(snps, phenos, covs=None,
                                            Acovs=None, Asnps1=None,
                                            Asnps0=None, K1r=None, K1c=None,
                                            K2r=None, K2c=None,
                                            covar_type='lowrank_diag', rank=1,
                                            searchDelta=False):
    """
    I-variate fixed effects interaction test for phenotype specific SNP
    effects. (Runs multiple likelihood ratio tests and computes the P-values
    in python from the likelihood ratios)

    Args:
        snps:   [N x S] SP.array of S SNPs for N individuals (test SNPs)
        phenos: [N x P] SP.array of P phenotypes for N individuals
        covs:   list of SP.arrays holding covariates.
                Each covs[i] has one corresponding Acovs[i]
        Acovs:  list of SP.arrays holding the phenotype design matrices for
                covariates. Each covs[i] has one corresponding Acovs[i].
        Asnps1: list of SP.arrays of I interaction variables to be tested for
                N individuals. Note that it is assumed that Asnps0 is already
                included. If not provided, the alternative model will be the
                independent model
        Asnps0: single SP.array of I0 interaction variables to be included in
                the background model when testing for interaction with Inters
        K1r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K1c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        covar_type: type of covariance to use. Default 'lowrank_diag'.
                Possible values are
                'freeform':     free form optimization,
                'fixed':        use a fixed matrix specified in covar_K0,
                'diag':         optimize a diagonal matrix,
                'lowrank':      optimize a low rank matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_id':   optimize a low rank matrix plus the weight of
                                a constant diagonal matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_diag': optimize a low rank matrix plus a free
                                diagonal matrix. The rank of the lowrank part
                                is specified in the variable rank,
                'block':        optimize the weight of a constant P x P block
                                matrix of ones,
                'block_id':     optimize the weight of a constant P x P block
                                matrix of ones plus the weight of a constant
                                diagonal matrix,
                'block_diag':   optimize the weight of a constant P x P block
                                matrix of ones plus a free diagonal matrix
        rank:   rank of a possible lowrank component (default 1)
        searchDelta: Boolean indicator if delta is optimized during SNP
                testing (default False)

    Returns:
        pv:     P-values of the interaction test
        lrt0:   log likelihood ratio statistics of the null model
        pv0:    P-values of the null model
        lrt:    log likelihood ratio statistics of the interaction test
        lrtAlt: log likelihood ratio statistics of the alternative model
        pvAlt:  P-values of the alternative model
    """
    S = snps.shape[1]
    # 0. checks
    N = phenos.shape[0]
    P = phenos.shape[1]

    if K1r is None:
        K1r = SP.dot(snps, snps.T)
    else:
        assert K1r.shape[0] == N, 'K1r: dimensions mismatch'
        assert K1r.shape[1] == N, 'K1r: dimensions mismatch'

    if K2r is None:
        K2r = SP.eye(N)
    else:
        assert K2r.shape[0] == N, 'K2r: dimensions mismatch'
        assert K2r.shape[1] == N, 'K2r: dimensions mismatch'

    covs, Acovs = updateKronCovs(covs, Acovs, N, P)

    # Asnps can be several designs
    if Asnps0 is None:
        Asnps0 = [SP.ones([1, P])]
    if Asnps1 is None:
        Asnps1 = [SP.eye(P)]
    if type(Asnps0) != list:
        Asnps0 = [Asnps0]
    if type(Asnps1) != list:
        Asnps1 = [Asnps1]
    assert (len(Asnps0) == 1) and (len(Asnps1) > 0), \
        "need at least one SNP design matrix for null and alt model"

    # one row per column design matrix
    pv = SP.zeros((len(Asnps1), snps.shape[1]))
    lrt = SP.zeros((len(Asnps1), snps.shape[1]))
    pvAlt = SP.zeros((len(Asnps1), snps.shape[1]))
    lrtAlt = SP.zeros((len(Asnps1), snps.shape[1]))

    # 1. run GP model to infer suitable covariance structure
    if K1c is None or K2c is None:
        vc = estimateKronCovariances(phenos=phenos, K1r=K1r, K2r=K2r, K1c=K1c,
                                     K2c=K2c, covs=covs, Acovs=Acovs,
                                     covar_type=covar_type, rank=rank)
        K1c = vc.getEstTraitCovar(0)
        K2c = vc.getEstTraitCovar(1)
    else:
        assert K1c.shape[0] == P, 'K1c: dimensions mismatch'
        assert K1c.shape[1] == P, 'K1c: dimensions mismatch'
        assert K2c.shape[0] == P, 'K2c: dimensions mismatch'
        assert K2c.shape[1] == P, 'K2c: dimensions mismatch'

    # 2. run kroneckerLMM for null model
    lmm = limix.CKroneckerLMM()
    lmm.setK1r(K1r)
    lmm.setK1c(K1c)
    lmm.setK2r(K2r)
    lmm.setK2c(K2c)
    lmm.setSNPs(snps)
    # add covariates
    for ic in range(len(Acovs)):
        lmm.addCovariates(covs[ic], Acovs[ic])
    lmm.setPheno(phenos)
    if searchDelta:
        lmm.setNumIntervalsAlt(100)
    else:
        lmm.setNumIntervalsAlt(0)
    lmm.setNumIntervals0(100)

    # add SNP design
    lmm.setSNPcoldesign(Asnps0[0])
    lmm.process()
    dof0 = Asnps0[0].shape[0]
    pv0 = lmm.getPv()
    lrt0 = ST.chi2.isf(pv0, dof0)
    for iA in range(len(Asnps1)):
        dof1 = Asnps1[iA].shape[0]
        dof = dof1 - dof0
        lmm.setSNPcoldesign(Asnps1[iA])
        lmm.process()
        pvAlt[iA, :] = lmm.getPv()[0]
        lrtAlt[iA, :] = ST.chi2.isf(pvAlt[iA, :], dof1)
        # Don't need the likelihood ratios, as the null model is the same
        # between the two models
        lrt[iA, :] = lrtAlt[iA, :] - lrt0[0]
        pv[iA, :] = ST.chi2.sf(lrt[iA, :], dof)
    return pv, lrt0, pv0, lrt, lrtAlt, pvAlt
def forward_lmm_kronecker(snps, phenos, Asnps=None, Acond=None, K1r=None,
                          K1c=None, K2r=None, K2c=None, covs=None, Acovs=None,
                          threshold=5e-8, maxiter=2, qvalues=False,
                          update_covariances=False, **kw_args):
    """
    Kronecker fixed effects test with forward selection

    Args:
        snps:   [N x S] SP.array of S SNPs for N individuals (test SNPs)
        pheno:  [N x P] SP.array of P phenotypes for N individuals
        K:      [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        covs:   [N x D] SP.array of D covariates for N individuals
        threshold: (float) P-value threshold for inclusion in forward
                selection (default 5e-8)
        maxiter: (int) maximum number of interaction scans. First scan is
                without inclusion, so maxiter-1 inclusions can be performed.
                (default 2)
        qvalues: Use q-value threshold and return q-values in addition
                (default False)
        update_covar: Boolean indicator if covariances should be re-estimated
                after each forward step (default False)

    Returns:
        lm:     limix LMM object
        resultStruct with elements:
            iadded:  array of indices of SNPs included in order of inclusion
            pvadded: array of P-values obtained by the included SNPs in the
                     iteration before inclusion
            pvall:   [maxiter x S] SP.array of P-values for all iterations
        Optional: corresponding q-values
            qvadded
            qvall
    """
    # 0. checks
    N = phenos.shape[0]
    P = phenos.shape[1]

    if K1r is None:
        K1r = SP.dot(snps, snps.T)
    else:
        assert K1r.shape[0] == N, 'K1r: dimensions mismatch'
        assert K1r.shape[1] == N, 'K1r: dimensions mismatch'

    if K2r is None:
        K2r = SP.eye(N)
    else:
        assert K2r.shape[0] == N, 'K2r: dimensions mismatch'
        assert K2r.shape[1] == N, 'K2r: dimensions mismatch'

    covs, Acovs = updateKronCovs(covs, Acovs, N, P)

    if Asnps is None:
        Asnps = [SP.ones([1, P])]
    if type(Asnps) != list:
        Asnps = [Asnps]
    assert len(Asnps) > 0, "need at least one SNP design matrix"

    if Acond is None:
        Acond = Asnps
    if type(Acond) != list:
        Acond = [Acond]
    assert len(Acond) > 0, "need at least one SNP design matrix"

    # 1. run GP model to infer suitable covariance structure
    if K1c is None or K2c is None:
        vc = estimateKronCovariances(phenos=phenos, K1r=K1r, K2r=K2r,
                                     K1c=K1c, K2c=K2c, covs=covs,
                                     Acovs=Acovs, **kw_args)
        K1c = vc.getEstTraitCovar(0)
        K2c = vc.getEstTraitCovar(1)
    else:
        vc = None
        assert K1c.shape[0] == P, 'K1c: dimensions mismatch'
        assert K1c.shape[1] == P, 'K1c: dimensions mismatch'
        assert K2c.shape[0] == P, 'K2c: dimensions mismatch'
        assert K2c.shape[1] == P, 'K2c: dimensions mismatch'

    t0 = time.time()
    lm, pv = kronecker_lmm(snps=snps, phenos=phenos, Asnps=Asnps, K1r=K1r,
                           K2r=K2r, K1c=K1c, K2c=K2c, covs=covs, Acovs=Acovs)

    # get pv
    # start stuff
    iadded = []
    pvadded = []
    qvadded = []
    time_el = []
    pvall = SP.zeros((pv.shape[0] * maxiter, pv.shape[1]))
    qvall = None
    t1 = time.time()
    print("finished GWAS testing in %.2f seconds" % (t1 - t0))
    time_el.append(t1 - t0)
    pvall[0:pv.shape[0], :] = pv
    imin = SP.unravel_index(pv.argmin(), pv.shape)
    score = pv[imin].min()
    niter = 1
    if qvalues:
        assert pv.shape[0] == 1, \
            "This is untested with the fdr package. pv.shape[0]==1 failed"
        qvall = SP.zeros((maxiter, snps.shape[1]))
        qv = FDR.qvalues(pv)
        qvall[0:1, :] = qv
        score = qv[imin]

    # loop:
    while (score < threshold) and niter < maxiter:
        t0 = time.time()
        pvadded.append(pv[imin])
        iadded.append(imin)
        if qvalues:
            qvadded.append(qv[imin])
        if update_covariances and vc is not None:
            vc.addFixedTerm(snps[:, imin[1]:(imin[1] + 1)], Acond[imin[0]])
            # CL: don't know what this does, but findLocalOptima crashes
            # because vc.noisPos=None
            vc.setScales()
            vc.findLocalOptima(fast=True)
            K1c = vc.getEstTraitCovar(0)
            K2c = vc.getEstTraitCovar(1)
            lm.setK1c(K1c)
            lm.setK2c(K2c)
        lm.addCovariates(snps[:, imin[1]:(imin[1] + 1)], Acond[imin[0]])
        for i in range(len(Asnps)):
            # add SNP design
            lm.setSNPcoldesign(Asnps[i])
            lm.process()
            pv[i, :] = lm.getPv()[0]
        pvall[niter * pv.shape[0]:(niter + 1) * pv.shape[0]] = pv
        imin = SP.unravel_index(pv.argmin(), pv.shape)
        if qvalues:
            qv = FDR.qvalues(pv)
            qvall[niter:niter + 1, :] = qv
            score = qv[imin].min()
        else:
            score = pv[imin].min()
        t1 = time.time()
        print("finished GWAS testing in %.2f seconds" % (t1 - t0))
        time_el.append(t1 - t0)
        niter = niter + 1

    RV = {}
    RV['iadded'] = iadded
    RV['pvadded'] = pvadded
    RV['pvall'] = pvall
    RV['time_el'] = time_el
    if qvalues:
        RV['qvall'] = qvall
        RV['qvadded'] = qvadded
    return lm, RV
def kronecker_lmm(snps, phenos, covs=None, Acovs=None, Asnps=None, K1r=None,
                  K1c=None, K2r=None, K2c=None, covar_type='lowrank_diag',
                  rank=1, NumIntervalsDelta0=100, NumIntervalsDeltaAlt=0,
                  searchDelta=False):
    """
    simple wrapper for kroneckerLMM code

    Args:
        snps:   [N x S] SP.array of S SNPs for N individuals (test SNPs)
        phenos: [N x P] SP.array of P phenotypes for N individuals
        covs:   list of SP.arrays holding covariates.
                Each covs[i] has one corresponding Acovs[i]
        Acovs:  list of SP.arrays holding the phenotype design matrices for
                covariates. Each covs[i] has one corresponding Acovs[i].
        Asnps:  single SP.array of I0 interaction variables to be included in
                the background model when testing for interaction with Inters.
                If not provided, the alternative model will be the
                independent model
        K1r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K1c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2r:    [N x N] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        K2c:    [P x P] SP.array of LMM-covariance/kinship coefficients
                (optional). If not provided, then linear regression analysis
                is performed
        covar_type: type of covariance to use. Default 'lowrank_diag'.
                Possible values are
                'freeform':     free form optimization,
                'fixed':        use a fixed matrix specified in covar_K0,
                'diag':         optimize a diagonal matrix,
                'lowrank':      optimize a low rank matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_id':   optimize a low rank matrix plus the weight of
                                a constant diagonal matrix. The rank of the
                                lowrank part is specified in the variable rank,
                'lowrank_diag': optimize a low rank matrix plus a free
                                diagonal matrix. The rank of the lowrank part
                                is specified in the variable rank,
                'block':        optimize the weight of a constant P x P block
                                matrix of ones,
                'block_id':     optimize the weight of a constant P x P block
                                matrix of ones plus the weight of a constant
                                diagonal matrix,
                'block_diag':   optimize the weight of a constant P x P block
                                matrix of ones plus a free diagonal matrix
        rank:   rank of a possible lowrank component (default 1)
        NumIntervalsDelta0:   number of steps for delta optimization on the
                null model (100)
        NumIntervalsDeltaAlt: number of steps for delta optimization on the
                alt. model (0 - no optimization)
        searchDelta: Boolean indicator if delta is optimized during SNP
                testing (default False)

    Returns:
        CKroneckerLMM object
        P-values for all SNPs from likelihood ratio test
    """
    # 0. checks
    N = phenos.shape[0]
    P = phenos.shape[1]

    if K1r is None:
        K1r = SP.dot(snps, snps.T)
    else:
        assert K1r.shape[0] == N, 'K1r: dimensions mismatch'
        assert K1r.shape[1] == N, 'K1r: dimensions mismatch'

    if K2r is None:
        K2r = SP.eye(N)
    else:
        assert K2r.shape[0] == N, 'K2r: dimensions mismatch'
        assert K2r.shape[1] == N, 'K2r: dimensions mismatch'

    covs, Acovs = updateKronCovs(covs, Acovs, N, P)

    # Asnps can be several designs
    if Asnps is None:
        Asnps = [SP.ones([1, P])]
    if type(Asnps) != list:
        Asnps = [Asnps]
    assert len(Asnps) > 0, "need at least one SNP design matrix"

    # one row per column design matrix
    pv = SP.zeros((len(Asnps), snps.shape[1]))

    # 1. run GP model to infer suitable covariance structure
    if K1c is None or K2c is None:
        vc = estimateKronCovariances(phenos=phenos, K1r=K1r, K2r=K2r, K1c=K1c,
                                     K2c=K2c, covs=covs, Acovs=Acovs,
                                     covar_type=covar_type, rank=rank)
        K1c = vc.getEstTraitCovar(0)
        K2c = vc.getEstTraitCovar(1)
    else:
        assert K1c.shape[0] == P, 'K1c: dimensions mismatch'
        assert K1c.shape[1] == P, 'K1c: dimensions mismatch'
        assert K2c.shape[0] == P, 'K2c: dimensions mismatch'
        assert K2c.shape[1] == P, 'K2c: dimensions mismatch'

    # 2. run kroneckerLMM
    lmm = limix.CKroneckerLMM()
    lmm.setK1r(K1r)
    lmm.setK1c(K1c)
    lmm.setK2r(K2r)
    lmm.setK2c(K2c)
    lmm.setSNPs(snps)
    # add covariates
    for ic in range(len(Acovs)):
        lmm.addCovariates(covs[ic], Acovs[ic])
    lmm.setPheno(phenos)

    # delta search on alt. model?
    if searchDelta:
        lmm.setNumIntervalsAlt(NumIntervalsDeltaAlt)
    else:
        lmm.setNumIntervalsAlt(0)
    lmm.setNumIntervals0(NumIntervalsDelta0)

    for iA in range(len(Asnps)):
        # add SNP design
        lmm.setSNPcoldesign(Asnps[iA])
        lmm.process()
        pv[iA, :] = lmm.getPv()[0]
    return lmm, pv
def LcGrad_n(self, i):
    RV = sp.dot(self.U_CstarGrad_n(i).T, self.Cn.USi2().T)
    RV += sp.dot(self.U_Cstar().T, self.Cn.USi2grad(i).T)
    return RV
def CstarGrad_n(self, i):
    RV = sp.dot(self.Cn.USi2grad(i).T, sp.dot(self.Cg.K(), self.Cn.USi2()))
    RV += sp.dot(self.Cn.USi2().T, sp.dot(self.Cg.K(), self.Cn.USi2grad(i)))
    # RV += RV.T
    return RV
def predict(self):
    """ predict the value of the fixed effect (F*B) """
    return sp.dot(self.F, self.B)
def LcGradCnLc(self, i):
    return sp.dot(self.Lc(), sp.dot(self.Cn.K_grad_i(i), self.Lc().T))
def LcGrad_g(self, i):
    return sp.dot(self.U_CstarGrad_g(i).T, self.Cn.USi2().T)
def Cstar(self):
    return sp.dot(self.Cn.USi2().T, sp.dot(self.Cg.K(), self.Cn.USi2()))
def CstarGrad_g(self, i):
    return sp.dot(self.Cn.USi2().T, sp.dot(self.Cg.Kgrad_param(i), self.Cn.USi2()))
def sortKey(point_):
    # `point` comes from the enclosing scope; negating the squared distance
    # makes an ascending sort order points from farthest to nearest
    distance = point - point_
    return -dot(distance.T, distance)
def Lc(self):
    return sp.dot(self.U_Cstar().T, self.Cn.USi2().T)
def _findLocalBall_noinline(self, point):
    """Return the index of the ball that the point lies in."""
    for i, ball in enumerate(self.gridBalls):
        distance = point - ball
        if dot(distance.T, distance) <= self.radiusSquared:
            return i
def LcGrad_n(self, i):
    RV = sp.dot(self.U_CstarGrad_n(i).T, self.Cn.USi2().T)
    RV += sp.dot(self.U_Cstar().T, self.Cn.USi2grad(i).T)
    return RV

def Sgrad_g(self, i):
    return sp.kron(self.S_CstarGrad_g(i), self.Sr())

def Sgrad_n(self, i):
    return sp.kron(self.S_CstarGrad_n(i), self.Sr())


if __name__ == '__main__':
    from limix.core.covar import FreeFormCov
    from limix.utils.preprocess import covar_rescale

    # define row covariance
    dim_r = 10
    X = sp.rand(dim_r, dim_r)
    R = covar_rescale(sp.dot(X, X.T))

    # define col covariances
    dim_c = 3
    Cg = FreeFormCov(dim_c)
    Cn = FreeFormCov(dim_c)

    cov = Cov2KronSum(Cg=Cg, Cn=Cn, R=R)
    cov.setRandomParams()

    print(cov.K())
    print(cov.K_grad_i(0))
def _calcBaseline(self, shapedfitnesses):
    paramWeightings = dot(ones(self.batchSize), self.phiSquareWindow)
    baseline = dot(shapedfitnesses, self.phiSquareWindow) / paramWeightings
    return baseline
def insert(self, point, satellite):
    """Put a point and its satellite information into the hash structure."""
    point = dot(self.projection, point)
    index = self.findBall(point)
    self.balls[index].append((point, satellite))
def _logDerivX(self, sample, x, invSigma):
    return dot(invSigma, (sample - x))
def _revertToSafety(self):
    """ When encountering a bad matrix, this is how we revert to a safe one. """
    self.factorSigma = eye(self.numParameters)
    self.x = self.bestEvaluable
    self.allFactorSigmas[-1][:] = self.factorSigma
    self.sigma = dot(self.factorSigma.T, self.factorSigma)
def train_interactX(X, Y, K, interactants=None, covariates=None,
                    addBiasTerm=True, numintervalsAlt=0, ldeltaminAlt=-1.0,
                    ldeltamaxAlt=1.0, numintervals0=10, ldeltamin0=-5.0,
                    ldeltamax0=5.0):
    """
    compute all pvalues
    If numintervalsAlt==0, use the EMMA-X trick (keep delta fixed over
    alternative models).
    Difference to the previous model: Ux and Ucovariate are recomputed for
    every SNP.
    """
    n, s = X.shape
    n_pheno = Y.shape[1]
    S, U = LA.eigh(K)
    UY = SP.dot(U.T, Y)
    UX = SP.dot(U.T, X)
    if covariates is None:
        covariates = SP.ones([n, 0])
    if addBiasTerm:
        covariates = SP.concatenate((covariates, SP.ones([n, 1])), axis=1)
    # Ucovariates
    Ucovariate = SP.dot(U.T, covariates)
    # Uinteractants
    Uinteractants = SP.dot(U.T, interactants)
    n_covar = covariates.shape[1]
    n_inter = interactants.shape[1]
    # weights
    # foreground: covariates + SNP + interactions
    beta = SP.empty((n_pheno, s, 1 + n_covar + 2 * n_inter))
    # background: covariates + direct SNP effect
    beta0 = SP.empty((n_pheno, s, 1 + n_covar + n_inter))
    LL = SP.ones((n_pheno, s)) * (-SP.inf)
    LL0 = SP.ones((n_pheno, s)) * (-SP.inf)
    ldelta = SP.empty([n_pheno, s])
    ldelta0 = SP.empty([n_pheno, s])
    sigg2 = SP.empty((n_pheno, s))
    sigg20 = SP.empty((n_pheno, s))
    pval = SP.ones((n_pheno, s)) * (-SP.inf)

    # 0. fit the null model on phenotypes and covariates alone
    for phen in SP.arange(n_pheno):
        # fit when phen is visited the first time; loop through phenotypes
        # get transformed Y
        UY_ = UY[:, phen]
        # 1. fit background model to set delta
        ldelta0[phen, :] = optdelta(UY_, Ucovariate, S, ldeltanull=None,
                                    numintervals=numintervals0,
                                    ldeltamin=ldeltamin0,
                                    ldeltamax=ldeltamax0)

    # 1. loop through all snps
    for snp in SP.arange(s):
        # 1. snp-specific background model: SNP effect + covariates + interactants
        Ucovariates_ = SP.hstack((UX[:, snp:snp + 1], Uinteractants, Ucovariate))
        # 2. snp-specific foreground model
        # interactions
        Xi_ = X[:, snp:snp + 1] * interactants
        # transform
        UXi_ = SP.dot(U.T, Xi_)
        # stack: interactions, interactants (main), SNPs (main), covariates (if any)
        UX_ = SP.hstack((UXi_, Ucovariates_))
        for phen in SP.arange(n_pheno):
            # loop through all phenotypes
            UY_ = UY[:, phen]
            # emmaX trick
            ldelta[phen, snp] = ldelta0[phen, snp]
            # evaluate background and foreground
            # null model
            nLL0_, beta0_, sigg20_ = nLLeval(ldelta0[phen, snp], UY_,
                                             Ucovariates_, S, MLparams=True)
            beta0[phen, snp, :] = beta0_
            sigg20[phen, snp] = sigg20_
            LL0[phen, snp] = -nLL0_
            # foreground model
            nLL_, beta_, sigg2_ = nLLeval(ldelta[phen, snp], UY_, UX_, S,
                                          MLparams=True)
            beta[phen, snp, :] = beta_
            sigg2[phen, snp] = sigg2_
            LL[phen, snp] = -nLL_
    pval = st.chi2.sf(2 * (LL - LL0), 1)
    return LL0, LL, pval, ldelta0, sigg20, beta0, ldelta, sigg2, beta
def _logDerivsX(self, samples, x, invSigma):
    samplesArray = array(samples)
    tmpX = multiply(x, ones((len(samplesArray), self.numParameters)))
    return dot(invSigma, (samplesArray - tmpX).T).T
def calc_ld_table(snps, max_ld_dist=2000, min_r2=0.2, verbose=True,
                  normalize=False):
    """
    Calculate LD between all SNPs using a sliding LD square

    This function only retains r^2 values above the given threshold
    """
    # Normalize SNPs (perhaps not necessary, but cheap)
    if normalize:
        snps = snps.T
        snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
        snps = snps.T

    if verbose:
        print('Calculating LD table')
    t0 = time.time()
    num_snps, num_indivs = snps.shape
    ld_table = {}
    for i in range(num_snps):
        ld_table[i] = {}

    a = min(max_ld_dist, num_snps)
    num_pairs = (a * (num_snps - 1)) - a * (a + 1) * 0.5
    if verbose:
        print('Correlation between %d pairs will be tested' % num_pairs)
    num_stored = 0
    for i in range(0, num_snps - 1):
        start_i = i + 1
        end_i = min(start_i + max_ld_dist, num_snps)
        ld_vec = sp.dot(snps[i], sp.transpose(snps[start_i:end_i])) / float(num_indivs)
        ld_vec = sp.array(ld_vec).flatten()
        for k in range(start_i, end_i):
            ld_vec_i = k - start_i
            if ld_vec[ld_vec_i] ** 2 > min_r2:
                ld_table[i][k] = ld_vec[ld_vec_i]
                ld_table[k][i] = ld_vec[ld_vec_i]
                num_stored += 1
        if verbose:
            if i % 1000 == 0:
                sys.stdout.write('.')
                # sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (min(1, float(i + 1) / (num_snps - 1)))))
                sys.stdout.flush()
    if verbose:
        sys.stdout.write('Done.\n')
        if num_pairs > 0:
            print('Stored %d (%0.4f%%) correlations that made the cut (r^2>%0.3f).'
                  % (num_stored, 100 * (num_stored / float(num_pairs)), min_r2))
        else:
            print('-')
    t1 = time.time()
    t = (t1 - t0)
    if verbose:
        print('\nIt took %d minutes and %0.2f seconds to calculate the LD table'
              % (t / 60, t % 60))
    del snps
    return ld_table
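# A toy run (assumptions: `sp` is an older SciPy re-exporting NumPy, and rows
# are SNPs while columns are individuals, matching the snps.shape unpacking
# above). With normalize=True the stored values are correlations:
import numpy as np
snps = np.random.randint(0, 3, size=(50, 200)).astype(float)
table = calc_ld_table(snps, max_ld_dist=10, min_r2=0.05, verbose=False,
                      normalize=True)
print(len(table), sum(len(v) for v in table.values()))  # 50 SNPs, stored pairs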
def ld_pruning(data_file=None, ld_radius=None, out_file_prefix=None, p_thres=None,
               verbose=False, max_r2=0.2):
    """
    LD pruning + P-value thresholding
    """
    df = h5py.File(data_file, 'r')
    has_phenotypes = False
    if 'y' in list(df.keys()):
        print('Validation phenotypes found.')
        y = df['y'][...]  # Phenotype
        num_individs = len(y)
        risk_scores = sp.zeros(num_individs)
        has_phenotypes = True

    print('')
    if max_r2 < 1:
        print('Applying LD-pruning + P-value thresholding with p-value threshold of %0.2e, '
              'a LD radius of %d SNPs, and a max r2 of %0.2f' % (p_thres, ld_radius, max_r2))
    else:
        if p_thres < 1:
            print('Applying P-value thresholding with p-value threshold of %0.2e' % (p_thres))
        else:
            print('Calculating polygenic risk score using all SNPs')
    results_dict = {}
    num_snps = 0
    cord_data_g = df['cord_data']

    chrom_ints = []
    for chrom_str in list(cord_data_g.keys()):
        g = cord_data_g[chrom_str]
        betas = g['betas'][...]
        n_snps = len(betas)
        num_snps += n_snps
        chrom_ints.append(int((chrom_str.split('_'))[1]))
    chrom_ints.sort()
    p_str = '%0.4f' % p_thres
    results_dict[p_str] = {}

    if out_file_prefix:
        # Preparing output files
        raw_effect_sizes = []
        raw_pval_effect_sizes = []
        updated_effect_sizes = []
        updated_pval_effect_sizes = []
        sids = []
        chromosomes = []
        positions = []
        nts = []

    tot_num_snps = 0
    num_snps_used = 0
    for chrom in chrom_ints:
        chrom_str = 'chrom_%d' % chrom
        # print 'Chromosome %s:' % chrom_str
        g = cord_data_g[chrom_str]
        pvalues = g['ps'][...]
        snp_filter = pvalues < p_thres
        num_snps = sp.sum(snp_filter)
        if num_snps == 0:
            # No SNPs pass the threshold; skip this chromosome
            continue
        tot_num_snps += num_snps

        pvalues = pvalues[snp_filter]
        if 'raw_snps_val' in list(g.keys()):
            raw_snps = g['raw_snps_val'][...][snp_filter]
        else:
            raw_snps = g['raw_snps_ref'][...][snp_filter]

        snp_means = g['snp_means_ref'][...][snp_filter]
        snp_stds = g['snp_stds_ref'][...][snp_filter]
        raw_betas = g['log_odds'][...][snp_filter]
        pval_derived_betas = g['betas'][...][snp_filter]
        if out_file_prefix:
            chromosomes.extend([chrom_str] * len(pval_derived_betas))
            positions.extend(g['positions'][...][snp_filter])
            sids.extend(g['sids'][...][snp_filter])
            raw_effect_sizes.extend(raw_betas)
            raw_pval_effect_sizes.extend(pval_derived_betas)
            nts.extend(g['nts'][...][snp_filter])

        if max_r2 < 1:
            # Generating LD table from genotypes
            snp_means.shape = (len(snp_means), 1)
            snp_stds.shape = (len(snp_means), 1)
            # Normalize SNPs..
            norm_ref_snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32')
            ld_table = ld.calc_ld_table(norm_ref_snps, max_ld_dist=ld_radius,
                                        min_r2=max_r2, verbose=verbose)
            updated_raw_betas, pruning_vector = smart_ld_pruning(
                raw_betas, ld_table, pvalues=pvalues, max_ld=max_r2, verbose=verbose)
            updated_pval_derived_betas = pval_derived_betas * pruning_vector
            num_snps_used += sp.sum(pruning_vector)
        else:
            # No pruning: keep the per-chromosome betas unchanged.
            # (The original copied raw_effect_sizes here, which accumulates
            # across chromosomes and would break the per-chromosome dot
            # product below; raw_betas is the per-chromosome array.)
            updated_raw_betas = sp.copy(raw_betas)
            updated_pval_derived_betas = sp.copy(pval_derived_betas)
            updated_pval_derived_betas = updated_pval_derived_betas / (snp_stds.flatten())
            pruning_vector = sp.ones(len(pval_derived_betas))
            num_snps_used += sp.sum(pruning_vector)

        if out_file_prefix:
            updated_effect_sizes.extend(updated_raw_betas)
            updated_pval_effect_sizes.extend(updated_pval_derived_betas)

        if has_phenotypes:
            print('Calculating scores for Chromosome %s' % chrom_str)
            prs = sp.dot(updated_raw_betas, raw_snps)
            risk_scores += prs
            corr = sp.corrcoef(y, prs)[0, 1]
            r2 = corr ** 2
            print('The R2 prediction accuracy of PRS using %s was: %0.4f' % (chrom_str, r2))

    print('There were %d (SNP) effects after p-value thresholding' % tot_num_snps)
    print('After LD-pruning %d SNPs had non-zero effects' % num_snps_used)
    if has_phenotypes:
        num_indivs = len(y)
        results_dict[p_str]['y'] = y
        results_dict[p_str]['risk_scores'] = risk_scores
        print('Prediction accuracy was assessed using %d individuals.' % (num_indivs))

        corr = sp.corrcoef(y, risk_scores)[0, 1]
        r2 = corr ** 2
        results_dict[p_str]['r2_pd'] = r2
        print('The R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)'
              % (r2, ((1 - r2) ** 2) / num_indivs))

        if corr < 0:
            risk_scores = -1 * risk_scores
        # auc = calc_auc(y, risk_scores_pval_derived)
        # print 'AUC for the whole genome was: %0.4f' % auc

        # Now calibration
        denominator = sp.dot(risk_scores.T, risk_scores)
        y_norm = (y - sp.mean(y)) / sp.std(y)
        numerator = sp.dot(risk_scores.T, y_norm)
        regression_slope = (numerator / denominator)
        print('The slope for predictions with P-value derived effects is:', regression_slope)
        results_dict[p_str]['slope_pd'] = regression_slope

    if max_r2 == 1:
        weights_out_file = '%s_all_snps.txt' % (out_file_prefix)
    else:
        weights_out_file = '%s_P+T_p%0.4e.txt' % (out_file_prefix, p_thres)
    with open(weights_out_file, 'w') as f:
        f.write('chrom pos sid nt1 nt2 raw_beta raw_pval_beta updated_beta updated_pval_beta\n')
        # zip instead of the original it.izip, for consistency with the
        # Python 3 style print calls used throughout this function
        for chrom, pos, sid, nt, raw_beta, raw_pval_beta, upd_beta, upd_pval_beta in zip(
                chromosomes, positions, sids, nts, raw_effect_sizes,
                raw_pval_effect_sizes, updated_effect_sizes, updated_pval_effect_sizes):
            nt1, nt2 = nt[0], nt[1]
            f.write('%s %d %s %s %s %0.4e %0.4e %0.4e %0.4e\n'
                    % (chrom, pos, sid, nt1, nt2, raw_beta, raw_pval_beta, upd_beta, upd_pval_beta))
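# Hedged usage sketch for ld_pruning. The HDF5 layout ('cord_data/chrom_*'
# groups holding 'betas', 'ps', 'positions', 'sids', 'nts', 'log_odds',
# 'raw_snps_ref', 'snp_means_ref', 'snp_stds_ref', plus an optional top-level
# 'y') is inferred from the reads above; the file name below is hypothetical
# and must come from the pipeline's coordination step.
#
# ld_pruning(data_file='coord_data.hdf5', ld_radius=100,
#            out_file_prefix='prs_out', p_thres=1e-4, max_r2=0.2)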
def step(self, niter):
    """ xNES """
    f = self.f
    mu, sigma, bmat = self.mu, self.sigma, self.bmat
    eta_mu, eta_sigma, eta_bmat = self.eta_mu, self.eta_sigma, self.eta_bmat
    npop = self.npop
    dim = self.dim
    sigma_old = self.sigma_old

    eyemat = eye(dim)

    with joblib.Parallel(n_jobs=self.n_jobs) as parallel:
        for i in range(niter):
            s_try = randn(npop, dim)
            z_try = mu + sigma * dot(s_try, bmat)  # broadcast

            f_try = parallel(joblib.delayed(f)(z) for z in z_try)
            f_try = asarray(f_try)

            # save if best
            fitness = mean(f_try)
            if fitness - 1e-8 > self.fitness_best:
                self.fitness_best = fitness
                self.mu_best = mu.copy()
                self.counter = 0
            else:
                self.counter += 1
            if self.counter > self.patience:
                self.done = True
                return

            isort = argsort(f_try)
            f_try = f_try[isort]
            s_try = s_try[isort]
            z_try = z_try[isort]

            u_try = self.utilities if self.use_fshape else f_try

            if self.use_adasam and sigma_old is not None:  # sigma_old must be available
                eta_sigma = self.adasam(eta_sigma, mu, sigma, bmat, sigma_old, z_try)

            dj_delta = dot(u_try, s_try)
            dj_mmat = dot(s_try.T, s_try * u_try.reshape(npop, 1)) - sum(u_try) * eyemat
            dj_sigma = trace(dj_mmat) * (1.0 / dim)
            dj_bmat = dj_mmat - dj_sigma * eyemat

            sigma_old = sigma

            # update
            mu += eta_mu * sigma * dot(bmat, dj_delta)
            sigma *= exp(0.5 * eta_sigma * dj_sigma)
            bmat = dot(bmat, expm(0.5 * eta_bmat * dj_bmat))

            # logging
            self.history['fitness'].append(fitness)
            self.history['sigma'].append(sigma)
            self.history['eta_sigma'].append(eta_sigma)

    # keep last results
    self.mu, self.sigma, self.bmat = mu, sigma, bmat
    self.eta_sigma = eta_sigma
    self.sigma_old = sigma_old
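# The core of the xNES update above, isolated as a hedged standalone sketch:
# utility shaping, AdaSam, early stopping, and parallel evaluation are stripped
# out, so the natural-gradient steps for mu, sigma, and B can be verified by eye.
def _example_xnes_core_step(f, mu, sigma, bmat, npop=20,
                            eta_mu=1.0, eta_sigma=0.1, eta_bmat=0.05):
    from numpy import dot, eye, argsort, asarray, trace, exp, sum
    from numpy.random import randn
    from scipy.linalg import expm
    dim = mu.shape[0]
    s_try = randn(npop, dim)                  # standard-normal search directions
    z_try = mu + sigma * dot(s_try, bmat)     # candidates in problem space
    f_try = asarray([f(z) for z in z_try])
    isort = argsort(f_try)                    # ascending; larger f = fitter here
    s_try = s_try[isort]
    u_try = f_try[isort]                      # raw fitnesses stand in for utilities
    dj_delta = dot(u_try, s_try)
    dj_mmat = dot(s_try.T, s_try * u_try.reshape(npop, 1)) - sum(u_try) * eye(dim)
    dj_sigma = trace(dj_mmat) / dim
    dj_bmat = dj_mmat - dj_sigma * eye(dim)
    mu = mu + eta_mu * sigma * dot(bmat, dj_delta)
    sigma = sigma * exp(0.5 * eta_sigma * dj_sigma)
    bmat = dot(bmat, expm(0.5 * eta_bmat * dj_bmat))
    return mu, sigma, bmat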
def get_LDpred_ld_tables(snps, ld_radius=100, ld_window_size=0, h2=None,
                         n_training=None, gm=None, gm_ld_radius=None):
    """
    Calculates LD tables, and the LD score in one go...
    """
    ld_dict = {}
    m, n = snps.shape
    ld_scores = sp.ones(m)
    ret_dict = {}
    if gm_ld_radius is None:
        for snp_i, snp in enumerate(snps):
            # Calculate D
            start_i = max(0, snp_i - ld_radius)
            stop_i = min(m, snp_i + ld_radius + 1)
            X = snps[start_i:stop_i]
            D_i = sp.dot(snp, X.T) / n
            r2s = D_i ** 2
            ld_dict[snp_i] = D_i
            lds_i = sp.sum(r2s - (1 - r2s) / (n - 2), dtype='float32')
            ld_scores[snp_i] = lds_i
    else:
        assert gm is not None, 'Genetic map is missing.'
        window_sizes = []
        ld_boundaries = []
        for snp_i, snp in enumerate(snps):
            curr_cm = gm[snp_i]

            # Now find the lower boundary
            start_i = snp_i
            min_cm = gm[snp_i]
            while start_i > 0 and min_cm > curr_cm - gm_ld_radius:
                start_i = start_i - 1
                min_cm = gm[start_i]

            # Now find the upper boundary
            stop_i = snp_i
            max_cm = gm[snp_i]
            # (the original tested 'stop_i > 0' here, which never runs at the
            # first SNP and never stops at the end of the array; 'stop_i < m - 1'
            # is the intended bound)
            while stop_i < m - 1 and max_cm < curr_cm + gm_ld_radius:
                stop_i = stop_i + 1
                max_cm = gm[stop_i]

            ld_boundaries.append([start_i, stop_i])
            curr_ws = stop_i - start_i
            window_sizes.append(curr_ws)
            assert curr_ws > 0, 'Some issues with the genetic map'

            X = snps[start_i:stop_i]
            D_i = sp.dot(snp, X.T) / n
            r2s = D_i ** 2
            ld_dict[snp_i] = D_i
            lds_i = sp.sum(r2s - (1 - r2s) / (n - 2), dtype='float32')
            ld_scores[snp_i] = lds_i

        avg_window_size = sp.mean(window_sizes)
        print('Average # of SNPs in LD window was %0.2f' % avg_window_size)
        if ld_window_size == 0:
            ld_window_size = int(avg_window_size * 2)  # cast so range() below gets an integer step
        ret_dict['ld_boundaries'] = ld_boundaries
    ret_dict['ld_dict'] = ld_dict
    ret_dict['ld_scores'] = ld_scores

    if ld_window_size > 0:
        ref_ld_matrices = []
        inf_shrink_matrices = []
        for wi in range(0, m, ld_window_size):
            start_i = wi
            stop_i = min(m, wi + ld_window_size)
            curr_window_size = stop_i - start_i
            X = snps[start_i:stop_i]
            D = sp.dot(X, X.T) / n
            ref_ld_matrices.append(D)
            if h2 is not None and n_training is not None:
                A = ((m / h2) * sp.eye(curr_window_size) + (n_training / (1)) * D)
                A_inv = linalg.pinv(A)
                inf_shrink_matrices.append(A_inv)
        ret_dict['ref_ld_matrices'] = ref_ld_matrices
        if h2 is not None and n_training is not None:
            ret_dict['inf_shrink_matrices'] = inf_shrink_matrices
    return ret_dict
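# Hedged usage sketch for get_LDpred_ld_tables: synthetic normalized genotypes,
# no genetic map, with h2 and n_training supplied so the shrinkage matrices
# (m/h2 * I + n * D)^-1 are also built.
def _example_get_LDpred_ld_tables():
    import scipy as sp
    sp.random.seed(0)
    snps = sp.random.randn(50, 200)  # 50 SNPs x 200 individuals
    snps = (snps - sp.mean(snps, 1)[:, None]) / sp.std(snps, 1)[:, None]
    ret = get_LDpred_ld_tables(snps, ld_radius=10, ld_window_size=20,
                               h2=0.5, n_training=200)
    # one LD score per SNP; ceil(50/20) = 3 windowed D and shrinkage matrices
    print(len(ret['ld_scores']), len(ret['ref_ld_matrices']),
          len(ret['inf_shrink_matrices']))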
def compute_JM(direction, variables, model, idx):
    """
    Compute the Jeffries-Matusita distance of the model using the
    variables idx +/- one of `variables`.
    Inputs:
        direction: 'forward' or 'backward'
        variables: the variables to add to idx
        model:     the model built with all the variables
        idx:       the pool of retained variables
    Output:
        JM: the estimated Jeffries-Matusita distance

    Used in GMM.forward_selection() and GMM.backward_selection()
    """
    # Get machine precision
    eps = sp.finfo(sp.float64).eps

    # Initialization
    JM = sp.zeros(variables.size)
    halfedLogdet = sp.zeros((model.C, variables.size))

    # Compute all possible updates of 0.5 * log det cov(idx)
    if len(idx) == 0:
        for c in xrange(model.C):
            for k, var in enumerate(variables):
                halfedLogdet[c, k] = 0.5 * sp.log(model.cov[c, var, var])
    else:
        for c in xrange(model.C):
            vp, Q, _ = model.decomposition(model.cov[c, idx, :][:, idx])
            logdet = sp.sum(sp.log(vp))
            invCov = sp.dot(Q, ((1 / vp) * Q).T)
            for k, var in enumerate(variables):
                if direction == 'forward':
                    alpha = model.cov[c, var, var] - sp.dot(model.cov[c, var, :][idx],
                                                            sp.dot(invCov, model.cov[c, var, :][idx].T))
                elif direction == 'backward':
                    alpha = invCov[k, k]  # it actually corresponds to 1/alpha from the report
                if alpha < eps:
                    alpha = eps
                halfedLogdet[c, k] = 0.5 * (sp.log(alpha) + logdet)
        del vp, Q, alpha, invCov

    if len(idx) == 0:
        for i in xrange(model.C):
            for j in xrange(i + 1, model.C):
                for k, var in enumerate(variables):
                    md = (model.mean[i, var] - model.mean[j, var])
                    cs = (model.cov[i, var, var] + model.cov[j, var, var]) / 2
                    logdet_ij = sp.log(2 * cs)  # 2* because we want det of 2*cs
                    invCov = 1 / cs
                    bij = md * invCov * md / 8 + 0.5 * (logdet_ij - halfedLogdet[i, k] - halfedLogdet[j, k])
                    JM[k] += sp.sqrt(2 * (1 - sp.exp(-bij))) * model.prop[i] * model.prop[j]
    else:
        for i in xrange(model.C):
            for j in xrange(i + 1, model.C):
                cs = (model.cov[i, idx, :][:, idx] + model.cov[j, idx, :][:, idx]) / 2
                vp, Q, rcond = model.decomposition(cs)
                invCov = sp.dot(Q, ((1 / vp) * Q).T)
                logdet = sp.sum(sp.log(vp))
                for k, var in enumerate(variables):
                    md = (model.mean[i, idx] - model.mean[j, idx])
                    if direction == 'forward':
                        id_t = list(idx)
                        id_t.append(var)
                        c1 = (model.cov[i, var, var] + model.cov[j, var, var]) / 2
                        c2 = (model.cov[i, var, :][idx] + model.cov[j, var, :][idx]) / 2
                        alpha = c1 - sp.dot(c2, sp.dot(invCov, c2.T))
                        if alpha < eps:
                            alpha = eps
                        logdet_ij = logdet + sp.log(alpha * 2 ** (len(id_t)))  # *2^d because we want det of 2*cs
                        md_new = (model.mean[i, id_t] - model.mean[j, id_t])
                        row_feat = sp.hstack((-1 / alpha * sp.dot(c2, invCov), 1 / alpha))
                        cst_feat = alpha * (sp.dot(row_feat, md_new.T) ** 2)
                    elif direction == 'backward':
                        alpha = 1 / invCov[k, k]
                        if alpha < eps:
                            alpha = eps
                        logdet_ij = logdet + sp.log(2 ** (len(idx) - 1) / alpha)  # *2^d because we want det of 2*cs
                        row_feat = invCov[k, :]
                        cst_feat = -alpha * (sp.dot(row_feat, md.T) ** 2)
                    temp = sp.dot(md, sp.dot(invCov, md.T)) + cst_feat
                    bij = temp / 8 + 0.5 * (logdet_ij - halfedLogdet[i, k] - halfedLogdet[j, k])
                    if bij < eps:
                        bij = eps
                    JM[k] += sp.sqrt(2 * (1 - sp.exp(-bij))) * model.prop[i] * model.prop[j]
    return JM
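# For reference, a hedged standalone computation of the pairwise distance that
# compute_JM accumulates: for two Gaussians (mu_i, C_i) and (mu_j, C_j), with
# B the Bhattacharyya distance,
#   B  = (1/8) md^T [(C_i+C_j)/2]^-1 md
#        + (1/2) log( det((C_i+C_j)/2) / sqrt(det(C_i) det(C_j)) )
#   JM = sqrt(2 * (1 - exp(-B)))
# This is the textbook convention; the incremental code above uses
# det(C_i + C_j) in the log-det term (its '2^d' factors), so the two differ by
# a constant per dimension.
def _example_pairwise_JM(mu_i, mu_j, C_i, C_j):
    import scipy as sp
    from scipy import linalg
    cs = (C_i + C_j) / 2
    md = mu_i - mu_j
    B = (sp.dot(md, linalg.solve(cs, md)) / 8
         + 0.5 * sp.log(linalg.det(cs)
                        / sp.sqrt(linalg.det(C_i) * linalg.det(C_j))))
    return sp.sqrt(2 * (1 - sp.exp(-B)))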
def findRotationMatrix(P, Q, R):
    # Make scipy vectors out of P, Q, R
    Pv = r_[P.x, P.y, P.z]
    Qv = r_[Q.x, Q.y, Q.z]
    Rv = r_[R.x, R.y, R.z]

    """
    1.) Find a vector that is normal to the plane (P,Q,R) by taking two
        vectors (a,b) in the plane and finding a vector orthogonal to both.
        Vectors a and b lie in the plane translated to the origin.
    """
    a = Qv - Pv
    b = Rv - Pv
    n = cross_product(a, b)
    n = n / mag(n)  # Normalize

    """
    2.) Create the new axes. These will be used to transform the original
        axes into the plane.
    """
    y_new = a / mag(a)
    z_new = n
    # Now all vectors in the new axes with a 0 z component will lie in the plane
    x_new = cross_product(y_new, z_new) / mag(cross_product(y_new, z_new))

    """
    3.) Finding the Euler angles requires a little bit of geometry. Please refer
        to M.E. Rose, Elementary Theory of Angular Momentum, Wiley: New York,
        1957 for an explanation of the angles. Unfortunately there are a lot of
        different conventions; the one used here is the one mentioned above.
    """
    # Beta is measured between the new z axis and the old z axis. All angles
    # are measured counter-clockwise, so if the y value of the new z axis is
    # < 0 then we have to subtract the angle from 360 degrees.
    # line = cross_product([0,0,1], z_new)
    if z_new[1] < 0:
        # line is a vector that specifies the intersection of the two x,y
        # planes; alpha and gamma are found using this line
        line = cross_product(z_new, [0, 0, 1])
        beta = 2 * pi - acos(dot(z_new, [0, 0, 1]) / (mag(z_new) * mag([0, 0, 1])))
    else:
        # line is a vector that specifies the intersection of the two x,y
        # planes; alpha and gamma are found using this line
        line = cross_product([0, 0, 1], z_new)
        beta = acos(dot(z_new, [0, 0, 1]) / (mag(z_new) * mag([0, 0, 1])))

    # Alpha is the angle between 'line' and the original y axis.
    if line[0] > 0:
        alpha = 2 * pi - acos(dot(line, [0, 1, 0]) / (mag(line) * mag([0, 1, 0])))
    else:
        alpha = acos(dot(line, [0, 1, 0]) / (mag(line) * mag([0, 1, 0])))

    # Gamma is the angle between 'line' and the new y axis
    if (beta < pi and y_new[2] < 0) or (beta > pi and y_new[2] > 0):
        gamma = 2 * pi - acos(dot(line, y_new) / (mag(line) * mag(y_new)))
    else:
        gamma = acos(dot(line, y_new) / (mag(line) * mag(y_new)))

    """
    4.) Now all we need to do is find the rotation matrix. This can be found by
        multiplying the three rotation matrices (one for each angle).
    """
    rot11 = cos(alpha) * cos(beta) * cos(gamma) - sin(alpha) * sin(gamma)
    rot12 = -cos(alpha) * cos(beta) * sin(gamma) - sin(alpha) * cos(gamma)
    rot13 = cos(alpha) * sin(beta)
    rot21 = sin(alpha) * cos(beta) * cos(gamma) + cos(alpha) * sin(gamma)
    rot22 = -sin(alpha) * cos(beta) * sin(gamma) + cos(alpha) * cos(gamma)
    rot23 = sin(alpha) * sin(beta)
    rot31 = -sin(beta) * cos(gamma)
    rot32 = sin(beta) * sin(gamma)
    rot33 = cos(beta)
    rot = r_[[[rot11, rot12, rot13]],
             [[rot21, rot22, rot23]],
             [[rot31, rot32, rot33]]]
    return rot
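# Hedged usage sketch for findRotationMatrix: P, Q, R only need .x/.y/.z
# attributes, so a namedtuple stands in here; cross_product and mag are the
# module helpers used above. A quick sanity check is that the rotation (up to
# the convention's sense of application, possibly rot.T) should carry the
# plane normal onto the z axis.
def _example_findRotationMatrix():
    from collections import namedtuple
    from numpy import dot, r_
    Point = namedtuple('Point', ['x', 'y', 'z'])
    P, Q, R = Point(0., 0., 0.), Point(1., 0., 0.5), Point(0., 1., 0.5)
    rot = findRotationMatrix(P, Q, R)
    a = r_[Q.x, Q.y, Q.z] - r_[P.x, P.y, P.z]
    b = r_[R.x, R.y, R.z] - r_[P.x, P.y, P.z]
    n = cross_product(a, b)
    print(dot(rot, n / mag(n)))  # expected to be close to [0, 0, +/-1]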