def TESTBEFOREUSING_pca_covariates(bedfile, npc, outfile=None):
    '''
    Read in a bed file and compute PC covariates (a la Eigenstrat).

    As the name says, this routine is unverified: test it before relying on it.

    bedfile : path of the bed file, including the .bed extension (the
              extension is stripped before the path is handed to Bed)
    npc     : number of leading components to return as covariates
    outfile : optional path; when given, the covariates are also written
              with fastlmm.util.util.write_plink_covariates

    returns pccov, U, S where
      U     : left singular vectors of the kernel K = snps * snps.T
      S     : square roots of the singular values of K
      pccov : the first npc columns of U, each scaled by its entry of S
    '''
    import os.path
    import time
    import scipy.linalg as la
    import numpy as np

    root = os.path.splitext(bedfile)[0]

    print("reading in bed file...")
    t0 = time.time()
    SNPs = Bed(root).read().standardize()
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    # The previous code read `snps` without ever binding it, so it always
    # failed with UnboundLocalError at this point.
    # NOTE(review): assumes Bed(...).read().standardize() returns a
    # pysnptools-style SnpData exposing `.val` (individuals x SNPs matrix)
    # and `.iid` -- confirm against the Bed class this module imports.
    snps = SNPs.val
    iid = SNPs.iid

    N, M = snps.shape
    print("found nind=" + str(N) + ", nsnp=" + str(M))
    if M < N:
        print("only doing full rank, should use low rank here")

    # Mean-centering the individuals is deliberately skipped: it is not
    # needed and, in practice, makes no difference.

    # Array math goes through the locally imported numpy instead of relying
    # on a module-level `sp` alias being present.
    print("computing kernel...")
    t0 = time.time()
    K = np.dot(snps, snps.T)
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    # Drop every reference to the genotype matrix before the SVD so it can be
    # reclaimed; only the ids are needed from here on.
    del snps, SNPs

    t0 = time.time()
    print("computing svd...")
    U, S, _ = la.svd(K, full_matrices=False)
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    S = np.sqrt(S)
    # Broadcasting scales column j of U by S[j]; restricting to the first npc
    # columns avoids building the full N x N product.
    pccov = U[:, :npc] * S[:npc]
    print("done.")

    if outfile is not None:
        import fastlmm.util.util as ut
        ut.write_plink_covariates(iid, pccov, outfile)

    return pccov, U, S
def TESTBEFOREUSING_pca_covariates(bedfile, npc, outfile=None):
    '''
    Read in a bed file and compute PC covariates (a la Eigenstrat).

    As the name says, this routine is unverified: test it before relying on it.

    bedfile : path of the bed file, including the .bed extension (the
              extension is stripped before the path is handed to Bed)
    npc     : number of leading components to return as covariates
    outfile : optional path; when given, the covariates are also written
              with fastlmm.util.util.write_plink_covariates

    returns pccov, U, S where
      U     : left singular vectors of the kernel K = snps * snps.T
      S     : square roots of the singular values of K
      pccov : the first npc columns of U, each scaled by its entry of S
    '''
    import os.path
    import time
    import scipy.linalg as la
    import numpy as np

    root = os.path.splitext(bedfile)[0]

    print("reading in bed file...")
    t0 = time.time()
    SNPs = Bed(root).read().standardize()
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    # The previous code read `snps` without ever binding it, so it always
    # failed with UnboundLocalError at this point.
    # NOTE(review): assumes Bed(...).read().standardize() returns a
    # pysnptools-style SnpData exposing `.val` (individuals x SNPs matrix)
    # and `.iid` -- confirm against the Bed class this module imports.
    snps = SNPs.val
    iid = SNPs.iid

    N, M = snps.shape
    print("found nind=" + str(N) + ", nsnp=" + str(M))
    if M < N:
        print("only doing full rank, should use low rank here")

    # Mean-centering the individuals is deliberately skipped: it is not
    # needed and, in practice, makes no difference.

    # Array math goes through the locally imported numpy instead of relying
    # on a module-level `sp` alias being present.
    print("computing kernel...")
    t0 = time.time()
    K = np.dot(snps, snps.T)
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    # Drop every reference to the genotype matrix before the SVD so it can be
    # reclaimed; only the ids are needed from here on.
    del snps, SNPs

    t0 = time.time()
    print("computing svd...")
    U, S, _ = la.svd(K, full_matrices=False)
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    S = np.sqrt(S)
    # Broadcasting scales column j of U by S[j]; restricting to the first npc
    # columns avoids building the full N x N product.
    pccov = U[:, :npc] * S[:npc]
    print("done.")

    if outfile is not None:
        import fastlmm.util.util as ut
        ut.write_plink_covariates(iid, pccov, outfile)

    return pccov, U, S
def linreg_entire_dataset(self, output_prefix):
    '''
    Run self.G.lin_reg over the entire data set and return the results
    ordered by p-value.

    output_prefix : when not None and self.num_pcs > 0, self.pcs is also
                    written (with fastlmm.util.util.write_plink_covariates)
                    to the file named by self.pcs_filename(output_prefix)

    returns (pvals, sids): the p-values from lin_reg in ascending order and
    the entries of self.sid reordered the same way
    '''
    self.run_once()

    # We do a little extra work here: the pcs are written out at this point
    # because they are known here and this method is only run once, even on
    # big cluster jobs.
    if output_prefix is not None and self.num_pcs > 0:
        filename = self.pcs_filename(output_prefix)
        import fastlmm.util.util as ut
        ut.write_plink_covariates(self.ind_iid, self.pcs, filename)

    logging.info("performing final scan through entire data set")
    tt0 = time.time()

    # Only the second value returned by lin_reg is used.
    _F, _pval = self.G.lin_reg(self.y, self.X)

    # One ordering is applied to both arrays so each id stays aligned with
    # its p-value.
    feat_idx = np.argsort(_pval)
    lingreg_results = (_pval[feat_idx], self.sid[feat_idx])

    logging.info("fin_scan time %.2f s", time.time() - tt0)
    return lingreg_results
def linreg_entire_dataset(self, output_prefix):
    '''
    Run self.G.lin_reg over the entire data set and return the results
    ordered by p-value.

    output_prefix : when not None and self.num_pcs > 0, self.pcs is also
                    written (with fastlmm.util.util.write_plink_covariates)
                    to the file named by self.pcs_filename(output_prefix)

    returns (pvals, sids): the p-values from lin_reg in ascending order and
    the entries of self.sid reordered the same way
    '''
    self.run_once()

    # We do a little extra work here: the pcs are written out at this point
    # because they are known here and this method is only run once, even on
    # big cluster jobs.
    if output_prefix is not None and self.num_pcs > 0:
        filename = self.pcs_filename(output_prefix)
        import fastlmm.util.util as ut
        ut.write_plink_covariates(self.ind_iid, self.pcs, filename)

    logging.info("performing final scan through entire data set")
    tt0 = time.time()

    # Only the second value returned by lin_reg is used.
    _F, _pval = self.G.lin_reg(self.y, self.X)

    # One ordering is applied to both arrays so each id stays aligned with
    # its p-value.
    feat_idx = np.argsort(_pval)
    lingreg_results = (_pval[feat_idx], self.sid[feat_idx])

    logging.info("fin_scan time %.2f s", time.time() - tt0)
    return lingreg_results