def scan(bfile,Y,cov,null,wnds,minSnps,i0,i1,perm_i,resfile,F,colCovarType_r='lowrank',rank_r=1): if perm_i is not None: print 'Generating permutation (permutation %d)'%perm_i np.random.seed(perm_i) perm = np.random.permutation(Y.shape[0]) mtSet = limix.MTSet(Y=Y, S_R=cov['eval'], U_R=cov['evec'], F=F, rank=rank_r) mtSet.setNull(null) bim = plink_reader.readBIM(bfile,usecols=(0,1,2,3)) fam = plink_reader.readFAM(bfile,usecols=(0,1)) print 'fitting model' wnd_file = csv.writer(open(resfile,'wb'),delimiter='\t') for wnd_i in range(i0,i1): print '.. window %d - (%d, %d-%d) - %d snps'%(wnd_i,int(wnds[wnd_i,1]),int(wnds[wnd_i,2]),int(wnds[wnd_i,3]),int(wnds[wnd_i,-1])) if int(wnds[wnd_i,-1])<minSnps: print 'SKIPPED: number of snps lower than minSnps' continue #RV = bed.read(PositionRange(int(wnds[wnd_i,-2]),int(wnds[wnd_i,-1]))) RV = plink_reader.readBED(bfile, useMAFencoding=True, blocksize = 1, start = int(wnds[wnd_i,4]), nSNPs = int(wnds[wnd_i,5]), order = 'F',standardizeSNPs=False,ipos = 2,bim=bim,fam=fam) Xr = RV['snps'] if perm_i is not None: Xr = Xr[perm,:] rv = mtSet.optimize(Xr) line = np.concatenate([wnds[wnd_i,:],rv['LLR']]) wnd_file.writerow(line) pass
def computePCsPython(out_dir,k,bfile,ffile): """ reading in """ RV = plink_reader.readBED(bfile,useMAFencoding=True) X = RV['snps'] """ normalizing markers """ print 'Normalizing SNPs...' p_ref = X.mean(axis=0)/2. X -= 2*p_ref with warnings.catch_warnings(): warnings.simplefilter("ignore") X /= sp.sqrt(2*p_ref*(1-p_ref)) hasNan = sp.any(sp.isnan(X),axis=0) print '%d SNPs have a nan entry. Exluding them for computing the covariance matrix.'%hasNan.sum() X = X[:,~hasNan] """ computing prinicipal components """ U,S,Vt = ssl.svds(X,k=k) U -= U.mean(0) U /= U.std(0) U = U[:,::-1] """ saving to output """ np.savetxt(ffile, U, delimiter='\t',fmt='%.6f')
def computeCovarianceMatrixPython(out_dir, bfile, cfile, sim_type='RRM'): print "Using python to create covariance matrix. This might be slow. We recommend using plink instead." if sim_type is not 'RRM': raise Exception('sim_type %s is not known' % sim_type) """ loading data """ data = plink_reader.readBED(bfile, useMAFencoding=True) iid = data['iid'] X = data['snps'] N = X.shape[1] print '%d variants loaded.' % N print '%d people loaded.' % X.shape[0] """ normalizing markers """ print 'Normalizing SNPs...' p_ref = X.mean(axis=0) / 2. X -= 2 * p_ref with warnings.catch_warnings(): warnings.simplefilter("ignore") X /= sp.sqrt(2 * p_ref * (1 - p_ref)) hasNan = sp.any(sp.isnan(X), axis=0) print '%d SNPs have a nan entry. Exluding them for computing the covariance matrix.' % hasNan.sum( ) """ computing covariance matrix """ print 'Computing relationship matrix...' K = sp.dot(X[:, ~hasNan], X[:, ~hasNan].T) K /= 1. * N print 'Relationship matrix calculation complete' print 'Relationship matrix written to %s.cov.' % cfile print 'IDs written to %s.cov.id.' % cfile """ saving to output """ np.savetxt(cfile + '.cov', K, delimiter='\t', fmt='%.6f') np.savetxt(cfile + '.cov.id', iid, delimiter=' ', fmt='%s')
def getRegion(self,size=3e4,min_nSNPs=1,chrom_i=None,pos_min=None,pos_max=None):
    """
    Sample a region from the piece of genotype X, chrom, pos

    size:       physical size (bp) of the region; size==1 samples a single SNP
    min_nSNPs:  minimum number of SNPs contained in the region
    chrom_i:    restrict to this chromosome (random chromosome if None)
    pos_min, pos_max: optional position bounds

    Returns:
        Xr:     genotypes of the sampled region
        region: array [chrom, start_pos, end_pos] of the region
    """
    bim = None
    if (self.chrom is None) or (self.pos is None):
        bim = plink_reader.readBIM(self.bfile,usecols=(0,1,2,3))
        chrom = SP.array(bim[:,0],dtype=int)
        pos = SP.array(bim[:,3],dtype=int)
    else:
        chrom = self.chrom
        pos = self.pos

    if chrom_i is None:
        n_chroms = chrom.max()
        chrom_i = int(SP.ceil(SP.rand()*n_chroms))

    pos = pos[chrom==chrom_i]
    chrom = chrom[chrom==chrom_i]

    ipos = SP.ones(len(pos),dtype=bool)
    if pos_min is not None:
        ipos = SP.logical_and(ipos,pos_min<pos)
    if pos_max is not None:
        ipos = SP.logical_and(ipos,pos<pos_max)
    pos = pos[ipos]
    chrom = chrom[ipos]

    if size==1:
        # select single SNP
        # BUGFIX: floor (not ceil) so the index is uniform in
        # [0, n_snps) and can never be out of range
        idx = int(SP.floor(pos.shape[0]*SP.rand()))
        cis = SP.arange(pos.shape[0])==idx
        region = SP.array([chrom_i,pos[idx],pos[idx]])
    else:
        # rejection-sample a window start until it fits on the chromosome
        # and contains enough SNPs
        while 1:
            idx = int(SP.floor(pos.shape[0]*SP.rand()))
            posT1 = pos[idx]
            posT2 = pos[idx]+size
            if posT2<=pos.max():
                cis = chrom==chrom_i
                cis*= (pos>posT1)*(pos<posT2)
                if cis.sum()>min_nSNPs:
                    break
        region = SP.array([chrom_i,posT1,posT2])

    start = SP.nonzero(cis)[0].min()
    nSNPs = cis.sum()

    if self.X is None:
        # BUGFIX: bim was unbound here when self.chrom/self.pos were set
        # but self.X was not; read it lazily before use
        if bim is None:
            bim = plink_reader.readBIM(self.bfile,usecols=(0,1,2,3))
        rv = plink_reader.readBED(self.bfile,useMAFencoding=True,start = start, nSNPs = nSNPs,bim=bim)
        Xr = rv['snps']
    else:
        # BUGFIX: was 'nSnps' (undefined name) -> NameError on this path
        Xr = self.X[:,start:start+nSNPs]

    return Xr, region
def scan(bfile, Y, cov, null, wnds, minSnps, i0, i1, perm_i, resfile, F, colCovarType_r='lowrank', rank_r=1): if perm_i is not None: print 'Generating permutation (permutation %d)' % perm_i np.random.seed(perm_i) perm = np.random.permutation(Y.shape[0]) mtSet = limix.MTSet(Y=Y, S_R=cov['eval'], U_R=cov['evec'], F=F, rank=rank_r) mtSet.setNull(null) bim = plink_reader.readBIM(bfile, usecols=(0, 1, 2, 3)) fam = plink_reader.readFAM(bfile, usecols=(0, 1)) print 'fitting model' wnd_file = csv.writer(open(resfile, 'wb'), delimiter='\t') for wnd_i in range(i0, i1): print '.. window %d - (%d, %d-%d) - %d snps' % ( wnd_i, int(wnds[wnd_i, 1]), int(wnds[wnd_i, 2]), int( wnds[wnd_i, 3]), int(wnds[wnd_i, -1])) if int(wnds[wnd_i, -1]) < minSnps: print 'SKIPPED: number of snps lower than minSnps' continue #RV = bed.read(PositionRange(int(wnds[wnd_i,-2]),int(wnds[wnd_i,-1]))) RV = plink_reader.readBED(bfile, useMAFencoding=True, blocksize=1, start=int(wnds[wnd_i, 4]), nSNPs=int(wnds[wnd_i, 5]), order='F', standardizeSNPs=False, ipos=2, bim=bim, fam=fam) Xr = RV['snps'] if perm_i is not None: Xr = Xr[perm, :] rv = mtSet.optimize(Xr) line = np.concatenate([wnds[wnd_i, :], rv['LLR']]) wnd_file.writerow(line) pass
def _genBgTerm_fromSNPs(self,vTot=0.5,vCommon=0.1,pCausal=0.5,plot=False): """ generate """ if self.X is None: print 'Reading in all SNPs. This is slow.' rv = plink_reader.readBED(self.bfile,useMAFencoding=True) X = rv['snps'] else: X = self.X S = X.shape[1] vSpecific = vTot-vCommon # select causal SNPs nCausal = int(SP.floor(pCausal*S)) Ic = selectRnd(nCausal,S) X = X[:,Ic] # common effect Bc = SP.dot(self.genWeights(nCausal,self.P),self.genTraitEffect()) Yc = SP.dot(X,Bc) Yc *= SP.sqrt(vCommon/Yc.var(0).mean()) # indipendent effect Bi = SP.randn(nCausal,self.P) Yi = SP.dot(X,Bi) Yi *= SP.sqrt(vSpecific/Yi.var(0).mean()) if plot: import pylab as PL PL.ion() for p in range(self.P): PL.subplot(self.P,1,p+1) PL.plot(SP.arange(self.X.shape[1])[Ic],Bc[:,p],'o',color='y',alpha=0.05) PL.plot(SP.arange(self.X.shape[1])[Ic],Bi[:,p],'o',color='r',alpha=0.05) #PL.ylim(-2,2) PL.plot([0,Ic.shape[0]],[0,0],'k') return Yc, Yi
def computeCovarianceMatrixPython(out_dir,bfile,cfile,sim_type='RRM'): print "Using python to create covariance matrix. This might be slow. We recommend using plink instead." if sim_type is not 'RRM': raise Exception('sim_type %s is not known'%sim_type) """ loading data """ data = plink_reader.readBED(bfile,useMAFencoding=True) iid = data['iid'] X = data['snps'] N = X.shape[1] print '%d variants loaded.'%N print '%d people loaded.'%X.shape[0] """ normalizing markers """ print 'Normalizing SNPs...' p_ref = X.mean(axis=0)/2. X -= 2*p_ref with warnings.catch_warnings(): warnings.simplefilter("ignore") X /= sp.sqrt(2*p_ref*(1-p_ref)) hasNan = sp.any(sp.isnan(X),axis=0) print '%d SNPs have a nan entry. Exluding them for computing the covariance matrix.'%hasNan.sum() """ computing covariance matrix """ print 'Computing relationship matrix...' K = sp.dot(X[:,~hasNan],X[:,~hasNan].T) K/= 1.*N print 'Relationship matrix calculation complete' print 'Relationship matrix written to %s.cov.'%cfile print 'IDs written to %s.cov.id.'%cfile """ saving to output """ np.savetxt(cfile + '.cov', K, delimiter='\t',fmt='%.6f') np.savetxt(cfile + '.cov.id', iid, delimiter=' ',fmt='%s')
def computePCsPython(out_dir, k, bfile, ffile): """ reading in """ RV = plink_reader.readBED(bfile, useMAFencoding=True) X = RV['snps'] """ normalizing markers """ print 'Normalizing SNPs...' p_ref = X.mean(axis=0) / 2. X -= 2 * p_ref with warnings.catch_warnings(): warnings.simplefilter("ignore") X /= sp.sqrt(2 * p_ref * (1 - p_ref)) hasNan = sp.any(sp.isnan(X), axis=0) print '%d SNPs have a nan entry. Exluding them for computing the covariance matrix.' % hasNan.sum( ) X = X[:, ~hasNan] """ computing prinicipal components """ U, S, Vt = ssl.svds(X, k=k) U -= U.mean(0) U /= U.std(0) U = U[:, ::-1] """ saving to output """ np.savetxt(ffile, U, delimiter='\t', fmt='%.6f')