def build_kernel_blocked(snpreader, snp_idx=None, blocksize=10000, alt_snpreader=None, allowlowrank=False):
    """
    Build an (unnormalized) kernel by loading SNPs block by block.

    Each block is read from the reader, passed through ``up.standardize`` and
    its product ``snps.dot(snps.T)`` is added to an N x N accumulator, so the
    full SNP matrix never has to be held at once. The result is NOT
    normalized; callers divide by the SNP count themselves.

    Parameters
    ----------
    snpreader     : reader used when ``alt_snpreader`` is None
    snp_idx       : optional index into ``alt_snpreader.rs`` selecting the SNPs
                    to use (default None: all SNPs, read by position range)
    blocksize     : number of SNPs requested per read
    alt_snpreader : reader to use instead of ``snpreader`` (default None)
    allowlowrank  : if False, refuse to run when the reader holds fewer SNPs
                    than individuals

    Returns
    -------
    K : [N x N] array, the sum over all blocks of ``snps.dot(snps.T)``

    Raises
    ------
    Exception : if ``allowlowrank`` is False and ``alt_snpreader.snp_count < N``
    """
    if alt_snpreader is None:
        alt_snpreader = snpreader

    # N is the number of individuals actually used by the reader
    if hasattr(alt_snpreader, "ind_used") and alt_snpreader.ind_used is not None:
        N = len(alt_snpreader.ind_used)
    else:
        N = len(alt_snpreader.original_iids)

    t0 = time.time()
    K = sp.zeros([N, N])
    num_snps = alt_snpreader.snp_count

    # Identity test on purpose: "snp_idx != None" is evaluated element-wise
    # for NumPy index arrays and cannot be used as an if-condition.
    use_all_snps = snp_idx is None
    if use_all_snps:
        current_size = num_snps
        logging.info("constructing K from all %i SNPs (for %i individuals)" % (num_snps, N))
    else:
        snp_names = alt_snpreader.rs[snp_idx]
        current_size = len(snp_names)
        logging.info("reading %i SNPs in blocks of %i and adding up kernels" % (len(snp_idx), blocksize))

    ts = time.time()

    # NOTE(review): the guard looks at the reader's total SNP count, not at the
    # number of selected SNPs (current_size) -- confirm this is intended.
    if (not allowlowrank) and alt_snpreader.snp_count < N:
        raise Exception("need to adjust code to handle low rank")

    for start in range(0, current_size, blocksize):
        if use_all_snps:
            tmp_set = PositionRange(start, blocksize)
        else:
            tmp_set = SnpAndSetName('someset', snp_names[start:start + blocksize])

        snps = alt_snpreader.read(tmp_set)['snps']
        snps = up.standardize(snps)
        K += snps.dot(snps.T)

        # SNPs requested so far, clipped so the last partial block is not over-counted
        ct = min(start + blocksize, current_size)
        logging.info("read %s SNPs in %.2f seconds" % (ct, time.time() - ts))

    t1 = time.time()
    logging.info("%.2f seconds elapsed" % (t1 - t0))

    return K
def build_kernel_blocked(snpreader, snp_idx=None, blocksize=10000, alt_snpreader=None, allowlowrank=False):
    """
    Build an (unnormalized) kernel by accumulating it over blocks of SNPs.

    For every block of at most ``blocksize`` SNPs the genotypes are read,
    run through ``up.standardize`` and ``snps.dot(snps.T)`` is added to an
    N x N matrix. No normalization is applied to the returned matrix.

    Parameters
    ----------
    snpreader     : reader used when ``alt_snpreader`` is None
    snp_idx       : optional index into ``alt_snpreader.rs`` selecting the SNPs
                    to use (default None: all SNPs, read by position range)
    blocksize     : number of SNPs requested per read
    alt_snpreader : reader to use instead of ``snpreader`` (default None)
    allowlowrank  : if False, refuse to run when the reader holds fewer SNPs
                    than individuals

    Returns
    -------
    K : [N x N] array, the sum over all blocks of ``snps.dot(snps.T)``

    Raises
    ------
    Exception : if ``allowlowrank`` is False and ``alt_snpreader.snp_count < N``
    """
    reader = snpreader if alt_snpreader is None else alt_snpreader

    # number of individuals: the used subset if the reader defines one
    if hasattr(reader, "ind_used") and reader.ind_used is not None:
        N = len(reader.ind_used)
    else:
        N = len(reader.original_iids)

    t0 = time.time()
    K = sp.zeros([N, N])
    num_snps = reader.snp_count

    # "is not None" rather than "!= None": the latter compares element-wise
    # when snp_idx is a NumPy array and then fails as an if-condition.
    if snp_idx is not None:
        snp_names = reader.rs[snp_idx]
        current_size = len(snp_names)
        logging.info("reading %i SNPs in blocks of %i and adding up kernels" % (len(snp_idx), blocksize))
    else:
        snp_names = None
        current_size = num_snps
        logging.info("constructing K from all %i SNPs (for %i individuals)" % (num_snps, N))

    ts = time.time()

    # NOTE(review): this compares the reader's total SNP count with N, not the
    # number of SNPs selected through snp_idx -- confirm this is intended.
    if (not allowlowrank) and reader.snp_count < N:
        raise Exception("need to adjust code to handle low rank")

    for start in range(0, current_size, blocksize):
        stop = start + blocksize
        if snp_names is None:
            tmp_set = PositionRange(start, blocksize)
        else:
            tmp_set = SnpAndSetName('someset', snp_names[start:stop])

        snps = up.standardize(reader.read(tmp_set)['snps'])
        K += snps.dot(snps.T)

        # report SNPs requested so far; clip so the final short block is not over-counted
        logging.info("read %s SNPs in %.2f seconds" % (min(stop, current_size), time.time() - ts))

    t1 = time.time()
    logging.info("%.2f seconds elapsed" % (t1 - t0))

    return K
def set_snps0(SNPs0, sample_size, i_exclude=None, forcefullrank=False, blocksize=10000):
    '''
    Return the normalized null-model SNP matrix (low rank) or kernel (full rank).

    In full rank case, loads up the SNPs in blocks, and construct the kernel.
    In low rank case, loads up all SNPs in to memory.
    --------------------------------------------------------------------------
    Input:
    SNPs0           : dictionary describing the null-model SNPs, or None.
                      Keys read here: "K" / "data" (cached results, optional),
                      "snp_set", "reader" (needed when nothing is cached) and
                      "num_snps". The computed kernel or data is stored back
                      into this dictionary under "K" / "data".
    sample_size     : the kernel is built when len(SNPs0["snp_set"]) exceeds
                      this number
    i_exclude       : boolean array marking SNPs to leave out (default None)
    forcefullrank   : build the kernel regardless of the number of SNPs
    blocksize       : block size handed to psd.build_kernel_blocked
    --------------------------------------------------------------------------
    Output:
    (G0, K0)        : (None, None) if SNPs0 is None; otherwise exactly one of
                      the two is set:
                      G0 = SNPs / sqrt(num_snps)   (low rank)
                      K0 = kernel / num_snps       (full rank)
    --------------------------------------------------------------------------
    '''
    if SNPs0 is None:
        return None, None

    # "in" instead of dict.has_key(): has_key no longer exists in Python 3
    if "K" in SNPs0:
        K0 = SNPs0["K"]
        G0 = None
    elif "data" in SNPs0:
        K0 = None
        G0 = SNPs0["data"]["snps"]
    else:
        if len(SNPs0["snp_set"]) > sample_size or forcefullrank:
            # full rank: accumulate the kernel block by block and cache it
            SNPs0["K"] = psd.build_kernel_blocked(snpreader=SNPs0["reader"],
                                                  snp_idx=SNPs0["snp_set"].to_index,
                                                  blocksize=blocksize,
                                                  allowlowrank=forcefullrank)
            K0 = SNPs0["K"]
            G0 = None
        else:
            # low rank: read all SNPs into memory and cache them standardized
            K0 = None
            SNPs0["data"] = SNPs0["snp_set"].read()
            SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
            G0 = SNPs0["data"]["snps"]

    # lrt_up should never do exclusion, because set_snps0 should only get called
    # once, in run_once, without exclusion. So this is only for score test and lrt.
    if i_exclude is not None:
        if K0 is not None:
            # Also note in the full rank case with exclusion, for score, one could in
            # principle use low rank updates to make it faster, when the number of
            # excluded SNPs is small: it would be cubic in
            # num_excluded * num_inner * num_outer iterations, versus now where it is
            # cubic in N in the outer loop only once.
            K_up = psd.build_kernel_blocked(snpreader=SNPs0["reader"],
                                            snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude],
                                            blocksize=blocksize,
                                            allowlowrank=forcefullrank)
            # new array: the cached SNPs0["K"] is left untouched
            K0 = K0 - K_up
        elif G0 is not None:
            G0 = G0[:, ~i_exclude]
        num_snps = SNPs0["num_snps"] - i_exclude.sum()
    else:
        num_snps = SNPs0["num_snps"]

    # normalize by the number of SNPs that remain
    if K0 is not None:
        K0 = K0 / num_snps
    elif G0 is not None:
        # dividing G by sqrt(num_snps) corresponds to dividing K = G G^T by num_snps
        G0 = G0 / np.sqrt(num_snps)

    return G0, K0
def set_snps0(SNPs0, sample_size, i_exclude=None, forcefullrank=False, blocksize=10000):
    '''
    Produce the normalized null-model genotypes (low rank) or kernel (full rank).

    Full rank: the kernel is accumulated from blocks of SNPs.
    Low rank: all SNPs are read into memory and standardized.
    Whatever is computed is stored back into SNPs0 ("K" or "data"), and an
    existing "K" / "data" entry is reused instead of being recomputed.

    Returns the pair (G0, K0). Both are None when SNPs0 is None; otherwise one
    of them is set: G0 divided by sqrt(num_snps), or K0 divided by num_snps,
    where num_snps is SNPs0["num_snps"] minus the number of excluded SNPs.
    '''
    if SNPs0 is None:
        return None, None

    kernel = None
    genotypes = None

    if "K" in SNPs0:
        kernel = SNPs0["K"]
    elif "data" in SNPs0:
        genotypes = SNPs0["data"]["snps"]
    elif len(SNPs0["snp_set"]) > sample_size or forcefullrank:
        # full rank
        kernel = psd.build_kernel_blocked(
            snpreader=SNPs0["reader"],
            snp_idx=SNPs0["snp_set"].to_index,
            blocksize=blocksize,
            allowlowrank=forcefullrank,
        )
        SNPs0["K"] = kernel
    else:
        # low rank
        loaded = SNPs0["snp_set"].read()
        SNPs0["data"] = loaded
        loaded["snps"] = up.standardize(loaded["snps"])
        genotypes = loaded["snps"]

    # Exclusion is only meant for the score test and lrt; lrt_up calls this
    # once (from run_once) without exclusion.
    if i_exclude is None:
        num_snps = SNPs0["num_snps"]
    else:
        if kernel is not None:
            # In principle low rank updates could make this cheaper when only a
            # few SNPs are excluded; here the excluded part is rebuilt and
            # subtracted.
            kernel = kernel - psd.build_kernel_blocked(
                snpreader=SNPs0["reader"],
                snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude],
                blocksize=blocksize,
                allowlowrank=forcefullrank,
            )
        elif genotypes is not None:
            genotypes = genotypes[:, ~i_exclude]
        num_snps = SNPs0["num_snps"] - i_exclude.sum()

    # normalize
    if kernel is not None:
        kernel = kernel / num_snps
    elif genotypes is not None:
        genotypes = genotypes / np.sqrt(num_snps)

    return genotypes, kernel
def TESTBEFOREUSING_score_testfilesFromDir(phenofile, base0, pedfilesalt, covarfile=None, outfile=None,
                                           ipheno=0, mindist=-1.0, idist=2, filetype='PED'):
    '''
    given a list of basefilenames that define alternative models and a basefilename of the null model
    test all alternative models
    --------------------------------------------------------------------------
    Input:
    phenofile       : filename of the phenotype file
    base0           : basefilename of the .ped and .map files containing the
                      null-model SNPs
    pedfilesalt     : [Nalt] list of basefilenames of the Nalt .ped and .map files
                      containing the alternative-model SNPs
    covarfile       : filename of the covariates file
                      (default None, in this case only a bias is used)
    outfile         : filename of the output file. Writing is NOT implemented:
                      if a name is given, only a reminder is printed
                      (default None)
    ipheno          : 0-based index of the phenotype to be analyzed (default 0)
    mindist         : minimum distance for SNPs to be included in null model
                      (default -1.0: no exclusion in this case)
    idist           : index in pos array that the exclusion is based on.
                      (1=genetic distance, 2=basepair distance)
    filetype        : plink filetype of the null-model input (default 'PED')
                        'PED'   : PED file format
                        'BED'   : BED file format
                      The alternative-model files are always read with readPED.
    --------------------------------------------------------------------------
    Output dictionary:
    'filenames'           : [Nalt] array of basefilenames
    'squaredform'         : [Nalt] first value returned by scoreNoK
    'expectationsqform'   : [Nalt] second value returned by scoreNoK
    'varsqform'           : [Nalt] third value returned by scoreNoK
    'P'                   : [Nalt] P-values from pv_mom on the three values above
    'squaredform2K'       : [Nalt] first value returned by scoretest.score
                            (null model built from the null-model SNPs)
    'expectationsqform2K' : [Nalt] second value returned by scoretest.score
    'varsqform2K'         : [Nalt] third value returned by scoretest.score
    'P2K'                 : [Nalt] P-values from pv_mom on the 2K values
    'nexclude'            : [Nalt] array of numbers of SNPs excluded
    --------------------------------------------------------------------------
    Raises ValueError if filetype is neither 'PED' nor 'BED'.
    '''
    pheno = pstpheno.loadPhen(filename=phenofile, missing='-9', pheno=None)

    # design matrix: a column of ones (bias), followed by the covariates if given
    if covarfile is None:
        X = SP.ones((pheno['vals'].shape[0], 1))
    else:
        covar = pstpheno.loadPhen(filename=covarfile, missing='-9', pheno=None)
        X = SP.hstack((SP.ones((pheno['vals'].shape[0], 1)), covar['vals']))

    if filetype == 'PED':
        SNPs0 = plink.readPED(basefilename=base0, delimiter=' ', missing='0', standardize=True, pheno=None)
    elif filetype == 'BED':
        SNPs0 = plink.readBED(basefilename=base0)
        SNPs0['snps'] = util.standardize(SNPs0['snps'])
    else:
        # fail with a clear message instead of an unbound-variable error further down
        raise ValueError("unsupported filetype %r (expected 'PED' or 'BED')" % (filetype,))

    y = pheno['vals'][:, ipheno]
    G0 = SNPs0['snps'] / SP.sqrt(SNPs0['snps'].shape[1])

    # build the null model
    test2K = scoretest(Y=y[:, SP.newaxis], X=X, K=None, G=G0)

    squaredform = SP.zeros(len(pedfilesalt))
    expectationsqform = SP.zeros(len(pedfilesalt))
    varsqform = SP.zeros(len(pedfilesalt))
    squaredform2K = SP.zeros(len(pedfilesalt))
    expectationsqform2K = SP.zeros(len(pedfilesalt))
    varsqform2K = SP.zeros(len(pedfilesalt))
    nexclude = SP.zeros(len(pedfilesalt))
    Pv = SP.zeros(len(pedfilesalt))
    Pv2K = SP.zeros(len(pedfilesalt))

    for i, base1 in enumerate(pedfilesalt):  # iterate over all ped files
        SNPs1 = plink.readPED(basefilename=base1, delimiter=' ', missing='0', standardize=True, pheno=None)

        if mindist >= 0:
            i_exclude = excludeinds(SNPs0['pos'], SNPs1['pos'], mindist=mindist, idist=idist)
            nexclude[i] = i_exclude.sum()
        else:
            nexclude[i] = 0

        G1 = SNPs1['snps'] / SP.sqrt(SNPs1['snps'].shape[1])

        if nexclude[i] > 0:
            # rebuild the null model without the excluded null-model SNPs
            test2Ke = scoretest(Y=y[:, SP.newaxis], X=X, K=None, G=G0[:, ~i_exclude])
            squaredform2K[i], expectationsqform2K[i], varsqform2K[i] = test2Ke.score(G=G1)
        else:
            squaredform2K[i], expectationsqform2K[i], varsqform2K[i] = test2K.score(G=G1)

        squaredform[i], expectationsqform[i], varsqform[i] = scoreNoK(y, X=X, G=G1, sigma2=None)

        # perform moment matching; only the P-value of each result is kept
        Pv2K[i], _, _ = pv_mom(squaredform2K[i], expectationsqform2K[i], varsqform2K[i])
        Pv[i], _, _ = pv_mom(squaredform[i], expectationsqform[i], varsqform[i])

    ret = {
        'filenames': SP.array(pedfilesalt, dtype='str'),
        'squaredform': squaredform,
        'expectationsqform': expectationsqform,
        'varsqform': varsqform,
        'P': Pv,
        'squaredform2K': squaredform2K,
        'expectationsqform2K': expectationsqform2K,
        'varsqform2K': varsqform2K,
        'nexclude': nexclude,
        'P2K': Pv2K
    }

    if outfile is not None:
        # TODO: writing the results to outfile is not implemented yet
        # (parenthesized so the line is valid on both Python 2 and Python 3)
        print('implement me!')
        #header = SP.array(['PV_5050','neg_log_lik_0','neg_loglik_alt','n_snps_excluded','filename_alt'])
        #data = SP.concatenate(())

    return ret
def TESTBEFOREUSING_score_testfilesFromDir(phenofile, base0, pedfilesalt, covarfile=None, outfile=None,
                                           ipheno=0, mindist=-1.0, idist=2, filetype='PED'):
    '''
    given a list of basefilenames that define alternative models and a basefilename of the null model
    test all alternative models
    --------------------------------------------------------------------------
    Input:
    phenofile       : filename of the phenotype file
    base0           : basefilename of the .ped and .map files containing the
                      null-model SNPs
    pedfilesalt     : [Nalt] list of basefilenames of the Nalt .ped and .map files
                      containing the alternative-model SNPs
    covarfile       : filename of the covariates file
                      (default None, in this case only a bias is used)
    outfile         : filename of the output file. Writing is NOT implemented:
                      if a name is given, only a reminder is printed
                      (default None)
    ipheno          : 0-based index of the phenotype to be analyzed (default 0)
    mindist         : minimum distance for SNPs to be included in null model
                      (default -1.0: no exclusion in this case)
    idist           : index in pos array that the exclusion is based on.
                      (1=genetic distance, 2=basepair distance)
    filetype        : plink filetype of the null-model input (default 'PED')
                        'PED'   : PED file format
                        'BED'   : BED file format
                      The alternative-model files are always read with readPED.
    --------------------------------------------------------------------------
    Output dictionary:
    'filenames'           : [Nalt] array of basefilenames
    'squaredform'         : [Nalt] first value returned by scoreNoK
    'expectationsqform'   : [Nalt] second value returned by scoreNoK
    'varsqform'           : [Nalt] third value returned by scoreNoK
    'P'                   : [Nalt] P-values from pv_mom on the three values above
    'squaredform2K'       : [Nalt] first value returned by scoretest.score
                            (null model built from the null-model SNPs)
    'expectationsqform2K' : [Nalt] second value returned by scoretest.score
    'varsqform2K'         : [Nalt] third value returned by scoretest.score
    'P2K'                 : [Nalt] P-values from pv_mom on the 2K values
    'nexclude'            : [Nalt] array of numbers of SNPs excluded
    --------------------------------------------------------------------------
    Raises ValueError if filetype is neither 'PED' nor 'BED'.
    '''
    pheno = plink.loadPhen(filename=phenofile, missing='-9', pheno=None)

    # design matrix: bias column only, or bias column plus covariates
    if covarfile is None:
        X = SP.ones((pheno['vals'].shape[0], 1))
    else:
        covar = plink.loadPhen(filename=covarfile, missing='-9', pheno=None)
        X = SP.hstack((SP.ones((pheno['vals'].shape[0], 1)), covar['vals']))

    if filetype == 'PED':
        SNPs0 = plink.readPED(basefilename=base0, delimiter=' ', missing='0', standardize=True, pheno=None)
    elif filetype == 'BED':
        SNPs0 = plink.readBED(basefilename=base0)
        SNPs0['snps'] = util.standardize(SNPs0['snps'])
    else:
        # without this branch SNPs0 would be unbound below
        raise ValueError("unsupported filetype %r (expected 'PED' or 'BED')" % (filetype,))

    y = pheno['vals'][:, ipheno]
    Y = y[:, SP.newaxis]
    G0 = SNPs0['snps'] / SP.sqrt(SNPs0['snps'].shape[1])

    # build the null model
    test2K = scoretest(Y=Y, X=X, K=None, G=G0)

    n_alt = len(pedfilesalt)
    squaredform = SP.zeros(n_alt)
    expectationsqform = SP.zeros(n_alt)
    varsqform = SP.zeros(n_alt)
    squaredform2K = SP.zeros(n_alt)
    expectationsqform2K = SP.zeros(n_alt)
    varsqform2K = SP.zeros(n_alt)
    nexclude = SP.zeros(n_alt)
    Pv = SP.zeros(n_alt)
    Pv2K = SP.zeros(n_alt)

    for i, base1 in enumerate(pedfilesalt):  # iterate over all ped files
        SNPs1 = plink.readPED(basefilename=base1, delimiter=' ', missing='0', standardize=True, pheno=None)

        if mindist >= 0:
            i_exclude = excludeinds(SNPs0['pos'], SNPs1['pos'], mindist=mindist, idist=idist)
            nexclude[i] = i_exclude.sum()
        else:
            nexclude[i] = 0

        G1 = SNPs1['snps'] / SP.sqrt(SNPs1['snps'].shape[1])

        # with exclusions the null model is rebuilt from the remaining null-model SNPs
        if nexclude[i] > 0:
            null_test = scoretest(Y=Y, X=X, K=None, G=G0[:, ~i_exclude])
        else:
            null_test = test2K
        squaredform2K[i], expectationsqform2K[i], varsqform2K[i] = null_test.score(G=G1)

        squaredform[i], expectationsqform[i], varsqform[i] = scoreNoK(y, X=X, G=G1, sigma2=None)

        # perform moment matching; only the P-value of each result is kept
        Pv2K[i], _, _ = pv_mom(squaredform2K[i], expectationsqform2K[i], varsqform2K[i])
        Pv[i], _, _ = pv_mom(squaredform[i], expectationsqform[i], varsqform[i])

    ret = {
        'filenames': SP.array(pedfilesalt, dtype='str'),
        'squaredform': squaredform,
        'expectationsqform': expectationsqform,
        'varsqform': varsqform,
        'P': Pv,
        'squaredform2K': squaredform2K,
        'expectationsqform2K': expectationsqform2K,
        'varsqform2K': varsqform2K,
        'nexclude': nexclude,
        'P2K': Pv2K
    }

    if outfile is not None:
        # TODO: writing the results to outfile is not implemented yet
        # (parenthesized so the line is valid on both Python 2 and Python 3)
        print('implement me!')
        #header = SP.array(['PV_5050','neg_log_lik_0','neg_loglik_alt','n_snps_excluded','filename_alt'])
        #data = SP.concatenate(())

    return ret