def common_effect(ouF):
    S = []
    ALL = []
    ouFile = open(ouF, 'w')
    ###ouFile2 = open(ouF.split('-Sig')[0] + '-ALL', 'w')
    gs = G[0]
    for gene in gs:
        phenotype_names = [gene + ':RNA', gene + ':ProteinLight']
        phenotype_query = "(phenotype_ID in %s)" % str(phenotype_names)
        data_subsample = dataset.subsample_phenotypes(phenotype_query=phenotype_query, intersection=True)
        snps = data_subsample.getGenotypes(impute_missing=True)
        phenotypes, sample_idx = data_subsample.getPhenotypes(phenotype_query=phenotype_query, intersection=True)
        sample_relatedness = data_subsample.getCovariance()
        phenotypes_vals_ranks = preprocess.rankStandardizeNormal(phenotypes.values)
        N, P = phenotypes.shape
        ###imax = 735  ### II:476596
        #covars_conditional = np.concatenate((geno[sample_idx, imax:imax+1], np.ones((phenotypes_vals_ranks.values.shape[0], 1))), 1)
        ###covars_conditional = np.concatenate((geno[sample_idx, imax:imax+1], np.ones((N, 1))), 1)
        covs = None               # covariates
        Acovs = None              # the design matrix for the covariates
        #Asnps = sp.eye(P)        # the design matrix for the SNPs
        Asnps = sp.ones((1, P))   # the design matrix for the SNPs (one effect shared across traits)
        K1r = sample_relatedness  # the first sample-sample covariance matrix (non-noise)
        K2r = sp.eye(N)           # the second sample-sample covariance matrix (noise)
        K1c = None                # the first phenotype-phenotype covariance matrix (non-noise)
        K2c = None                # the second phenotype-phenotype covariance matrix (noise)
        covar_type = 'freeform'   # the type of the trait/trait covariance to be estimated
        searchDelta = False       # specify if delta should be optimized for each SNP
        test = "lrt"
        lmm, pv = qtl.test_lmm_kronecker(snps, phenotypes_vals_ranks, covs=covs, Acovs=Acovs,
                                         Asnps=Asnps, K1r=K1r, trait_covar_type=covar_type)
        pvalues = pd.DataFrame(data=pv.T, index=data_subsample.geno_ID, columns=[gene])
        flag = 0
        qvalues = fdr.qvalues(pv[0])
        for n in range(pvalues.shape[0]):
            k = position.ix[n]['chrom'] + ':' + str(position.ix[n]['pos'])
            ###ALL.append([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']), str(pvalues.ix[n][0]), gene])
            if qvalues[n] < FDR:
                #flag = 1
                #print(pvalues)
                ouFile.write('\t'.join([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']),
                                        str(pvalues.ix[n][0]), str(qvalues[n]), gene]) + '\n')
                #S.append([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']),
                #          str(pvalues.ix[n][0]), str(qvalues[n]), gene])
        ###if flag:
        ###    manhattonPlot(gene, pvalues, ouF)
    #S.sort(cmp=lambda x, y: cmp(float(x[4]), float(y[4])))
    #for item in S:
    #    ouFile.write('\t'.join(item) + '\n')
    ###ALL.sort(cmp=lambda x, y: cmp(float(x[3]), float(y[3])))
    ###for item in ALL:
    ###    ouFile2.write('\t'.join(item) + '\n')
    ###ouFile2.close()
    ouFile.close()
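# --- Illustration (not part of the original script) ----------------------------------------
# A minimal, self-contained sketch of what the Asnps = ones((1, P)) design in common_effect()
# encodes: with P = 2 traits (RNA, ProteinLight), the multi-trait LMM tests a single SNP effect
# shared by both traits. Under one vectorisation convention, the expanded fixed-effect design is
# the Kronecker product of the trait design and the SNP column. Only numpy is used; the function
# name and data below are hypothetical.
def _common_effect_design_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    N, P = 5, 2                          # 5 samples, 2 traits
    snp = rng.binomial(2, 0.5, (N, 1))   # one synthetic SNP, coded 0/1/2
    Asnps_common = np.ones((1, P))       # one effect shared across both traits
    W = np.kron(Asnps_common.T, snp)     # expanded design on the stacked [N*P] phenotype
    return W.shape                       # (10, 1): a single common-effect coefficient

#print _common_effect_design_sketch()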
def significant(pvalues_lm, ouF):
    #SigPhe = set()
    ouFile = open(ouF, 'w')
    for i in range(pvalues_lm.shape[1]):
        n = -1
        qvalues = fdr.qvalues(pvalues_lm.ix[:, i])
        for x in qvalues:
            n += 1
            if x < FDR:
                k = pos['chrom'][n] + ':' + str(pos['pos'][n])
                ouFile.write("%s\t%s\t%s\t%s\t%s\t%s" % (M[k], pos['chrom'][n], pos['pos'][n],
                                                         pvalues_lm.ix[:, i][n], x,
                                                         pvalues_lm.columns[i]) + '\n')
                ###SigPhe.add(pvalues_lm.columns[i])
    ouFile.close()
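# --- Illustration (not part of the original script) ----------------------------------------
# significant() keeps rows whose q-value from fdr.qvalues() falls below the global FDR cutoff.
# The sketch below is a plain Benjamini-Hochberg adjustment in numpy, shown only to make the
# p-value -> q-value step concrete; it is NOT the limix fdr implementation (which may use a
# different estimator, e.g. Storey-type q-values), and the function name is hypothetical.
def _bh_qvalues_sketch(pv):
    import numpy as np
    pv = np.asarray(pv, dtype=float)
    m = pv.size
    order = np.argsort(pv)
    ranked = pv[order] * m / (np.arange(m) + 1.0)        # p * m / rank
    ranked = np.minimum.accumulate(ranked[::-1])[::-1]   # enforce monotonicity from the largest rank down
    qv = np.empty(m)
    qv[order] = np.minimum(ranked, 1.0)
    return qv

#_bh_qvalues_sketch([0.001, 0.04, 0.03, 0.8])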
X_gene = np.random.permutation(X_gene)

info_gene = info_df.iloc[snp_idx]
for item in info_gene.columns:
    out_dict[item] = info_gene[item].values
assoc_gene = gene.repeat(snp_idx.sum())
out_dict['assoc_gene'] = assoc_gene

# Run the LMM analysis
print " .. single trait analysis"
if fit_design:
    lmm = QTL.test_lmm(X_gene, Y_gene, K=K, covs=design)
else:
    lmm = QTL.test_lmm(X_gene, Y_gene, K=K)
pv = lmm.getPv()
pv[0][np.isnan(pv[0])] = 1.0  # set any NaN p-values to 1
out_dict['pv'] = pv[0]
out_dict['qv'] = FDR.qvalues(pv)[0]
out_dict['beta'] = lmm.getBetaSNP()[0]
lambda_val = getLambda(pv)
lambda_val = lambda_val.repeat(len(out_dict['pv']))
out_dict['lambda_val'] = lambda_val

out_df = pd.DataFrame(out_dict, index=out_dict['gdid'])
## convert full stops in gene name to underscore
gene = gene.replace(".", "_")
## append results for chunk to gene's results df in the HDF5 file
print " ....appending results..."
fout.put(gene, out_df, table=True, data_columns=True)
if all_results is None:
    all_results = out_df
else:
    all_results = all_results.append(out_df, ignore_index=True)
print " ....appending done."
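# --- Illustration (not part of the original script) ----------------------------------------
# The lambda stored above is a per-gene inflation statistic. Assuming getLambda() computes the
# standard genomic-control lambda (median of the chi-square statistics implied by the p-values,
# divided by the expected median of a 1-d.o.f. chi-square, ~0.4549), a minimal numpy/scipy
# sketch would be (the function name is hypothetical):
def _genomic_control_lambda_sketch(pv):
    import numpy as np
    from scipy import stats as st
    chi2 = st.chi2.isf(np.asarray(pv, dtype=float), df=1)   # p-values -> chi-square statistics
    return np.median(chi2) / st.chi2.ppf(0.5, df=1)         # expected median ~0.4549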
print ' .. Importing data'
try:
    Xc, info = data.getGenotypes(gene, return_info=True)
except:
    print 'Error: no SNPs found in cis'
    continue
Y = data.getPhenotypes(gene, peer=opt.peer, gauss=True)

o = gene_group.create_group('snp_info')
smartDumpDictHdf5(info, o)

if opt.perm:
    if opt.seed is not None:
        sp.random.seed(opt.seed)
    idxs = sp.random.permutation(Xc.shape[0])
    Xc = Xc[idxs, :]

if 1:
    print " .. single trait analysis"
    lmm = QTL.test_lmm(Xc, Y, K=K)
    pv = lmm.getPv()
    RV = {}
    RV['pv'] = pv
    RV['qv'] = FDR.qvalues(pv)
    RV['beta'] = lmm.getBetaSNP()
    RV['lambda'] = getLambda(pv)
    o = gene_group.create_group('st')
    smartDumpDictHdf5(RV, o)

fout.close()
            temp['pos'] = fgene['snp_info']['pos'][:][idx]
            temp['rs'] = fgene['snp_info']['rs'][:][idx]
        except:
            print geneID, 'failed'
            continue

        # append the temp table into the big table
        for key in temp.keys():
            smartAppend(table, key, temp[key])

    f.close()

    for key in table.keys():
        table[key] = sp.array(table[key])

    print '.. correct for multiple testing'
    table['pv_bonf'][table['pv_bonf'] > 1] = 1.
    table['qv_all'] = FDR.qvalues(table['pv_bonf'])
    print 'no eQTLs at FDR 0.10:', (table['qv_all'] < 0.10).sum()
    print 'no genes:', table['qv_all'].shape[0]

    fout = h5py.File(out_file, 'w')
    smartDumpDictHdf5(table, fout)
    fout.close()
else:
    f = h5py.File(out_file, 'r')
    R = {}
    for key in f.keys():
        R[key] = f[key][:]
    f.close()
def specific_effect(ouF):
    S = []
    ALL = []
    ouFile = open(ouF, 'w')
    ouFile.write('\t'.join(['Marker', 'Chr', 'Position', 'Specific_pvalue', 'Common_pvalue', 'Any_pvalue',
                            'Specific_qvalue', 'Common_qvalue', 'Any_qvalue', 'Gene']) + '\n')
    ###ouFile2 = open(ouF.split('-Sig')[0] + '-ALL', 'w')
    gs = G[0]
    for gene in gs:
        phenotype_names = [gene + ':RNA', gene + ':ProteinLight']
        phenotype_query = "(phenotype_ID in %s)" % str(phenotype_names)
        data_subsample = dataset.subsample_phenotypes(phenotype_query=phenotype_query, intersection=True)
        snps = data_subsample.getGenotypes(impute_missing=True)
        phenotypes, sample_idx = data_subsample.getPhenotypes(phenotype_query=phenotype_query, intersection=True)
        sample_relatedness = data_subsample.getCovariance()
        phenotypes_vals_ranks = preprocess.rankStandardizeNormal(phenotypes.values)
        N, P = phenotypes.shape
        imax = 735  ### II:476596
        #covars_conditional = np.concatenate((geno[sample_idx, imax:imax+1], np.ones((phenotypes_vals_ranks.values.shape[0], 1))), 1)
        covars_conditional = np.concatenate((geno[sample_idx, imax:imax+1], np.ones((N, 1))), 1)
        covs = None                # covariates
        Acovs = None               # the design matrix for the covariates
        Asnps0 = sp.ones((1, P))   # the null SNP design matrix (common effect only)
        Asnps1 = sp.zeros((2, P))  # the alternative SNP design matrix (common + trait-specific effect)
        Asnps1[0, :] = 1.0
        Asnps1[1, 0] = 1.0
        K1r = sample_relatedness   # the first sample-sample covariance matrix (non-noise)
        K2r = sp.eye(N)            # the second sample-sample covariance matrix (noise)
        K1c = None                 # the first phenotype-phenotype covariance matrix (non-noise)
        K2c = None                 # the second phenotype-phenotype covariance matrix (noise)
        covar_type = 'freeform'    # the type of the trait/trait covariance to be estimated
        searchDelta = False        # specify if delta should be optimized for each SNP
        test = "lrt"
        #lmm, pv = qtl.test_lmm_kronecker(snps, phenotypes_vals_ranks, covs=covs, Acovs=Acovs,
        #                                 Asnps=Asnps, K1r=K1r, trait_covar_type=covar_type)
        pv = qtl.test_interaction_lmm_kronecker(snps=snps, phenos=phenotypes_vals_ranks,
                                                covs=covs, Acovs=Acovs, Asnps1=Asnps1, Asnps0=Asnps0,
                                                K1r=K1r, K2r=K2r, K1c=K1c, K2c=K2c,
                                                trait_covar_type=covar_type, searchDelta=searchDelta)
        #pvalues = pd.DataFrame(data=pv.T, index=data_subsample.geno_ID, columns=[gene])
        pvalues = pd.DataFrame(data=sp.concatenate(pv).T, index=data_subsample.geno_ID,
                               columns=["specific", "null_common", "alternative_any"])
        flag = 0
        qvalues1 = fdr.qvalues(pv[0][0])
        qvalues2 = fdr.qvalues(pv[1][0])
        qvalues3 = fdr.qvalues(pv[2][0])
        for n in range(pvalues.shape[0]):
            k = position.ix[n]['chrom'] + ':' + str(position.ix[n]['pos'])
            ###ALL.append([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']), str(pvalues.ix[n][0]), gene])
            if qvalues1[n] < FDR:
                flag = 1
                #print(pvalues)
                #ouFile.write('\t'.join([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']), str(pvalues.ix[n][0]), gene]) + '\n')
                S.append([M[k], position.ix[n]['chrom'], str(position.ix[n]['pos']),
                          str(pvalues.ix[n][0]), str(pvalues.ix[n][1]), str(pvalues.ix[n][2]),
                          str(qvalues1[n]), str(qvalues2[n]), str(qvalues3[n]), gene])
        if flag:
            manhattonPlotSpecific(gene, pvalues, ouF)
    S.sort(cmp=lambda x, y: cmp(float(x[6]), float(y[6])))
    for item in S:
        ouFile.write('\t'.join(item) + '\n')
    ###ALL.sort(cmp=lambda x, y: cmp(float(x[3]), float(y[3])))
    ###for item in ALL:
    ###    ouFile2.write('\t'.join(item) + '\n')
    ###ouFile2.close()
    ouFile.close()
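# --- Illustration (not part of the original script) ----------------------------------------
# The interaction test in specific_effect() compares two nested SNP designs for the P = 2
# traits (RNA, ProteinLight): Asnps0 allows only a shared effect, Asnps1 additionally allows an
# effect specific to the first trait. A minimal numpy sketch of the two matrices and the
# per-trait effects they imply (function name and beta values are hypothetical):
def _interaction_design_sketch():
    import numpy as np
    P = 2
    Asnps0 = np.ones((1, P))        # null design: beta_common applied to both traits
    Asnps1 = np.zeros((2, P))       # alternative design: common + trait-0-specific effect
    Asnps1[0, :] = 1.0
    Asnps1[1, 0] = 1.0
    # per-trait effect under the alternative: trait 0 gets beta_common + beta_specific,
    # trait 1 gets beta_common only
    beta = np.array([0.3, 0.2])     # hypothetical [beta_common, beta_specific]
    return beta.dot(Asnps1)         # array([0.5, 0.3])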
# open out tsv file
out = open(outfile, 'w')

if file.shape[0] == 1:
    sys.stdout.write('WARNING: file {0} is empty\n'.format(i))
else:
    pval = file['pv'].astype(float)
    # l_adj_pval = file['qv'].astype(float)
    # beta = file['beta'].astype(float)
    # lambda_pval = file['lambda'].astype(float)
    # lambda_perm = file['lambda_perm'].astype(float)
    # l_emp_pval = file['pv_perm'].astype(float)

    # store the number of p-values tested
    shape_pv = pval.shape[0]

    # calculate qv_all, pv_perm_all
    if window != 0 and n_perm <= 1:  # cis and no empirical p-values
        g_adj_pval = FDR.qvalues(file['qv'].astype(float), m=n_genes)
        g_emp_adj_pval = (sp.empty((shape_pv,))).astype(str)
        g_emp_adj_pval[:] = 'NA'  # fill an empty array with NA values
    elif window != 0 and n_perm > 1:  # cis and empirical p-values
        g_adj_pval = FDR.qvalues(file['qv'].astype(float), m=n_genes)
        g_emp_adj_pval = FDR.qvalues(file['pv_perm'].astype(float), m=n_genes)
    elif window == 0 and n_perm <= 1:  # trans and no empirical p-values
        pval = file['pv'].astype(float)
        # compute Bonferroni-adjusted p-values across the nominal p-values
        g_adj_pval = sp.array(stats.p_adjust(FloatVector(pval.tolist()),
                                             method='bonferroni', n=float(n_tests)))
        g_emp_adj_pval = (sp.empty((shape_pv,))).astype(str)  # fill an empty array with NA values
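# --- Note (not part of the original script) -------------------------------------------------
# The trans branch above adjusts p-values with R's p.adjust(method='bonferroni') through rpy2.
# The same correction in plain numpy is simply min(p * n_tests, 1); a sketch, kept as a comment
# so it does not interfere with the elif chain above (n_tests is whatever the pipeline passes):
#
#     import numpy as np
#     g_adj_pval_np = np.minimum(pval * float(n_tests), 1.0)
#
# The rpy2 call is kept in the pipeline; the numpy form is shown only to make the correction explicit.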
        # one line per donor
        Xu = np.array(X, dtype='float')
        Yu = np.array(Y, dtype='float')
        #Yu -= Yu.mean(0); Yu /= Yu.std(0)
        if center:
            Xu -= Xu.mean(0)
            Xu /= Xu.std(0)

        uKcis = SP.dot(Xu, Xu.T)
        uKtrans = uKpop - uKcis
        uKcis /= uKcis.diagonal().mean()
        uKtrans /= uKtrans.diagonal().mean()

        # 4.3 perform experiment and store results in out_gene
        out_gene = {}
        #print "cis scan"
        lm = QTL.test_lmm(snps=Xu, pheno=Yu, K=uKtrans, covs=uCov, verbose=True)
        pv = lm.getPv()
        RV = {}
        RV['pv'] = pv
        RV['qv'] = FDR.qvalues(pv)[0]
        RV['lambd'] = getLambda(pv)
        RV['beta'] = lm.getBetaSNP()
        RV['posLead'] = SP.array([getRealPos(info['pos'][pv[0, :].argmin()], start, end, strand)])
        RV['aDirPosLead'] = abs(SP.array([info['pos'][pv[0, :].argmin()] - 0.5 * (start + end)]))
        out_group = probe_group.create_group('lmm')
        dumpDictHdf5(RV, out_group)
        #print 'ok'
    except:
        continue
f.close()
def forward_lmm_kronecker(snps, phenos, Asnps=None, Acond=None, K1r=None, K1c=None, K2r=None, K2c=None,
                          covs=None, Acovs=None, threshold=5e-8, maxiter=2, qvalues=False,
                          update_covariances=False, verbose=None, **kw_args):
    """
    Kronecker fixed effects test with forward selection

    Args:
        snps:   [N x S] np.array of S SNPs for N individuals (test SNPs)
        phenos: [N x P] np.array of P phenotypes for N individuals
        Asnps:  list of SNP design matrices (default: a single [1 x P] matrix of ones, i.e. a common effect)
        Acond:  list of SNP design matrices used when conditioning on included SNPs (default: Asnps)
        K1r:    [N x N] sample-sample covariance, non-noise (default: snps * snps.T)
        K2r:    [N x N] sample-sample covariance, noise (default: identity)
        K1c:    [P x P] trait-trait covariance, non-noise (estimated if not provided)
        K2c:    [P x P] trait-trait covariance, noise (estimated if not provided)
        covs:   [N x D] np.array of D covariates for N individuals
        Acovs:  design matrix for the covariates
        threshold: (float) P-value threshold for inclusion in forward selection (default 5e-8)
        maxiter: (int) maximum number of iterations. The first scan is without inclusion,
                 so maxiter-1 inclusions can be performed. (default 2)
        qvalues: use a q-value threshold and return q-values in addition (default False)
        update_covariances: re-estimate the trait covariances after each forward step (default False)

    Returns:
        lm: limix LMM object
        resultStruct with elements:
            iadded:  array of indices of SNPs included in order of inclusion
            pvadded: array of P-values obtained by the included SNPs in the iteration before inclusion
            pvall:   [Nadded x S] np.array of P-values for all iterations
        Optional: corresponding q-values
            qvadded
            qvall
    """
    verbose = limix.getVerbose(verbose)
    # 0. checks
    N = phenos.shape[0]
    P = phenos.shape[1]

    if K1r is None:
        K1r = np.dot(snps, snps.T)
    else:
        assert K1r.shape[0] == N, 'K1r: dimensions mismatch'
        assert K1r.shape[1] == N, 'K1r: dimensions mismatch'

    if K2r is None:
        K2r = np.eye(N)
    else:
        assert K2r.shape[0] == N, 'K2r: dimensions mismatch'
        assert K2r.shape[1] == N, 'K2r: dimensions mismatch'

    covs, Acovs = _updateKronCovs(covs, Acovs, N, P)

    if Asnps is None:
        Asnps = [np.ones([1, P])]
    if type(Asnps) != list:
        Asnps = [Asnps]
    assert len(Asnps) > 0, "need at least one SNP design matrix"

    if Acond is None:
        Acond = Asnps
    if type(Acond) != list:
        Acond = [Acond]
    assert len(Acond) > 0, "need at least one SNP design matrix"

    # 1. run GP model to infer suitable covariance structure
    if K1c is None or K2c is None:
        vc = _estimateKronCovariances(phenos=phenos, K1r=K1r, K2r=K2r, K1c=K1c, K2c=K2c,
                                      covs=covs, Acovs=Acovs, **kw_args)
        K1c = vc.getTraitCovar(0)
        K2c = vc.getTraitCovar(1)
    else:
        vc = None
        assert K1c.shape[0] == P, 'K1c: dimensions mismatch'
        assert K1c.shape[1] == P, 'K1c: dimensions mismatch'
        assert K2c.shape[0] == P, 'K2c: dimensions mismatch'
        assert K2c.shape[1] == P, 'K2c: dimensions mismatch'

    # 2. initial genome-wide scan
    t0 = time.time()
    lm, pv = test_lmm_kronecker(snps=snps, phenos=phenos, Asnps=Asnps, K1r=K1r, K2r=K2r,
                                K1c=K1c, K2c=K2c, covs=covs, Acovs=Acovs)

    iadded = []
    pvadded = []
    qvadded = []
    time_el = []
    pvall = []
    qvall = None
    t1 = time.time()
    if verbose:
        print("finished GWAS testing in %.2f seconds" % (t1 - t0))
    time_el.append(t1 - t0)
    pvall.append(pv)
    imin = np.unravel_index(pv.argmin(), pv.shape)
    score = pv[imin].min()
    niter = 1
    if qvalues:
        assert pv.shape[0] == 1, "This is untested with the fdr package. pv.shape[0]==1 failed"
        qvall = []
        qv = FDR.qvalues(pv)
        qvall.append(qv)
        score = qv[imin]

    # 3. forward-selection loop
    while (score < threshold) and niter < maxiter:
        t0 = time.time()
        pvadded.append(pv[imin])
        iadded.append(imin)
        if qvalues:
            qvadded.append(qv[imin])
        if update_covariances and vc is not None:
            vc.addFixedTerm(snps[:, imin[1]:(imin[1] + 1)], Acond[imin[0]])
            vc.setScales()  # CL: don't know what this does, but findLocalOptima crashes because vc.noisPos=None
            vc.findLocalOptima(fast=True)
            K1c = vc.getTraitCovar(0)
            K2c = vc.getTraitCovar(1)
            lm.setK1c(K1c)
            lm.setK2c(K2c)
        lm.addCovariates(snps[:, imin[1]:(imin[1] + 1)], Acond[imin[0]])
        for i in xrange(len(Asnps)):
            # add SNP design
            lm.setSNPcoldesign(Asnps[i])
            lm.process()
            pv[i, :] = lm.getPv()[0]
        pvall.append(pv.ravel())
        imin = np.unravel_index(pv.argmin(), pv.shape)
        if qvalues:
            qv = FDR.qvalues(pv)
            qvall.append(qv)  # collect the q-values of this iteration
            score = qv[imin].min()
        else:
            score = pv[imin].min()
        t1 = time.time()
        if verbose:
            print("finished GWAS testing in %.2f seconds" % (t1 - t0))
        time_el.append(t1 - t0)
        niter = niter + 1

    RV = {}
    RV['iadded'] = iadded
    RV['pvadded'] = pvadded
    RV['pvall'] = np.array(pvall)
    RV['time_el'] = time_el
    if qvalues:
        RV['qvall'] = qvall
        RV['qvadded'] = qvadded
    return lm, RV
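# --- Usage sketch (not part of the original module) -----------------------------------------
# A minimal call with synthetic data, assuming the limix environment this module imports
# (np, FDR, test_lmm_kronecker, etc.) is available. Shapes follow the docstring: N samples,
# S SNPs, P traits. Kept as a comment because it depends on that environment.
#
#     N, S, P = 100, 500, 2
#     snps = np.random.binomial(2, 0.3, (N, S)).astype(float)
#     phenos = np.random.randn(N, P)
#     lm, RV = forward_lmm_kronecker(snps, phenos, threshold=1e-3, maxiter=3, verbose=True)
#     print RV['iadded'], RV['pvadded']    # (design, SNP) indices added and their p-values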
def forward_lmm(snps, pheno, K=None, covs=None, qvalues=False, threshold=5e-8, maxiter=2,
                test='lrt', verbose=None, **kw_args):
    """
    univariate fixed effects test with forward selection

    Args:
        snps:   [N x S] np.array of S SNPs for N individuals (test SNPs)
        pheno:  [N x 1] np.array of 1 phenotype for N individuals
        K:      [N x N] np.array of LMM-covariance/kinship coefficients (optional)
                If not provided, then linear regression analysis is performed
        covs:   [N x D] np.array of D covariates for N individuals
        threshold: (float) P-value threshold for inclusion in forward selection (default 5e-8)
        maxiter: (int) maximum number of iterations. The first scan is without inclusion,
                 so maxiter-1 inclusions can be performed. (default 2)
        test:   'lrt' for likelihood ratio test (default) or 'f' for F-test
        verbose: print verbose output? (False)

    Returns:
        lm: limix LMM object
        RV: dictionary
            RV['iadded']:  array of indices of SNPs included in order of inclusion
            RV['pvadded']: array of P-values obtained by the included SNPs in the iteration before inclusion
            RV['pvall']:   [Nadded x S] np.array of P-values for all iterations
    """
    verbose = limix.getVerbose(verbose)

    if K is None:
        K = np.eye(snps.shape[0])
    if covs is None:
        covs = np.ones((snps.shape[0], 1))
    # assert single trait
    assert pheno.shape[1] == 1, 'forward_lmm only supports single phenotypes'

    lm = test_lmm(snps, pheno, K=K, covs=covs, test=test, **kw_args)
    pvall = []
    pv = lm.getPv().ravel()
    pvall.append(pv)
    imin = pv.argmin()
    niter = 1
    iadded = []
    pvadded = []
    qvadded = []
    if qvalues:
        assert pv.shape[0] == 1, "This is untested with the fdr package. pv.shape[0]==1 failed"
        qvall = []
        qv = FDR.qvalues(pv)
        qvall.append(qv)
        score = qv.min()
    else:
        score = pv.min()

    while (score < threshold) and niter < maxiter:
        t0 = time.time()
        iadded.append(imin)
        pvadded.append(pv[imin])
        if qvalues:
            qvadded.append(qv[0, imin])
        covs = np.concatenate((covs, snps[:, imin:(imin + 1)]), 1)
        lm.setCovs(covs)
        lm.process()
        pv = lm.getPv().ravel()
        pvall.append(pv)
        imin = pv.argmin()
        if qvalues:
            qv = FDR.qvalues(pv)
            qvall.append(qv)  # collect the q-values of this iteration
            score = qv.min()
        else:
            score = pv.min()
        t1 = time.time()
        if verbose:
            print("finished GWAS testing in %.2f seconds" % (t1 - t0))
        niter = niter + 1

    RV = {}
    RV['iadded'] = iadded
    RV['pvadded'] = pvadded
    RV['pvall'] = np.array(pvall)
    if qvalues:
        RV['qvall'] = np.array(qvall)
        RV['qvadded'] = qvadded
    return lm, RV
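# --- Illustration (not part of the original module) -----------------------------------------
# forward_lmm() above repeats: scan all SNPs, add the strongest one to the covariates, rescan.
# The toy below reproduces that loop with plain OLS t-tests (no kinship, no limix), only to make
# the control flow concrete; the function name and data are hypothetical, and y is a 1-D vector.
def _forward_ols_sketch(snps, y, threshold=1e-3, maxiter=3):
    import numpy as np
    from scipy import stats as st
    N, S = snps.shape
    covs = np.ones((N, 1))
    iadded, pvadded = [], []
    for _ in range(maxiter):
        pv = np.empty(S)
        for j in range(S):                       # single-SNP scan given the current covariates
            X = np.hstack([covs, snps[:, j:j + 1]])
            beta, res, rank, _ = np.linalg.lstsq(X, y, rcond=None)
            dof = N - X.shape[1]
            sigma2 = res[0] / dof if res.size else 0.0
            se = np.sqrt(sigma2 * np.linalg.inv(X.T.dot(X))[-1, -1])
            t = beta[-1] / se if se > 0 else 0.0
            pv[j] = 2 * st.t.sf(abs(t), dof)
        imin = pv.argmin()
        if pv[imin] >= threshold:                # stop once the strongest SNP is no longer significant
            break
        iadded.append(imin)                      # include the strongest SNP as a covariate
        pvadded.append(pv[imin])
        covs = np.hstack([covs, snps[:, imin:imin + 1]])
    return iadded, pvadded

# Example (synthetic data; typically selects SNP 7 first):
#     import numpy as np
#     rng = np.random.RandomState(1)
#     snps = rng.binomial(2, 0.4, (80, 50)).astype(float)
#     y = 0.8 * snps[:, 7] + rng.randn(80)
#     print _forward_ols_sketch(snps, y)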
# store results
RV['pv'] = pv  # record nominal p-values

if n_perm > 100:
    # compute how many MINIMUM permuted p-values for each permutation are less than the
    # minimum nominal p-value and store the value
    RV['pv_perm'] = SP.array([((r + 1) / (float(n_perm) + 1))], dtype=float)  # one value per gene
    RV['pv_perm'] = RV['pv_perm'].reshape((1, len(RV['pv_perm'])))  # reshape to an array of shape (1, 1)
else:  # if n_perm <= 100
    #perm_pv = lmm_perm.getPv()
    # record the minimum permuted p-value after 1 permutation
    #RV['pv_perm'] = SP.array([perm_pv[:].min()])
    RV['pv_perm'] = SP.array([min_perm_pvalue])

if multiple_test_correction == 'fdr':  # default
    RV['qv'] = FDR.qvalues(pv)  # multiple-testing correction of nominal p-values with Benjamini-Hochberg
else:
    RV['qv'] = SP.empty(pv.shape, dtype=float)
    tmp_qv = SP.array(stats.p_adjust(FloatVector(pv[0].tolist()),
                                     method='bonferroni',
                                     n=float(pv.shape[1])))  # multiple-testing correction of nominal p-values with Bonferroni
    RV['qv'][0, :] = tmp_qv

RV['lambda'] = getLambda(pv)  # get lambda for the nominal p-values
if n_perm <= 100:
    #RV['lambda_perm'] = getLambda(perm_pv[:])  # calculate lambda on the permutation
    RV['lambda_perm'] = SP.array([mean_perm_lambda, std_perm_lambda])

if change_beta_sign == 'y':
    RV['beta'] = -lmm.getBetaSNP(