def _robustness_test_(all_snps, phenVals, outputFile, filter=0.1):
    """
    Leave one out test for EMMA p-values.

    SNPs for which snp.count(0) or snp.count(1) is at most 1 are dropped and
    a kinship matrix is computed (calcKinship) from all remaining SNPs.  EMMA
    is then run once on the full data and once per individual, with that
    individual removed from the phenotype values, from every SNP and from
    the kinship matrix.  For every SNP the differences "full-data value minus
    leave-one-out value" are collected for the p-values and for the -log10
    p-values, and util.calcSD is applied to each SNP's differences.

    Files written, all prefixed with outputFile:
      .rob.log_pvals_sd     CSV: -log10 full-data p-value, SD of log deltas
      .rob.log_pval.png     log deltas against -log10 full-data p-values
      .rob.pval.png         p-value deltas against full-data p-values
      .rob.sd_log_pval.png  SD of log deltas against -log10 p-values
      .rob.sd_pval.png      SD of p-value deltas against p-values

    Arguments:
      all_snps   -- sequence of SNPs; each SNP must support count(), slicing
                    and pop() (presumably a list of 0/1 codes with one entry
                    per individual -- confirm against callers)
      phenVals   -- phenotype values, one per individual; must support
                    slicing and pop()
      outputFile -- path prefix for every output file
      filter     -- when below 1.0, int(len(snps) * filter) SNPs are drawn
                    with random.sample and only those are tested; the
                    kinship matrix is still computed from all retained SNPs

    Returns None.  math.log raises ValueError if any p-value is 0.
    """
    # NOTE(review): the visible source contains more than one definition
    # named _robustness_test_; if they share a module, only the last one
    # executed stays bound to the name.

    # NOTE(review): neither of these two modules is referenced below.  They
    # are kept in case something relies on their import-time side effects;
    # confirm and drop them if not.
    import analyzePhenotype
    import analyzeHaplotype
    import csv
    import random

    def _say(*parts):
        # Writes the same text as the Python 2 statement ``print a, b, c``
        # while keeping this function parseable by Python 3 tooling as well.
        print(" ".join([str(part) for part in parts]))

    def _timed(label, func, *args):
        # Announce a step, run it, and report the wall-clock time it took.
        _say(label)
        started = time.time()
        outcome = func(*args)
        _say("Took", time.time() - started, "seconds.")
        return outcome

    def _emma_pvals(snp_rows, phen_vals, kinship):
        # EMMA p-values converted to plain floats.
        return [float(p) for p in _runEmma_(snp_rows, phen_vals, kinship)["ps"]]

    def _without(values, idx):
        # Copy of ``values`` with the entry at position ``idx`` removed.
        shortened = values[:]
        shortened.pop(idx)
        return shortened

    def _leave_one_out_kinship(kinship, idx):
        # Remove row ``idx`` and column ``idx`` from the kinship matrix.
        keep = list(range(len(kinship)))
        keep.pop(idx)
        # numpy.take is the public name of what numpy.core.take pointed to.
        rows_kept = numpy.take(kinship, keep, axis=0)
        return numpy.take(rows_kept, keep, axis=1)

    def _padded(low, high):
        # Axis limits widened by 2% of the span on each side.
        margin = 0.02 * (high - low)
        return [low - margin, high + margin]

    def _delta_limits(series):
        # y-limits for the delta plots; zero is always inside the range.
        highest = max([0] + [max(deltas) for deltas in series])
        lowest = min([0] + [min(deltas) for deltas in series])
        return _padded(lowest, highest)

    kept_snps = [snp for snp in all_snps
                 if snp.count(0) > 1 and snp.count(1) > 1]
    _say("Filtered", len(all_snps) - len(kept_snps),
         " with minor allele count <2.")
    all_snps = kept_snps

    K = _timed("Calculating kinship", calcKinship, all_snps)

    if filter < 1.0:
        snps = random.sample(all_snps, int(len(all_snps) * filter))
        _say("Number of SNPs:", len(snps))
    else:
        snps = all_snps

    true_pvals = _timed("running EMMA", _emma_pvals, snps, phenVals, K)
    log_true_pvals = [-math.log(p, 10) for p in true_pvals]

    # One EMMA run per individual, with that individual left out everywhere.
    perm_pvalues_list = []
    for left_out in range(len(phenVals)):
        loo_phen = _without(phenVals, left_out)
        loo_snps = [_without(snp, left_out) for snp in snps]
        loo_kinship = _leave_one_out_kinship(K, left_out)
        perm_pvalues_list.append(
            _timed("running EMMA", _emma_pvals,
                   loo_snps, loo_phen, loo_kinship))

    # Differences are "full data minus leave-one-out", per run and per SNP.
    delta_pvals_list = []
    delta_log_pvals_list = []
    for perm_pvals in perm_pvalues_list:
        delta_pvals_list.append(
            [full - loo for full, loo in zip(true_pvals, perm_pvals)])
        delta_log_pvals_list.append(
            [log_full - (-math.log(loo, 10))
             for log_full, loo in zip(log_true_pvals, perm_pvals)])

    # Transpose run-major lists to SNP-major before taking the SD per SNP.
    sd_log_pvals = [util.calcSD(list(per_snp))
                    for per_snp in zip(*delta_log_pvals_list)]
    sd_pvals = [util.calcSD(list(per_snp))
                for per_snp in zip(*delta_pvals_list)]

    # Write SDs out to file, to be able to replot, or plot together with
    # other methods.  ``with`` closes the file even if a write fails.
    with open(outputFile + ".rob.log_pvals_sd", "w") as sd_file:
        writer = csv.writer(sd_file)
        writer.writerow(["log_true_pval", "sd_log_pvals"])
        writer.writerows(zip(log_true_pvals, sd_log_pvals))

    _say("Plotting graphs")
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    def _save_scatter(png_path, x_vals, y_series, x_limits, y_limits):
        # One figure: every series in ``y_series`` as blue dots over x_vals.
        plt.figure(figsize=(10, 7))
        for y_vals in y_series:
            plt.plot(x_vals, y_vals, "b.")
        plt.axis(x_limits + y_limits)
        plt.savefig(png_path, format="png")

    pval_limits = _padded(min(true_pvals), max(true_pvals))
    # The log-scale x-limits are padded by the span of the -log10 values.
    # Previously the span of the raw p-values was used here by mistake.
    log_pval_limits = _padded(min(log_true_pvals), max(log_true_pvals))

    _save_scatter(outputFile + ".rob.log_pval.png",
                  log_true_pvals, delta_log_pvals_list,
                  log_pval_limits, _delta_limits(delta_log_pvals_list))
    _save_scatter(outputFile + ".rob.pval.png",
                  true_pvals, delta_pvals_list,
                  pval_limits, _delta_limits(delta_pvals_list))
    _save_scatter(outputFile + ".rob.sd_log_pval.png",
                  log_true_pvals, [sd_log_pvals],
                  log_pval_limits,
                  _padded(min(sd_log_pvals), max(sd_log_pvals)))
    _save_scatter(outputFile + ".rob.sd_pval.png",
                  true_pvals, [sd_pvals],
                  pval_limits,
                  _padded(min(sd_pvals), max(sd_pvals)))

    _say("Done testing robustness")
def _robustness_test_(all_snps, phenVals, outputFile, filter=0.1):
    """
    Leave one out test for EMMA p-values.

    SNPs for which snp.count(0) or snp.count(1) is at most 1 are dropped and
    a kinship matrix is computed (calcKinship) from all remaining SNPs.  EMMA
    is then run once on the full data and once per individual, with that
    individual removed from the phenotype values, from every SNP and from
    the kinship matrix.  For every SNP the differences "full-data value minus
    leave-one-out value" are collected for the p-values and for the -log10
    p-values, and util.calcSD is applied to each SNP's differences.

    Files written, all prefixed with outputFile:
      .rob.log_pvals_sd     CSV: -log10 full-data p-value, SD of log deltas
      .rob.log_pval.png     log deltas against -log10 full-data p-values
      .rob.pval.png         p-value deltas against full-data p-values
      .rob.sd_log_pval.png  SD of log deltas against -log10 p-values
      .rob.sd_pval.png      SD of p-value deltas against p-values

    Arguments:
      all_snps   -- sequence of SNPs; each SNP must support count(), slicing
                    and pop() (presumably a list of 0/1 codes with one entry
                    per individual -- confirm against callers)
      phenVals   -- phenotype values, one per individual; must support
                    slicing and pop()
      outputFile -- path prefix for every output file
      filter     -- when below 1.0, int(len(snps) * filter) SNPs are drawn
                    with random.sample and only those are tested; the
                    kinship matrix is still computed from all retained SNPs

    Returns None.  math.log raises ValueError if any p-value is 0.
    """
    # NOTE(review): the visible source contains more than one definition
    # named _robustness_test_; if they share a module, only the last one
    # executed stays bound to the name.

    # NOTE(review): neither of these two modules is referenced below.  They
    # are kept in case something relies on their import-time side effects;
    # confirm and drop them if not.
    import analyzePhenotype
    import analyzeHaplotype
    import csv
    import random

    def _say(*parts):
        # Writes the same text as the Python 2 statement ``print a, b, c``
        # while keeping this function parseable by Python 3 tooling as well.
        print(" ".join([str(part) for part in parts]))

    def _timed(label, func, *args):
        # Announce a step, run it, and report the wall-clock time it took.
        _say(label)
        started = time.time()
        outcome = func(*args)
        _say("Took", time.time() - started, "seconds.")
        return outcome

    def _emma_pvals(snp_rows, phen_vals, kinship):
        # EMMA p-values converted to plain floats.
        return [float(p) for p in _runEmma_(snp_rows, phen_vals, kinship)["ps"]]

    def _without(values, idx):
        # Copy of ``values`` with the entry at position ``idx`` removed.
        shortened = values[:]
        shortened.pop(idx)
        return shortened

    def _leave_one_out_kinship(kinship, idx):
        # Remove row ``idx`` and column ``idx`` from the kinship matrix.
        keep = list(range(len(kinship)))
        keep.pop(idx)
        # numpy.take is the public name of what numpy.core.take pointed to.
        rows_kept = numpy.take(kinship, keep, axis=0)
        return numpy.take(rows_kept, keep, axis=1)

    def _padded(low, high):
        # Axis limits widened by 2% of the span on each side.
        margin = 0.02 * (high - low)
        return [low - margin, high + margin]

    def _delta_limits(series):
        # y-limits for the delta plots; zero is always inside the range.
        highest = max([0] + [max(deltas) for deltas in series])
        lowest = min([0] + [min(deltas) for deltas in series])
        return _padded(lowest, highest)

    kept_snps = [snp for snp in all_snps
                 if snp.count(0) > 1 and snp.count(1) > 1]
    _say("Filtered", len(all_snps) - len(kept_snps),
         " with minor allele count <2.")
    all_snps = kept_snps

    K = _timed("Calculating kinship", calcKinship, all_snps)

    if filter < 1.0:
        snps = random.sample(all_snps, int(len(all_snps) * filter))
        _say("Number of SNPs:", len(snps))
    else:
        snps = all_snps

    true_pvals = _timed("running EMMA", _emma_pvals, snps, phenVals, K)
    log_true_pvals = [-math.log(p, 10) for p in true_pvals]

    # One EMMA run per individual, with that individual left out everywhere.
    perm_pvalues_list = []
    for left_out in range(len(phenVals)):
        loo_phen = _without(phenVals, left_out)
        loo_snps = [_without(snp, left_out) for snp in snps]
        loo_kinship = _leave_one_out_kinship(K, left_out)
        perm_pvalues_list.append(
            _timed("running EMMA", _emma_pvals,
                   loo_snps, loo_phen, loo_kinship))

    # Differences are "full data minus leave-one-out", per run and per SNP.
    delta_pvals_list = []
    delta_log_pvals_list = []
    for perm_pvals in perm_pvalues_list:
        delta_pvals_list.append(
            [full - loo for full, loo in zip(true_pvals, perm_pvals)])
        delta_log_pvals_list.append(
            [log_full - (-math.log(loo, 10))
             for log_full, loo in zip(log_true_pvals, perm_pvals)])

    # Transpose run-major lists to SNP-major before taking the SD per SNP.
    sd_log_pvals = [util.calcSD(list(per_snp))
                    for per_snp in zip(*delta_log_pvals_list)]
    sd_pvals = [util.calcSD(list(per_snp))
                for per_snp in zip(*delta_pvals_list)]

    # Write SDs out to file, to be able to replot, or plot together with
    # other methods.  ``with`` closes the file even if a write fails.
    with open(outputFile + ".rob.log_pvals_sd", "w") as sd_file:
        writer = csv.writer(sd_file)
        writer.writerow(["log_true_pval", "sd_log_pvals"])
        writer.writerows(zip(log_true_pvals, sd_log_pvals))

    _say("Plotting graphs")
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    def _save_scatter(png_path, x_vals, y_series, x_limits, y_limits):
        # One figure: every series in ``y_series`` as blue dots over x_vals.
        plt.figure(figsize=(10, 7))
        for y_vals in y_series:
            plt.plot(x_vals, y_vals, "b.")
        plt.axis(x_limits + y_limits)
        plt.savefig(png_path, format="png")

    pval_limits = _padded(min(true_pvals), max(true_pvals))
    # The log-scale x-limits are padded by the span of the -log10 values.
    # Previously the span of the raw p-values was used here by mistake.
    log_pval_limits = _padded(min(log_true_pvals), max(log_true_pvals))

    _save_scatter(outputFile + ".rob.log_pval.png",
                  log_true_pvals, delta_log_pvals_list,
                  log_pval_limits, _delta_limits(delta_log_pvals_list))
    _save_scatter(outputFile + ".rob.pval.png",
                  true_pvals, delta_pvals_list,
                  pval_limits, _delta_limits(delta_pvals_list))
    _save_scatter(outputFile + ".rob.sd_log_pval.png",
                  log_true_pvals, [sd_log_pvals],
                  log_pval_limits,
                  _padded(min(sd_log_pvals), max(sd_log_pvals)))
    _save_scatter(outputFile + ".rob.sd_pval.png",
                  true_pvals, [sd_pvals],
                  pval_limits,
                  _padded(min(sd_pvals), max(sd_pvals)))

    _say("Done testing robustness")
def _robustness_test_(all_snps, phenVals, outputFile, filter=0.1, test_type="KW"):
    """
    Leave one out test..

    SNPs for which snp.count(0) or snp.count(1) is at most 1 are dropped.
    The selected association test is run once on the full data and once per
    individual, with that individual removed from the phenotype values and
    from every SNP.  For every SNP the differences "full-data value minus
    leave-one-out value" are collected for the p-values and for the -log10
    p-values, and util.calcSD is applied to each SNP's differences.

    A leave-one-out p-value that is not greater than 0 is reported on stdout
    and its -log10 value is replaced by the full-data one, so its log delta
    is 0.

    Files written, all prefixed with outputFile:
      .rob.log_pvals_sd     CSV: -log10 full-data p-value, SD of log deltas
      .rob.log_pval.png     log deltas against -log10 full-data p-values
      .rob.pval.png         p-value deltas against full-data p-values
      .rob.sd_log_pval.png  SD of log deltas against -log10 p-values
      .rob.sd_pval.png      SD of p-value deltas against p-values

    Arguments:
      all_snps   -- sequence of SNPs; each SNP must support count(), slicing
                    and pop() (presumably a list of 0/1 codes with one entry
                    per individual -- confirm against callers)
      phenVals   -- phenotype values, one per individual; must support
                    slicing and pop()
      outputFile -- path prefix for every output file
      filter     -- when below 1.0, int(len(snps) * filter) SNPs are drawn
                    with random.sample and only those are tested
      test_type  -- "KW" uses util.kruskal_wallis(...)["ps"],
                    "Fisher" uses run_fet(...)

    Returns None.  Raises ValueError for any other test_type (this used to
    surface later as a NameError), and math.log raises ValueError if a
    full-data p-value is 0.
    """
    # NOTE(review): the visible source contains more than one definition
    # named _robustness_test_; if they share a module, only the last one
    # executed stays bound to the name.
    import csv
    # Imported locally, as the sibling definitions of this function do, so
    # the sampling below does not depend on a module-level import.
    import random

    def _say(*parts):
        # Writes the same text as the Python 2 statement ``print a, b, c``
        # while keeping this function parseable by Python 3 tooling as well.
        print(" ".join([str(part) for part in parts]))

    # Resolve the test up front so a bad test_type fails before any work.
    if test_type == "KW":
        banner = "running KW"

        def _association_pvals(snp_rows, phen_vals):
            return util.kruskal_wallis(snp_rows, phen_vals)["ps"]
    elif test_type == "Fisher":
        banner = "running Fisher's exact test"

        def _association_pvals(snp_rows, phen_vals):
            return run_fet(snp_rows, phen_vals)
    else:
        raise ValueError(
            "Unknown test_type %r; expected 'KW' or 'Fisher'" % (test_type,))

    def _timed_pvals(snp_rows, phen_vals):
        # Announce the test, run it, and report the wall-clock time it took.
        _say(banner)
        started = time.time()
        outcome = _association_pvals(snp_rows, phen_vals)
        _say("Took", time.time() - started, "seconds.")
        return outcome

    def _without(values, idx):
        # Copy of ``values`` with the entry at position ``idx`` removed.
        shortened = values[:]
        shortened.pop(idx)
        return shortened

    def _padded(low, high):
        # Axis limits widened by 2% of the span on each side.
        margin = 0.02 * (high - low)
        return [low - margin, high + margin]

    def _delta_limits(series):
        # y-limits for the delta plots; zero is always inside the range.
        highest = max([0] + [max(deltas) for deltas in series])
        lowest = min([0] + [min(deltas) for deltas in series])
        return _padded(lowest, highest)

    kept_snps = [snp for snp in all_snps
                 if snp.count(0) > 1 and snp.count(1) > 1]
    _say("Filtered", len(all_snps) - len(kept_snps),
         " with minor allele count <2.")
    all_snps = kept_snps

    if filter < 1.0:
        snps = random.sample(all_snps, int(len(all_snps) * filter))
        _say("Number of SNPs:", len(snps))
    else:
        snps = all_snps

    true_pvals = _timed_pvals(snps, phenVals)
    log_true_pvals = [-math.log(p, 10) for p in true_pvals]

    # One test run per individual, with that individual left out everywhere.
    perm_pvalues_list = []
    for left_out in range(len(phenVals)):
        loo_phen = _without(phenVals, left_out)
        loo_snps = [_without(snp, left_out) for snp in snps]
        _say(left_out)
        perm_pvalues_list.append(_timed_pvals(loo_snps, loo_phen))

    # Differences are "full data minus leave-one-out", per run and per SNP.
    delta_pvals_list = []
    delta_log_pvals_list = []
    for perm_pvals in perm_pvalues_list:
        delta_pvals = []
        delta_log_pvals = []
        for snp_idx, log_full in enumerate(log_true_pvals):
            loo_pval = perm_pvals[snp_idx]
            delta_pvals.append(true_pvals[snp_idx] - loo_pval)
            if loo_pval > 0.0:
                log_loo = -math.log(loo_pval, 10)
            else:
                # log10 is undefined here; fall back to the full-data value.
                _say("Damn those random 0 prob. events: event #", snp_idx)
                log_loo = log_full
            delta_log_pvals.append(log_full - log_loo)
        delta_pvals_list.append(delta_pvals)
        delta_log_pvals_list.append(delta_log_pvals)

    # Transpose run-major lists to SNP-major before taking the SD per SNP.
    sd_log_pvals = [util.calcSD(list(per_snp))
                    for per_snp in zip(*delta_log_pvals_list)]
    sd_pvals = [util.calcSD(list(per_snp))
                for per_snp in zip(*delta_pvals_list)]

    # Write SDs out to file, to be able to replot, or plot together with
    # other methods.  ``with`` closes the file even if a write fails.
    with open(outputFile + ".rob.log_pvals_sd", "w") as sd_file:
        writer = csv.writer(sd_file)
        writer.writerow(["log_true_pval", "sd_log_pvals"])
        writer.writerows(zip(log_true_pvals, sd_log_pvals))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    def _save_scatter(png_path, x_vals, y_series, x_limits, y_limits):
        # One figure: every series in ``y_series`` as blue dots over x_vals.
        plt.figure(figsize=(10, 7))
        for y_vals in y_series:
            plt.plot(x_vals, y_vals, "b.")
        plt.axis(x_limits + y_limits)
        plt.savefig(png_path, format="png")

    pval_limits = _padded(min(true_pvals), max(true_pvals))
    # The log-scale x-limits are padded by the span of the -log10 values.
    # Previously the span of the raw p-values was used here by mistake.
    log_pval_limits = _padded(min(log_true_pvals), max(log_true_pvals))

    _save_scatter(outputFile + ".rob.log_pval.png",
                  log_true_pvals, delta_log_pvals_list,
                  log_pval_limits, _delta_limits(delta_log_pvals_list))
    _save_scatter(outputFile + ".rob.pval.png",
                  true_pvals, delta_pvals_list,
                  pval_limits, _delta_limits(delta_pvals_list))
    _save_scatter(outputFile + ".rob.sd_log_pval.png",
                  log_true_pvals, [sd_log_pvals],
                  log_pval_limits,
                  _padded(min(sd_log_pvals), max(sd_log_pvals)))
    _save_scatter(outputFile + ".rob.sd_pval.png",
                  true_pvals, [sd_pvals],
                  pval_limits,
                  _padded(min(sd_pvals), max(sd_pvals)))