def _run_():
    p_dict, args = parse_parameters()
    print "GWA runs are being set up with the following parameters:"
    for k, v in p_dict.iteritems():
        print k + ': ' + str(v)
    print ''

    #Load the phenotype file
    if p_dict['phen_file']:
        print 'Loading phenotypes from file.'
        phed = phenotypeData.readPhenotypeFile(p_dict['phen_file'],
                    with_db_ids=(not p_dict['no_phenotype_ids']))
    else:
        print 'Retrieving the phenotypes from the DB.'
        phed = phenotypeData.getPhenotypes()

    #If on the cluster, then set up the runs..
    if p_dict['parallel']:
        if len(p_dict['pids']) == 0:
            #The phenotype index argument is missing, hence all phenotypes are analyzed.
            if not p_dict['phen_file']:
                raise Exception('Phenotype file or phenotype ID is missing.')
            p_dict['pids'] = phed.phenIds
        else:
            raise Exception('Too many arguments..')
        if p_dict['analysis_plots']:
            for p_i in p_dict['pids']:
                run_parallel(p_i, phed, p_dict)
        else:
            for mapping_method in p_dict['specific_methods']:
                for trans_method in p_dict['specific_transformations']:
                    for p_i in p_dict['pids']:
                        run_parallel(p_i, phed, p_dict, mapping_method, trans_method)
        return  #Exiting the program...

    #SNPs data file name
    if not p_dict['data_file']:
        snps_data_file = '%s250K_t%d.csv' % (env['data_dir'], p_dict['call_method_id'])
    else:
        snps_data_file = p_dict['data_file']

    #Either draw the analysis plots...
    if p_dict['analysis_plots']:
        analysis_plots(snps_data_file, phed, p_dict)
    else:
        #...or run the GWAS.
        for p_i in p_dict['pids']:
            print '-' * 120, '\n'
            phenotype_name = phed.getPhenotypeName(p_i)
            print "Performing GWAS for phenotype: %s, phenotype_id: %s" % (phenotype_name, p_i)
            for trans_method in p_dict['specific_transformations']:
                print 'Phenotype transformation:', trans_method
                for mapping_method in p_dict['specific_methods']:
                    #DO ANALYSIS
                    print 'Mapping method:', mapping_method
                    map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict)
def _runTest_():
    import dataParsers
    import phenotypeData
    #Get phenotype data
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_transformed_publishable_v2.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    #Get SNPs data
    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile)
    psFile = env.homedir + "tmp/tree.ps"
    marg_file = env.homedir + "tmp/test"
    out_file = env.homedir + "tmp/test_out"
    rFile = env.homedir + "tmp/tree_test.r"
    #Run Margarita
    marg = Margarita(marg_file, out_file)
    chr = 4
    snpsd = snpsds[chr - 1].getSnpsData()
    marg.gwaWithTrees(marg_file, snpsd, phed, phenotype=1, numMarkers=200, chromosome=chr,
                boundaries=[200000, 350000], numPerm=1, cutoff=16, numArg=100)
    #(self, id, snpsd, phed, phenotype=0, boundaries=None, numMarkers=100, numPerm=500000, cutoff=16, numArg=50)
    #Which marginal tree to plot:
    runNum = 1
    argNum = 1
    markerNum = 1
    marg.parseTreeFile(marg_file + ".marg.trees", rFile, psFile, runNum, argNum, markerNum)
def _fakeTransformPhenotypes_():
    import os
    res_dir = "/Network/Data/250k/tmp-bvilhjal/emma_results/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    for p_i in transformationMap:
        (trans_type, data_type) = transformationMap[p_i]
        print trans_type
        if trans_type == 1 or trans_type == 6:
            t_type = "new_logTransform"
        elif trans_type == 9:
            t_type = "new_logTransform_f192"
        elif trans_type == 5:
            t_type = "newDataset_logTransform_noOutliers"
        elif trans_type == 7:
            t_type = "new_ranks"
        elif trans_type == 8:
            t_type = "neg_const_logTransform"
        else:  #trans_type == 0
            t_type = "new_raw"
        phenName = phed.getPhenotypeName(p_i)
#        oldName = res_dir + "Emma_" + t_type + "_" + phenName + ".pvals"
#        newName = res_dir + "Emma_new_trans_" + phenName + ".pvals"
#        cp_command = "sudo cp " + oldName + " " + newName
#        print cp_command
#        os.system(cp_command)
        oldName = res_dir + "Emma_" + t_type + "_" + phenName + ".sr.pvals"
        newName = res_dir + "Emma_new_trans_" + phenName + ".sr.pvals"
        cp_command = "sudo cp " + oldName + " " + newName
        print cp_command
        os.system(cp_command)
def _plotKinshipDiffs_():
    filterProb = 0.2
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"
    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)
    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()
    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps
    #To save memory, keep only a random subset of the SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps
    print "Calculating the global kinship..."
    globalKinship = calcKinship(totalSNPs)
    print "done."
    normalizedGlobalKinship = globalKinship / mean(globalKinship)
    gc.collect()  #Calling the garbage collector, in an attempt to clean up memory..
    for i in range(4, 5):  #len(snpsds)):
        chr = i + 1
        snpsd = snpsds[i]
        #pylab.subplot(5, 1, chr)
        #pylab.figure(figsize=(18, 4))
        #(kinshipDiffs, binPos, local300Kinships) = getKinshipDiffs(snpsd, normalizedGlobalKinship, windowSize=300000)
        #pylab.plot(binPos, kinshipDiffs, "r", label='ws$=300000$')
        #(kinshipDiffs, binPos, local500Kinships) = getKinshipDiffs(snpsd, normalizedGlobalKinship, windowSize=500000)
        #pylab.plot(binPos, kinshipDiffs, "b", label='ws$=500000$')
        #pylab.legend(numpoints=2, handlelen=0.005)
        #pylab.title("Kinship diff. chr. " + str(chr))
        #pylab.savefig(res_dir + runId + "kinshipDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf")
        #pylab.clf()
        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label="ws$=300000$")
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label="ws$=500000$")
        pylab.title("Emma avg. p-value diff. on chr. " + str(chr))
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf")
        pylab.clf()
    gc.collect()  #Calling the garbage collector, in an attempt to clean up memory..
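#getKinshipDiffs and getEmmaDiffs (defined elsewhere) both slide a fixed-size
#window along the chromosome and compare a local statistic against the global
#one. The helper below is only a minimal sketch of that windowing step, under
#the assumption that windows are consecutive and non-overlapping; the name is
#hypothetical and the function is not part of the original module.
def _window_bins_demo_(positions, window_size=300000):
    """Group SNP indices into consecutive fixed-size windows."""
    bins = []  #list of (window center, [SNP indices]) tuples
    if not positions:
        return bins
    start = positions[0]
    while start <= positions[-1]:
        idxs = [i for i, pos in enumerate(positions) if start <= pos < start + window_size]
        if idxs:
            bins.append((start + window_size / 2, idxs))
        start += window_size
    return bins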
def runParallel(phenotypeIndex):
    #Cluster specific parameters
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
    phenName = phed.getPhenotypeName(phenotypeIndex)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")
    impFileName = resultDir + "RF_" + parallel + "_" + phenName
    outFileName = impFileName
    shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
"""
    shstr += "#PBS -l mem=" + mem + "\n"
    shstr += """
#PBS -q cmb
"""
    shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
    shstr += ("(python " + programDir + "RandomForest.py -o " + impFileName
              + " --chunkSize " + str(chunkSize) + " --nTrees " + str(nTrees)
              + " --mem " + str(mem) + " --round2Size " + str(round2Size))
    if nodeSize:
        shstr += " --nodeSize " + str(nodeSize) + " "
    if logTransform:
        shstr += " --logTransform "
    if not skipSecondRound:
        shstr += " --secondRound "
    shstr += " -a " + str(withArrayIds) + " "
    shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
    shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"
    f = open(parallel + ".sh", 'w')
    f.write(shstr)
    f.close()
    #Execute the qsub script
    os.system("qsub " + parallel + ".sh ")
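#For reference, the qsub script that runParallel above assembles and submits
#has roughly the following shape (the exact mem value, paths and option list
#depend on the module-level settings; the bracketed names below just stand in
#for the corresponding variables):
#
#    #!/bin/csh
#    #PBS -l walltime=120:00:00
#    #PBS -l mem=<mem>
#    #PBS -q cmb
#    #PBS -N RF<phenName>_<parallel>
#    (python <programDir>RandomForest.py -o <impFileName> --chunkSize <chunkSize> \
#        --nTrees <nTrees> --mem <mem> --round2Size <round2Size> -a <withArrayIds> \
#        <snpsDataFile> <phenotypeDataFile> <phenotypeIndex> \
#        > <outFileName>_job.out) >& <outFileName>_job.err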
def runParallel(phenotypeIndex, id=""): #Cluster specific parameters phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/", "_div_") phenName = phenName.replace("*", "_star_") outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id shstr = """#!/bin/csh #PBS -l walltime=24:00:00 #PBS -l mem=6g #PBS -q cmb """ shstr += "#PBS -N C" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " " if sampleNum: shstr += " -n " + str(sampleNum) + " " if useFloats: shstr += " --useFloats " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ")
def _fakeTransformPhenotypes_():
    import os
    res_dir = "/Network/Data/250k/tmp-bvilhjal/emma_results/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_042109.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    for p_i in transformationMap:
        (trans_type, data_type) = transformationMap[p_i]
        print trans_type
        if trans_type == 1 or trans_type == 6:
            t_type = "logTransform"
        elif trans_type == 9:
            t_type = "logTransform_f192"
        elif trans_type == 5:
            t_type = "logTransform_noOutliers"
        elif trans_type == 7:
            t_type = "ranks"
        elif trans_type == 8:
            t_type = "neg_const_logTransform"
        else:  #trans_type == 0
            t_type = "raw"
        phenName = phed.getPhenotypeName(p_i)
        oldName = res_dir + "Emma_" + t_type + "_" + phenName + ".pvals"
        newName = res_dir + "Emma_trans_" + phenName + ".pvals"
        cp_command = "sudo cp " + oldName + " " + newName
        print cp_command
        os.system(cp_command)
def runParallel(phenotypeIndex, id=""): # Cluster specific parameters phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t") # Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/", "_div_") phenName = phenName.replace("*", "_star_") outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id shstr = """#!/bin/csh #PBS -l walltime=24:00:00 #PBS -l mem=6g #PBS -q cmb """ shstr += "#PBS -N C" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " " if sampleNum: shstr += " -n " + str(sampleNum) + " " if useFloats: shstr += " --useFloats " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " " shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n" f = open(parallel + ".sh", "w") f.write(shstr) f.close() # Execute qsub script os.system("qsub " + parallel + ".sh ")
def _test_():
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_102208.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    for p_i in range(215, 216):
        phenName = phed.getPhenotypeName(p_i)
        drawHistogram(phed, p_i, title=phenName)
        phed.logTransform(p_i)
        drawHistogram(phed, p_i, title=phenName)
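#A self-contained sketch of the before/after comparison that _test_ performs
#above. It uses synthetic right-skewed data and numpy/pylab directly rather
#than phenotypeData's own drawHistogram and logTransform (whose handling of
#non-positive values is not shown here), so it is an illustration only.
def _log_transform_histogram_demo_():
    import numpy as np
    import pylab
    vals = np.random.lognormal(mean=1.0, sigma=0.8, size=200)  #skewed "phenotype"
    pylab.subplot(1, 2, 1)
    pylab.hist(vals, bins=20)
    pylab.title("raw")
    pylab.subplot(1, 2, 2)
    pylab.hist(np.log(vals), bins=20)  #the log pulls in the long right tail
    pylab.title("log-transformed")
    pylab.show()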
def _test1_():
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_042109.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    print "phenName, data_type, transformation_type"
    for ctg in [1, 2, 3, 4]:
        phenIds = phenotypeData.categories_2_phenotypes[ctg]
        for pi in phenIds:
            phenName = phed.getPhenotypeName(pi)
            (trans_type, data_type) = transformationMap[pi]
            print str(phenName) + ", " + str(datatypeDict[data_type]) + ", " + str(transformationTypes[trans_type])
def _drawPowerQQPlots_(phenotypeIndices=None, res_path="/Network/Data/250k/tmp-bvilhjal/power_analysis/results/", runId="gwPlot"):
    """
    Draws all the GWA plots for 6 methods.
    """
    import plotResults
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_120308.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    if not phenotypeIndices:
        phenotypeIndices = phed.phenIds
    #mainRTs = ["", "_original192", "_original192_inverse", "_original96", "_original96_inverse"]  #FIXME: add full result
    #mainLabels = ["Full data", "192 acc. overlap", "192 acc. complement", "96 acc. overlap", "96 acc. complement"]
    mainRTs = ["", "_original192", "_original96", "_latitude60", "_latitude55",
               "_original192_latitude60", "_original192_latitude55"]  #FIXME: add full result
    mainLabels = ["Full data", "192 acc. overlap", "96 acc. overlap", "Latitude < 60", "Latitude < 55",
                  "192 acc. overl. and lat. < 60", "192 acc. overl. and lat. < 55"]
    permRTs = []  #["permTest", "permTest"]
    colors = []  #[[0.6, 0.8, 0.6], [0.6, 0.6, 0.8]]
    perm_counts = []  #[10, 10]
    perm_sample_sizes = []  #[65, 112]  #[170, 96]
    permLabels = []  #["random 65", "random 112"]
    for p_i in phenotypeIndices:
        mainResults = []
        phenName = phed.getPhenotypeName(p_i)
        pdfFile = res_path + phenName + "_log_QQplot.pdf"
        pngFile = res_path + phenName + "_log_QQplot.png"
        for i in range(0, len(mainRTs)):
            mainRT = mainRTs[i]
            name = mainLabels[i]
            filename = res_path + "KW_raw" + mainRT + "_" + phenName + ".pvals"
            rt = gwaResults.ResultType(resultType="KW", name=name)
            print "Loading", filename
            result = gwaResults.Result(filename, name=name, resultType=rt)
            mainResults.append(result)
        permResultsList = []
        for i in range(0, len(permRTs)):
            permResults = []
            permRT = permRTs[i]
            for j in range(0, perm_counts[i]):
                filename = res_path + "KW_raw_" + permRT + "_" + phenName + "_r" + str(perm_sample_sizes[i]) + "_" + str(j) + ".pvals"
                rt = gwaResults.ResultType(resultType="KW", name=permRT)
                print "Loading", filename
                result = gwaResults.Result(filename, name=permRT, resultType=rt)
                permResults.append(result)
            permResultsList.append((permResults, permLabels[i], colors[i]))
        drawPermLogQQPlot(mainResults, permResultsList, phenName=phenName, pdfFile=pdfFile, pngFile=pngFile)
        gc.collect()  #Calling the garbage collector, in an attempt to clean up memory..
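#drawPermLogQQPlot is defined elsewhere; the function below sketches only the
#core idea of a -log10 p-value QQ plot: sorted observed p-values are plotted
#against the quantiles expected under the uniform null, with a y=x reference
#line. Hypothetical helper, for illustration; it ignores the permutation bands
#the real plot adds.
def _log_qq_plot_demo_(pvals):
    import numpy as np
    import pylab
    n = len(pvals)
    obs = -np.log10(np.sort(np.array(pvals)))  #smallest p-value first, i.e. largest -log10
    exp = -np.log10((np.arange(1, n + 1) - 0.5) / n)  #matching uniform-null quantiles
    pylab.plot(exp, obs, ".")
    m = max(obs[0], exp[0])
    pylab.plot([0, m], [0, m], "k--")  #y = x: no departure from the null
    pylab.xlabel("expected $-log_{10}(p)$")
    pylab.ylabel("observed $-log_{10}(p)$")
    pylab.show()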
def _impute_FLC_192_():
    phed = pd.readPhenotypeFile("/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)
    seq_snpsd = dataParsers.parseCSVData(data_dir + "/flc_seqs_aln_imputed_snps_012710.csv")
    seq_snpsd.onlyBinarySnps()
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd)
def _countVals_():
    resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    phenotypeIndices = (phenotypeData.categories_2_phenotypes[1] + phenotypeData.categories_2_phenotypes[2]
                        + phenotypeData.categories_2_phenotypes[3] + phenotypeData.categories_2_phenotypes[4])
    print "total # of phenotypes:", phed.countPhenotypes()
    print "# of phenotypes analyzed:", len(phenotypeIndices)
    totalCounts = []
    for p_i in phenotypeIndices:
        valCount = phed.countValues(p_i)
        totalCounts.append(valCount)
    snpsDataFile = "/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
    import dataParsers, snpsdata
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  #,debug=True)
    snpsd = snpsdata.SNPsDataSet(snpsds, [1, 2, 3, 4, 5])
    phed.removeAccessionsNotInSNPsData(snpsd)
    overlappingCounts = []
    for p_i in phenotypeIndices:
        valCount = phed.countValues(p_i)
        overlappingCounts.append(valCount)
    #ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
    ecotypes_192 = _get192Ecotypes_()
    ecotypes_192 = [str(e) for e in ecotypes_192]
    print "len(ecotypes_192):", len(ecotypes_192)
    print ecotypes_192
    phed.filterAccessions(ecotypes_192)
    filename = resdir + "phen_value_count_new_data_012509_v2.txt"
    f = open(filename, "w")
    f.write("Phenotype, total_count, overlapping_count, 192_overlap_count\n")
    for i in range(0, len(phenotypeIndices)):
        p_i = phenotypeIndices[i]
        try:
            phenName = phed.getPhenotypeName(p_i)
            valCount = phed.countValues(p_i)
            f.write(str(phenName) + ", " + str(totalCounts[i]) + ", " + str(overlappingCounts[i]) + ", " + str(valCount) + "\n")
        except Exception:
            print "\nPhenotype index", p_i, "failed."
    f.close()
def _plot_local_FLC_haplotype_():
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    ad = sequences.readFastaAlignment(data_dir + "flc_seqs_aln_merged_011810.fasta",
                ref_seq_name=ref_seq_name, ref_start=ref_start, ref_chr=ref_chr,
                alignment_type="muscle", ref_direction=1)
    r = ad.get_snps(type=1)
    seq_snpsd = r['snpsd']
    seq_snpsd.filter_na_snps(max_na_rate=0.05)
#    seq_snpsd = dataParsers.parseCSVData(data_dir + "flc_seqs_aln_imputed_snps_012710.csv")[0]
#    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')
#    seq_snpsd.remove_accessions(map(str, ad.ecotypes))
    import phenotypeData as pd
    phend = pd.readPhenotypeFile("/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    #phend = phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv")
    import analyzeHaplotype as ah
    for i in phend.phenIds:
        phen_name = phend.getPhenotypeName(i)
        ah.plot_haplotypes(seq_snpsd.snps, seq_snpsd.accessions,
                    haplotypeFile="/Users/bjarnivilhjalmsson/tmp/flc_seq_haplotypes_old.pdf")
        for start, stop in [(3175500, 3176000), (3176000, 3176500), (3176500, 3177000),
                    (3177000, 3177500), (3177500, 3178000), (3178000, 3178500),
                    (3178500, 3179000)]:
            ah.plot_local_haplotypes("/Users/bjarnivilhjalmsson/tmp/flc_seq_haplotypes_"
                        + phen_name + "_" + str(start) + "_" + str(stop) + ".pdf",
                        seq_snpsd, start, stop, phenotypeData=phend, phen_id=i)
def __init__(self, phenotypeIndices=None, snpsds=None, results=None, results_map=None):
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    self.phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    #if snpsds:
    self.snpsds = snpsds
    #else:
    #    snpsDataFile = "/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
    #    self.snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    self.results_map = {}
    if results_map:
        self.results_map = results_map
    elif results:
        for result in results:
            if self.results_map.has_key(result.phenotypeID):
                self.results_map[result.phenotypeID].append(result)
            else:
                self.results_map[result.phenotypeID] = [result]
    elif phenotypeIndices:
        self.loadData(phenotypeIndices)
def _get192Ecotypes_():
    resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    phenotypeIndices = (phenotypeData.categories_2_phenotypes[1] + phenotypeData.categories_2_phenotypes[2]
                        + phenotypeData.categories_2_phenotypes[3] + phenotypeData.categories_2_phenotypes[4])
    total_accessions = set()
    for p_i in phenotypeIndices:
        if not p_i in [5, 6, 7]:
            accessions = phed.getAccessionsWithValues(p_i)
            total_accessions = total_accessions.union(accessions)
    ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
    ecotypes_192 = [str(e) for e in ecotypes_192]
    print "len(ecotypes_192):", len(ecotypes_192)
    #print ecotypes_192
    phed.filterAccessions(ecotypes_192)
    for p_i in [5, 6, 7]:
        accessions = phed.getAccessionsWithValues(p_i)
        total_accessions = total_accessions.union(accessions)
    total_accessions = list(total_accessions)
    print len(total_accessions)
    total_accessions.sort()
    print total_accessions
    ecotype_info_dict = phenotypeData._getEcotypeIdInfoDict_()
    ets = []
    i = 0
    for et in total_accessions:
        et = int(et)
        if ecotype_info_dict.has_key(et):
            print str(et) + ", " + str(ecotype_info_dict[et][0]) + ", " + str(ecotype_info_dict[et][1])
            i += 1
            ets.append(et)
        else:
            print et, "is missing in genotype data."
    print i
    return ets
def runParallel(phenotypeIndex):
    #Cluster specific parameters
    scriptDir = env.scriptDir
    resultDir = env.resultDir
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
    phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
    phenName = phed.getPhenotypeName(phenotypeIndex)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")
    outFile = resultDir + "CS_" + parallel + "_" + phenName
    shstr = """#!/bin/csh
#PBS -l walltime=72:00:00
#PBS -l mem=4g
#PBS -q cmb
"""
    shstr += "#PBS -N CS" + phenName + "_" + parallel + "\n"
    shstr += "(python " + scriptDir + "compositeScore.py -o" + outFile + " "
    shstr += ("--candGeneListID=" + str(candGeneListID)
              + " --testDataFraction=" + str(testDataFraction)
              + " --gridSize=" + str(gridSize)
              + " --windowSize=" + str(windowSize)
              + " --phenotypeCategory=" + str(phenotypeCategory)
              + " " + str(phenotypeIndex) + " ")
    shstr += "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n"
    f = open(parallel + ".sh", 'w')
    f.write(shstr)
    f.close()
    #Execute the qsub script
    os.system("qsub " + parallel + ".sh ")
def runParallel(phenotypeIndex):
    #Cluster specific parameters
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
    phenName = phed.getPhenotypeName(phenotypeIndex)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")
    impFileName = resultDir + "RF_" + parallel + "_" + phenName
    outFileName = impFileName
    shstr = """#!/bin/csh
#PBS -l walltime=50:00:00
"""
    shstr += "#PBS -l mem=" + mem + "\n"
    shstr += """
#PBS -q cmb
"""
    shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
    shstr += ("(python " + programDir + "RandomForest.py -o " + impFileName
              + " --chunkSize " + str(chunkSize) + " --nTrees " + str(nTrees)
              + " --mem " + str(mem) + " --round2Size " + str(round2Size))
    if nodeSize:
        shstr += " --nodeSize " + str(nodeSize) + " "
    if logTransform:
        shstr += " --logTransform "
    if not skipSecondRound:
        shstr += " --secondRound "
    shstr += " -a " + str(withArrayIds) + " "
    shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
    shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"
    f = open(parallel + ".sh", 'w')
    f.write(shstr)
    f.close()
    #Execute the qsub script
    os.system("qsub " + parallel + ".sh ")
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["outputFile=", "delim=", "missingval=", "phenotypeFileType=", "help",
                "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=",
                "onlyOriginal192", "onlyOriginal96", "subSample=", "subSampleLikePhenotype=",
                "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=",
                "srInput=", "sr", "srOutput=", "srPar=", "srSkipFirstRun", "permTest=",
                "savePermutations", "permutationFilter=", "testRobustness", "memReq=",
                "walltimeReq="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    outputFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    parallel = None
    parallelAll = False
    addToDB = False
    callMethodID = None
    comment = ""
    subSample = None
    onlyOriginal96 = False
    onlyOriginal192 = False
    subSampleLikePhenotype = None
    subsampleTest = False
    numSubSamples = None
    complement = False
    onlyBelowLatidue = None
    onlyAboveLatidue = None
    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000
    permTest = None
    savePermutations = False
    permutationFilter = 1.0
    testRobustness = False
    memReq = "5g"
    walltimeReq = "100:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--addToDB"):
            addToDB = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--subSample"):
            subSample = int(arg)
        elif opt in ("--subsampleTest"):
            subsampleTest = True
            l = arg.split(",")
            subSample = int(l[0])
            numSubSamples = int(l[1])
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--onlyAboveLatidue"):
            onlyAboveLatidue = float(arg)
        elif opt in ("--subSampleLikePhenotype"):
            subSampleLikePhenotype = int(arg)
        elif opt in ("--callMethodID"):
            callMethodID = int(arg)
        elif opt in ("--comment"):
            comment = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permTest"):
            permTest = int(arg)
        elif opt in ("--savePermutations"):
            savePermutations = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    print "Kruskal-Wallis is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "onlyAboveLatidue:", onlyAboveLatidue
    print "complement:", complement
    print "subSampleLikePhenotype:", subSampleLikePhenotype
    print "subsampleTest:", subsampleTest
    print "numSubSamples:", numSubSamples
    print "subSample:", subSample
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "permTest:", permTest
    print "savePermutations:", savePermutations
    print "permutationFilter:", permutationFilter
    print "testRobustness:", testRobustness
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, id=""):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        print phenName
        outputFile = resultDir + "KW_" + parallel + "_" + phenName + id
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N K" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "KW.py -o " + outputFile + " "
        if subSample:
            shstr += " --subSample=" + str(subSample) + " "
        elif onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        elif onlyAboveLatidue:
            shstr += " --onlyAboveLatidue=" + str(onlyAboveLatidue) + " "
        if complement:
            shstr += " --complement "
        if permTest:
            shstr += " --permTest=" + str(permTest) + " "
            if savePermutations:
                shstr += " --savePermutations "
        shstr += " --permutationFilter=" + str(permutationFilter) + " "
        if testRobustness:
            shstr += " --testRobustness "
        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "KW_" + parallel + "_" + phenName + ".sr.pvals"
                shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "KW_" + parallel + "_" + phenName + ".pvals"
                    shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(srWindowSize) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute the qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        elif subsampleTest:
            phenotypeIndex = int(args[2])
            for i in range(0, numSubSamples):
                runParallel(phenotypeIndex, id="_r" + str(subSample) + "_" + str(i))
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "output:", outputFile
    print "\nStarting program now!\n"

    #Load the phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')

    #If onlyOriginal96, then remove all other accessions..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which originate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    elif onlyAboveLatidue:
        print "Filtering for the accessions which originate above latitude", onlyAboveLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] > onlyAboveLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if subSampleLikePhenotype:
        p_name = phed.getPhenotypeName(subSampleLikePhenotype)
        print "Picking sample as in", p_name
        ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
        print ecotypes
        phed.filterAccessions(ecotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if subSample:
        ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
        sample_ecotypes = random.sample(ecotypes, subSample)
        phed.filterAccessions(sample_ecotypes)
        print "len(phed.accessions)", len(phed.accessions)

    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    #Load the genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal)

    #Check the overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter out accessions which do not have a phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Keep only the accessions that have both genotypes and phenotype values

    #Order the phenotype accessions as in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Filter monomorphic SNPs
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Convert the data format to 0/1
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""

    #Double-check the genotype file:
    problems = 0
    for i in range(0, len(newSnpsds)):
        snpsd = newSnpsds[i]
        for j in range(0, len(snpsd.snps)):
            snp = snpsd.snps[j]
            sc = snp.count(0)
            if sc == 0 or sc == len(snp):
                print "Problem in file found at chr,pos", (i + 1), ",", snpsd.positions[j]
                problems += 1
    if problems > 0:
        print "Genotype file appears to have potential problems"
    else:
        print "Genotype file appears to be good"

    if permTest:
        print "Starting a permutation test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
            permTest = 100
        _perm_test_(allSNPs, phenVals, permTest, outputFile, test_type=test_type,
                    savePermutations=savePermutations, filter=permutationFilter)
        sys.exit(0)

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
        _robustness_test_(allSNPs, phenVals, outputFile, test_type=test_type, filter=permutationFilter)
        sys.exit(0)

    sys.stdout.flush()
    print "sr:", sr, ", srSkipFirstRun:", srSkipFirstRun
    #Defined up front, since both the first-stage and the second-run branches below need them.
    phenotypeName = phed.getPhenotypeName(phenotypeIndex)
    pvalFile = outputFile + ".pvals"
    if (not sr) or (sr and not srSkipFirstRun):
        sd = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        if phed.isBinary(phenotypeIndex):
            pvals = run_fet(sd.getSnps(), phed.getPhenVals(phenotypeIndex))
        else:
            snps = sd.getSnps()
            phen_vals = phed.getPhenVals(phenotypeIndex)
            try:
                kw_res = util.kruskal_wallis(snps, phen_vals)
                pvals = kw_res['ps']
            except:
                print snps
                print phen_vals
                print len(snps), len(snps[0]), len(phen_vals)
                raise Exception
        res = gwaResults.Result(scores=pvals, name="KW_" + phenotypeName, snpsds=newSnpsds, load_snps=False)
        res.writeToFile(pvalFile)
        print "Generating a GW plot."
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals",
                    ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
        srInput = pvalFile
    else:
        print "Skipping first stage analysis."
        sys.stdout.flush()

    if sr:
        #Is the phenotype binary?
        binary = phed.isBinary(phenotypeIndex)
        _secondRun_(srOutput, srInput, srTopQuantile, srWindowSize, newSnpsds, phed, phenotypeIndex, binary=binary)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput, name="KW_" + phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput, name="KW_SR_" + phenotypeName, phenotypeID=phenotypeIndex)
        srRes.negLogTransform()
        srPngFile = pvalFile + ".sr.png"
        plotResults.plotResultWithSecondRun(res, srRes, pngFile=srPngFile, ylab="$-$log$_{10}(p)$", plotBonferroni=True)
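#util.kruskal_wallis used above is this codebase's own vectorized
#implementation. For reference, an equivalent per-SNP computation can be done
#with scipy's Kruskal-Wallis H-test, sketched below under the assumption of
#bi-allelic SNPs coded 0/1 with no missing values (hypothetical helper, not
#part of the original script).
def _kw_pvals_demo_(snps, phen_vals):
    from scipy import stats
    pvals = []
    for snp in snps:
        #Split the phenotype values into the two allele groups and test.
        g0 = [v for s, v in zip(snp, phen_vals) if s == 0]
        g1 = [v for s, v in zip(snp, phen_vals) if s == 1]
        pvals.append(stats.kruskal(g0, g1)[1])
    return pvals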
def _plotKW_():
    """
    Analyze how population structure affects KW.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"
    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)
    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps
    #To save memory, keep only a random subset of the SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps
    #globalKinship = calcKinship(totalSNPs)
    gc.collect()  #Calling the garbage collector, in an attempt to clean up memory..
    #chr = 1
    #for snpsd in snpsds:
    snpsd = snpsds[3]
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])  #runEmma(phed, p_i, k, snps)
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")
    (pvals, new_positions, acc_groups) = get_KW_pvals(snpsd.snps[200:1400], snpsd.positions[200:1400],
                phed, p_i, kinshipThreshold=0.95, method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()
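#calcKinship is defined elsewhere in this codebase. A common definition for
#0/1-coded SNPs is the average allele-sharing (IBS) fraction over all SNPs for
#each pair of accessions, sketched below with numpy. This is an assumption
#about what calcKinship computes, shown for illustration only.
def _ibs_kinship_demo_(snps):
    import numpy as np
    m = np.array(snps)  #SNPs x accessions matrix, coded 0/1
    num_acc = m.shape[1]
    k = np.zeros((num_acc, num_acc))
    for i in range(num_acc):
        for j in range(i, num_acc):
            k[i, j] = np.mean(m[:, i] == m[:, j])  #fraction of shared alleles
            k[j, i] = k[i, j]  #the kinship matrix is symmetric
    return k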
def _plotRobustnessTests_():
    import csv
    import numpy as np
    import matplotlib.cm as cm
    import matplotlib.pyplot as plt
    resdir = "/Network/Data/250k/tmp-bvilhjal/robustness_test/"
    fig_dir = "/Users/bjarni/tmp/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    phenotypeIndices = (phenotypeData.categories_2_phenotypes[1] + phenotypeData.categories_2_phenotypes[2]
                        + phenotypeData.categories_2_phenotypes[3] + phenotypeData.categories_2_phenotypes[4])

    #First KW..
    kw_sd_list = []
    kw_log_pvalues = []
    found_phenotypes = []
    for p_i in phenotypeIndices:
        try:
            phenName = phed.getPhenotypeName(p_i)
            filename = resdir + "KW_rob_f1_" + phenName + ".rob.log_pvals_sd"
            print "Loading", filename, "..."
            reader = csv.reader(open(filename, "rb"))
            reader.next()  #Skip the header
            for row in reader:
                kw_log_pvalues.append(float(row[0]))
                kw_sd_list.append(float(row[1]))
            found_phenotypes.append(p_i)
        except Exception:
            print p_i, "failed."
    xs = np.array(kw_log_pvalues)
    ys = np.array(kw_sd_list)
    print len(kw_sd_list), len(kw_log_pvalues)
    xmin = xs.min()
    xmax = xs.max()
    ymin = ys.min()
    ymax = ys.max()
    #plt.subplots_adjust(hspace=0.5)
    #plt.subplot(121)
    plt.hexbin(xs, ys, bins='log', cmap=cm.jet)
    plt.axis([xmin, xmax, ymin, ymax])
    cb = plt.colorbar()
    cb.set_label('$log_{10}(N)$')
    plt.ylabel("SD$(\Delta log(p))$")
    plt.xlabel("$log(p)$")
    plt.savefig(fig_dir + "KW_overall_robustness.png", format="png")
    plt.clf()

    #Then Emma..
    emma_sd_list = []
    emma_log_pvalues = []
    found_phenotypes = []
    for p_i in phenotypeIndices:
        try:
            phenName = phed.getPhenotypeName(p_i)
            filename = resdir + "Emma_rob_f1_" + phenName + ".rob.log_pvals_sd"
            print "Loading", filename, "..."
            reader = csv.reader(open(filename, "rb"))
            reader.next()  #Skip the header
            for row in reader:
                emma_log_pvalues.append(float(row[0]))
                emma_sd_list.append(float(row[1]))
            found_phenotypes.append(p_i)
        except Exception:
            print p_i, "failed."
    xs = np.array(emma_log_pvalues)
    ys = np.array(emma_sd_list)
    print len(emma_sd_list), len(emma_log_pvalues)
    xmin = xs.min()
    xmax = xs.max()
    ymin = ys.min()
    ymax = ys.max()
    plt.hexbin(xs, ys, bins='log', cmap=cm.jet)
    plt.axis([xmin, xmax, ymin, ymax])
    cb = plt.colorbar()
    cb.set_label('$log_{10}(N)$')
    plt.ylabel("SD$(\Delta log(p))$")
    plt.xlabel("$log(p)$")
    plt.savefig(fig_dir + "Emma_overall_robustness.png", format="png")
    plt.clf()
def _run_():
    import os
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["id=", "chr=", "numARG=", "numMarkers=", "numPerm=", "smartCutoff=",
                "BoundaryStart=", "BoundaryEnd=", "binary", "delim=", "missingval=",
                "withArrayId=", "phenotypeFileType=", "debug", "parallel=", "parallelAll",
                "help", "scoreFile="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:s:c:d:m:a:bh", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    import tempfile
    tempfile.tempdir = '/tmp'
    (fId, id) = tempfile.mkstemp()
    os.close(fId)

    scoreFile = None
    chr = None
    numARG = 30
    numMarkers = 100
    numPerm = 0
    smartCutoff = 10
    binary = False
    delim = ","
    missingVal = "NA"
    debug = None
    report = None
    help = 0
    withArrayId = 0
    boundaries = [-1, -1]
    phenotypeFileType = 1
    parallel = None
    parallelAll = False
    snpsDataFile = None

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-i", "--id"):
            id = '/tmp/' + arg
        elif opt in ("-s", "--scoreFile"):
            scoreFile = arg
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("--numARG"):
            numARG = int(arg)
        elif opt in ("--numMarkers"):
            numMarkers = int(arg)
        elif opt in ("--numPerm"):
            numPerm = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--smartCutoff"):
            smartCutoff = int(arg)
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--binary"):
            binary = True
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayId = int(arg)
        elif opt in ("-b", "--debug"):
            debug = 1

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    if boundaries[0] == boundaries[1] and boundaries[0] == -1:
        boundaries = None
    margFile = id + ".marg"
    outFile = margFile + ".out"

    def runParallel(phenotypeIndex):
        #Cluster specific parameters
        #margdir = '/home/cmb-01/bvilhjal/Projects/Python-snps/'
        resultDir = env.results_dir  #'/home/cmb-01/bvilhjal/results/'
        import phenotypeData
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outFileName = resultDir + "Marg_" + parallel + "_" + phenName
        scoreFile = outFileName + ".score"
        shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
#PBS -l mem=4g
#PBS -q cmb
"""
        shstr += "#PBS -N M" + phenName + "_" + parallel + "\n"
        #shstr += "(python " + margdir + "margarita.py "
        shstr += "(python " + env.script_dir + "margarita.py "
        if phed.isBinary(phenotypeIndex):
            shstr += " --binary "
        shstr += " -s " + scoreFile
        shstr += " -a " + str(withArrayId) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + ".out) >& " + outFileName + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute the qsub script
        os.system("qsub " + parallel + ".sh ")
    #Nested function ends

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        if len(args) > 2:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
            return
        else:
            snpsDataFile = args[0]
            if not parallelAll:
                phenotypeIndex = int(args[1])
                runParallel(phenotypeIndex)
                return
        import phenotypeData
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
        for phenotypeIndex in phed.phenIds:
            runParallel(phenotypeIndex)
        return

    phenotypeIndex = int(args[2])

    #Print out information about this run...
    print "Preparing a blended margarita...."
    print "Num ARG:", numARG
    print "Num Markers:", numMarkers
    print "Num Permutations:", numPerm
    print "Smart cutoff:", smartCutoff
    print "Binary:", binary
    print "ScoreFile:", scoreFile

    import dataParsers, snpsdata, phenotypeData
    #phenotypeFile = "/Users/bjarni/Projects/Python-snps/tinaPhenos_041808.csv"
    if phenotypeFileType == 1:
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get phenotype data
    elif phenotypeFileType == 2:
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, accessionDecoder=dataParsers.accessionName2EcotypeId, type=2)
    snpsds = dataParsers.parseCSVData(snpsDataFile, deliminator=delim, missingVal=missingVal,
                withArrayIds=bool(withArrayId))  #Get SNPs data

    marg = Margarita(margFile, outFile, numARG, numMarkers, numPerm, smartCutoff)
    if chr:
        snpsd = snpsds[chr - 1].getSnpsData()
        marg.gwa(snpsd, phed, phenotype=phenotypeIndex, boundaries=boundaries, chromosome=chr, binary=binary)
    else:
        scoreStr = ""
        for chr in [0, 1, 2, 3, 4]:
            snpsd = snpsds[chr].getSnpsData()
            (newRStr, newScoreStr, permPvals) = marg.gwa(snpsd, phed, phenotype=phenotypeIndex,
                        boundaries=boundaries, chromosome=chr + 1, binary=binary)
            scoreStr += newScoreStr
        f = open(scoreFile, 'w')
        f.write(scoreStr)
        f.close()
def _generate_250K_2010_FLC_data_(impute=True):
    """
    Create a combined version of 250K, overlapping with the FLC phenotypes.
    Then merge with the 2010 data (including indels).
    Then merge with the FLC sequences.
    Impute missing SNPs, and write the result to a file.
    """
    import phenotypeData as pd
    import env
    phed = pd.readPhenotypeFile("/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    d2010_file = env.home_dir + "Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_accessions(phed.accessions)
    d2010_sd.filter_na_snps()
    d2010_sd.filter_maf_snps(0.05)
    #d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)
    d250k_sd.merge_snps_data(d2010_sd)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    seq_file = env.home_dir + "Projects/FLC_analysis/flc_seqs_aln_merged_050410.fasta"
    ad = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name, ref_start=ref_start,
                ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
#    ref_start = 3170500
#    ad2 = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name, ref_start=ref_start,
#                ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
#    ref_start = 3170502
#    ad3 = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name, ref_start=ref_start,
#                ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    r = ad.get_snps(type=0)
    seq_snpsd1 = r['snpsd']
    seq_snpsd1.merge_data(r['indels'], error_threshold=0.0)
#    r2 = ad2.get_snps(type=0)
#    seq_snpsd2 = r2['snpsd']
#    seq_snpsd2.merge_data(r2['indels'], error_threshold=0.0)
#    r3 = ad3.get_snps(type=0)
#    seq_snpsd3 = r3['snpsd']
#    seq_snpsd3.merge_data(r3['indels'], error_threshold=0.0)
    print "Now merging data.."
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd1)
#    d250k_sd.snpsDataList[4].compareWith(seq_snpsd2)
#    d250k_sd.snpsDataList[4].compareWith(seq_snpsd3)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd1, union_accessions=False)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()
    d250k_sd.snpsDataList[4].impute_data()
    d250k_sd.writeToFile("/tmp/test.csv")
    print "YEAH!"
def _run_(): if len(sys.argv)==1: print __doc__ sys.exit(2) long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , "subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "permTest=", "savePermutations", "permutationFilter=", "testRobustness"] try: opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType=1 outputFile=None delim="," missingVal="NA" help=0 withArrayIds=1 parallel=None parallelAll=False addToDB=False callMethodID=None comment="" subSample=None onlyOriginal96=False onlyOriginal192 = False subSampleLikePhenotype = None subsampleTest = False numSubSamples = None complement = False onlyBelowLatidue = None onlyAboveLatidue = None sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 permTest = None savePermutations = False permutationFilter = 1.0 testRobustness = False for opt, arg in opts: if opt in ("-h", "--help"): help=1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds=int(arg) elif opt in ("-o", "--outputFile"): outputFile=arg elif opt in ("--phenotypeFileType"): phenotypeFileType=int(arg) elif opt in ("--parallel"): parallel=arg elif opt in ("--parallelAll"): parallelAll=True elif opt in ("--addToDB"): addToDB=True elif opt in ("--onlyOriginal96"): onlyOriginal96=True elif opt in ("--onlyOriginal192"): onlyOriginal192=True elif opt in ("--complement"): complement=True elif opt in ("--subSample"): subSample=int(arg) elif opt in ("--subsampleTest"): subsampleTest = True l = arg.split(",") subSample=int(l[0]) numSubSamples=int(l[1]) elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue=float(arg) elif opt in ("--onlyAboveLatidue"): onlyAboveLatidue=float(arg) elif opt in ("--subSampleLikePhenotype"): subSampleLikePhenotype=int(arg) elif opt in ("--callMethodID"): callMethodID=int(arg) elif opt in ("--comment"): comment=arg elif opt in ("-d", "--delim"): delim=arg elif opt in ("-m", "--missingval"): missingVal=arg elif opt in ("--sr"): sr = True elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permTest"): permTest = int(arg) elif opt in ("--savePermutations"): savePermutations = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile=args[0] phenotypeDataFile=args[1] print "Kruskal-Wallis is being set up with the following parameters:" print "phenotypeDataFile:",phenotypeDataFile print "snpsDataFile:",snpsDataFile print "parallel:",parallel print "parallelAll:",parallelAll print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "onlyAboveLatidue:",onlyAboveLatidue print "subSampleLikePhenotype:",subSampleLikePhenotype print "subsampleTest:",subsampleTest print "numSubSamples:",numSubSamples print 
"subSample:",subSample print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "permTest:",permTest print "savePermutations:",savePermutations print "permutationFilter:",permutationFilter print "testRobustness:",testRobustness def runParallel(phenotypeIndex,id=""): #Cluster specific parameters phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) phenName=phenName.replace("/", "_div_") phenName=phenName.replace("*", "_star_") outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr="""#!/bin/csh #PBS -l walltime=100:00:00 #PBS -l mem=4g #PBS -q cmb """ shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " shstr+=" -a "+str(withArrayIds)+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") if parallel: #Running on the cluster.. if parallelAll: phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) elif subsampleTest: phenotypeIndex=int(args[2]) for i in range(0,numSubSamples): runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i)) else: phenotypeIndex=int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex=int(args[2]) print "phenotypeIndex:",phenotypeIndex print "output:",outputFile print "\nStarting program now!\n" #Load phenotype file phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data #If onlyOriginal96, then remove all other phenotypes.. 
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyBelowLatidue:
        print "Filtering for the accessions which originate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))  #Keep accessions with unknown latitude.
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    elif onlyAboveLatidue:
        print "Filtering for the accessions which originate above latitude", onlyAboveLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] > onlyAboveLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))  #Keep accessions with unknown latitude.
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if subSampleLikePhenotype:
        p_name = phed.getPhenotypeName(subSampleLikePhenotype)
        print "Picking sample as in", p_name
        ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
        phed.filterAccessions(ecotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if subSample:
        ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
        sample_ecotypes = random.sample(ecotypes, subSample)
        phed.filterAccessions(sample_ecotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()
    #Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
    #Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break
    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)
    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    #Double check genotype file:
    problems = 0
    for i in range(0, len(newSnpsds)):
        snpsd = newSnpsds[i]
        for j in range(0, len(snpsd.snps)):
            snp = snpsd.snps[j]
            sc = snp.count(0)
            if sc == 0 or sc == len(snp):  #SNP j is still monomorphic.
                print "Problem in file found at chr,pos", (i + 1), ",", snpsd.positions[j]
                problems += 1
    if problems > 0:
        print "Genotype file appears to have potential problems"
    else:
        print "Genotype file appears to be good"
    if permTest:
        print "Starting a permutation test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
            permTest = 100
        _perm_test_(allSNPs, phenVals, permTest, outputFile, test_type=test_type, savePermutations=savePermutations, filter=permutationFilter)
        sys.exit(0)
    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
        _robustness_test_(allSNPs, phenVals, outputFile, test_type=test_type, filter=permutationFilter)
        sys.exit(0)
    sys.stdout.flush()
    phenotypeName = phed.getPhenotypeName(phenotypeIndex)
    binary = phed.isBinary(phenotypeIndex)  #Is the phenotype binary?
    pvalFile = outputFile + ".pvals"
    print "sr:", sr, ", srSkipFirstRun:", srSkipFirstRun
    if (not sr) or (sr and not srSkipFirstRun):
        #Writing files
        if env.user == "bjarni":
            tempfile.tempdir = '/tmp'
        (fId, phenotypeTempFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, genotypeTempFile) = tempfile.mkstemp()
        os.close(fId)
        phed.writeToFile(phenotypeTempFile, [phenotype])
        sys.stdout.write("Phenotype file written\n")
        sys.stdout.flush()
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: 'NA'}
        snpsDataset.writeToFile(genotypeTempFile, deliminator=delim, missingVal=missingVal, withArrayIds=0, decoder=decoder)
        sys.stdout.write("Genotype file written\n")
        sys.stdout.flush()
        rDataFile = outputFile + ".rData"
        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name=phenotypeName, binary=binary)
        rFileName = outputFile + ".r"
        f = open(rFileName, 'w')
        f.write(rstr)
        f.close()
        outRfile = rFileName + ".out"
        errRfile = rFileName + ".err"
        print "Running R file:"
        cmdStr = "(R --vanilla < " + rFileName + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        gc.collect()
        os.system(cmdStr)
        print "Generating a GW plot."
        res = gwaResults.Result(pvalFile, name="KW_" + phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
        srInput = pvalFile
    else:
        print "Skipping first stage analysis."
        sys.stdout.flush()
    if sr:
        _secondRun_(srOutput, srInput, srTopQuantile, srWindowSize, newSnpsds, phed, phenotypeIndex, binary=binary)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput, name="KW_" + phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput, name="KW_SR_" + phenotypeName, phenotypeID=phenotypeIndex)
        srRes.negLogTransform()
        srPngFile = pvalFile + ".sr.png"
        plotResults.plotResultWithSecondRun(res, srRes, pngFile=srPngFile, ylab="$-$log$_{10}(p)$", plotBonferroni=True)
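#The script above delegates the actual Kruskal-Wallis scan to R. A minimal sketch of the
#same per-SNP test in Python, assuming scipy is available and that each snp is a 0/1 list
#aligned with phenVals (as in the variables used above); an illustration, not the pipeline's
#own implementation.
def _kw_scan_sketch_(snps, phenVals):
    from scipy import stats
    pvals = []
    for snp in snps:
        g0 = [phenVals[i] for i in range(len(snp)) if snp[i] == 0]
        g1 = [phenVals[i] for i in range(len(snp)) if snp[i] == 1]
        if not g0 or not g1:  #Monomorphic SNP, no test possible.
            pvals.append(1.0)
            continue
        (h, p) = stats.kruskal(g0, g1)
        pvals.append(p)
    return pvals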
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=",
            "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll", "nodeSize=",
            "mem=", "round2Size=", "secondRound", "minMAF="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    phenotypeFileType = 1; impFile = None; delim = ","; missingVal = "NA"
    help = 0; withArrayIds = 1; parallel = None; logTransform = False; parallelAll = False
    chunkSize = 250000; round2Size = 5000; nTrees = 15000; nodeSize = None
    mem = "8g"; skipSecondRound = True; minMAF = 0.0
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg)
        elif opt in ("-o", "--rFile"): impFile = arg
        elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg)
        elif opt in ("--parallel"): parallel = arg
        elif opt in ("--parallelAll"): parallelAll = True
        elif opt in ("--logTransform"): logTransform = True
        elif opt in ("--secondRound"): skipSecondRound = False
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("--chunkSize"): chunkSize = int(arg)
        elif opt in ("--round2Size"): round2Size = int(arg)
        elif opt in ("--nTrees"): nTrees = int(arg)
        elif opt in ("--nodeSize"): nodeSize = int(arg)
        elif opt in ("--mem"): mem = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        elif opt in ("--minMAF"): minMAF = float(arg)
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName
        shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
"""
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
        shstr += "(python " + programDir + "RandomForest.py -o " + impFileName + " --chunkSize " + str(chunkSize) + " --nTrees " + str(nTrees) + " --mem " + str(mem) + " --round2Size " + str(round2Size) + ""
        if nodeSize: shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform: shstr += " --logTransform "
        if not skipSecondRound: shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")
    #Nested function ends

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])
    print "chunkSize:", chunkSize
    print "nTrees:", nTrees
    print "nodeSize:", nodeSize
    print "mem:", mem
    print "logTransform:", logTransform
    print "round2Size:", round2Size
    print "skipSecondRound:", skipSecondRound
    #Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    #Load phenotype file
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break
    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)
    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotype)
    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    #Remove low minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filtering SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)
    #Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    snpsds = newSnpsds
    #Writing files
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()
    #Flatten the per-chromosome data into parallel lists, so it can be re-chunked below.
    chromosomes = []
    positions = []
    snps = []
    for i in range(0, len(snpsds)):
        snpsd = snpsds[i]
        positions += snpsd.positions
        snps += snpsd.snps
        chrList = [i + 1] * len(snpsd.positions)
        chromosomes += chrList
    binary = phed.isBinary(phenotypeIndex)  #Is the phenotype binary?
    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"
    try:
        os.remove(impFile)  #Removing file if it already exists.
    except Exception:
        print "Couldn't remove", impFile
    try:
        os.remove(topImpFile)  #Removing file if it already exists.
    except Exception:
        print "Couldn't remove", topImpFile
    for startIndex in range(0, len(positions), chunkSize):
        if startIndex + chunkSize >= len(positions):
            endIndex = len(positions)
        else:
            endIndex = startIndex + chunkSize
        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(startIndex, endIndex):
            outStr = ""
            snp = util.valListToStrList(snps[i])
            outStr += str(chromosomes[i]) + "," + str(positions[i]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()
        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        print "Running model nr", startIndex / chunkSize, ":"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print "Random forest output saved in", impFile
    if not skipSecondRound:
        #Run on the top 'round2Size' number of hits.
        #Loading the R output file.
        impF = open(impFile, "r")
        lines = impF.readlines()
        impF.close()
        impList = list()
        for i in range(1, len(lines)):  #Line 0 is assumed to be a header.
            line = lines[i].strip()
            l = line.split(",")
            impList.append((float(l[2]), l[0], l[1], snps[i - 1]))  #SNP i-1 corresponds to line i.
        impList.sort()
        impList.reverse()
        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(0, round2Size):
            outStr = ""
            snp = util.valListToStrList(impList[i][3])
            outStr += str(impList[i][1]) + "," + str(impList[i][2]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()
        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        print "Running randomForest on the top importance scores:"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
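#The function above drives R's randomForest package chunk by chunk. An illustrative sketch
#of the same chunked importance scoring in Python, using scikit-learn instead of R; sklearn
#and the chunk/tree counts here are assumptions, and this is not the pipeline's own code.
def _rf_importance_sketch_(snps, phenVals, chunkSize=1000, nTrees=100):
    import numpy
    from sklearn.ensemble import RandomForestRegressor
    importances = []
    for start in range(0, len(snps), chunkSize):
        chunk = snps[start:start + chunkSize]
        X = numpy.array(chunk).T  #accessions x SNPs
        y = numpy.array(phenVals)
        rf = RandomForestRegressor(n_estimators=nTrees)
        rf.fit(X, y)
        importances.extend(rf.feature_importances_)  #One importance score per SNP.
    return importances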
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["outputSNPsFile=", "outputPhenotFile=", "filterMonomorphic", "rawDataFormat",
            "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=", "phenotypeName=",
            "calcKinshipMatrix=", "orderAccessions", "minCallProb=", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    inputFile = args[0]
    output_fname = None; outputPhenotFile = None; delim = ","; missingVal = "NA"
    phenotypeFile = None; kinshipMatrixFile = None; phenotype = None; phenotypeName = None
    rawDataFormat = False; monomorphic = False; help = 0; withArrayIds = 1; orderAccessions = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg)
        elif opt in ("-f", "--phenotypeFile"): phenotypeFile = arg
        elif opt in ("--calcKinshipMatrix"): kinshipMatrixFile = arg
        elif opt in ("--filterMonomorphic"): monomorphic = True
        elif opt in ("--rawDataFormat"): rawDataFormat = True
        elif opt in ("--minCallProb"): minCallProb = float(arg)
        elif opt in ("-p", "--phenotype"): phenotype = int(arg)
        elif opt in ("-o", "--outputSNPsFile"): output_fname = arg
        elif opt in ("--orderAccessions"): orderAccessions = True
        elif opt in ("-u", "--outputPhenotFile"): outputPhenotFile = arg
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)
    if not output_fname:
        print output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2
    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
    if phenotypeFile:
        import phenotypeData
        phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")  #Get Phenotype data
        accIndicesToKeep = []
        phenAccIndicesToKeep = []
        numAcc = len(snpsds[0].accessions)
        if phenotype >= 0:
            #Load phenotype file
            sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break
        elif phenotype == None:
            sys.stdout.write("Removing accessions which do not have any phenotype values.")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2:
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break
        #Filter accessions which do not have the phenotype value.
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
        if outputPhenotFile:
            print "Filtering phenotype data."
            phed.removeAccessions(phenAccIndicesToKeep)
            if orderAccessions:
                accessionMapping = []
                i = 0
                for acc in snpsds[0].accessions:
                    if acc in phed.accessions:
                        accessionMapping.append((phed.accessions.index(acc), i))
                        i += 1
                phed.orderAccessions(accessionMapping)
            if phenotype >= 0:
                phed.writeToFile(outputPhenotFile, [phenotype])
            else:
                phed.writeToFile(outputPhenotFile)
    #Filtering monomorphic
    if monomorphic:
        print "Filtering monomorphic SNPs"
        for snpsd in snpsds:
            print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    import snpsdata
    newSnpsds = []
    if not rawDataFormat:
        sys.stdout.write("Converting data format")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newSnpsds.append(snpsd.getSnpsData())
        print ""
        waid1 = 0
        snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: "NA"}
    else:
        snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])
        decoder = None
    snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, decoder=decoder)
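#A small sketch of what the decoder above amounts to for a single SNP row; the internal
#1/0/-1 coding comes from getSnpsData() as used above, and this is only an illustration.
def _decode_snp_sketch_(snp, decoder=None):
    if decoder is None:
        decoder = {1: 1, 0: 0, -1: 'NA'}  #Same decoder as passed to writeToFile above.
    return [decoder[allele] for allele in snp]
#e.g. _decode_snp_sketch_([1, 0, -1, 1]) == [1, 0, 'NA', 1]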
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=", "parallelAll", "useFloats"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    phenotypeFileType = 1; outputFile = None; delim = ","; missingVal = "NA"
    help = 0; withArrayIds = 1; parallel = None; parallelAll = False
    sampleNum = None; chromosomes = [1, 2, 3, 4, 5]; useFloats = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"): outputFile = arg
        elif opt in ("--parallel"): parallel = arg
        elif opt in ("--parallelAll"): parallelAll = True
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        elif opt in ("-n", "--sampleNum"): sampleNum = int(arg)
        elif opt in ("--useFloats"): useFloats = True
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)
    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    print "CAMP is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    if len(args) > 2: print "Phenotype_id:", args[2]
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "sampleNum:", sampleNum

    def runParallel(phenotypeIndex, id=""):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id
        shstr = """#!/bin/csh
#PBS -l walltime=24:00:00
#PBS -l mem=6g
#PBS -q cmb
"""
        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum: shstr += " -n " + str(sampleNum) + " "
        if useFloats: shstr += " --useFloats "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"
        f = open(parallel + ".sh", "w")
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])
    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
    #Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
    #Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break
    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)
    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    #Writing phenotype data to CAMP format.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = map(int, phenVals)
    phenFile = open(phenotypeFile, "w")
    for value in phenVals:
        phenFile.write(str(value) + "\n")
    phenFile.close()
    chromosome_list = []
    positions_list = []
    scores_list = []
    interaction_positions_list = []
    mafs = []
    marfs = []
    #Writing SNP data to CAMP format.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        sf = open(snpsFile, "w")
        pf = open(posFile, "w")
        snpsd = newSnpsds[chromosome - 1]
        for i in range(0, len(snpsd.snps)):
            snp = snpsd.snps[i]
            (marf, maf) = snpsdata.getMAF(snp)
            marfs.append(marf)
            mafs.append(maf)
            str_snp = map(str, snp)
            double_snp = []
            for nt in str_snp:
                double_snp.append(nt)
                double_snp.append(nt)  #Double each allele character (homozygous diploid coding).
            sf.write("".join(double_snp) + "\n")
            pf.write(str(snpsd.positions[i]) + "\n")
        sf.close()
        pf.close()
        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum)
        positions_list += results["positions"]
        scores_list += results["scores"]
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):  #Interaction pairs get placeholder frequencies.
                marfs.append(0.5)  #An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)
    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
    for i in range(0, len(positions_list)):
        chromosome = chromosome_list[i]
        (pos1, pos2) = positions_list[i]
        score = scores_list[i]
        marf = marfs[i]
        maf = mafs[i]
        l = map(str, [chromosome, pos1, score, marf, maf, pos2])
        f.write(",".join(l) + "\n")
    f.close()
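#snpsdata.getMAF(snp) above returns a (marf, maf) pair per SNP. One plausible reading for
#0/1-coded SNPs is sketched below -- maf as the minor allele count and marf as the minor
#allele relative frequency -- but the authoritative definition lives in snpsdata.
def _get_maf_sketch_(snp):
    c = snp.count(1)
    maf = min(c, len(snp) - c)       #minor allele count
    marf = maf / float(len(snp))     #minor allele relative frequency
    return (marf, maf)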
def _plotKW_():
    """
    Analyze how population structure affects KW.
    """
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"
    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)
    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps
    #For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps
    gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
    snpsd = snpsds[3]
    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")
    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")
    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")
    (pvals, new_positions, acc_groups) = get_KW_pvals(snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()
    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list
    log_pvals = []
    for pval in pvals:
        log_pvals.append(-math.log10(pval))
    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")
    pylab.legend(numpoints=2, handlelen=0.005)
    pylab.show()
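#calcKinship(...) above comes from the surrounding module. A minimal sketch of an IBS-style
#kinship estimate for 0/1-coded SNPs, assuming numpy; the real function may normalize or
#scale differently, and this only illustrates the local-vs-global contrast plotted above.
def _calc_kinship_sketch_(snps):
    import numpy
    M = numpy.array(snps, dtype=float)  #SNPs x accessions
    n_snps = M.shape[0]
    #Fraction of identical alleles between each pair of accessions:
    K = 1.0 - numpy.abs(M[:, :, numpy.newaxis] - M[:, numpy.newaxis, :]).sum(axis=0) / n_snps
    return K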
def _run_():
    import sys
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["delim=", "missingval=", "phenotypeFileType=", "help", "parallel=", "parallelAll",
            "candGeneListID=", "windowSize=", "testDataFraction=", "gridSize=", "phenotypeCategory="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    phenotypeCategory = 1; phenotypeFileType = 1; testDataFraction = 1.0 / 3.0; gridSize = 6
    outFile = None; delim = ","; missingVal = "NA"; help = 0
    parallel = None; parallelAll = False; candGeneListID = 129; windowSize = 10000
    host = "papaya.usc.edu"
    user = "******"
    passwd = "bamboo123"
    db = "T8_annotation_TH"
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o"): outFile = arg
        elif opt in ("--gridSize"): gridSize = int(arg)
        elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg)
        elif opt in ("--phenotypeCategory"): phenotypeCategory = int(arg)
        elif opt in ("--testDataFraction"): testDataFraction = float(arg)
        elif opt in ("--candGeneListID"): candGeneListID = int(arg)
        elif opt in ("--windowSize"): windowSize = int(arg)
        elif opt in ("--parallel"): parallel = arg
        elif opt in ("--parallelAll"): parallelAll = True
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args) < 1 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        #Cluster specific parameters
        scriptDir = env.scriptDir
        resultDir = env.resultDir
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
        phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outFile = resultDir + "CS_" + parallel + "_" + phenName
        shstr = """#!/bin/csh
#PBS -l walltime=72:00:00
#PBS -l mem=4g
#PBS -q cmb
"""
        shstr += "#PBS -N CS" + phenName + "_" + parallel + "\n"
        shstr += "(python " + scriptDir + "compositeScore.py -o" + outFile + " "
        shstr += "--candGeneListID=" + str(candGeneListID) + " --testDataFraction=" + str(testDataFraction) + " --gridSize=" + str(gridSize) + " --windowSize=" + str(windowSize) + " --phenotypeCategory=" + str(phenotypeCategory) + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n"
        f = open(parallel + ".sh", "w")
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
            phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            for arg in args:
                runParallel(int(arg))
        return
    else:
        phenotype = int(args[0])
        if len(args) > 1:
            print "Warning: multiple phenotype_id arguments were ignored (use --parallel)."
    print "compositeScore is being set up with the following parameters:"
    print "candGeneListID:", candGeneListID
    print "phenotypeCategory:", phenotypeCategory
    print "phenotype:", phenotype
    print "gridSize:", gridSize
    print "testDataFraction:", testDataFraction
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "delim:", delim
    print "missingval:", missingVal
    print ""
    #Now the algorithm!!!
    #Load phenotype file
    categoricalNames = ["158_Sil_length_16", "159_Sil_length_22", "161_Germ_10", "163_Germ_22",
            "173_Leaf_serr_10", "174_Leaf_serr_16", "175_Leaf_serr_22", "179_Roset_erect_22",
            "180_Chlor_16", "181_Chlor_22"]
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  #Get Phenotype data
    phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
    #Check whether phenotype is quantitative..
    isQuantitative = not (phed.isBinary(phenotype) or phed.getPhenotypeName(phenotype) in categoricalNames)
    if isQuantitative:
        print "Phenotype", phed.getPhenotypeName(phenotype), "is quantitative."
    phenName = phed.getPhenotypeName(phenotype)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")
    #Load the result files:
    results = []
    resultFiles = []
    mafCutoff = max(mafCutoffs)
    for j in range(0, len(methods)):
        if isQuantitative or not onlyQuantitative[j]:
            resultFile = resultsDirs[j] + methods[j] + "_" + datasetNames[j] + "_" + phenName + fileTypes[j]
            resultFiles.append(resultFile)
            print "Loading result file", resultFile
            result = gwaResults.Result(resultFile)
            if logTransform[j]:
                print "Log transformed the p-values"
                result.negLogTransform()
            result.filterMAF(minMaf=mafCutoff)
            results.append(result)
    #Write the results to a file.
    import tempfile
    (fId, resultsTempFile) = tempfile.mkstemp()
    os.close(fId)
    f = open(resultsTempFile, "w")
    for i in range(0, len(results[0].scores)):
        out_str = str(results[0].chromosomes[i]) + "_" + str(results[0].positions[i])
        for result in results:
            out_str += "," + str(result.scores[i])
        out_str += "\n"
        f.write(out_str)
    f.close()
    #Load cand. gene list.
    print "Connecting to db, host=" + host
    if not user:
        import sys
        sys.stdout.write("Username: ")
        user = sys.stdin.readline().rstrip()
    #NB: the connection boilerplate below is the standard MySQLdb idiom matching the error
    #handling that survived here; the masked original may have differed in detail.
    if not passwd:
        import getpass
        passwd = getpass.getpass()
    try:
        import MySQLdb
        conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db)
        cursor = conn.cursor()
    except MySQLdb.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
        sys.exit(1)
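#Every runParallel above follows the same pattern: render a csh/PBS job script, write it to
#disk, and hand it to qsub. A condensed sketch of that shared pattern, with illustrative
#resource values taken from the scripts above:
def _submit_pbs_sketch_(job_name, command, walltime="100:00:00", mem="4g", queue="cmb"):
    import os
    shstr = "#!/bin/csh\n"
    shstr += "#PBS -l walltime=" + walltime + "\n"
    shstr += "#PBS -l mem=" + mem + "\n"
    shstr += "#PBS -q " + queue + "\n"
    shstr += "#PBS -N " + job_name + "\n"
    shstr += command + "\n"
    f = open(job_name + ".sh", 'w')
    f.write(shstr)
    f.close()
    os.system("qsub " + job_name + ".sh")  #Submit the job to the cluster queue.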
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["rFile=", "chr=", "delim=", "missingval=", "BoundaryStart=", "removeOutliers=",
            "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=",
            "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing",
            "onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate",
            "srInput=", "sr", "srOutput=", "srPar=", "srSkipFirstRun", "testRobustness",
            "permutationFilter=", "useLinearRegress", "regressionCofactors=", "FriLerAsCofactor",
            "FriColAsCofactor", "memReq=", "walltimeReq="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    phenotypeRanks = False; removeOutliers = None; addConstant = -1; phenotypeFileType = 1
    rFile = None; delim = ","; missingVal = "NA"; help = 0; minMAF = 0.0
    boundaries = [-1, -1]; chr = None; parallel = None; logTransform = False; negate = False
    parallelAll = False; lrt = False; kinshipDatafile = None; onlyMissing = False
    onlyOriginal96 = False; onlyOriginal192 = False; onlyBelowLatidue = None; complement = False
    sr = False; srOutput = False; srInput = False; srSkipFirstRun = False
    srTopQuantile = 0.95; srWindowSize = 30000
    testRobustness = False; permutationFilter = 0.002
    useLinearRegress = False; regressionCofactors = None
    FriLerAsCofactor = False; FriColAsCofactor = False
    memReq = "5g"; walltimeReq = "150:00:00"
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"): rFile = arg
        elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"): boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg)
        elif opt in ("--addConstant"): addConstant = float(arg)
        elif opt in ("--parallel"): parallel = arg
        elif opt in ("--minMAF"): minMAF = float(arg)
        elif opt in ("--parallelAll"): parallelAll = True
        elif opt in ("--onlyMissing"): onlyMissing = True
        elif opt in ("--onlyOriginal96"): onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"): onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue = float(arg)
        elif opt in ("--complement"): complement = True
        elif opt in ("--logTransform"): logTransform = True
        elif opt in ("--negate"): negate = True
        elif opt in ("--removeOutliers"): removeOutliers = float(arg)
        elif opt in ("--LRT"): lrt = True
        elif opt in ("-c", "--chr"): chr = int(arg)
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        elif opt in ("--kinshipDatafile"): kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"): phenotypeRanks = True
        elif opt in ("--sr"): sr = True
        elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True
        elif opt in ("--srInput"): srInput = arg
        elif opt in ("--srOutput"): srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"): testRobustness = True
        elif opt in ("--permutationFilter"): permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"): FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"): FriColAsCofactor = True
        elif opt in ("--useLinearRegress"): useLinearRegress = True
        elif opt in ("--regressionCofactors"): regressionCofactors = arg
        elif opt in ("--memReq"): memReq = arg
        elif opt in ("--walltimeReq"): walltimeReq = arg
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
            sys.exit(2)
    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress: shstr += " --useLinearRegress "
        if regressionCofactors: shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor: shstr += " --FriLerAsCofactor "
        if FriColAsCofactor: shstr += " --FriColAsCofactor "
        if onlyOriginal96: shstr += " --onlyOriginal96 "
        elif onlyOriginal192: shstr += " --onlyOriginal192 "
        if onlyBelowLatidue: shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform: shstr += " --logTransform "
        if negate: shstr += " --negate "
        if removeOutliers: shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks: shstr += " --phenotypeRanks "
        if testRobustness: shstr += " --testRobustness "
        shstr += " --permutationFilter=" + str(permutationFilter) + " "
        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(srWindowSize) + " "
        if kinshipDatafile: shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)
                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)
                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)
                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])
    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal)
    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)
    #Removing outliers
    if removeOutliers:
        print "Removing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)
    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyBelowLatidue:
        print "Filtering for the accessions which originate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))  #Keep accessions with unknown latitude.
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove.
for i in range(0, len(snpsds[0].accessions)):
    acc1 = snpsds[0].accessions[i]
    for j in range(0, len(phed.accessions)):
        acc2 = phed.accessions[j]
        if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
            accIndicesToKeep.append(i)
            phenAccIndicesToKeep.append(j)
            break

print "\nFiltering accessions in genotype data:"
#Filter accessions which do not have the phenotype value (from the genotype data).
for snpsd in snpsds:
    sys.stdout.write(".")
    sys.stdout.flush()
    snpsd.removeAccessionIndices(accIndicesToKeep)
print ""
print numAcc - len(accIndicesToKeep), "accessions removed from genotype data, leaving", len(accIndicesToKeep), "accessions in all."

print "\nNow filtering accessions in phenotype data:"
phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(phed.accessions) == len(snpsds[0].accessions)
if len(phed.accessions) != len(snpsds[0].accessions):
    raise Exception("Accession counts in phenotype and genotype data differ.")

#Filtering monomorphic
print "Filtering monomorphic SNPs"
for snpsd in snpsds:
    print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

#Remove SNPs with low minor allele frequencies
if minMAF != 0:
    sys.stdout.write("Filtering SNPs with MAF<" + str(minMAF) + ".")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.filterMinMAF(minMAF)

#Removing SNPs which are outside of boundaries.
if chr:
    print "\nRemoving SNPs which are outside of boundaries."
    snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
    snpsds = [snpsds[chr - 1]]

#Ordering accessions in genotype data to fit phenotype data.
print "Ordering genotype data accessions."
accessionMapping = []
i = 0
for acc in phed.accessions:
    if acc in snpsds[0].accessions:
        accessionMapping.append((snpsds[0].accessions.index(acc), i))
        i += 1
#print zip(accessionMapping,snpsds[0].accessions)
print "len(snpsds[0].snps)", len(snpsds[0].snps)
for snpsd in snpsds:
    sys.stdout.write(".")
    sys.stdout.flush()
    snpsd.orderAccessions(accessionMapping)
print "\nGenotype data has been ordered."

#Converting format to 01
newSnpsds = []
sys.stdout.write("Converting data format")
for snpsd in snpsds:
    sys.stdout.write(".")
    sys.stdout.flush()
    newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
print ""

print "Checking kinshipfile:", kinshipDatafile
if kinshipDatafile:  #Is there a special kinship file?
    kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal)
    accIndicesToKeep = []
    #Checking which accessions to keep and which to remove (genotype data).
    sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(kinshipSnpsds[0].accessions)):
        acc1 = kinshipSnpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                break
    #print accIndicesToKeep
    for snpsd in kinshipSnpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed from kinship genotype data, leaving", len(accIndicesToKeep), "accessions in all."
    print "Ordering kinship data accessions."
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in kinshipSnpsds[0].accessions:
            accessionMapping.append((kinshipSnpsds[0].accessions.index(acc), i))
            i += 1
    #print zip(accessionMapping, snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)
    for snpsd in kinshipSnpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "Kinship genotype data has been ordered."

    newKinshipSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in kinshipSnpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
    print ""
    kinshipSnpsds = newKinshipSnpsds
else:
    kinshipSnpsds = newSnpsds
print "Found kinship data."

#Ordering accessions according to the order of accessions in the genotype file
#    accessionMapping = []
#    i = 0
#    for acc in snpsds[0].accessions:
#        if acc in phed.accessions:
#            accessionMapping.append((phed.accessions.index(acc),i))
#            i += 1
#    phed.orderAccessions(accessionMapping)

#Negating phenotypic values
if negate:
    phed.negateValues(phenotypeIndex)

#If the phenotype has non-positive values, force a constant to be added before log-transforming.
if logTransform and not phed.isBinary(phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
    addConstant = 0

#Adding a constant.
if addConstant != -1:
    if addConstant == 0:
        addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
    addConstant = addConstant - phed.getMinValue(phenotypeIndex)
    print "Adding a constant to phenotype:", addConstant
    phed.addConstant(phenotypeIndex, addConstant)

#Log-transforming
if logTransform:
    print "Log transforming phenotype"
    phed.logTransform(phenotypeIndex)
#Converting phenotypes to ranks
elif phenotypeRanks:
    phed.transformToRanks(phenotypeIndex)

if not chr:
    snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
    kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [1, 2, 3, 4, 5])
else:
    snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
    kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

phenotypeName = phed.getPhenotypeName(phenotypeIndex)
sys.stdout.flush()

if testRobustness:
    print "Starting a robustness test"
    allSNPs = []
    for snpsd in snpsDataset.snpsDataList:
        allSNPs += snpsd.snps
    phenVals = phed.getPhenVals(phenotypeIndex)
    _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
    sys.exit(0)

if useLinearRegress:
    phenVals = phed.getPhenVals(phenotypeIndex)
    d0 = {}
    d0["phen"] = phenVals
    dh = {}
    dh["phen"] = phenVals
    import rpy, gc
    if regressionCofactors:  #Adds Ler and Col as cofactors
        import pickle
        f = open(regressionCofactors, "r")
        co_factors = pickle.load(f)
        f.close()
        #Inserting cofactors into both the null (dh) and the full (d0) model.
        for factor in co_factors:
            d0[factor] = co_factors[factor]
            dh[factor] = co_factors[factor]
    import analyzeHaplotype as ah
    (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
    if FriColAsCofactor:
        d0["col"] = col_factor
        dh["col"] = col_factor
    if FriLerAsCofactor:
        d0["ler"] = ler_factor
        dh["ler"] = ler_factor
    chr_pos_pvals = []
    stats = []
    sys.stdout.write("Applying the linear model")
    sys.stdout.flush()
    for i in range(0, len(newSnpsds)):
        snpsd = newSnpsds[i]
        sys.stdout.write("|")
        sys.stdout.flush()
        gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
        for j in range(0, len(snpsd.snps)):
            if j % 5000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            #if snpsd.positions[j] > 1700000:
            #    break
            snp = snpsd.snps[j]
            d0["snp"] = snp
            try:
                rpy.set_default_mode(rpy.NO_CONVERSION)
                #The full model (d0) includes the SNP; the null model (dh) omits it.
                aov0 = rpy.r.aov(rpy.r("phen ~ ."), data=d0)
                aovh = rpy.r.aov(rpy.r("phen ~ ."), data=dh)
                rpy.set_default_mode(rpy.BASIC_CONVERSION)
                s0 = rpy.r.summary(aov0)
                sh = rpy.r.summary(aovh)
                rss_0 = s0['Sum Sq'][-1]
                if type(sh['Sum Sq']) != float:
                    rss_h = sh['Sum Sq'][-1]
                else:
                    rss_h = sh['Sum Sq']
                f = (rss_h - rss_0) / (rss_0 / (len(phenVals) - len(d0) + 1))
                pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
            except Exception, err_str:
                print "Calculating p-value failed"  #, err_str
                pval = 1.0
            #print "dh:", dh
            #print "d0:", d0
            #print "rss_h,rss_0:", rss_h, rss_0
            #print "f,p:", f, pval
            chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
            mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))  #Minor-allele count
            maf = mafc / float(len(snp))  #Minor-allele relative frequency
            stats.append([maf, mafc])
    sys.stdout.write("\n")

    #Write out to a result file
    sys.stdout.write("Writing results to file\n")
    sys.stdout.flush()
    pvalFile = rFile + ".pvals"
    f = open(pvalFile, "w")
    f.write("Chromosome,position,p-value,marf,maf\n")
    for i in range(0, len(chr_pos_pvals)):
        chr_pos_pval = chr_pos_pvals[i]
        stat = stats[i]
        f.write(str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," + str(chr_pos_pval[2]) + "," + str(stat[0]) + "," + str(stat[1]) + "\n")
    f.close()

    #Plot results
    print "Generating a GW plot."
    phenotypeName = phed.getPhenotypeName(phenotypeIndex)
    res = gwaResults.Result(pvalFile, name="LM_" + phenotypeName, phenotypeID=phenotypeIndex)
    res.negLogTransform()
    pngFile = pvalFile + ".png"
    plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
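#The per-SNP test above compares a full linear model (with the SNP) against a
#null model without it, via an F-test on the two residual sums of squares. A
#minimal, self-contained sketch of the same idea using numpy/scipy instead of
#rpy (function name and arguments are illustrative, not part of this module):
def _snp_f_test_sketch_(phen_vals, snp):
    import numpy as np
    from scipy import stats
    y = np.asarray(phen_vals, dtype=float)
    x = np.asarray(snp, dtype=float)
    n = len(y)
    rss_null = ((y - y.mean()) ** 2).sum()  #Intercept-only model
    X = np.column_stack((np.ones(n), x))  #Intercept + SNP
    beta = np.linalg.lstsq(X, y)[0]
    rss_full = ((y - X.dot(beta)) ** 2).sum()
    f_stat = (rss_null - rss_full) / (rss_full / (n - 2))  #One added regressor
    return f_stat, stats.f.sf(f_stat, 1, n - 2)  #Upper-tail p-value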
def _testQQplot_(includeEmmaInBinary=False,usePvalueFiles=True): resdir = "/Users/bjarni/tmp/" #resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/" #resdir = "/Network/Data/250k/tmp-bvilhjal/qq_plots/" phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t') phed2 = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t') phenotypeIndices = phenotypeData.categories_2_phenotypes[4]#+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4] #(results_map, resultTypes_map) = _loadData_(phed, phenotypeIndices) q_pvalues = None stat_dict = {} for p_i in phenotypeIndices: (results_map, resultTypes_map) = _loadData_(phed, [p_i]) #try: phenName = phed.getPhenotypeName(p_i) phenNamePrint = " ".join(phenName.split("_")[1:]) print "\nWorking on phenotype",phenName if usePvalueFiles: q_pvalues = _getPermPvalues_(phenName) print len(q_pvalues),"permuted pvalues found" valCount = phed.countValues(p_i) print valCount,"values found." if (not phed.isBinary(p_i)) or includeEmmaInBinary: histogramFile = resdir + phenName +"_hist.pdf" histogramFile_png = resdir + phenName +"_hist.png" drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png) if phed.logTransform(p_i): histogramFile = resdir + phenName + "_hist_logTransformed.pdf" histogramFile_png = resdir + phenName + "_hist_logTransformed.png" drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png) elif not phed.isBinary(p_i): print "adding scaled const." phed.addSDscaledConstant(p_i) if phed.logTransform(p_i): histogramFile = resdir + phenName + "_hist_logTransformed_const.pdf" histogramFile_png = resdir + phenName + "_hist_logTransformed_const.png" drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png) # phed2.naOutliers(p_i,10) # histogramFile = resdir + phenName + "_hist_noOutliers.pdf" # histogramFile_png = resdir + phenName + "_hist_noOutliers.png" # drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png) # if phed2.logTransform(p_i): # histogramFile = resdir + phenName + "_hist_logTransformed_noOutliers.pdf" # histogramFile_png = resdir + phenName + "_hist_logTransformed_noOutliers.png" # drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png) results = results_map[p_i] resultTypes = resultTypes_map[p_i] qqplotFile = resdir + phenName + "_qqplot.pdf" qqplotFile_png = resdir + phenName + "_qqplot.png" s_dict={} (As,Ms)=drawQQPlot(results, 1000, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues) s_dict["A"]=As s_dict["M"]=Ms qqplotFile = resdir + phenName + "_qqplot_log.pdf" qqplotFile_png = resdir + phenName + "_qqplot_log.png" (ds,areas,slopes) = drawLogQQPlot(results, 1000,5, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues) s_dict["A2"]=areas s_dict["D"]=ds s_dict["S"]=slopes stat_dict[p_i] = s_dict for i in range(0,len(results)): result = results[i] result.negLogTransform() pngFile = resdir + phenName + "_gwplot_" +resultTypes[i]+".png" plotResults.plotResult(result,pngFile=pngFile,percentile=90,type="pvals", plotBonferroni=True) #except Exception: # print "\nPhenotype index", p_i, "failed." del results_map gc.collect() #Calling garbage collector, in an attempt to clean up memory.. 
print stat_dict
stat_file_name = resdir + "confounding_stat_4.txt"
f = open(stat_file_name, "w")
methods = ["KW", "Emma"]
f.write("phenotype_name, method_name, is_binary, D, A, B, M, S\n")
for p_i in phenotypeIndices:
    if stat_dict.has_key(p_i):
        s_dict = stat_dict[p_i]
        phenName = phed.getPhenotypeName(p_i)
        phenName = " ".join(phenName.split("_")[1:])
        for i in range(0, len(methods)):
            st = phenName + ", " + methods[i] + ", " + str(phed.isBinary(p_i)) + ", " + str(s_dict["D"][i]) + ", " + str(s_dict["A"][i]) + ", " + str(s_dict["A2"][i]) + ", " + str(s_dict["M"][i]) + ", " + str(s_dict["S"][i]) + "\n"
            f.write(st)
f.close()
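#The table above is assembled by string concatenation; the csv module writes
#the same rows with explicit field handling. A sketch under the same stat_dict
#layout as above (keys "D", "A", "A2", "M", "S", one value per method):
def _write_stat_table_sketch_(file_name, phed, phenotype_indices, stat_dict, methods=("KW", "Emma")):
    import csv
    f = open(file_name, "wb")
    w = csv.writer(f)
    w.writerow(["phenotype_name", "method_name", "is_binary", "D", "A", "B", "M", "S"])
    for p_i in phenotype_indices:
        if p_i not in stat_dict:
            continue
        s = stat_dict[p_i]
        name = " ".join(phed.getPhenotypeName(p_i).split("_")[1:])
        for i, m in enumerate(methods):
            w.writerow([name, m, phed.isBinary(p_i), s["D"][i], s["A"][i], s["A2"][i], s["M"][i], s["S"][i]])
    f.close()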
def _plotKinshipDiffs_(): filterProb = 0.2 p_i = 1 res_dir = "/Users/bjarni/tmp/" runId = "full_" snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") #,debug=True) phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t') snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds) for snpsd in snpsds: snpsd.filterMinMAF(0.1) snpsd.filterMonoMorphicSnps() totalSNPs = [] for i in range(len(snpsds)): snpsds[i] = snpsds[i].getSnpsData() totalSNPs += snpsds[i].snps #For memory, remove random SNPs snps = [] for snp in totalSNPs: if random.random() < filterProb: snps.append(snp) totalSNPs = snps print "Calculating the global kinship..." globalKinship = calcKinship(totalSNPs) print "done." normalizedGlobalKinship = globalKinship / mean(globalKinship) gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. for i in range(4, 5): #len(snpsds)): chr = i + 1 snpsd = snpsds[i] #pylab.subplot(5,1,chr) # pylab.figure(figsize=(18,4)) # (kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000) # pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$') # (kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000) # pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$') # pylab.legend(numpoints=2,handlelen=0.005) # pylab.title("Kinship diff. chr. "+str(chr)) # pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf") # pylab.clf() pylab.figure(figsize=(18, 4)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000) pylab.plot(binPos, emmaDiffs, "r", label='ws$=300000$') pylab.title("Emma avg. p-value diff. 500kb on chr. " + str(chr)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000) pylab.plot(binPos, emmaDiffs, "b", label='ws$=500000$') pylab.title("Emma avg. p-value diff. on chr. " + str(chr)) pylab.legend(numpoints=2, handlelen=0.005) pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf") pylab.clf() gc.collect( ) #Calling garbage collector, in an attempt to clean up memory..
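#calcKinship is imported from elsewhere in this codebase; for reference, a
#common IBS-style kinship estimate over 0/1 SNP rows looks roughly like the
#sketch below. This is an assumption about the estimator, not its actual code:
def _calc_kinship_ibs_sketch_(snps):
    import numpy as np
    M = np.asarray(snps, dtype=float)  #Rows = SNPs, columns = accessions
    n_acc = M.shape[1]
    K = np.ones((n_acc, n_acc))
    for i in range(n_acc):
        for j in range(i + 1, n_acc):
            #Fraction of SNPs at which the two accessions carry the same allele.
            K[i, j] = K[j, i] = np.mean(M[:, i] == M[:, j])
    return K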
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["outputSNPsFile=", "outputPhenotFile=", "filterMonomorphic", "rawDataFormat", "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=", "phenotypeName=", "calcKinshipMatrix=", "minCallProb=", "orderAccessions", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    inputFile = args[0]
    output_fname = None
    outputPhenotFile = None
    delim = ","
    missingVal = "NA"
    phenotypeFile = None
    kinshipMatrixFile = None
    phenotype = None
    phenotypeName = None
    rawDataFormat = False
    monomorphic = False
    help = 0
    withArrayIds = 1
    orderAccessions = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg)
        elif opt in ("-f", "--phenotypeFile"): phenotypeFile = arg
        elif opt in ("--calcKinshipMatrix",): kinshipMatrixFile = arg
        elif opt in ("--filterMonomorphic",): monomorphic = True
        elif opt in ("--rawDataFormat",): rawDataFormat = True
        elif opt in ("--minCallProb",): minCallProb = float(arg)
        elif opt in ("-p", "--phenotype"): phenotype = int(arg)
        elif opt in ("-o", "--outputSNPsFile"): output_fname = arg
        elif opt in ("--orderAccessions",): orderAccessions = True
        elif opt in ("-u", "--outputPhenotFile"): outputPhenotFile = arg
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        print output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

    if phenotypeFile:
        import phenotypeData
        phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')  #Get Phenotype data
        accIndicesToKeep = []
        phenAccIndicesToKeep = []
        numAcc = len(snpsds[0].accessions)
        if phenotype >= 0:
            #Load phenotype file
            sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break
        elif phenotype == None:
            sys.stdout.write("Removing accessions which do not have any phenotype values.")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2:
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break
        #Filter accessions which do not have the phenotype value.
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

        if outputPhenotFile:
            print "Filtering phenotype data."
            phed.removeAccessions(phenAccIndicesToKeep)
            if orderAccessions:
                accessionMapping = []
                i = 0
                for acc in snpsds[0].accessions:
                    if acc in phed.accessions:
                        accessionMapping.append((phed.accessions.index(acc), i))
                        i += 1
                phed.orderAccessions(accessionMapping)
            if phenotype >= 0:
                phed.writeToFile(outputPhenotFile, [phenotype])
            else:
                phed.writeToFile(outputPhenotFile)

    #Filtering monomorphic
    if monomorphic:
        print "Filtering monomorphic SNPs"
        for snpsd in snpsds:
            print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    import snpsdata
    newSnpsds = []
    if not rawDataFormat:
        sys.stdout.write("Converting data format")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newSnpsds.append(snpsd.getSnpsData())
        print ""
        waid1 = 0
        snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: 'NA'}
    else:
        snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])
        decoder = None

    snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, decoder=decoder)
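#Example invocation of the converter above (the script and file names are
#hypothetical; the options match the getopt list):
#  python FilterAndConvert.py -o snps_filtered.csv -u phen_filtered.tsv \
#      --phenotypeFile=phen_raw.tsv --phenotype=1 --orderAccessions \
#      --filterMonomorphic 250K_raw.csv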
def _test_(): #Load phenotype data.. import phenotypeData as pd import gwaResults as gr phed = pd.readPhenotypeFile('/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/phen_raw_092910.tsv') pid1 = 1 phed.filter_accessions_w_missing_data(pid1) phen_name = phed.getPhenotypeName(pid1) phen_vals = phed.getPhenVals(pid1) ecotypes = phed.accessions is_binary = phed.isBinary(pid1) #Creating the first hdf5 file hdf5_file_name_1 = '/Users/bjarni.vilhjalmsson/tmp/test1.hdf5' gwa_record = GWASRecord(hdf5_file_name_1) gwa_record.init_file() gwa_record.add_new_phenotype(phen_name, phen_vals, ecotypes, is_binary=is_binary) print "First file is constructed" print "Now testing it" r = gwa_record.get_phenotype_values(phen_name, 'raw') #print r phed = pd.readPhenotypeFile('/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/phen_raw_092910.tsv') pid2 = 5 phed.filter_accessions_w_missing_data(pid2) phen_name = phed.getPhenotypeName(pid2) phen_vals = phed.getPhenVals(pid2) ecotypes = phed.accessions is_binary = phed.isBinary(pid2) gwa_record.add_new_phenotype(phen_name, phen_vals, ecotypes, is_binary=is_binary) print "Now testing it" r = gwa_record.get_phenotype_values(phen_name, 'raw') #print r r = gwa_record.get_phenotype_info(phen_name) print r gwa_record.transform_phenotype('FT10', transformation='sqrt') print "Now testing it" r = gwa_record.get_phenotype_values(phen_name, 'raw') #print r r = gwa_record.get_phenotype_info(phen_name) print r result_file = '/Users/bjarnivilhjalmsson/tmp/pi1_pid5_FT10_emmax_none.pvals' res = gr.Result(result_file=result_file, name='FT10') res.neg_log_trans() # for c in ['chromosomes', 'positions', 'scores', 'marfs', 'mafs', 'genotype_var_perc', 'beta0', \ # 'beta1', 'correlations']: # print c, res.snp_results[c][:10] gwa_record.add_results(phen_name, 'emmax', res.snp_results['chromosomes'], res.snp_results['positions'], res.scores, res.snp_results['marfs'], res.snp_results['mafs'], transformation='raw', genotype_var_perc=res.snp_results['genotype_var_perc'], beta0=res.snp_results['beta0'], beta1=res.snp_results['beta1'], correlation=res.snp_results['correlations']) print "Result added." print "Now fetching a result." res = gwa_record.get_results(phen_name, 'emmax')#, min_mac=15, max_pval=0.01) print "Result loaded" # for c in ['chromosome', 'position', 'score', 'maf', 'mac', 'genotype_var_perc', 'beta0', \ # 'beta1', 'correlation']: # print c, res[c][:10] r = gwa_record.get_phenotype_info() print r s1 = time.time() res = gwa_record.get_results_by_chromosome(phen_name, 'emmax') print "Result re-loaded" secs = time.time() - s1 if secs > 60: mins = int(secs) / 60 secs = secs - mins * 60 print 'Took %d mins and %f seconds.' % (mins, secs) else: print 'Took %f seconds.' % (secs) for chromosome in [1, 2, 3, 4, 5]: for c in ['position', 'score', 'maf', 'mac', 'genotype_var_perc', 'beta0', \ 'beta1', 'correlation']: print c, res[chromosome][c][:10] print res['chromosome_ends'] print res['max_score'] print gwa_record.get_phenotype_bins(phen_name) s1 = time.time() gwa_record.perform_gwas('LD', analysis_method='kw') secs = time.time() - s1 if secs > 60: mins = int(secs) / 60 secs = secs - mins * 60 print 'Took %d mins and %f seconds.' % (mins, secs) else: print 'Took %f seconds.' % (secs) gwa_record.transform_phenotype('LD', transformation='log') gwa_record.perform_gwas('LD', analysis_method='emmax', transformation='log')
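#The elapsed-time reporting in _test_ above is written out twice; a small
#helper (hypothetical, not part of the module) expresses it once:
def _print_elapsed_sketch_(secs):
    if secs > 60:
        mins = int(secs) / 60  #Python 2 integer division
        print 'Took %d mins and %f seconds.' % (mins, secs - mins * 60)
    else:
        print 'Took %f seconds.' % secs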
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=", "parallelAll", "useFloats"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list)
    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    outputFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"): outputFile = arg
        elif opt in ("--parallel",): parallel = arg
        elif opt in ("--parallelAll",): parallelAll = True
        elif opt in ("-d", "--delim"): delim = arg
        elif opt in ("-m", "--missingval"): missingVal = arg
        elif opt in ("-n", "--sampleNum"): sampleNum = int(arg)
        elif opt in ("--useFloats",): useFloats = True
        else:
            if help == 0:
                print "Unknown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print "CAMP is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    if len(args) > 2:
        print "Phenotype_id:", args[2]
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "sampleNum:", sampleNum

    def runParallel(phenotypeIndex, id=""):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id
        shstr = """#!/bin/csh
#PBS -l walltime=24:00:00
#PBS -l mem=6g
#PBS -q cmb
"""
        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    #Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

    #Checking overlap between phenotype and genotype accessions.
phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len( accIndicesToKeep), "accessions in all." print "Filtering phenotype data." phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i += 1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Writing phenotype data to CAMP format. (fId, phenotypeFile) = tempfile.mkstemp() os.close(fId) phenVals = phed.getPhenVals(phenotypeIndex, asString=False) if not useFloats: phenVals = map(int, phenVals) phenFile = open(phenotypeFile, "w") for value in phenVals: phenFile.write(str(value) + "\n") phenFile.close() chromosome_list = [] positions_list = [] scores_list = [] interaction_positions_list = [] mafs = [] marfs = [] #Writing SNP data to CAMP format. for chromosome in chromosomes: (fId, snpsFile) = tempfile.mkstemp() os.close(fId) (fId, posFile) = tempfile.mkstemp() os.close(fId) sf = open(snpsFile, "w") pf = open(posFile, "w") snpsd = newSnpsds[chromosome - 1] for i in range(0, len(snpsd.snps)): snp = snpsd.snps[i] (marf, maf) = snpsdata.getMAF(snp) marfs.append(marf) mafs.append(maf) str_snp = map(str, snp) double_snp = [] for nt in str_snp: double_snp.append(nt) double_snp.append(nt) sf.write("".join(double_snp) + "\n") pf.write(str(snpsd.positions[i]) + "\n") sf.close() pf.close() outFile = outputFile + "_job_" + str(chromosome) + ".out" errFile = outputFile + "_job_" + str(chromosome) + ".err" resFile = outputFile + "_" + str(chromosome) + ".out" print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum) positions_list += results["positions"] scores_list += results["scores"] for (i, j) in results["snpIndices"]: if not (j < 0 or i < 0): marfs.append(0.5) #An ugly hack!!! 
            mafs.append(0.5)
        chromosome_list.append(chromosome)

    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
    for i in range(0, len(positions_list)):
        chromosome = chromosome_list[i]
        (pos1, pos2) = positions_list[i]
        score = scores_list[i]
        marf = marfs[i]
        maf = mafs[i]
        l = map(str, [chromosome, pos1, score, marf, maf, pos2])
        f.write(",".join(l) + "\n")
    f.close()
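#CAMP input above is written with each 0/1 call doubled ("00"/"11"), i.e.
#encoded as a homozygous diploid genotype, one line per SNP. The transform in
#isolation (helper name is illustrative):
def _encode_homozygous_sketch_(snp):
    #snp: sequence of 0/1 calls, one per accession.
    return "".join(str(nt) * 2 for nt in snp)

#E.g. _encode_homozygous_sketch_([0, 1, 1]) == "001111"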
def runParallel(phenotypeIndex,id=""): #Cluster specific parameters phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) phenName=phenName.replace("/", "_div_") phenName=phenName.replace("*", "_star_") outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr="""#!/bin/csh #PBS -l walltime=100:00:00 #PBS -l mem=4g #PBS -q cmb """ shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " shstr+=" -a "+str(withArrayIds)+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ")
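#Every runParallel variant in this module rebuilds the same csh/PBS preamble
#by hand; a shared helper (hypothetical) would keep the header in one place:
def _pbs_header_sketch_(job_name, walltime="100:00:00", mem="4g", queue="cmb"):
    return ("#!/bin/csh\n"
            "#PBS -l walltime=%s\n"
            "#PBS -l mem=%s\n"
            "#PBS -q %s\n"
            "#PBS -N %s\n" % (walltime, mem, queue, job_name))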
def _plotConfoundingStats_(): #import pylab as plt resdir = "/Network/Data/250k/tmp-bvilhjal/perm_tests/" phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv" stat_file_dir = "/Users/bjarni/tmp/" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t') phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4] m_pvals = {} a_pvals = {} ks_pvals = {} for p_i in phenotypeIndices: #if not phed.isBinary(p_i): phenName = phed.getPhenotypeName(p_i) print "Loading permutation stats data for",phenName filename = resdir+"KW_perm_f1_n1000_"+phenName+".perm.stat.txt" f = open(filename,"r") lines = f.readlines() pvals = (lines[-1].strip()).split(',') m_pvals[p_i] = float(pvals[0].split(" ")[-1]) a_pvals[p_i] = float(pvals[1]) ks_pvals[p_i] = float(pvals[2]) x_ticks = [] s_ticks = [] x_pos = 1 for cat in [1,2,3,4]: for p_i in phenotypeData.categories_2_phenotypes[cat]: s_ticks.append(phed.getPhenotypeName(p_i)) #plt.text(x_pos+shift,min_stat-0.1*stat_range,p_i,rotation="vertical",size="xx-small") x_ticks.append(x_pos-0.5) x_pos += 1 x_pos = x_pos+1 figure = plt.figure(figsize=(14,8)) axes = plt.Axes(figure, [.06,.16,.91,.81]) figure.add_axes(axes) x_pos = 0 colors = {1:"b",2:"r",3:"g",4:"c"} for i in [1,2,3,4]: phenotypeIndices = phenotypeData.categories_2_phenotypes[i] newPhenotypeIndices = [] for p_i in phenotypeIndices: #if not phed.isBinary(p_i): newPhenotypeIndices.append(p_i) phenotypeIndices = newPhenotypeIndices m_list = [] for p_i in phenotypeIndices: m_list.append(m_pvals[p_i]) plt.bar(range(x_pos,len(m_list)+x_pos),m_list,color = colors[i]) x_pos = x_pos+len(m_list)+1 plt.axis([0-0.02*(x_pos-1),1.02*(x_pos-1),-0.02,1.02]) plt.xticks(x_ticks,s_ticks,size="x-small",rotation="vertical") plt.ylabel("M stat. p-value") plt.savefig(stat_file_dir+"confounding_M_pvalues.png", format = "png") plt.clf() figure = plt.figure(figsize=(14,8)) axes = plt.Axes(figure, [.06,.16,.91,.81]) figure.add_axes(axes) x_pos = 0 for i in [1,2,3,4]: phenotypeIndices = phenotypeData.categories_2_phenotypes[i] newPhenotypeIndices = [] for p_i in phenotypeIndices: #if not phed.isBinary(p_i): newPhenotypeIndices.append(p_i) phenotypeIndices = newPhenotypeIndices a_list = [] for p_i in phenotypeIndices: a_list.append(a_pvals[p_i]) plt.bar(range(x_pos,len(a_list)+x_pos),a_list,color = colors[i]) x_pos = x_pos+len(a_list)+1 plt.axis([0-0.02*(x_pos-1),1.02*(x_pos-1),-0.02,1.02]) plt.xticks(x_ticks,s_ticks,size="x-small",rotation="vertical") plt.ylabel("A stat. p-value") plt.savefig(stat_file_dir+"confounding_A_pvalues.png", format = "png") plt.clf() figure = plt.figure(figsize=(14,8)) axes = plt.Axes(figure, [.06,.16,.91,.81]) figure.add_axes(axes) x_pos = 0 for i in [1,2,3,4]: phenotypeIndices = phenotypeData.categories_2_phenotypes[i] newPhenotypeIndices = [] for p_i in phenotypeIndices: #if not phed.isBinary(p_i): newPhenotypeIndices.append(p_i) phenotypeIndices = newPhenotypeIndices a_list = [] for p_i in phenotypeIndices: a_list.append(ks_pvals[p_i]) plt.bar(range(x_pos,len(a_list)+x_pos),a_list,color = colors[i]) x_pos = x_pos+len(a_list)+1 plt.axis([0-0.02*(x_pos-1),1.02*(x_pos-1),-0.02,1.02]) plt.xticks(x_ticks,s_ticks,size="x-small",rotation="vertical") plt.ylabel("KS stat. p-value") plt.savefig(stat_file_dir+"confounding_KS_pvalues.png", format = "png") plt.clf() print m_pvals, a_pvals, ks_pvals
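#The three bar-plot blocks above differ only in the p-value dict and the
#y-axis label; the shared routine, in sketch form (assumes the module-level
#phenotypeData import and the x_ticks/s_ticks built as above):
def _plot_confounding_bars_sketch_(pval_dict, x_ticks, s_ticks, ylabel, png_file):
    import pylab as plt
    colors = {1: "b", 2: "r", 3: "g", 4: "c"}
    figure = plt.figure(figsize=(14, 8))
    figure.add_axes(plt.Axes(figure, [.06, .16, .91, .81]))
    x_pos = 0
    for cat in [1, 2, 3, 4]:
        vals = [pval_dict[p_i] for p_i in phenotypeData.categories_2_phenotypes[cat]]
        plt.bar(range(x_pos, x_pos + len(vals)), vals, color=colors[cat])
        x_pos += len(vals) + 1
    plt.axis([-0.02 * (x_pos - 1), 1.02 * (x_pos - 1), -0.02, 1.02])
    plt.xticks(x_ticks, s_ticks, size="x-small", rotation="vertical")
    plt.ylabel(ylabel)
    plt.savefig(png_file, format="png")
    plt.clf()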
def analyzeSNPs(): import KW, phenotype_parsers, phenotypeData import Emma result_id = "filtered_imputed" data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/" #ref_seq_name = "2010_Col-0" ref_seq_name = "raw_ref_col-0" ref_start = 3170501 ref_chr = 5 #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_edited_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start, # ref_chr=ref_chr,alignment_type="muscle",ref_direction=1) #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start, # ref_chr=ref_chr,alignment_type="muscle",ref_direction=1) #ad = sequences.readFastaAlignment(data_dir+"flc_seqs_aln_merged_011810.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start, # ref_chr=ref_chr,alignment_type="muscle",ref_direction=1) #r = ad.get_snps(type=1) #seq_snpsd = r['snpsd'] #seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA') #seq_snpsd.onlyBinarySnps() #i_snpsd = r['indels'] #print indels #i_snpsd = i_snpsd.getSnpsData(missingVal='NA') #print zip(i_snpsd.positions, i_snpsd.snps) #print i_snpsd.accessionsl seq_snpsd = dataParsers.parseCSVData( data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0] seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA') # d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_073009.csv" d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv" d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data") # d2010_sd.filter_na_accessions() d2010_sd.filter_na_snps() d2010_sd.convert_2_binary() d2010_sd.filter_maf_snps(0.05) #kinship_2010 = Emma.calcKinship(d2010_sd.getSnps(0.05)) d2010_sd = d2010_sd.get_region_snpsd(5, 3140000, 3220000) d2010_sd.remove_redundant_snps(w_missing=True) d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv" snpsd = dataParsers.parse_snp_data(d250k_file) snpsd.filter_accessions(seq_snpsd.accessions) snpsd.convert_2_binary() snpsd.filter_maf_snps(0.05) #kinship_250k = Emma.calcKinship(snpsd.getSnps(0.02)) snpsd = snpsd.get_region_snpsd(5, 3140000, 3220000) snpsd.remove_redundant_snps() seq_snpsd.remove_accessions(snpsd.accessions) seq_snpsd.snpsFilterRare(0.05) seq_snpsd.onlyBinarySnps() acc_map = [] for i, acc in enumerate(seq_snpsd.accessions): acc_map.append((i, snpsd.accessions.index(acc))) seq_snpsd.orderAccessions(acc_map) seq_snpsd.remove_redundant_snps(w_missing=True) #snpsd.mergeDataUnion(d2010_sd,priority=2,unionType=3) #ad.compare_with_snps_data(snpsd) #Something missing here snpsd...? #i_snpsd = #snpsd.mergeDataUnion(d250k_sd,unionType=3,verbose=True) #NOW PERFORM GWAS AND PLOT RESULT!!!! 
phend = phenotypeData.readPhenotypeFile( "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv" ) #phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv") results_colors = ['blue', 'green', 'red'] #kinship_matrices = [kinship_250k,kinship_250k,kinship_2010] snpsds = [snpsd, seq_snpsd, d2010_sd] phenotypeIndices = phend.phenIds log_transforms = [1, 2] import analyzePhenotype as ap import analyzeSNPResult as asr import copy # for i in phenotypeIndices: # #ap.drawHistogram(phend,i,pdfFile="/Users/bjarnivilhjalmsson/tmp/hist_"+str(phend.getPhenotypeName(i))+".pdf") # #if i in log_transforms: # phend.logTransform(i) # #print "log transforming" # results = [] # filtered_sds=[] # for sd,k in zip(snpsds,kinship_matrices): # new_sd = copy.deepcopy(sd) # res = Emma.run_emma_w_missing_data(new_sd,phend,i,5,k) # res.negLogTransform() # snps_indices_to_keep = res.filterMARF(minMaf=0.1) # print "Got",len(res.scores),len(res.positions),"p-values from Emma." # results.append(res) # #pvals = res.scores # #positions = res.positions # #pp = zip(pvals,positions) # #pp.sort() # #print pp # #import plotResults as pr # #pr.plotResult(res,"/Users/bjarnivilhjalmsson/tmp/test.pdf") # new_sd.filter_snp_indices(snps_indices_to_keep) # filtered_sds.append(new_sd) # import regionPlotter as rp # reg_plotter = rp.RegionPlotter() # reg_plotter.plot_small_result(results,results_colors=results_colors, # pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_"+result_id+"_emma_gwas_"+str(phend.getPhenotypeName(i))+".pdf") # for j,(r,sd) in enumerate(zip(results,filtered_sds)): # r_i = r.scores.index(max(r.scores)) # phend.plot_marker_box_plot(i,sd,r_i,pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_emma_"+str(phend.getPhenotypeName(i))+"_"+results_colors[j]+".pdf",marker_score=r.scores[r_i]) # phend = phenotypeData.readPhenotypeFile( "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv" ) #phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv") for i in phenotypeIndices: results = [] filtered_sds = [] for sd in snpsds: new_sd = copy.deepcopy(sd) res, f_sd = KW.run_kw(new_sd, phend, i, 5) filtered_sds.append(f_sd) res.negLogTransform() print "Got", len(res.scores), len( res.positions), "p-values from KW." results.append(res) #pvals = res.scores #positions = res.positions #pp = zip(pvals,positions) #pp.sort() #print pp #import plotResults as pr #pr.plotResult(res,"/Users/bjarnivilhjalmsson/tmp/test.pdf") import regionPlotter as rp reg_plotter = rp.RegionPlotter() reg_plotter.plot_small_result( results, results_colors=results_colors, pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" + result_id + "_gwas_" + str(phend.getPhenotypeName(i)) + ".pdf") for j, (r, sd) in enumerate(zip(results, filtered_sds)): if len(r.scores) != len(sd.snps): print "Lengths not equal? %d, %d", (len(r.scores), len(sd.snps)) r_i = r.scores.index(max(r.scores)) phend.plot_marker_box_plot( i, sd, r_i, pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_kw_" + str(phend.getPhenotypeName(i)) + "_" + results_colors[j] + ".pdf", marker_score=r.scores[r_i])
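#The accession-alignment idiom used above (and throughout this module) builds
#(source_index, target_index) pairs and hands them to orderAccessions; stated
#on its own (helper name is illustrative):
def _build_accession_mapping_sketch_(target_accessions, source_accessions):
    mapping = []
    i = 0
    for acc in target_accessions:
        if acc in source_accessions:
            mapping.append((source_accessions.index(acc), i))
            i += 1
    return mapping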
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness", "permutationFilter="] try: opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeRanks = False removeOutliers = None addConstant = -1 phenotypeFileType = 1 rFile = None delim = "," missingVal = "NA" help = 0 minMAF=0.0 withArrayIds = 1 boundaries = [-1,-1] chr=None parallel = None logTransform = False negate = False parallelAll = False lrt = False kinshipDatafile = None onlyMissing = False onlyOriginal96 = False onlyOriginal192 = False onlyBelowLatidue = None complement = False sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 testRobustness = False permutationFilter = 0.002 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-o","--rFile"): rFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--addConstant"): addConstant = float(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--onlyMissing"): onlyMissing = True elif opt in ("--onlyOriginal96"): onlyOriginal96 = True elif opt in ("--onlyOriginal192"): onlyOriginal192 = True elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue = float(arg) elif opt in ("--complement"): complement = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--negate"): negate = True elif opt in ("--removeOutliers"): removeOutliers = float(arg) elif opt in ("--LRT"): lrt = True elif opt in ("-c","--chr"): chr = int(arg) elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("--kinshipDatafile"): kinshipDatafile = arg elif opt in ("--phenotypeRanks"): phenotypeRanks = True elif opt in ("--sr"): sr = True elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) print "Emma is being set up with the following parameters:" print "output:",rFile print "phenotypeRanks:",phenotypeRanks print "withArrayId:",withArrayIds print "phenotypeFileType:",phenotypeFileType print "parallel:",parallel print "parallelAll:",parallelAll print "minMAF:",minMAF print "LRT:",lrt print "delim:",delim print "missingval:",missingVal print "kinshipDatafile:",kinshipDatafile print "chr:",chr print "boundaries:",boundaries print 
"onlyMissing:",onlyMissing print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "complement:",complement print "negate:",negate print "logTransform:",logTransform print "addConstant:",addConstant print "removeOutliers:",removeOutliers print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "testRobustness:",testRobustness print "permutationFilter:",permutationFilter def runParallel(phenotypeIndex,phed): #Cluster specific parameters print phenotypeIndex phenName = phed.getPhenotypeName(phenotypeIndex) outFileName = resultDir+"Emma_"+parallel+"_"+phenName shstr = """#!/bin/csh #PBS -l walltime=100:00:00 #PBS -l mem=8g #PBS -q cmb """ shstr += "#PBS -N E"+phenName+"_"+parallel+"\n" shstr += "set phenotypeName="+parallel+"\n" shstr += "set phenotype="+str(phenotypeIndex)+"\n" shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" " if onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " if logTransform: shstr += " --logTransform " if negate: shstr += " --negate " if removeOutliers: shstr += " --removeOutliers="+str(removeOutliers)+" " if phenotypeRanks: shstr += " --phenotypeRanks " if testRobustness: shstr+=" --testRobustness " shstr+=" --permutationFilter="+str(permutationFilter)+" " if sr: shstr += " --sr " if not srOutput: output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr += " -a "+str(withArrayIds)+" " if kinshipDatafile: shstr += " --kinshipDatafile="+str(kinshipDatafile)+" " shstr += " --addConstant="+str(addConstant)+" " shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n" f = open(parallel+".sh",'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data if parallelAll: for phenotypeIndex in phed.phenIds: if onlyMissing: phenName = phed.getPhenotypeName(phenotypeIndex) pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals" res = None try: res = os.stat(pvalFile) except Exception: print "File",pvalFile,"does not exist." if res and res.st_size>0: print "File",pvalFile,"already exists, and is non-empty." if sr: srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals" srRes = None try: srRes = os.stat(srInput) except Exception: print "File",srInput,"does not exist." if srRes and srRes.st_size>0: print "File",srInput,"already exists, and is non-empty." else: runParallel(phenotypeIndex,phed) else: print "Setting up the run." 
runParallel(phenotypeIndex,phed) else: runParallel(phenotypeIndex,phed) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex,phed) return else: phenotypeIndex = int(args[2]) print "phenotypeIndex:",phenotypeIndex print "\nStarting program now!\n" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data numAcc = len(snpsds[0].accessions) #Removing outliers if removeOutliers: print "Remoing outliers" phed.naOutliers(phenotypeIndex,removeOutliers) #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] #Checking which accessions to keep and which to remove . for i in range(0,len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break print "\nFiltering accessions in genotype data:" #Filter accessions which do not have the phenotype value (from the genotype data). for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all." 
print "\nNow filtering accessions in phenotype data:" phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions) if len(phed.accessions)!=len(snpsds[0].accessions): raise Exception #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps" #Remove minor allele frequencies if minMAF!=0: sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Removing SNPs which are outside of boundaries. if chr: print "\nRemoving SNPs which are outside of boundaries." snpsds[chr-1].filterRegion(boundaries[0],boundaries[1]) snpsds = [snpsds[chr-1]] #Ordering accessions in genotype data to fit phenotype data. print "Ordering genotype data accessions." accessionMapping = [] i = 0 for acc in phed.accessions: if acc in snpsds[0].accessions: accessionMapping.append((snpsds[0].accessions.index(acc),i)) i += 1 #print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)",len(snpsds[0].snps) for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "\nGenotype data has been ordered." #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) print "" print "Checking kinshipfile:",kinshipDatafile if kinshipDatafile: #Is there a special kinship file? kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) accIndicesToKeep = [] #Checking which accessions to keep and which to remove (genotype data). sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0,len(kinshipSnpsds[0].accessions)): acc1 = kinshipSnpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) break print accIndicesToKeep for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all." print "Ordering kinship data accessions." accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in kinshipSnpsds[0].accessions: accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i)) i += 1 print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)",len(snpsds[0].snps) for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "Kinship genotype data has been ordered." newKinshipSnpsds = [] sys.stdout.write("Converting data format") for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) #This data might have NAs print "" kinshipSnpsds = newKinshipSnpsds else: kinshipSnpsds = newSnpsds print "Found kinship data." 
#Ordering accessions according to the order of accessions in the genotype file # accessionMapping = [] # i = 0 # for acc in snpsds[0].accessions: # if acc in phed.accessions: # accessionMapping.append((phed.accessions.index(acc),i)) # i += 1 # phed.orderAccessions(accessionMapping) #Negating phenotypic values if negate: phed.negateValues(phenotypeIndex) #Adding a constant. if addConstant!=-1: if addConstant==0: addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10 addConstant = addConstant - phed.getMinValue(phenotypeIndex) print "Adding a constant to phenotype:",addConstant phed.addConstant(phenotypeIndex,addConstant) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotypeIndex) #Converting phenotypes to Ranks elif phenotypeRanks: phed.transformToRanks(phenotypeIndex) if not chr: snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5]) else: snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr]) phenotypeName = phed.getPhenotypeName(phenotypeIndex) sys.stdout.flush() if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in snpsDataset.snpsDataList: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) _robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter) sys.exit(0) if (not sr) or (sr and not srSkipFirstRun): sys.stdout.write("Running Primary Emma.\n") sys.stdout.flush() pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt) res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex) res.filterMAF() res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset) print "Generating second run GW plot." res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.filterMAF() res.negLogTransform() srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.filterMAF() srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
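#The GW plots above draw a Bonferroni line (plotBonferroni=True); the implied
#-log10(p) cutoff is simply alpha divided by the number of tested SNPs:
def _bonferroni_threshold_sketch_(num_tests, alpha=0.05):
    import math
    return -math.log10(alpha / num_tests)

#E.g. with ~216k SNPs and alpha=0.05 the line sits near 6.6.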