def _run_():

	p_dict, args = parse_parameters()
	print "GWA runs are being set up with the following parameters:"
	for k, v in p_dict.iteritems(): print k + ': ' + str(v)
	print ''

	#Load phenotype file
	if p_dict['phen_file']:
		print 'Loading phenotypes from file.'
		phed = phenotypeData.readPhenotypeFile(p_dict['phen_file'],
						with_db_ids=(not p_dict['no_phenotype_ids']))  #load phenotype file
	else:
		print 'Retrieving the phenotypes from the DB.'
		phed = phenotypeData.getPhenotypes()

	#If on the cluster, then set up runs..
	if p_dict['parallel']:
		if len(p_dict['pids']) == 0:  #phenotype index arguement is missing, hence all phenotypes are run/analyzed.
			if not p_dict['phen_file']:
				raise Exception('Phenotype file or phenotype ID is missing.')
			p_dict['pids'] = phed.phenIds
		else:
			raise Exception('Too many arguments..')

		if analysis_plots:  #Running on the cluster..
			for p_i in p_dict['pids']:
				run_parallel(p_i, phed, p_dict)
		else:
			for mapping_method in p_dict['specific_methods']:
				for trans_method in p_dict['specific_transformations']:
					for p_i in pids:
						run_parallel(p_i, phed, p_dict, mapping_method, trans_method)
		return #Exiting the program...


	#SNPs data file name
	if not p_dict['data_file']:
		snps_data_file = '%s250K_t%d.csv' % (env['data_dir'], p_dict['call_method_id'])
	else:
		snps_data_file = p_dict['data_file']



	#Plot analysis plots...
	if p_dict['analysis_plots']:
		analysis_plots(snps_data_file, phed, p_dict)
	else:
		#If not analysis plots... then GWAS
		for p_i in p_dict['pids']:
			print '-' * 120, '\n'
			phenotype_name = phed.getPhenotypeName(p_i)
			print "Performing GWAS for phenotype: %s, phenotype_id: %s" % (phenotype_name, p_i)
			for trans_method in p_dict['specific_transformations']:
				print 'Phenotype transformation:', trans_method

				for mapping_method in p_dict['specific_methods']:
					#DO ANALYSIS
					print 'Mapping method:', mapping_method
					map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict)
Example #2
0
def _runTest_():
	"""
	Manual test of the Margarita wrapper on hard-coded 250K data.

	Parses the phenotype and SNPs files, runs gwaWithTrees for phenotype 1 on a
	window of chromosome 4, and then calls parseTreeFile on the ".marg.trees"
	file that belongs to the run.
	"""
	import dataParsers
	import phenotypeData

	#Input data (hard-coded locations)
	phed = phenotypeData.readPhenotypeFile("/Network/Data/250k/dataFreeze_080608/phenotypes_transformed_publishable_v2.tsv", delimiter = '\t')
	snpsds = dataParsers.parseCSVData("/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv")

	#Working files, all located under <homedir>tmp/
	psFile, marg_file, out_file, rFile = [env.homedir + "tmp/" + fileName
					for fileName in ("tree.ps", "test", "test_out", "tree_test.r")]

	#Run Margarita on chromosome 4 (the SNPs data list is 0-based)
	marg = Margarita(marg_file, out_file)
	chromosome = 4
	snpsd = snpsds[chromosome - 1].getSnpsData()
	marg.gwaWithTrees(marg_file, snpsd, phed, phenotype = 1, numMarkers = 200, chromosome = chromosome,
			boundaries = [200000, 350000], numPerm = 1, cutoff = 16, numArg = 100)

	#Which marginal tree to parse: run 1, ARG 1, marker 1
	marg.parseTreeFile(marg_file + ".marg.trees", rFile, psFile, 1, 1, 1)
Example #3
0
def _fakeTransformPhenotypes_():
	import os
	res_dir = "/Network/Data/250k/tmp-bvilhjal/emma_results/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	for p_i in transformationMap: 
		(trans_type,data_type) = transformationMap[p_i]
		print trans_type
		if trans_type==1 or trans_type==6:
			t_type = "new_logTransform" #m"
		elif trans_type==9 :
			t_type = "new_logTransform_f192" #m"
		elif trans_type==5:
			t_type = "newDataset_logTransform_noOutliers"
		elif trans_type==7:
			t_type = "new_ranks"
		elif trans_type==8:
			t_type = "neg_const_logTransform"
		else: #if trans_type==0:
			t_type = "new_raw"
		phenName = phed.getPhenotypeName(p_i)
#		oldName = res_dir+"Emma_"+t_type+"_"+phenName+".pvals"
#		newName = res_dir+"Emma_new_trans_"+phenName+".pvals"
#		cp_command = "sudo cp "+oldName+" "+newName
#		print cp_command
#		os.system(cp_command)
		oldName = res_dir+"Emma_"+t_type+"_"+phenName+".sr.pvals"
		newName = res_dir+"Emma_new_trans_"+phenName+".sr.pvals"
		cp_command = "sudo cp "+oldName+" "+newName
		print cp_command
		os.system(cp_command)
Example #4
0
def _plotKinshipDiffs_():

    filterProb = 0.2
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  # ,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

        # For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    print "Calculating the global kinship..."
    globalKinship = calcKinship(totalSNPs)
    print "done."
    normalizedGlobalKinship = globalKinship / mean(globalKinship)
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    for i in range(4, 5):  # len(snpsds)):
        chr = i + 1
        snpsd = snpsds[i]
        # pylab.subplot(5,1,chr)
        # 		pylab.figure(figsize=(18,4))
        # 		(kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000)
        # 		pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$')
        # 		(kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000)
        # 		pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$')
        # 		pylab.legend(numpoints=2,handlelen=0.005)
        # 		pylab.title("Kinship diff. chr. "+str(chr))
        # 		pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf")
        # 		pylab.clf()
        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label="ws$=300000$")
        pylab.title("Emma avg. p-value diff. 500kb on chr. " + str(chr))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label="ws$=500000$")
        pylab.title("Emma avg. p-value diff. on chr. " + str(chr))
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf")
        pylab.clf()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..
Example #5
0
	def runParallel(phenotypeIndex):
		"""
		Write a PBS (csh) job script that runs RandomForest.py for one phenotype
		and submit it with qsub.

		phenotypeIndex -- phenotype index; used to look up the phenotype name
		and passed on to RandomForest.py.

		All other settings (directories, run name, memory request, forest
		parameters and data files) are read from the enclosing scope.  The
		script is written to "<parallel>.sh" in the current directory.
		"""
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
		#"/" and "*" are replaced before the name is used in file names.
		phenName = phed.getPhenotypeName(phenotypeIndex).replace("/","_div_").replace("*","_star_")
		jobBase = resultDir+"RF_"+parallel+"_"+phenName  #shared by the importance file and the job logs

		pieces = [
			"#!/bin/csh\n",
			"#PBS -l walltime=120:00:00\n",
			"#PBS -l mem="+mem+"\n",
			"\n#PBS -q cmb\n",
			"#PBS -N RF"+phenName+"_"+parallel+"\n",
			"(python "+programDir+"RandomForest.py -o "+jobBase,
			" --chunkSize "+str(chunkSize),
			" --nTrees "+str(nTrees),
			" --mem "+str(mem),
			" --round2Size "+str(round2Size),
		]
		#Optional switches
		if nodeSize:
			pieces.append(" --nodeSize "+str(nodeSize)+" ")
		if logTransform:
			pieces.append(" --logTransform ")
		if not skipSecondRound:
			pieces.append(" --secondRound ")
		pieces.append(" -a "+str(withArrayIds)+" ")
		pieces.append(snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" ")
		#stdout goes to <jobBase>_job.out, everything else to <jobBase>_job.err
		pieces.append("> "+jobBase+"_job"+".out) >& "+jobBase+"_job"+".err\n")
		shstr = "".join(pieces)

		scriptName = parallel+".sh"
		scriptFile = open(scriptName,'w')
		scriptFile.write(shstr)
		scriptFile.close()

		#Execute qsub script
		os.system("qsub "+scriptName+" ")
Example #6
0
    def runParallel(phenotypeIndex, id=""):
        """
        Write a PBS (csh) job script that runs Camp.py for one phenotype and
        submit it with qsub.

        phenotypeIndex -- phenotype index; used to look up the phenotype name
            and passed on to Camp.py.
        id -- optional suffix appended to the output file name.

        Directories, the run name, data files and the Camp options are read
        from the enclosing scope.  The script is written to "<parallel>.sh"
        in the current directory.
        """
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        # "/" and "*" are replaced before the name is used in file names.
        phenName = phed.getPhenotypeName(phenotypeIndex).replace("/", "_div_").replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id
        jobLog = outputFile + "_job"

        pieces = [
            "#!/bin/csh\n",
            "#PBS -l walltime=24:00:00\n",
            "#PBS -l mem=6g \n",
            "#PBS -q cmb\n",
            "#PBS -N C" + phenName + "_" + parallel + "\n",
            "set phenotypeName=" + parallel + "\n",
            "set phenotype=" + str(phenotypeIndex) + "\n",
            "(python " + scriptDir + "Camp.py -o " + outputFile + " ",
        ]
        # Optional switches
        if sampleNum:
            pieces.append(" -n " + str(sampleNum) + " ")
        if useFloats:
            pieces.append(" --useFloats ")
        pieces.append(snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " ")
        # stdout goes to <jobLog>.out, everything else to <jobLog>.err
        pieces.append("> " + jobLog + ".out) >& " + jobLog + ".err\n")
        shstr = "".join(pieces)

        scriptName = parallel + ".sh"
        scriptFile = open(scriptName, 'w')
        scriptFile.write(shstr)
        scriptFile.close()

        #Execute qsub script
        os.system("qsub " + scriptName + " ")
Example #7
0
def _fakeTransformPhenotypes_():
    import os
    res_dir = "/Network/Data/250k/tmp-bvilhjal/emma_results/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_042109.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    for p_i in transformationMap:
        (trans_type, data_type) = transformationMap[p_i]
        print trans_type
        if trans_type == 1 or trans_type == 6:
            t_type = "logTransform"  #m"
        elif trans_type == 9:
            t_type = "logTransform_f192"  #m"
        elif trans_type == 5:
            t_type = "logTransform_noOutliers"
        elif trans_type == 7:
            t_type = "ranks"
        elif trans_type == 8:
            t_type = "neg_const_logTransform"
        else:  #if trans_type==0:
            t_type = "raw"
        phenName = phed.getPhenotypeName(p_i)
        #		oldName = res_dir+"Emma_"+t_type+"_"+phenName+".pvals"
        #		newName = res_dir+"Emma_trans_"+phenName+".pvals"
        #		cp_command = "sudo cp "+oldName+" "+newName
        #		print cp_command
        #		os.system(cp_command)
        oldName = res_dir + "Emma_" + t_type + "_" + phenName + ".pvals"
        newName = res_dir + "Emma_trans_" + phenName + ".pvals"
        cp_command = "sudo cp " + oldName + " " + newName
        print cp_command
        os.system(cp_command)
Example #8
0
    def runParallel(phenotypeIndex, id=""):
        """
        Build a csh script with PBS directives that runs Camp.py on a single
        phenotype, write it to "<parallel>.sh" and hand it to qsub.

        phenotypeIndex -- phenotype index; its name is looked up in the
            phenotype file and the index itself is passed to Camp.py.
        id -- optional suffix appended to the output file name.

        The run name, directories, data files and Camp options come from the
        enclosing scope.
        """
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        rawName = phed.getPhenotypeName(phenotypeIndex)
        # Replace the characters "/" and "*" before building file names.
        phenName = rawName.replace("/", "_div_").replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        header = "#!/bin/csh\n#PBS -l walltime=24:00:00\n#PBS -l mem=6g \n#PBS -q cmb\n"
        header += "#PBS -N C" + phenName + "_" + parallel + "\n"
        header += "set phenotypeName=" + parallel + "\n"
        header += "set phenotype=" + str(phenotypeIndex) + "\n"

        # The command line is assembled from fragments that carry their own spacing.
        command = ["(python " + scriptDir + "Camp.py -o " + outputFile + " "]
        if sampleNum:
            command.append(" -n " + str(sampleNum) + " ")
        if useFloats:
            command.append(" --useFloats ")
        command.append(snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " ")
        command.append("> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n")

        shstr = header + "".join(command)

        scriptName = parallel + ".sh"
        scriptFile = open(scriptName, "w")
        scriptFile.write(shstr)
        scriptFile.close()

        # Execute qsub script
        os.system("qsub " + scriptName + " ")
Example #9
0
	def runParallel(phenotypeIndex):
		"""
		Write a PBS (csh) job script that runs margarita.py for one phenotype
		and submit it with qsub.

		phenotypeIndex -- phenotype index; used to look up the phenotype name,
		to decide whether "--binary" is passed, and as argument to margarita.py.

		The results and script directories come from env; the run name, data
		files and array-id setting are read from the enclosing scope.  The
		script is written to "<parallel>.sh" in the current directory.
		"""
		resultDir = env.results_dir
		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
		#"/" and "*" are replaced before the name is used in file names.
		phenName = phed.getPhenotypeName(phenotypeIndex).replace("/", "_div_").replace("*", "_star_")

		outFileName = resultDir + "Marg_" + parallel + "_" + phenName
		scoreFile = outFileName + ".score"

		pieces = [
			"#!/bin/csh\n",
			"#PBS -l walltime=120:00:00\n",
			"#PBS -l mem=4g\n",
			"#PBS -q cmb\n",
			"#PBS -N M" + phenName + "_" + parallel + "\n",
			"(python " + env.script_dir + "margarita.py ",
		]
		if phed.isBinary(phenotypeIndex):
			pieces.append(" --binary ")
		pieces.append(" -s " + scoreFile)
		pieces.append(" -a " + str(withArrayId) + " ")
		pieces.append(snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " ")
		#stdout goes to <outFileName>.out, everything else to <outFileName>.err
		pieces.append("> " + outFileName + ".out) >& " + outFileName + ".err\n")
		shstr = "".join(pieces)

		scriptName = parallel + ".sh"
		scriptFile = open(scriptName, 'w')
		scriptFile.write(shstr)
		scriptFile.close()

		#Execute qsub script
		os.system("qsub " + scriptName + " ")
Example #10
0
	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")
Example #11
0
def _test_():
	phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_102208.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	for p_i in range(215, 216):
		phenName = phed.getPhenotypeName(p_i)
		drawHistogram(phed, p_i, title = phenName)
		phed.logTransform(p_i)
		drawHistogram(phed, p_i, title = phenName)
Example #12
0
def _test1_():
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_042109.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    print "phenName, data_type, transformation_type"
    for ctg in [1, 2, 3, 4]:
        phenIds = phenotypeData.categories_2_phenotypes[ctg]
        for pi in phenIds:
            phenName = phed.getPhenotypeName(pi)
            (trans_type, data_type) = transformationMap[pi]
            print str(phenName) + ", " + str(
                datatypeDict[data_type]) + ", " + str(
                    transformationTypes[trans_type])
Example #13
0
def _drawPowerQQPlots_(phenotypeIndices=None,res_path="/Network/Data/250k/tmp-bvilhjal/power_analysis/results/",runId="gwPlot"):
	"""
	Draws all the GWA plots for 6 methods.
	"""
	import plotResults

	
	phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_120308.tsv"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')

	if not phenotypeIndices:
		phenotypeIndices = phed.phenIds

	#mainRTs = ["","_original192", "_original192_inverse","_original96","_original96_inverse"]  #FIXME: add full result 
	#mainLabels = ["Full data", "192 acc. overlap", "192 acc. complement", "96 acc. overlap", "96 acc. complement"]
	mainRTs = ["","_original192","_original96", "_latitude60","_latitude55", "_original192_latitude60", "_original192_latitude55"]  #FIXME: add full result 
	mainLabels = ["Full data", "192 acc. overlap", "96 acc. overlap", "latitude < 60", "Latitude < 55", "192 acc. overl. and lat. < 60", "192 acc. overl. and lat. < 55"]
	permRTs = []#["permTest","permTest"]
	colors = []#[[0.6,0.8,0.6],[0.6,0.6,0.8]]
	perm_counts = []#[10,10]
	perm_sample_sizes = []#[65 ,112] #[170,96] #
	permLabels = []#["random 65","random 112"]
	for p_i	in phenotypeIndices:
		mainResults = []
		phenName = phed.getPhenotypeName(p_i)
		pdfFile = res_path+phenName+"_log_QQplot.pdf"
		pngFile = res_path+phenName+"_log_QQplot.png"
		for i in range(0,len(mainRTs)):
			mainRT = mainRTs[i]
			name = mainLabels[i]
			filename = res_path+"KW_raw"+mainRT+"_"+phenName+".pvals"
			rt = gwaResults.ResultType(resultType="KW",name=name)
			print "Loading",filename
			result = gwaResults.Result(filename, name=name, resultType=rt)
			mainResults.append(result)
	
		permResultsList = []
		for i in range(0,len(permRTs)):
			permResults = []
			permRT = permRTs[i]
			for j in range(0,perm_counts[i]):
				filename = res_path+"KW_raw_"+permRT+"_"+phenName+"_r"+str(perm_sample_sizes[i])+"_"+str(j)+".pvals"
				rt = gwaResults.ResultType(resultType="KW",name=permRT)
				print "Loading",filename
				result = gwaResults.Result(filename, name=permRT, resultType=rt)
				permResults.append(result)
			permResultsList.append((permResults,permLabels[i],colors[i]))

		drawPermLogQQPlot(mainResults, permResultsList, phenName = phenName,pdfFile=pdfFile,pngFile=pngFile)
		gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
Example #14
0
def _impute_FLC_192_():
    """
    Compare and then merge the FLC sequence SNPs with the fifth SNPs data set
    of the 250K (192 accessions) data.

    The 250K data is first restricted to the accessions of the FLC phenotype
    file and filtered with filter_maf_snps(0.05); the sequence SNPs are
    reduced with onlyBinarySnps().  All file locations are hard-coded.
    """
    flc_phenotype_file = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    flc_phed = pd.readPhenotypeFile(flc_phenotype_file)

    # 250K genotype data, restricted to the phenotyped accessions
    chip_data = dataParsers.parse_snp_data(env.home_dir + "Projects/Data/250k/250K_192_043009.csv")
    chip_data.filter_accessions(flc_phed.accessions)
    chip_data.filter_maf_snps(0.05)

    # Imputed SNPs from the FLC sequence alignment
    flc_seq_snps = dataParsers.parseCSVData(data_dir + "/flc_seqs_aln_imputed_snps_012710.csv")
    flc_seq_snps.onlyBinarySnps()

    # Index 4 is the fifth entry of the list -- presumably chromosome 5; TODO confirm.
    chip_data.snpsDataList[4].compareWith(flc_seq_snps)
    chip_data.snpsDataList[4].merge_data(flc_seq_snps)
Example #15
0
def _countVals_():
	resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	print "total # of phenotypes:", phed.countPhenotypes()
	print "# of phenotypes analyzed:", len(phenotypeIndices)
	
	totalCounts = []
	for p_i in phenotypeIndices:
		valCount = phed.countValues(p_i)
		totalCounts.append(valCount)

	snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
	import dataParsers,snpsdata
	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")#,debug=True)
	snpsd = snpsdata.SNPsDataSet(snpsds,[1,2,3,4,5])
	phed.removeAccessionsNotInSNPsData(snpsd)
	
	overlappingCounts = []
	for p_i in phenotypeIndices:
		valCount = phed.countValues(p_i)
		overlappingCounts.append(valCount)


	#ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
	ecotypes_192 = _get192Ecotypes_()
	ecotypes_192 = [str(e) for e in ecotypes_192]
	print "len(ecotypes_192):",len(ecotypes_192)
	print ecotypes_192
	phed.filterAccessions(ecotypes_192)

	filename = resdir+"phen_value_count_new_data_012509_v2.txt"
	f = open(filename,"w")
	f.write("Phenotype,  total_count, overlapping_count, 192_overlap_count\n")
	
	for i in range(0,len(phenotypeIndices)):
		p_i = phenotypeIndices[i]
		try:
			phenName = phed.getPhenotypeName(p_i)
			valCount = phed.countValues(p_i)
			f.write(str(phenName)+", "+str(totalCounts[i])+", "+str(overlappingCounts[i])+", "+str(valCount)+"\n")
		except Exception:
			print "\nPhenotype index", p_i, "failed."

	f.close()
Example #16
0
def _plot_local_FLC_haplotype_():
    """
    Plot haplotypes of the FLC sequence alignment: for every phenotype in the
    FLC phenotype file, one overall haplotype plot plus one local plot for
    each of seven consecutive 500 bp windows between 3175500 and 3179000.

    SNPs are taken from the alignment and filtered with
    filter_na_snps(max_na_rate=0.05).  All file locations are hard-coded.
    """
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    alignment = sequences.readFastaAlignment(
        data_dir + "flc_seqs_aln_merged_011810.fasta",
        ref_seq_name="raw_ref_col-0",
        ref_start=3170501,
        ref_chr=5,
        alignment_type="muscle",
        ref_direction=1)

    seq_snpsd = alignment.get_snps(type=1)['snpsd']
    seq_snpsd.filter_na_snps(max_na_rate=0.05)

    import phenotypeData as pd
    phend = pd.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")

    # Consecutive (start, stop) windows of 500 bp: 3175500-3176000, ..., 3178500-3179000
    windows = [(start, start + 500) for start in range(3175500, 3179000, 500)]
    local_prefix = "/Users/bjarnivilhjalmsson/tmp/flc_seq_haplotypes_"

    for phen_id in phend.phenIds:
        phen_name = phend.getPhenotypeName(phen_id)
        import analyzeHaplotype as ah
        # The overall plot does not depend on the phenotype; it is redrawn
        # (to the same file) on every pass.
        ah.plot_haplotypes(
            seq_snpsd.snps,
            seq_snpsd.accessions,
            haplotypeFile="/Users/bjarnivilhjalmsson/tmp/flc_seq_haplotypes_old.pdf")
        for start, stop in windows:
            pdf_name = local_prefix + phen_name + "_" + str(start) + "_" + str(stop) + ".pdf"
            ah.plot_local_haplotypes(pdf_name, seq_snpsd, start, stop,
                                     phenotypeData=phend, phen_id=phen_id)
Example #17
0
	def __init__(self,phenotypeIndices=None,snpsds=None,results=None,results_map=None):
		"""
		Load the (hard-coded) phenotype file and set up the results map.

		phenotypeIndices -- if given (and neither results_map nor results is),
		self.loadData is called with these indices.
		snpsds -- stored as self.snpsds as is (may be None; nothing is loaded here).
		results -- if given (and results_map is not), the results are grouped
		into lists by their phenotypeID attribute.
		results_map -- if given, used directly as self.results_map.
		"""
		self.phed = phenotypeData.readPhenotypeFile("/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv", delimiter='\t')
		self.snpsds = snpsds
		self.results_map = {}

		#The first non-empty source wins: results_map, then results, then phenotypeIndices.
		if results_map:
			self.results_map = results_map
			return
		if results:
			for result in results:
				self.results_map.setdefault(result.phenotypeID, []).append(result)
			return
		if phenotypeIndices:
			self.loadData(phenotypeIndices)
Example #18
0
def _get192Ecotypes_():
	resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	
	
	total_accessions = set()
	for p_i in phenotypeIndices:
		if not p_i in [5,6,7]:
			accessions = phed.getAccessionsWithValues(p_i)
			total_accessions = total_accessions.union(accessions)

	ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
	ecotypes_192 = [str(e) for e in ecotypes_192]
	print "len(ecotypes_192):",len(ecotypes_192)
	#print ecotypes_192
	phed.filterAccessions(ecotypes_192)

        for p_i in [5,6,7]:
		accessions = phed.getAccessionsWithValues(p_i)
		total_accessions = total_accessions.union(accessions)
		
	total_accessions = list(total_accessions)
	print len(total_accessions)
	total_accessions.sort()
	print total_accessions
	
	ecotype_info_dict = phenotypeData._getEcotypeIdInfoDict_()
	ets = []
	
	i = 0
	for et in total_accessions:
		et = int(et)
		if ecotype_info_dict.has_key(et):
			print str(et)+", "+str(ecotype_info_dict[et][0])+", "+str(ecotype_info_dict[et][1])
			i += 1
			ets.append(et)
		else:
			print et,"is missing in genotype data."
	print i
	return ets
Example #19
0
    def runParallel(phenotypeIndex):
        """
        Write a PBS (csh) job script that runs compositeScore.py for one
        phenotype and submit it with qsub.

        phenotypeIndex -- phenotype index; used to look up the phenotype name
            and passed on to compositeScore.py.

        The script and result directories come from env; the run name, the
        phenotype category, DB credentials and the compositeScore options are
        read from the enclosing scope.  The script is written to
        "<parallel>.sh" in the current directory.
        """
        scriptDir = env.scriptDir
        resultDir = env.resultDir
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
        # "/" and "*" are replaced before the name is used in file names.
        phenName = phed.getPhenotypeName(phenotypeIndex).replace("/", "_div_").replace("*", "_star_")
        outFile = resultDir + "CS_" + parallel + "_" + phenName

        options = [
            "--candGeneListID=" + str(candGeneListID),
            "--testDataFraction=" + str(testDataFraction),
            "--gridSize=" + str(gridSize),
            "--windowSize=" + str(windowSize),
            "--phenotypeCategory=" + str(phenotypeCategory),
        ]
        pieces = [
            "#!/bin/csh\n",
            "#PBS -l walltime=72:00:00\n",
            "#PBS -l mem=4g \n",
            "#PBS -q cmb\n",
            "#PBS -N CS" + phenName + "_" + parallel + "\n",
            # Note: the output file is attached directly to "-o".
            "(python " + scriptDir + "compositeScore.py -o" + outFile + " ",
            " ".join(options) + " " + str(phenotypeIndex) + " ",
            # stdout goes to <outFile>_job.out, everything else to <outFile>_job.err
            "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n",
        ]
        shstr = "".join(pieces)

        scriptName = parallel + ".sh"
        scriptFile = open(scriptName, "w")
        scriptFile.write(shstr)
        scriptFile.close()

        # Execute qsub script
        os.system("qsub " + scriptName + " ")
Example #20
0
    def runParallel(phenotypeIndex):
        """
        Build a csh script with PBS directives that runs RandomForest.py on a
        single phenotype, write it to "<parallel>.sh" and hand it to qsub.

        phenotypeIndex -- phenotype index; its name is looked up in the
            phenotype file and the index itself is passed to RandomForest.py.

        Directories, the run name, the memory request, the forest parameters
        and the data files come from the enclosing scope.
        """
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        rawName = phed.getPhenotypeName(phenotypeIndex)
        # Replace the characters "/" and "*" before building file names.
        phenName = rawName.replace("/", "_div_").replace("*", "_star_")
        # One base name serves both the importance output and the job logs.
        jobBase = resultDir + "RF_" + parallel + "_" + phenName

        header = "#!/bin/csh\n#PBS -l walltime=50:00:00\n"
        header += "#PBS -l mem=" + mem + "\n"
        header += "\n#PBS -q cmb\n"
        header += "#PBS -N RF" + phenName + "_" + parallel + "\n"

        # The command line is assembled from fragments that carry their own spacing.
        command = [
            "(python " + programDir + "RandomForest.py -o " + jobBase,
            " --chunkSize " + str(chunkSize),
            " --nTrees " + str(nTrees),
            " --mem " + str(mem),
            " --round2Size " + str(round2Size),
        ]
        if nodeSize:
            command.append(" --nodeSize " + str(nodeSize) + " ")
        if logTransform:
            command.append(" --logTransform ")
        if not skipSecondRound:
            command.append(" --secondRound ")
        command.append(" -a " + str(withArrayIds) + " ")
        command.append(snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " ")
        command.append("> " + jobBase + "_job" + ".out) >& " + jobBase + "_job" + ".err\n")

        shstr = header + "".join(command)

        scriptName = parallel + ".sh"
        scriptFile = open(scriptName, 'w')
        scriptFile.write(shstr)
        scriptFile.close()

        #Execute qsub script
        os.system("qsub " + scriptName + " ")
Example #21
0
    def runParallel(phenotypeIndex):
        """
        Build a csh script with PBS directives that runs compositeScore.py on
        a single phenotype, write it to "<parallel>.sh" and hand it to qsub.

        phenotypeIndex -- phenotype index; its name is looked up in the
            phenotype file (after restricting it with onlyBiologyCategory) and
            the index itself is passed to compositeScore.py.

        The directories come from env; everything else (run name, phenotype
        category, DB credentials, compositeScore options) comes from the
        enclosing scope.
        """
        scriptDir = env.scriptDir
        resultDir = env.resultDir
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
        rawName = phed.getPhenotypeName(phenotypeIndex)
        # Replace the characters "/" and "*" before building file names.
        phenName = rawName.replace("/", "_div_").replace("*", "_star_")
        outFile = resultDir + "CS_" + parallel + "_" + phenName

        header = "#!/bin/csh\n#PBS -l walltime=72:00:00\n#PBS -l mem=4g \n#PBS -q cmb\n"
        header += "#PBS -N CS" + phenName + "_" + parallel + "\n"

        # Long options in the order they are passed; the phenotype index follows them.
        namedOptions = [
            ("candGeneListID", candGeneListID),
            ("testDataFraction", testDataFraction),
            ("gridSize", gridSize),
            ("windowSize", windowSize),
            ("phenotypeCategory", phenotypeCategory),
        ]
        optionText = " ".join(["--" + optName + "=" + str(optValue) for optName, optValue in namedOptions])

        # Note: the output file is attached directly to "-o".
        command = "(python " + scriptDir + "compositeScore.py -o" + outFile + " "
        command += optionText + " " + str(phenotypeIndex) + " "
        command += "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n"

        shstr = header + command

        scriptName = parallel + ".sh"
        scriptFile = open(scriptName, 'w')
        scriptFile.write(shstr)
        scriptFile.close()

        #Execute qsub script
        os.system("qsub " + scriptName + " ")
Example #22
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
def _plotKW_():
    """
	Analyze how population structure affects KW.
	"""
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1,
                                      deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

    #For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    #globalKinship = calcKinship(totalSNPs)
    gc.collect(
    )  #Calling garbage collector, in an attempt to clean up memory..

    #chr = 1
    #for snpsd in snpsds:

    snpsd = snpsds[3]

    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k,
                  snpsd.snps[200:1400])  #runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "c.",
               label="Emma (local)")

    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k,
                  snpsd.snps[200:1400])  #runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "g.",
               label="Emma (global)")

    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "r.",
               label="KW (full data)")

    (pvals, new_positions,
     acc_groups) = get_KW_pvals(snpsd.snps[200:1400],
                                snpsd.positions[200:1400],
                                phed,
                                p_i,
                                kinshipThreshold=0.95,
                                method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            #print e_i
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list

    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
Example #24
0
def _plotRobustnessTests_():
	import csv
	resdir = "/Network/Data/250k/tmp-bvilhjal/robustness_test/"
	fig_dir = "/Users/bjarni/tmp/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	
	#First Emma
	emma_sd_dict = {}
	emma_sd_list = []
	emma_log_pvalues = []
	found_phenotypes = []
	for p_i in phenotypeIndices:
		try:
			phenName = phed.getPhenotypeName(p_i)
			filename = resdir+"KW_rob_f1_"+phenName+".rob.log_pvals_sd"
			print "Loading", filename, "..."
			reader = csv.reader(open(filename, "rb"))
			reader.next()
			for row in reader:
				emma_log_pvalues.append(float(row[0]))
				emma_sd_list.append(float(row[1]))
			found_phenotypes.append(p_i)
		except Exception:
			print p_i,"failed."
	
	import numpy as np
	import matplotlib.cm as cm
	import  matplotlib.pyplot as plt

	xs = np.array(emma_log_pvalues)
	ys = np.array(emma_sd_list)
	print len(emma_sd_list),len(emma_log_pvalues)
	xmin = xs.min()
	xmax = xs.max()
	ymin = ys.min()
	ymax = ys.max()
	
	#plt.subplots_adjust(hspace=0.5)
	#plt.subplot(121)
	plt.hexbin(xs,ys,bins='log',cmap=cm.jet)
	plt.axis([xmin, xmax, ymin, ymax])
	cb = plt.colorbar()
	cb.set_label('$log_{10}(N)$')
	plt.ylabel("SD$(\Delta log(p)))$")
	plt.xlabel("$log(p)$")
	plt.savefig(fig_dir+"KW_overall_robustness.png", format = "png")
	plt.clf()

	emma_sd_dict = {}
	emma_sd_list = []
	emma_log_pvalues = []
	found_phenotypes = []
	for p_i in phenotypeIndices:
		try:
			phenName = phed.getPhenotypeName(p_i)
			filename = resdir+"Emma_rob_f1_"+phenName+".rob.log_pvals_sd"
			print "Loading", filename, "..."
			reader = csv.reader(open(filename, "rb"))
			reader.next()
			for row in reader:
				emma_log_pvalues.append(float(row[0]))
				emma_sd_list.append(float(row[1]))
			found_phenotypes.append(p_i)
		except Exception:
			print p_i,"failed."
	
	import numpy as np
	import matplotlib.cm as cm
	import  matplotlib.pyplot as plt

	xs = np.array(emma_log_pvalues)
	ys = np.array(emma_sd_list)
	print len(emma_sd_list),len(emma_log_pvalues)
	xmin = xs.min()
	xmax = xs.max()
	ymin = ys.min()
	ymax = ys.max()
	
	#plt.subplots_adjust(hspace=0.5)
	#plt.subplot(121)
	plt.hexbin(xs,ys,bins='log',cmap=cm.jet)
	plt.axis([xmin, xmax, ymin, ymax])
	cb = plt.colorbar()
	cb.set_label('$log_{10}(N)$')
	plt.ylabel("SD$(\Delta log(p)))$")
	plt.xlabel("$log(p)$")
	plt.savefig(fig_dir+"Emma_overall_robustness.png", format = "png")
	plt.clf()
Example #25
0
def _run_():
	import os
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)	   

	long_options_list = ["id=", "chr=", "numARG=", "numMarkers=", "numPerm=", "smartCutoff=", "BoundaryStart=", "BoundaryEnd=", "binary", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", "debug", "parallel=", "parallelAll", "help", "scoreFile="]
	
	try:
		opts, args = getopt.getopt(sys.argv[1:], "i:s:c:d:m:a:bh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	import tempfile
	tempfile.tempdir = '/tmp'
	(fId, id) = tempfile.mkstemp()
	os.close(fId)		
	scoreFile = None
	chr = None
	numARG = 30
	numMarkers = 100
	numPerm = 0
	smartCutoff = 10
	binary = False
	delim = ","
	missingVal = "NA"
	debug = None
	report = None
	help = 0
	withArrayId = 0
	boundaries = [ - 1, - 1]
	phenotypeFileType = 1
	parallel = None
	parallelAll = False
	snpsDataFile = None

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-i", "--id"):
			id = '/tmp/' + arg
		elif opt in ("-s", "--scoreFile"):
			scoreFile = arg
		elif opt in ("-c", "--chr"):
			chr = int(arg)
		elif opt in ("--numARG"):
			numARG = int(arg)
		elif opt in ("--numMarkers"):
			numMarkers = int(arg)
		elif opt in ("--numPerm"):
			numPerm = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--smartCutoff"):
			smartCutoff = int(arg)
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--binary"):
			binary = True
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("-d", "--delim"):
			delim = arg
		elif opt in ("-m", "--missingval"):
			missingVal = arg	
		elif opt in ("-a", "--withArrayId"):
			withArrayId = int(arg)
		elif opt in ("-b", "--debug"):
			debug = 1

	if len(args) < 3 and not parallel:
		if help == 0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	if boundaries[0] == boundaries[1] and boundaries[0] == - 1:
		boundaries = None

	margFile = id + ".marg"
	outFile = margFile + ".out"		


	def runParallel(phenotypeIndex):
		#Cluster specific parameters
		#margdir = '/home/cmb-01/bvilhjal/Projects/Python-snps/'
		resultDir = env.results_dir #'/home/cmb-01/bvilhjal/results/'
		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName = phed.getPhenotypeName(phenotypeIndex)
		phenName = phenName.replace("/", "_div_")
		phenName = phenName.replace("*", "_star_")
 
		outFileName = resultDir + "Marg_" + parallel + "_" + phenName
		scoreFile = outFileName + ".score" 

		shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
#PBS -l mem=4g
#PBS -q cmb
"""

		shstr += "#PBS -N M" + phenName + "_" + parallel + "\n"
		#shstr += "(python " + margdir + "margarita.py "
		shstr += "(python " + env.script_dir + "margarita.py "
		if phed.isBinary(phenotypeIndex):
			shstr += " --binary "
		shstr += " -s " + scoreFile
		shstr += " -a " + str(withArrayId) + " "
		shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
		shstr += "> " + outFileName + ".out) >& " + outFileName + ".err\n"
		
		f = open(parallel + ".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub " + parallel + ".sh ")
		
	#Nested function ends

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		if len(args) > 2:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex)
			return
		
		else:
			snpsDataFile = args[0]
			if not parallelAll:
				phenotypeIndex = int(args[1])
				runParallel(phenotypeIndex)
				return

		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		for phenotypeIndex in phed.phenIds:
			runParallel(phenotypeIndex)
		return

	phenotypeIndex = int(args[2])


	#Print out information about this run...
	print "Preparing a blended margarita...."
	print "Num ARG:", numARG
	print "Num Markers:", numMarkers
	print "Num Permutations:", numPerm
	print "Smart cutoff:", smartCutoff
	print "Binary:", binary
	print "ScoreFile:", scoreFile


	
	import dataParsers, snpsdata, phenotypeData
	#phenotypeFile = "/Users/bjarni/Projects/Python-snps/tinaPhenos_041808.csv"
	if phenotypeFileType == 1:
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	elif phenotypeFileType == 2:
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, accessionDecoder = dataParsers.accessionName2EcotypeId, type = 2)	

	snpsds = dataParsers.parseCSVData(snpsDataFile, deliminator = delim, missingVal = missingVal, withArrayIds = bool(withArrayId)) #Get SNPs data 



	marg = Margarita(margFile, outFile, numARG, numMarkers, numPerm, smartCutoff)

	if chr:
		snpsd = snpsds[chr - 1].getSnpsData()
		marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr, binary = binary)
	else:
		scoreStr = ""
		for chr in [0, 1, 2, 3, 4]:
			snpsd = snpsds[chr].getSnpsData()
			(newRStr, newScoreStr, permPvals) = marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr + 1, binary = binary)
			scoreStr += newScoreStr

		f = open(scoreFile, 'w')
		f.write(scoreStr)
		f.close()
Example #26
0
def _generate_250K_2010_FLC_data_(impute=True):
    """
	Create a combined version of 
	250K, overlapping with the FLC phenotypes.
	Then merge with 2010 data (including indels).
	Then merge with FLC sequences.
	Impute missing SNPs.
	write to file.
	"""
    import phenotypeData as pd
    import env

    phed = pd.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )

    d2010_file = env.home_dir + "Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_accessions(phed.accessions)
    d2010_sd.filter_na_snps()
    d2010_sd.filter_maf_snps(0.05)

    #d250k_file = env.home_dir+"Projects/Data/250k/250K_t54.csv"
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    d250k_sd.merge_snps_data(d2010_sd)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()

    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    seq_file = env.home_dir + "Projects/FLC_analysis/flc_seqs_aln_merged_050410.fasta"
    ad = sequences.readFastaAlignment(seq_file,
                                      ref_seq_name=ref_seq_name,
                                      ref_start=ref_start,
                                      ref_chr=ref_chr,
                                      alignment_type="muscle",
                                      ref_direction=1)
    #	ref_start = 3170500
    #	ad2 = sequences.readFastaAlignment(seq_file,ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    #	ref_start = 3170502
    #	ad3 = sequences.readFastaAlignment(seq_file,ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    pdb.set_trace()
    r = ad.get_snps(type=0)
    seq_snpsd1 = r['snpsd']
    seq_snpsd1.merge_data(r['indels'], error_threshold=0.0)

    #	r2 = ad2.get_snps(type=0)
    #	seq_snpsd2 = r2['snpsd']
    #	seq_snpsd2.merge_data(r2['indels'],error_threshold=0.0)
    #
    #	r3 = ad3.get_snps(type=0)
    #	seq_snpsd3 = r3['snpsd']
    #	seq_snpsd3.merge_data(r3['indels'],error_threshold=0.0)

    print "Now merging data.."

    d250k_sd.snpsDataList[4].compareWith(seq_snpsd1)
    #	d250k_sd.snpsDataList[4].compareWith(seq_snpsd2)
    #	d250k_sd.snpsDataList[4].compareWith(seq_snpsd3)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd1, union_accessions=False)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()
    d250k_sd.snpsDataList[4].impute_data()
    d250k_sd.writeToFile("/tmp/test.csv")
    print "YEAH!"
Example #27
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness"]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
		phenotypeFileType=1
		outputFile=None
	delim=","
	missingVal="NA"
	help=0
	withArrayIds=1
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-a", "--withArrayId"):
			withArrayIds=int(arg)
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		phenName=phenName.replace("/", "_div_")
		phenName=phenName.replace("*", "_star_")
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr="""#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		shstr+=" -a "+str(withArrayIds)+" "			
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		if env.user=="bjarni":
			tempfile.tempdir='/tmp'
		(fId, phenotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		(fId, genotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		
		phed.writeToFile(phenotypeTempFile, [phenotype])	
		sys.stdout.write("Phenotype file written\n")
		sys.stdout.flush()
		snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		decoder={1:1, 0:0,-1:'NA'}	
		snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
		sys.stdout.write("Genotype file written\n")
		sys.stdout.flush()
	
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
	
		rDataFile=outputFile+".rData"
		pvalFile=outputFile+".pvals"
		#Is the phenotype binary?
		binary=phed.isBinary(phenotypeIndex)
		rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
		rFileName=outputFile+".r"
		f=open(rFileName, 'w')
		f.write(rstr)
		f.close()
		outRfile=rFileName+".out"
		errRfile=rFileName+".err"
		print "Running R file:"
		cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()	
		gc.collect() 
		os.system(cmdStr)
		#print "Emma output saved in R format in", rDataFile
		print "Generating a GW plot."
		res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Example #28
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=", "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=", "secondRound", "minMAF="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
        phenotypeFileType = 1
        impFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	withArrayIds = 1
	parallel = None
	logTransform = False
	parallelAll = False
	chunkSize = 250000
	round2Size = 5000
	nTrees = 15000
	nodeSize = None
	mem = "8g"
	skipSecondRound = True
	minMAF = 0.0

	for opt, arg in opts:
            if opt in ("-h", "--help"):
                help = 1
                print __doc__
            elif opt in ("-a","--withArrayId"):
                withArrayIds = int(arg)
            elif opt in ("-o","--rFile"):
                impFile = arg
            elif opt in ("--phenotypeFileType"):
                phenotypeFileType = int(arg)
            elif opt in ("--parallel"):
                parallel = arg
            elif opt in ("--parallelAll"):
                parallelAll = True
            elif opt in ("--logTransform"):
                logTransform = True
            elif opt in ("--secondRound"):
                skipSecondRound = False
            elif opt in ("-d","--delim"):
                delim = arg
            elif opt in ("--chunkSize"):
                chunkSize = int(arg)
            elif opt in ("--round2Size"):
                round2Size = int(arg)		
            elif opt in ("--nTrees"):
                nTrees = int(arg)
            elif opt in ("--nodeSize"):
                nodeSize = int(arg)
            elif opt in ("--mem"):
                mem = arg
            elif opt in ("-m","--missingval"):
                missingVal = arg
            elif opt in ("-m","--minMAF"):
                minMAF = float(arg)
            else:
                if help==0:
                    print "Unkown option!!\n"
                    print __doc__
                sys.exit(2)

        if len(args)<3 and not parallel:
            if help==0:
                print "Arguments are missing!!\n"
                print __doc__
            sys.exit(2)

	
	def runParallel(phenotypeIndex):
		#Cluster specific parameters
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		phenName = phed.getPhenotypeName(phenotypeIndex)
		phenName = phenName.replace("/","_div_")
		phenName = phenName.replace("*","_star_")
		impFileName = resultDir+"RF_"+parallel+"_"+phenName
		outFileName = impFileName
		shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
"""
		shstr += "#PBS -l mem="+mem+"\n"
		shstr +="""
#PBS -q cmb
"""
		
		shstr += "#PBS -N RF"+phenName+"_"+parallel+"\n"
		shstr += "(python "+programDir+"RandomForest.py -o "+impFileName+" --chunkSize "+str(chunkSize)+" --nTrees "+str(nTrees)+" --mem "+str(mem)+" --round2Size "+str(round2Size)+""
		if nodeSize:
			shstr += " --nodeSize "+str(nodeSize)+" "
		if logTransform:
			shstr += " --logTransform "
		if not skipSecondRound:
			shstr += " --secondRound "
		shstr += " -a "+str(withArrayIds)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	#Nested function ends

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		if parallelAll:
			phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex = int(args[2])

	print "chunkSize:",chunkSize
	print "nTrees:",nTrees
	print "nodeSize:",nodeSize
	print "mem:",mem
	print "logTransform:",logTransform
	print "round2Size:",round2Size
	print "skipSecondRound:",skipSecondRound

	#Loading genotype data
	import dataParsers
	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
	
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	phenotype = phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	numAcc = len(snpsds[0].accessions)

	#Load phenotype file
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed, leaving",len(accIndicesToKeep),"accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping = []
	i = 0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc),i))
			i += 1
	phed.orderAccessions(accessionMapping)

	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotype)

        #Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)
		
	
      	#Converting format to 01
	import snpsdata
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	snpsds = newSnpsds
	
	#Writing files
	import tempfile
	if env.user=="bjarni":
		tempfile.tempdir='/tmp'
	(fId, phenotypeTempFile) = tempfile.mkstemp()
	os.close(fId)
	(fId, genotypeTempFile) = tempfile.mkstemp()
	os.close(fId)
	
	phed.writeToFile(phenotypeTempFile, [phenotype])	
	sys.stdout.write( "Phenotype file written\n")
	sys.stdout.flush()
	
	#Retain only the correct runchunk of data.
	chromasomes = []
	positions = []
	snps = []
	for i in range(0,len(snpsds)):
		snpsd = snpsds[i]
		positions += snpsd.positions
		snps += snpsd.snps
		chrList = [i+1]*len(snpsd.positions)
		chromasomes += chrList

	#Is the phenotype binary?
	binary = phed.isBinary(phenotypeIndex)
	import util
	impFile = impFile+".imp"
	rDataFile = impFile+".rData"
	rFile = impFile+".r"
	outRfile = rFile+".out"
	errRfile = rFile+".err"
	topImpFile = impFile+"_top"+str(chunkSize)+".imp"
	topRDataFile = impFile+"_top.rData"
	try:
		os.remove(impFile)    #Removing file if it already exits.
	except Exception:
		print "Couldn't remove",impFile
	try:
		os.remove(topImpFile) #Removing file if it already exits.
	except Exception:
		print "Couldn't remove",topImpFile
	for startIndex in range(0,len(positions),chunkSize):
		if startIndex+chunkSize>=len(positions):
			endIndex = len(positions)
		else:
			endIndex = startIndex+chunkSize

	        #Writing genotype data to file.
		tmpFile = open(genotypeTempFile,"w")
		for i in range(startIndex,endIndex):
			outStr =""
			snp = util.valListToStrList(snps[i])
			outStr += str(chromasomes[i])+","+str(positions[i])+","
			outStr += ",".join(snp)
			outStr += "\n"
			tmpFile.write(outStr)
		tmpFile.close()
			
		rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
		f = open(rFile,'w')
		f.write(rstr)
		f.close()
		#outRfile = rFile+"_"+str(startIndex/chunkSize)+".out"
		#errRfile = rFile+"_"+str(startIndex/chunkSize)+".err"
		print "Running model nr",startIndex/chunkSize,":"
		cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()
		os.system(cmdStr)
	print "Random forest output saved in", impFile
	
	if not skipSecondRound:
		#Run on the top 'chunkSize' number of hits.
		#loading the R output file.
		impF = open(impFile,"r")
		lines=impF.readlines()
		impF.close()
		impList = list()
		for i in range(1,len(lines)):
			line = lines[i]
			line.strip()
			l = line.split(",")
			impList.append( (float(l[2]),l[0],l[1],snps[i]) )
		impList.sort()
		impList.reverse()

		#Writing genotype data to file.
		tmpFile = open(genotypeTempFile,"w")
		for i in range(0,round2Size):
			outStr = ""
			snp = util.valListToStrList(impList[i][3])
			outStr += str(impList[i][1])+","+str(impList[i][2])+","
			outStr += ",".join(snp)
			outStr += "\n"
			tmpFile.write(outStr)
		tmpFile.close()
		rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
		f = open(rFile,'w')
		f.write(rstr)
		f.close()
		print "Running randomForest on the top importance scores:"
		cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()
		os.system(cmdStr)
Example #29
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "outputSNPsFile=",
        "outputPhenotFile=",
        "filterMonomorphic",
        "rawDataFormat",
        "delim=",
        "missingval=",
        "withArrayId=",
        "phenotype=",
        "phenotypeFile=",
        "phenotypeName=",
        "calcKinshipMatrix=",
        "orderAccessions",
        "help",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    inputFile = args[0]
    output_fname = None
    outputPhenotFile = None
    delim = ","
    missingVal = "NA"
    phenotypeFile = None
    kinshipMatrixFile = None
    phenotype = None
    phenotypeName = None
    rawDataFormat = False
    monomorphic = False
    help = 0
    withArrayIds = 1
    orderAccessions = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-f", "--phenotypeFile"):
            phenotypeFile = arg
        elif opt in ("calcKinshipMatrix"):
            kinshipMatrixFile = arg
        elif opt in ("--filterMonomorphic"):
            monomorphic = True
        elif opt in ("--rawDataFormat"):
            rawDataFormat = True
        elif opt in ("--minCallProb"):
            minCallProb = float(arg)
        elif opt in ("-p", "--phenotype"):
            phenotype = int(arg)
        elif opt in ("-o", "--outputSNPsFile"):
            output_fname = arg
        elif opt in ("--orderAccessions"):
            orderAccessions = True
        elif opt in ("-u", "--phenotypeFile"):
            outputPhenotFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        print output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    import dataParsers

    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

    if phenotypeFile:
        import phenotypeData

        phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")  # Get Phenotype data
        accIndicesToKeep = []
        phenAccIndicesToKeep = []
        numAcc = len(snpsds[0].accessions)
        if phenotype >= 0:
            # Load phenotype file
            sys.stdout.write(
                "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + "."
            )
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break

        elif phenotype == None:
            sys.stdout.write("Removing accessions which do not have any phenotype values.")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2:
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break

                        # Filter Accessions which do not have the phenotype value.
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

        if outputPhenotFile:
            print "Filtering phenotype data."
            phed.removeAccessions(phenAccIndicesToKeep)
            if orderAccessions:
                accessionMapping = []
                i = 0
                for acc in snpsds[0].accessions:
                    if acc in phed.accessions:
                        accessionMapping.append((phed.accessions.index(acc), i))
                        i += 1
                phed.orderAccessions(accessionMapping)
            if phenotype >= 0:
                phed.writeToFile(outputPhenotFile, [phenotype])
            else:
                phed.writeToFile(outputPhenotFile)

                # Filtering monomorphic
    if monomorphic:
        print "Filtering monomorphic SNPs"
        for snpsd in snpsds:
            print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    import snpsdata

    newSnpsds = []
    if not rawDataFormat:
        sys.stdout.write("Converting data format")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newSnpsds.append(snpsd.getSnpsData())
        print ""
        waid1 = 0
        snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: "NA"}
    else:
        snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])
        decoder = None

    snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, decoder=decoder)
Example #30
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = ["outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=", "parallelAll", "useFloats"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    outputFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("n", "--sampleNum"):
            sampleNum = int(arg)
        elif opt in ("--useFloats"):
            useFloats = True
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print "CAMP is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    if len(args) > 2:
        print "Phenotype_id:", args[2]
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "sampleNum:", sampleNum

    def runParallel(phenotypeIndex, id=""):
        # Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        shstr = """#!/bin/csh
#PBS -l walltime=24:00:00
#PBS -l mem=6g 
#PBS -q cmb
"""

        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "

        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"

        f = open(parallel + ".sh", "w")
        f.write(shstr)
        f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  # Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

        # Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data

    # Load genotype file
    snpsds = dataParsers.parseCSVData(
        snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds
    )

    # Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + "."
    )
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

                # Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  # Removing accessions that don't have genotypes or phenotype values

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

        # Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""

    # Writing phenotype data to CAMP format.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = map(int, phenVals)
    phenFile = open(phenotypeFile, "w")
    for value in phenVals:
        phenFile.write(str(value) + "\n")
    phenFile.close()

    chromosome_list = []
    positions_list = []
    scores_list = []
    interaction_positions_list = []
    mafs = []
    marfs = []
    # Writing SNP data to CAMP format.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        sf = open(snpsFile, "w")
        pf = open(posFile, "w")
        snpsd = newSnpsds[chromosome - 1]
        for i in range(0, len(snpsd.snps)):
            snp = snpsd.snps[i]
            (marf, maf) = snpsdata.getMAF(snp)
            marfs.append(marf)
            mafs.append(maf)
            str_snp = map(str, snp)
            double_snp = []
            for nt in str_snp:
                double_snp.append(nt)
                double_snp.append(nt)
            sf.write("".join(double_snp) + "\n")
            pf.write(str(snpsd.positions[i]) + "\n")
        sf.close()
        pf.close()

        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum)

        positions_list += results["positions"]
        scores_list += results["scores"]
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):
                marfs.append(0.5)  # An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)

    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
    for i in range(0, len(positions_list)):
        chromosome = chromosome_list[i]
        (pos1, pos2) = positions_list[i]
        score = scores_list[i]
        marf = marfs[i]
        maf = mafs[i]
        l = map(str, [chromosome, pos1, score, marf, maf, pos2])
        f.write(",".join(l) + "\n")
    f.close()
Example #31
0
def _plotKW_():
    """
	Analyze how population structure affects KW.
	"""
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  # ,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

        # For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    # globalKinship = calcKinship(totalSNPs)
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # chr = 1
    # for snpsd in snpsds:

    snpsd = snpsds[3]

    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])  # runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")

    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])  # runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")

    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")

    (pvals, new_positions, acc_groups) = get_KW_pvals(
        snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            # print e_i
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list

    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
Example #32
0
def _run_():
    # Command-line entry point for compositeScore.
    #
    # Parses options with getopt and then either
    #   * (--parallel=<id>) writes a csh PBS script "<id>.sh" and submits it with
    #     qsub once per phenotype (every id in the phenotype file with
    #     --parallelAll, otherwise every positional argument), then returns; or
    #   * loads the phenotype file and the GWA result files for the single
    #     phenotype given as first positional argument, writes the scores of all
    #     loaded results to a temp file and starts connecting to the DB.
    #
    # NOTE(review): uses names that are not defined in this function
    # (getopt, traceback, os, env, phenotypeData, gwaResults, phenotypeDataFile,
    # methods, resultsDirs, datasetNames, fileTypes, logTransform,
    # onlyQuantitative, mafCutoffs) -- presumably module-level; confirm.
    # NOTE(review): the end of this function is garbled in this listing (see the
    # note near the bottom), so the DB part cannot be documented from here.
    import sys

    # No arguments at all: print the module usage text and exit.
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "delim=",
        "missingval=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "candGeneListID=",
        "windowSize=",
        "testDataFraction=",
        "gridSize=",
        "phenotypeCategory=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

    except:
        # Any parsing failure: show the traceback and usage, then exit.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    # Option defaults.
    phenotypeCategory = 1
    phenotypeFileType = 1
    testDataFraction = 1.0 / 3.0
    gridSize = 6
    outFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    parallel = None
    parallelAll = False
    candGeneListID = 129
    windowSize = 10000

    # NOTE(review): DB connection settings are hard-coded, including a
    # plain-text password; the user value looks redacted in this listing.
    host = "papaya.usc.edu"
    user = "******"
    passwd = "bamboo123"
    db = "T8_annotation_TH"

    # NOTE(review): the single-option tests below, e.g. ("-o"), are plain
    # strings, not tuples, so `in` is a substring test.  One visible side effect:
    # "-c" has no branch of its own but is a substring of "--candGeneListID" and
    # therefore sets candGeneListID.  "-a" is accepted by getopt but matches no
    # branch and ends in the "Unkown option" exit.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o"):
            outFile = arg
        elif opt in ("--gridSize"):
            gridSize = int(arg)
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--phenotypeCategory"):
            phenotypeCategory = int(arg)
        elif opt in ("--testDataFraction"):
            testDataFraction = float(arg)
        elif opt in ("--candGeneListID"):
            candGeneListID = int(arg)
        elif opt in ("--windowSize"):
            windowSize = int(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    # A phenotype id is required unless jobs are being submitted.
    if len(args) < 1 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        # Write a PBS job script for one phenotype and submit it with qsub.
        # The job re-runs compositeScore.py for `phenotypeIndex`, forwarding
        # candGeneListID, testDataFraction, gridSize, windowSize and
        # phenotypeCategory; stdout/stderr go to <outFile>_job.out/.err.
        # Cluster specific parameters
        scriptDir = env.scriptDir
        resultDir = env.resultDir
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # Make the phenotype name usable inside file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        # Local outFile; the value parsed from -o is not used here.
        outFile = resultDir + "CS_" + parallel + "_" + phenName
        shstr = """#!/bin/csh
#PBS -l walltime=72:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
        shstr += "#PBS -N CS" + phenName + "_" + parallel + "\n"
        shstr += "(python " + scriptDir + "compositeScore.py -o" + outFile + " "
        shstr += (
            "--candGeneListID="
            + str(candGeneListID)
            + " --testDataFraction="
            + str(testDataFraction)
            + " --gridSize="
            + str(gridSize)
            + " --windowSize="
            + str(windowSize)
            + " --phenotypeCategory="
            + str(phenotypeCategory)
            + " "
            + str(phenotypeIndex)
            + " "
        )

        shstr += "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n"

        # The same "<parallel>.sh" file is overwritten for every phenotype.
        f = open(parallel + ".sh", "w")
        f.write(shstr)
        f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  # Running on the cluster..
        if parallelAll:
            # One job per phenotype id left after the category filter.
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
            phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            # One job per positional argument.
            for arg in args:
                runParallel(int(arg))
        return
    else:

        # Single local run: only the first positional argument is used.
        phenotype = int(args[0])
        if len(args) > 1:
            print "Warning multiple phenotype_id arguments were ignored (use --parallel)."

    print "compositeScore is being set up with the following parameters:"
    print "candGeneListID:", candGeneListID
    print "phenotypeCategory:", phenotypeCategory
    print "phenotype:", phenotype
    print "gridSize:", gridSize
    print "testDataFraction:", testDataFraction
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "delim:", delim
    print "missingval:", missingVal
    print ""

    # Now the algorithm!!!

    # Load phenotype file
    # Phenotype names that are treated as non-quantitative regardless of
    # what phed.isBinary reports.
    categoricalNames = [
        "158_Sil_length_16",
        "159_Sil_length_22",
        "161_Germ_10",
        "163_Germ_22",
        "173_Leaf_serr_10",
        "174_Leaf_serr_16",
        "175_Leaf_serr_22",
        "179_Roset_erect_22",
        "180_Chlor_16",
        "181_Chlor_22",
    ]
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
    phed.onlyBiologyCategory(phenotypeCategory, host=host, user=user, passwd=passwd)
    # Check whether phenotype is quantitative..
    isQuantitative = not (phed.isBinary(phenotype) or phed.getPhenotypeName(phenotype) in categoricalNames)
    if isQuantitative:
        print "Phenotype", phed.getPhenotypeName(phenotype), "is quantitaive."

    # Same name sanitising as in runParallel; used to build result file names.
    phenName = phed.getPhenotypeName(phenotype)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")

    # Load the result files:
    results = []
    resultFiles = []
    # A single cutoff (the largest configured one) is applied to every result.
    mafCutoff = max(mafCutoffs)
    for j in range(0, len(methods)):
        # Methods flagged onlyQuantitative are skipped for non-quantitative phenotypes.
        if isQuantitative or not onlyQuantitative[j]:
            resultFile = resultsDirs[j] + methods[j] + "_" + datasetNames[j] + "_" + phenName + fileTypes[j]
            resultFiles.append(resultFile)
            print "Loading result file", resultFile
            result = gwaResults.Result(resultFile)
            if logTransform[j]:
                print "Log transformed the p-values"
                result.negLogTransform()

            result.filterMAF(minMaf=mafCutoff)
            results.append(result)

    # Write the results to a file.
    import tempfile

    # if os.getenv("USER")=="bjarni"
    # 	tempfile.tempdir='/tmp'
    # tempfile.tempdir='/home/cmb-01/bvilhjal/tmp'
    # mkstemp is used only to reserve a unique path; the descriptor is closed
    # and the file reopened by name below.
    (fId, resultsTempFile) = tempfile.mkstemp()
    os.close(fId)

    # One line per SNP: "<chromosome>_<position>,<score of result 0>,<score of result 1>,..."
    # NOTE(review): indexes every result with the indices of results[0], i.e.
    # assumes all results hold the same SNPs in the same order after MAF
    # filtering -- confirm.  Raises IndexError if no result was loaded.
    f = open(resultsTempFile, "w")
    for i in range(0, len(results[0].scores)):
        out_str = str(results[0].chromosomes[i]) + "_" + str(results[0].positions[i])
        for result in results:
            out_str += "," + str(result.scores[i])
        out_str += "\n"
        f.write(out_str)
    f.close()

    # Load cand. gene list.

    print "Connecting to db, host=" + host
    if not user:
        import sys

        # NOTE(review): the next line is garbled in this listing -- the text
        # between the "Username: " prompt and the DB error message has been
        # replaced by asterisks (apparently a credential redaction), so it is not
        # valid Python and the original code in between is lost.  Restore it
        # from the upstream source before use.
        sys.stdout.write("Username: "******"Error %d: %s" % (e.args[0], e.args[1])
        sys.exit(1)
Example #33
0
def _run_():
    # Command-line entry point for compositeScore.
    #
    # Parses options with getopt and then either
    #   * (--parallel=<id>) writes a csh PBS script "<id>.sh" and submits it with
    #     qsub once per phenotype (every id in the phenotype file with
    #     --parallelAll, otherwise every positional argument), then returns; or
    #   * loads the phenotype file and the GWA result files for the single
    #     phenotype given as first positional argument, writes the scores of all
    #     loaded results to a temp file and starts connecting to the DB.
    #
    # NOTE(review): uses names that are not defined in this function
    # (getopt, traceback, os, env, phenotypeData, gwaResults, phenotypeDataFile,
    # methods, resultsDirs, datasetNames, fileTypes, logTransform,
    # onlyQuantitative, mafCutoffs) -- presumably module-level; confirm.
    # NOTE(review): the end of this function is garbled in this listing (see the
    # note near the bottom), so the DB part cannot be documented from here.
    import sys
    # No arguments at all: print the module usage text and exit.
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "delim=", "missingval=", "phenotypeFileType=", "help", "parallel=",
        "parallelAll", "candGeneListID=", "windowSize=", "testDataFraction=",
        "gridSize=", "phenotypeCategory="
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h",
                                   long_options_list)

    except:
        # Any parsing failure: show the traceback and usage, then exit.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    # Option defaults.
    phenotypeCategory = 1
    phenotypeFileType = 1
    testDataFraction = 1.0 / 3.0
    gridSize = 6
    outFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    parallel = None
    parallelAll = False
    candGeneListID = 129
    windowSize = 10000

    # NOTE(review): DB connection settings are hard-coded, including a
    # plain-text password; the user value looks redacted in this listing.
    host = "papaya.usc.edu"
    user = "******"
    passwd = "bamboo123"
    db = "T8_annotation_TH"

    # NOTE(review): the single-option tests below, e.g. ("-o"), are plain
    # strings, not tuples, so `in` is a substring test.  One visible side effect:
    # "-c" has no branch of its own but is a substring of "--candGeneListID" and
    # therefore sets candGeneListID.  "-a" is accepted by getopt but matches no
    # branch and ends in the "Unkown option" exit.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o"):
            outFile = arg
        elif opt in ("--gridSize"):
            gridSize = int(arg)
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--phenotypeCategory"):
            phenotypeCategory = int(arg)
        elif opt in ("--testDataFraction"):
            testDataFraction = float(arg)
        elif opt in ("--candGeneListID"):
            candGeneListID = int(arg)
        elif opt in ("--windowSize"):
            windowSize = int(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    # A phenotype id is required unless jobs are being submitted.
    if len(args) < 1 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        # Write a PBS job script for one phenotype and submit it with qsub.
        # The job re-runs compositeScore.py for `phenotypeIndex`, forwarding
        # candGeneListID, testDataFraction, gridSize, windowSize and
        # phenotypeCategory; stdout/stderr go to <outFile>_job.out/.err.
        #Cluster specific parameters
        scriptDir = env.scriptDir
        resultDir = env.resultDir
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phed.onlyBiologyCategory(phenotypeCategory,
                                 host=host,
                                 user=user,
                                 passwd=passwd)
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # Make the phenotype name usable inside file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        # Local outFile; the value parsed from -o is not used here.
        outFile = resultDir + "CS_" + parallel + "_" + phenName
        shstr = """#!/bin/csh
#PBS -l walltime=72:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
        shstr += "#PBS -N CS" + phenName + "_" + parallel + "\n"
        shstr += "(python " + scriptDir + "compositeScore.py -o" + outFile + " "
        shstr += "--candGeneListID=" + str(
            candGeneListID) + " --testDataFraction=" + str(
                testDataFraction
            ) + " --gridSize=" + str(gridSize) + " --windowSize=" + str(
                windowSize) + " --phenotypeCategory=" + str(
                    phenotypeCategory) + " " + str(phenotypeIndex) + " "

        shstr += "> " + outFile + "_job" + ".out) >& " + outFile + "_job" + ".err\n"

        # The same "<parallel>.sh" file is overwritten for every phenotype.
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            # One job per phenotype id left after the category filter.
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            phed.onlyBiologyCategory(phenotypeCategory,
                                     host=host,
                                     user=user,
                                     passwd=passwd)
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            # One job per positional argument.
            for arg in args:
                runParallel(int(arg))
        return
    else:

        # Single local run: only the first positional argument is used.
        phenotype = int(args[0])
        if len(args) > 1:
            print "Warning multiple phenotype_id arguments were ignored (use --parallel)."

    print "compositeScore is being set up with the following parameters:"
    print "candGeneListID:", candGeneListID
    print "phenotypeCategory:", phenotypeCategory
    print "phenotype:", phenotype
    print "gridSize:", gridSize
    print "testDataFraction:", testDataFraction
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "delim:", delim
    print "missingval:", missingVal
    print ""

    #Now the algorithm!!!

    #Load phenotype file
    # Phenotype names that are treated as non-quantitative regardless of
    # what phed.isBinary reports.
    categoricalNames = [
        "158_Sil_length_16", "159_Sil_length_22", "161_Germ_10", "163_Germ_22",
        "173_Leaf_serr_10", "174_Leaf_serr_16", "175_Leaf_serr_22",
        "179_Roset_erect_22", "180_Chlor_16", "181_Chlor_22"
    ]
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    phed.onlyBiologyCategory(phenotypeCategory,
                             host=host,
                             user=user,
                             passwd=passwd)
    #Check whether phenotype is quantitative..
    isQuantitative = not (phed.isBinary(phenotype) or
                          phed.getPhenotypeName(phenotype) in categoricalNames)
    if isQuantitative:
        print "Phenotype", phed.getPhenotypeName(phenotype), "is quantitaive."

    # Same name sanitising as in runParallel; used to build result file names.
    phenName = phed.getPhenotypeName(phenotype)
    phenName = phenName.replace("/", "_div_")
    phenName = phenName.replace("*", "_star_")

    #Load the result files:
    results = []
    resultFiles = []
    # A single cutoff (the largest configured one) is applied to every result.
    mafCutoff = max(mafCutoffs)
    for j in range(0, len(methods)):
        # Methods flagged onlyQuantitative are skipped for non-quantitative phenotypes.
        if isQuantitative or not onlyQuantitative[j]:
            resultFile = resultsDirs[j] + methods[j] + "_" + datasetNames[
                j] + "_" + phenName + fileTypes[j]
            resultFiles.append(resultFile)
            print "Loading result file", resultFile
            result = gwaResults.Result(resultFile, )
            if logTransform[j]:
                print "Log transformed the p-values"
                result.negLogTransform()

            result.filterMAF(minMaf=mafCutoff)
            results.append(result)

    #Write the results to a file.
    import tempfile
    #if os.getenv("USER")=="bjarni"
    #	tempfile.tempdir='/tmp'
    #tempfile.tempdir='/home/cmb-01/bvilhjal/tmp'
    # mkstemp is used only to reserve a unique path; the descriptor is closed
    # and the file reopened by name below.
    (fId, resultsTempFile) = tempfile.mkstemp()
    os.close(fId)

    # One line per SNP: "<chromosome>_<position>,<score of result 0>,<score of result 1>,..."
    # NOTE(review): indexes every result with the indices of results[0], i.e.
    # assumes all results hold the same SNPs in the same order after MAF
    # filtering -- confirm.  Raises IndexError if no result was loaded.
    f = open(resultsTempFile, 'w')
    for i in range(0, len(results[0].scores)):
        out_str = str(results[0].chromosomes[i]) + "_" + str(
            results[0].positions[i])
        for result in results:
            out_str += "," + str(result.scores[i])
        out_str += "\n"
        f.write(out_str)
    f.close()

    #Load cand. gene list.

    print "Connecting to db, host=" + host
    if not user:
        import sys
        # NOTE(review): the next line is garbled in this listing -- the text
        # between the "Username: " prompt and the DB error message has been
        # replaced by asterisks (apparently a credential redaction), so it is not
        # valid Python and the original code in between is lost.  Restore it
        # from the upstream source before use.
        sys.stdout.write("Username: "******"Error %d: %s" % (e.args[0], e.args[1])
        sys.exit(1)
Example #34
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=",
        "withArrayId=", "logTransform", "phenotypeFileType=", "help",
        "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=",
        "secondRound", "minMAF="
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    impFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--rFile"):
            impFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--secondRound"):
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("--chunkSize"):
            chunkSize = int(arg)
        elif opt in ("--round2Size"):
            round2Size = int(arg)
        elif opt in ("--nTrees"):
            nTrees = int(arg)
        elif opt in ("--nodeSize"):
            nodeSize = int(arg)
        elif opt in ("--mem"):
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-m", "--minMAF"):
            minMAF = float(arg)
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName
        shstr = """#!/bin/csh
#PBS -l walltime=50:00:00
"""
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += """
#PBS -q cmb
"""

        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
        shstr += "(python " + programDir + "RandomForest.py -o " + impFileName + " --chunkSize " + str(
            chunkSize) + " --nTrees " + str(nTrees) + " --mem " + str(
                mem) + " --round2Size " + str(round2Size) + ""
        if nodeSize:
            shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    #Nested function ends

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

    print "chunkSize:", chunkSize
    print "nTrees:", nTrees
    print "nodeSize:", nodeSize
    print "mem:", mem
    print "logTransform:", logTransform
    print "round2Size:", round2Size
    print "skipSecondRound:", skipSecondRound

    #Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    #Load phenotype file
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotype)

#Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

#Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    snpsds = newSnpsds

    #Writing files
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    #Retain only the correct runchunk of data.
    chromasomes = []
    positions = []
    snps = []
    for i in range(0, len(snpsds)):
        snpsd = snpsds[i]
        positions += snpsd.positions
        snps += snpsd.snps
        chrList = [i + 1] * len(snpsd.positions)
        chromasomes += chrList

    #Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)
    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"
    try:
        os.remove(impFile)  #Removing file if it already exits.
    except Exception:
        print "Couldn't remove", impFile
    try:
        os.remove(topImpFile)  #Removing file if it already exits.
    except Exception:
        print "Couldn't remove", topImpFile
    for startIndex in range(0, len(positions), chunkSize):
        if startIndex + chunkSize >= len(positions):
            endIndex = len(positions)
        else:
            endIndex = startIndex + chunkSize

    #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(startIndex, endIndex):
            outStr = ""
            snp = util.valListToStrList(snps[i])
            outStr += str(chromasomes[i]) + "," + str(positions[i]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 impFile,
                                 rDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        #outRfile = rFile+"_"+str(startIndex/chunkSize)+".out"
        #errRfile = rFile+"_"+str(startIndex/chunkSize)+".err"
        print "Running model nr", startIndex / chunkSize, ":"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print "Random forest output saved in", impFile

    if not skipSecondRound:
        #Run on the top 'chunkSize' number of hits.
        #loading the R output file.
        impF = open(impFile, "r")
        lines = impF.readlines()
        impF.close()
        impList = list()
        for i in range(1, len(lines)):
            line = lines[i]
            line.strip()
            l = line.split(",")
            impList.append((float(l[2]), l[0], l[1], snps[i]))
        impList.sort()
        impList.reverse()

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(0, round2Size):
            outStr = ""
            snp = util.valListToStrList(impList[i][3])
            outStr += str(impList[i][1]) + "," + str(impList[i][2]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()
        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 topImpFile,
                                 topRDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        print "Running randomForest on the top importance scores:"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
Example #35
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"

        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "

        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "

        shstr += " --permutationFilter=" + str(permutationFilter) + " "

        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(
                srWindowSize) + " "

        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)

                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)

                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)

                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][
                    2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(
        accIndicesToKeep
    ), "accessions removed from genotype data, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(
        phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1

    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:", kinshipDatafile

    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)

        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(
            accIndicesToKeep
        ), "accessions removed from kinship genotype data, leaving", len(
            accIndicesToKeep), "accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append(
                    (kinshipSnpsds[0].accessions.index(acc), i))
                i += 1

        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(
                missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds

    else:
        kinshipSnpsds = newSnpsds

    print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Adds ler and col as cofactors
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co factors into model
            for factor in co_factors:
                d[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect(
            )  #Calling garbage collector, in an attempt to clean up memory..
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]

                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 /
                                           (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(
                str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," +
                str(chr_pos_pval[2]) + "," + str(stat[0]) + "," +
                str(stat[1]) + "\n")
        f.close()

        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
Example #36
0
def _testQQplot_(includeEmmaInBinary=False,usePvalueFiles=True):
	resdir = "/Users/bjarni/tmp/"
	#resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	#resdir = "/Network/Data/250k/tmp-bvilhjal/qq_plots/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phed2 = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[4]#+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	#(results_map, resultTypes_map) = _loadData_(phed, phenotypeIndices)
	q_pvalues = None
	stat_dict = {}
	for p_i in phenotypeIndices:
		(results_map, resultTypes_map) = _loadData_(phed, [p_i])
		#try:
		phenName = phed.getPhenotypeName(p_i)
		phenNamePrint = " ".join(phenName.split("_")[1:])
		print "\nWorking on phenotype",phenName
		if usePvalueFiles:
			q_pvalues = _getPermPvalues_(phenName)
			print len(q_pvalues),"permuted pvalues found"

		valCount = phed.countValues(p_i)
		print valCount,"values found."
		if (not phed.isBinary(p_i)) or includeEmmaInBinary:
			histogramFile = resdir + phenName +"_hist.pdf"
			histogramFile_png = resdir + phenName +"_hist.png"
			drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)
			if phed.logTransform(p_i):
				histogramFile = resdir + phenName + "_hist_logTransformed.pdf"
				histogramFile_png = resdir + phenName + "_hist_logTransformed.png"
				drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)
			elif not phed.isBinary(p_i):
				print "adding scaled const."
				phed.addSDscaledConstant(p_i)
				if phed.logTransform(p_i):
					histogramFile = resdir + phenName + "_hist_logTransformed_const.pdf"
					histogramFile_png = resdir + phenName + "_hist_logTransformed_const.png"
					drawHistogram(phed, p_i, title = phenNamePrint, pngFile = histogramFile_png)

#				phed2.naOutliers(p_i,10)
#				histogramFile = resdir + phenName + "_hist_noOutliers.pdf"
#				histogramFile_png = resdir + phenName + "_hist_noOutliers.png"
#				drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png)
#				if phed2.logTransform(p_i):
#					histogramFile = resdir + phenName + "_hist_logTransformed_noOutliers.pdf"
#					histogramFile_png = resdir + phenName + "_hist_logTransformed_noOutliers.png"
#					drawHistogram(phed2, p_i, title = phenName, pdfFile = histogramFile, pngFile = histogramFile_png)
		results = results_map[p_i]
		resultTypes = resultTypes_map[p_i]
		qqplotFile = resdir + phenName + "_qqplot.pdf"
		qqplotFile_png = resdir + phenName + "_qqplot.png"
		s_dict={}
		(As,Ms)=drawQQPlot(results, 1000, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues)
		s_dict["A"]=As
		s_dict["M"]=Ms
		
		qqplotFile = resdir + phenName + "_qqplot_log.pdf"
		qqplotFile_png = resdir + phenName + "_qqplot_log.png"
		(ds,areas,slopes) = drawLogQQPlot(results, 1000,5, phenName = phenNamePrint, resultTypes = resultTypes, pngFile=qqplotFile_png, perm_pvalues = q_pvalues)
		s_dict["A2"]=areas
		s_dict["D"]=ds
		s_dict["S"]=slopes
		stat_dict[p_i] = s_dict
		for i in range(0,len(results)):
			result = results[i]
			result.negLogTransform()
			pngFile = resdir + phenName + "_gwplot_" +resultTypes[i]+".png"
			plotResults.plotResult(result,pngFile=pngFile,percentile=90,type="pvals", plotBonferroni=True)	
		#except Exception:
		#	print "\nPhenotype index", p_i, "failed."
		del results_map
	       	gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
		
	print stat_dict
	stat_file_name = resdir + "confounding_stat_4.txt"
	f = open(stat_file_name,"w")
	methods = ["KW","Emma"]
	f.write("phenotype_name, method_name, is_binary, D, A, B, M, S\n")
	for p_i in phenotypeIndices:
		if stat_dict.has_key(p_i):
			s_dict = stat_dict[p_i]
			phenName = phed.getPhenotypeName(p_i)
			phenName = " ".join(phenName.split("_")[1:])
			for i in range(0,len(methods)):
				st = phenName+", "+methods[i]+", "+str(phed.isBinary(p_i))+", "+str(s_dict["D"][i])+", "+str(s_dict["A"][i])+", "+str(s_dict["A2"][i])+", "+str(s_dict["M"][i])+", "+str(s_dict["S"][i])+"\n"
				f.write(st)
	f.close()
def _plotKinshipDiffs_():

    filterProb = 0.2
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1,
                                      deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

    #For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    print "Calculating the global kinship..."
    globalKinship = calcKinship(totalSNPs)
    print "done."
    normalizedGlobalKinship = globalKinship / mean(globalKinship)
    gc.collect(
    )  #Calling garbage collector, in an attempt to clean up memory..

    for i in range(4, 5):  #len(snpsds)):
        chr = i + 1
        snpsd = snpsds[i]
        #pylab.subplot(5,1,chr)
        #		pylab.figure(figsize=(18,4))
        #		(kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000)
        #		pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$')
        #		(kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000)
        #		pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$')
        #		pylab.legend(numpoints=2,handlelen=0.005)
        #		pylab.title("Kinship diff. chr. "+str(chr))
        #		pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf")
        #		pylab.clf()
        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd,
                                           phed,
                                           p_i,
                                           globalKinship,
                                           windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label='ws$=300000$')
        pylab.title("Emma avg. p-value diff. 500kb on chr. " + str(chr))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd,
                                           phed,
                                           p_i,
                                           globalKinship,
                                           windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label='ws$=500000$')
        pylab.title("Emma avg. p-value diff. on chr. " + str(chr))
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" +
                      str(chr) + ".pdf",
                      format="pdf")
        pylab.clf()
        gc.collect(
        )  #Calling garbage collector, in an attempt to clean up memory..
Example #38
0
def _run_():
    """Command-line entry point: filter a SNP CSV file and write the result.

    Options are read from ``sys.argv``; the first positional argument is the
    input SNP file, parsed with ``dataParsers.parseCSVData``.  If a phenotype
    file is given (``-f``/``--phenotypeFile``) only accessions that also occur
    in it are kept; with ``-p``/``--phenotype`` the accession must in addition
    have a non-'NA' value for that phenotype.  ``--filterMonomorphic`` calls
    ``filterMonoMorphicSnps`` on every SNP set.  The data is written to the
    file named by ``-o``/``--outputSNPsFile``.

    Exits with status 2 (after printing the module docstring unless help was
    requested) when no arguments are given, option parsing fails, an option is
    not recognised, or the input or output file name is missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["outputSNPsFile=", "outputPhenotFile=", "filterMonomorphic", "rawDataFormat",
                         "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=",
                         "phenotypeName=", "calcKinshipMatrix=", "orderAccessions", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    outputPhenotFile = None
    delim = ","
    missingVal = "NA"
    phenotypeFile = None
    kinshipMatrixFile = None  # accepted on the command line but not used below
    phenotype = None
    rawDataFormat = False
    monomorphic = False
    show_help = False
    withArrayIds = 1
    orderAccessions = False

    # Options are compared by equality / tuple membership only.  Testing
    # `opt in ("--name")` against a bare string is a substring test and made
    # e.g. "-m" match "--minCallProb".
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            show_help = True
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-f", "--phenotypeFile"):
            phenotypeFile = arg
        elif opt == "--calcKinshipMatrix":
            kinshipMatrixFile = arg
        elif opt == "--filterMonomorphic":
            monomorphic = True
        elif opt == "--rawDataFormat":
            rawDataFormat = True
        elif opt in ("-p", "--phenotype"):
            phenotype = int(arg)
        elif opt in ("-o", "--outputSNPsFile"):
            output_fname = arg
        elif opt == "--orderAccessions":
            orderAccessions = True
        elif opt in ("-u", "--outputPhenotFile"):
            outputPhenotFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if not show_help:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    if not args:
        if not show_help:
            print("Input file missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if not show_help:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal,
                                      withArrayIds=waid1)

    if phenotypeFile:
        import phenotypeData
        phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')  # Get phenotype data
        accIndicesToKeep = []
        phenAccIndicesToKeep = []
        genotype_accessions = snpsds[0].accessions
        numAcc = len(genotype_accessions)

        # A negative phenotype index selects nothing, as before.
        if phenotype is None or phenotype >= 0:
            need_value = phenotype is not None
            if need_value:
                sys.stdout.write("Removing accessions which do not have a phenotype value for "
                                 + phed.phenotypeNames[phenotype] + ".")
            else:
                sys.stdout.write("Removing accessions which do not have any phenotype values.")
            sys.stdout.flush()
            for i, acc1 in enumerate(genotype_accessions):
                # Keep the first phenotype row with the same accession (and a value, if required).
                for j, acc2 in enumerate(phed.accessions):
                    if acc1 != acc2:
                        continue
                    if need_value and phed.phenotypeValues[j][phenotype] == 'NA':
                        continue
                    accIndicesToKeep.append(i)
                    phenAccIndicesToKeep.append(j)
                    break

        # NOTE(review): despite its name, removeAccessionIndices is handed the
        # indices to *keep*; presumably it retains exactly those - confirm.
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print("")
        print("%d accessions removed, leaving %d accessions in all."
              % (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

        if outputPhenotFile:
            print("Filtering phenotype data.")
            phed.removeAccessions(phenAccIndicesToKeep)
            if orderAccessions:
                # Pairs of (current phenotype row, target row) following the genotype order.
                accessionMapping = []
                i = 0
                for acc in snpsds[0].accessions:
                    if acc in phed.accessions:
                        accessionMapping.append((phed.accessions.index(acc), i))
                        i += 1
                phed.orderAccessions(accessionMapping)
            if phenotype is not None and phenotype >= 0:
                phed.writeToFile(outputPhenotFile, [phenotype])
            else:
                phed.writeToFile(outputPhenotFile)

    # Filtering monomorphic
    if monomorphic:
        print("Filtering monomorphic SNPs")
        for snpsd in snpsds:
            print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    import snpsdata

    if not rawDataFormat:
        newSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newSnpsds.append(snpsd.getSnpsData())
        print("")
        waid1 = 0
        snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: 'NA'}
    else:
        snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])
        decoder = None

    snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal,
                            withArrayIds=waid1, decoder=decoder)
Example #39
0
def _test_():
    """Manual smoke test exercising a GWASRecord backed by an hdf5 file.

    Adds two phenotypes (ids 1 and 5) read from a phenotype file, transforms
    'FT10', stores an EMMAX result loaded from a .pvals file, reads results
    back (timing the per-chromosome fetch) and finally runs KW and EMMAX
    scans on 'LD'.  All paths are hard-coded to the author's machine.
    """
    import phenotypeData as pd
    import gwaResults as gr

    phen_path = '/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/phen_raw_092910.tsv'

    def _load_phenotype(pid):
        # Fresh read each time, since the accession filter modifies the object.
        data = pd.readPhenotypeFile(phen_path)
        data.filter_accessions_w_missing_data(pid)
        name = data.getPhenotypeName(pid)
        values = data.getPhenVals(pid)
        accessions = data.accessions
        binary = data.isBinary(pid)
        return name, values, accessions, binary

    def _print_elapsed(start):
        elapsed = time.time() - start
        if elapsed > 60:
            minutes = int(elapsed) // 60
            elapsed = elapsed - minutes * 60
            print('Took %d mins and %f seconds.' % (minutes, elapsed))
        else:
            print('Took %f seconds.' % (elapsed))

    # First phenotype goes into a newly initialised hdf5 file.
    phen_name, phen_vals, ecotypes, is_binary = _load_phenotype(1)
    gwa_record = GWASRecord('/Users/bjarni.vilhjalmsson/tmp/test1.hdf5')
    gwa_record.init_file()
    gwa_record.add_new_phenotype(phen_name, phen_vals, ecotypes, is_binary=is_binary)
    print("First file is constructed")

    print("Now testing it")
    gwa_record.get_phenotype_values(phen_name, 'raw')

    # Second phenotype is added to the same record.
    phen_name, phen_vals, ecotypes, is_binary = _load_phenotype(5)
    gwa_record.add_new_phenotype(phen_name, phen_vals, ecotypes, is_binary=is_binary)

    print("Now testing it")
    gwa_record.get_phenotype_values(phen_name, 'raw')
    print(gwa_record.get_phenotype_info(phen_name))

    gwa_record.transform_phenotype('FT10', transformation='sqrt')
    print("Now testing it")
    gwa_record.get_phenotype_values(phen_name, 'raw')
    print(gwa_record.get_phenotype_info(phen_name))

    # Load a stored EMMAX scan and attach it to the current phenotype.
    stored = gr.Result(result_file='/Users/bjarnivilhjalmsson/tmp/pi1_pid5_FT10_emmax_none.pvals',
                       name='FT10')
    stored.neg_log_trans()
    gwa_record.add_results(
        phen_name, 'emmax',
        stored.snp_results['chromosomes'], stored.snp_results['positions'],
        stored.scores, stored.snp_results['marfs'], stored.snp_results['mafs'],
        transformation='raw',
        genotype_var_perc=stored.snp_results['genotype_var_perc'],
        beta0=stored.snp_results['beta0'],
        beta1=stored.snp_results['beta1'],
        correlation=stored.snp_results['correlations'])
    print("Result added.")

    print("Now fetching a result.")
    gwa_record.get_results(phen_name, 'emmax')
    print("Result loaded")
    print(gwa_record.get_phenotype_info())

    started = time.time()
    by_chromosome = gwa_record.get_results_by_chromosome(phen_name, 'emmax')
    print("Result re-loaded")
    _print_elapsed(started)

    columns = ['position', 'score', 'maf', 'mac', 'genotype_var_perc', 'beta0',
               'beta1', 'correlation']
    for chromosome in [1, 2, 3, 4, 5]:
        for column in columns:
            print("%s %s" % (column, by_chromosome[chromosome][column][:10]))
    print(by_chromosome['chromosome_ends'])
    print(by_chromosome['max_score'])
    print(gwa_record.get_phenotype_bins(phen_name))

    started = time.time()
    gwa_record.perform_gwas('LD', analysis_method='kw')
    _print_elapsed(started)

    gwa_record.transform_phenotype('LD', transformation='log')
    gwa_record.perform_gwas('LD', analysis_method='emmax', transformation='log')
Example #40
0
def _run_():
    """Command-line entry point for running CAMP on one phenotype.

    Positional arguments are the SNP data file, the phenotype file and the
    phenotype id.  With ``--parallel=<name>`` a PBS script is written and
    submitted with ``qsub`` instead of running locally (``--parallelAll``
    submits one job per phenotype id in the phenotype file, so the phenotype
    id argument is not needed).  Otherwise the genotype and phenotype data are
    restricted to their shared accessions, monomorphic SNPs are filtered out,
    the data is written to temporary files, ``_runCAMP_`` is called once per
    chromosome and the scores are written to ``<outputFile>.scores``.

    Exits with status 2 when no arguments are given, option parsing fails, an
    option is not recognised, or required arguments are missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=",
        "parallelAll", "useFloats", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h",
                                   long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    outputFile = None
    delim = ","
    missingVal = "NA"
    show_help = False
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    # Exact comparisons only: `opt in ("--name")` on a bare string is a
    # substring test, and "-n" was previously compared against "n".
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            show_help = True
            print(__doc__)
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-n", "--sampleNum"):
            sampleNum = int(arg)
        elif opt == "--useFloats":
            useFloats = True
        else:
            if not show_help:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # Only a --parallel --parallelAll submission can do without a phenotype id.
    min_args = 2 if (parallel and parallelAll) else 3
    if len(args) < min_args:
        if not show_help:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    # A local run builds every result file name from outputFile.
    if not parallel and not outputFile:
        if not show_help:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print("CAMP is being set up with the following parameters:")
    print("phenotypeDataFile: %s" % (phenotypeDataFile,))
    if len(args) > 2:
        print("Phenotype_id: %s" % (args[2],))
    print("snpsDataFile: %s" % (snpsDataFile,))
    print("parallel: %s" % (parallel,))
    print("parallelAll: %s" % (parallelAll,))
    print("sampleNum: %s" % (sampleNum,))

    def runParallel(phenotypeIndex, id=""):
        """Write a PBS job script for one phenotype and submit it with qsub."""
        # Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  # Get phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # Replace characters that are awkward in file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        shstr = ("#!/bin/csh\n"
                 "#PBS -l walltime=24:00:00\n"
                 "#PBS -l mem=6g \n"
                 "#PBS -q cmb\n")
        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "

        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"

        with open(parallel + ".sh", 'w') as f:
            f.write(shstr)

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  # Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  # Get phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            runParallel(int(args[2]))
        return

    phenotypeIndex = int(args[2])

    # Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')

    # Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    # Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i, acc1 in enumerate(snpsds[0].accessions):
        # Keep the first phenotype row with the same accession and a value.
        for j, acc2 in enumerate(phed.accessions):
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    # NOTE(review): removeAccessionIndices is handed the indices to *keep*;
    # presumably it retains exactly those - confirm.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%d accessions removed, leaving %d accessions in all."
          % (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    # Removing accessions that don't have genotypes or phenotype values
    phed.removeAccessions(phenAccIndicesToKeep)

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    # Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")

    # Writing phenotype data to CAMP format: one value per line.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = [int(value) for value in phenVals]
    with open(phenotypeFile, "w") as phenFile:
        for value in phenVals:
            phenFile.write(str(value) + "\n")

    chromosome_list = []
    positions_list = []
    scores_list = []
    mafs = []
    marfs = []
    # Writing SNP data to CAMP format and running CAMP, one chromosome at a time.
    # NOTE(review): the temporary files created here are never deleted.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        snpsd = newSnpsds[chromosome - 1]
        with open(snpsFile, "w") as sf:
            with open(posFile, "w") as pf:
                for i, snp in enumerate(snpsd.snps):
                    (marf, maf) = snpsdata.getMAF(snp)
                    marfs.append(marf)
                    mafs.append(maf)
                    # Every allele is written twice in a row.
                    double_snp = []
                    for nt in snp:
                        nt = str(nt)
                        double_snp.append(nt)
                        double_snp.append(nt)
                    sf.write("".join(double_snp) + "\n")
                    pf.write(str(snpsd.positions[i]) + "\n")

        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print("resFile,outFile,errFile,snpsFile,posFile,phenotypeFile: %s %s %s %s %s %s"
              % (resFile, outFile, errFile, snpsFile, posFile, phenotypeFile))
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile,
                            phenotypeFile, sampleNum)

        positions_list += results["positions"]
        scores_list += results["scores"]
        # NOTE(review): marfs/mafs already hold one entry per SNP written above
        # and get extra 0.5 placeholders here, yet they are indexed in lockstep
        # with positions_list when the score file is written - the alignment
        # looks doubtful; kept unchanged.
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):
                marfs.append(0.5)  # An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)

    scoreFile = outputFile + ".scores"
    with open(scoreFile, "w") as f:
        f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
        for i in range(0, len(positions_list)):
            (pos1, pos2) = positions_list[i]
            fields = [chromosome_list[i], pos1, scores_list[i], marfs[i], mafs[i], pos2]
            f.write(",".join([str(field) for field in fields]) + "\n")
Example #41
0
	def runParallel(phenotypeIndex,id=""):
		"""Write a PBS job script that runs KW.py for one phenotype and submit it.

		The command line is assembled from the option values of the enclosing
		function; the script is written to ``<parallel>.sh`` in the current
		directory and submitted with ``qsub``.
		"""
		#Cluster specific parameters
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
		phenName = phed.getPhenotypeName(phenotypeIndex)
		#Replace characters that are awkward in file and job names.
		phenName = phenName.replace("/", "_div_")
		phenName = phenName.replace("*", "_star_")
		outputFile = resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = ("#!/bin/csh\n"
			"#PBS -l walltime=100:00:00\n"
			"#PBS -l mem=4g \n"
			"#PBS -q cmb\n")

		shstr += "#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+scriptDir+"KW.py -o "+outputFile+" "
		shstr += " -a "+str(withArrayIds)+" "
		if subSample:
			shstr += " --subSample="+str(subSample)+" "
		elif onlyOriginal96:
			shstr += " --onlyOriginal96 "
		elif onlyOriginal192:
			shstr += " --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr += " --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr += " --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement:
			shstr += " --complement "
		if permTest:
			shstr += " --permTest="+str(permTest)+" "
			if savePermutations:
				shstr += " --savePermutations "

		shstr += " --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr += " --testRobustness "

		if sr:
			shstr += " --sr "
			#Use the caller-supplied file names when given and fall back to the
			#defaults otherwise (previously a supplied name left the variable unset).
			if srOutput:
				sr_output = srOutput
			else:
				sr_output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"
			shstr += " --srOutput="+str(sr_output)+" "
			if srSkipFirstRun:
				if srInput:
					sr_input = srInput
				else:
					sr_input = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(sr_input)+" "
				shstr += " --srSkipFirstRun "
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "

		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f = open(parallel+".sh", 'w')
		try:
			f.write(shstr)
		finally:
			f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")
Example #42
0
def _plotConfoundingStats_():
    """Plot bar charts of the M, A and KS permutation-statistic p-values.

    For every phenotype in categories 1-4 of
    ``phenotypeData.categories_2_phenotypes`` the last line of its
    ``KW_perm_f1_n1000_<name>.perm.stat.txt`` file is split on commas: the
    last space-separated token of the first field is taken as the M p-value,
    the second field as the A p-value and the third as the KS p-value.  One
    PNG per statistic is saved, with one colour per category, and the three
    p-value dictionaries are printed at the end.  All paths are hard-coded.
    """
    resdir = "/Network/Data/250k/tmp-bvilhjal/perm_tests/"
    phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
    stat_file_dir = "/Users/bjarni/tmp/"
    categories = [1, 2, 3, 4]
    colors = {1: "b", 2: "r", 3: "g", 4: "c"}

    print("Loading phenotype data")
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    cat_2_phen = phenotypeData.categories_2_phenotypes

    m_pvals = {}
    a_pvals = {}
    ks_pvals = {}
    for cat in categories:
        for p_i in cat_2_phen[cat]:
            phenName = phed.getPhenotypeName(p_i)
            print("Loading permutation stats data for %s" % (phenName,))
            filename = resdir + "KW_perm_f1_n1000_" + phenName + ".perm.stat.txt"
            # Close each statistics file as soon as it has been read.
            with open(filename, "r") as f:
                lines = f.readlines()
            pvals = (lines[-1].strip()).split(',')
            m_pvals[p_i] = float(pvals[0].split(" ")[-1])
            a_pvals[p_i] = float(pvals[1])
            ks_pvals[p_i] = float(pvals[2])

    # Tick labels: one per phenotype, with a one-unit gap between categories.
    x_ticks = []
    s_ticks = []
    x_pos = 1
    for cat in categories:
        for p_i in cat_2_phen[cat]:
            s_ticks.append(phed.getPhenotypeName(p_i))
            x_ticks.append(x_pos - 0.5)
            x_pos += 1
        x_pos = x_pos + 1

    def _save_bar_plot(pvals, y_label, file_name):
        """Draw one bar per phenotype, grouped and coloured by category, and save it."""
        figure = plt.figure(figsize=(14, 8))
        axes = plt.Axes(figure, [.06, .16, .91, .81])
        figure.add_axes(axes)
        x_pos = 0
        for cat in categories:
            values = [pvals[p_i] for p_i in cat_2_phen[cat]]
            plt.bar(range(x_pos, len(values) + x_pos), values, color=colors[cat])
            x_pos = x_pos + len(values) + 1
        plt.axis([0 - 0.02 * (x_pos - 1), 1.02 * (x_pos - 1), -0.02, 1.02])
        plt.xticks(x_ticks, s_ticks, size="x-small", rotation="vertical")
        plt.ylabel(y_label)
        plt.savefig(stat_file_dir + file_name, format="png")
        plt.clf()

    _save_bar_plot(m_pvals, "M stat. p-value", "confounding_M_pvalues.png")
    _save_bar_plot(a_pvals, "A stat. p-value", "confounding_A_pvalues.png")
    _save_bar_plot(ks_pvals, "KS stat. p-value", "confounding_KS_pvalues.png")

    print("%s %s %s" % (m_pvals, a_pvals, ks_pvals))
Example #43
0
def analyzeSNPs():
    """Run KW scans on three SNP data sets and plot the results per phenotype.

    Three data sets are prepared: the 250k data and the imputed 2010 data,
    both cut down with ``get_region_snpsd(5, 3140000, 3220000)``, and the
    imputed sequence SNPs read from the FLC analysis directory.  For every
    phenotype id in the FLC phenotype file, ``KW.run_kw`` is run on a deep
    copy of each data set, the three results are drawn together with
    ``RegionPlotter.plot_small_result`` and a box plot is made for the
    top-scoring marker of each result.  All paths are hard-coded to the
    author's machine.
    """
    import copy
    import KW
    import phenotypeData
    import regionPlotter as rp

    result_id = "filtered_imputed"
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"

    # Sequence-derived SNPs.
    seq_snpsd = dataParsers.parseCSVData(
        data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0]
    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')

    # 2010 data, restricted to the region.
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_na_snps()
    d2010_sd.convert_2_binary()
    d2010_sd.filter_maf_snps(0.05)
    d2010_sd = d2010_sd.get_region_snpsd(5, 3140000, 3220000)
    d2010_sd.remove_redundant_snps(w_missing=True)

    # 250k data, limited to the sequenced accessions and the same region.
    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv"
    snpsd = dataParsers.parse_snp_data(d250k_file)
    snpsd.filter_accessions(seq_snpsd.accessions)
    snpsd.convert_2_binary()
    snpsd.filter_maf_snps(0.05)
    snpsd = snpsd.get_region_snpsd(5, 3140000, 3220000)
    snpsd.remove_redundant_snps()

    # NOTE(review): remove_accessions is handed the 250k accession list;
    # presumably it keeps only those accessions - confirm.
    seq_snpsd.remove_accessions(snpsd.accessions)
    seq_snpsd.snpsFilterRare(0.05)
    seq_snpsd.onlyBinarySnps()
    # Reorder the sequence accessions to follow the 250k accession order.
    acc_map = []
    for i, acc in enumerate(seq_snpsd.accessions):
        acc_map.append((i, snpsd.accessions.index(acc)))
    seq_snpsd.orderAccessions(acc_map)
    seq_snpsd.remove_redundant_snps(w_missing=True)

    # NOW PERFORM GWAS AND PLOT RESULT!!!!
    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )
    results_colors = ['blue', 'green', 'red']
    snpsds = [snpsd, seq_snpsd, d2010_sd]
    reg_plotter = rp.RegionPlotter()

    for i in phend.phenIds:
        phen_name = str(phend.getPhenotypeName(i))
        results = []
        filtered_sds = []
        for sd in snpsds:
            # Run on a copy so the prepared data sets can be reused for the next phenotype.
            res, f_sd = KW.run_kw(copy.deepcopy(sd), phend, i, 5)
            filtered_sds.append(f_sd)
            res.negLogTransform()
            print("Got %d %d p-values from KW." % (len(res.scores), len(res.positions)))
            results.append(res)

        reg_plotter.plot_small_result(
            results,
            results_colors=results_colors,
            pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" +
            result_id + "_gwas_" + phen_name + ".pdf")

        for j, (r, sd) in enumerate(zip(results, filtered_sds)):
            if len(r.scores) != len(sd.snps):
                print("Lengths not equal? %d, %d" % (len(r.scores), len(sd.snps)))
            # Index of the highest score, used as the marker for the box plot.
            r_i = r.scores.index(max(r.scores))
            phend.plot_marker_box_plot(
                i,
                sd,
                r_i,
                pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_kw_" +
                phen_name + "_" + results_colors[j] + ".pdf",
                marker_score=r.scores[r_i])
Example #44
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=",
						"logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", 
						"kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", 
						"complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness",
						"permutationFilter="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	phenotypeRanks = False
	removeOutliers = None
	addConstant = -1
	phenotypeFileType = 1
	rFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	minMAF=0.0
	withArrayIds = 1
	boundaries = [-1,-1]
	chr=None
	parallel = None
	logTransform = False
	negate = False
	parallelAll = False
	lrt = False
	kinshipDatafile = None 
	onlyMissing = False
	onlyOriginal96 = False
	onlyOriginal192 = False
	onlyBelowLatidue = None
	complement = False

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	testRobustness = False
	permutationFilter = 0.002

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o","--rFile"):
			rFile = arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--addConstant"):
			addConstant = float(arg)
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("--onlyMissing"):
			onlyMissing = True
		elif opt in ("--onlyOriginal96"):
			onlyOriginal96 = True
		elif opt in ("--onlyOriginal192"):
			onlyOriginal192 = True
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue = float(arg)
		elif opt in ("--complement"):
			complement = True
		elif opt in ("--logTransform"):
			logTransform = True
		elif opt in ("--negate"):
			negate = True
		elif opt in ("--removeOutliers"):
			removeOutliers = float(arg)
		elif opt in ("--LRT"):
			lrt = True
		elif opt in ("-c","--chr"):
			chr = int(arg)
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--kinshipDatafile"):
			kinshipDatafile = arg
		elif opt in ("--phenotypeRanks"):
			phenotypeRanks = True
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	print "Emma is being set up with the following parameters:"
	print "output:",rFile
	print "phenotypeRanks:",phenotypeRanks
	print "withArrayId:",withArrayIds
	print "phenotypeFileType:",phenotypeFileType
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "minMAF:",minMAF
	print "LRT:",lrt
	print "delim:",delim
	print "missingval:",missingVal
	print "kinshipDatafile:",kinshipDatafile
	print "chr:",chr
	print "boundaries:",boundaries
	print "onlyMissing:",onlyMissing
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "complement:",complement
	print "negate:",negate
	print "logTransform:",logTransform
	print "addConstant:",addConstant
	print "removeOutliers:",removeOutliers
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "testRobustness:",testRobustness
	print "permutationFilter:",permutationFilter


	def runParallel(phenotypeIndex,phed):
		#Cluster specific parameters
		print phenotypeIndex
		phenName = phed.getPhenotypeName(phenotypeIndex)
		outFileName = resultDir+"Emma_"+parallel+"_"+phenName

		shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g 
#PBS -q cmb
"""

		shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
		if onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		if logTransform:
			shstr += " --logTransform "
		if negate:
			shstr += " --negate "
		if removeOutliers:
			shstr += " --removeOutliers="+str(removeOutliers)+" "
		if phenotypeRanks:
			shstr += " --phenotypeRanks "
		if testRobustness:
			shstr+=" --testRobustness "

		shstr+=" --permutationFilter="+str(permutationFilter)+" "

		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
			
		shstr += " -a "+str(withArrayIds)+" "			
		if kinshipDatafile:
			shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "			
		shstr += " --addConstant="+str(addConstant)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		if parallelAll:
			for phenotypeIndex in phed.phenIds:
				if onlyMissing:
					phenName = phed.getPhenotypeName(phenotypeIndex)
					pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
					res = None
					try:
						res = os.stat(pvalFile)

					except Exception:
						print "File",pvalFile,"does not exist."
					if res and res.st_size>0:
						print "File",pvalFile,"already exists, and is non-empty."
						if sr:
							srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
							srRes = None
							try:
								srRes = os.stat(srInput)
							except Exception:
								print "File",srInput,"does not exist."
							if srRes and srRes.st_size>0:
								print "File",srInput,"already exists, and is non-empty."
							else:
								runParallel(phenotypeIndex,phed)
							
					else:
						print "Setting up the run."
						runParallel(phenotypeIndex,phed)
											
				else:
					runParallel(phenotypeIndex,phed)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex,phed)
		return
	else:
		phenotypeIndex = int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "\nStarting program now!\n"



	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

	#Load phenotype file
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	numAcc = len(snpsds[0].accessions)

	#Removing outliers
	if removeOutliers:
		print "Remoing outliers"
		phed.naOutliers(phenotypeIndex,removeOutliers)
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	phenotype = phed.getPhenIndex(phenotypeIndex)

	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	#Checking which accessions to keep and which to remove .
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	

	print "\nFiltering accessions in genotype data:"
	#Filter accessions which do not have the phenotype value (from the genotype data).
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."
		

	print "\nNow filtering accessions in phenotype data:"
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
	if len(phed.accessions)!=len(snpsds[0].accessions):
		raise Exception

	#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)

	#Removing SNPs which are outside of boundaries.
	if chr:
		print "\nRemoving SNPs which are outside of boundaries."
		snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
		snpsds = [snpsds[chr-1]]
	
	#Ordering accessions in genotype data to fit phenotype data.
	print "Ordering genotype data accessions."
	accessionMapping = []
	i = 0
	for acc in phed.accessions:
		if acc in snpsds[0].accessions:
			accessionMapping.append((snpsds[0].accessions.index(acc),i))
			i += 1

	#print zip(accessionMapping,snpsds[0].accessions)
	print "len(snpsds[0].snps)",len(snpsds[0].snps)

	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.orderAccessions(accessionMapping)
	print "\nGenotype data has been ordered."
		
	#Converting format to 01
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
	print ""


	
	print "Checking kinshipfile:",kinshipDatafile
	
	if kinshipDatafile:  #Is there a special kinship file?
		kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

		accIndicesToKeep = []			
		#Checking which accessions to keep and which to remove (genotype data).
		sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
		sys.stdout.flush()
		for i in range(0,len(kinshipSnpsds[0].accessions)):
			acc1 = kinshipSnpsds[0].accessions[i]
			for j in range(0,len(phed.accessions)):
				acc2 = phed.accessions[j]
				if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
					accIndicesToKeep.append(i)
					break	
		print accIndicesToKeep
	
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."
	
		print "Ordering kinship data accessions."
		accessionMapping = []
		i = 0
		for acc in snpsds[0].accessions:
			if acc in kinshipSnpsds[0].accessions:
				accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
				i += 1

		print zip(accessionMapping,snpsds[0].accessions)
		print "len(snpsds[0].snps)",len(snpsds[0].snps)
		
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.orderAccessions(accessionMapping)
		print "Kinship genotype data has been ordered."

		newKinshipSnpsds = []
		sys.stdout.write("Converting data format")
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
		print ""
		kinshipSnpsds = newKinshipSnpsds

	else:
		kinshipSnpsds = newSnpsds
		

	print "Found kinship data."

	#Ordering accessions according to the order of accessions in the genotype file
#	accessionMapping = []
#	i = 0
#	for acc in snpsds[0].accessions:
#		if acc in phed.accessions:
#			accessionMapping.append((phed.accessions.index(acc),i))
#			i += 1
#	phed.orderAccessions(accessionMapping)

	
	#Negating phenotypic values
	if negate: 
		phed.negateValues(phenotypeIndex)

	#Adding a constant.
	if addConstant!=-1:
		if addConstant==0:
			addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
			addConstant = addConstant - phed.getMinValue(phenotypeIndex)
			
		print "Adding a constant to phenotype:",addConstant
		phed.addConstant(phenotypeIndex,addConstant)
	
		
	
	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotypeIndex)
	#Converting phenotypes to Ranks
	elif phenotypeRanks:
		phed.transformToRanks(phenotypeIndex)
	
	if not chr:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
	else:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])
		
	
	phenotypeName = phed.getPhenotypeName(phenotypeIndex)

	sys.stdout.flush()
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in snpsDataset.snpsDataList:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		_robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
		sys.exit(0)

	if (not sr) or (sr and not srSkipFirstRun):
		sys.stdout.write("Running Primary Emma.\n")
		sys.stdout.flush()
		pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
		res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.filterMAF()
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)