Ejemplo n.º 1
0
def getKinshipMatrix():
	#snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
	snpsDataFile="/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"
	import dataParsers,snpsdata
	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")#,debug=True)
	snps = []
	sys.stdout.write("Converting format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snps += snpsd.getSnpsData(missingVal="NA").snps
	print ""
	#snps = _sampleSNPs_(snps,100)
	print "Calculating kinship"
	K = calcKinship(snps)
	eDict = phenotypeData._getEcotypeIdToStockParentDict_()
	accessions = map(int,snpsd.accessions)
	#for et in accessions:
	#print eDict[et]
	for i in range(0,len(accessions)):
		et = accessions[i]
		info = eDict[et]
		st = str(et)+", "+str(info[0])+", "+str(info[1])+":"
		st += str(K[i][0])
		for j in range(1,i+1):
			st += ", "+str(K[i][j])
		print st
Ejemplo n.º 2
0
def getKinshipMatrix():
    #snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
    snpsDataFile = "/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv"
    import dataParsers, snpsdata
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1,
                                      deliminator=",")  #,debug=True)
    snps = []
    sys.stdout.write("Converting format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snps += snpsd.getSnpsData(missingVal="NA").snps
    print ""
    #snps = _sampleSNPs_(snps,100)
    print "Calculating kinship"
    K = calcKinship(snps)
    eDict = phenotypeData._getEcotypeIdToStockParentDict_()
    accessions = map(int, snpsd.accessions)
    #for et in accessions:
    #print eDict[et]
    for i in range(0, len(accessions)):
        et = accessions[i]
        info = eDict[et]
        st = str(et) + ", " + str(info[0]) + ", " + str(info[1]) + ":"
        st += str(K[i][0])
        for j in range(1, i + 1):
            st += ", " + str(K[i][j])
        print st
Ejemplo n.º 3
0
def _runTest_():
	"""
	Manual test: run Margarita on a region of chromosome 4 and parse one tree.

	Loads the hard-coded phenotype and SNP files, runs gwaWithTrees for
	phenotype 1 with 200 markers between positions 200000 and 350000, and then
	parses the resulting ".marg.trees" file for run 1, ARG 1, marker 1.
	All output files are placed under env.homedir + "tmp/".
	"""
	import dataParsers
	import phenotypeData

	# Phenotype data
	phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_transformed_publishable_v2.tsv"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')

	# SNPs data
	snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
	snpsds = dataParsers.parseCSVData(snpsDataFile)

	# Output locations
	psFile = env.homedir + "tmp/tree.ps"
	marg_file = env.homedir + "tmp/test"
	out_file = env.homedir + "tmp/test_out"
	rFile = env.homedir + "tmp/tree_test.r"

	# Run Margarita on the chosen chromosome (the data list is indexed from 0).
	marg = Margarita(marg_file, out_file)
	chromosome = 4
	snpsd = snpsds[chromosome - 1].getSnpsData()
	#gwaWithTrees(self, id, snpsd, phed, phenotype=0, boundaries = None, numMarkers = 100, numPerm = 500000, cutoff = 16, numArg = 50)
	marg.gwaWithTrees(marg_file, snpsd, phed, phenotype = 1, numMarkers = 200, chromosome = chromosome, boundaries = [200000, 350000], numPerm = 1, cutoff = 16, numArg = 100)

	# Select which marginal tree to parse.
	runNum, argNum, markerNum = 1, 1, 1
	treeFile = marg_file + ".marg.trees"
	marg.parseTreeFile(treeFile, rFile, psFile, runNum, argNum, markerNum)
Ejemplo n.º 4
0
def _plotKinshipDiffs_():

    filterProb = 0.2
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  # ,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

        # For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    print "Calculating the global kinship..."
    globalKinship = calcKinship(totalSNPs)
    print "done."
    normalizedGlobalKinship = globalKinship / mean(globalKinship)
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    for i in range(4, 5):  # len(snpsds)):
        chr = i + 1
        snpsd = snpsds[i]
        # pylab.subplot(5,1,chr)
        # 		pylab.figure(figsize=(18,4))
        # 		(kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000)
        # 		pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$')
        # 		(kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000)
        # 		pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$')
        # 		pylab.legend(numpoints=2,handlelen=0.005)
        # 		pylab.title("Kinship diff. chr. "+str(chr))
        # 		pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf")
        # 		pylab.clf()
        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label="ws$=300000$")
        pylab.title("Emma avg. p-value diff. 500kb on chr. " + str(chr))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label="ws$=500000$")
        pylab.title("Emma avg. p-value diff. on chr. " + str(chr))
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf")
        pylab.clf()
        gc.collect()  # Calling garbage collector, in an attempt to clean up memory..
Ejemplo n.º 5
0
def _runTest_():
	filename = "/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
	import dataParsers,snpsdata
	snpsds = dataParsers.parseCSVData(filename, format=1, deliminator=",")#,debug=True)
	snpsd = snpsdata.SNPsDataSet(snpsds,[1,2,3,4,5])
	eDict = _getEcotypeIdToStockParentDict_()
	accessions = map(int,snpsd.accessions)
	accessions.sort()
	print "ecotype_id, native_name, stock_parent"
	i = 0
	for et in accessions:
		et = int(et)
		print str(et)+", "+str(eDict[et][0])+", "+str(eDict[et][1])
Ejemplo n.º 6
0
	def run(self):
		"""
		2008-5-18
			Read the SNP data from self.input_fname, convert it to a SNPData
			object, transpose it and write the result to self.output_fname.
			Drops into pdb first when self.debug is set.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		rawSnpsDataList = dataParsers.parseCSVData(self.input_fname, withArrayIds=self.withArrayIds)
		originalSnpData = RawSnpsData_ls2SNPData(rawSnpsDataList, use_nt2number=1)
		# Intermediate structures are released as soon as they are no longer needed.
		del rawSnpsDataList
		transposedSnpData = transposeSNPData(originalSnpData)
		del originalSnpData
		transposedSnpData.tofile(self.output_fname, transform_to_numpy=0)
Ejemplo n.º 7
0
def _impute_FLC_192_():
    """
    Compare and merge the imputed FLC sequence SNPs into the 250K (192) data.

    The 250K data is restricted to the accessions of the FLC phenotype file
    and to SNPs passing filter_maf_snps(0.05); the sequence SNPs are reduced
    with onlyBinarySnps(). The data set at index 4 of the 250K SNP data list
    is then compared with, and merged with, the sequence data.
    """
    phenotype_file = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    phed = pd.readPhenotypeFile(phenotype_file)

    d250k_sd = dataParsers.parse_snp_data(
        env.home_dir + "Projects/Data/250k/250K_192_043009.csv")
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    seq_file = data_dir + "/flc_seqs_aln_imputed_snps_012710.csv"
    seq_snpsd = dataParsers.parseCSVData(seq_file)
    seq_snpsd.onlyBinarySnps()

    # Index 4 is presumably chromosome 5 -- TODO confirm.
    target_snpsd = d250k_sd.snpsDataList[4]
    target_snpsd.compareWith(seq_snpsd)
    target_snpsd.merge_data(seq_snpsd)
Ejemplo n.º 8
0
def _countVals_():
	resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/"
	phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv"
	print "Loading phenotype data"
	phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')
	phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4]
	print "total # of phenotypes:", phed.countPhenotypes()
	print "# of phenotypes analyzed:", len(phenotypeIndices)
	
	totalCounts = []
	for p_i in phenotypeIndices:
		valCount = phed.countValues(p_i)
		totalCounts.append(valCount)

	snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv"
	import dataParsers,snpsdata
	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")#,debug=True)
	snpsd = snpsdata.SNPsDataSet(snpsds,[1,2,3,4,5])
	phed.removeAccessionsNotInSNPsData(snpsd)
	
	overlappingCounts = []
	for p_i in phenotypeIndices:
		valCount = phed.countValues(p_i)
		overlappingCounts.append(valCount)


	#ecotypes_192 = phenotypeData._getFirst192Ecotypes_()
	ecotypes_192 = _get192Ecotypes_()
	ecotypes_192 = [str(e) for e in ecotypes_192]
	print "len(ecotypes_192):",len(ecotypes_192)
	print ecotypes_192
	phed.filterAccessions(ecotypes_192)

	filename = resdir+"phen_value_count_new_data_012509_v2.txt"
	f = open(filename,"w")
	f.write("Phenotype,  total_count, overlapping_count, 192_overlap_count\n")
	
	for i in range(0,len(phenotypeIndices)):
		p_i = phenotypeIndices[i]
		try:
			phenName = phed.getPhenotypeName(p_i)
			valCount = phed.countValues(p_i)
			f.write(str(phenName)+", "+str(totalCounts[i])+", "+str(overlappingCounts[i])+", "+str(valCount)+"\n")
		except Exception:
			print "\nPhenotype index", p_i, "failed."

	f.close()
def plotHaplotypes(chr, startPos, endPos):
    snpsd = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")[chr - 1]
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    snpsd = snpsd.getSnpsData()
    newSnps = []
    positions = []
    for i in range(0, len(snpsd.positions)):
        pos = snpsd.positions[i]
        if pos > endPos:
            break
        elif pos >= startPos:
            newSnps.append(snpsd.snps[i])
            positions.append(snpsd.positions[i])

    print "calculating the kinship"
    K = Emma.calcKinship(newSnps)
    #print "K:",K
    Z = hc.average(K)
    #print "Z:",Z
    import pylab
    #hc.leaders(Z)
    dend_dict = hc.dendrogram(Z, labels=snpsd.accessions)
    new_acc_order = dend_dict['ivl']
    print new_acc_order
    print snpsd.accessions
    pylab.savefig("/Users/bjarni/tmp/FRI_tree.pdf", format='pdf')
    #cluster to get ordering??

    acc_mapping = []
    for acc in snpsd.accessions:
        i = new_acc_order.index(acc)
        acc_mapping.append(i)

    snps = []
    for snp in newSnps:
        newSNP = [0] * len(snp)
        for (nt, i) in zip(snp, acc_mapping):
            newSNP[i] = nt
        snps.append(newSNP)

    snps = sp.array(snps)
    pylab.matshow(snps.transpose())
    pylab.savefig("/Users/bjarni/tmp/FRI_haplotype.pdf", format='pdf')
def plot_250k_Tree(chr=None, startPos=None, endPos=None):
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    import Emma
    import pylab
    import phenotypeData
    e_dict = phenotypeData._getEcotypeIdToStockParentDict_()
    snpsds = dataParsers.parseCSVData(
        "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")
    snps = []
    for snpsd in snpsds:
        snps += snpsd.getSnpsData().snps
    snps = sampleSNPs(snps, 100000, False)
    labels = []
    for acc in snpsds[0].accessions:
        try:
            s = unicode(e_dict[int(acc, )][0], 'iso-8859-1')
        except Exception, err_s:
            print err_s
            print e_dict[int(acc)][0]
            s = acc
        labels.append(s)
def getLerAndColAccessions(snpsds=None, asFactors=False):
    """
    Split accessions by their allele at two fixed marker positions.

    Uses the data set at index 3 of snpsds (parsed from the default 250K file
    when snpsds is empty or None) and looks at the SNPs at positions 268809
    ("ler") and 269962 ("col"). Scanning stops at the first position greater
    than the "col" position.

    Returns (ler_factor, col_factor) -- one 0/1 value per accession, 0 where
    the allele equals 0 -- when asFactors is true; otherwise returns the two
    lists of accessions whose allele is not 0, as (ler, col).
    """
    if not snpsds:
        snpsds = dataParsers.parseCSVData(
            "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")
    snpsd = snpsds[3]  #.getSnpsData()
    ler_pos = 268809
    col_pos = 269962
    ler_accessions, ler_factor = [[], []], []
    col_accessions, col_factor = [[], []], []

    def record_alleles(alleles, groups, factor):
        # Append every accession to group 0 (allele == 0) or group 1 (anything
        # else) and note the group number in `factor`.
        for j, allele in enumerate(alleles):
            group = 0 if allele == 0 else 1
            groups[group].append(snpsd.accessions[j])
            factor.append(group)

    for i, pos in enumerate(snpsd.positions):
        if pos > col_pos:
            break
        if pos == ler_pos:
            record_alleles(snpsd.snps[i], ler_accessions, ler_factor)
        elif pos == col_pos:
            record_alleles(snpsd.snps[i], col_accessions, col_factor)

    if asFactors:
        return (ler_factor, col_factor)
    return (ler_accessions[1], col_accessions[1])
Ejemplo n.º 12
0
def _test_():
	import dataParsers
	snpsds1 = dataParsers.parseCSVData("149_v1.csv", deliminator=",")
	snpsds2 = dataParsers.parseCSVData("384.csv", deliminator=",")	
	merge(snpsds1,snpsds2,unionType=1,priority=1)
	print snpsds1[0].positions
Ejemplo n.º 13
0
def _run_():
	"""
	Command line entry point: export phenotype data to a tab-delimited file.

	Parses sys.argv with getopt, optionally restricts and reorders the
	phenotype data to match the accessions of a genotype file
	(--orderByGenotypeFile) and writes the result to the file given with -o.
	Exits with status 2 when no arguments are given, on an unknown option or
	when the output file name is missing.

	NOTE(review): part of this function (between the username prompt and the
	removal of phenotype ids) is missing from this copy of the source; the
	code that creates `phenData` is therefore not visible here.
	"""
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["phenotypeIDs=","rawPhenotypes","onlyBinary", "onlyCategorical", "onlyQuantitative", "onlyReplicates", "delim=", "missingval=", "help","includeSD","orderByGenotypeFile=", "onlyPublishable"]
	try:
		# NOTE(review): "h" appears twice in the short option string ("h:" and "h").
		opts, args = getopt.getopt(sys.argv[1:], "p:u:h:o:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)

	# Defaults for all options.
	output_fname = None
	phenotypeIDs = None
	delim = ","
	missingVal = "NA"
	rawPhenotypes = False
	onlyBinary = False
	onlyCategorical = False
	onlyQuantitative = False
	onlyReplicates = False
	includeSD = False
	onlyPublishable = False
	genotypeFile = None
	help = 0
	passwd = None
	user=None
	host="papaya.usc.edu"
	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-o",):
			output_fname = arg
		elif opt in ("-u",):
			user = arg
		elif opt in ("-p",):
			passwd = arg
		# NOTE(review): this branch cannot be reached, "-h" is already handled as help above.
		elif opt in ("-h",):
			host = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("-d","--delim"):
			delim = arg
		# NOTE(review): the parenthesized strings below are plain strings, not tuples,
		# so `in` performs a substring test rather than a membership test.
		elif opt in ("--rawPhenotypes"):
			rawPhenotypes = True
		elif opt in ("--onlyBinary"):
			onlyBinary = True
		elif opt in ("--onlyCategorical"):
			onlyCategorical = True
		elif opt in ("--onlyQuantitative"):
			onlyQuantitative = True
		elif opt in ("--onlyReplicates"):
			onlyReplicates = True
		elif opt in ("--includeSD"):
			includeSD = True
		elif opt in ("--onlyPublishable"):
			onlyPublishable = True
		elif opt in ("--phenotypeIDs"):
			# Comma separated list of integer phenotype ids.
			phenotypeIDs = []
			for num in arg.split(","):
				phenotypeIDs.append(int(num))
		elif opt in ("--orderByGenotypeFile"):
			genotypeFile = arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if not output_fname:
		# The bare expression on the next line has no effect.
		output_fname
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)


	if not user:
		# NOTE(review): the next line is garbled in this copy of the source (text
		# between the username prompt and the phenotype removal message is
		# missing); restore it from the original file before running.
		sys.stdout.write("Username: "******"Remoing phenotypes."
		phenData.removePhenotypeIDs(phenotypeIDs)
	
	#Sort in correct order.
	if genotypeFile:
		snpsds = dataParsers.parseCSVData(genotypeFile, format=1, deliminator=delim, missingVal=missingVal)
		print "Removing accessions which are not in the genotype file."
		# Indices of phenotype accessions that also occur in the first genotype data set.
		indicesToKeep = []
		for i in range(0,len(phenData.accessions)):
			if phenData.accessions[i] in snpsds[0].accessions:
				indicesToKeep.append(i)
		phenData.removeAccessions(indicesToKeep)

		print "Ordering accessions to match the genotype file order"
		# Pairs of (current index in the phenotype data, rank in genotype file order).
		associationMapping = []
		j = 0
		for acc in snpsds[0].accessions:
			if acc in phenData.accessions:
				associationMapping.append((phenData.accessions.index(acc),j))
				j += 1
	
		phenData.orderAccessions(associationMapping)
			
		#Output phenotypes to file.
	phenData.writeToFile(output_fname, delimiter='\t')
Ejemplo n.º 14
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=",
						"logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", 
						"kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", 
						"complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness",
						"permutationFilter="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	phenotypeRanks = False
	removeOutliers = None
	addConstant = -1
	phenotypeFileType = 1
	rFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	minMAF=0.0
	withArrayIds = 1
	boundaries = [-1,-1]
	chr=None
	parallel = None
	logTransform = False
	negate = False
	parallelAll = False
	lrt = False
	kinshipDatafile = None 
	onlyMissing = False
	onlyOriginal96 = False
	onlyOriginal192 = False
	onlyBelowLatidue = None
	complement = False

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	testRobustness = False
	permutationFilter = 0.002

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o","--rFile"):
			rFile = arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--addConstant"):
			addConstant = float(arg)
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("--onlyMissing"):
			onlyMissing = True
		elif opt in ("--onlyOriginal96"):
			onlyOriginal96 = True
		elif opt in ("--onlyOriginal192"):
			onlyOriginal192 = True
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue = float(arg)
		elif opt in ("--complement"):
			complement = True
		elif opt in ("--logTransform"):
			logTransform = True
		elif opt in ("--negate"):
			negate = True
		elif opt in ("--removeOutliers"):
			removeOutliers = float(arg)
		elif opt in ("--LRT"):
			lrt = True
		elif opt in ("-c","--chr"):
			chr = int(arg)
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--kinshipDatafile"):
			kinshipDatafile = arg
		elif opt in ("--phenotypeRanks"):
			phenotypeRanks = True
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	print "Emma is being set up with the following parameters:"
	print "output:",rFile
	print "phenotypeRanks:",phenotypeRanks
	print "withArrayId:",withArrayIds
	print "phenotypeFileType:",phenotypeFileType
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "minMAF:",minMAF
	print "LRT:",lrt
	print "delim:",delim
	print "missingval:",missingVal
	print "kinshipDatafile:",kinshipDatafile
	print "chr:",chr
	print "boundaries:",boundaries
	print "onlyMissing:",onlyMissing
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "complement:",complement
	print "negate:",negate
	print "logTransform:",logTransform
	print "addConstant:",addConstant
	print "removeOutliers:",removeOutliers
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "testRobustness:",testRobustness
	print "permutationFilter:",permutationFilter


	def runParallel(phenotypeIndex,phed):
		#Cluster specific parameters
		print phenotypeIndex
		phenName = phed.getPhenotypeName(phenotypeIndex)
		outFileName = resultDir+"Emma_"+parallel+"_"+phenName

		shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g 
#PBS -q cmb
"""

		shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
		shstr += "set phenotypeName="+parallel+"\n"
		shstr += "set phenotype="+str(phenotypeIndex)+"\n"
		shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
		if onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		if logTransform:
			shstr += " --logTransform "
		if negate:
			shstr += " --negate "
		if removeOutliers:
			shstr += " --removeOutliers="+str(removeOutliers)+" "
		if phenotypeRanks:
			shstr += " --phenotypeRanks "
		if testRobustness:
			shstr+=" --testRobustness "

		shstr+=" --permutationFilter="+str(permutationFilter)+" "

		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
			
		shstr += " -a "+str(withArrayIds)+" "			
		if kinshipDatafile:
			shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "			
		shstr += " --addConstant="+str(addConstant)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		if parallelAll:
			for phenotypeIndex in phed.phenIds:
				if onlyMissing:
					phenName = phed.getPhenotypeName(phenotypeIndex)
					pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
					res = None
					try:
						res = os.stat(pvalFile)

					except Exception:
						print "File",pvalFile,"does not exist."
					if res and res.st_size>0:
						print "File",pvalFile,"already exists, and is non-empty."
						if sr:
							srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
							srRes = None
							try:
								srRes = os.stat(srInput)
							except Exception:
								print "File",srInput,"does not exist."
							if srRes and srRes.st_size>0:
								print "File",srInput,"already exists, and is non-empty."
							else:
								runParallel(phenotypeIndex,phed)
							
					else:
						print "Setting up the run."
						runParallel(phenotypeIndex,phed)
											
				else:
					runParallel(phenotypeIndex,phed)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex,phed)
		return
	else:
		phenotypeIndex = int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "\nStarting program now!\n"



	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

	#Load phenotype file
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	numAcc = len(snpsds[0].accessions)

	#Removing outliers
	if removeOutliers:
		print "Remoing outliers"
		phed.naOutliers(phenotypeIndex,removeOutliers)
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()

	phenotype = phed.getPhenIndex(phenotypeIndex)

	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	#Checking which accessions to keep and which to remove .
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	

	print "\nFiltering accessions in genotype data:"
	#Filter accessions which do not have the phenotype value (from the genotype data).
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."
		

	print "\nNow filtering accessions in phenotype data:"
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values

	print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
	if len(phed.accessions)!=len(snpsds[0].accessions):
		raise Exception

	#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)

	#Removing SNPs which are outside of boundaries.
	if chr:
		print "\nRemoving SNPs which are outside of boundaries."
		snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
		snpsds = [snpsds[chr-1]]
	
	#Ordering accessions in genotype data to fit phenotype data.
	print "Ordering genotype data accessions."
	accessionMapping = []
	i = 0
	for acc in phed.accessions:
		if acc in snpsds[0].accessions:
			accessionMapping.append((snpsds[0].accessions.index(acc),i))
			i += 1

	#print zip(accessionMapping,snpsds[0].accessions)
	print "len(snpsds[0].snps)",len(snpsds[0].snps)

	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.orderAccessions(accessionMapping)
	print "\nGenotype data has been ordered."
		
	#Converting format to 01
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
	print ""


	
	print "Checking kinshipfile:",kinshipDatafile
	
	if kinshipDatafile:  #Is there a special kinship file?
		kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

		accIndicesToKeep = []			
		#Checking which accessions to keep and which to remove (genotype data).
		sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
		sys.stdout.flush()
		for i in range(0,len(kinshipSnpsds[0].accessions)):
			acc1 = kinshipSnpsds[0].accessions[i]
			for j in range(0,len(phed.accessions)):
				acc2 = phed.accessions[j]
				if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
					accIndicesToKeep.append(i)
					break	
		print accIndicesToKeep
	
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."
	
		print "Ordering kinship data accessions."
		accessionMapping = []
		i = 0
		for acc in snpsds[0].accessions:
			if acc in kinshipSnpsds[0].accessions:
				accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
				i += 1

		print zip(accessionMapping,snpsds[0].accessions)
		print "len(snpsds[0].snps)",len(snpsds[0].snps)
		
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.orderAccessions(accessionMapping)
		print "Kinship genotype data has been ordered."

		newKinshipSnpsds = []
		sys.stdout.write("Converting data format")
		for snpsd in kinshipSnpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
		print ""
		kinshipSnpsds = newKinshipSnpsds

	else:
		kinshipSnpsds = newSnpsds
		

	print "Found kinship data."

	#Ordering accessions according to the order of accessions in the genotype file
#	accessionMapping = []
#	i = 0
#	for acc in snpsds[0].accessions:
#		if acc in phed.accessions:
#			accessionMapping.append((phed.accessions.index(acc),i))
#			i += 1
#	phed.orderAccessions(accessionMapping)

	
	#Negating phenotypic values
	if negate: 
		phed.negateValues(phenotypeIndex)

	#Adding a constant.
	if addConstant!=-1:
		if addConstant==0:
			addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
			addConstant = addConstant - phed.getMinValue(phenotypeIndex)
			
		print "Adding a constant to phenotype:",addConstant
		phed.addConstant(phenotypeIndex,addConstant)
	
		
	
	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotypeIndex)
	#Converting phenotypes to Ranks
	elif phenotypeRanks:
		phed.transformToRanks(phenotypeIndex)
	
	if not chr:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
	else:
		snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
		kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])
		
	
	phenotypeName = phed.getPhenotypeName(phenotypeIndex)

	sys.stdout.flush()
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in snpsDataset.snpsDataList:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		_robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
		sys.exit(0)

	if (not sr) or (sr and not srSkipFirstRun):
		sys.stdout.write("Running Primary Emma.\n")
		sys.stdout.flush()
		pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
		res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.filterMAF()
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.filterMAF()
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Ejemplo n.º 15
0
def _plotKW_():
    """
	Analyze how population structure affects KW.
	"""
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")  # ,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

        # For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    # globalKinship = calcKinship(totalSNPs)
    gc.collect()  # Calling garbage collector, in an attempt to clean up memory..

    # chr = 1
    # for snpsd in snpsds:

    snpsd = snpsds[3]

    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])  # runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)")

    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k, snpsd.snps[200:1400])  # runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)")

    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)")

    (pvals, new_positions, acc_groups) = get_KW_pvals(
        snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW"
    )
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            # print e_i
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list

    log_pvals = []
    for pval in pvals:
        # print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
Ejemplo n.º 16
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=",
        "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=",
        "missingval=", "withArrayId=", "debug", "report", "help",
        "heterozygous2NA", "first192", "removeLer", "removeCol"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    inputFile = args[0]
    output_fname = None
    delim = ","
    missingVal = "NA"
    comparisonFile = None
    maxMissing = 1.0
    maxError = 1.0
    removeEcotypes = None
    removeArray = None
    removeIdentical = False
    onlyCommon = False
    debug = None
    report = None
    help = 0
    withArrayIds = 1
    first96 = False
    first192 = False
    heterozygous2NA = False
    removeLer = False
    removeCol = False

    for opt, arg in opts:
        if opt in ('-o'):
            output_fname = arg
        elif opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("--comparisonFile"):
            comparisonFile = arg
        elif opt in ("--maxError"):
            maxError = float(arg)
        elif opt in ("--maxMissing"):
            maxMissing = float(arg)
        elif opt in ("--heterozygous2NA"):
            heterozygous2NA = True
        elif opt in ("--removeEcotypeId"):
            removeEcotypes = arg.split(",")
            removeEcotypes = map(int, removeEcotypes)
        elif opt in ("--removeArrayId"):
            removeArray = int(arg)
        elif opt in ("--removeIdentical"):
            removeIdentical = True
        elif opt in ("--onlyCommon"):
            onlyCommon = True
        elif opt in ("--first96"):
            first96 = True
        elif opt in ("--first192"):
            first192 = True
        elif opt in ("--removeLer"):
            removeLer = True
        elif opt in ("--removeCol"):
            removeCol = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug"):
            debug = 1
        elif opt in ("-r", "--report"):
            report = 1
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2 or withArrayIds == 3

    import dataParsers
    snpsds = dataParsers.parseCSVData(inputFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=waid1)

    accessionsToRemove = []
    arraysToRemove = None

    if first96:
        import dataParsers
        d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',
                                                        user="******",
                                                        passwd="bamboo123")
        ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',
                                                       user="******",
                                                       passwd="bamboo123")
        print "Dictionaries loaded"
        names = []
        first96Names = []
        for i in range(0, len(snpsds[0].accessions)):
            ecotype = snpsds[0].accessions[i]
            arrayID = snpsds[0].arrayIds[i]
            names.append((arrayID, ecotd[ecotype], ecotype))
            if int(d[ecotype][0]) > 97 or int(d[ecotype][0]) < 0:
                accessionsToRemove.append(ecotype)
            else:
                first96Names.append(
                    (arrayID, d[ecotype][1], d[ecotype][0], ecotype))

        first96Names.sort()
        print "First 96 accessions, len:", len(first96Names), ":"
        for name in first96Names:
            print name
        names.sort()
        print "All accessions:"
        for name in names:
            print name
    elif first192:
        import phenotypeData
        ecotypes_192 = map(str, phenotypeData._getFirst192Ecotypes_())
        print ecotypes_192, snpsds[0].accessions
        for acc in snpsds[0].accessions:
            if acc not in ecotypes_192:
                accessionsToRemove.append(acc)
        print "found", len(ecotypes_192), '"192" ecotypes... removing', len(
            accessionsToRemove), "ecotypes."

    if removeLer:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[0]
    if removeCol:
        import analyzeHaplotype as ah
        accessionsToRemove += ah.getLerAndColAccessions(snpsds)[1]

    #Retrieve comparison list of accessions.  (Error rates for accessions)
    if (removeIdentical or maxError < 1.0) and comparisonFile:
        sys.stderr.write("Loading comparison file:")
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        res = []
        sys.stderr.write("Comparing accessions.")
        for i in range(0, len(snpsds)):
            res.append(snpsds[i].compareWith(snpsds2[i],
                                             withArrayIds=withArrayIds,
                                             verbose=False,
                                             heterozygous2NA=heterozygous2NA))
            sys.stderr.write(".")
        sys.stderr.write("\n")

        totalAccessionCounts = [0] * len(res[0][2])
        accErrorRate = [0] * len(res[0][2])
        for i in range(0, len(snpsds)):
            r = res[i]
            for j in range(0, len(r[2])):
                totalAccessionCounts[j] += r[6][j]
                accErrorRate[j] += r[3][j] * float(r[6][j])

        for i in range(0, len(accErrorRate)):
            accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

        accErrAndID = []
        if 0 < withArrayIds < 3:
            for i in range(0, len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i], r[5][i]))
        else:
            for i in range(0, len(r[2])):
                accErrAndID.append((accErrorRate[i], r[2][i]))
        accErrAndID.sort()
        accErrAndID.reverse()

    #Figure out which accessions are too erroraneous
    if maxError < 1.0 and comparisonFile:
        if withArrayIds:
            arraysToRemove = []
            for (error, ecotype, array) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)

        else:
            for (error, ecotype) in accErrAndID:
                if error > maxError:
                    accessionsToRemove.append(ecotype)

    if removeIdentical and comparisonFile and withArrayIds:
        print "Locating identical accessions"
        accErrAndID.sort()
        if not arraysToRemove:
            arraysToRemove = []
        for accession in set(snpsds[0].accessions):
            if snpsds[0].accessions.count(accession) > 1:
                found = 0
                for (error, ecotype, array) in accErrAndID:
                    if ecotype == accession:
                        if found > 0:
                            accessionsToRemove.append(ecotype)
                            arraysToRemove.append(array)
                        found += 1

    if onlyCommon and comparisonFile:
        print "Locating accessions which are not shared"
        snpsds2 = dataParsers.parseCSVData(comparisonFile,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)
        #print snpsds2[0].accessions,'\n',snpsds[0].accessions,'\n',len(set(snpsds2[0].accessions).intersection(set(snpsds[0].accessions)))
        if not arraysToRemove:
            arraysToRemove = []
        for i in range(0, len(snpsds[0].accessions)):
            acc = snpsds[0].accessions[i]
            if not acc in snpsds2[0].accessions:
                accessionsToRemove.append(acc)
                if 0 < withArrayIds < 3:
                    arraysToRemove.append(snpsds[0].arrayIds[i])

    if maxMissing < 1.0:
        missingCounts = [0] * len(snpsds[0].accessions)
        numSnps = 0
        for snpsd in snpsds:
            mc = snpsd.accessionsMissingCounts()
            numSnps += len(snpsd.positions)
            for i in range(0, len(snpsds[0].accessions)):
                missingCounts[i] += mc[i]

        missingRates = []
        if withArrayIds:
            arraysToRemove = []
            for i in range(0, len(snpsds[0].accessions)):
                missingRates.append(
                    (missingCounts[i] / float(numSnps),
                     snpsds[0].accessions[i], snpsds[0].arrayIds[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype, array) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)
                    arraysToRemove.append(array)
        else:
            for i in range(0, len(snpsds[0].accessions)):
                missingRates.append((missingCounts[i] / float(numSnps),
                                     snpsds[0].accessions[i]))
            missingRates.sort()
            missingRates.reverse()
            for (mrate, ecotype) in missingRates:
                if mrate > maxMissing:
                    accessionsToRemove.append(ecotype)

    if removeEcotypes:
        for removeEcotype in removeEcotypes:
            accessionsToRemove.append(str(int(removeEcotype)))
        print "Removing", len(accessionsToRemove), "accessions."
    if removeArray:
        if not arraysToRemove:
            arraysToRemove = []
        arraysToRemove.append(str(removeArray))
        print "Removing", len(arraysToRemove), " arrays."

    numAccessions = len(snpsds[0].accessions)
    sys.stderr.write("Removing accessions.")
    for snpsd in snpsds:
        snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove)
        sys.stderr.write(".")
    print "\n", (
        numAccessions - len(snpsds[0].accessions)
    ), "accessions out of " + str(numAccessions) + " were removed."

    import snpsdata
    snpsdata.writeRawSnpsDatasToFile(output_fname,
                                     snpsds,
                                     chromosomes=[1, 2, 3, 4, 5],
                                     deliminator=delim,
                                     missingVal=missingVal,
                                     withArrayIds=waid1)
Ejemplo n.º 17
0
def _run_():
	import os
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)	   

	long_options_list = ["id=", "chr=", "numARG=", "numMarkers=", "numPerm=", "smartCutoff=", "BoundaryStart=", "BoundaryEnd=", "binary", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", "debug", "parallel=", "parallelAll", "help", "scoreFile="]
	
	try:
		opts, args = getopt.getopt(sys.argv[1:], "i:s:c:d:m:a:bh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	import tempfile
	tempfile.tempdir = '/tmp'
	(fId, id) = tempfile.mkstemp()
	os.close(fId)		
	scoreFile = None
	chr = None
	numARG = 30
	numMarkers = 100
	numPerm = 0
	smartCutoff = 10
	binary = False
	delim = ","
	missingVal = "NA"
	debug = None
	report = None
	help = 0
	withArrayId = 0
	boundaries = [ - 1, - 1]
	phenotypeFileType = 1
	parallel = None
	parallelAll = False
	snpsDataFile = None

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-i", "--id"):
			id = '/tmp/' + arg
		elif opt in ("-s", "--scoreFile"):
			scoreFile = arg
		elif opt in ("-c", "--chr"):
			chr = int(arg)
		elif opt in ("--numARG"):
			numARG = int(arg)
		elif opt in ("--numMarkers"):
			numMarkers = int(arg)
		elif opt in ("--numPerm"):
			numPerm = int(arg)
		elif opt in ("--BoundaryStart"):
			boundaries[0] = int(arg)
		elif opt in ("--BoundaryEnd"):
			boundaries[1] = int(arg)
		elif opt in ("--smartCutoff"):
			smartCutoff = int(arg)
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType = int(arg)
		elif opt in ("--binary"):
			binary = True
		elif opt in ("--parallel"):
			parallel = arg
		elif opt in ("--parallelAll"):
			parallelAll = True
		elif opt in ("-d", "--delim"):
			delim = arg
		elif opt in ("-m", "--missingval"):
			missingVal = arg	
		elif opt in ("-a", "--withArrayId"):
			withArrayId = int(arg)
		elif opt in ("-b", "--debug"):
			debug = 1

	if len(args) < 3 and not parallel:
		if help == 0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	if boundaries[0] == boundaries[1] and boundaries[0] == - 1:
		boundaries = None

	margFile = id + ".marg"
	outFile = margFile + ".out"		


	def runParallel(phenotypeIndex):
		#Cluster specific parameters
		#margdir = '/home/cmb-01/bvilhjal/Projects/Python-snps/'
		resultDir = env.results_dir #'/home/cmb-01/bvilhjal/results/'
		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName = phed.getPhenotypeName(phenotypeIndex)
		phenName = phenName.replace("/", "_div_")
		phenName = phenName.replace("*", "_star_")
 
		outFileName = resultDir + "Marg_" + parallel + "_" + phenName
		scoreFile = outFileName + ".score" 

		shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
#PBS -l mem=4g
#PBS -q cmb
"""

		shstr += "#PBS -N M" + phenName + "_" + parallel + "\n"
		#shstr += "(python " + margdir + "margarita.py "
		shstr += "(python " + env.script_dir + "margarita.py "
		if phed.isBinary(phenotypeIndex):
			shstr += " --binary "
		shstr += " -s " + scoreFile
		shstr += " -a " + str(withArrayId) + " "
		shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
		shstr += "> " + outFileName + ".out) >& " + outFileName + ".err\n"
		
		f = open(parallel + ".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub " + parallel + ".sh ")
		
	#Nested function ends

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		if len(args) > 2:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex)
			return
		
		else:
			snpsDataFile = args[0]
			if not parallelAll:
				phenotypeIndex = int(args[1])
				runParallel(phenotypeIndex)
				return

		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		for phenotypeIndex in phed.phenIds:
			runParallel(phenotypeIndex)
		return

	phenotypeIndex = int(args[2])


	#Print out information about this run...
	print "Preparing a blended margarita...."
	print "Num ARG:", numARG
	print "Num Markers:", numMarkers
	print "Num Permutations:", numPerm
	print "Smart cutoff:", smartCutoff
	print "Binary:", binary
	print "ScoreFile:", scoreFile


	
	import dataParsers, snpsdata, phenotypeData
	#phenotypeFile = "/Users/bjarni/Projects/Python-snps/tinaPhenos_041808.csv"
	if phenotypeFileType == 1:
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	elif phenotypeFileType == 2:
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, accessionDecoder = dataParsers.accessionName2EcotypeId, type = 2)	

	snpsds = dataParsers.parseCSVData(snpsDataFile, deliminator = delim, missingVal = missingVal, withArrayIds = bool(withArrayId)) #Get SNPs data 



	marg = Margarita(margFile, outFile, numARG, numMarkers, numPerm, smartCutoff)

	if chr:
		snpsd = snpsds[chr - 1].getSnpsData()
		marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr, binary = binary)
	else:
		scoreStr = ""
		for chr in [0, 1, 2, 3, 4]:
			snpsd = snpsds[chr].getSnpsData()
			(newRStr, newScoreStr, permPvals) = marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr + 1, binary = binary)
			scoreStr += newScoreStr

		f = open(scoreFile, 'w')
		f.write(scoreStr)
		f.close()
Ejemplo n.º 18
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "phenotypeIDs=", "rawPhenotypes", "onlyBinary", "onlyCategorical",
        "onlyQuantitative", "onlyReplicates", "delim=", "missingval=", "help",
        "includeSD", "orderByGenotypeFile=", "onlyPublishable"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "p:u:h:o:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    output_fname = None
    phenotypeIDs = None
    delim = ","
    missingVal = "NA"
    rawPhenotypes = False
    onlyBinary = False
    onlyCategorical = False
    onlyQuantitative = False
    onlyReplicates = False
    includeSD = False
    onlyPublishable = False
    genotypeFile = None
    help = 0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", ):
            output_fname = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("--rawPhenotypes"):
            rawPhenotypes = True
        elif opt in ("--onlyBinary"):
            onlyBinary = True
        elif opt in ("--onlyCategorical"):
            onlyCategorical = True
        elif opt in ("--onlyQuantitative"):
            onlyQuantitative = True
        elif opt in ("--onlyReplicates"):
            onlyReplicates = True
        elif opt in ("--includeSD"):
            includeSD = True
        elif opt in ("--onlyPublishable"):
            onlyPublishable = True
        elif opt in ("--phenotypeIDs"):
            phenotypeIDs = []
            for num in arg.split(","):
                phenotypeIDs.append(int(num))
        elif opt in ("--orderByGenotypeFile"):
            genotypeFile = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

        #Retrieve phenotype data.
    phenData = getPhenotypes(onlyBinary=onlyBinary,
                             onlyQuantitative=onlyQuantitative,
                             onlyCategorical=onlyCategorical,
                             onlyReplicates=onlyReplicates,
                             includeSD=includeSD,
                             rawPhenotypes=rawPhenotypes,
                             onlyPublishable=onlyPublishable)

    if phenotypeIDs:
        print "Remoing phenotypes."
        phenData.removePhenotypeIDs(phenotypeIDs)

    #Sort in correct order.
    if genotypeFile:
        snpsds = dataParsers.parseCSVData(genotypeFile,
                                          format=1,
                                          deliminator=delim,
                                          missingVal=missingVal)
        print "Removing accessions which are not in the genotype file."
        indicesToKeep = []
        for i in range(0, len(phenData.accessions)):
            if phenData.accessions[i] in snpsds[0].accessions:
                indicesToKeep.append(i)
        phenData.removeAccessions(indicesToKeep)

        print "Ordering accessions to match the genotype file order"
        associationMapping = []
        j = 0
        for acc in snpsds[0].accessions:
            if acc in phenData.accessions:
                associationMapping.append((phenData.accessions.index(acc), j))
                j += 1

        phenData.orderAccessions(associationMapping)

        #Output phenotypes to file.
    phenData.writeToFile(output_fname, delimiter='\t', with_pid=True)
Ejemplo n.º 19
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "monomorphic", "onlyBinary", "delim=", 
						"missingval=", "withArrayId=", "callProbFile=", "minMAF=", "minCallProb=", "debug", 
						"report", "help", "output01Format", "filterRegion="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	inputFile = args[0]
	output_fname = None
	delim = ","
	missingVal = "NA"
	comparisonFile = None
	maxMissing = 1.0
	maxError = 1.0
	monomorphic = False
	debug = None
	report = None
	help = 0
	withArrayIds = 0
	minCallProb=None
	minMAF=None
	callProbFile = None
	onlyBinary = False
	output01Format = False
	filterRegion = False
	startPos = None
	endPos = None
	chromosome = None
	chromosomes=[1,2,3,4,5]
	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-o",):
			output_fname = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("--comparisonFile"):
			comparisonFile = arg
		elif opt in ("--maxError"):
			maxError = float(arg)
		elif opt in ("--maxMissing"):
			maxMissing = float(arg)
		elif opt in ("--minCallProb"):
			minCallProb = float(arg)
		elif opt in ("--minMAF"):
			minMAF = float(arg)
		elif opt in ("--callProbFile"):
			callProbFile = arg
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-b", "--debug"):
			debug = 1
		elif opt in ("-r", "--report"):
			report = 1
		elif opt in ("--monomorphic"):
			monomorphic = True
		elif opt in ("--onlyBinary"):
			onlyBinary = True
		elif opt in ("--output01Format"):
			output01Format = True
		elif opt in ("--filterRegion"):
			filterRegion = True
			region = arg.split(",")
			region = map(int,region)
			chromosome = region[0]
			startPos = region[1]
			endPos = region[2]
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if not output_fname:
		output_fname
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)

	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2

	if callProbFile and minCallProb:
		#Read prob file into SNPsdatas.
		#snpsds = dataParsers.parseCSVDataWithCallProb(inputFile, callProbFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
		pass
	else:
		snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
	
        #Filtering monomorphic
	if monomorphic:
		print "Filtering monomorphic SNPs"
		for snpsd in snpsds:
			print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	if onlyBinary or output01Format:
		print "Filtering non-binary SNPs"
		for snpsd in snpsds:
			print "Removed", str(snpsd.onlyBinarySnps()),"Snps"

	#Filtering missing values
	if maxMissing<1.0 and maxMissing>=0.0:
		print "Filtering SNPs with missing values"
		numAccessions = len(snpsds[0].accessions)
		for snpsd in snpsds:
			print "Removed", str(snpsd.filterMissingSnps(int(maxMissing*numAccessions))),"Snps"

	#Filtering bad SNPs
	if comparisonFile and maxError<1.0:
		print "Filtering erroneous SNPs, with maxError=",maxError
		snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
		for i in range(0,len(snpsds)):
			snpsds[i].filterBadSnps(snpsds2[i],maxError)
			

	if minMAF:
		print "Removing SNPs withe MAF <",minMAF
		for snpsd in snpsds:
			print "Removed", str(snpsd.filterMinMAF(minMAF)),"Snps"

	#Output specific region..
	if filterRegion:
		chromosomes = [chromosome]
		snpsd = snpsds[chromosome-1]
		snpsd.filterRegion(startPos,endPos)
		snpsds = [snpsd]
		
		
	#Converting lousy calls to NAs
	if callProbFile and minCallProb:
		print "Converting base calls with call prob. lower than",minCallProb,"to NAs"
		#To avoid memory problems, the file/data is processed one line at a time.
		gInFile = open(inputFile,"r")
		pInFile = open(callProbFile,"r")
		outFile = open(output_fname,"w")
		if withArrayIds==2:
			gline = gInFile.readline()
			outFile.write(gline)
			pInFile.readline()
		gline = gInFile.readline()
		outFile.write(gline)
		pInFile.readline()
		i = 0
		totalCount = 0.0
		convertedCount = 0.0 
		
		while(1):
			i += 1
			gline = gInFile.readline()
			pline = pInFile.readline()
			#print gline
			if gline and pline:
				snp = gline.strip().split(delim) 
				probs = pline.strip().split(delim)
				probs = map(float,probs)
				newSNP = []
				totalCount += len(snp)
				for (nt,prob) in zip(snp,probs):
					if prob>minCallProb:
						newSNP.append(nt)
						convertedCount += 1.0
					else:
						newSNP.append('NA')
				outFile.write(delim.join(newSNP)+"\n")
			else:
				print i,gline,pline		
				break
			
			if i%10000==0:
				print i
		print i
		gInFile.close()
		pInFile.close()
		outFile.close()		
		print "Fraction converted =",convertedCount/totalCount
		
	else:
		if output01Format:
			snpsds01format = []
			for snpsd in snpsds:
				snpsds01format.append(snpsd.getSnpsData(missingVal=missingVal))
			#FINISH
			snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds01format,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
		else:
			snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
Ejemplo n.º 20
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["priority=", "delim=", "missingval=", "union=", "intersection=", "debug", "report", "help", "withArrayId="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:p:d:m:u:i:a:brh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	if len(args)!=2:
		raise Exception("Number of arguments isn't correct.")
	inputFile1 = args[0]
	inputFile2 = args[1]
	priority = 1
	union = 0
	intersection = 0
	output_fname = None
	delim = ","
	missingVal = "NA"
	debug = None
	report = None
	withArrayIds = 0
	chromosomes = [1,2,3,4,5]
	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-p", "--priority"):
			priority = int(arg)
		elif opt in ("-u", "--union"):
			union = int(arg)
		elif opt in ("-i", "--intersection"):
			intersection = int(arg)
		elif opt in ("-o",):
			output_fname = arg
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg	
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-b", "--debug"):
			debug = 1
		elif opt in ("-r", "--report"):
			report = 1
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

		
	if not output_fname:
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)
		
	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2

	import dataParsers
	(snpsds1,chromosomes1) = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1,returnChromosomes=True)
	(snpsds2,chromosomes2) = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2,returnChromosomes=True)
	withArrayIds = waid1
	
	
	if len(snpsds1) != len(snpsds2):
		print("Warning: Unequal number of chromosomes.")
		#raise Exception("Unequal number of chromosomes.")
		
	import snpsdata
	if union==0 and intersection==0:
		for i in range(0,len(chromosomes1)):
			chr1 = chromosomes1[i]
			for j in range(0,len(chromosomes2)):
				chr2 = chromosomes2[j]
				if chr1==chr2:
					snpsds1[i].mergeData(snpsds2[j],priority=priority)
		chromosomes = chromosomes1
		snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
	elif 0<union<4 and intersection==0:
		for i in range(0,len(chromosomes1)):
			chr1 = chromosomes1[i]
			for j in range(0,len(chromosomes2)):
				chr2 = chromosomes2[j]
				if chr1==chr2:
					snpsds1[i].mergeDataUnion(snpsds2[j], priority=priority, unionType=union)
		if union==1 or union==3:			
			chromosomes = set(chromosomes1).union(set(chromosomes2))
			chromosomes = list(chromosomes)
			chromosomes.sort()
		elif union==2:
			chromosomes = chromosomes1
		snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal)
	elif 0<intersection<4 and union==0:
		for i in range(0,len(snpsds1)):
			snpsds1[i].mergeDataIntersection(snpsds2[i], priority=priority, intersectionType=intersection)
		if intersection==1 or intersection==3:
			chromosomes = set(chromosomes1).intersection(set(chromosomes2))
			chromosomes = list(chromosomes)
			chromosomes.sort()
		elif intersection==2:
			chromosomes = chromosomes1
			
		snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal)
	else:
		if help==0:
			print "The union or intersection options used are wrong!!\n"
			print __doc__
			sys.exit(2)
Ejemplo n.º 21
0
def _test1_():
	"""
	Smoke test: parse "2010_v3.csv" with the default parser settings and
	hand the parsed SNP data sets to filterMonomorphic.
	"""
	from dataParsers import parseCSVData
	parsedSnpsds = parseCSVData("2010_v3.csv")
	filterMonomorphic(parsedSnpsds)
Ejemplo n.º 22
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["delim=", "missingval=", "withArrayId=", "comparisonFile=", "debug", "report", "help"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	inputFile = args[0]
	output_fname = None
	delim = ", "
	missingVal = "NA"
	comparisonFile = None
	debug = None
	report = None
	help = 0
	withArrayIds = 0

	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("--comparisonFile"):
			comparisonFile = arg
		elif opt in ("-o",):
			output_fname = arg
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("-b", "--debug"):
			debug = 1
		elif opt in ("-r", "--report"):
			report = 1
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if not output_fname:
		output_fname
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)

	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2

	import dataParsers
        import snpsdata
        snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
	

	#Calculating Error rates
	#if comparisonFile:
	#	snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
	#	for i in range(0,len(snpsds)):
                        #Compare ... and record relevant information...
                        #snpsds[i].compare filterBadSnps(snpsds2[i],maxError)
        #            pass

	#Calculating NA rates..
	print "Calculating NA rates"
	snpsNARates = []
	for i in range(0,len(snpsds)):
		snpsNARates += snpsds[i].getSnpsNArates()
	import util
	rstr = ""
	rstr += "snpsNARates <- c("+",".join(util.valListToStrList(snpsNARates))+")\n"
	rstr += 'hist(snpsNARates, xlab="NA rates", ylab="SNP frequency", breaks=60)'
	
	f = open(output_fname,"w")
	f.write(rstr)
	f.close()
Ejemplo n.º 23
0
def _testRun1_():
	"""
	Smoke test: parse "250K_m3.csv" (with array IDs) and "2010_v3.csv",
	then run filterByError on the pair with a threshold argument of 0.2.
	"""
	from dataParsers import parseCSVData
	arraySnpsds = parseCSVData("250K_m3.csv",withArrayIds=1)
	referenceSnpsds = parseCSVData("2010_v3.csv")
	filterByError(arraySnpsds,referenceSnpsds,0.2,withArrayIds=1)
Ejemplo n.º 24
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=", "removeArrayId=", "first96", 
			     "removeIdentical", "onlyCommon", "delim=", "missingval=", "withArrayId=", "debug", "report", 
			     "help", "heterozygous2NA"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	inputFile = args[0]
	output_fname = None
	delim = ","
	missingVal = "NA"
	comparisonFile = None
	maxMissing = 1.0
	maxError = 1.0
	removeEcotypes = None
	removeArray = None
	removeIdentical = False
	onlyCommon = False
	debug = None
	report = None
	help = 0
	withArrayIds = 1
	first96 = False
	heterozygous2NA = False
	
	for opt, arg in opts:
		if opt in ('-o'):
			output_fname = arg
		elif opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("--comparisonFile"):
			comparisonFile = arg
		elif opt in ("--maxError"):
			maxError = float(arg)
		elif opt in ("--maxMissing"):
			maxMissing = float(arg)
		elif opt in ("--heterozygous2NA"):
			heterozygous2NA = True
		elif opt in ("--removeEcotypeId"):
			removeEcotypes = arg.split(",")
			removeEcotypes = map(int,removeEcotypes)
		elif opt in ("--removeArrayId"):
			removeArray = int(arg)
		elif opt in ("--removeIdentical"):
			removeIdentical = True
		elif opt in ("--onlyCommon"):
			onlyCommon = True
		elif opt in ("--first96"):
			first96 = True
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		elif opt in ("-b", "--debug"):
			debug = 1
		elif opt in ("-r", "--report"):
			report = 1
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)


	if not output_fname:
		output_fname
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)

	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2 or withArrayIds==3

	import dataParsers
	snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
	
	accessionsToRemove = []
	arraysToRemove = None

	if first96:
		import dataParsers
		d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',user="******",passwd="bamboo123")
		ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',user="******",passwd="bamboo123")
		print "Dictionaries loaded"
		names = []
		first96Names = []
		for i in range(0,len(snpsds[0].accessions)):
			ecotype = snpsds[0].accessions[i]
			arrayID = snpsds[0].arrayIds[i]
			names.append((arrayID,ecotd[ecotype],ecotype))
			if int(d[ecotype][0]) > 97 or int(d[ecotype][0]) < 0:
				accessionsToRemove.append(ecotype)
			else:
				first96Names.append((arrayID,d[ecotype][1],d[ecotype][0],ecotype))

		first96Names.sort()
		print "First 96 accessions, len:",len(first96Names),":"
		for name in first96Names:
			print name
		names.sort()
		print "All accessions:"
		for name in names:
			print name


	#Retrieve comparison list of accessions.  (Error rates for accessions)
	if (removeIdentical or maxError<1.0) and comparisonFile:
		sys.stderr.write("Loading comparison file:")
		snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
		res = []
		sys.stderr.write("Comparing accessions.")
		for i in range(0,len(snpsds)):
			res.append(snpsds[i].compareWith(snpsds2[i],withArrayIds=withArrayIds,verbose=False,heterozygous2NA=heterozygous2NA))
			sys.stderr.write(".")
		sys.stderr.write("\n")

		totalAccessionCounts = [0]*len(res[0][2])
		accErrorRate = [0]*len(res[0][2])
		for i in range(0,len(snpsds)):
			r = res[i]
			for j in range(0,len(r[2])):
				totalAccessionCounts[j] += r[6][j]
				accErrorRate[j]+=r[3][j]*float(r[6][j])
		
		for i in range(0,len(accErrorRate)):
			accErrorRate[i]=accErrorRate[i]/float(totalAccessionCounts[i])

		accErrAndID = []
		if 0<withArrayIds<3:
			for i in range(0,len(r[2])):
				accErrAndID.append((accErrorRate[i], r[2][i], r[5][i]))
		else:
			for i in range(0,len(r[2])):
				accErrAndID.append((accErrorRate[i], r[2][i]))
		accErrAndID.sort()
		accErrAndID.reverse()


   
	#Figure out which accessions are too erroraneous
	if maxError<1.0 and comparisonFile:
		if withArrayIds:
			arraysToRemove = []
			for (error,ecotype,array) in accErrAndID:
				if error> maxError:
					accessionsToRemove.append(ecotype)
					arraysToRemove.append(array)

		else:
			for (error,ecotype) in accErrAndID:
				if error> maxError:
					accessionsToRemove.append(ecotype)


	if removeIdentical and comparisonFile and withArrayIds:
		print "Locating identical accessions"
		accErrAndID.sort()
		if not arraysToRemove:
			arraysToRemove = []
		for accession in set(snpsds[0].accessions):
			if snpsds[0].accessions.count(accession)>1:
				found = 0
				for (error,ecotype,array) in accErrAndID:
					if ecotype==accession:
						if found>0:
							accessionsToRemove.append(ecotype)
							arraysToRemove.append(array)
						found += 1

	if onlyCommon and comparisonFile:
		print "Locating accessions which are not shared"
		snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)
		#print snpsds2[0].accessions,'\n',snpsds[0].accessions,'\n',len(set(snpsds2[0].accessions).intersection(set(snpsds[0].accessions)))
		if not arraysToRemove:
			arraysToRemove = []
		for i in range(0,len(snpsds[0].accessions)):
			acc = snpsds[0].accessions[i]
			if not acc in snpsds2[0].accessions:
				accessionsToRemove.append(acc)
				if 0<withArrayIds<3:
					arraysToRemove.append(snpsds[0].arrayIds[i])


	if maxMissing<1.0:
		missingCounts = [0]*len(snpsds[0].accessions)
		numSnps = 0
		for snpsd in snpsds:
			mc = snpsd.accessionsMissingCounts()
			numSnps += len(snpsd.positions)
			for i in range(0,len(snpsds[0].accessions)):
				missingCounts[i] += mc[i]
		
		missingRates = []		
		if withArrayIds:
			arraysToRemove = []
			for i in range(0,len(snpsds[0].accessions)):
				missingRates.append((missingCounts[i]/float(numSnps),snpsds[0].accessions[i],snpsds[0].arrayIds[i]))
			missingRates.sort()
			missingRates.reverse()
			for (mrate,ecotype,array) in missingRates:
				if mrate>maxMissing:
					accessionsToRemove.append(ecotype)
					arraysToRemove.append(array)
		else:
			for i in range(0,len(snpsds[0].accessions)):
				missingRates.append((missingCounts[i]/float(numSnps),snpsds[0].accessions[i]))
			missingRates.sort()
			missingRates.reverse()
			for (mrate,ecotype) in missingRates:
				if mrate>maxMissing:
					accessionsToRemove.append(ecotype)


	if removeEcotypes:
		for removeEcotype in removeEcotypes:
			accessionsToRemove.append(str(int(removeEcotype)))
		print "Removing", len(accessionsToRemove), "accessions."
	if removeArray:
		if not arraysToRemove:
			arraysToRemove = []
		arraysToRemove.append(str(removeArray))
		print "Removing", len(arraysToRemove)," arrays."

	numAccessions = len(snpsds[0].accessions)
	sys.stderr.write("Removing accessions.")
	for snpsd in snpsds:
		snpsd.removeAccessions(accessionsToRemove,arrayIds=arraysToRemove)
		sys.stderr.write(".")
	print "\n", (numAccessions-len(snpsds[0].accessions)), "accessions out of "+str(numAccessions)+" were removed."
		
	import snpsdata
	snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds,chromosomes=[1,2,3,4,5], deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
Ejemplo n.º 25
0
def _testRun1_():
    """
    Smoke test: run filterByError on "250K_m3.csv" (parsed with array IDs)
    against "2010_v3.csv", passing 0.2 as the threshold argument.
    """
    from dataParsers import parseCSVData
    arrayData = parseCSVData("250K_m3.csv", withArrayIds=1)
    referenceData = parseCSVData("2010_v3.csv")
    filterByError(arrayData, referenceData, 0.2, withArrayIds=1)
def _plotKW_():
    """
	Analyze how population structure affects KW.
	"""
    filterProb = 0.1
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "_full_quick_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1,
                                      deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

    #For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    #globalKinship = calcKinship(totalSNPs)
    gc.collect(
    )  #Calling garbage collector, in an attempt to clean up memory..

    #chr = 1
    #for snpsd in snpsds:

    snpsd = snpsds[3]

    k = calcKinship(snpsd.snps[200:1400])
    res = runEmma(phed, p_i, k,
                  snpsd.snps[200:1400])  #runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "c.",
               label="Emma (local)")

    k = calcKinship(totalSNPs)
    res = runEmma(phed, p_i, k,
                  snpsd.snps[200:1400])  #runEmma(phed,p_i,k,snps):
    pvals = res["ps"]
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))
    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "g.",
               label="Emma (global)")

    phenVals = phed.getPhenVals(p_i)
    pvals = _run_kw_(snpsd.snps[200:1400], phenVals)
    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(snpsd.positions[200:1400],
               log_pvals,
               "r.",
               label="KW (full data)")

    (pvals, new_positions,
     acc_groups) = get_KW_pvals(snpsd.snps[200:1400],
                                snpsd.positions[200:1400],
                                phed,
                                p_i,
                                kinshipThreshold=0.95,
                                method="KW")
    ecot_map = phenotypeData._getEcotypeIdToStockParentDict_()

    for i in range(0, len(acc_groups)):
        acc_list = []
        for a_i in acc_groups[i]:
            e_i = snpsd.accessions[a_i]
            #print e_i
            acc_list.append(ecot_map[int(e_i)][0])
        print "group", i, ":", acc_list

    log_pvals = []
    for pval in pvals:
        #print pval
        log_pvals.append(-math.log10(pval))

    pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)")

    pylab.legend(numpoints=2, handlelen=0.005)

    pylab.show()
Ejemplo n.º 27
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=", "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=", "secondRound", "minMAF="]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
        phenotypeFileType = 1
        impFile = None
	delim = ","
	missingVal = "NA"
	help = 0
	withArrayIds = 1
	parallel = None
	logTransform = False
	parallelAll = False
	chunkSize = 250000
	round2Size = 5000
	nTrees = 15000
	nodeSize = None
	mem = "8g"
	skipSecondRound = True
	minMAF = 0.0

	for opt, arg in opts:
            if opt in ("-h", "--help"):
                help = 1
                print __doc__
            elif opt in ("-a","--withArrayId"):
                withArrayIds = int(arg)
            elif opt in ("-o","--rFile"):
                impFile = arg
            elif opt in ("--phenotypeFileType"):
                phenotypeFileType = int(arg)
            elif opt in ("--parallel"):
                parallel = arg
            elif opt in ("--parallelAll"):
                parallelAll = True
            elif opt in ("--logTransform"):
                logTransform = True
            elif opt in ("--secondRound"):
                skipSecondRound = False
            elif opt in ("-d","--delim"):
                delim = arg
            elif opt in ("--chunkSize"):
                chunkSize = int(arg)
            elif opt in ("--round2Size"):
                round2Size = int(arg)		
            elif opt in ("--nTrees"):
                nTrees = int(arg)
            elif opt in ("--nodeSize"):
                nodeSize = int(arg)
            elif opt in ("--mem"):
                mem = arg
            elif opt in ("-m","--missingval"):
                missingVal = arg
            elif opt in ("-m","--minMAF"):
                minMAF = float(arg)
            else:
                if help==0:
                    print "Unkown option!!\n"
                    print __doc__
                sys.exit(2)

        if len(args)<3 and not parallel:
            if help==0:
                print "Arguments are missing!!\n"
                print __doc__
            sys.exit(2)

	
	def runParallel(phenotypeIndex):
		#Cluster specific parameters
		phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
		phenName = phed.getPhenotypeName(phenotypeIndex)
		phenName = phenName.replace("/","_div_")
		phenName = phenName.replace("*","_star_")
		impFileName = resultDir+"RF_"+parallel+"_"+phenName
		outFileName = impFileName
		shstr = """#!/bin/csh
#PBS -l walltime=120:00:00
"""
		shstr += "#PBS -l mem="+mem+"\n"
		shstr +="""
#PBS -q cmb
"""
		
		shstr += "#PBS -N RF"+phenName+"_"+parallel+"\n"
		shstr += "(python "+programDir+"RandomForest.py -o "+impFileName+" --chunkSize "+str(chunkSize)+" --nTrees "+str(nTrees)+" --mem "+str(mem)+" --round2Size "+str(round2Size)+""
		if nodeSize:
			shstr += " --nodeSize "+str(nodeSize)+" "
		if logTransform:
			shstr += " --logTransform "
		if not skipSecondRound:
			shstr += " --secondRound "
		shstr += " -a "+str(withArrayIds)+" "			
		shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

		f = open(parallel+".sh",'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	#Nested function ends

	snpsDataFile = args[0]
	phenotypeDataFile = args[1]
	if parallel:  #Running on the cluster..
		if parallelAll:
			phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		else:
			phenotypeIndex = int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex = int(args[2])

	print "chunkSize:",chunkSize
	print "nTrees:",nTrees
	print "nodeSize:",nodeSize
	print "mem:",mem
	print "logTransform:",logTransform
	print "round2Size:",round2Size
	print "skipSecondRound:",skipSecondRound

	#Loading genotype data
	import dataParsers
	snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
	
	phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data 
	phenotype = phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep = []			
	phenAccIndicesToKeep = []
	numAcc = len(snpsds[0].accessions)

	#Load phenotype file
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0,len(snpsds[0].accessions)):
		acc1 = snpsds[0].accessions[i]
		for j in range(0,len(phed.accessions)):
			acc2 = phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep),"accessions removed, leaving",len(accIndicesToKeep),"accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping = []
	i = 0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc),i))
			i += 1
	phed.orderAccessions(accessionMapping)

	#Log-transforming
	if logTransform:
		print "Log transforming phenotype"
		phed.logTransform(phenotype)

        #Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	#Remove minor allele frequencies
	if minMAF!=0:
		sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.filterMinMAF(minMAF)
		
	
      	#Converting format to 01
	import snpsdata
	newSnpsds = []
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	snpsds = newSnpsds
	
	#Writing files
	import tempfile
	if env.user=="bjarni":
		tempfile.tempdir='/tmp'
	(fId, phenotypeTempFile) = tempfile.mkstemp()
	os.close(fId)
	(fId, genotypeTempFile) = tempfile.mkstemp()
	os.close(fId)
	
	phed.writeToFile(phenotypeTempFile, [phenotype])	
	sys.stdout.write( "Phenotype file written\n")
	sys.stdout.flush()
	
	#Retain only the correct runchunk of data.
	chromasomes = []
	positions = []
	snps = []
	for i in range(0,len(snpsds)):
		snpsd = snpsds[i]
		positions += snpsd.positions
		snps += snpsd.snps
		chrList = [i+1]*len(snpsd.positions)
		chromasomes += chrList

	#Is the phenotype binary?
	binary = phed.isBinary(phenotypeIndex)
	import util
	impFile = impFile+".imp"
	rDataFile = impFile+".rData"
	rFile = impFile+".r"
	outRfile = rFile+".out"
	errRfile = rFile+".err"
	topImpFile = impFile+"_top"+str(chunkSize)+".imp"
	topRDataFile = impFile+"_top.rData"
	try:
		os.remove(impFile)    #Removing file if it already exits.
	except Exception:
		print "Couldn't remove",impFile
	try:
		os.remove(topImpFile) #Removing file if it already exits.
	except Exception:
		print "Couldn't remove",topImpFile
	for startIndex in range(0,len(positions),chunkSize):
		if startIndex+chunkSize>=len(positions):
			endIndex = len(positions)
		else:
			endIndex = startIndex+chunkSize

	        #Writing genotype data to file.
		tmpFile = open(genotypeTempFile,"w")
		for i in range(startIndex,endIndex):
			outStr =""
			snp = util.valListToStrList(snps[i])
			outStr += str(chromasomes[i])+","+str(positions[i])+","
			outStr += ",".join(snp)
			outStr += "\n"
			tmpFile.write(outStr)
		tmpFile.close()
			
		rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
		f = open(rFile,'w')
		f.write(rstr)
		f.close()
		#outRfile = rFile+"_"+str(startIndex/chunkSize)+".out"
		#errRfile = rFile+"_"+str(startIndex/chunkSize)+".err"
		print "Running model nr",startIndex/chunkSize,":"
		cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()
		os.system(cmdStr)
	print "Random forest output saved in", impFile
	
	if not skipSecondRound:
		#Run on the top 'chunkSize' number of hits.
		#loading the R output file.
		impF = open(impFile,"r")
		lines=impF.readlines()
		impF.close()
		impList = list()
		for i in range(1,len(lines)):
			line = lines[i]
			line.strip()
			l = line.split(",")
			impList.append( (float(l[2]),l[0],l[1],snps[i]) )
		impList.sort()
		impList.reverse()

		#Writing genotype data to file.
		tmpFile = open(genotypeTempFile,"w")
		for i in range(0,round2Size):
			outStr = ""
			snp = util.valListToStrList(impList[i][3])
			outStr += str(impList[i][1])+","+str(impList[i][2])+","
			outStr += ",".join(snp)
			outStr += "\n"
			tmpFile.write(outStr)
		tmpFile.close()
		rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
		f = open(rFile,'w')
		f.write(rstr)
		f.close()
		print "Running randomForest on the top importance scores:"
		cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()
		os.system(cmdStr)
Ejemplo n.º 28
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "monomorphic", "monomorphic", "delim=", "missingval=", "withArrayId=",
        "windowSize=", "debug", "report", "help"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:w:m:a:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    inputFile = args[0]
    output_fname = None
    delim = ","
    missingVal = "NA"
    monomorphic = False
    help = 0
    withArrayIds = 0
    windowSize = 30

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("--monomorphic"):
            monomorphic = True
        elif opt in ("--windowSize"):
            windowSize = arg
        elif opt in ("-o", ):
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    (snpsds, chromosomes) = dataParsers.parseCSVData(inputFile,
                                                     format=1,
                                                     deliminator=delim,
                                                     missingVal=missingVal,
                                                     withArrayIds=waid1,
                                                     returnChromosomes=True)

    accessions = snpsds[0].accessions
    arrayIds = snpsds[0].arrayIds
    positionsList = []
    tmpFiles = []
    #tempfile.tempdir='/tmp'
    i = 1
    for snpsd in snpsds:
        tmpFile1 = tempfile.mkstemp()
        os.close(tmpFile1[0])
        tmpFile2 = tempfile.mkstemp()
        os.close(tmpFile2[0])
        tmpFiles.append((tmpFile1[1], tmpFile2[1]))
        positionsList.append(snpsd.positions)
        print "Preparing data in", tmpFile1[1]
        writeAsNputeFile(snpsd, tmpFile1[1])
        checkNputeFile(tmpFile1[1])
        del snpsd.snps
        nputeCmd = "python " + path_NPUTE + "NPUTE.py -m 0 -w " + str(
            windowSize) + " -i " + str(tmpFile1[1]) + " -o " + str(tmpFile2[1])
        print "Imputing chromosome", i
        i += 1
        print nputeCmd
        os.system(nputeCmd)

    for i in range(0, len(tmpFiles)):
        print "Reading chromosome", i + 1
        snpsds[i] = readNputeFile(tmpFiles[i][1],
                                  accessions,
                                  positionsList[i],
                                  arrayIds=arrayIds)
        os.remove(tmpFiles[i][0])
        os.remove(tmpFiles[i][1])

    snpsDataSet = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])

    #Filtering monomorphic
    if monomorphic:
        print "Filtering monomorphic SNPs"
        for snpsd in snpsds:
            print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    snpsDataSet.writeToFile(output_fname,
                            deliminator=delim,
                            missingVal=missingVal,
                            withArrayIds=waid1)
def _plotKinshipDiffs_():

    filterProb = 0.2
    p_i = 1
    res_dir = "/Users/bjarni/tmp/"
    runId = "full_"

    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1,
                                      deliminator=",")  #,debug=True)
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv"
    print "Loading phenotype data"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')
    snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds)

    for snpsd in snpsds:
        snpsd.filterMinMAF(0.1)
        snpsd.filterMonoMorphicSnps()

    totalSNPs = []
    for i in range(len(snpsds)):
        snpsds[i] = snpsds[i].getSnpsData()
        totalSNPs += snpsds[i].snps

    #For memory, remove random SNPs
    snps = []
    for snp in totalSNPs:
        if random.random() < filterProb:
            snps.append(snp)
    totalSNPs = snps

    print "Calculating the global kinship..."
    globalKinship = calcKinship(totalSNPs)
    print "done."
    normalizedGlobalKinship = globalKinship / mean(globalKinship)
    gc.collect(
    )  #Calling garbage collector, in an attempt to clean up memory..

    for i in range(4, 5):  #len(snpsds)):
        chr = i + 1
        snpsd = snpsds[i]
        #pylab.subplot(5,1,chr)
        #		pylab.figure(figsize=(18,4))
        #		(kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000)
        #		pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$')
        #		(kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000)
        #		pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$')
        #		pylab.legend(numpoints=2,handlelen=0.005)
        #		pylab.title("Kinship diff. chr. "+str(chr))
        #		pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf")
        #		pylab.clf()
        pylab.figure(figsize=(18, 4))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd,
                                           phed,
                                           p_i,
                                           globalKinship,
                                           windowSize=300000)
        pylab.plot(binPos, emmaDiffs, "r", label='ws$=300000$')
        pylab.title("Emma avg. p-value diff. 500kb on chr. " + str(chr))
        (emmaDiffs, binPos) = getEmmaDiffs(snpsd,
                                           phed,
                                           p_i,
                                           globalKinship,
                                           windowSize=500000)
        pylab.plot(binPos, emmaDiffs, "b", label='ws$=500000$')
        pylab.title("Emma avg. p-value diff. on chr. " + str(chr))
        pylab.legend(numpoints=2, handlelen=0.005)
        pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" +
                      str(chr) + ".pdf",
                      format="pdf")
        pylab.clf()
        gc.collect(
        )  #Calling garbage collector, in an attempt to clean up memory..
Ejemplo n.º 30
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness"]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
		phenotypeFileType=1
		outputFile=None
	delim=","
	missingVal="NA"
	help=0
	withArrayIds=1
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-a", "--withArrayId"):
			withArrayIds=int(arg)
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		phenName=phenName.replace("/", "_div_")
		phenName=phenName.replace("*", "_star_")
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr="""#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=4g 
#PBS -q cmb
"""
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		shstr+=" -a "+str(withArrayIds)+" "			
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		if env.user=="bjarni":
			tempfile.tempdir='/tmp'
		(fId, phenotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		(fId, genotypeTempFile)=tempfile.mkstemp()
		os.close(fId)
		
		phed.writeToFile(phenotypeTempFile, [phenotype])	
		sys.stdout.write("Phenotype file written\n")
		sys.stdout.flush()
		snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		decoder={1:1, 0:0,-1:'NA'}	
		snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
		sys.stdout.write("Genotype file written\n")
		sys.stdout.flush()
	
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
	
		rDataFile=outputFile+".rData"
		pvalFile=outputFile+".pvals"
		#Is the phenotype binary?
		binary=phed.isBinary(phenotypeIndex)
		rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
		rFileName=outputFile+".r"
		f=open(rFileName, 'w')
		f.write(rstr)
		f.close()
		outRfile=rFileName+".out"
		errRfile=rFileName+".err"
		print "Running R file:"
		cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
		sys.stdout.write(cmdStr+"\n")
		sys.stdout.flush()	
		gc.collect() 
		os.system(cmdStr)
		#print "Emma output saved in R format in", rDataFile
		print "Generating a GW plot."
		res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Ejemplo n.º 31
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=",
        "withArrayId=", "logTransform", "phenotypeFileType=", "help",
        "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=",
        "secondRound", "minMAF="
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    impFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--rFile"):
            impFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--secondRound"):
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("--chunkSize"):
            chunkSize = int(arg)
        elif opt in ("--round2Size"):
            round2Size = int(arg)
        elif opt in ("--nTrees"):
            nTrees = int(arg)
        elif opt in ("--nodeSize"):
            nodeSize = int(arg)
        elif opt in ("--mem"):
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-m", "--minMAF"):
            minMAF = float(arg)
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    def runParallel(phenotypeIndex):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName
        shstr = """#!/bin/csh
#PBS -l walltime=50:00:00
"""
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += """
#PBS -q cmb
"""

        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
        shstr += "(python " + programDir + "RandomForest.py -o " + impFileName + " --chunkSize " + str(
            chunkSize) + " --nTrees " + str(nTrees) + " --mem " + str(
                mem) + " --round2Size " + str(round2Size) + ""
        if nodeSize:
            shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    #Nested function ends

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

    print "chunkSize:", chunkSize
    print "nTrees:", nTrees
    print "nodeSize:", nodeSize
    print "mem:", mem
    print "logTransform:", logTransform
    print "round2Size:", round2Size
    print "skipSecondRound:", skipSecondRound

    #Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    #Load phenotype file
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotype)

#Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

#Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    snpsds = newSnpsds

    #Writing files
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    #Retain only the correct runchunk of data.
    chromasomes = []
    positions = []
    snps = []
    for i in range(0, len(snpsds)):
        snpsd = snpsds[i]
        positions += snpsd.positions
        snps += snpsd.snps
        chrList = [i + 1] * len(snpsd.positions)
        chromasomes += chrList

    #Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)
    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"
    try:
        os.remove(impFile)  #Removing file if it already exits.
    except Exception:
        print "Couldn't remove", impFile
    try:
        os.remove(topImpFile)  #Removing file if it already exits.
    except Exception:
        print "Couldn't remove", topImpFile
    for startIndex in range(0, len(positions), chunkSize):
        if startIndex + chunkSize >= len(positions):
            endIndex = len(positions)
        else:
            endIndex = startIndex + chunkSize

    #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(startIndex, endIndex):
            outStr = ""
            snp = util.valListToStrList(snps[i])
            outStr += str(chromasomes[i]) + "," + str(positions[i]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 impFile,
                                 rDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        #outRfile = rFile+"_"+str(startIndex/chunkSize)+".out"
        #errRfile = rFile+"_"+str(startIndex/chunkSize)+".err"
        print "Running model nr", startIndex / chunkSize, ":"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print "Random forest output saved in", impFile

    if not skipSecondRound:
        #Run on the top 'chunkSize' number of hits.
        #loading the R output file.
        impF = open(impFile, "r")
        lines = impF.readlines()
        impF.close()
        impList = list()
        for i in range(1, len(lines)):
            line = lines[i]
            line.strip()
            l = line.split(",")
            impList.append((float(l[2]), l[0], l[1], snps[i]))
        impList.sort()
        impList.reverse()

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        for i in range(0, round2Size):
            outStr = ""
            snp = util.valListToStrList(impList[i][3])
            outStr += str(impList[i][1]) + "," + str(impList[i][2]) + ","
            outStr += ",".join(snp)
            outStr += "\n"
            tmpFile.write(outStr)
        tmpFile.close()
        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 topImpFile,
                                 topRDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        f.write(rstr)
        f.close()
        print "Running randomForest on the top importance scores:"
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
Ejemplo n.º 32
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["outputSNPsFile=","outputPhenotFile=", "filterMonomorphic", "rawDataFormat", "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=", "phenotypeName=", "calcKinshipMatrix=", "orderAccessions", "help"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	inputFile = args[0]
        output_fname = None
	outputPhenotFile = None
	delim = ","
	missingVal = "NA"
	phenotypeFile = None
        kinshipMatrixFile = None
	phenotype = None
	phenotypeName = None
	rawDataFormat = False
	monomorphic = False
	help = 0
	withArrayIds = 1
	orderAccessions = False
	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-f","--phenotypeFile"):
			phenotypeFile = arg
                elif opt in ("calcKinshipMatrix"):
                        kinshipMatrixFile = arg
		elif opt in ("--filterMonomorphic"):
			monomorphic = True
		elif opt in ("--rawDataFormat"):
			rawDataFormat = True
		elif opt in ("--minCallProb"):
			minCallProb = float(arg)
		elif opt in ("-p","--phenotype"):
			phenotype = int(arg)
		elif opt in ("-o","--outputSNPsFile"):
			output_fname = arg
		elif opt in ("--orderAccessions"):
			orderAccessions = True
		elif opt in ("-u","--phenotypeFile"):
			outputPhenotFile = arg
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if not output_fname:
		print output_fname
		if help==0:
			print "Output file missing!!\n"
			print __doc__
		sys.exit(2)

	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2

	import dataParsers
	snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
	
	if phenotypeFile:
		import phenotypeData
		phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t')  #Get Phenotype data 
		accIndicesToKeep = []			
		phenAccIndicesToKeep = []
		numAcc = len(snpsds[0].accessions)
		if phenotype>=0:
		        #Load phenotype file
			sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
			sys.stdout.flush()
			for i in range(0,len(snpsds[0].accessions)):
				acc1 = snpsds[0].accessions[i]
				for j in range(0,len(phed.accessions)):
					acc2 = phed.accessions[j]
					if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
						accIndicesToKeep.append(i)
						phenAccIndicesToKeep.append(j)
						break					

		elif phenotype==None:
			sys.stdout.write("Removing accessions which do not have any phenotype values.")
			sys.stdout.flush()
			for i in range(0,len(snpsds[0].accessions)):
				acc1 = snpsds[0].accessions[i]
				for j in range(0,len(phed.accessions)):
					acc2 = phed.accessions[j]
					if acc1==acc2:
						accIndicesToKeep.append(i)
						phenAccIndicesToKeep.append(j)
						break
			
					
		#Filter Accessions which do not have the phenotype value.
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			snpsd.removeAccessionIndices(accIndicesToKeep)
		print ""
		print numAcc-len(accIndicesToKeep),"accessions removed, leaving",len(accIndicesToKeep),"accessions in all."
		
		if outputPhenotFile:
			print "Filtering phenotype data."
			phed.removeAccessions(phenAccIndicesToKeep)
			if orderAccessions:
				accessionMapping = []
				i = 0
				for acc in snpsds[0].accessions:
					if acc in phed.accessions:
						accessionMapping.append((phed.accessions.index(acc),i))
						i += 1
				phed.orderAccessions(accessionMapping)
			if phenotype>=0:
				phed.writeToFile(outputPhenotFile, [phenotype])
			else:
				phed.writeToFile(outputPhenotFile)

		
        #Filtering monomorphic
	if monomorphic:
		print "Filtering monomorphic SNPs"
		for snpsd in snpsds:
			print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

	import snpsdata
	
	newSnpsds = []
	if not rawDataFormat:
		sys.stdout.write("Converting data format")
		for snpsd in snpsds:
			sys.stdout.write(".")
			sys.stdout.flush()
			newSnpsds.append(snpsd.getSnpsData())
		print ""
		waid1 = 0
		snpsDataset = snpsdata.SnpsDataSet(newSnpsds,[1,2,3,4,5])
		decoder = {1:1, 0:0, -1:'NA'}
	else:
		snpsDataset = snpsdata.SnpsDataSet(snpsds,[1,2,3,4,5])
		decoder=None
	
	snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal = missingVal, withArrayIds = waid1, decoder=decoder)
Ejemplo n.º 33
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=",
        "parallelAll", "useFloats"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    outputFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("n", "--sampleNum"):
            sampleNum = int(arg)
        elif opt in ("--useFloats"):
            useFloats = True
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print "CAMP is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    if len(args) > 2:
        print "Phenotype_id:", args[2]
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "sampleNum:", sampleNum

    def runParallel(phenotypeIndex, id=""):
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        shstr = """#!/bin/csh
#PBS -l walltime=24:00:00
#PBS -l mem=6g 
#PBS -q cmb
"""

        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "

        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data

    #Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    #Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""

    #Writing phenotype data to CAMP format.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = map(int, phenVals)
    phenFile = open(phenotypeFile, "w")
    for value in phenVals:
        phenFile.write(str(value) + "\n")
    phenFile.close()

    chromosome_list = []
    positions_list = []
    scores_list = []
    interaction_positions_list = []
    mafs = []
    marfs = []
    #Writing SNP data to CAMP format.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        sf = open(snpsFile, "w")
        pf = open(posFile, "w")
        snpsd = newSnpsds[chromosome - 1]
        for i in range(0, len(snpsd.snps)):
            snp = snpsd.snps[i]
            (marf, maf) = snpsdata.getMAF(snp)
            marfs.append(marf)
            mafs.append(maf)
            str_snp = map(str, snp)
            double_snp = []
            for nt in str_snp:
                double_snp.append(nt)
                double_snp.append(nt)
            sf.write("".join(double_snp) + "\n")
            pf.write(str(snpsd.positions[i]) + "\n")
        sf.close()
        pf.close()

        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile,
                            phenotypeFile, sampleNum)

        positions_list += results["positions"]
        scores_list += results["scores"]
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):
                marfs.append(0.5)  #An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)

    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
    for i in range(0, len(positions_list)):
        chromosome = chromosome_list[i]
        (pos1, pos2) = positions_list[i]
        score = scores_list[i]
        marf = marfs[i]
        maf = mafs[i]
        l = map(str, [chromosome, pos1, score, marf, maf, pos2])
        f.write(",".join(l) + "\n")
    f.close()
Ejemplo n.º 34
0
def _run_():
	if len(sys.argv)==1:
		print __doc__
		sys.exit(2)
	
	long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", 
					"help", "parallel=", "parallelAll", "addToDB", 
					"callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , 
					"subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", 
					"onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun",
					"permTest=", "savePermutations", "permutationFilter=", "testRobustness",
					"memReq=","walltimeReq=",]
	try:
		opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	
	phenotypeFileType=1
	outputFile=None
	delim=","
	missingVal="NA"
	help=0
	parallel=None
	parallelAll=False
	addToDB=False
	callMethodID=None
	comment=""
	subSample=None
	onlyOriginal96=False
	onlyOriginal192 = False
	subSampleLikePhenotype = None
	subsampleTest = False
	numSubSamples = None
	complement = False
	onlyBelowLatidue = None
	onlyAboveLatidue = None

	sr = False
	srOutput = False
	srInput = False
	srSkipFirstRun = False
	srTopQuantile = 0.95
	srWindowSize = 30000
	
	permTest = None
	savePermutations = False
	permutationFilter = 1.0
	
	testRobustness = False

	memReq = "5g"
	walltimeReq = "100:00:00"

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help=1
			print __doc__
		elif opt in ("-o", "--outputFile"):
			outputFile=arg
		elif opt in ("--phenotypeFileType"):
			phenotypeFileType=int(arg)
		elif opt in ("--parallel"):
			parallel=arg
		elif opt in ("--parallelAll"):
			parallelAll=True
		elif opt in ("--addToDB"):
			addToDB=True
  		elif opt in ("--onlyOriginal96"):
			onlyOriginal96=True
  		elif opt in ("--onlyOriginal192"):
			onlyOriginal192=True
		elif opt in ("--complement"):
			complement=True
		elif opt in ("--subSample"):
			subSample=int(arg)
		elif opt in ("--subsampleTest"):
			subsampleTest = True
			l = arg.split(",")
			subSample=int(l[0])
			numSubSamples=int(l[1])
		elif opt in ("--onlyBelowLatidue"):
			onlyBelowLatidue=float(arg)
		elif opt in ("--onlyAboveLatidue"):
			onlyAboveLatidue=float(arg)
		elif opt in ("--subSampleLikePhenotype"):
			subSampleLikePhenotype=int(arg)
		elif opt in ("--callMethodID"):
			callMethodID=int(arg)
		elif opt in ("--comment"):
			comment=arg
		elif opt in ("-d", "--delim"):
			delim=arg
		elif opt in ("-m", "--missingval"):
			missingVal=arg
		elif opt in ("--sr"):
			sr = True
		elif opt in ("--testRobustness"):
			testRobustness = True
		elif opt in ("--permTest"):
			permTest = int(arg)
		elif opt in ("--savePermutations"):
			savePermutations = True
		elif opt in ("--permutationFilter"):
			permutationFilter = float(arg)
		elif opt in ("--srSkipFirstRun"):
			srSkipFirstRun = True
		elif opt in ("--srInput"):
			srInput = arg
		elif opt in ("--srOutput"):
			srOutput = arg
		elif opt in ("--srPar"):
			vals = arg.split(",")
			srTopQuantile = float(vals[0]) 
			srWindowSize = int(vals[1]) 
		elif opt in ("--memReq"):
			memReq=arg
		elif opt in ("--walltimeReq"):
			walltimeReq=arg
		else:
			if help==0:
				print "Unkown option!!\n"
				print __doc__
			sys.exit(2)

	if len(args)<3 and not parallel:
		if help==0:
			print "Arguments are missing!!\n"
			print __doc__
		sys.exit(2)

	snpsDataFile=args[0]
	phenotypeDataFile=args[1]

	print "Kruskal-Wallis is being set up with the following parameters:"
	print "phenotypeDataFile:",phenotypeDataFile
	print "snpsDataFile:",snpsDataFile
	print "parallel:",parallel
	print "parallelAll:",parallelAll
	print "onlyOriginal96:",onlyOriginal96
	print "onlyOriginal192:",onlyOriginal192
	print "onlyBelowLatidue:",onlyBelowLatidue
	print "onlyAboveLatidue:",onlyAboveLatidue
	print "complement:",complement
	print "subSampleLikePhenotype:",subSampleLikePhenotype
	print "subsampleTest:",subsampleTest
	print "numSubSamples:",numSubSamples
	print "subSample:",subSample
	print "sr:",sr
	print "srSkipFirstRun:",srSkipFirstRun
	print "srInput:",srInput
	print "srOutput:",srOutput
	print "srTopQuantile:",srTopQuantile
	print "srWindowSize:",srWindowSize
	print "permTest:",permTest
	print "savePermutations:",savePermutations
	print "permutationFilter:",permutationFilter
	print "testRobustness:",testRobustness
	print "walltimeReq:",walltimeReq
	print "memReq:",memReq

	def runParallel(phenotypeIndex,id=""):
		#Cluster specific parameters
		phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
		phenName=phed.getPhenotypeName(phenotypeIndex)
		print phenName
		outputFile=resultDir+"KW_"+parallel+"_"+phenName+id

		shstr = "#!/bin/csh\n"
		shstr += "#PBS -l walltime="+walltimeReq+"\n"
		shstr += "#PBS -l mem="+memReq+"\n"
		shstr +="#PBS -q cmb\n"
		
		shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
		shstr+="set phenotypeName="+parallel+"\n"
		shstr+="set phenotype="+str(phenotypeIndex)+"\n"
		shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
		if subSample:
			shstr+=" --subSample="+str(subSample)+" "			
		elif onlyOriginal96:
			shstr+=" --onlyOriginal96 "			
		elif onlyOriginal192:
			shstr+=" --onlyOriginal192 "
		if onlyBelowLatidue:
			shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
		elif onlyAboveLatidue:
			shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
		if complement: 			
			shstr+=" --complement "
		if permTest:
			shstr+=" --permTest="+str(permTest)+" "
			if savePermutations:
				shstr+=" --savePermutations "
		
		shstr+=" --permutationFilter="+str(permutationFilter)+" "
		if testRobustness:
			shstr+=" --testRobustness "
			
		if sr:
			shstr += " --sr "			
			if not srOutput:
				output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"				
			shstr += " --srOutput="+str(output)+" "
			if srSkipFirstRun:
				if not srInput:
					output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
				shstr += " --srInput="+str(output)+" "
				shstr += " --srSkipFirstRun "				
			shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "


		shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
		shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"

		f=open(parallel+".sh", 'w')
		f.write(shstr)
		f.close()

		#Execute qsub script
		os.system("qsub "+parallel+".sh ")

	if parallel:  #Running on the cluster..
		if parallelAll:
			phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
			for phenotypeIndex in phed.phenIds:
				runParallel(phenotypeIndex)
		elif subsampleTest:
			phenotypeIndex=int(args[2])
			for i in range(0,numSubSamples):
				runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
		else:
			phenotypeIndex=int(args[2])
			runParallel(phenotypeIndex)
		return
	else:
		phenotypeIndex=int(args[2])


	print "phenotypeIndex:",phenotypeIndex
	print "output:",outputFile
	print "\nStarting program now!\n"


	#Load phenotype file
	phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data 
	
	#If onlyOriginal96, then remove all other phenotypes..
	if onlyOriginal96: 
		print "Filtering for the first 96 accessions"
		original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
		original_96_ecotypes = map(str,original_96_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_96_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_96_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	if onlyOriginal192: 
		print "Filtering for the first 192 accessions"
		original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
		original_192_ecotypes = map(str,original_192_ecotypes)
		keepEcotypes = []
		if complement:
			for acc in phed.accessions:
				if not acc in original_192_ecotypes:
					keepEcotypes.append(acc)
		else:
			keepEcotypes = original_192_ecotypes
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)
	
	if onlyBelowLatidue:
		print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	elif onlyAboveLatidue:
		print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
		eiDict = phenotypeData._getEcotypeIdInfoDict_()
		print eiDict
		keepEcotypes = []
		for acc in phed.accessions:
			acc = int(acc)
			if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
				keepEcotypes.append(str(acc))
			elif eiDict.has_key(acc) and eiDict[acc][2]==None:
				keepEcotypes.append(str(acc))
				
		phed.filterAccessions(keepEcotypes)
		print "len(phed.accessions)", len(phed.accessions)

	
	if subSampleLikePhenotype:
		p_name = phed.getPhenotypeName(subSampleLikePhenotype)
		print "Picking sample as in",p_name
		ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
		print ecotypes
		phed.filterAccessions(ecotypes)
		print "len(phed.accessions)", len(phed.accessions)


	if subSample: 
		sample_ecotypes = []
		ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
		sample_ecotypes = random.sample(ecotypes,subSample)			
		phed.filterAccessions(sample_ecotypes)
		print "len(phed.accessions)", len(phed.accessions)
		
	sys.stdout.write("Finished prefiltering phenotype accessions.\n")
	sys.stdout.flush()
	
	
	
	#Load genotype file
	snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)


	#Checking overlap between phenotype and genotype accessions. 
	phenotype=phed.getPhenIndex(phenotypeIndex)
	accIndicesToKeep=[]			
	phenAccIndicesToKeep=[]
	numAcc=len(snpsds[0].accessions)
	sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
	sys.stdout.flush()
	for i in range(0, len(snpsds[0].accessions)):
		acc1=snpsds[0].accessions[i]
		for j in range(0, len(phed.accessions)):
			acc2=phed.accessions[j]
			if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
				accIndicesToKeep.append(i)
				phenAccIndicesToKeep.append(j)
				break	


	#Filter accessions which do not have the phenotype value.
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		snpsd.removeAccessionIndices(accIndicesToKeep)
	print ""
	print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
		
	print "Filtering phenotype data."
	phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values
	
	#Ordering accessions according to the order of accessions in the genotype file
	accessionMapping=[]
	i=0
	for acc in snpsds[0].accessions:
		if acc in phed.accessions:
			accessionMapping.append((phed.accessions.index(acc), i))
			i+=1
	phed.orderAccessions(accessionMapping)

		#Filtering monomorphic
	print "Filtering monomorphic SNPs"
	for snpsd in snpsds:
		print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

	#Converting format to 01
	newSnpsds=[]
	sys.stdout.write("Converting data format")
	for snpsd in snpsds:
		sys.stdout.write(".")
		sys.stdout.flush()
		newSnpsds.append(snpsd.getSnpsData())
	print ""
	
	#Double check genotype file:
	problems = 0
	for i in range(0,len(newSnpsds)):
		snpsd = newSnpsds[i]
		for j in range(0,len(snpsd.snps)):
			snp = snpsd.snps[j]
			sc = snp.count(0)
			if sc==0 or sc==len(snp):
				print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
				problems += 1
	if problems >0:
		print "Genotype file appears to have potential problems"
	else:
		print "Genotype file appears to be good"

	if permTest:
		print "Starting a permutation test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
			permTest = 100	
		_perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
		sys.exit(0)
	
	if testRobustness:
		print "Starting a robustness test"
		allSNPs = []
		for snpsd in newSnpsds:
			allSNPs += snpsd.snps
		phenVals = phed.getPhenVals(phenotypeIndex)
		test_type = "KW"
		if phed.isBinary(phenotypeIndex):
			test_type = "Fisher"
		_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
		sys.exit(0)
		

	sys.stdout.flush()
	print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
	if (not sr) or (sr and not srSkipFirstRun):
		#Writing files
		#phed and phenotype
		sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
		phenotypeName=phed.getPhenotypeName(phenotypeIndex)
		
		if phed.isBinary(phenotypeIndex):
			pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))	
		else:
			snps = sd.getSnps()
			phen_vals = phed.getPhenVals(phenotypeIndex)
			try:
				kw_res = util.kruskal_wallis(snps,phen_vals)
				pvals = kw_res['ps']
			except:
				print snps
				print phen_vals
				print len(snps),len(snps[0]),len(phen_vals)
				raise Exception
							
		res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
		pvalFile=outputFile+".pvals"
		res.writeToFile(pvalFile)

		print "Generating a GW plot."
		res.negLogTransform()
		pngFile = pvalFile+".png"
		plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)	
		srInput = pvalFile
		
	else:
		print "Skipping first stage analysis."
		sys.stdout.flush()

	if sr:
		_secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
		print "Generating second run GW plot."
		res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
		res.negLogTransform()
		srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
		srRes.negLogTransform()
		srPngFile = pvalFile+".sr.png"
		plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)	
Ejemplo n.º 35
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=", "delim=", "missingval=", "crossExamine=", "statFile=",
        "debug", "report", "help", "withArrayId=", "strainIdentity",
        "heterozygous2NA"
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:s:c:vh",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    if len(args) > 2:
        print args
        raise Exception("Number of arguments isn't correct.")
    inputFile1 = args[0]
    inputFile2 = None
    crossExamineData = False
    if len(args) > 1:
        inputFile2 = args[1]
    else:
        crossExamineData = True

    rFile = None
    statFile = None
    verbose = False
    delim = ","
    missingVal = "NA"
    debug = None
    report = None
    withArrayIds = 0
    fractionSnps = 0.05
    heterozygous2NA = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("-s", "--statFile"):
            statFile = arg
        elif opt in ("-t", "--method"):
            version = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("--heterozygous2NA"):
            heterozygous2NA = True
        elif opt in ("-c", "--crossExamine"):
            fractionSnps = float(arg)
        elif opt in ("--strainIdentity"):
            crossExamineData = True

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    snpsds1 = dataParsers.parseCSVData(inputFile1,
                                       format=1,
                                       deliminator=delim,
                                       missingVal=missingVal,
                                       withArrayIds=waid1)
    if inputFile2:
        snpsds2 = dataParsers.parseCSVData(inputFile2,
                                           format=1,
                                           deliminator=delim,
                                           missingVal=missingVal,
                                           withArrayIds=waid2)

    if crossExamineData:
        if inputFile2:
            findIdentities(snpsds1, snpsds2, withArrayIds)
        else:
            crossExamine(snpsds1, fractionSnps, waid1)
        return

    if len(snpsds1) != len(snpsds2):
        raise Exception("Unequal number of chromosomes in files.")

    res = []
    naRate1 = 0
    naRate2 = 0
    numSNPs1 = 0
    numSNPs2 = 0
    for i in range(0, len(snpsds1)):
        res.append(snpsds1[i].compareWith(snpsds2[i],
                                          withArrayIds=withArrayIds,
                                          heterozygous2NA=heterozygous2NA))
        naRate1 += snpsds1[i].countMissingSnps() * len(snpsds1[i].positions)
        naRate2 += snpsds2[i].countMissingSnps() * len(snpsds2[i].positions)
        numSNPs1 += len(snpsds1[i].positions)
        numSNPs2 += len(snpsds2[i].positions)

    naRate1 = naRate1 / float(numSNPs1)
    naRate2 = naRate2 / float(numSNPs2)

    import rfun
    totalCommonPos = 0
    totalPos = [0, 0]
    commonAccessions = res[0][2]
    totalAccessionCounts = [0] * len(commonAccessions)
    accOverlappingCallRate = [[0] * len(commonAccessions),
                              [0] * len(commonAccessions)]
    accCallRate = [[0] * len(commonAccessions), [0] * len(commonAccessions)]
    accErrorRate = [0] * len(commonAccessions)

    statstr = "#Common SNPs positions:\n"
    rstr = "#Snps error rates\n"
    rstr = "par(mfrow=c(5,1));\n"
    snpsErrorRate = []

    totalCounts = 0
    totalFails = 0

    for i in range(0, len(res)):  #for all chromosomes
        r = res[i]
        totalCounts += r[9][0]
        totalFails += r[9][1]
        snpsErrorRate += r[1]
        totalCommonPos += len(r[0])
        totalPos[0] += len(snpsds1[i].positions)
        totalPos[1] += len(snpsds2[i].positions)
        statstr += "Chr. " + str(i + 1) + ":\n"
        statstr += str(r[0]) + "\n"
        xname = "commonPos_ch" + str(i + 1)
        ynames = ["errorRates_ch" + str(i + 1)]
        rstr += rfun.plotOverlayingVectors(r[0], [r[1]],
                                           xlab="Position, chr. " + str(i + 1),
                                           ylab="Error (red)",
                                           type="b",
                                           xname=xname,
                                           ynames=ynames) + "\n\n"
        for j in range(0, len(commonAccessions)):
            totalAccessionCounts[j] += r[6][j]
            accOverlappingCallRate[0][j] += r[4][0][j] * float(len(r[0]))
            accOverlappingCallRate[1][j] += r[4][1][j] * float(len(r[0]))
            accCallRate[0][j] += r[8][0][j]
            accCallRate[1][j] += r[8][1][j]
            accErrorRate[j] += r[3][j] * float(r[6][j])

    statstr += "#Number of common SNPs positions:\n"
    statstr += str(totalCommonPos) + "\n"
    statstr += "#SNPs errors:\n"
    for i in range(0, len(res)):
        r = res[i]
        statstr += "Chr. " + str(i + 1) + ":\n"
        statstr += str(r[1]) + "\n"

    statstr += "#Average Snp Error:\n"
    statstr += str(sum(snpsErrorRate) / float(len(snpsErrorRate))) + "\n"
    statstr += "#Weighted Average Snp Error:\n"
    statstr += str(totalFails / float(totalCounts)) + "\n"

    statstr += "#Commmon accessions:\n"
    statstr += str(commonAccessions) + '\n'
    statstr += "#Number of commmon accessions:\n"
    statstr += str(len(commonAccessions)) + '\n'
    statstr += "#Number of accessions (1):\n"
    statstr += str(len(snpsds1[0].accessions)) + '\n'
    statstr += "#Number of accessions (2):\n"
    statstr += str(len(snpsds2[0].accessions)) + '\n'

    if withArrayIds:
        commonArrayIds = res[0][5]
        statstr += "#ArrayIds:\n"
        statstr += str(commonArrayIds) + '\n'

    if not verbose:
        print "In all", len(commonAccessions), "common accessions found"
        print "In all", totalCommonPos, "common snps found"
        print "Average Snp Error:", sum(snpsErrorRate) / float(
            len(snpsErrorRate))
        print "NA rate (1) =", naRate1
        print "NA rate (2) =", naRate2

    for i in range(0, len(res)):
        r = res[i]
        xname = "commonPos_ch" + str(i + 1)
        ynames = [
            "missingRates1_ch" + str(i + 1), "missingRates2_ch" + str(i + 1)
        ]
        rstr += rfun.plotOverlayingVectors(r[0], [r[7][0], r[7][1]],
                                           xlab="Position, chr. " + str(i + 1),
                                           ylab="Missing (red,green)",
                                           type="b",
                                           xname=xname,
                                           ynames=ynames) + "\n\n"

    for i in range(0, len(commonAccessions)):
        accOverlappingCallRate[0][i] = accOverlappingCallRate[0][i] / float(
            totalCommonPos)
        accOverlappingCallRate[1][i] = accOverlappingCallRate[1][i] / float(
            totalCommonPos)
        accCallRate[0][i] = accCallRate[0][i] / float(totalPos[0])
        accCallRate[1][i] = accCallRate[1][i] / float(totalPos[1])
        accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

    accErrAndID = []
    accMissAndID = [[], []]
    accOverlMissAndID = [[], []]
    if withArrayIds:
        for i in range(0, len(commonAccessions)):
            accErrAndID.append(
                (accErrorRate[i], commonAccessions[i], commonArrayIds[i]))
            accMissAndID[0].append(
                (accCallRate[0][i], commonAccessions[i], commonArrayIds[i]))
            accMissAndID[1].append(
                (accCallRate[1][i], commonAccessions[i], commonArrayIds[i]))
        accOverlMissAndID[0] = zip(accOverlappingCallRate[0], commonAccessions,
                                   commonArrayIds)
        accOverlMissAndID[1] = zip(accOverlappingCallRate[1], commonAccessions,
                                   commonArrayIds)
    else:
        for i in range(0, len(commonAccessions)):
            accErrAndID.append((accErrorRate[i], commonAccessions[i]))
            accMissAndID[0].append((accCallRate[0][i], commonAccessions[i]))
            accMissAndID[1].append((accCallRate[1][i], commonAccessions[i]))
        accOverlMissAndID[0] = zip(accOverlappingCallRate[0], commonAccessions)
        accOverlMissAndID[1] = zip(accOverlappingCallRate[1], commonAccessions)
    accErrAndID.sort(
    )  #05/10/08 yh. sort(reverse=True) is not available in python 2.3
    accErrAndID.reverse()
    accMissAndID[0].sort()
    accMissAndID[0].reverse()
    accOverlMissAndID[1].sort()
    accOverlMissAndID[1].reverse()
    statstr += "#Sorted list, based on error rates (Error rate, ecotype id, array id):\n"
    for t in accErrAndID:
        statstr += str(t) + '\n'
    statstr += "#Sorted list, based on missing rates of 1st file, (Missing rate, ecotype id, array id):\n"
    for t in accMissAndID[0]:
        statstr += str(t) + '\n'
    statstr += "#Sorted list, based on missing rates of 2nd file, (Missing rate, ecotype id, array id):\n"
    for t in accMissAndID[1]:
        statstr += str(t) + '\n'
    statstr += "#Sorted list, based on (overlapping positions) missing rates of 1st file, (Missing rate, ecotype id, array id):\n"
    for t in accOverlMissAndID[0]:
        statstr += str(t) + '\n'
    statstr += "#Sorted list, based on (overlapping positions) missing rates of 2nd file, (Missing rate, ecotype id, array id):\n"
    for t in accOverlMissAndID[1]:
        statstr += str(t) + '\n'
    """
	print "Sorted list, based on error rates: ",accErrAndID,'\n'
	accMissAndID[0].sort(reverse=True)
	print "Sorted list, based on missing rates (1st file): ",accMissAndID[0],'\n'
	accMissAndID[1].sort(reverse=True)
	print "Sorted list, based on missing rates (2nd file): ",accMissAndID[1],'\n'
	"""

    if withArrayIds:
        rstr += 'accessions<-c("' + str(r[2][0]) + "_ai" + str(r[5][0]) + '"'
    else:
        rstr += 'accessions<-c("' + str(r[2][0]) + '"'
    for i in range(1, len(r[2])):
        if withArrayIds:
            rstr += ',"' + str(r[2][i]) + "_ai" + str(r[5][i]) + '"'
        else:
            rstr += ',"' + str(r[2][i]) + '"'
    rstr += ")\n"
    rstr += rfun.plotVectors(accCallRate[0], [accErrorRate],
                             xlab="Accession missing value rate",
                             ylab="Accession error rate",
                             xname="accMissingRate1",
                             ynames=["accErrorRate"])
    rstr += "text(accMissingRate1+0.0045,accErrorRate-0.0004,accessions)\n\n"
    rstr += rfun.plotVectors(accCallRate[1], [accErrorRate],
                             xlab="Accession missing value rate",
                             ylab="Accession error rate",
                             xname="accMissingRate2",
                             ynames=["accErrorRate"])
    rstr += "text(accMissingRate2+0.0045,accErrorRate-0.0004,accessions)\n\n"

    if rFile:
        f = open(rFile, "w")
        f.write(rstr)
        f.close()
    if verbose:
        print statstr
    if statFile:
        f = open(statFile, "w")
        f.write(statstr)
        f.close()
Ejemplo n.º 36
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "rFile=",
        "chr=",
        "delim=",
        "missingval=",
        "BoundaryStart=",
        "removeOutliers=",
        "addConstant=",
        "logTransform",
        "BoundaryEnd=",
        "phenotypeFileType=",
        "help",
        "parallel=",
        "parallelAll",
        "LRT",
        "minMAF=",
        "kinshipDatafile=",
        "phenotypeRanks",
        "onlyMissing",
        "onlyOriginal96",
        "onlyOriginal192",
        "onlyBelowLatidue=",
        "complement",
        "negate",
        "srInput=",
        "sr",
        "srOutput=",
        "srPar=",
        "srSkipFirstRun",
        "testRobustness",
        "permutationFilter=",
        "useLinearRegress",
        "regressionCofactors=",
        "FriLerAsCofactor",
        "FriColAsCofactor",
        "memReq=",
        "walltimeReq=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h",
                                   long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False

    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000

    testRobustness = False
    permutationFilter = 0.002

    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False

    memReq = "5g"
    walltimeReq = "150:00:00"

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"

        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "

        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "

        shstr += " --permutationFilter=" + str(permutationFilter) + " "

        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(
                srWindowSize) + " "

        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)

                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)

                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)

                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][
                    2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))

        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)

    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(
        accIndicesToKeep
    ), "accessions removed from genotype data, leaving", len(
        accIndicesToKeep), "accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(
        phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1

    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)

    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:", kinshipDatafile

    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile,
                                                 format=1,
                                                 deliminator=delim,
                                                 missingVal=missingVal)

        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write(
            "Removing accessions which do not have a phenotype value for " +
            phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(
            accIndicesToKeep
        ), "accessions removed from kinship genotype data, leaving", len(
            accIndicesToKeep), "accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append(
                    (kinshipSnpsds[0].accessions.index(acc), i))
                i += 1

        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)

        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(
                missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds

    else:
        kinshipSnpsds = newSnpsds

    print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    if logTransform and not phed.isBinary(
            phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0

    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
            addConstant = addConstant - phed.getMinValue(phenotypeIndex)

        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,
                                                  [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)

    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)

    if useLinearRegress:
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Adds ler and col as cofactors
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co factors into model
            for factor in co_factors:
                d[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect(
            )  #Calling garbage collector, in an attempt to clean up memory..
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]

                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 /
                                           (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(
                str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," +
                str(chr_pos_pval[2]) + "," + str(stat[0]) + "," +
                str(stat[1]) + "\n")
        f.close()

        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile,
                                name="LM_" + phenotypeName,
                                phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res,
                               pngFile=pngFile,
                               percentile=90,
                               type="pvals",
                               ylab="$-$log$_{10}(p)$",
                               plotBonferroni=True,
                               usePylab=False)
Ejemplo n.º 37
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = [
        "outputSNPsFile=",
        "outputPhenotFile=",
        "filterMonomorphic",
        "rawDataFormat",
        "delim=",
        "missingval=",
        "withArrayId=",
        "phenotype=",
        "phenotypeFile=",
        "phenotypeName=",
        "calcKinshipMatrix=",
        "orderAccessions",
        "help",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    inputFile = args[0]
    output_fname = None
    outputPhenotFile = None
    delim = ","
    missingVal = "NA"
    phenotypeFile = None
    kinshipMatrixFile = None
    phenotype = None
    phenotypeName = None
    rawDataFormat = False
    monomorphic = False
    help = 0
    withArrayIds = 1
    orderAccessions = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-f", "--phenotypeFile"):
            phenotypeFile = arg
        elif opt in ("calcKinshipMatrix"):
            kinshipMatrixFile = arg
        elif opt in ("--filterMonomorphic"):
            monomorphic = True
        elif opt in ("--rawDataFormat"):
            rawDataFormat = True
        elif opt in ("--minCallProb"):
            minCallProb = float(arg)
        elif opt in ("-p", "--phenotype"):
            phenotype = int(arg)
        elif opt in ("-o", "--outputSNPsFile"):
            output_fname = arg
        elif opt in ("--orderAccessions"):
            orderAccessions = True
        elif opt in ("-u", "--phenotypeFile"):
            outputPhenotFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if not output_fname:
        print output_fname
        if help == 0:
            print "Output file missing!!\n"
            print __doc__
        sys.exit(2)

    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    import dataParsers

    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

    if phenotypeFile:
        import phenotypeData

        phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t")  # Get Phenotype data
        accIndicesToKeep = []
        phenAccIndicesToKeep = []
        numAcc = len(snpsds[0].accessions)
        if phenotype >= 0:
            # Load phenotype file
            sys.stdout.write(
                "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + "."
            )
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break

        elif phenotype == None:
            sys.stdout.write("Removing accessions which do not have any phenotype values.")
            sys.stdout.flush()
            for i in range(0, len(snpsds[0].accessions)):
                acc1 = snpsds[0].accessions[i]
                for j in range(0, len(phed.accessions)):
                    acc2 = phed.accessions[j]
                    if acc1 == acc2:
                        accIndicesToKeep.append(i)
                        phenAccIndicesToKeep.append(j)
                        break

                        # Filter Accessions which do not have the phenotype value.
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

        if outputPhenotFile:
            print "Filtering phenotype data."
            phed.removeAccessions(phenAccIndicesToKeep)
            if orderAccessions:
                accessionMapping = []
                i = 0
                for acc in snpsds[0].accessions:
                    if acc in phed.accessions:
                        accessionMapping.append((phed.accessions.index(acc), i))
                        i += 1
                phed.orderAccessions(accessionMapping)
            if phenotype >= 0:
                phed.writeToFile(outputPhenotFile, [phenotype])
            else:
                phed.writeToFile(outputPhenotFile)

                # Filtering monomorphic
    if monomorphic:
        print "Filtering monomorphic SNPs"
        for snpsd in snpsds:
            print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    import snpsdata

    newSnpsds = []
    if not rawDataFormat:
        sys.stdout.write("Converting data format")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newSnpsds.append(snpsd.getSnpsData())
        print ""
        waid1 = 0
        snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder = {1: 1, 0: 0, -1: "NA"}
    else:
        snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5])
        decoder = None

    snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, decoder=decoder)
Ejemplo n.º 38
0
def _run_():
	if len(sys.argv) == 1:
		print __doc__
		sys.exit(2)
	
	long_options_list = ["rFile=", "delim=", "missingval=", "crossExamine=", "statFile=", "debug", 
						"report", "help", "withArrayId=","strainIdentity", "heterozygous2NA"]
	try:
		opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:s:c:vh", long_options_list)

	except:
		traceback.print_exc()
		print sys.exc_info()
		print __doc__
		sys.exit(2)
	
	if len(args)>2:
		print args
		raise Exception("Number of arguments isn't correct.")
	inputFile1 = args[0]
	inputFile2 = None
	crossExamineData=False
	if len(args)>1:
		inputFile2 = args[1]
	else:
		crossExamineData=True
	
	rFile = None
	statFile = None
	verbose = False
	delim = ","
	missingVal = "NA"
	debug = None
	report = None
	withArrayIds = 0
	fractionSnps = 0.05
	heterozygous2NA = False
	
	for opt, arg in opts:
		if opt in ("-h", "--help"):
			help = 1
			print __doc__
		elif opt in ("-o", "--rFile"):
			rFile = arg
		elif opt in ("-s", "--statFile"):
			statFile = arg
		elif opt in ("-t","--method"):
			version = arg
		elif opt in ("-d","--delim"):
			delim = arg
		elif opt in ("-m","--missingval"):
			missingVal = arg	
		elif opt in ("-a","--withArrayId"):
			withArrayIds = int(arg)
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("--heterozygous2NA"):
			heterozygous2NA = True
		elif opt in ("-c", "--crossExamine"):
			fractionSnps = float(arg)
		elif opt in ("--strainIdentity"):
			crossExamineData=True

	
	waid1 = withArrayIds==1 or withArrayIds==2
	waid2 = withArrayIds==2
	
	snpsds1 = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
	if inputFile2:
		snpsds2 = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2)

	if crossExamineData:
		if inputFile2:
			findIdentities(snpsds1,snpsds2,withArrayIds)
		else:
			crossExamine(snpsds1,fractionSnps,waid1)
		return
	

	if len(snpsds1) != len(snpsds2):
		raise Exception("Unequal number of chromosomes in files.")
		
	res = []
	naRate1 = 0
	naRate2 = 0
	numSNPs1 = 0
	numSNPs2 = 0
	for i in range(0,len(snpsds1)):
		res.append(snpsds1[i].compareWith(snpsds2[i],withArrayIds=withArrayIds,heterozygous2NA=heterozygous2NA))
		naRate1 += snpsds1[i].countMissingSnps()*len(snpsds1[i].positions)
		naRate2 += snpsds2[i].countMissingSnps()*len(snpsds2[i].positions)
		numSNPs1 += len(snpsds1[i].positions)
		numSNPs2 += len(snpsds2[i].positions)
		
		
	naRate1 = naRate1/float(numSNPs1)
	naRate2 = naRate2/float(numSNPs2)

	import rfun
	totalCommonPos = 0
	totalPos = [0,0]
	commonAccessions = res[0][2]
	totalAccessionCounts = [0]*len(commonAccessions)
	accOverlappingCallRate = [[0]*len(commonAccessions),[0]*len(commonAccessions)]
	accCallRate = [[0]*len(commonAccessions),[0]*len(commonAccessions)]
	accErrorRate = [0]*len(commonAccessions)
	
	statstr = "#Common SNPs positions:\n"
	rstr = "#Snps error rates\n"
	rstr = "par(mfrow=c(5,1));\n"
	snpsErrorRate = []

 	totalCounts = 0
 	totalFails = 0

	for i in range(0,len(res)): #for all chromosomes
		r = res[i]
		totalCounts += r[9][0]
		totalFails += r[9][1]
		snpsErrorRate +=r[1]
		totalCommonPos += len(r[0])
		totalPos[0] += len(snpsds1[i].positions)
		totalPos[1] += len(snpsds2[i].positions)
		statstr += "Chr. "+str(i+1)+":\n"
		statstr += str(r[0])+"\n"
		xname = "commonPos_ch"+str(i+1)
		ynames = ["errorRates_ch"+str(i+1)]
		rstr += rfun.plotOverlayingVectors(r[0],[r[1]],xlab="Position, chr. "+str(i+1),ylab="Error (red)",type="b",xname=xname,ynames=ynames)+"\n\n"
		for j in range(0,len(commonAccessions)):
			totalAccessionCounts[j] += r[6][j]
			accOverlappingCallRate[0][j]+=r[4][0][j]*float(len(r[0]))
			accOverlappingCallRate[1][j]+=r[4][1][j]*float(len(r[0]))
			accCallRate[0][j]+=r[8][0][j]
			accCallRate[1][j]+=r[8][1][j]
			accErrorRate[j]+=r[3][j]*float(r[6][j])

	statstr += "#Number of common SNPs positions:\n"
	statstr += str(totalCommonPos)+"\n"
	statstr += "#SNPs errors:\n"
	for i in range(0,len(res)):
		r = res[i]
		statstr += "Chr. "+str(i+1)+":\n"
		statstr += str(r[1])+"\n"


	statstr += "#Average Snp Error:\n"
	statstr += str(sum(snpsErrorRate)/float(len(snpsErrorRate)))+"\n"
	statstr += "#Weighted Average Snp Error:\n"
	statstr += str(totalFails/float(totalCounts))+"\n"
	
	statstr += "#Commmon accessions:\n"
	statstr += str(commonAccessions)+'\n'
	statstr += "#Number of commmon accessions:\n"
	statstr += str(len(commonAccessions))+'\n'
	statstr += "#Number of accessions (1):\n"
	statstr += str(len(snpsds1[0].accessions))+'\n'
	statstr += "#Number of accessions (2):\n"
	statstr += str(len(snpsds2[0].accessions))+'\n'

	if withArrayIds:
		commonArrayIds = res[0][5]
		statstr += "#ArrayIds:\n"
		statstr += str(commonArrayIds)+'\n'

	if not verbose:
		print "In all",len(commonAccessions),"common accessions found"
		print "In all",totalCommonPos,"common snps found"
		print "Average Snp Error:",sum(snpsErrorRate)/float(len(snpsErrorRate))
		print "NA rate (1) =",naRate1
		print "NA rate (2) =",naRate2


	for i in range(0,len(res)):
		r = res[i]
		xname = "commonPos_ch"+str(i+1)
		ynames = ["missingRates1_ch"+str(i+1),"missingRates2_ch"+str(i+1)]
		rstr += rfun.plotOverlayingVectors(r[0],[r[7][0],r[7][1]],xlab="Position, chr. "+str(i+1),ylab="Missing (red,green)",type="b",xname=xname,ynames=ynames)+"\n\n"

	for i in range(0,len(commonAccessions)):
		accOverlappingCallRate[0][i]=accOverlappingCallRate[0][i]/float(totalCommonPos)
		accOverlappingCallRate[1][i]=accOverlappingCallRate[1][i]/float(totalCommonPos)
		accCallRate[0][i]=accCallRate[0][i]/float(totalPos[0])
		accCallRate[1][i]=accCallRate[1][i]/float(totalPos[1])
		accErrorRate[i]=accErrorRate[i]/float(totalAccessionCounts[i])

	accErrAndID = []
	accMissAndID = [[],[]]
	accOverlMissAndID = [[],[]]
	if withArrayIds:
		for i in range(0,len(commonAccessions)):
			accErrAndID.append((accErrorRate[i], commonAccessions[i], commonArrayIds[i]))
			accMissAndID[0].append((accCallRate[0][i], commonAccessions[i], commonArrayIds[i]))
			accMissAndID[1].append((accCallRate[1][i], commonAccessions[i], commonArrayIds[i]))
		accOverlMissAndID[0] = zip(accOverlappingCallRate[0],commonAccessions,commonArrayIds)
		accOverlMissAndID[1] = zip(accOverlappingCallRate[1],commonAccessions,commonArrayIds)
	else:
		for i in range(0,len(commonAccessions)):
			accErrAndID.append((accErrorRate[i], commonAccessions[i]))
			accMissAndID[0].append((accCallRate[0][i], commonAccessions[i]))
			accMissAndID[1].append((accCallRate[1][i], commonAccessions[i]))
		accOverlMissAndID[0] = zip(accOverlappingCallRate[0],commonAccessions)
		accOverlMissAndID[1] = zip(accOverlappingCallRate[1],commonAccessions)
	accErrAndID.sort()	#05/10/08 yh. sort(reverse=True) is not available in python 2.3
	accErrAndID.reverse()
	accMissAndID[0].sort()
	accMissAndID[0].reverse()
	accOverlMissAndID[1].sort()
	accOverlMissAndID[1].reverse()
	statstr += "#Sorted list, based on error rates (Error rate, ecotype id, array id):\n"
	for t in accErrAndID:
		statstr += str(t)+'\n'
	statstr += "#Sorted list, based on missing rates of 1st file, (Missing rate, ecotype id, array id):\n"
	for t in accMissAndID[0]:
		statstr += str(t)+'\n'
	statstr += "#Sorted list, based on missing rates of 2nd file, (Missing rate, ecotype id, array id):\n"
	for t in accMissAndID[1]:
		statstr += str(t)+'\n'
	statstr += "#Sorted list, based on (overlapping positions) missing rates of 1st file, (Missing rate, ecotype id, array id):\n"
	for t in accOverlMissAndID[0]:
		statstr += str(t)+'\n'
	statstr += "#Sorted list, based on (overlapping positions) missing rates of 2nd file, (Missing rate, ecotype id, array id):\n"
	for t in accOverlMissAndID[1]:
		statstr += str(t)+'\n'
 
	"""
	print "Sorted list, based on error rates: ",accErrAndID,'\n'
	accMissAndID[0].sort(reverse=True)
	print "Sorted list, based on missing rates (1st file): ",accMissAndID[0],'\n'
	accMissAndID[1].sort(reverse=True)
	print "Sorted list, based on missing rates (2nd file): ",accMissAndID[1],'\n'
	"""
		

	if withArrayIds:
		rstr += 'accessions<-c("'+str(r[2][0])+"_ai"+str(r[5][0])+'"'
	else:
		rstr += 'accessions<-c("'+str(r[2][0])+'"'		
	for i in range(1, len(r[2])):
		if withArrayIds:
			rstr += ',"'+str(r[2][i])+"_ai"+str(r[5][i])+'"'			
		else:
			rstr += ',"'+str(r[2][i])+'"'
	rstr +=")\n"
	rstr += rfun.plotVectors(accCallRate[0],[accErrorRate],xlab="Accession missing value rate",ylab="Accession error rate",xname="accMissingRate1",ynames=["accErrorRate"])
	rstr += "text(accMissingRate1+0.0045,accErrorRate-0.0004,accessions)\n\n"
	rstr += rfun.plotVectors(accCallRate[1],[accErrorRate],xlab="Accession missing value rate",ylab="Accession error rate",xname="accMissingRate2",ynames=["accErrorRate"])
	rstr += "text(accMissingRate2+0.0045,accErrorRate-0.0004,accessions)\n\n"

	if rFile:
		f = open(rFile,"w")
		f.write(rstr)
		f.close()
	if verbose:
		print statstr
	if statFile:
		f = open(statFile,"w")
		f.write(statstr)
		f.close()
Ejemplo n.º 39
0
def _run_():
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)

    long_options_list = ["outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=", "parallelAll", "useFloats"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list)

    except:
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    phenotypeFileType = 1
    outputFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("n", "--sampleNum"):
            sampleNum = int(arg)
        elif opt in ("--useFloats"):
            useFloats = True
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print "CAMP is being set up with the following parameters:"
    print "phenotypeDataFile:", phenotypeDataFile
    if len(args) > 2:
        print "Phenotype_id:", args[2]
    print "snpsDataFile:", snpsDataFile
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "sampleNum:", sampleNum

    def runParallel(phenotypeIndex, id=""):
        # Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        shstr = """#!/bin/csh
#PBS -l walltime=24:00:00
#PBS -l mem=6g 
#PBS -q cmb
"""

        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "

        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"

        f = open(parallel + ".sh", "w")
        f.write(shstr)
        f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  # Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex)
        return
    else:
        phenotypeIndex = int(args[2])

        # Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data

    # Load genotype file
    snpsds = dataParsers.parseCSVData(
        snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds
    )

    # Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + "."
    )
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA":
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

                # Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  # Removing accessions that don't have genotypes or phenotype values

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

        # Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""

    # Writing phenotype data to CAMP format.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = map(int, phenVals)
    phenFile = open(phenotypeFile, "w")
    for value in phenVals:
        phenFile.write(str(value) + "\n")
    phenFile.close()

    chromosome_list = []
    positions_list = []
    scores_list = []
    interaction_positions_list = []
    mafs = []
    marfs = []
    # Writing SNP data to CAMP format.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        sf = open(snpsFile, "w")
        pf = open(posFile, "w")
        snpsd = newSnpsds[chromosome - 1]
        for i in range(0, len(snpsd.snps)):
            snp = snpsd.snps[i]
            (marf, maf) = snpsdata.getMAF(snp)
            marfs.append(marf)
            mafs.append(maf)
            str_snp = map(str, snp)
            double_snp = []
            for nt in str_snp:
                double_snp.append(nt)
                double_snp.append(nt)
            sf.write("".join(double_snp) + "\n")
            pf.write(str(snpsd.positions[i]) + "\n")
        sf.close()
        pf.close()

        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum)

        positions_list += results["positions"]
        scores_list += results["scores"]
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):
                marfs.append(0.5)  # An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)

    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
    for i in range(0, len(positions_list)):
        chromosome = chromosome_list[i]
        (pos1, pos2) = positions_list[i]
        score = scores_list[i]
        marf = marfs[i]
        maf = mafs[i]
        l = map(str, [chromosome, pos1, score, marf, maf, pos2])
        f.write(",".join(l) + "\n")
    f.close()
Ejemplo n.º 40
0
def analyzeSNPs():
    import KW, phenotype_parsers, phenotypeData
    import Emma
    result_id = "filtered_imputed"
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    #ref_seq_name = "2010_Col-0"
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_edited_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    #ad_2010 = sequences.readFastaAlignment(data_dir+"FLC_full_merged.aln.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #			ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)
    #ad = sequences.readFastaAlignment(data_dir+"flc_seqs_aln_merged_011810.fasta",ref_seq_name=ref_seq_name,ref_start=ref_start,
    #		ref_chr=ref_chr,alignment_type="muscle",ref_direction=1)

    #r = ad.get_snps(type=1)
    #seq_snpsd = r['snpsd']
    #seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')
    #seq_snpsd.onlyBinarySnps()
    #i_snpsd = r['indels']
    #print indels
    #i_snpsd = i_snpsd.getSnpsData(missingVal='NA')
    #print zip(i_snpsd.positions, i_snpsd.snps)
    #print i_snpsd.accessionsl
    seq_snpsd = dataParsers.parseCSVData(
        data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0]
    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')

    #	d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_073009.csv"
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    #	d2010_sd.filter_na_accessions()
    d2010_sd.filter_na_snps()
    d2010_sd.convert_2_binary()
    d2010_sd.filter_maf_snps(0.05)
    #kinship_2010 = Emma.calcKinship(d2010_sd.getSnps(0.05))
    d2010_sd = d2010_sd.get_region_snpsd(5, 3140000, 3220000)
    d2010_sd.remove_redundant_snps(w_missing=True)

    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv"
    snpsd = dataParsers.parse_snp_data(d250k_file)
    snpsd.filter_accessions(seq_snpsd.accessions)
    snpsd.convert_2_binary()
    snpsd.filter_maf_snps(0.05)
    #kinship_250k = Emma.calcKinship(snpsd.getSnps(0.02))

    snpsd = snpsd.get_region_snpsd(5, 3140000, 3220000)
    snpsd.remove_redundant_snps()

    seq_snpsd.remove_accessions(snpsd.accessions)
    seq_snpsd.snpsFilterRare(0.05)
    seq_snpsd.onlyBinarySnps()
    acc_map = []
    for i, acc in enumerate(seq_snpsd.accessions):
        acc_map.append((i, snpsd.accessions.index(acc)))

    seq_snpsd.orderAccessions(acc_map)
    seq_snpsd.remove_redundant_snps(w_missing=True)

    #snpsd.mergeDataUnion(d2010_sd,priority=2,unionType=3)
    #ad.compare_with_snps_data(snpsd) #Something missing here snpsd...?
    #i_snpsd =
    #snpsd.mergeDataUnion(d250k_sd,unionType=3,verbose=True)

    #NOW PERFORM GWAS AND PLOT RESULT!!!!

    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )
    #phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv")
    results_colors = ['blue', 'green', 'red']
    #kinship_matrices = [kinship_250k,kinship_250k,kinship_2010]
    snpsds = [snpsd, seq_snpsd, d2010_sd]
    phenotypeIndices = phend.phenIds
    log_transforms = [1, 2]
    import analyzePhenotype as ap
    import analyzeSNPResult as asr
    import copy

    #	for i in phenotypeIndices:
    #		#ap.drawHistogram(phend,i,pdfFile="/Users/bjarnivilhjalmsson/tmp/hist_"+str(phend.getPhenotypeName(i))+".pdf")
    #		#if i in log_transforms:
    #		phend.logTransform(i)
    #		#print "log transforming"
    #		results = []
    #		filtered_sds=[]
    #		for sd,k in zip(snpsds,kinship_matrices):
    #			new_sd = copy.deepcopy(sd)
    #			res = Emma.run_emma_w_missing_data(new_sd,phend,i,5,k)
    #			res.negLogTransform()
    #			snps_indices_to_keep = res.filterMARF(minMaf=0.1)
    #			print "Got",len(res.scores),len(res.positions),"p-values from Emma."
    #			results.append(res)
    #			#pvals = res.scores
    #			#positions = res.positions
    #			#pp = zip(pvals,positions)
    #			#pp.sort()
    #			#print pp
    #			#import plotResults as pr
    #			#pr.plotResult(res,"/Users/bjarnivilhjalmsson/tmp/test.pdf")
    #			new_sd.filter_snp_indices(snps_indices_to_keep)
    #			filtered_sds.append(new_sd)
    #		import regionPlotter as rp
    #		reg_plotter = rp.RegionPlotter()
    #		reg_plotter.plot_small_result(results,results_colors=results_colors,
    #					pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_"+result_id+"_emma_gwas_"+str(phend.getPhenotypeName(i))+".pdf")
    #		for j,(r,sd) in enumerate(zip(results,filtered_sds)):
    #			r_i = r.scores.index(max(r.scores))
    #			phend.plot_marker_box_plot(i,sd,r_i,pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_emma_"+str(phend.getPhenotypeName(i))+"_"+results_colors[j]+".pdf",marker_score=r.scores[r_i])
    #
    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    )  #phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv")

    for i in phenotypeIndices:
        results = []
        filtered_sds = []
        for sd in snpsds:
            new_sd = copy.deepcopy(sd)
            res, f_sd = KW.run_kw(new_sd, phend, i, 5)
            filtered_sds.append(f_sd)
            res.negLogTransform()
            print "Got", len(res.scores), len(
                res.positions), "p-values from KW."
            results.append(res)
            #pvals = res.scores
            #positions = res.positions
            #pp = zip(pvals,positions)
            #pp.sort()
            #print pp
            #import plotResults as pr
            #pr.plotResult(res,"/Users/bjarnivilhjalmsson/tmp/test.pdf")
        import regionPlotter as rp
        reg_plotter = rp.RegionPlotter()
        reg_plotter.plot_small_result(
            results,
            results_colors=results_colors,
            pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" +
            result_id + "_gwas_" + str(phend.getPhenotypeName(i)) + ".pdf")
        for j, (r, sd) in enumerate(zip(results, filtered_sds)):
            if len(r.scores) != len(sd.snps):
                print "Lengths not equal? %d, %d", (len(r.scores),
                                                    len(sd.snps))
            r_i = r.scores.index(max(r.scores))
            phend.plot_marker_box_plot(
                i,
                sd,
                r_i,
                pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_kw_" +
                str(phend.getPhenotypeName(i)) + "_" + results_colors[j] +
                ".pdf",
                marker_score=r.scores[r_i])
Ejemplo n.º 41
0
	def run(self):
		"""
		Load one or two SNP data sets and output NA/mismatch rates between them.

		Input formats (self.input_fname1_format / self.input_fname2_format):
		1 is read with read_data(), 2 with dataParsers.parseCSVData() without
		array ids, 3 (first file only) with parseCSVData() with array ids.

		self.run_type selects the comparison:
		1 - row-wise and column-wise comparison of file 1 against file 2;
		2 - file 1 against itself, between rows with the same ecotype id but a
		    different array id;
		3 - column-wise comparison restricted to the rows of file 1 whose
		    ecotype id is in self.ecotype_id_ls.

		Exits with status 2 on an unsupported input format and with status 3 when
		run_type is 3 and no ecotype ids were given.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix
		
		#to check whether two input file are in different orientation
		# file_format2count maps a format code to how many of the two input files use it.
		file_format2count = {}
		file_format_ls = [self.input_fname1_format, self.input_fname2_format]
		for file_format in file_format_ls:
			if file_format not in file_format2count:
				file_format2count[file_format] = 0
			file_format2count[file_format] += 1
		

		#2008-05-15 TwoSNPData can handle character matrix/2D-list. but transposeSNPData needs numeric matrix to transpose except when numpy is installed.
		if 1 in file_format2count and file_format2count[1]==1:	#there's one and only one strain x snp format.
			#it needs transpose matrix. only numpy works on character matrix. not sure Numeric or numarray is imported. so transform the input matrix to integer.
			use_nt2number = 1
		else:
			use_nt2number = 0
		
		# Load the first data set according to its format.
		if self.input_fname1_format==1:
			header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
			snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list, category_list=category_list,\
							data_matrix=data_matrix)
		elif self.input_fname1_format==2:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=False, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)	#already nt in number
			del snpsd_ls
		elif self.input_fname1_format==3:
			snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=True, use_nt2number=use_nt2number)
			snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
			del snpsd_ls
		else:
			sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
			sys.exit(2)
		
		if self.run_type!=2:
			# Load the second data set; only formats 1 and 2 are supported for it.
			if self.input_fname2_format==1:
				header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
				# NOTE(review): unlike for the first file, category_list is not passed on here -- confirm this is intended.
				snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,\
								data_matrix=data_matrix)
			elif self.input_fname2_format==2:
				snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False, use_nt2number=use_nt2number)
				snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
				del snpsd_ls
			else:
				sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
				sys.exit(2)
			
	
			if 1 in file_format2count and file_format2count[1]==1:	#there's one and only one strain x snp format. transpose the 2nd snpData
				snpData2 = transposeSNPData(snpData2, report=self.report)
			
			# Choose which component of a compound row/column id of the first file is matched against the ids of the second file.
			if self.input_fname1_format == 1:	#row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
				row_matching_by_which_value = 0
				col_matching_by_which_value = None
			elif self.input_fname1_format == 2:	#col_id for the 1st file = accession. for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = None
			elif self.input_fname1_format == 3:	#col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
				row_matching_by_which_value = None
				col_matching_by_which_value = 1
		else:
			#2008-10-12 pairwise mismatch between same data
			snpData2 = snpData1
			row_matching_by_which_value = None
			col_matching_by_which_value = None
		
		twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2, row_matching_by_which_value=row_matching_by_which_value,\
							col_matching_by_which_value=col_matching_by_which_value, debug=self.debug)
		
		if self.run_type==3:
			#2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
			if not self.ecotype_id_ls:
				sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n"%(self.run_type, self.ecotype_id_ls))
				sys.exit(3)
			ecotype_id_set = Set(self.ecotype_id_ls)
			row_id_ls = []	#test against
			for row_id in snpData1.row_id_ls:
				
				# A row id is either a plain ecotype id or a sequence whose first element is the ecotype id.
				if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
					ecotype_id = row_id[0]
				else:
					ecotype_id = row_id
				if ecotype_id in ecotype_id_set:
					row_id_ls.append(row_id)
			print '%s arrays'%(len(row_id_ls))
			# Always true at this point: an empty ecotype_id_ls already caused an exit above.
			if self.ecotype_id_ls:
				for row_id in row_id_ls:
					col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
					if col_id2NA_mismatch_rate:
						# One output file per row; the row id becomes a suffix of the file name.
						if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
							row_id_name = '_'.join(row_id)
						else:
							row_id_name = row_id
						output_fname = '%s_%s'%(self.output_fname, row_id_name)
						twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==2:
			#2008-10-12	column-wise mismatch of snpData1 vs snpData1 between rows with same ecotype_id but different array_id
			row_id_pair_set = Set()
			for row_id in snpData1.row_id_ls:
				
				if not isinstance(row_id, str) and hasattr(row_id, '__len__'):
					ecotype_id = row_id[0]
				else:
					ecotype_id = row_id
				for row_id2 in snpData2.row_id_ls:
					# NOTE(review): row_id[1] and row_id2[0]/[1] are indexed unconditionally, so this
					# branch appears to assume (ecotype_id, array_id) row ids -- confirm.
					if row_id2[0]==ecotype_id and row_id2[1]!=row_id[1]:	#same ecotype_id but different array_id
						row_id_pair_set.add((row_id, row_id2))
			
			print '%s arrays'%(len(row_id_pair_set))
			for row_id1, row_id2 in row_id_pair_set:
				# Compare exactly this pair of rows.
				row_id12row_id2 = {row_id1:row_id2}
				col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1, row_id12row_id2=row_id12row_id2)
				if col_id2NA_mismatch_rate:
					output_fname = '%s_%s_vs_%s'%(self.output_fname, '_'.join(row_id1), '_'.join(row_id2))
					twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate, output_fname)
		elif self.run_type==1:
			#sys.exit(2)	#2008-10-12 skip all original functions
			row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
			col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
			# Both results go to self.output_fname; the two calls differ only in the file_1st_open flag.
			if row_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname, file_1st_open=1)
			if col_id2NA_mismatch_rate:
				QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname, file_1st_open=0)