def plot_flc_haplotype(): import analyzeHaplotype as ah #res = readFastaFile("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/FLC_muscle_072109.aln") #seqs = res["sequences"] #seq_names = res["names"] (positions, aln_snps, seq_names) = getSNPsFromSequences( "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/FLC_muscle_072109.aln", ref_start_pos=3170001, ref_seq="cut_3184000_3170000", reversed=True) #aln_snps = map(list,zip(*seqs)) #seqs = reverse_sequences(seqs) i = seq_names.index("cut_3184000_3170000") seq_names[i] = "Col_TAIR8" (flc_250k_snps, flc_snps, flc_250K_positions, accessions, flc_data_acc_map) = get_overlapping_snps_in_region() print flc_data_acc_map import phenotypeData as pd a_dict = pd._getEcotypeIdInfoDict_() new_accessions = [] for acc in accessions: new_accessions.append(unicode(a_dict[int(acc)][0], 'iso-8859-1')) accessions = new_accessions ah.plot_flc_haplotypes( aln_snps, positions=positions, accessions=seq_names, haplotypeFile="/Users/bjarnivilhjalmsson/tmp/aln_haplotype.pdf", treeFile="/Users/bjarnivilhjalmsson/tmp/aln_tree.pdf", acc_250k=flc_data_acc_map, flc_250K_positions=flc_250K_positions)
def _get192Ecotypes_(): resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/" phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t') phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4] total_accessions = set() for p_i in phenotypeIndices: if not p_i in [5,6,7]: accessions = phed.getAccessionsWithValues(p_i) total_accessions = total_accessions.union(accessions) ecotypes_192 = phenotypeData._getFirst192Ecotypes_() ecotypes_192 = [str(e) for e in ecotypes_192] print "len(ecotypes_192):",len(ecotypes_192) #print ecotypes_192 phed.filterAccessions(ecotypes_192) for p_i in [5,6,7]: accessions = phed.getAccessionsWithValues(p_i) total_accessions = total_accessions.union(accessions) total_accessions = list(total_accessions) print len(total_accessions) total_accessions.sort() print total_accessions ecotype_info_dict = phenotypeData._getEcotypeIdInfoDict_() ets = [] i = 0 for et in total_accessions: et = int(et) if ecotype_info_dict.has_key(et): print str(et)+", "+str(ecotype_info_dict[et][0])+", "+str(ecotype_info_dict[et][1]) i += 1 ets.append(et) else: print et,"is missing in genotype data." print i return ets
def _run_():
    """Command-line driver for the Emma / linear-regression GWAS pipeline.

    Parses getopt options, then either (a) with --parallel, writes a csh/PBS
    job script per phenotype and submits it with qsub, or (b) runs the
    analysis locally: loads genotype + phenotype files, filters/orders
    accessions, transforms the phenotype, and (with --useLinearRegress)
    fits per-SNP linear models via rpy, writing a .pvals file and a GW plot.
    Python 2 code; relies on module-level imports (sys, getopt, os, math,
    traceback, dataParsers, phenotypeData, snpsdata, gwaResults,
    plotResults) and module-level constants (resultDir, emmadir).
    """
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    long_options_list = ["rFile=", "chr=", "delim=", "missingval=",
        "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform",
        "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=",
        "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks",
        "onlyMissing", "onlyOriginal96", "onlyOriginal192",
        "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr",
        "srOutput=", "srPar=", "srSkipFirstRun", "testRobustness",
        "permutationFilter=", "useLinearRegress", "regressionCofactors=",
        "FriLerAsCofactor", "FriColAsCofactor", "memReq=", "walltimeReq=", ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)
    except:
        # NOTE(review): bare except — swallows SystemExit/KeyboardInterrupt too.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    # --- Option defaults ---
    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1  # -1 means "do not add a constant"; 0 means "auto-pick"
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF = 0.0
    boundaries = [-1, -1]
    chr = None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False
    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000
    testRobustness = False
    permutationFilter = 0.002
    useLinearRegress = False
    regressionCofactors = None
    FriLerAsCofactor = False
    FriColAsCofactor = False
    memReq = "5g"
    walltimeReq = "150:00:00"
    # --- Parse options ---
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c", "--chr"):
            chr = int(arg)
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--FriLerAsCofactor"):
            FriLerAsCofactor = True
        elif opt in ("--FriColAsCofactor"):
            FriColAsCofactor = True
        elif opt in ("--useLinearRegress"):
            useLinearRegress = True
        elif opt in ("--regressionCofactors"):
            regressionCofactors = arg
        elif opt in ("--memReq"):
            memReq = arg
        elif opt in ("--walltimeReq"):
            walltimeReq = arg
        else:
            if help == 0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args) < 3 and not parallel:
        if help == 0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)
    # --- Echo configuration ---
    print "Emma is being set up with the following parameters:"
    print "output:", rFile
    print "phenotypeRanks:", phenotypeRanks
    print "phenotypeFileType:", phenotypeFileType
    print "parallel:", parallel
    print "parallelAll:", parallelAll
    print "minMAF:", minMAF
    print "LRT:", lrt
    print "delim:", delim
    print "missingval:", missingVal
    print "kinshipDatafile:", kinshipDatafile
    print "chr:", chr
    print "boundaries:", boundaries
    print "onlyMissing:", onlyMissing
    print "onlyOriginal96:", onlyOriginal96
    print "onlyOriginal192:", onlyOriginal192
    print "onlyBelowLatidue:", onlyBelowLatidue
    print "complement:", complement
    print "negate:", negate
    print "logTransform:", logTransform
    print "addConstant:", addConstant
    print "removeOutliers:", removeOutliers
    print "sr:", sr
    print "srSkipFirstRun:", srSkipFirstRun
    print "srInput:", srInput
    print "srOutput:", srOutput
    print "srTopQuantile:", srTopQuantile
    print "srWindowSize:", srWindowSize
    print "testRobustness:", testRobustness
    print "permutationFilter:", permutationFilter
    print "useLinearRegress:", useLinearRegress
    print "regressionCofactors:", regressionCofactors
    print "FriLerAsCofactor:", FriLerAsCofactor
    print "FriColAsCofactor:", FriColAsCofactor
    print "walltimeReq:", walltimeReq
    print "memReq:", memReq

    def runParallel(phenotypeIndex, phed):
        """Write a csh/PBS job script for one phenotype and submit it via qsub.

        Closes over the parsed options above; uses module-level resultDir and
        emmadir for output / script paths.
        """
        #Cluster specific parameters
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir + "Emma_" + parallel + "_" + phenName
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=" + walltimeReq + "\n"
        shstr += "#PBS -l mem=" + memReq + "\n"
        shstr += "#PBS -q cmb\n"
        shstr += "#PBS -N E" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        if useLinearRegress:
            # Linear-regression runs get an "LR_" output prefix instead.
            outFileName = resultDir + "LR_" + parallel + "_" + phenName
        shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " "
        if useLinearRegress:
            shstr += " --useLinearRegress "
        if regressionCofactors:
            shstr += " --regressionCofactors=" + str(regressionCofactors) + " "
        if FriLerAsCofactor:
            shstr += " --FriLerAsCofactor "
        if FriColAsCofactor:
            shstr += " --FriColAsCofactor "
        if onlyOriginal96:
            shstr += " --onlyOriginal96 "
        elif onlyOriginal192:
            shstr += " --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers=" + str(removeOutliers) + " "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr += " --testRobustness "
        shstr += " --permutationFilter=" + str(permutationFilter) + " "
        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
            # NOTE(review): if srOutput was given, `output` may be unbound here
            # — presumably only the default path was ever exercised; verify.
            shstr += " --srOutput=" + str(output) + " "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                shstr += " --srInput=" + str(output) + " "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar=" + str(srTopQuantile) + "," + str(srWindowSize) + " "
        if kinshipDatafile:
            shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " "
        shstr += " --addConstant=" + str(addConstant) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"
        f = open(parallel + ".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    # Skip phenotypes whose result files already exist and are non-empty.
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)
                    except Exception:
                        print "File", pvalFile, "does not exist."
                    if res and res.st_size > 0:
                        print "File", pvalFile, "already exists, and is non-empty."
                        if sr:
                            srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File", srInput, "does not exist."
                            if srRes and srRes.st_size > 0:
                                print "File", srInput, "already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex, phed)
                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex, phed)
                else:
                    runParallel(phenotypeIndex, phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex, phed)
        return  # Parallel mode only submits jobs; the analysis happens in them.
    else:
        phenotypeIndex = int(args[2])
    print "phenotypeIndex:", phenotypeIndex
    print "\nStarting program now!\n"
    # --- Local (non-parallel) analysis ---
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal)
    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)
    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex, removeOutliers)
    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str, original_96_ecotypes)
        keepEcotypes = []
        if complement:
            # --complement keeps everything EXCEPT the original 96.
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str, original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            # Keep accessions below the latitude cutoff, and those with an
            # explicit None latitude (unknown origin).
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2] < onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2] == None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0, len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2 = phed.accessions[j]
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break
    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc - len(accIndicesToKeep), "accessions removed from genotype data, leaving", len(accIndicesToKeep), "accessions in all."
    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len(phed.accessions) == len(snpsds[0].accessions)
    if len(phed.accessions) != len(snpsds[0].accessions):
        raise Exception
    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)
    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1])
        snpsds = [snpsds[chr - 1]]
    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc), i))
            i += 1
    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)", len(snpsds[0].snps)
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."
    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""
    print "Checking kinshipfile:", kinshipDatafile
    if kinshipDatafile:  #Is there a special kinship file?
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal)
        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write("Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".")
        sys.stdout.flush()
        for i in range(0, len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0, len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc - len(accIndicesToKeep), "accessions removed from kinship genotype data, leaving", len(accIndicesToKeep), "accessions in all."
        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append((kinshipSnpsds[0].accessions.index(acc), i))
                i += 1
        print zip(accessionMapping, snpsds[0].accessions)
        print "len(snpsds[0].snps)", len(snpsds[0].snps)
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."
        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds
    else:
        # No dedicated kinship file: reuse the (already converted) genotype data.
        kinshipSnpsds = newSnpsds
    print "Found kinship data."
    #Ordering accessions according to the order of accessions in the genotype file
    #	accessionMapping = []
    #	i = 0
    #	for acc in snpsds[0].accessions:
    #		if acc in phed.accessions:
    #			accessionMapping.append((phed.accessions.index(acc),i))
    #			i += 1
    #	phed.orderAccessions(accessionMapping)
    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)
    # Auto-pick a shift constant when log-transforming non-positive phenotypes.
    if logTransform and not phed.isBinary(phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0:
        addConstant = 0
    #Adding a constant.
    if addConstant != -1:
        if addConstant == 0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10
        addConstant = addConstant - phed.getMinValue(phenotypeIndex)
        print "Adding a constant to phenotype:", addConstant
        phed.addConstant(phenotypeIndex, addConstant)
    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)
    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [1, 2, 3, 4, 5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr])
    phenotypeName = phed.getPhenotypeName(phenotypeIndex)
    sys.stdout.flush()
    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter)
        sys.exit(0)
    if useLinearRegress:
        # Per-SNP linear model via rpy: d0 is the model with the SNP term,
        # dh is the model without it; an F-test compares their residual sums.
        phenVals = phed.getPhenVals(phenotypeIndex)
        d0 = {}
        d0["phen"] = phenVals
        dh = {}
        dh["phen"] = phenVals
        import rpy, gc
        if regressionCofactors:  #Adds ler and col as cofactors
            import pickle
            f = open(regressionCofactors, "r")
            co_factors = pickle.load(f)
            f.close()
            #inserting co factors into model
            for factor in co_factors:
                # NOTE(review): `d` is undefined here (NameError if this path
                # runs) — presumably should insert into d0 and dh; confirm.
                d[factor] = co_factors[factor]
        import analyzeHaplotype as ah
        (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True)
        if FriColAsCofactor:
            d0["col"] = col_factor
            dh["col"] = col_factor
        if FriLerAsCofactor:
            d0["ler"] = ler_factor
            dh["ler"] = ler_factor
        chr_pos_pvals = []
        stats = []
        sys.stdout.write("Applying the linear model")
        sys.stdout.flush()
        for i in range(0, len(newSnpsds)):  #[3]:#
            snpsd = newSnpsds[i]
            sys.stdout.write("|")
            sys.stdout.flush()
            gc.collect()  #Calling garbage collector, in an attempt to clean up memory..
            for j in range(0, len(snpsd.snps)):
                if j % 5000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                #if snpsd.positions[j]>1700000:
                #	break
                snp = snpsd.snps[j]
                d0["snp"] = snp
                try:
                    rpy.set_default_mode(rpy.NO_CONVERSION)
                    # NOTE(review): bare `r(...)` — presumably rpy's r object
                    # must be in scope (e.g. from rpy import r); confirm.
                    aov0 = rpy.r.aov(r("phen ~ ."), data=d0)
                    aovh = rpy.r.aov(r("phen ~ ."), data=dh)
                    rpy.set_default_mode(rpy.BASIC_CONVERSION)
                    s0 = rpy.r.summary(aov0)
                    sh = rpy.r.summary(aovh)
                    #print s0,sh
                    rss_0 = s0['Sum Sq'][-1]
                    # summary() returns a scalar when the model has one term.
                    if type(sh['Sum Sq']) != float:
                        rss_h = sh['Sum Sq'][-1]
                    else:
                        rss_h = sh['Sum Sq']
                    f = (rss_h - rss_0) / (rss_0 / (len(phenVals) - len(d0) + 1))
                    pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False)
                except Exception, err_str:
                    print "Calculating p-value failed"  #,err_str
                    pval = 1.0
                #print "dh:",dh
                #print "d0:",d0
                #print "rss_h,rss_0:",rss_h,rss_0
                #print "f,p:",f,pval
                chr_pos_pvals.append([i + 1, snpsd.positions[j], pval])
                # Minor-allele count and frequency for this SNP.
                mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0]))
                maf = mafc / float(len(snp))
                stats.append([maf, mafc])
        sys.stdout.write("\n")
        #Write out to a result file
        sys.stdout.write("Writing results to file\n")
        sys.stdout.flush()
        pvalFile = rFile + ".pvals"
        f = open(pvalFile, "w")
        f.write("Chromosome,position,p-value,marf,maf\n")
        for i in range(0, len(chr_pos_pvals)):
            chr_pos_pval = chr_pos_pvals[i]
            stat = stats[i]
            f.write(str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," + str(chr_pos_pval[2]) + "," + str(stat[0]) + "," + str(stat[1]) + "\n")
        f.close()
        #Plot results
        print "Generating a GW plot."
        phenotypeName = phed.getPhenotypeName(phenotypeIndex)
        res = gwaResults.Result(pvalFile, name="LM_" + phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile + ".png"
        plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
def _run_():
    """Command-line driver for the Kruskal-Wallis (or Fisher, for binary
    phenotypes) GWAS pipeline.

    Parses getopt options, then either (a) with --parallel, writes a csh/PBS
    job script per phenotype and submits it with qsub, or (b) runs locally:
    loads and filters phenotype + genotype data, runs KW/Fisher tests,
    writes a .pvals file and GW plots, optionally followed by a
    second-run (sr) analysis.  Python 2 code; relies on module-level
    imports (sys, getopt, os, random, traceback, dataParsers,
    phenotypeData, snpsdata, gwaResults, plotResults, util) and
    module-level constants (resultDir, scriptDir).
    """
    if len(sys.argv)==1:
        print __doc__
        sys.exit(2)
    long_options_list=["outputFile=", "delim=", "missingval=",
        "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB",
        "callMethodID=", "comment=", "onlyOriginal192", "onlyOriginal96",
        "subSample=", "subSampleLikePhenotype=", "subsampleTest=",
        "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=",
        "sr", "srOutput=", "srPar=", "srSkipFirstRun", "permTest=",
        "savePermutations", "permutationFilter=", "testRobustness",
        "memReq=", "walltimeReq=",]
    try:
        opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list)
    except:
        # NOTE(review): bare except — swallows SystemExit/KeyboardInterrupt too.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)
    # --- Option defaults ---
    phenotypeFileType=1
    outputFile=None
    delim=","
    missingVal="NA"
    help=0
    parallel=None
    parallelAll=False
    addToDB=False
    callMethodID=None
    comment=""
    subSample=None
    onlyOriginal96=False
    onlyOriginal192 = False
    subSampleLikePhenotype = None
    subsampleTest = False
    numSubSamples = None
    complement = False
    onlyBelowLatidue = None
    onlyAboveLatidue = None
    sr = False
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000
    permTest = None
    savePermutations = False
    permutationFilter = 1.0
    testRobustness = False
    memReq = "5g"
    walltimeReq = "100:00:00"
    # --- Parse options ---
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help=1
            print __doc__
        elif opt in ("-o", "--outputFile"):
            outputFile=arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType=int(arg)
        elif opt in ("--parallel"):
            parallel=arg
        elif opt in ("--parallelAll"):
            parallelAll=True
        elif opt in ("--addToDB"):
            addToDB=True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96=True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192=True
        elif opt in ("--complement"):
            complement=True
        elif opt in ("--subSample"):
            subSample=int(arg)
        elif opt in ("--subsampleTest"):
            # Argument is "subSampleSize,numSubSamples".
            subsampleTest = True
            l = arg.split(",")
            subSample=int(l[0])
            numSubSamples=int(l[1])
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue=float(arg)
        elif opt in ("--onlyAboveLatidue"):
            onlyAboveLatidue=float(arg)
        elif opt in ("--subSampleLikePhenotype"):
            subSampleLikePhenotype=int(arg)
        elif opt in ("--callMethodID"):
            callMethodID=int(arg)
        elif opt in ("--comment"):
            comment=arg
        elif opt in ("-d", "--delim"):
            delim=arg
        elif opt in ("-m", "--missingval"):
            missingVal=arg
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permTest"):
            permTest = int(arg)
        elif opt in ("--savePermutations"):
            savePermutations = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            # Argument is "topQuantile,windowSize".
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--memReq"):
            memReq=arg
        elif opt in ("--walltimeReq"):
            walltimeReq=arg
        else:
            if help==0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)
    if len(args)<3 and not parallel:
        if help==0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)
    snpsDataFile=args[0]
    phenotypeDataFile=args[1]
    # --- Echo configuration ---
    print "Kruskal-Wallis is being set up with the following parameters:"
    print "phenotypeDataFile:",phenotypeDataFile
    print "snpsDataFile:",snpsDataFile
    print "parallel:",parallel
    print "parallelAll:",parallelAll
    print "onlyOriginal96:",onlyOriginal96
    print "onlyOriginal192:",onlyOriginal192
    print "onlyBelowLatidue:",onlyBelowLatidue
    print "onlyAboveLatidue:",onlyAboveLatidue
    print "complement:",complement
    print "subSampleLikePhenotype:",subSampleLikePhenotype
    print "subsampleTest:",subsampleTest
    print "numSubSamples:",numSubSamples
    print "subSample:",subSample
    print "sr:",sr
    print "srSkipFirstRun:",srSkipFirstRun
    print "srInput:",srInput
    print "srOutput:",srOutput
    print "srTopQuantile:",srTopQuantile
    print "srWindowSize:",srWindowSize
    print "permTest:",permTest
    print "savePermutations:",savePermutations
    print "permutationFilter:",permutationFilter
    print "testRobustness:",testRobustness
    print "walltimeReq:",walltimeReq
    print "memReq:",memReq

    def runParallel(phenotypeIndex, id=""):
        """Write a csh/PBS job script for one phenotype and submit it via
        qsub.  `id` is an optional suffix used to distinguish subsample
        replicate runs.  Closes over the parsed options; uses module-level
        resultDir and scriptDir.
        """
        #Cluster specific parameters
        phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
        phenName=phed.getPhenotypeName(phenotypeIndex)
        print phenName
        outputFile=resultDir+"KW_"+parallel+"_"+phenName+id
        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime="+walltimeReq+"\n"
        shstr += "#PBS -l mem="+memReq+"\n"
        shstr +="#PBS -q cmb\n"
        shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
        shstr+="set phenotypeName="+parallel+"\n"
        shstr+="set phenotype="+str(phenotypeIndex)+"\n"
        shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
        if subSample:
            shstr+=" --subSample="+str(subSample)+" "
        elif onlyOriginal96:
            shstr+=" --onlyOriginal96 "
        elif onlyOriginal192:
            shstr+=" --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
        elif onlyAboveLatidue:
            shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
        if complement:
            shstr+=" --complement "
        if permTest:
            shstr+=" --permTest="+str(permTest)+" "
            if savePermutations:
                shstr+=" --savePermutations "
        shstr+=" --permutationFilter="+str(permutationFilter)+" "
        if testRobustness:
            shstr+=" --testRobustness "
        if sr:
            shstr += " --sr "
            if not srOutput:
                output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"
            # NOTE(review): if srOutput was given, `output` may be unbound
            # here — presumably only the default path was exercised; verify.
            shstr += " --srOutput="+str(output)+" "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
                shstr += " --srInput="+str(output)+" "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
        shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
        shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"
        f=open(parallel+".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub "+parallel+".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        elif subsampleTest:
            phenotypeIndex=int(args[2])
            for i in range(0,numSubSamples):
                runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
        else:
            phenotypeIndex=int(args[2])
            runParallel(phenotypeIndex)
        return  # Parallel mode only submits jobs; the analysis happens in them.
    else:
        phenotypeIndex=int(args[2])
    print "phenotypeIndex:",phenotypeIndex
    print "output:",outputFile
    print "\nStarting program now!\n"
    #Load phenotype file
    phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str,original_96_ecotypes)
        keepEcotypes = []
        if complement:
            # --complement keeps everything EXCEPT the original 96.
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str,original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            # Keep accessions below the cutoff, and those whose latitude is
            # explicitly None (unknown origin).
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2]==None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    elif onlyAboveLatidue:
        print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2]==None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if subSampleLikePhenotype:
        # Restrict to the same ecotype sample used by another phenotype.
        p_name = phed.getPhenotypeName(subSampleLikePhenotype)
        print "Picking sample as in",p_name
        ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
        print ecotypes
        phed.filterAccessions(ecotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if subSample:
        # Draw a random subsample of the non-NA ecotypes for this phenotype.
        sample_ecotypes = []
        ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
        sample_ecotypes = random.sample(ecotypes,subSample)
        phed.filterAccessions(sample_ecotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()
    #Load genotype file
    snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal)
    #Checking overlap between phenotype and genotype accessions.
    phenotype=phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep=[]
    phenAccIndicesToKeep=[]
    numAcc=len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
    sys.stdout.flush()
    for i in range(0, len(snpsds[0].accessions)):
        acc1=snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2=phed.accessions[j]
            if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break
    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."
    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping=[]
    i=0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i+=1
    phed.orderAccessions(accessionMapping)
    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"
    #Converting format to 01
    newSnpsds=[]
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""
    #Double check genotype file:
    problems = 0
    for i in range(0,len(newSnpsds)):
        snpsd = newSnpsds[i]
        for j in range(0,len(snpsd.snps)):
            snp = snpsd.snps[j]
            sc = snp.count(0)
            # A SNP with all-0 or no-0 calls is monomorphic in 01 format.
            if sc==0 or sc==len(snp):
                # NOTE(review): indexes positions with i (chromosome index),
                # not j (SNP index) — looks like a reporting bug; confirm.
                print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
                problems += 1
    if problems >0:
        print "Genotype file appears to have potential problems"
    else:
        print "Genotype file appears to be good"
    if permTest:
        print "Starting a permutation test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
            # NOTE(review): appears to force 100 permutations for binary
            # phenotypes, overriding --permTest; confirm intended.
            permTest = 100
        _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
        sys.exit(0)
    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
        _robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
        sys.exit(0)
    sys.stdout.flush()
    print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
    # First-stage analysis (skipped when --sr with --srSkipFirstRun).
    if (not sr) or (sr and not srSkipFirstRun):
        #Writing files
        #phed and phenotype
        sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        phenotypeName=phed.getPhenotypeName(phenotypeIndex)
        if phed.isBinary(phenotypeIndex):
            pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex))
        else:
            snps = sd.getSnps()
            phen_vals = phed.getPhenVals(phenotypeIndex)
            try:
                kw_res = util.kruskal_wallis(snps,phen_vals)
                pvals = kw_res['ps']
            except:
                # Dump the inputs before failing to aid debugging.
                print snps
                print phen_vals
                print len(snps),len(snps[0]),len(phen_vals)
                raise Exception
        res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False)
        pvalFile=outputFile+".pvals"
        res.writeToFile(pvalFile)
        print "Generating a GW plot."
        res.negLogTransform()
        pngFile = pvalFile+".png"
        plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)
        srInput = pvalFile
    else:
        print "Skipping first stage analysis."
        sys.stdout.flush()
    # Optional second-run (sr) analysis on the top results.
    if sr:
        # NOTE(review): `binary` is not defined anywhere in this function —
        # this raises NameError; presumably phed.isBinary(phenotypeIndex)
        # was intended.  Also `phenotypeName` and `pvalFile` are unbound on
        # the srSkipFirstRun path; confirm against the original script.
        _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
        srRes.negLogTransform()
        srPngFile = pvalFile+".sr.png"
        plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _run_():
    """
    Command-line driver for an EMMA (mixed-model) genome-wide association run.

    Workflow (Python 2 script entry point):
      1. Parse getopt options (output file, filters, transformations, second-run
         parameters, etc.).
      2. If --parallel is set, write a csh/PBS job script per phenotype and
         submit it with qsub, then return.
      3. Otherwise: load genotype + phenotype data, prefilter accessions,
         filter/ordering SNP data, optionally build a separate kinship dataset,
         transform the phenotype, and run the EMMA script, plotting results.

    Reads sys.argv; exits the process on usage errors or after sub-tasks.
    Relies on module-level names: getopt, sys, os, math, traceback, resultDir,
    emmadir, phenotypeData, dataParsers, snpsdata, gwaResults, plotResults,
    _runEmmaScript_, _robustness_test_, _secondRun_.
    """
    if len(sys.argv) == 1:
        print __doc__
        sys.exit(2)
    # Long option names accepted by getopt (note: "Latidue" typo is part of the
    # established CLI surface; do not "fix" without updating callers/scripts).
    long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness", "permutationFilter="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)
    except:
        # Bare except mirrors the original best-effort usage reporting.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    # Option defaults.
    phenotypeRanks = False
    removeOutliers = None
    addConstant = -1               # -1 means "do not add a constant"
    phenotypeFileType = 1
    rFile = None
    delim = ","
    missingVal = "NA"
    help = 0
    minMAF=0.0
    withArrayIds = 1
    boundaries = [-1,-1]           # [BoundaryStart, BoundaryEnd] for --chr region filtering
    chr=None
    parallel = None
    logTransform = False
    negate = False
    parallelAll = False
    lrt = False
    kinshipDatafile = None
    onlyMissing = False
    onlyOriginal96 = False
    onlyOriginal192 = False
    onlyBelowLatidue = None
    complement = False
    sr = False                     # "second run" mode
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000
    testRobustness = False
    permutationFilter = 0.002

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help = 1
            print __doc__
        elif opt in ("-a","--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o","--rFile"):
            rFile = arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType = int(arg)
        elif opt in ("--BoundaryStart"):
            boundaries[0] = int(arg)
        elif opt in ("--BoundaryEnd"):
            boundaries[1] = int(arg)
        elif opt in ("--addConstant"):
            addConstant = float(arg)
        elif opt in ("--parallel"):
            parallel = arg
        elif opt in ("--minMAF"):
            minMAF = float(arg)
        elif opt in ("--parallelAll"):
            parallelAll = True
        elif opt in ("--onlyMissing"):
            onlyMissing = True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96 = True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192 = True
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue = float(arg)
        elif opt in ("--complement"):
            complement = True
        elif opt in ("--logTransform"):
            logTransform = True
        elif opt in ("--negate"):
            negate = True
        elif opt in ("--removeOutliers"):
            removeOutliers = float(arg)
        elif opt in ("--LRT"):
            lrt = True
        elif opt in ("-c","--chr"):
            chr = int(arg)
        elif opt in ("-d","--delim"):
            delim = arg
        elif opt in ("-m","--missingval"):
            missingVal = arg
        elif opt in ("--kinshipDatafile"):
            kinshipDatafile = arg
        elif opt in ("--phenotypeRanks"):
            phenotypeRanks = True
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            # Two comma-separated values: top quantile, window size.
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        else:
            if help==0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    # Three positional args (snps file, phenotype file, phenotype index) are
    # required unless we only dispatch cluster jobs (--parallel).
    if len(args)<3 and not parallel:
        if help==0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    # Echo the effective configuration.
    print "Emma is being set up with the following parameters:"
    print "output:",rFile
    print "phenotypeRanks:",phenotypeRanks
    print "withArrayId:",withArrayIds
    print "phenotypeFileType:",phenotypeFileType
    print "parallel:",parallel
    print "parallelAll:",parallelAll
    print "minMAF:",minMAF
    print "LRT:",lrt
    print "delim:",delim
    print "missingval:",missingVal
    print "kinshipDatafile:",kinshipDatafile
    print "chr:",chr
    print "boundaries:",boundaries
    print "onlyMissing:",onlyMissing
    print "onlyOriginal96:",onlyOriginal96
    print "onlyOriginal192:",onlyOriginal192
    print "onlyBelowLatidue:",onlyBelowLatidue
    print "complement:",complement
    print "negate:",negate
    print "logTransform:",logTransform
    print "addConstant:",addConstant
    print "removeOutliers:",removeOutliers
    print "sr:",sr
    print "srSkipFirstRun:",srSkipFirstRun
    print "srInput:",srInput
    print "srOutput:",srOutput
    print "srTopQuantile:",srTopQuantile
    print "srWindowSize:",srWindowSize
    print "testRobustness:",testRobustness
    print "permutationFilter:",permutationFilter

    def runParallel(phenotypeIndex,phed):
        # Cluster specific parameters: write a csh/PBS submission script that
        # re-invokes Emma.py for one phenotype, then qsub it.
        print phenotypeIndex
        phenName = phed.getPhenotypeName(phenotypeIndex)
        outFileName = resultDir+"Emma_"+parallel+"_"+phenName
        shstr = """#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=8g
#PBS -q cmb
"""
        shstr += "#PBS -N E"+phenName+"_"+parallel+"\n"
        shstr += "set phenotypeName="+parallel+"\n"
        shstr += "set phenotype="+str(phenotypeIndex)+"\n"
        shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" "
        # Forward the relevant flags of the current invocation to the job.
        if onlyOriginal96:
            shstr+=" --onlyOriginal96 "
        elif onlyOriginal192:
            shstr+=" --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
        if logTransform:
            shstr += " --logTransform "
        if negate:
            shstr += " --negate "
        if removeOutliers:
            shstr += " --removeOutliers="+str(removeOutliers)+" "
        if phenotypeRanks:
            shstr += " --phenotypeRanks "
        if testRobustness:
            shstr+=" --testRobustness "
        shstr+=" --permutationFilter="+str(permutationFilter)+" "
        if sr:
            shstr += " --sr "
            # Default the second-run output/input paths when not given.
            if not srOutput:
                output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
            shstr += " --srOutput="+str(output)+" "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
                shstr += " --srInput="+str(output)+" "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
        shstr += " -a "+str(withArrayIds)+" "
        if kinshipDatafile:
            shstr += " --kinshipDatafile="+str(kinshipDatafile)+" "
        shstr += " --addConstant="+str(addConstant)+" "
        shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
        shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"
        f = open(parallel+".sh",'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub "+parallel+".sh ")

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]
    if parallel:  #Running on the cluster..
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        if parallelAll:
            for phenotypeIndex in phed.phenIds:
                if onlyMissing:
                    # Skip phenotypes whose result files already exist and are non-empty.
                    phenName = phed.getPhenotypeName(phenotypeIndex)
                    pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals"
                    res = None
                    try:
                        res = os.stat(pvalFile)
                    except Exception:
                        print "File",pvalFile,"does not exist."
                    if res and res.st_size>0:
                        print "File",pvalFile,"already exists, and is non-empty."
                        if sr:
                            srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals"
                            srRes = None
                            try:
                                srRes = os.stat(srInput)
                            except Exception:
                                print "File",srInput,"does not exist."
                            if srRes and srRes.st_size>0:
                                print "File",srInput,"already exists, and is non-empty."
                            else:
                                runParallel(phenotypeIndex,phed)
                    else:
                        print "Setting up the run."
                        runParallel(phenotypeIndex,phed)
                else:
                    runParallel(phenotypeIndex,phed)
        else:
            phenotypeIndex = int(args[2])
            runParallel(phenotypeIndex,phed)
        return
    else:
        phenotypeIndex = int(args[2])

    print "phenotypeIndex:",phenotypeIndex
    print "\nStarting program now!\n"

    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    numAcc = len(snpsds[0].accessions)

    #Removing outliers
    if removeOutliers:
        print "Remoing outliers"
        phed.naOutliers(phenotypeIndex,removeOutliers)

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str,original_96_ecotypes)
        keepEcotypes = []
        if complement:
            # --complement keeps everything EXCEPT the original 96.
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str,original_192_ecotypes)
        keepEcotypes = []
        if complement:
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            # eiDict[acc][2] appears to be the latitude; accessions with an
            # unknown latitude (None) are also kept — TODO confirm semantics.
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2]==None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    #Checking which accessions to keep and which to remove .
    for i in range(0,len(snpsds[0].accessions)):
        acc1 = snpsds[0].accessions[i]
        for j in range(0,len(phed.accessions)):
            acc2 = phed.accessions[j]
            # Keep only accessions present in BOTH datasets with a non-NA
            # phenotype value.
            if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    print "\nFiltering accessions in genotype data:"
    #Filter accessions which do not have the phenotype value (from the genotype data).
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all."

    print "\nNow filtering accessions in phenotype data:"
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values
    print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions)
    if len(phed.accessions)!=len(snpsds[0].accessions):
        raise Exception

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps"

    #Remove minor allele frequencies
    if minMAF!=0:
        sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Removing SNPs which are outside of boundaries.
    if chr:
        print "\nRemoving SNPs which are outside of boundaries."
        snpsds[chr-1].filterRegion(boundaries[0],boundaries[1])
        snpsds = [snpsds[chr-1]]

    #Ordering accessions in genotype data to fit phenotype data.
    print "Ordering genotype data accessions."
    accessionMapping = []
    i = 0
    for acc in phed.accessions:
        if acc in snpsds[0].accessions:
            accessionMapping.append((snpsds[0].accessions.index(acc),i))
            i += 1
    #print zip(accessionMapping,snpsds[0].accessions)
    print "len(snpsds[0].snps)",len(snpsds[0].snps)
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.orderAccessions(accessionMapping)
    print "\nGenotype data has been ordered."

    #Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))
    print ""

    print "Checking kinshipfile:",kinshipDatafile
    if kinshipDatafile:  #Is there a special kinship file?
        # Load a separate genotype dataset just for the kinship matrix, and
        # subject it to the same accession filtering/ordering as above.
        kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)
        accIndicesToKeep = []
        #Checking which accessions to keep and which to remove (genotype data).
        sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
        sys.stdout.flush()
        for i in range(0,len(kinshipSnpsds[0].accessions)):
            acc1 = kinshipSnpsds[0].accessions[i]
            for j in range(0,len(phed.accessions)):
                acc2 = phed.accessions[j]
                if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
                    accIndicesToKeep.append(i)
                    break
        print accIndicesToKeep
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.removeAccessionIndices(accIndicesToKeep)
        print ""
        print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all."

        print "Ordering kinship data accessions."
        accessionMapping = []
        i = 0
        for acc in snpsds[0].accessions:
            if acc in kinshipSnpsds[0].accessions:
                accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i))
                i += 1
        print zip(accessionMapping,snpsds[0].accessions)
        print "len(snpsds[0].snps)",len(snpsds[0].snps)
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.orderAccessions(accessionMapping)
        print "Kinship genotype data has been ordered."

        newKinshipSnpsds = []
        sys.stdout.write("Converting data format")
        for snpsd in kinshipSnpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal))  #This data might have NAs
        print ""
        kinshipSnpsds = newKinshipSnpsds
    else:
        # No dedicated kinship file: reuse the analysis genotype data.
        kinshipSnpsds = newSnpsds
        print "Found kinship data."

    #Ordering accessions according to the order of accessions in the genotype file
    # accessionMapping = []
    # i = 0
    # for acc in snpsds[0].accessions:
    # 	if acc in phed.accessions:
    # 		accessionMapping.append((phed.accessions.index(acc),i))
    # 		i += 1
    # phed.orderAccessions(accessionMapping)

    #Negating phenotypic values
    if negate:
        phed.negateValues(phenotypeIndex)

    #Adding a constant.
    # NOTE(review): nesting reconstructed from mangled source — addConstant==0
    # appears to mean "auto-pick" (sd/10); the min-value shift presumably makes
    # all values positive before log transform. Confirm against original file.
    if addConstant!=-1:
        if addConstant==0:
            addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10
        addConstant = addConstant - phed.getMinValue(phenotypeIndex)
        print "Adding a constant to phenotype:",addConstant
        phed.addConstant(phenotypeIndex,addConstant)

    #Log-transforming
    if logTransform:
        print "Log transforming phenotype"
        phed.logTransform(phenotypeIndex)
    #Converting phenotypes to Ranks
    elif phenotypeRanks:
        phed.transformToRanks(phenotypeIndex)

    if not chr:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5])
    else:
        snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr])
        kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr])

    phenotypeName = phed.getPhenotypeName(phenotypeIndex)
    sys.stdout.flush()

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in snpsDataset.snpsDataList:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        _robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter)
        sys.exit(0)

    if (not sr) or (sr and not srSkipFirstRun):
        # Primary (first-stage) EMMA scan + Manhattan plot.
        sys.stdout.write("Running Primary Emma.\n")
        sys.stdout.flush()
        pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt)
        res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex)
        res.filterMAF()
        res.negLogTransform()
        pngFile = pvalFile+".png"
        plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)
        srInput = pvalFile

    if sr:
        # Second-stage ("second run") scan around the top hits of the first run.
        _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
        res.filterMAF()
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
        srRes.filterMAF()
        srRes.negLogTransform()
        # NOTE(review): pvalFile is only bound when the first stage ran; with
        # --srSkipFirstRun this line would raise NameError — confirm intended.
        srPngFile = pvalFile+".sr.png"
        plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _run_():
    """
    Ad-hoc reporting script: cross-reference a hard-coded list of 192 GWAS
    ecotype IDs against seed-stock files and MPI/SALK sequence accessions,
    print overlap statistics, and write the ecotypes missing from both sources
    to /tmp/missing_gwas.csv.

    Relies on module-level helpers: _read_seeds_files_,
    _read_MPI_SALK_sequences_, and phenotypeData (imported as pd).
    """
    # Hard-coded ecotype IDs of the "192" GWAS accession set.
    l_192 = [ 6909, 6977, 100000, 6906, 8266, 6897, 6898, 5837, 6907, 7438, 6910, 6913, 6914, 6918, 6919, 8214, 6924, 8424, 6926, 6928, 6933, 7520, 7521, 6936, 7522, 6937, 6939, 6900, 6901, 6908, 6009, 6915, 6917, 6920, 6922, 6923, 6927, 6929, 6930, 6940, 6942, 6943, 7518, 6946, 8213, 6951, 6958, 6959, 7525, 6961, 6967, 6973, 6974, 6976, 7516, 6979, 6980, 6982, 6983, 6985, 6931, 6043, 6945, 7519, 7526, 7523, 6956, 6960, 7524, 6963, 6964, 6965, 6966, 6969, 6971, 6975, 7517, 6978, 6981, 6984, 6899, 6903, 6904, 6905, 6911, 6916, 8215, 6921, 6932, 6046, 6944, 7515, 7514, 6962, 6968, 6972, 6970, 8329, 7163, 8258, 8259, 8290, 7461, 7323, 8254, 8270, 8233, 8285, 6016, 8423, 8237, 6040, 6064, 6957, 8369, 8247, 8426, 9058, 8249, 9057, 6709, 7000, 7062, 7460, 7123, 7147, 7255, 7275, 8241, 6988, 8256, 8264, 8265, 8231, 8271, 8274, 8275, 8420, 8283, 8284, 6008, 8422, 8296, 8297, 8300, 8235, 8306, 8310, 8236, 8311, 8314, 8239, 8240, 8323, 8242, 8325, 8326, 8222, 8430, 6042, 8335, 8343, 6074, 8351, 8353, 8354, 7296, 8365, 8374, 8376, 8378, 8412, 8387, 8389, 6243, 7306, 7418, 8312, 8313, 8334, 8337, 8357, 8366, 8411, 8388, 8395, 7014, 7081, 8243, 8245, 7033, 7064, 7094, 7424, 7231, 7282, 7477, 7346, 8230 ]
    print len(l_192)

    # Ecotypes found in the local seed files, plus lookup dictionaries.
    acc_info = _read_seeds_files_()
    et_list = acc_info["ecotypes"]
    spi = pd._get_stock_parent_info_dict_()        # stock-parent name -> info
    ei_dict = pd._getEcotypeIdInfoDict_()          # ecotype id -> info tuple

    # Ecotypes present in the USC seed info file.
    usc_ecotypes = _read_seeds_files_( file_names=["USC_seeds_info_from Joy.csv"], file_dir="/Users/bjarnivilhjalmsson/Projects/Solexa_sequencing/" )["ecotypes"]
    # Seed-file ecotypes NOT found in the USC list.
    diff_set = set(et_list).difference( set(et_list).intersection(set(usc_ecotypes)))
    for e in diff_set:
        print ei_dict[e]

    # MPI/SALK sequenced accessions: map accession name (or stock parent) to
    # an ecotype id, recording None when neither lookup succeeds.
    ms_acc_info = _read_MPI_SALK_sequences_()
    ms_accessions = ms_acc_info["accessions"]
    ms_stock_parents = ms_acc_info["stock_parents"]
    print len(spi)
    a2e = pd._getAccessionToEcotypeIdDict_(ms_accessions, )
    len(a2e)  # NOTE(review): result unused — likely leftover debug statement.
    ecotypes = []
    for acc, sp in zip(ms_accessions, ms_stock_parents):
        if acc in a2e:
            ecotypes.append(int(a2e[acc]))
        elif sp in spi:
            # Fall back to the stock parent's ecotype id.
            ecotypes.append(int(spi[sp][0]))
        else:
            ecotypes.append(None)
            print acc, ",", sp, ": weren't found!!"
    print len(ecotypes) - ecotypes.count(None), len(ecotypes)

    # Translate seed-file ecotypes to "target" (tg) ecotype ids and intersect
    # with the MPI/SALK set.
    tg_e_dict = pd._getEcotype2TgEcotypeDict_()
    ms_e_set = set(ecotypes)
    print len(ms_e_set)
    elist = []
    for e in et_list:
        if e:
            elist.append(tg_e_dict[e])
        else:
            elist.append(None)
    e_set = set(elist)
    i_set = ms_e_set.intersection(e_set)
    print e_set
    print ms_e_set
    print i_set
    print ei_dict[list(i_set)[0]]

    # Overlap of the 192 GWAS ecotypes with each source and their union;
    # ds = the 192-set members covered by neither source.
    ecotypes_192 = set(l_192)
    i_set1 = ecotypes_192.intersection(ms_e_set)
    i_set2 = ecotypes_192.intersection(e_set)
    s = ecotypes_192.intersection(ms_e_set.union(e_set))
    ds = ecotypes_192.difference(s)
    print len(i_set1), len(i_set2), len(s), len(ds)
    for e in ds:
        print ei_dict[e]

    # Write the uncovered ecotypes to a CSV report.
    f = open("/tmp/missing_gwas.csv", "w")
    f.write("ecotype_id,accession_name,stock_parent,country_of_origin\n")
    for e in ds:
        f.write( str(e) + "," + ei_dict[e][0] + "," + ei_dict[e][1] + "," + ei_dict[e][4] + "\n")
    f.close()
def load_phentoype_file_bergelsson():
    """
    Convert the Bergelsson rosette glucosinolate CSV into PhenotypeData files.

    Reads the CSV twice: first pass collects phenotype names (columns 3+) and
    accession name/ID pairs (columns 1-2); second pass collects the phenotype
    values. Accession IDs are mapped to "target" ecotype IDs where possible,
    falling back to the raw ID with a printed warning. Writes both a .tsv and
    a .csv PhenotypeData file.

    Relies on module-level imports: csv, and phenotypeData imported as pd.
    """
    import env  # NOTE(review): appears unused here.
    filename = "/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/bergelsson_rosette_glucs.csv"
    f = open(filename, "r")
    reader = csv.reader(f)
    # Header row: phenotype names start at the third column.
    phenotype_names = reader.next()[2:]
    for i in range(len(phenotype_names)):
        # Normalize names: spaces -> underscores, prefix with "jb_".
        phenotype_names[i] = phenotype_names[i].replace(" ", "_")
        phenotype_names[i] = 'jb_' + phenotype_names[i]
    print phenotype_names

    # First pass: accession names (first word, lowercased) and IDs.
    accession_names = []
    accession_ID = []
    for row in reader:
        accession_names.append(row[0].split()[0].lower())
        accession_ID.append(row[1])
    f.close()
    print accession_names

    #acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)#+["n13","kno-10","kno-10","shahdara","nd-1"])
    e_info_dict = pd._getEcotypeIdInfoDict_()
    ei_2_tgei = pd._getEcotype2TgEcotypeDict_()
    #print len(acc_dict),acc_dict

    # Map each accession ID to its target ecotype ID; if the ID is unknown,
    # keep the raw ID and print a best-guess from the ecotype info dict.
    ecotypes = []
    uncertain_list = []  # NOTE(review): never populated — leftover.
    for acc, acc_id in zip(accession_names, accession_ID):
        #if not acc in acc_dict:
        if not int(acc_id) in ei_2_tgei:
            print "(%s, %s) is missing in dictionary" % (acc, acc_id)
            a_id = int(acc_id)
            if a_id in e_info_dict:
                e_info = e_info_dict[a_id]
                print "Guessing that it's:", e_info
            else:
                print "No good guess for it. Look it up!!\n"
            #acc_dict[acc] = acc_id
            ecotypes.append(acc_id)
        else:
            #ecotype = acc_dict[acc]
            ecotype = ei_2_tgei[int(acc_id)]
            ecotypes.append(ecotype)

    phenotype_indices = range(2, len(phenotype_names) + 2)
    phenotypes = []  #[acc_id][phenotype_name]

    # Second pass: parse phenotype values (empty/"NA" -> 'NA', else float).
    f = open(filename, "r")
    reader = csv.reader(f)
    reader.next()  # skip header
    print len(set(accession_ID)), len(set(ecotypes))
    for row in reader:
        #print row
        #if row[0].split()[0].lower() in acc_dict:
        phen_vals = []
        for pv in row[2:]:
            if pv == "" or pv == 'NA':
                pv = 'NA'
            else:
                pv = float(pv)
            phen_vals.append(pv)
        if len(phen_vals) != len(phenotype_names):
            # Row width mismatch: drop into the debugger for inspection.
            import pdb; pdb.set_trace()
        phenotypes.append(phen_vals)
        #else:
        #	print "Missing:",row[0]

    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.tsv", delimiter='\t')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Joy_Bergelsson/phen_bergelsson_051710.csv", delimiter=',')
def plot_local_haplotypes(filename, marker_data, focal_start, focal_end, error_tolerance=0, phenotypeData=None, phen_id=None):
    """
    Plots certain types of haplotype plots...

    Builds a haplotype-sharing plot around a focal region: accessions are
    grouped by their haplotype inside [focal_start, focal_end], then each
    haplotype group is extended marker-by-marker to the left and right (a
    group splits whenever its members diverge; singletons are dropped). The
    result is rendered as a pcolor heat map (one color per haplotype group)
    with accessions ordered by hierarchical clustering, phenotype values in
    the y-tick labels, and gene models drawn underneath. Saved as a PDF.

    filename        -- output PDF path.
    marker_data     -- SNP dataset with .snps, .positions, .accessions.
    focal_start/end -- chromosomal coordinates of the focal window.
    error_tolerance -- passed through to calc_local_dist for the clustering.
    phenotypeData   -- phenotype data object (module-shadowing name!) used for
                       y-axis labels; phen_id selects the phenotype.

    NOTE(review): genes are fetched with chr=5 hard-coded — this function
    appears FLC/chromosome-5 specific; confirm before reuse elsewhere.
    """
    # Pool of color ids handed out to haplotype groups (popped from the end,
    # i.e. 1, 2, 3, ...). Id 0 is reserved for singletons.
    haplotype_ids = range(1000, 0, -1)
    #Fill color matrix up..
    # Locate the marker index range [start_i, end_i) covering the focal window.
    start_i = 0
    cur_pos = 0
    while start_i < len(marker_data.positions) and cur_pos < focal_start:
        cur_pos = marker_data.positions[start_i]
        start_i += 1
    if start_i == len(marker_data.positions):
        raise Exception("Region is not covered by markers.")
    end_i = start_i
    while end_i < len(marker_data.positions) and cur_pos < focal_end:
        cur_pos = marker_data.positions[end_i]
        end_i += 1

    # One center haplotype (list of alleles across the focal markers) per accession.
    center_haplotypes = []
    for a_i in range(len(marker_data.accessions)):
        haplotype = []
        for snp in marker_data.snps[start_i:end_i]:
            haplotype.append(snp[a_i])
        center_haplotypes.append(haplotype)

    # Group accessions by identical center haplotype.
    haplotype_dict = {}
    hap_pos = (marker_data.positions[end_i - 1] + marker_data.positions[start_i]) / 2
    for a_i, c_h in enumerate(center_haplotypes):
        ch = tuple(c_h)
        if not ch in haplotype_dict:
            haplotype_dict[ch] = [ 0, 0, [a_i], hap_pos ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
        else:
            haplotype_dict[ch][2].append(a_i)

    # Assign ids: most frequent first; singletons (count==1) get id 0.
    freq_hapl_list = []
    for ch in haplotype_dict:
        hi = haplotype_dict[ch]
        haplotype_dict[ch][1] = len(hi[2])
        freq_hapl_list.append((len(hi[2]), ch))
    freq_hapl_list.sort(reverse=True)
    for (hc, haplotype) in freq_hapl_list:
        if hc == 1:
            haplotype_dict[haplotype][0] = 0
        else:
            haplotype_dict[haplotype][0] = haplotype_ids.pop()

    center_haplotype_dict = haplotype_dict
    left_haplotypes = []
    right_haplotypes = []
    left_haplotypes.append(center_haplotype_dict)
    right_haplotypes = []
    left_positions = [hap_pos]
    right_positions = []

    #Starting with the haplotype structure to the left!
    some_haplotype = True  # NOTE(review): unused flag — leftover.
    i = start_i - 1
    old_hap_dict = center_haplotype_dict
    while old_hap_dict and i >= 0:
        #print i
        #l1 = [len(old_hap_dict[h][2]) for h in old_hap_dict]
        #l2 = [old_hap_dict[h][0] for h in old_hap_dict]
        #print l1,l2, sum(l1)
        haplotype_dict = {}
        hap_pos = marker_data.positions[i]
        left_positions.append(hap_pos)
        for hap in old_hap_dict:
            (h_id, h_count, acc_indices, pos) = old_hap_dict[hap]  #info on the old haplotype
            #print h_id
            # Extend each member's haplotype by the next marker to the left;
            # members with different alleles split into sub-groups.
            temp_hap_dict = {}
            for a_i in acc_indices:
                new_hap = tuple([marker_data.snps[i][a_i]] + list(hap))
                if not new_hap in temp_hap_dict:
                    temp_hap_dict[new_hap] = [ 0, 0, [a_i], hap_pos ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
                else:
                    temp_hap_dict[new_hap][2].append(a_i)
            freq_hapl_list = []
            for h in temp_hap_dict:
                hi = temp_hap_dict[h]
                temp_hap_dict[h][1] = len(hi[2])
                freq_hapl_list.append((len(hi[2]), h))
            freq_hapl_list.sort()
            #print freq_hapl_list
            (hc, h) = freq_hapl_list.pop( )  #the most frequent haplotype gets colored like the last one.
            if hc == 1:
                del temp_hap_dict[h]  # singleton: drop from further extension
            else:
                temp_hap_dict[h][0] = h_id  # inherit the parent group's color
            freq_hapl_list.reverse()
            for (hc, h) in freq_hapl_list:
                if hc == 1:
                    del temp_hap_dict[h]
                else:
                    temp_hap_dict[h][0] = haplotype_ids.pop()  # new color for a split
            for h in temp_hap_dict:
                haplotype_dict[h] = temp_hap_dict[h]
        if haplotype_dict:
            left_haplotypes.append(haplotype_dict)
        old_hap_dict = haplotype_dict
        i -= 1

    #Now the same with the haplotype structure to the right!
    i = end_i
    old_hap_dict = center_haplotype_dict
    while old_hap_dict and i < len(marker_data.snps):
        #print i
        #l1 = [len(old_hap_dict[h][2]) for h in old_hap_dict]
        #l2 = [old_hap_dict[h][0] for h in old_hap_dict]
        #print l1,l2, sum(l1)
        haplotype_dict = {}
        hap_pos = marker_data.positions[i]
        right_positions.append(hap_pos)
        for hap in old_hap_dict:
            (h_id, h_count, acc_indices, pos) = old_hap_dict[hap]  #info on the old haplotype
            temp_hap_dict = {}
            for a_i in acc_indices:
                # Append the next marker's allele to the right end.
                nt = marker_data.snps[i][a_i]
                new_hap = list(hap)
                new_hap.append(nt)
                new_hap = tuple(new_hap)
                #print new_hap
                if not new_hap in temp_hap_dict:
                    temp_hap_dict[new_hap] = [ 0, 0, [a_i], hap_pos ]  #haplotype id number, haplotype frequency count, list of accessions indices, and position.
                else:
                    temp_hap_dict[new_hap][2].append(a_i)
            freq_hapl_list = []
            for h in temp_hap_dict:
                hi = temp_hap_dict[h]
                temp_hap_dict[h][1] = len(hi[2])
                freq_hapl_list.append((len(hi[2]), h))
            freq_hapl_list.sort()
            (hc, h) = freq_hapl_list.pop( )  #the most frequent haplotype gets colored like the last one.
            if hc == 1:
                del temp_hap_dict[h]
            else:
                temp_hap_dict[h][0] = h_id
            freq_hapl_list.reverse()
            for (hc, h) in freq_hapl_list:
                if hc == 1:
                    del temp_hap_dict[h]
                else:
                    temp_hap_dict[h][0] = haplotype_ids.pop()
            for h in temp_hap_dict:
                haplotype_dict[h] = temp_hap_dict[h]
        if haplotype_dict:
            right_haplotypes.append(haplotype_dict)
        old_hap_dict = haplotype_dict
        i += 1

    #Clustering...
    # Order accessions by hierarchical clustering (average linkage) on a local
    # distance matrix over the focal region.
    dm = calc_local_dist(marker_data, focal_start, focal_end, error_tolerance=error_tolerance)
    print dm
    import scipy as sp
    import scipy.cluster.hierarchy as hc
    Z = hc.average(dm)  #Performing clustering using the dist. matr.
    print Z
    import pylab
    dend_dict = hc.dendrogram(Z, labels=marker_data.accessions)
    new_acc_order = dend_dict['ivl']  # leaf order of the dendrogram
    print new_acc_order
    ai_map = [new_acc_order.index(acc) for acc in marker_data.accessions]

    import numpy as np
    #Updating the positions in the figure.
    # Build pcolor cell-edge grids: x edges at midpoints between markers,
    # y edges at accession index +/- 0.5.
    left_positions.reverse()
    positions = left_positions + right_positions
    x_s = np.zeros((len(positions) + 1, len(marker_data.accessions) + 1))
    start_pos = positions[0] - (0.5 * (positions[1] - positions[0]))
    print len(x_s), len(x_s[0, ])
    for j in range(0, len(x_s[0, ])):
        x_s[0, j] = start_pos
    for j in range(1, len(x_s) - 1):  # number of SNPs
        x = positions[j - 1] + 0.5 * (positions[j] - positions[j - 1])
        for k in range(0, len(x_s[j, ])):  # number of NTs
            x_s[j, k] = x
    for j in range(0, len(x_s[0, ])):
        x_s[-1, j] = positions[-1] + (0.5 * (positions[-1] - positions[-2]))
    y_s = np.zeros((len(positions) + 1, len(marker_data.accessions) + 1))
    for j in range(0, len(y_s)):  # number of SNPs
        for k in range(0, len(y_s[j, ])):  # number of NTs
            y_s[j, k] = k - 0.5

    #Updating the colors in the figure.
    # Cell color = haplotype group id scaled to (0, 1]; singletons (id 0) = 1.0.
    color_matrix = np.ones((len(positions), len(marker_data.accessions)))
    left_haplotypes.reverse()
    haplotypes = left_haplotypes + right_haplotypes
    max_color = float(haplotype_ids.pop())  # highest unused id = scale factor
    for i, hap_dict in enumerate(haplotypes):
        for h in hap_dict:
            (h_id, h_count, acc_indices, pos) = hap_dict[h]
            for a_i in acc_indices:
                m_ai = ai_map[a_i]  # row after clustering reorder
                if h_id == 0:
                    color_matrix[i, m_ai] = 1.0
                else:
                    color_matrix[i, m_ai] = h_id / max_color

    # Human-readable accession labels (latin-1 decoded names).
    import phenotypeData as pd
    e_dict = pd._getEcotypeIdInfoDict_()
    accessions = [ unicode(e_dict[int(e)][0], 'iso-8859-1') for e in new_acc_order ]

    #Plot figure..
    import pylab
    pylab.figure(figsize=(18, 8))
    pylab.axes([0.08, 0.06, 0.9, 0.88])
    pylab.pcolor(x_s, y_s, color_matrix, cmap=pylab.cm.hot)

    #Dealing with the phenotype data
    # NOTE: the phenotypeData PARAMETER shadows the phenotypeData module here;
    # it is mutated in place (filtered + reordered to match the plot order).
    phenotypeData.removeAccessionsNotInSNPsData(marker_data)
    et_mapping = []
    for i, et in enumerate(new_acc_order):
        et_mapping.append((marker_data.accessions.index(et), i))
    phenotypeData.orderAccessions(et_mapping)
    phen_vals = phenotypeData.getPhenVals(phen_id, noNAs=False)
    acc_strings1 = [ accessions[i] + ", " + str(phen_vals[i]) for i in range(len(accessions)) ]  # NOTE(review): duplicate of acc_strings, unused.
    acc_strings = [ accessions[i] + ", " + str(phen_vals[i]) for i in range(len(accessions)) ]
    pylab.yticks(range(0, len(marker_data.accessions)), acc_strings, size="small")
    x_range = (x_s[-1, 0] - x_s[0, 0])

    #Retreiving and drawing the genes
    import regionPlotter as rp
    import gwaResults as gr
    genes = gr.get_gene_list(start_pos=x_s[0, 0], end_pos=x_s[-1, 0], chr=5)
    rp.drawGenes(genes, y_shift=-3, rangeVal=40)
    pylab.axis((x_s[0, 0] - 0.05 * x_range, x_s[-1, 0] + 0.05 * x_range, -0.1 * len(marker_data.accessions) - 1, 1.02 * len(marker_data.accessions)))
    pylab.savefig(filename, format='pdf')
def _run_():
    """Command-line entry point for the Kruskal-Wallis GWA pipeline.

    Parses getopt options, then either:
      * submits one qsub job per phenotype to the cluster (--parallel /
        --parallelAll / --subsampleTest), or
      * runs the analysis locally for a single phenotype index: filter
        accessions, intersect genotype/phenotype accessions, write temp
        files, run the KW (or Fisher) test via a generated R script, and
        plot the genome-wide results.

    Positional arguments: snpsDataFile phenotypeDataFile phenotypeIndex
    (phenotypeIndex may be omitted when --parallelAll is used).
    """
    if len(sys.argv)==1:
        print __doc__
        sys.exit(2)
    # NOTE: several option names are misspelled ("Latidue", "Unkown") but the
    # spellings are part of the public CLI / output, so they must be preserved.
    long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=",
                       "phenotypeFileType=", "help", "parallel=", "parallelAll",
                       "addToDB", "callMethodID=", "comment=", "onlyOriginal192",
                       "onlyOriginal96", "subSample=", "subSampleLikePhenotype=",
                       "subsampleTest=", "complement", "onlyBelowLatidue=",
                       "onlyAboveLatidue=", "srInput=", "sr", "srOutput=", "srPar=",
                       "srSkipFirstRun", "permTest=", "savePermutations",
                       "permutationFilter=", "testRobustness"]
    try:
        opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list)
    except:
        # Any getopt failure: show the traceback and usage, then bail out.
        traceback.print_exc()
        print sys.exc_info()
        print __doc__
        sys.exit(2)

    # Option defaults.
    phenotypeFileType=1
    outputFile=None
    delim=","
    missingVal="NA"
    help=0
    withArrayIds=1
    parallel=None              # cluster run name; None => run locally
    parallelAll=False
    addToDB=False              # parsed but not used in this visible chunk
    callMethodID=None
    comment=""
    subSample=None             # number of accessions to randomly sample
    onlyOriginal96=False
    onlyOriginal192 = False
    subSampleLikePhenotype = None
    subsampleTest = False
    numSubSamples = None
    complement = False         # invert the onlyOriginal96/192 filters
    onlyBelowLatidue = None
    onlyAboveLatidue = None
    sr = False                 # second-run (two-stage) analysis flags
    srOutput = False
    srInput = False
    srSkipFirstRun = False
    srTopQuantile = 0.95
    srWindowSize = 30000
    permTest = None            # number of permutations, if permutation test
    savePermutations = False
    permutationFilter = 1.0
    testRobustness = False

    # NOTE(review): `opt in ("--xyz")` is a SUBSTRING test against a plain
    # string, not tuple membership — it works here because no option name is
    # a substring of another in a conflicting way, but it is fragile.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            help=1
            print __doc__
        elif opt in ("-a", "--withArrayId"):
            withArrayIds=int(arg)
        elif opt in ("-o", "--outputFile"):
            outputFile=arg
        elif opt in ("--phenotypeFileType"):
            phenotypeFileType=int(arg)
        elif opt in ("--parallel"):
            parallel=arg
        elif opt in ("--parallelAll"):
            parallelAll=True
        elif opt in ("--addToDB"):
            addToDB=True
        elif opt in ("--onlyOriginal96"):
            onlyOriginal96=True
        elif opt in ("--onlyOriginal192"):
            onlyOriginal192=True
        elif opt in ("--complement"):
            complement=True
        elif opt in ("--subSample"):
            subSample=int(arg)
        elif opt in ("--subsampleTest"):
            # arg is "subSampleSize,numSubSamples"
            subsampleTest = True
            l = arg.split(",")
            subSample=int(l[0])
            numSubSamples=int(l[1])
        elif opt in ("--onlyBelowLatidue"):
            onlyBelowLatidue=float(arg)
        elif opt in ("--onlyAboveLatidue"):
            onlyAboveLatidue=float(arg)
        elif opt in ("--subSampleLikePhenotype"):
            subSampleLikePhenotype=int(arg)
        elif opt in ("--callMethodID"):
            callMethodID=int(arg)
        elif opt in ("--comment"):
            comment=arg
        elif opt in ("-d", "--delim"):
            delim=arg
        elif opt in ("-m", "--missingval"):
            missingVal=arg
        elif opt in ("--sr"):
            sr = True
        elif opt in ("--testRobustness"):
            testRobustness = True
        elif opt in ("--permTest"):
            permTest = int(arg)
        elif opt in ("--savePermutations"):
            savePermutations = True
        elif opt in ("--permutationFilter"):
            permutationFilter = float(arg)
        elif opt in ("--srSkipFirstRun"):
            srSkipFirstRun = True
        elif opt in ("--srInput"):
            srInput = arg
        elif opt in ("--srOutput"):
            srOutput = arg
        elif opt in ("--srPar"):
            # arg is "topQuantile,windowSize"
            vals = arg.split(",")
            srTopQuantile = float(vals[0])
            srWindowSize = int(vals[1])
        else:
            if help==0:
                print "Unkown option!!\n"
                print __doc__
            sys.exit(2)

    # Local runs need 3 positional args; parallel dispatch needs only 2.
    if len(args)<3 and not parallel:
        if help==0:
            print "Arguments are missing!!\n"
            print __doc__
        sys.exit(2)

    snpsDataFile=args[0]
    phenotypeDataFile=args[1]

    # Echo the effective configuration for log files.
    print "Kruskal-Wallis is being set up with the following parameters:"
    print "phenotypeDataFile:",phenotypeDataFile
    print "snpsDataFile:",snpsDataFile
    print "parallel:",parallel
    print "parallelAll:",parallelAll
    print "onlyOriginal96:",onlyOriginal96
    print "onlyOriginal192:",onlyOriginal192
    print "onlyBelowLatidue:",onlyBelowLatidue
    print "onlyAboveLatidue:",onlyAboveLatidue
    print "subSampleLikePhenotype:",subSampleLikePhenotype
    print "subsampleTest:",subsampleTest
    print "numSubSamples:",numSubSamples
    print "subSample:",subSample
    print "sr:",sr
    print "srSkipFirstRun:",srSkipFirstRun
    print "srInput:",srInput
    print "srOutput:",srOutput
    print "srTopQuantile:",srTopQuantile
    print "srWindowSize:",srWindowSize
    print "permTest:",permTest
    print "savePermutations:",savePermutations
    print "permutationFilter:",permutationFilter
    print "testRobustness:",testRobustness

    def runParallel(phenotypeIndex,id=""):
        """Write a csh/PBS job script for one phenotype and submit it via qsub.

        The job re-invokes this same KW.py with the parsed options forwarded
        on the command line; `id` distinguishes repeated sub-sample runs.
        `resultDir` and `scriptDir` are presumably module-level cluster paths
        defined elsewhere in the file — TODO confirm.
        """
        #Cluster specific parameters
        phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
        phenName=phed.getPhenotypeName(phenotypeIndex)
        # Sanitize the phenotype name for use in file names.
        phenName=phenName.replace("/", "_div_")
        phenName=phenName.replace("*", "_star_")
        # This assignment shadows the outer `outputFile` (local to the closure).
        outputFile=resultDir+"KW_"+parallel+"_"+phenName+id
        shstr="""#!/bin/csh
#PBS -l walltime=100:00:00
#PBS -l mem=4g
#PBS -q cmb
"""
        shstr+="#PBS -N K"+phenName+"_"+parallel+"\n"
        shstr+="set phenotypeName="+parallel+"\n"
        shstr+="set phenotype="+str(phenotypeIndex)+"\n"
        shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" "
        shstr+=" -a "+str(withArrayIds)+" "
        # Forward the accession-filtering options (mutually exclusive group).
        if subSample:
            shstr+=" --subSample="+str(subSample)+" "
        elif onlyOriginal96:
            shstr+=" --onlyOriginal96 "
        elif onlyOriginal192:
            shstr+=" --onlyOriginal192 "
        if onlyBelowLatidue:
            shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" "
        elif onlyAboveLatidue:
            shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" "
        if complement:
            shstr+=" --complement "
        if permTest:
            shstr+=" --permTest="+str(permTest)+" "
            if savePermutations:
                shstr+=" --savePermutations "
        shstr+=" --permutationFilter="+str(permutationFilter)+" "
        if testRobustness:
            shstr+=" --testRobustness "
        if sr:
            shstr += " --sr "
            # NOTE(review): if srOutput was supplied on the command line,
            # `output` is never assigned before being used below — looks like
            # it should fall back to srOutput; verify against original intent.
            if not srOutput:
                output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals"
            shstr += " --srOutput="+str(output)+" "
            if srSkipFirstRun:
                if not srInput:
                    output = resultDir+"KW_"+parallel+"_"+phenName+".pvals"
                shstr += " --srInput="+str(output)+" "
                shstr += " --srSkipFirstRun "
            shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" "
        shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
        shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n"
        f=open(parallel+".sh", 'w')
        f.write(shstr)
        f.close()
        #Execute qsub script
        os.system("qsub "+parallel+".sh ")

    if parallel:  #Running on the cluster..
        if parallelAll:
            # One job per phenotype in the file.
            phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        elif subsampleTest:
            # numSubSamples repeated jobs with random sub-samples of one phenotype.
            phenotypeIndex=int(args[2])
            for i in range(0,numSubSamples):
                runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i))
        else:
            phenotypeIndex=int(args[2])
            runParallel(phenotypeIndex)
        return  # dispatch only; the submitted jobs do the actual work
    else:
        phenotypeIndex=int(args[2])

    print "phenotypeIndex:",phenotypeIndex
    print "output:",outputFile
    print "\nStarting program now!\n"

    #Load phenotype file
    phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t')  #Get Phenotype data

    #If onlyOriginal96, then remove all other phenotypes..
    if onlyOriginal96:
        print "Filtering for the first 96 accessions"
        original_96_ecotypes = phenotypeData._getFirst96Ecotypes_()
        original_96_ecotypes = map(str,original_96_ecotypes)
        keepEcotypes = []
        if complement:
            # Keep everything EXCEPT the original 96.
            for acc in phed.accessions:
                if not acc in original_96_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_96_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyOriginal192:
        print "Filtering for the first 192 accessions"
        original_192_ecotypes = phenotypeData._getFirst192Ecotypes_()
        original_192_ecotypes = map(str,original_192_ecotypes)
        keepEcotypes = []
        if complement:
            # Keep everything EXCEPT the original 192.
            for acc in phed.accessions:
                if not acc in original_192_ecotypes:
                    keepEcotypes.append(acc)
        else:
            keepEcotypes = original_192_ecotypes
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if onlyBelowLatidue:
        print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            # Keep accessions below the latitude cutoff; accessions with an
            # unknown (None) latitude are kept as well.
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2]==None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)
    elif onlyAboveLatidue:
        print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue
        eiDict = phenotypeData._getEcotypeIdInfoDict_()
        print eiDict
        keepEcotypes = []
        for acc in phed.accessions:
            acc = int(acc)
            # Mirror of the "below" branch with the comparison reversed.
            if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue:
                keepEcotypes.append(str(acc))
            elif eiDict.has_key(acc) and eiDict[acc][2]==None:
                keepEcotypes.append(str(acc))
        phed.filterAccessions(keepEcotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if subSampleLikePhenotype:
        # Restrict to the accessions that have values for another phenotype.
        p_name = phed.getPhenotypeName(subSampleLikePhenotype)
        print "Picking sample as in",p_name
        ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype)
        print ecotypes
        phed.filterAccessions(ecotypes)
        print "len(phed.accessions)", len(phed.accessions)

    if subSample:
        # Random sub-sample of the accessions with non-NA phenotype values.
        sample_ecotypes = []
        ecotypes = phed.getNonNAEcotypes(phenotypeIndex)
        sample_ecotypes = random.sample(ecotypes,subSample)
        phed.filterAccessions(sample_ecotypes)
        print "len(phed.accessions)", len(phed.accessions)

    sys.stdout.write("Finished prefiltering phenotype accessions.\n")
    sys.stdout.flush()

    #Load genotype file
    snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds)

    #Checking overlap between phenotype and genotype accessions.
    phenotype=phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep=[]
    phenAccIndicesToKeep=[]
    numAcc=len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
    sys.stdout.flush()
    # O(n*m) intersection of genotype and phenotype accession lists; keeps
    # only accessions present in both with a non-NA phenotype value.
    for i in range(0, len(snpsds[0].accessions)):
        acc1=snpsds[0].accessions[i]
        for j in range(0, len(phed.accessions)):
            acc2=phed.accessions[j]
            if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print ""
    print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all."

    print "Filtering phenotype data."
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping=[]
    i=0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i+=1
    phed.orderAccessions(accessionMapping)

    #Filtering monomorphic
    print "Filtering monomorphic SNPs"
    for snpsd in snpsds:
        print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps"

    #Converting format to 01
    newSnpsds=[]
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print ""

    #Double check genotype file:
    # Sanity check: every SNP should still be polymorphic (some 0 and some
    # non-0 calls) after filtering.
    problems = 0
    for i in range(0,len(newSnpsds)):
        snpsd = newSnpsds[i]
        for j in range(0,len(snpsd.snps)):
            snp = snpsd.snps[j]
            sc = snp.count(0)
            if sc==0 or sc==len(snp):
                # NOTE(review): `snpsd.positions[i]` indexes positions with the
                # chromosome index i, not the SNP index j — likely a reporting
                # bug (position printed is wrong); verify before relying on it.
                print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i]
                problems += 1
    if problems >0:
        print "Genotype file appears to have potential problems"
    else:
        print "Genotype file appears to be good"

    if permTest:
        print "Starting a permutation test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            # Binary phenotype: switch to Fisher's exact test.
            # NOTE(review): this also overrides the requested permutation
            # count to 100 — confirm that is intentional.
            test_type = "Fisher"
            permTest = 100
        _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter)
        sys.exit(0)

    if testRobustness:
        print "Starting a robustness test"
        allSNPs = []
        for snpsd in newSnpsds:
            allSNPs += snpsd.snps
        phenVals = phed.getPhenVals(phenotypeIndex)
        test_type = "KW"
        if phed.isBinary(phenotypeIndex):
            test_type = "Fisher"
        _robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter)
        sys.exit(0)

    sys.stdout.flush()

    print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun
    # First-stage analysis runs unless --sr together with --srSkipFirstRun.
    if (not sr) or (sr and not srSkipFirstRun):
        #Writing files
        if env.user=="bjarni":
            # `env` is presumably a project config module — TODO confirm.
            tempfile.tempdir='/tmp'
        (fId, phenotypeTempFile)=tempfile.mkstemp()
        os.close(fId)
        (fId, genotypeTempFile)=tempfile.mkstemp()
        os.close(fId)

        phed.writeToFile(phenotypeTempFile, [phenotype])
        sys.stdout.write("Phenotype file written\n")
        sys.stdout.flush()
        # Hard-coded 5 chromosomes (Arabidopsis thaliana).
        snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5])
        decoder={1:1, 0:0,-1:'NA'}
        snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder)
        sys.stdout.write("Genotype file written\n")
        sys.stdout.flush()

        phenotypeName=phed.getPhenotypeName(phenotypeIndex)

        rDataFile=outputFile+".rData"
        pvalFile=outputFile+".pvals"
        #Is the phenotype binary?
        binary=phed.isBinary(phenotypeIndex)
        rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary)
        rFileName=outputFile+".r"
        f=open(rFileName, 'w')
        f.write(rstr)
        f.close()
        outRfile=rFileName+".out"
        errRfile=rFileName+".err"
        print "Running R file:"
        # csh-style redirection: stdout to .out, stderr to .err.
        cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile
        sys.stdout.write(cmdStr+"\n")
        sys.stdout.flush()
        gc.collect()
        os.system(cmdStr)
        #print "Emma output saved in R format in", rDataFile
        print "Generating a GW plot."
        res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        pngFile = pvalFile+".png"
        plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False)
        srInput = pvalFile
    else:
        print "Skipping first stage analysis."
        sys.stdout.flush()

    if sr:
        # NOTE(review): when --srSkipFirstRun is used, `binary`,
        # `phenotypeName` and `pvalFile` were never assigned (they are set
        # only in the first-stage branch above) — this path would raise
        # NameError; confirm against the full file.
        _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary)
        print "Generating second run GW plot."
        res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex)
        res.negLogTransform()
        srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex)
        srRes.negLogTransform()
        srPngFile = pvalFile+".sr.png"
        plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)