def create_diploid_dataset(call_method_id=76, file_name='/tmp/test.csv', coding_type='normal'): #Load parent list parents = [] with open(env.env['data_dir'] + 'heterozygous_genotypes.csv') as f: f.next() for l in f: parents.append(map(str.strip, l.split(','))) snpsd = dp.load_snps_call_method(call_method_id) l = zip(snpsd.accessions, range(len(snpsd.accessions))) l.sort() l = map(list, zip(*l)) acc_list = l[0] orders = l[1] sds = [] for i, sd in enumerate(snpsd.snpsDataList): snps = sp.array(sd.snps, dtype='int8') snps_list = [] p_list = [] for ps in parents: f_id = bisect.bisect(acc_list, ps[0]) - 1 m_id = bisect.bisect(acc_list, ps[1]) - 1 if acc_list[f_id] == ps[0] and acc_list[m_id] == ps[1]: f_gt = snps[:, orders[f_id]].flatten() m_gt = snps[:, orders[m_id]].flatten() if coding_type == 'normal': o_gt = f_gt + m_gt elif coding_type == 'dominant': o_gt = sp.bitwise_xor(f_gt, m_gt) snps_list.append(o_gt) p_list.append('%s_%s' % (ps[0], ps[1])) snps_list = sp.transpose(sp.array(snps_list, dtype='int8')) snps = [] for s in snps_list: snps.append(s) sds.append( snpsdata.SNPsData(snps, sd.positions, accessions=p_list, chromosome=i + 1)) sd = snpsdata.SNPsDataSet(sds, [1, 2, 3, 4, 5]) sd.writeToFile(file_name)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "rFile=", "chr=", "delim=", "missingval=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing", "onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr", "srOutput=", "srPar=", "srSkipFirstRun", "testRobustness", "permutationFilter=", "useLinearRegress", "regressionCofactors=", "FriLerAsCofactor", "FriColAsCofactor", "memReq=", "walltimeReq=", ] try: opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeRanks = False removeOutliers = None addConstant = -1 phenotypeFileType = 1 rFile = None delim = "," missingVal = "NA" help = 0 minMAF = 0.0 boundaries = [-1, -1] chr = None parallel = None logTransform = False negate = False parallelAll = False lrt = False kinshipDatafile = None onlyMissing = False onlyOriginal96 = False onlyOriginal192 = False onlyBelowLatidue = None complement = False sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 testRobustness = False permutationFilter = 0.002 useLinearRegress = False regressionCofactors = None FriLerAsCofactor = False FriColAsCofactor = False memReq = "5g" walltimeReq = "150:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", "--rFile"): rFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--addConstant"): addConstant = float(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--onlyMissing"): onlyMissing = True elif opt in ("--onlyOriginal96"): onlyOriginal96 = True elif opt in ("--onlyOriginal192"): onlyOriginal192 = True elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue = float(arg) elif opt in ("--complement"): complement = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--negate"): negate = True elif opt in ("--removeOutliers"): removeOutliers = float(arg) elif opt in ("--LRT"): lrt = True elif opt in ("-c", "--chr"): chr = int(arg) elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("--kinshipDatafile"): kinshipDatafile = arg elif opt in ("--phenotypeRanks"): phenotypeRanks = True elif opt in ("--sr"): sr = True elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--FriLerAsCofactor"): FriLerAsCofactor = True elif opt in ("--FriColAsCofactor"): FriColAsCofactor = True elif opt in ("--useLinearRegress"): useLinearRegress = True elif opt in ("--regressionCofactors"): regressionCofactors = arg elif opt in ("--memReq"): memReq = arg elif opt in ("--walltimeReq"): walltimeReq = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args) < 3 and not parallel: if help == 0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) print "Emma is being set up with the following parameters:" print "output:", rFile print "phenotypeRanks:", phenotypeRanks print "phenotypeFileType:", phenotypeFileType print "parallel:", parallel print "parallelAll:", parallelAll print "minMAF:", minMAF print "LRT:", lrt print "delim:", delim print "missingval:", missingVal print "kinshipDatafile:", kinshipDatafile print "chr:", chr print "boundaries:", boundaries print "onlyMissing:", onlyMissing print "onlyOriginal96:", onlyOriginal96 print "onlyOriginal192:", onlyOriginal192 print "onlyBelowLatidue:", onlyBelowLatidue print "complement:", complement print "negate:", negate print "logTransform:", logTransform print "addConstant:", addConstant print "removeOutliers:", removeOutliers print "sr:", sr print "srSkipFirstRun:", srSkipFirstRun print "srInput:", srInput print "srOutput:", srOutput print "srTopQuantile:", srTopQuantile print "srWindowSize:", srWindowSize print "testRobustness:", testRobustness print "permutationFilter:", permutationFilter print "useLinearRegress:", useLinearRegress print "regressionCofactors:", regressionCofactors print "FriLerAsCofactor:", FriLerAsCofactor print "FriColAsCofactor:", FriColAsCofactor print "walltimeReq:", walltimeReq print "memReq:", memReq def runParallel(phenotypeIndex, phed): #Cluster specific parameters print phenotypeIndex phenName = phed.getPhenotypeName(phenotypeIndex) outFileName = resultDir + "Emma_" + parallel + "_" + phenName shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime=" + walltimeReq + "\n" shstr += "#PBS -l mem=" + memReq + "\n" shstr += "#PBS -q cmb\n" shstr += "#PBS -N E" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" if useLinearRegress: outFileName = resultDir + "LR_" + parallel + "_" + phenName shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " " if useLinearRegress: shstr += " --useLinearRegress " if regressionCofactors: shstr += " --regressionCofactors=" + str(regressionCofactors) + " " if FriLerAsCofactor: shstr += " --FriLerAsCofactor " if FriColAsCofactor: shstr += " --FriColAsCofactor " if onlyOriginal96: shstr += " --onlyOriginal96 " elif onlyOriginal192: shstr += " --onlyOriginal192 " if onlyBelowLatidue: shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " " if logTransform: shstr += " --logTransform " if negate: shstr += " --negate " if removeOutliers: shstr += " --removeOutliers=" + str(removeOutliers) + " " if phenotypeRanks: shstr += " --phenotypeRanks " if testRobustness: shstr += " --testRobustness " shstr += " --permutationFilter=" + str(permutationFilter) + " " if sr: shstr += " --sr " if not srOutput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" shstr += " --srOutput=" + str(output) + " " if srSkipFirstRun: if not srInput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" shstr += " --srInput=" + str(output) + " " shstr += " --srSkipFirstRun " shstr += " --srPar=" + str(srTopQuantile) + "," + str( srWindowSize) + " " if kinshipDatafile: shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " " shstr += " --addConstant=" + str(addConstant) + " " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data if parallelAll: for phenotypeIndex in phed.phenIds: if onlyMissing: phenName = phed.getPhenotypeName(phenotypeIndex) pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" res = None try: res = os.stat(pvalFile) except Exception: print "File", pvalFile, "does not exist." if res and res.st_size > 0: print "File", pvalFile, "already exists, and is non-empty." if sr: srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" srRes = None try: srRes = os.stat(srInput) except Exception: print "File", srInput, "does not exist." if srRes and srRes.st_size > 0: print "File", srInput, "already exists, and is non-empty." else: runParallel(phenotypeIndex, phed) else: print "Setting up the run." runParallel(phenotypeIndex, phed) else: runParallel(phenotypeIndex, phed) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex, phed) return else: phenotypeIndex = int(args[2]) print "phenotypeIndex:", phenotypeIndex print "\nStarting program now!\n" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data numAcc = len(snpsds[0].accessions) #Removing outliers if removeOutliers: print "Remoing outliers" phed.naOutliers(phenotypeIndex, removeOutliers) #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str, original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str, original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][ 2] and eiDict[acc][2] < onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2] == None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] #Checking which accessions to keep and which to remove . for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break print "\nFiltering accessions in genotype data:" #Filter accessions which do not have the phenotype value (from the genotype data). for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "\nNow filtering accessions in phenotype data:" phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len( phed.accessions) == len(snpsds[0].accessions) if len(phed.accessions) != len(snpsds[0].accessions): raise Exception #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Remove minor allele frequencies if minMAF != 0: sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Removing SNPs which are outside of boundaries. if chr: print "\nRemoving SNPs which are outside of boundaries." snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1]) snpsds = [snpsds[chr - 1]] #Ordering accessions in genotype data to fit phenotype data. print "Ordering genotype data accessions." accessionMapping = [] i = 0 for acc in phed.accessions: if acc in snpsds[0].accessions: accessionMapping.append((snpsds[0].accessions.index(acc), i)) i += 1 #print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "\nGenotype data has been ordered." #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) print "" print "Checking kinshipfile:", kinshipDatafile if kinshipDatafile: #Is there a special kinship file? kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal) accIndicesToKeep = [] #Checking which accessions to keep and which to remove (genotype data). sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(kinshipSnpsds[0].accessions)): acc1 = kinshipSnpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) break print accIndicesToKeep for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from kinship genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "Ordering kinship data accessions." accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in kinshipSnpsds[0].accessions: accessionMapping.append( (kinshipSnpsds[0].accessions.index(acc), i)) i += 1 print zip(accessionMapping, snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "Kinship genotype data has been ordered." newKinshipSnpsds = [] sys.stdout.write("Converting data format") for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() newKinshipSnpsds.append(snpsd.getSnpsData( missingVal=missingVal)) #This data might have NAs print "" kinshipSnpsds = newKinshipSnpsds else: kinshipSnpsds = newSnpsds print "Found kinship data." #Ordering accessions according to the order of accessions in the genotype file # accessionMapping = [] # i = 0 # for acc in snpsds[0].accessions: # if acc in phed.accessions: # accessionMapping.append((phed.accessions.index(acc),i)) # i += 1 # phed.orderAccessions(accessionMapping) #Negating phenotypic values if negate: phed.negateValues(phenotypeIndex) if logTransform and not phed.isBinary( phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0: addConstant = 0 #Adding a constant. if addConstant != -1: if addConstant == 0: addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10 addConstant = addConstant - phed.getMinValue(phenotypeIndex) print "Adding a constant to phenotype:", addConstant phed.addConstant(phenotypeIndex, addConstant) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotypeIndex) #Converting phenotypes to Ranks elif phenotypeRanks: phed.transformToRanks(phenotypeIndex) if not chr: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [1, 2, 3, 4, 5]) else: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr]) phenotypeName = phed.getPhenotypeName(phenotypeIndex) sys.stdout.flush() if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in snpsDataset.snpsDataList: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter) sys.exit(0) if useLinearRegress: phenVals = phed.getPhenVals(phenotypeIndex) d0 = {} d0["phen"] = phenVals dh = {} dh["phen"] = phenVals import rpy, gc if regressionCofactors: #Adds ler and col as cofactors import pickle f = open(regressionCofactors, "r") co_factors = pickle.load(f) f.close() #inserting co factors into model for factor in co_factors: d[factor] = co_factors[factor] import analyzeHaplotype as ah (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True) if FriColAsCofactor: d0["col"] = col_factor dh["col"] = col_factor if FriLerAsCofactor: d0["ler"] = ler_factor dh["ler"] = ler_factor chr_pos_pvals = [] stats = [] sys.stdout.write("Applying the linear model") sys.stdout.flush() for i in range(0, len(newSnpsds)): #[3]:# snpsd = newSnpsds[i] sys.stdout.write("|") sys.stdout.flush() gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. for j in range(0, len(snpsd.snps)): if j % 5000 == 0: sys.stdout.write(".") sys.stdout.flush() #if snpsd.positions[j]>1700000: # break snp = snpsd.snps[j] d0["snp"] = snp try: rpy.set_default_mode(rpy.NO_CONVERSION) aov0 = rpy.r.aov(r("phen ~ ."), data=d0) aovh = rpy.r.aov(r("phen ~ ."), data=dh) rpy.set_default_mode(rpy.BASIC_CONVERSION) s0 = rpy.r.summary(aov0) sh = rpy.r.summary(aovh) #print s0,sh rss_0 = s0['Sum Sq'][-1] if type(sh['Sum Sq']) != float: rss_h = sh['Sum Sq'][-1] else: rss_h = sh['Sum Sq'] f = (rss_h - rss_0) / (rss_0 / (len(phenVals) - len(d0) + 1)) pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False) except Exception, err_str: print "Calculating p-value failed" #,err_str pval = 1.0 #print "dh:",dh #print "d0:",d0 #print "rss_h,rss_0:",rss_h,rss_0 #print "f,p:",f,pval chr_pos_pvals.append([i + 1, snpsd.positions[j], pval]) mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0])) maf = mafc / float(len(snp)) stats.append([maf, mafc]) sys.stdout.write("\n") #Write out to a result file sys.stdout.write("Writing results to file\n") sys.stdout.flush() pvalFile = rFile + ".pvals" f = open(pvalFile, "w") f.write("Chromosome,position,p-value,marf,maf\n") for i in range(0, len(chr_pos_pvals)): chr_pos_pval = chr_pos_pvals[i] stat = stats[i] f.write( str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," + str(chr_pos_pval[2]) + "," + str(stat[0]) + "," + str(stat[1]) + "\n") f.close() #Plot results print "Generating a GW plot." phenotypeName = phed.getPhenotypeName(phenotypeIndex) res = gwaResults.Result(pvalFile, name="LM_" + phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() pngFile = pvalFile + ".png" plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
def _run_(): if len(sys.argv)==1: print __doc__ sys.exit(2) long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , "subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "permTest=", "savePermutations", "permutationFilter=", "testRobustness", "memReq=","walltimeReq=",] try: opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType=1 outputFile=None delim="," missingVal="NA" help=0 parallel=None parallelAll=False addToDB=False callMethodID=None comment="" subSample=None onlyOriginal96=False onlyOriginal192 = False subSampleLikePhenotype = None subsampleTest = False numSubSamples = None complement = False onlyBelowLatidue = None onlyAboveLatidue = None sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 permTest = None savePermutations = False permutationFilter = 1.0 testRobustness = False memReq = "5g" walltimeReq = "100:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help=1 print __doc__ elif opt in ("-o", "--outputFile"): outputFile=arg elif opt in ("--phenotypeFileType"): phenotypeFileType=int(arg) elif opt in ("--parallel"): parallel=arg elif opt in ("--parallelAll"): parallelAll=True elif opt in ("--addToDB"): addToDB=True elif opt in ("--onlyOriginal96"): onlyOriginal96=True elif opt in ("--onlyOriginal192"): onlyOriginal192=True elif opt in ("--complement"): complement=True elif opt in ("--subSample"): subSample=int(arg) elif opt in ("--subsampleTest"): subsampleTest = True l = arg.split(",") subSample=int(l[0]) numSubSamples=int(l[1]) elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue=float(arg) elif opt in ("--onlyAboveLatidue"): onlyAboveLatidue=float(arg) elif opt in ("--subSampleLikePhenotype"): subSampleLikePhenotype=int(arg) elif opt in ("--callMethodID"): callMethodID=int(arg) elif opt in ("--comment"): comment=arg elif opt in ("-d", "--delim"): delim=arg elif opt in ("-m", "--missingval"): missingVal=arg elif opt in ("--sr"): sr = True elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permTest"): permTest = int(arg) elif opt in ("--savePermutations"): savePermutations = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--memReq"): memReq=arg elif opt in ("--walltimeReq"): walltimeReq=arg else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile=args[0] phenotypeDataFile=args[1] print "Kruskal-Wallis is being set up with the following parameters:" print "phenotypeDataFile:",phenotypeDataFile print "snpsDataFile:",snpsDataFile print "parallel:",parallel print "parallelAll:",parallelAll print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "onlyAboveLatidue:",onlyAboveLatidue print "complement:",complement print "subSampleLikePhenotype:",subSampleLikePhenotype print "subsampleTest:",subsampleTest print "numSubSamples:",numSubSamples print "subSample:",subSample print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "permTest:",permTest print "savePermutations:",savePermutations print "permutationFilter:",permutationFilter print "testRobustness:",testRobustness print "walltimeReq:",walltimeReq print "memReq:",memReq def runParallel(phenotypeIndex,id=""): #Cluster specific parameters phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) print phenName outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime="+walltimeReq+"\n" shstr += "#PBS -l mem="+memReq+"\n" shstr +="#PBS -q cmb\n" shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") if parallel: #Running on the cluster.. if parallelAll: phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) elif subsampleTest: phenotypeIndex=int(args[2]) for i in range(0,numSubSamples): runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i)) else: phenotypeIndex=int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex=int(args[2]) print "phenotypeIndex:",phenotypeIndex print "output:",outputFile print "\nStarting program now!\n" #Load phenotype file phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) elif onlyAboveLatidue: print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if subSampleLikePhenotype: p_name = phed.getPhenotypeName(subSampleLikePhenotype) print "Picking sample as in",p_name ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype) print ecotypes phed.filterAccessions(ecotypes) print "len(phed.accessions)", len(phed.accessions) if subSample: sample_ecotypes = [] ecotypes = phed.getNonNAEcotypes(phenotypeIndex) sample_ecotypes = random.sample(ecotypes,subSample) phed.filterAccessions(sample_ecotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() #Load genotype file snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal) #Checking overlap between phenotype and genotype accessions. phenotype=phed.getPhenIndex(phenotypeIndex) accIndicesToKeep=[] phenAccIndicesToKeep=[] numAcc=len(snpsds[0].accessions) sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1=snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2=phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all." print "Filtering phenotype data." phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping=[] i=0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i+=1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds=[] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Double check genotype file: problems = 0 for i in range(0,len(newSnpsds)): snpsd = newSnpsds[i] for j in range(0,len(snpsd.snps)): snp = snpsd.snps[j] sc = snp.count(0) if sc==0 or sc==len(snp): print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i] problems += 1 if problems >0: print "Genotype file appears to have potential problems" else: print "Genotype file appears to be good" if permTest: print "Starting a permutation test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" permTest = 100 _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter) sys.exit(0) if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" _robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter) sys.exit(0) sys.stdout.flush() print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun if (not sr) or (sr and not srSkipFirstRun): #Writing files #phed and phenotype sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) phenotypeName=phed.getPhenotypeName(phenotypeIndex) if phed.isBinary(phenotypeIndex): pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex)) else: snps = sd.getSnps() phen_vals = phed.getPhenVals(phenotypeIndex) try: kw_res = util.kruskal_wallis(snps,phen_vals) pvals = kw_res['ps'] except: print snps print phen_vals print len(snps),len(snps[0]),len(phen_vals) raise Exception res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False) pvalFile=outputFile+".pvals" res.writeToFile(pvalFile) print "Generating a GW plot." res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile else: print "Skipping first stage analysis." sys.stdout.flush() if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary) print "Generating second run GW plot." res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict): import copy phed = copy.deepcopy(phed) phenotype_name = phed.get_name(p_i) phen_is_binary = phed.is_binary(p_i) if trans_method == 'most_normal': trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False) file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.get_name(p_i), mapping_method, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'], p_dict['call_method_id']) result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method) emmax_perm_threshold = None k = None res = None #Check whether result already exists. if p_dict['use_existing_results']: if p_dict['region_plots']: sd = _get_genotype_data_(p_dict) num_outliers = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates']) if p_dict['remove_outliers']: assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA." snps = sd.getSnps() else: snps = None print "\nChecking for existing results." result_file = file_prefix + ".pvals" if os.path.isfile(result_file): res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps) pvals = True else: result_file = file_prefix + ".scores" if os.path.isfile(result_file): res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps) pvals = False if res: print "Found existing results.. (%s)" % (result_file) sys.stdout.flush() #Loading candidate genes cand_genes = None if p_dict['cand_genes_file']: cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file']) else: cand_genes = None tair_ids = None if not res: #If results weren't found in a file... then do GWA. #Loading data sd = _get_genotype_data_(p_dict) num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates']) #Do we need to calculate the K-matrix? if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']: #Load genotype file (in binary format) sys.stdout.write("Retrieving the Kinship matrix K.\n") sys.stdout.flush() if p_dict['kinship_file']: #Kinship file was supplied.. print 'Loading supplied kinship file: %s' % p_dict['kinship_file'] k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions) else: print 'Loading kinship file.' if p_dict['data_file'] != None: if p_dict['kinship_type'] == 'ibs': k = sd.get_ibs_kinship_matrix() elif p_dict['kinship_type'] == 'ibd': k = sd.get_ibd_kinship_matrix() else: k = kinship.get_kinship(call_method_id=p_dict['call_method_id'], data_format=p_dict['data_format'], method=p_dict['kinship_type'], n_removed_snps=n_filtered_snps, remain_accessions=sd.accessions) sys.stdout.flush() sys.stdout.write("Done!\n") if p_dict['remove_outliers']: if num_outliers == 0: print "No outliers were removed!" phen_vals = phed.get_values(p_i) if p_dict['local_gwas']: #Filter SNPs, etc.. sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])], [p_dict['local_gwas'][0]], data_format=sd.data_format) snps = sd.getSnps() sys.stdout.write("Finished loading and handling data!\n") print "Plotting a histogram" p_her = None hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'], p_dict['call_method_id']) hist_png_file = hist_file_prefix + "_hist.png" if k is not None: p_her = phed.get_pseudo_heritability(p_i, k)['pseudo_heritability'] p_her_pval = phed.get_pseudo_heritability(p_i, k)['pval'] phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval) else: phed.plot_histogram(p_i, png_file=hist_png_file) print "Applying %s to data." % (mapping_method) sys.stdout.flush() kwargs = {} additional_columns = [] if "kw" == mapping_method: if phen_is_binary: warnings.warn("Warning, applying KW to a binary phenotype") kw_res = util.kruskal_wallis(snps, phen_vals) pvals = kw_res['ps'] kwargs['statistics'] = kw_res['ds'] additional_columns.append('statistics') elif "ft" == mapping_method: raise NotImplementedError # pvals, or_est = run_fet(snps, phen_vals) # kwargs['odds_ratio_est'] = or_est # additional_columns.append('odds_ratio_est') else: #Parametric tests below: if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']: r = lm.mm_lrt_test(phen_vals, k) if r['pval'] > 0.05: print "Performing EMMA, even though a mixed model does not fit the data significantly better" print 'p-value: %0.3f' % r['pval'] else: print 'The mixed model fits the data significantly better than the simple linear model.' print 'p-value: %f' % r['pval'] if mapping_method in ['loc_glob_mm']: res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix, global_k=k, window_size=p_dict['loc_glob_ws'], jump_size=p_dict['loc_glob_ws'] / 2, kinship_method=p_dict['kinship_type']) res_file_name = file_prefix + '.csv' _write_res_dict_to_file_(res_file_name, res_dict) return elif mapping_method in ['emma']: res = lm.emma(snps, phen_vals, k) elif mapping_method in ['emmax']: if p_dict['emmax_perm']: perm_sd = _get_genotype_data_(p_dict) num_outliers = prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates']) perm_sd.filter_mac_snps(p_dict['mac_threshold']) t_snps = perm_sd.getSnps() t_phen_vals = phed.get_values(p_i) res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm']) emmax_perm_threshold = res['threshold_05'][0] import pylab hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6) threshold = -sp.log10(emmax_perm_threshold) b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0)) pylab.vlines(threshold, 0, max(hist_res[0]), color='g') pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r') pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']), format='png') if p_dict['with_replicates']: #Get values, with ecotypes, construct Z and do GWAM phen_vals = phed.get_values(p_i) Z = phed.get_incidence_matrix(p_i) res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'], emma_num=p_dict['emmax_emma_num']) else: res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'], emma_num=p_dict['emmax_emma_num']) elif mapping_method in ['emmax_step']: sd.filter_mac_snps(p_dict['mac_threshold']) local = False if p_dict['local_gwas']: local = True file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas'])) res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'], file_prefix=file_prefix, local=local, cand_gene_list=cand_genes, save_pvals=p_dict['save_stepw_pvals'], emma_num=p_dict['emmax_emma_num']) print 'Step-wise EMMAX finished!' return elif mapping_method in ['lm_step']: sd.filter_mac_snps(p_dict['mac_threshold']) local = False if p_dict['local_gwas']: local = True file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas'])) res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'], file_prefix=file_prefix, local=local, cand_gene_list=cand_genes, save_pvals=p_dict['save_stepw_pvals']) print 'Step-wise LM finished!' return elif mapping_method in ['lm']: res = lm.linear_model(snps, phen_vals) elif mapping_method in ['emmax_anova']: res = lm.emmax_anova(snps, phen_vals, k) elif mapping_method in ['lm_anova']: res = lm.anova(snps, phen_vals) else: print "Mapping method", mapping_method, 'was not found.' return if mapping_method in ['lm', 'emma', 'emmax']: kwargs['genotype_var_perc'] = res['var_perc'] additional_columns.append('genotype_var_perc') if p_dict['with_betas'] or mapping_method in ['emma' ]: betas = map(list, zip(*res['betas'])) kwargs['beta0'] = betas[0] additional_columns.append('beta0') if len(betas) > 1: kwargs['beta1'] = betas[1] additional_columns.append('beta1') pvals = res['ps'] sys.stdout.write("Done!\n") sys.stdout.flush() if mapping_method in ['lm_anova', 'emmax_anova']: kwargs['genotype_var_perc'] = res['var_perc'] pvals = res['ps'] sys.stdout.write("Done!\n") sys.stdout.flush() # print 'Calculating SNP-phenotype correlations.' # kwargs['correlations'] = calc_correlations(snps, phen_vals) # additional_columns.append('correlations') print 'Writing result to file.' res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs) if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']: result_file = file_prefix + ".pvals" else: result_file = file_prefix + ".scores" res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter']) #add results to DB.. if p_dict['add_to_db']: print 'Adding results to DB.' if p_dict['with_db_ids']: db_pid = p_i else: db_pid = phed.get_db_pid(p_i) import results_2_db as rdb short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name, mapping_method, trans_method, p_dict['remove_outliers'], str(p_dict['with_replicates'])) tm_id = transformation_method_dict[trans_method] try: rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid, analysis_methods_dict[mapping_method], tm_id, remove_outliers=p_dict['remove_outliers']) except Exception, err_str: print 'Failed inserting results into DB!' print err_str