def getKinshipMatrix(): #snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv" snpsDataFile="/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv" import dataParsers,snpsdata snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")#,debug=True) snps = [] sys.stdout.write("Converting format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snps += snpsd.getSnpsData(missingVal="NA").snps print "" #snps = _sampleSNPs_(snps,100) print "Calculating kinship" K = calcKinship(snps) eDict = phenotypeData._getEcotypeIdToStockParentDict_() accessions = map(int,snpsd.accessions) #for et in accessions: #print eDict[et] for i in range(0,len(accessions)): et = accessions[i] info = eDict[et] st = str(et)+", "+str(info[0])+", "+str(info[1])+":" st += str(K[i][0]) for j in range(1,i+1): st += ", "+str(K[i][j]) print st
def getKinshipMatrix(): #snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv" snpsDataFile = "/home/cmb-01/bvilhjal/Projects/data/250K_f13_012609.csv" import dataParsers, snpsdata snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") #,debug=True) snps = [] sys.stdout.write("Converting format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snps += snpsd.getSnpsData(missingVal="NA").snps print "" #snps = _sampleSNPs_(snps,100) print "Calculating kinship" K = calcKinship(snps) eDict = phenotypeData._getEcotypeIdToStockParentDict_() accessions = map(int, snpsd.accessions) #for et in accessions: #print eDict[et] for i in range(0, len(accessions)): et = accessions[i] info = eDict[et] st = str(et) + ", " + str(info[0]) + ", " + str(info[1]) + ":" st += str(K[i][0]) for j in range(1, i + 1): st += ", " + str(K[i][j]) print st
def _runTest_():
    """
    Run Margarita on a window of chromosome 4 and parse one marginal tree.

    Loads the phenotype and SNP files of data freeze 080608, calls
    Margarita.gwaWithTrees for phenotype 1 restricted to positions
    200000-350000, then calls parseTreeFile on the resulting
    "<marg_file>.marg.trees" file.  All scratch files live under
    env.homedir + "tmp/".
    """
    import dataParsers
    import phenotypeData

    # Phenotype data
    phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_transformed_publishable_v2.tsv"
    phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t')

    # SNPs data
    snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv"
    snpsds = dataParsers.parseCSVData(snpsDataFile)

    # Scratch files
    psFile = env.homedir + "tmp/tree.ps"
    marg_file = env.homedir + "tmp/test"
    out_file = env.homedir + "tmp/test_out"
    rFile = env.homedir + "tmp/tree_test.r"

    # Run Margarita
    chromosome = 4
    marg = Margarita(marg_file, out_file)
    snpsd = snpsds[chromosome - 1].getSnpsData()
    # gwaWithTrees signature, as noted by the original author:
    # (self, id, snpsd, phed, phenotype=0, boundaries = None, numMarkers = 100,
    #  numPerm = 500000, cutoff = 16, numArg = 50)
    marg.gwaWithTrees(
        marg_file,
        snpsd,
        phed,
        phenotype = 1,
        numMarkers = 200,
        chromosome = chromosome,
        boundaries = [200000, 350000],
        numPerm = 1,
        cutoff = 16,
        numArg = 100,
    )

    # Which marginal tree to extract
    runNum, argNum, markerNum = 1, 1, 1
    treeFile = marg_file + ".marg.trees"
    marg.parseTreeFile(treeFile, rFile, psFile, runNum, argNum, markerNum)
def _plotKinshipDiffs_(): filterProb = 0.2 p_i = 1 res_dir = "/Users/bjarni/tmp/" runId = "full_" snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") # ,debug=True) phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t") snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds) for snpsd in snpsds: snpsd.filterMinMAF(0.1) snpsd.filterMonoMorphicSnps() totalSNPs = [] for i in range(len(snpsds)): snpsds[i] = snpsds[i].getSnpsData() totalSNPs += snpsds[i].snps # For memory, remove random SNPs snps = [] for snp in totalSNPs: if random.random() < filterProb: snps.append(snp) totalSNPs = snps print "Calculating the global kinship..." globalKinship = calcKinship(totalSNPs) print "done." normalizedGlobalKinship = globalKinship / mean(globalKinship) gc.collect() # Calling garbage collector, in an attempt to clean up memory.. for i in range(4, 5): # len(snpsds)): chr = i + 1 snpsd = snpsds[i] # pylab.subplot(5,1,chr) # pylab.figure(figsize=(18,4)) # (kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000) # pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$') # (kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000) # pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$') # pylab.legend(numpoints=2,handlelen=0.005) # pylab.title("Kinship diff. chr. "+str(chr)) # pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf") # pylab.clf() pylab.figure(figsize=(18, 4)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000) pylab.plot(binPos, emmaDiffs, "r", label="ws$=300000$") pylab.title("Emma avg. p-value diff. 500kb on chr. 
" + str(chr)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000) pylab.plot(binPos, emmaDiffs, "b", label="ws$=500000$") pylab.title("Emma avg. p-value diff. on chr. " + str(chr)) pylab.legend(numpoints=2, handlelen=0.005) pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf") pylab.clf() gc.collect() # Calling garbage collector, in an attempt to clean up memory..
def _runTest_(): filename = "/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv" import dataParsers,snpsdata snpsds = dataParsers.parseCSVData(filename, format=1, deliminator=",")#,debug=True) snpsd = snpsdata.SNPsDataSet(snpsds,[1,2,3,4,5]) eDict = _getEcotypeIdToStockParentDict_() accessions = map(int,snpsd.accessions) accessions.sort() print "ecotype_id, native_name, stock_parent" i = 0 for et in accessions: et = int(et) print str(et)+", "+str(eDict[et][0])+", "+str(eDict[et][1])
def run(self):
    """
    2008-5-18
        Parse self.input_fname, convert the parsed list into one SNP data
        object (use_nt2number=1), transpose it and write the result to
        self.output_fname.  Drops into pdb first when self.debug is set.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Intermediate objects are not bound to names so that they can be
    # released as soon as the next stage has consumed them.
    snpData = RawSnpsData_ls2SNPData(
        dataParsers.parseCSVData(self.input_fname, withArrayIds=self.withArrayIds),
        use_nt2number=1,
    )
    transposed = transposeSNPData(snpData)
    del snpData
    transposed.tofile(self.output_fname, transform_to_numpy=0)
def _impute_FLC_192_():
    """
    Compare and merge the FLC sequence SNPs with the 250K data.

    The 250K data is restricted to the accessions of the FLC phenotype file
    and filtered with filter_maf_snps(0.05).  The sequence-derived SNP data is
    reduced with onlyBinarySnps() and then compared with, and merged into,
    the data object at index 4 of the 250K data set.
    """
    phenotype_file = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv"
    phed = pd.readPhenotypeFile(phenotype_file)

    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    # NOTE(review): the result of parseCSVData is used as a single data object
    # here, while other callers index it like a list -- confirm.
    seq_file = data_dir + "/flc_seqs_aln_imputed_snps_012710.csv"
    seq_snpsd = dataParsers.parseCSVData(seq_file)
    seq_snpsd.onlyBinarySnps()

    # Index 4 is presumably chromosome 5 -- confirm against snpsDataList order.
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd)
def _countVals_(): resdir = "/Network/Data/250k/tmp-bvilhjal/phenotype_analyzis/" phenotypeFile = "/Network/Data/250k/dataFreeze_011209/phenotypes_all_raw_012509.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter = '\t') phenotypeIndices = phenotypeData.categories_2_phenotypes[1]+phenotypeData.categories_2_phenotypes[2]+phenotypeData.categories_2_phenotypes[3]+phenotypeData.categories_2_phenotypes[4] print "total # of phenotypes:", phed.countPhenotypes() print "# of phenotypes analyzed:", len(phenotypeIndices) totalCounts = [] for p_i in phenotypeIndices: valCount = phed.countValues(p_i) totalCounts.append(valCount) snpsDataFile="/Network/Data/250k/dataFreeze_011209/250K_f13_012509.csv" import dataParsers,snpsdata snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",")#,debug=True) snpsd = snpsdata.SNPsDataSet(snpsds,[1,2,3,4,5]) phed.removeAccessionsNotInSNPsData(snpsd) overlappingCounts = [] for p_i in phenotypeIndices: valCount = phed.countValues(p_i) overlappingCounts.append(valCount) #ecotypes_192 = phenotypeData._getFirst192Ecotypes_() ecotypes_192 = _get192Ecotypes_() ecotypes_192 = [str(e) for e in ecotypes_192] print "len(ecotypes_192):",len(ecotypes_192) print ecotypes_192 phed.filterAccessions(ecotypes_192) filename = resdir+"phen_value_count_new_data_012509_v2.txt" f = open(filename,"w") f.write("Phenotype, total_count, overlapping_count, 192_overlap_count\n") for i in range(0,len(phenotypeIndices)): p_i = phenotypeIndices[i] try: phenName = phed.getPhenotypeName(p_i) valCount = phed.countValues(p_i) f.write(str(phenName)+", "+str(totalCounts[i])+", "+str(overlappingCounts[i])+", "+str(valCount)+"\n") except Exception: print "\nPhenotype index", p_i, "failed." f.close()
def plotHaplotypes(chr, startPos, endPos): snpsd = dataParsers.parseCSVData( "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")[chr - 1] import scipy as sp import scipy.cluster.hierarchy as hc import Emma snpsd = snpsd.getSnpsData() newSnps = [] positions = [] for i in range(0, len(snpsd.positions)): pos = snpsd.positions[i] if pos > endPos: break elif pos >= startPos: newSnps.append(snpsd.snps[i]) positions.append(snpsd.positions[i]) print "calculating the kinship" K = Emma.calcKinship(newSnps) #print "K:",K Z = hc.average(K) #print "Z:",Z import pylab #hc.leaders(Z) dend_dict = hc.dendrogram(Z, labels=snpsd.accessions) new_acc_order = dend_dict['ivl'] print new_acc_order print snpsd.accessions pylab.savefig("/Users/bjarni/tmp/FRI_tree.pdf", format='pdf') #cluster to get ordering?? acc_mapping = [] for acc in snpsd.accessions: i = new_acc_order.index(acc) acc_mapping.append(i) snps = [] for snp in newSnps: newSNP = [0] * len(snp) for (nt, i) in zip(snp, acc_mapping): newSNP[i] = nt snps.append(newSNP) snps = sp.array(snps) pylab.matshow(snps.transpose()) pylab.savefig("/Users/bjarni/tmp/FRI_haplotype.pdf", format='pdf')
def plot_250k_Tree(chr=None, startPos=None, endPos=None): import scipy as sp import scipy.cluster.hierarchy as hc import Emma import pylab import phenotypeData e_dict = phenotypeData._getEcotypeIdToStockParentDict_() snpsds = dataParsers.parseCSVData( "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv") snps = [] for snpsd in snpsds: snps += snpsd.getSnpsData().snps snps = sampleSNPs(snps, 100000, False) labels = [] for acc in snpsds[0].accessions: try: s = unicode(e_dict[int(acc, )][0], 'iso-8859-1') except Exception, err_s: print err_s print e_dict[int(acc)][0] s = acc labels.append(s)
def _partition_by_allele(snp, accessions, groups, factors):
    """Append each accession to groups[0] (allele == 0) or groups[1] (any
    other allele) and append the matching 0/1 code to factors."""
    for j, allele in enumerate(snp):
        code = 0 if allele == 0 else 1
        groups[code].append(accessions[j])
        factors.append(code)


def getLerAndColAccessions(snpsds=None, asFactors=False):
    """
    Split the accessions by their allele at two fixed marker positions.

    snpsds     list of SNP data objects (each with .positions, .snps and
               .accessions); only the element at index 3 is used.  When
               empty or None, the 250K_192_043009.csv file is parsed with
               dataParsers.parseCSVData.
    asFactors  selects the return form, see below.

    The positions are scanned in order and scanning stops at the first
    position larger than col_pos (269962).  At ler_pos (268809) and at
    col_pos every accession is classified by whether its allele equals 0.

    Returns a (ler, col) tuple:
      * asFactors false: the accessions whose allele is not 0 at ler_pos,
        and likewise at col_pos;
      * asFactors true: one 0/1 list per marker, in accession order
        (0 for allele 0, 1 otherwise).
    Both entries are empty when the corresponding position does not occur.
    """
    if not snpsds:
        snpsds = dataParsers.parseCSVData(
            "/Network/Data/250k/dataFreeze_011209/250K_192_043009.csv")
    snpsd = snpsds[3]  #.getSnpsData()
    ler_pos = 268809
    col_pos = 269962
    col_accessions = [[], []]
    ler_accessions = [[], []]
    col_factor = []
    ler_factor = []
    for i, pos in enumerate(snpsd.positions):
        if pos > col_pos:
            # Both markers lie at or before col_pos, nothing left to find.
            break
        if pos == ler_pos:
            _partition_by_allele(snpsd.snps[i], snpsd.accessions,
                                 ler_accessions, ler_factor)
        elif pos == col_pos:
            _partition_by_allele(snpsd.snps[i], snpsd.accessions,
                                 col_accessions, col_factor)
    if asFactors:
        return (ler_factor, col_factor)
    return (ler_accessions[1], col_accessions[1])
def _test_(): import dataParsers snpsds1 = dataParsers.parseCSVData("149_v1.csv", deliminator=",") snpsds2 = dataParsers.parseCSVData("384.csv", deliminator=",") merge(snpsds1,snpsds2,unionType=1,priority=1) print snpsds1[0].positions
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["phenotypeIDs=","rawPhenotypes","onlyBinary", "onlyCategorical", "onlyQuantitative", "onlyReplicates", "delim=", "missingval=", "help","includeSD","orderByGenotypeFile=", "onlyPublishable"] try: opts, args = getopt.getopt(sys.argv[1:], "p:u:h:o:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) output_fname = None phenotypeIDs = None delim = "," missingVal = "NA" rawPhenotypes = False onlyBinary = False onlyCategorical = False onlyQuantitative = False onlyReplicates = False includeSD = False onlyPublishable = False genotypeFile = None help = 0 passwd = None user=None host="papaya.usc.edu" for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o",): output_fname = arg elif opt in ("-u",): user = arg elif opt in ("-p",): passwd = arg elif opt in ("-h",): host = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("-d","--delim"): delim = arg elif opt in ("--rawPhenotypes"): rawPhenotypes = True elif opt in ("--onlyBinary"): onlyBinary = True elif opt in ("--onlyCategorical"): onlyCategorical = True elif opt in ("--onlyQuantitative"): onlyQuantitative = True elif opt in ("--onlyReplicates"): onlyReplicates = True elif opt in ("--includeSD"): includeSD = True elif opt in ("--onlyPublishable"): onlyPublishable = True elif opt in ("--phenotypeIDs"): phenotypeIDs = [] for num in arg.split(","): phenotypeIDs.append(int(num)) elif opt in ("--orderByGenotypeFile"): genotypeFile = arg else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) if not user: sys.stdout.write("Username: "******"Remoing phenotypes." phenData.removePhenotypeIDs(phenotypeIDs) #Sort in correct order. 
if genotypeFile: snpsds = dataParsers.parseCSVData(genotypeFile, format=1, deliminator=delim, missingVal=missingVal) print "Removing accessions which are not in the genotype file." indicesToKeep = [] for i in range(0,len(phenData.accessions)): if phenData.accessions[i] in snpsds[0].accessions: indicesToKeep.append(i) phenData.removeAccessions(indicesToKeep) print "Ordering accessions to match the genotype file order" associationMapping = [] j = 0 for acc in snpsds[0].accessions: if acc in phenData.accessions: associationMapping.append((phenData.accessions.index(acc),j)) j += 1 phenData.orderAccessions(associationMapping) #Output phenotypes to file. phenData.writeToFile(output_fname, delimiter='\t')
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["rFile=","chr=", "delim=", "missingval=", "withArrayId=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing","onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "testRobustness", "permutationFilter="] try: opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeRanks = False removeOutliers = None addConstant = -1 phenotypeFileType = 1 rFile = None delim = "," missingVal = "NA" help = 0 minMAF=0.0 withArrayIds = 1 boundaries = [-1,-1] chr=None parallel = None logTransform = False negate = False parallelAll = False lrt = False kinshipDatafile = None onlyMissing = False onlyOriginal96 = False onlyOriginal192 = False onlyBelowLatidue = None complement = False sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 testRobustness = False permutationFilter = 0.002 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-o","--rFile"): rFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--addConstant"): addConstant = float(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--onlyMissing"): onlyMissing = True elif opt in ("--onlyOriginal96"): onlyOriginal96 = True elif opt in ("--onlyOriginal192"): onlyOriginal192 = True elif opt in 
("--onlyBelowLatidue"): onlyBelowLatidue = float(arg) elif opt in ("--complement"): complement = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--negate"): negate = True elif opt in ("--removeOutliers"): removeOutliers = float(arg) elif opt in ("--LRT"): lrt = True elif opt in ("-c","--chr"): chr = int(arg) elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("--kinshipDatafile"): kinshipDatafile = arg elif opt in ("--phenotypeRanks"): phenotypeRanks = True elif opt in ("--sr"): sr = True elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) print "Emma is being set up with the following parameters:" print "output:",rFile print "phenotypeRanks:",phenotypeRanks print "withArrayId:",withArrayIds print "phenotypeFileType:",phenotypeFileType print "parallel:",parallel print "parallelAll:",parallelAll print "minMAF:",minMAF print "LRT:",lrt print "delim:",delim print "missingval:",missingVal print "kinshipDatafile:",kinshipDatafile print "chr:",chr print "boundaries:",boundaries print "onlyMissing:",onlyMissing print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "complement:",complement print "negate:",negate print "logTransform:",logTransform print "addConstant:",addConstant print "removeOutliers:",removeOutliers print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print 
"srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "testRobustness:",testRobustness print "permutationFilter:",permutationFilter def runParallel(phenotypeIndex,phed): #Cluster specific parameters print phenotypeIndex phenName = phed.getPhenotypeName(phenotypeIndex) outFileName = resultDir+"Emma_"+parallel+"_"+phenName shstr = """#!/bin/csh #PBS -l walltime=100:00:00 #PBS -l mem=8g #PBS -q cmb """ shstr += "#PBS -N E"+phenName+"_"+parallel+"\n" shstr += "set phenotypeName="+parallel+"\n" shstr += "set phenotype="+str(phenotypeIndex)+"\n" shstr += "(python "+emmadir+"Emma.py -o "+outFileName+" " if onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " if logTransform: shstr += " --logTransform " if negate: shstr += " --negate " if removeOutliers: shstr += " --removeOutliers="+str(removeOutliers)+" " if phenotypeRanks: shstr += " --phenotypeRanks " if testRobustness: shstr+=" --testRobustness " shstr+=" --permutationFilter="+str(permutationFilter)+" " if sr: shstr += " --sr " if not srOutput: output = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"Emma_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr += " -a "+str(withArrayIds)+" " if kinshipDatafile: shstr += " --kinshipDatafile="+str(kinshipDatafile)+" " shstr += " --addConstant="+str(addConstant)+" " shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n" f = open(parallel+".sh",'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. 
phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data if parallelAll: for phenotypeIndex in phed.phenIds: if onlyMissing: phenName = phed.getPhenotypeName(phenotypeIndex) pvalFile = resultDir+"Emma_"+parallel+"_"+phenName+".pvals" res = None try: res = os.stat(pvalFile) except Exception: print "File",pvalFile,"does not exist." if res and res.st_size>0: print "File",pvalFile,"already exists, and is non-empty." if sr: srInput = resultDir+"Emma_"+parallel+"_"+phenName+".sr.pvals" srRes = None try: srRes = os.stat(srInput) except Exception: print "File",srInput,"does not exist." if srRes and srRes.st_size>0: print "File",srInput,"already exists, and is non-empty." else: runParallel(phenotypeIndex,phed) else: print "Setting up the run." runParallel(phenotypeIndex,phed) else: runParallel(phenotypeIndex,phed) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex,phed) return else: phenotypeIndex = int(args[2]) print "phenotypeIndex:",phenotypeIndex print "\nStarting program now!\n" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data numAcc = len(snpsds[0].accessions) #Removing outliers if removeOutliers: print "Remoing outliers" phed.naOutliers(phenotypeIndex,removeOutliers) #If onlyOriginal96, then remove all other phenotypes.. 
if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] #Checking which accessions to keep and which to remove . 
for i in range(0,len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break print "\nFiltering accessions in genotype data:" #Filter accessions which do not have the phenotype value (from the genotype data). for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed from genotype data, leaving",len(accIndicesToKeep),"accessions in all." print "\nNow filtering accessions in phenotype data:" phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is",len(phed.accessions)==len(snpsds[0].accessions) if len(phed.accessions)!=len(snpsds[0].accessions): raise Exception #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps" #Remove minor allele frequencies if minMAF!=0: sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Removing SNPs which are outside of boundaries. if chr: print "\nRemoving SNPs which are outside of boundaries." snpsds[chr-1].filterRegion(boundaries[0],boundaries[1]) snpsds = [snpsds[chr-1]] #Ordering accessions in genotype data to fit phenotype data. print "Ordering genotype data accessions." 
accessionMapping = [] i = 0 for acc in phed.accessions: if acc in snpsds[0].accessions: accessionMapping.append((snpsds[0].accessions.index(acc),i)) i += 1 #print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)",len(snpsds[0].snps) for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "\nGenotype data has been ordered." #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) print "" print "Checking kinshipfile:",kinshipDatafile if kinshipDatafile: #Is there a special kinship file? kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) accIndicesToKeep = [] #Checking which accessions to keep and which to remove (genotype data). sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0,len(kinshipSnpsds[0].accessions)): acc1 = kinshipSnpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) break print accIndicesToKeep for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed from kinship genotype data, leaving",len(accIndicesToKeep),"accessions in all." print "Ordering kinship data accessions." 
accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in kinshipSnpsds[0].accessions: accessionMapping.append((kinshipSnpsds[0].accessions.index(acc),i)) i += 1 print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)",len(snpsds[0].snps) for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "Kinship genotype data has been ordered." newKinshipSnpsds = [] sys.stdout.write("Converting data format") for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() newKinshipSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) #This data might have NAs print "" kinshipSnpsds = newKinshipSnpsds else: kinshipSnpsds = newSnpsds print "Found kinship data." #Ordering accessions according to the order of accessions in the genotype file # accessionMapping = [] # i = 0 # for acc in snpsds[0].accessions: # if acc in phed.accessions: # accessionMapping.append((phed.accessions.index(acc),i)) # i += 1 # phed.orderAccessions(accessionMapping) #Negating phenotypic values if negate: phed.negateValues(phenotypeIndex) #Adding a constant. 
if addConstant!=-1: if addConstant==0: addConstant = math.sqrt(phed.getVariance(phenotypeIndex))/10 addConstant = addConstant - phed.getMinValue(phenotypeIndex) print "Adding a constant to phenotype:",addConstant phed.addConstant(phenotypeIndex,addConstant) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotypeIndex) #Converting phenotypes to Ranks elif phenotypeRanks: phed.transformToRanks(phenotypeIndex) if not chr: snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[1,2,3,4,5]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[1,2,3,4,5]) else: snpsDataset = snpsdata.SNPsDataSet(newSnpsds,[chr]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds,[chr]) phenotypeName = phed.getPhenotypeName(phenotypeIndex) sys.stdout.flush() if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in snpsDataset.snpsDataList: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) _robustness_test_(allSNPs,phenVals,rFile,filter=permutationFilter) sys.exit(0) if (not sr) or (sr and not srSkipFirstRun): sys.stdout.write("Running Primary Emma.\n") sys.stdout.flush() pvalFile = _runEmmaScript_(snpsDataset, kinshipSnpsDataset, phed, phenotypeIndex, rFile, chr=chr, delim=delim, missingVal=missingVal, boundaries=boundaries, lrt=lrt) res = gwaResults.Result(pvalFile,name="EMMA_"+phenotypeName, phenotypeID=phenotypeIndex) res.filterMAF() res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,kinshipSnpsDataset) print "Generating second run GW plot." 
res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.filterMAF() res.negLogTransform() srRes = gwaResults.Result(srOutput,name="EMMA_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.filterMAF() srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _plotKW_(): """ Analyze how population structure affects KW. """ filterProb = 0.1 p_i = 1 res_dir = "/Users/bjarni/tmp/" runId = "_full_quick_" snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") # ,debug=True) phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t") snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds) totalSNPs = [] for i in range(len(snpsds)): snpsds[i] = snpsds[i].getSnpsData() totalSNPs += snpsds[i].snps # For memory, remove random SNPs snps = [] for snp in totalSNPs: if random.random() < filterProb: snps.append(snp) totalSNPs = snps # globalKinship = calcKinship(totalSNPs) gc.collect() # Calling garbage collector, in an attempt to clean up memory.. # chr = 1 # for snpsd in snpsds: snpsd = snpsds[3] k = calcKinship(snpsd.snps[200:1400]) res = runEmma(phed, p_i, k, snpsd.snps[200:1400]) # runEmma(phed,p_i,k,snps): pvals = res["ps"] log_pvals = [] for pval in pvals: # print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)") k = calcKinship(totalSNPs) res = runEmma(phed, p_i, k, snpsd.snps[200:1400]) # runEmma(phed,p_i,k,snps): pvals = res["ps"] log_pvals = [] for pval in pvals: # print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)") phenVals = phed.getPhenVals(p_i) pvals = _run_kw_(snpsd.snps[200:1400], phenVals) log_pvals = [] for pval in pvals: # print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)") (pvals, new_positions, acc_groups) = get_KW_pvals( snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW" ) ecot_map = phenotypeData._getEcotypeIdToStockParentDict_() for 
i in range(0, len(acc_groups)): acc_list = [] for a_i in acc_groups[i]: e_i = snpsd.accessions[a_i] # print e_i acc_list.append(ecot_map[int(e_i)][0]) print "group", i, ":", acc_list log_pvals = [] for pval in pvals: # print pval log_pvals.append(-math.log10(pval)) pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)") pylab.legend(numpoints=2, handlelen=0.005) pylab.show()
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=", "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=", "missingval=", "withArrayId=", "debug", "report", "help", "heterozygous2NA", "first192", "removeLer", "removeCol" ] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None delim = "," missingVal = "NA" comparisonFile = None maxMissing = 1.0 maxError = 1.0 removeEcotypes = None removeArray = None removeIdentical = False onlyCommon = False debug = None report = None help = 0 withArrayIds = 1 first96 = False first192 = False heterozygous2NA = False removeLer = False removeCol = False for opt, arg in opts: if opt in ('-o'): output_fname = arg elif opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg) elif opt in ("--comparisonFile"): comparisonFile = arg elif opt in ("--maxError"): maxError = float(arg) elif opt in ("--maxMissing"): maxMissing = float(arg) elif opt in ("--heterozygous2NA"): heterozygous2NA = True elif opt in ("--removeEcotypeId"): removeEcotypes = arg.split(",") removeEcotypes = map(int, removeEcotypes) elif opt in ("--removeArrayId"): removeArray = int(arg) elif opt in ("--removeIdentical"): removeIdentical = True elif opt in ("--onlyCommon"): onlyCommon = True elif opt in ("--first96"): first96 = True elif opt in ("--first192"): first192 = True elif opt in ("--removeLer"): removeLer = True elif opt in ("--removeCol"): removeCol = True elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("-b", "--debug"): debug = 1 elif opt in ("-r", "--report"): report = 1 else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help == 0: print 
"Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds == 1 or withArrayIds == 2 waid2 = withArrayIds == 2 or withArrayIds == 3 import dataParsers snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) accessionsToRemove = [] arraysToRemove = None if first96: import dataParsers d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1', user="******", passwd="bamboo123") ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1', user="******", passwd="bamboo123") print "Dictionaries loaded" names = [] first96Names = [] for i in range(0, len(snpsds[0].accessions)): ecotype = snpsds[0].accessions[i] arrayID = snpsds[0].arrayIds[i] names.append((arrayID, ecotd[ecotype], ecotype)) if int(d[ecotype][0]) > 97 or int(d[ecotype][0]) < 0: accessionsToRemove.append(ecotype) else: first96Names.append( (arrayID, d[ecotype][1], d[ecotype][0], ecotype)) first96Names.sort() print "First 96 accessions, len:", len(first96Names), ":" for name in first96Names: print name names.sort() print "All accessions:" for name in names: print name elif first192: import phenotypeData ecotypes_192 = map(str, phenotypeData._getFirst192Ecotypes_()) print ecotypes_192, snpsds[0].accessions for acc in snpsds[0].accessions: if acc not in ecotypes_192: accessionsToRemove.append(acc) print "found", len(ecotypes_192), '"192" ecotypes... removing', len( accessionsToRemove), "ecotypes." if removeLer: import analyzeHaplotype as ah accessionsToRemove += ah.getLerAndColAccessions(snpsds)[0] if removeCol: import analyzeHaplotype as ah accessionsToRemove += ah.getLerAndColAccessions(snpsds)[1] #Retrieve comparison list of accessions. 
(Error rates for accessions) if (removeIdentical or maxError < 1.0) and comparisonFile: sys.stderr.write("Loading comparison file:") snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) res = [] sys.stderr.write("Comparing accessions.") for i in range(0, len(snpsds)): res.append(snpsds[i].compareWith(snpsds2[i], withArrayIds=withArrayIds, verbose=False, heterozygous2NA=heterozygous2NA)) sys.stderr.write(".") sys.stderr.write("\n") totalAccessionCounts = [0] * len(res[0][2]) accErrorRate = [0] * len(res[0][2]) for i in range(0, len(snpsds)): r = res[i] for j in range(0, len(r[2])): totalAccessionCounts[j] += r[6][j] accErrorRate[j] += r[3][j] * float(r[6][j]) for i in range(0, len(accErrorRate)): accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i]) accErrAndID = [] if 0 < withArrayIds < 3: for i in range(0, len(r[2])): accErrAndID.append((accErrorRate[i], r[2][i], r[5][i])) else: for i in range(0, len(r[2])): accErrAndID.append((accErrorRate[i], r[2][i])) accErrAndID.sort() accErrAndID.reverse() #Figure out which accessions are too erroraneous if maxError < 1.0 and comparisonFile: if withArrayIds: arraysToRemove = [] for (error, ecotype, array) in accErrAndID: if error > maxError: accessionsToRemove.append(ecotype) arraysToRemove.append(array) else: for (error, ecotype) in accErrAndID: if error > maxError: accessionsToRemove.append(ecotype) if removeIdentical and comparisonFile and withArrayIds: print "Locating identical accessions" accErrAndID.sort() if not arraysToRemove: arraysToRemove = [] for accession in set(snpsds[0].accessions): if snpsds[0].accessions.count(accession) > 1: found = 0 for (error, ecotype, array) in accErrAndID: if ecotype == accession: if found > 0: accessionsToRemove.append(ecotype) arraysToRemove.append(array) found += 1 if onlyCommon and comparisonFile: print "Locating accessions which are not shared" snpsds2 = dataParsers.parseCSVData(comparisonFile, 
format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) #print snpsds2[0].accessions,'\n',snpsds[0].accessions,'\n',len(set(snpsds2[0].accessions).intersection(set(snpsds[0].accessions))) if not arraysToRemove: arraysToRemove = [] for i in range(0, len(snpsds[0].accessions)): acc = snpsds[0].accessions[i] if not acc in snpsds2[0].accessions: accessionsToRemove.append(acc) if 0 < withArrayIds < 3: arraysToRemove.append(snpsds[0].arrayIds[i]) if maxMissing < 1.0: missingCounts = [0] * len(snpsds[0].accessions) numSnps = 0 for snpsd in snpsds: mc = snpsd.accessionsMissingCounts() numSnps += len(snpsd.positions) for i in range(0, len(snpsds[0].accessions)): missingCounts[i] += mc[i] missingRates = [] if withArrayIds: arraysToRemove = [] for i in range(0, len(snpsds[0].accessions)): missingRates.append( (missingCounts[i] / float(numSnps), snpsds[0].accessions[i], snpsds[0].arrayIds[i])) missingRates.sort() missingRates.reverse() for (mrate, ecotype, array) in missingRates: if mrate > maxMissing: accessionsToRemove.append(ecotype) arraysToRemove.append(array) else: for i in range(0, len(snpsds[0].accessions)): missingRates.append((missingCounts[i] / float(numSnps), snpsds[0].accessions[i])) missingRates.sort() missingRates.reverse() for (mrate, ecotype) in missingRates: if mrate > maxMissing: accessionsToRemove.append(ecotype) if removeEcotypes: for removeEcotype in removeEcotypes: accessionsToRemove.append(str(int(removeEcotype))) print "Removing", len(accessionsToRemove), "accessions." if removeArray: if not arraysToRemove: arraysToRemove = [] arraysToRemove.append(str(removeArray)) print "Removing", len(arraysToRemove), " arrays." numAccessions = len(snpsds[0].accessions) sys.stderr.write("Removing accessions.") for snpsd in snpsds: snpsd.removeAccessions(accessionsToRemove, arrayIds=arraysToRemove) sys.stderr.write(".") print "\n", ( numAccessions - len(snpsds[0].accessions) ), "accessions out of " + str(numAccessions) + " were removed." 
import snpsdata snpsdata.writeRawSnpsDatasToFile(output_fname, snpsds, chromosomes=[1, 2, 3, 4, 5], deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
def _run_(): import os if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["id=", "chr=", "numARG=", "numMarkers=", "numPerm=", "smartCutoff=", "BoundaryStart=", "BoundaryEnd=", "binary", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", "debug", "parallel=", "parallelAll", "help", "scoreFile="] try: opts, args = getopt.getopt(sys.argv[1:], "i:s:c:d:m:a:bh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) import tempfile tempfile.tempdir = '/tmp' (fId, id) = tempfile.mkstemp() os.close(fId) scoreFile = None chr = None numARG = 30 numMarkers = 100 numPerm = 0 smartCutoff = 10 binary = False delim = "," missingVal = "NA" debug = None report = None help = 0 withArrayId = 0 boundaries = [ - 1, - 1] phenotypeFileType = 1 parallel = None parallelAll = False snpsDataFile = None for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-i", "--id"): id = '/tmp/' + arg elif opt in ("-s", "--scoreFile"): scoreFile = arg elif opt in ("-c", "--chr"): chr = int(arg) elif opt in ("--numARG"): numARG = int(arg) elif opt in ("--numMarkers"): numMarkers = int(arg) elif opt in ("--numPerm"): numPerm = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--smartCutoff"): smartCutoff = int(arg) elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--binary"): binary = True elif opt in ("--parallel"): parallel = arg elif opt in ("--parallelAll"): parallelAll = True elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("-a", "--withArrayId"): withArrayId = int(arg) elif opt in ("-b", "--debug"): debug = 1 if len(args) < 3 and not parallel: if help == 0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) if boundaries[0] == boundaries[1] and boundaries[0] == - 1: boundaries = None margFile = id + ".marg" 
outFile = margFile + ".out" def runParallel(phenotypeIndex): #Cluster specific parameters #margdir = '/home/cmb-01/bvilhjal/Projects/Python-snps/' resultDir = env.results_dir #'/home/cmb-01/bvilhjal/results/' import phenotypeData phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/", "_div_") phenName = phenName.replace("*", "_star_") outFileName = resultDir + "Marg_" + parallel + "_" + phenName scoreFile = outFileName + ".score" shstr = """#!/bin/csh #PBS -l walltime=120:00:00 #PBS -l mem=4g #PBS -q cmb """ shstr += "#PBS -N M" + phenName + "_" + parallel + "\n" #shstr += "(python " + margdir + "margarita.py " shstr += "(python " + env.script_dir + "margarita.py " if phed.isBinary(phenotypeIndex): shstr += " --binary " shstr += " -s " + scoreFile shstr += " -a " + str(withArrayId) + " " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " " shstr += "> " + outFileName + ".out) >& " + outFileName + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") #Nested function ends snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. if len(args) > 2: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex) return else: snpsDataFile = args[0] if not parallelAll: phenotypeIndex = int(args[1]) runParallel(phenotypeIndex) return import phenotypeData phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) return phenotypeIndex = int(args[2]) #Print out information about this run... print "Preparing a blended margarita...." 
print "Num ARG:", numARG print "Num Markers:", numMarkers print "Num Permutations:", numPerm print "Smart cutoff:", smartCutoff print "Binary:", binary print "ScoreFile:", scoreFile import dataParsers, snpsdata, phenotypeData #phenotypeFile = "/Users/bjarni/Projects/Python-snps/tinaPhenos_041808.csv" if phenotypeFileType == 1: phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data elif phenotypeFileType == 2: phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, accessionDecoder = dataParsers.accessionName2EcotypeId, type = 2) snpsds = dataParsers.parseCSVData(snpsDataFile, deliminator = delim, missingVal = missingVal, withArrayIds = bool(withArrayId)) #Get SNPs data marg = Margarita(margFile, outFile, numARG, numMarkers, numPerm, smartCutoff) if chr: snpsd = snpsds[chr - 1].getSnpsData() marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr, binary = binary) else: scoreStr = "" for chr in [0, 1, 2, 3, 4]: snpsd = snpsds[chr].getSnpsData() (newRStr, newScoreStr, permPvals) = marg.gwa(snpsd, phed, phenotype = phenotypeIndex, boundaries = boundaries, chromosome = chr + 1, binary = binary) scoreStr += newScoreStr f = open(scoreFile, 'w') f.write(scoreStr) f.close()
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "phenotypeIDs=", "rawPhenotypes", "onlyBinary", "onlyCategorical", "onlyQuantitative", "onlyReplicates", "delim=", "missingval=", "help", "includeSD", "orderByGenotypeFile=", "onlyPublishable" ] try: opts, args = getopt.getopt(sys.argv[1:], "p:u:h:o:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) output_fname = None phenotypeIDs = None delim = "," missingVal = "NA" rawPhenotypes = False onlyBinary = False onlyCategorical = False onlyQuantitative = False onlyReplicates = False includeSD = False onlyPublishable = False genotypeFile = None help = 0 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", ): output_fname = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("-d", "--delim"): delim = arg elif opt in ("--rawPhenotypes"): rawPhenotypes = True elif opt in ("--onlyBinary"): onlyBinary = True elif opt in ("--onlyCategorical"): onlyCategorical = True elif opt in ("--onlyQuantitative"): onlyQuantitative = True elif opt in ("--onlyReplicates"): onlyReplicates = True elif opt in ("--includeSD"): includeSD = True elif opt in ("--onlyPublishable"): onlyPublishable = True elif opt in ("--phenotypeIDs"): phenotypeIDs = [] for num in arg.split(","): phenotypeIDs.append(int(num)) elif opt in ("--orderByGenotypeFile"): genotypeFile = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help == 0: print "Output file missing!!\n" print __doc__ sys.exit(2) #Retrieve phenotype data. phenData = getPhenotypes(onlyBinary=onlyBinary, onlyQuantitative=onlyQuantitative, onlyCategorical=onlyCategorical, onlyReplicates=onlyReplicates, includeSD=includeSD, rawPhenotypes=rawPhenotypes, onlyPublishable=onlyPublishable) if phenotypeIDs: print "Remoing phenotypes." phenData.removePhenotypeIDs(phenotypeIDs) #Sort in correct order. 
if genotypeFile: snpsds = dataParsers.parseCSVData(genotypeFile, format=1, deliminator=delim, missingVal=missingVal) print "Removing accessions which are not in the genotype file." indicesToKeep = [] for i in range(0, len(phenData.accessions)): if phenData.accessions[i] in snpsds[0].accessions: indicesToKeep.append(i) phenData.removeAccessions(indicesToKeep) print "Ordering accessions to match the genotype file order" associationMapping = [] j = 0 for acc in snpsds[0].accessions: if acc in phenData.accessions: associationMapping.append((phenData.accessions.index(acc), j)) j += 1 phenData.orderAccessions(associationMapping) #Output phenotypes to file. phenData.writeToFile(output_fname, delimiter='\t', with_pid=True)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "monomorphic", "onlyBinary", "delim=", "missingval=", "withArrayId=", "callProbFile=", "minMAF=", "minCallProb=", "debug", "report", "help", "output01Format", "filterRegion="] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None delim = "," missingVal = "NA" comparisonFile = None maxMissing = 1.0 maxError = 1.0 monomorphic = False debug = None report = None help = 0 withArrayIds = 0 minCallProb=None minMAF=None callProbFile = None onlyBinary = False output01Format = False filterRegion = False startPos = None endPos = None chromosome = None chromosomes=[1,2,3,4,5] for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-o",): output_fname = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("--comparisonFile"): comparisonFile = arg elif opt in ("--maxError"): maxError = float(arg) elif opt in ("--maxMissing"): maxMissing = float(arg) elif opt in ("--minCallProb"): minCallProb = float(arg) elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--callProbFile"): callProbFile = arg elif opt in ("-d","--delim"): delim = arg elif opt in ("-b", "--debug"): debug = 1 elif opt in ("-r", "--report"): report = 1 elif opt in ("--monomorphic"): monomorphic = True elif opt in ("--onlyBinary"): onlyBinary = True elif opt in ("--output01Format"): output01Format = True elif opt in ("--filterRegion"): filterRegion = True region = arg.split(",") region = map(int,region) chromosome = region[0] startPos = region[1] endPos = region[2] else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) 
waid1 = withArrayIds==1 or withArrayIds==2 waid2 = withArrayIds==2 if callProbFile and minCallProb: #Read prob file into SNPsdatas. #snpsds = dataParsers.parseCSVDataWithCallProb(inputFile, callProbFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) pass else: snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) #Filtering monomorphic if monomorphic: print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps" if onlyBinary or output01Format: print "Filtering non-binary SNPs" for snpsd in snpsds: print "Removed", str(snpsd.onlyBinarySnps()),"Snps" #Filtering missing values if maxMissing<1.0 and maxMissing>=0.0: print "Filtering SNPs with missing values" numAccessions = len(snpsds[0].accessions) for snpsd in snpsds: print "Removed", str(snpsd.filterMissingSnps(int(maxMissing*numAccessions))),"Snps" #Filtering bad SNPs if comparisonFile and maxError<1.0: print "Filtering erroneous SNPs, with maxError=",maxError snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) for i in range(0,len(snpsds)): snpsds[i].filterBadSnps(snpsds2[i],maxError) if minMAF: print "Removing SNPs withe MAF <",minMAF for snpsd in snpsds: print "Removed", str(snpsd.filterMinMAF(minMAF)),"Snps" #Output specific region.. if filterRegion: chromosomes = [chromosome] snpsd = snpsds[chromosome-1] snpsd.filterRegion(startPos,endPos) snpsds = [snpsd] #Converting lousy calls to NAs if callProbFile and minCallProb: print "Converting base calls with call prob. lower than",minCallProb,"to NAs" #To avoid memory problems, the file/data is processed one line at a time. 
gInFile = open(inputFile,"r") pInFile = open(callProbFile,"r") outFile = open(output_fname,"w") if withArrayIds==2: gline = gInFile.readline() outFile.write(gline) pInFile.readline() gline = gInFile.readline() outFile.write(gline) pInFile.readline() i = 0 totalCount = 0.0 convertedCount = 0.0 while(1): i += 1 gline = gInFile.readline() pline = pInFile.readline() #print gline if gline and pline: snp = gline.strip().split(delim) probs = pline.strip().split(delim) probs = map(float,probs) newSNP = [] totalCount += len(snp) for (nt,prob) in zip(snp,probs): if prob>minCallProb: newSNP.append(nt) convertedCount += 1.0 else: newSNP.append('NA') outFile.write(delim.join(newSNP)+"\n") else: print i,gline,pline break if i%10000==0: print i print i gInFile.close() pInFile.close() outFile.close() print "Fraction converted =",convertedCount/totalCount else: if output01Format: snpsds01format = [] for snpsd in snpsds: snpsds01format.append(snpsd.getSnpsData(missingVal=missingVal)) #FINISH snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds01format,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1) else: snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["priority=", "delim=", "missingval=", "union=", "intersection=", "debug", "report", "help", "withArrayId="] try: opts, args = getopt.getopt(sys.argv[1:], "o:p:d:m:u:i:a:brh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) if len(args)!=2: raise Exception("Number of arguments isn't correct.") inputFile1 = args[0] inputFile2 = args[1] priority = 1 union = 0 intersection = 0 output_fname = None delim = "," missingVal = "NA" debug = None report = None withArrayIds = 0 chromosomes = [1,2,3,4,5] for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-p", "--priority"): priority = int(arg) elif opt in ("-u", "--union"): union = int(arg) elif opt in ("-i", "--intersection"): intersection = int(arg) elif opt in ("-o",): output_fname = arg elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-b", "--debug"): debug = 1 elif opt in ("-r", "--report"): report = 1 else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds==1 or withArrayIds==2 waid2 = withArrayIds==2 import dataParsers (snpsds1,chromosomes1) = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1,returnChromosomes=True) (snpsds2,chromosomes2) = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2,returnChromosomes=True) withArrayIds = waid1 if len(snpsds1) != len(snpsds2): print("Warning: Unequal number of chromosomes.") #raise Exception("Unequal number of chromosomes.") import snpsdata if union==0 and intersection==0: for i in range(0,len(chromosomes1)): chr1 = chromosomes1[i] for j in 
range(0,len(chromosomes2)): chr2 = chromosomes2[j] if chr1==chr2: snpsds1[i].mergeData(snpsds2[j],priority=priority) chromosomes = chromosomes1 snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal, withArrayIds = waid1) elif 0<union<4 and intersection==0: for i in range(0,len(chromosomes1)): chr1 = chromosomes1[i] for j in range(0,len(chromosomes2)): chr2 = chromosomes2[j] if chr1==chr2: snpsds1[i].mergeDataUnion(snpsds2[j], priority=priority, unionType=union) if union==1 or union==3: chromosomes = set(chromosomes1).union(set(chromosomes2)) chromosomes = list(chromosomes) chromosomes.sort() elif union==2: chromosomes = chromosomes1 snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal) elif 0<intersection<4 and union==0: for i in range(0,len(snpsds1)): snpsds1[i].mergeDataIntersection(snpsds2[i], priority=priority, intersectionType=intersection) if intersection==1 or intersection==3: chromosomes = set(chromosomes1).intersection(set(chromosomes2)) chromosomes = list(chromosomes) chromosomes.sort() elif intersection==2: chromosomes = chromosomes1 snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds1,chromosomes=chromosomes, deliminator=delim, missingVal = missingVal) else: if help==0: print "The union or intersection options used are wrong!!\n" print __doc__ sys.exit(2)
def _test1_():
    """Manual check: parse "2010_v3.csv" and pass the result to filterMonomorphic."""
    import dataParsers
    filterMonomorphic(dataParsers.parseCSVData("2010_v3.csv"))
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["delim=", "missingval=", "withArrayId=", "comparisonFile=", "debug", "report", "help"] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None delim = ", " missingVal = "NA" comparisonFile = None debug = None report = None help = 0 withArrayIds = 0 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("--comparisonFile"): comparisonFile = arg elif opt in ("-o",): output_fname = arg elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("-b", "--debug"): debug = 1 elif opt in ("-r", "--report"): report = 1 else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds==1 or withArrayIds==2 waid2 = withArrayIds==2 import dataParsers import snpsdata snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) #Calculating Error rates #if comparisonFile: # snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) # for i in range(0,len(snpsds)): #Compare ... and record relevant information... #snpsds[i].compare filterBadSnps(snpsds2[i],maxError) # pass #Calculating NA rates.. print "Calculating NA rates" snpsNARates = [] for i in range(0,len(snpsds)): snpsNARates += snpsds[i].getSnpsNArates() import util rstr = "" rstr += "snpsNARates <- c("+",".join(util.valListToStrList(snpsNARates))+")\n" rstr += 'hist(snpsNARates, xlab="NA rates", ylab="SNP frequency", breaks=60)' f = open(output_fname,"w") f.write(rstr) f.close()
def _testRun1_():
    """Manual check: run filterByError (threshold 0.2) on "250K_m3.csv" against "2010_v3.csv"."""
    import dataParsers
    arrayData = dataParsers.parseCSVData("250K_m3.csv", withArrayIds=1)
    referenceData = dataParsers.parseCSVData("2010_v3.csv")
    filterByError(arrayData, referenceData, 0.2, withArrayIds=1)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["maxError=", "comparisonFile=", "maxMissing=", "removeEcotypeId=", "removeArrayId=", "first96", "removeIdentical", "onlyCommon", "delim=", "missingval=", "withArrayId=", "debug", "report", "help", "heterozygous2NA"] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:bh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None delim = "," missingVal = "NA" comparisonFile = None maxMissing = 1.0 maxError = 1.0 removeEcotypes = None removeArray = None removeIdentical = False onlyCommon = False debug = None report = None help = 0 withArrayIds = 1 first96 = False heterozygous2NA = False for opt, arg in opts: if opt in ('-o'): output_fname = arg elif opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("--comparisonFile"): comparisonFile = arg elif opt in ("--maxError"): maxError = float(arg) elif opt in ("--maxMissing"): maxMissing = float(arg) elif opt in ("--heterozygous2NA"): heterozygous2NA = True elif opt in ("--removeEcotypeId"): removeEcotypes = arg.split(",") removeEcotypes = map(int,removeEcotypes) elif opt in ("--removeArrayId"): removeArray = int(arg) elif opt in ("--removeIdentical"): removeIdentical = True elif opt in ("--onlyCommon"): onlyCommon = True elif opt in ("--first96"): first96 = True elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("-b", "--debug"): debug = 1 elif opt in ("-r", "--report"): report = 1 else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds==1 or withArrayIds==2 waid2 = withArrayIds==2 or withArrayIds==3 import dataParsers snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, 
missingVal=missingVal, withArrayIds=waid1) accessionsToRemove = [] arraysToRemove = None if first96: import dataParsers d = dataParsers.getEcotypeToAccessionDictionary(defaultValue='-1',user="******",passwd="bamboo123") ecotd = dataParsers.getEcotypeToNameDictionary(defaultValue='-1',user="******",passwd="bamboo123") print "Dictionaries loaded" names = [] first96Names = [] for i in range(0,len(snpsds[0].accessions)): ecotype = snpsds[0].accessions[i] arrayID = snpsds[0].arrayIds[i] names.append((arrayID,ecotd[ecotype],ecotype)) if int(d[ecotype][0]) > 97 or int(d[ecotype][0]) < 0: accessionsToRemove.append(ecotype) else: first96Names.append((arrayID,d[ecotype][1],d[ecotype][0],ecotype)) first96Names.sort() print "First 96 accessions, len:",len(first96Names),":" for name in first96Names: print name names.sort() print "All accessions:" for name in names: print name #Retrieve comparison list of accessions. (Error rates for accessions) if (removeIdentical or maxError<1.0) and comparisonFile: sys.stderr.write("Loading comparison file:") snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) res = [] sys.stderr.write("Comparing accessions.") for i in range(0,len(snpsds)): res.append(snpsds[i].compareWith(snpsds2[i],withArrayIds=withArrayIds,verbose=False,heterozygous2NA=heterozygous2NA)) sys.stderr.write(".") sys.stderr.write("\n") totalAccessionCounts = [0]*len(res[0][2]) accErrorRate = [0]*len(res[0][2]) for i in range(0,len(snpsds)): r = res[i] for j in range(0,len(r[2])): totalAccessionCounts[j] += r[6][j] accErrorRate[j]+=r[3][j]*float(r[6][j]) for i in range(0,len(accErrorRate)): accErrorRate[i]=accErrorRate[i]/float(totalAccessionCounts[i]) accErrAndID = [] if 0<withArrayIds<3: for i in range(0,len(r[2])): accErrAndID.append((accErrorRate[i], r[2][i], r[5][i])) else: for i in range(0,len(r[2])): accErrAndID.append((accErrorRate[i], r[2][i])) accErrAndID.sort() accErrAndID.reverse() #Figure out 
which accessions are too erroraneous if maxError<1.0 and comparisonFile: if withArrayIds: arraysToRemove = [] for (error,ecotype,array) in accErrAndID: if error> maxError: accessionsToRemove.append(ecotype) arraysToRemove.append(array) else: for (error,ecotype) in accErrAndID: if error> maxError: accessionsToRemove.append(ecotype) if removeIdentical and comparisonFile and withArrayIds: print "Locating identical accessions" accErrAndID.sort() if not arraysToRemove: arraysToRemove = [] for accession in set(snpsds[0].accessions): if snpsds[0].accessions.count(accession)>1: found = 0 for (error,ecotype,array) in accErrAndID: if ecotype==accession: if found>0: accessionsToRemove.append(ecotype) arraysToRemove.append(array) found += 1 if onlyCommon and comparisonFile: print "Locating accessions which are not shared" snpsds2 = dataParsers.parseCSVData(comparisonFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) #print snpsds2[0].accessions,'\n',snpsds[0].accessions,'\n',len(set(snpsds2[0].accessions).intersection(set(snpsds[0].accessions))) if not arraysToRemove: arraysToRemove = [] for i in range(0,len(snpsds[0].accessions)): acc = snpsds[0].accessions[i] if not acc in snpsds2[0].accessions: accessionsToRemove.append(acc) if 0<withArrayIds<3: arraysToRemove.append(snpsds[0].arrayIds[i]) if maxMissing<1.0: missingCounts = [0]*len(snpsds[0].accessions) numSnps = 0 for snpsd in snpsds: mc = snpsd.accessionsMissingCounts() numSnps += len(snpsd.positions) for i in range(0,len(snpsds[0].accessions)): missingCounts[i] += mc[i] missingRates = [] if withArrayIds: arraysToRemove = [] for i in range(0,len(snpsds[0].accessions)): missingRates.append((missingCounts[i]/float(numSnps),snpsds[0].accessions[i],snpsds[0].arrayIds[i])) missingRates.sort() missingRates.reverse() for (mrate,ecotype,array) in missingRates: if mrate>maxMissing: accessionsToRemove.append(ecotype) arraysToRemove.append(array) else: for i in range(0,len(snpsds[0].accessions)): 
missingRates.append((missingCounts[i]/float(numSnps),snpsds[0].accessions[i])) missingRates.sort() missingRates.reverse() for (mrate,ecotype) in missingRates: if mrate>maxMissing: accessionsToRemove.append(ecotype) if removeEcotypes: for removeEcotype in removeEcotypes: accessionsToRemove.append(str(int(removeEcotype))) print "Removing", len(accessionsToRemove), "accessions." if removeArray: if not arraysToRemove: arraysToRemove = [] arraysToRemove.append(str(removeArray)) print "Removing", len(arraysToRemove)," arrays." numAccessions = len(snpsds[0].accessions) sys.stderr.write("Removing accessions.") for snpsd in snpsds: snpsd.removeAccessions(accessionsToRemove,arrayIds=arraysToRemove) sys.stderr.write(".") print "\n", (numAccessions-len(snpsds[0].accessions)), "accessions out of "+str(numAccessions)+" were removed." import snpsdata snpsdata.writeRawSnpsDatasToFile(output_fname,snpsds,chromosomes=[1,2,3,4,5], deliminator=delim, missingVal = missingVal, withArrayIds = waid1)
def _testRun1_():
    """
    Parse "250K_m3.csv" (with array IDs) and "2010_v3.csv", then hand both
    parsed data sets to filterByError.
    """
    import dataParsers

    parse = dataParsers.parseCSVData
    arrayCalls = parse("250K_m3.csv", withArrayIds=1)
    referenceCalls = parse("2010_v3.csv")
    # NOTE(review): 0.2 is passed positionally; presumably the maximum
    # tolerated error rate -- confirm against filterByError's signature.
    filterByError(arrayCalls, referenceCalls, 0.2, withArrayIds=1)
def _plotKW_(): """ Analyze how population structure affects KW. """ filterProb = 0.1 p_i = 1 res_dir = "/Users/bjarni/tmp/" runId = "_full_quick_" snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") #,debug=True) phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t') snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds) totalSNPs = [] for i in range(len(snpsds)): snpsds[i] = snpsds[i].getSnpsData() totalSNPs += snpsds[i].snps #For memory, remove random SNPs snps = [] for snp in totalSNPs: if random.random() < filterProb: snps.append(snp) totalSNPs = snps #globalKinship = calcKinship(totalSNPs) gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. #chr = 1 #for snpsd in snpsds: snpsd = snpsds[3] k = calcKinship(snpsd.snps[200:1400]) res = runEmma(phed, p_i, k, snpsd.snps[200:1400]) #runEmma(phed,p_i,k,snps): pvals = res["ps"] log_pvals = [] for pval in pvals: #print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "c.", label="Emma (local)") k = calcKinship(totalSNPs) res = runEmma(phed, p_i, k, snpsd.snps[200:1400]) #runEmma(phed,p_i,k,snps): pvals = res["ps"] log_pvals = [] for pval in pvals: #print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "g.", label="Emma (global)") phenVals = phed.getPhenVals(p_i) pvals = _run_kw_(snpsd.snps[200:1400], phenVals) log_pvals = [] for pval in pvals: #print pval log_pvals.append(-math.log10(pval)) pylab.plot(snpsd.positions[200:1400], log_pvals, "r.", label="KW (full data)") (pvals, new_positions, acc_groups) = get_KW_pvals(snpsd.snps[200:1400], snpsd.positions[200:1400], phed, p_i, kinshipThreshold=0.95, method="KW") ecot_map = phenotypeData._getEcotypeIdToStockParentDict_() for i in 
range(0, len(acc_groups)): acc_list = [] for a_i in acc_groups[i]: e_i = snpsd.accessions[a_i] #print e_i acc_list.append(ecot_map[int(e_i)][0]) print "group", i, ":", acc_list log_pvals = [] for pval in pvals: #print pval log_pvals.append(-math.log10(pval)) pylab.plot(new_positions, log_pvals, "b.", label="KW (merged data)") pylab.legend(numpoints=2, handlelen=0.005) pylab.show()
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=", "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=", "secondRound", "minMAF="] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType = 1 impFile = None delim = "," missingVal = "NA" help = 0 withArrayIds = 1 parallel = None logTransform = False parallelAll = False chunkSize = 250000 round2Size = 5000 nTrees = 15000 nodeSize = None mem = "8g" skipSecondRound = True minMAF = 0.0 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-o","--rFile"): impFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--secondRound"): skipSecondRound = False elif opt in ("-d","--delim"): delim = arg elif opt in ("--chunkSize"): chunkSize = int(arg) elif opt in ("--round2Size"): round2Size = int(arg) elif opt in ("--nTrees"): nTrees = int(arg) elif opt in ("--nodeSize"): nodeSize = int(arg) elif opt in ("--mem"): mem = arg elif opt in ("-m","--missingval"): missingVal = arg elif opt in ("-m","--minMAF"): minMAF = float(arg) else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) def runParallel(phenotypeIndex): #Cluster specific parameters phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/","_div_") phenName = phenName.replace("*","_star_") 
impFileName = resultDir+"RF_"+parallel+"_"+phenName outFileName = impFileName shstr = """#!/bin/csh #PBS -l walltime=120:00:00 """ shstr += "#PBS -l mem="+mem+"\n" shstr +=""" #PBS -q cmb """ shstr += "#PBS -N RF"+phenName+"_"+parallel+"\n" shstr += "(python "+programDir+"RandomForest.py -o "+impFileName+" --chunkSize "+str(chunkSize)+" --nTrees "+str(nTrees)+" --mem "+str(mem)+" --round2Size "+str(round2Size)+"" if nodeSize: shstr += " --nodeSize "+str(nodeSize)+" " if logTransform: shstr += " --logTransform " if not skipSecondRound: shstr += " --secondRound " shstr += " -a "+str(withArrayIds)+" " shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n" f = open(parallel+".sh",'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") #Nested function ends snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. if parallelAll: phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex = int(args[2]) print "chunkSize:",chunkSize print "nTrees:",nTrees print "nodeSize:",nodeSize print "mem:",mem print "logTransform:",logTransform print "round2Size:",round2Size print "skipSecondRound:",skipSecondRound #Loading genotype data import dataParsers snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) #Load phenotype file sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") 
sys.stdout.flush() for i in range(0,len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed, leaving",len(accIndicesToKeep),"accessions in all." print "Filtering phenotype data." phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc),i)) i += 1 phed.orderAccessions(accessionMapping) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotype) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps" #Remove minor allele frequencies if minMAF!=0: sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Converting format to 01 import snpsdata newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" snpsds = newSnpsds #Writing files import tempfile if env.user=="bjarni": tempfile.tempdir='/tmp' (fId, phenotypeTempFile) = tempfile.mkstemp() os.close(fId) (fId, genotypeTempFile) = tempfile.mkstemp() os.close(fId) phed.writeToFile(phenotypeTempFile, [phenotype]) sys.stdout.write( "Phenotype file written\n") sys.stdout.flush() #Retain only the correct runchunk of data. 
chromasomes = [] positions = [] snps = [] for i in range(0,len(snpsds)): snpsd = snpsds[i] positions += snpsd.positions snps += snpsd.snps chrList = [i+1]*len(snpsd.positions) chromasomes += chrList #Is the phenotype binary? binary = phed.isBinary(phenotypeIndex) import util impFile = impFile+".imp" rDataFile = impFile+".rData" rFile = impFile+".r" outRfile = rFile+".out" errRfile = rFile+".err" topImpFile = impFile+"_top"+str(chunkSize)+".imp" topRDataFile = impFile+"_top.rData" try: os.remove(impFile) #Removing file if it already exits. except Exception: print "Couldn't remove",impFile try: os.remove(topImpFile) #Removing file if it already exits. except Exception: print "Couldn't remove",topImpFile for startIndex in range(0,len(positions),chunkSize): if startIndex+chunkSize>=len(positions): endIndex = len(positions) else: endIndex = startIndex+chunkSize #Writing genotype data to file. tmpFile = open(genotypeTempFile,"w") for i in range(startIndex,endIndex): outStr ="" snp = util.valListToStrList(snps[i]) outStr += str(chromasomes[i])+","+str(positions[i])+"," outStr += ",".join(snp) outStr += "\n" tmpFile.write(outStr) tmpFile.close() rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize) f = open(rFile,'w') f.write(rstr) f.close() #outRfile = rFile+"_"+str(startIndex/chunkSize)+".out" #errRfile = rFile+"_"+str(startIndex/chunkSize)+".err" print "Running model nr",startIndex/chunkSize,":" cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile sys.stdout.write(cmdStr+"\n") sys.stdout.flush() os.system(cmdStr) print "Random forest output saved in", impFile if not skipSecondRound: #Run on the top 'chunkSize' number of hits. #loading the R output file. 
impF = open(impFile,"r") lines=impF.readlines() impF.close() impList = list() for i in range(1,len(lines)): line = lines[i] line.strip() l = line.split(",") impList.append( (float(l[2]),l[0],l[1],snps[i]) ) impList.sort() impList.reverse() #Writing genotype data to file. tmpFile = open(genotypeTempFile,"w") for i in range(0,round2Size): outStr = "" snp = util.valListToStrList(impList[i][3]) outStr += str(impList[i][1])+","+str(impList[i][2])+"," outStr += ",".join(snp) outStr += "\n" tmpFile.write(outStr) tmpFile.close() rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize) f = open(rFile,'w') f.write(rstr) f.close() print "Running randomForest on the top importance scores:" cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile sys.stdout.write(cmdStr+"\n") sys.stdout.flush() os.system(cmdStr)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "monomorphic", "monomorphic", "delim=", "missingval=", "withArrayId=", "windowSize=", "debug", "report", "help" ] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:w:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None delim = "," missingVal = "NA" monomorphic = False help = 0 withArrayIds = 0 windowSize = 30 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg) elif opt in ("--monomorphic"): monomorphic = True elif opt in ("--windowSize"): windowSize = arg elif opt in ("-o", ): output_fname = arg elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: output_fname if help == 0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds == 1 or withArrayIds == 2 waid2 = withArrayIds == 2 (snpsds, chromosomes) = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, returnChromosomes=True) accessions = snpsds[0].accessions arrayIds = snpsds[0].arrayIds positionsList = [] tmpFiles = [] #tempfile.tempdir='/tmp' i = 1 for snpsd in snpsds: tmpFile1 = tempfile.mkstemp() os.close(tmpFile1[0]) tmpFile2 = tempfile.mkstemp() os.close(tmpFile2[0]) tmpFiles.append((tmpFile1[1], tmpFile2[1])) positionsList.append(snpsd.positions) print "Preparing data in", tmpFile1[1] writeAsNputeFile(snpsd, tmpFile1[1]) checkNputeFile(tmpFile1[1]) del snpsd.snps nputeCmd = "python " + path_NPUTE + "NPUTE.py -m 0 -w " + str( windowSize) + " -i " + str(tmpFile1[1]) + " -o " + str(tmpFile2[1]) print "Imputing chromosome", i i += 1 print nputeCmd os.system(nputeCmd) for i in range(0, len(tmpFiles)): print "Reading chromosome", i + 1 snpsds[i] = 
readNputeFile(tmpFiles[i][1], accessions, positionsList[i], arrayIds=arrayIds) os.remove(tmpFiles[i][0]) os.remove(tmpFiles[i][1]) snpsDataSet = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5]) #Filtering monomorphic if monomorphic: print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" snpsDataSet.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)
def _plotKinshipDiffs_(): filterProb = 0.2 p_i = 1 res_dir = "/Users/bjarni/tmp/" runId = "full_" snpsDataFile = "/Network/Data/250k/dataFreeze_080608/250K_f10_080608.csv" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=",") #,debug=True) phenotypeFile = "/Network/Data/250k/dataFreeze_080608/phenotypes_all_raw_111008.tsv" print "Loading phenotype data" phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t') snpsdata.coordinateSnpsAndPhenotypeData(phed, p_i, snpsds) for snpsd in snpsds: snpsd.filterMinMAF(0.1) snpsd.filterMonoMorphicSnps() totalSNPs = [] for i in range(len(snpsds)): snpsds[i] = snpsds[i].getSnpsData() totalSNPs += snpsds[i].snps #For memory, remove random SNPs snps = [] for snp in totalSNPs: if random.random() < filterProb: snps.append(snp) totalSNPs = snps print "Calculating the global kinship..." globalKinship = calcKinship(totalSNPs) print "done." normalizedGlobalKinship = globalKinship / mean(globalKinship) gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. for i in range(4, 5): #len(snpsds)): chr = i + 1 snpsd = snpsds[i] #pylab.subplot(5,1,chr) # pylab.figure(figsize=(18,4)) # (kinshipDiffs,binPos,local300Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=300000) # pylab.plot(binPos,kinshipDiffs,"r",label='ws$=300000$') # (kinshipDiffs,binPos,local500Kinships) = getKinshipDiffs(snpsd,normalizedGlobalKinship,windowSize=500000) # pylab.plot(binPos,kinshipDiffs,"b",label='ws$=500000$') # pylab.legend(numpoints=2,handlelen=0.005) # pylab.title("Kinship diff. chr. "+str(chr)) # pylab.savefig(res_dir+runId+"kinshipDiffs_500_300kb_chr"+str(chr)+".pdf",format="pdf") # pylab.clf() pylab.figure(figsize=(18, 4)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=300000) pylab.plot(binPos, emmaDiffs, "r", label='ws$=300000$') pylab.title("Emma avg. p-value diff. 500kb on chr. 
" + str(chr)) (emmaDiffs, binPos) = getEmmaDiffs(snpsd, phed, p_i, globalKinship, windowSize=500000) pylab.plot(binPos, emmaDiffs, "b", label='ws$=500000$') pylab.title("Emma avg. p-value diff. on chr. " + str(chr)) pylab.legend(numpoints=2, handlelen=0.005) pylab.savefig(res_dir + runId + "EmmaPvalDiffs_500_300kb_chr" + str(chr) + ".pdf", format="pdf") pylab.clf() gc.collect( ) #Calling garbage collector, in an attempt to clean up memory..
def _run_(): if len(sys.argv)==1: print __doc__ sys.exit(2) long_options_list=["outputFile=", "delim=", "missingval=", "withArrayId=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , "subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "permTest=", "savePermutations", "permutationFilter=", "testRobustness"] try: opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType=1 outputFile=None delim="," missingVal="NA" help=0 withArrayIds=1 parallel=None parallelAll=False addToDB=False callMethodID=None comment="" subSample=None onlyOriginal96=False onlyOriginal192 = False subSampleLikePhenotype = None subsampleTest = False numSubSamples = None complement = False onlyBelowLatidue = None onlyAboveLatidue = None sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 permTest = None savePermutations = False permutationFilter = 1.0 testRobustness = False for opt, arg in opts: if opt in ("-h", "--help"): help=1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds=int(arg) elif opt in ("-o", "--outputFile"): outputFile=arg elif opt in ("--phenotypeFileType"): phenotypeFileType=int(arg) elif opt in ("--parallel"): parallel=arg elif opt in ("--parallelAll"): parallelAll=True elif opt in ("--addToDB"): addToDB=True elif opt in ("--onlyOriginal96"): onlyOriginal96=True elif opt in ("--onlyOriginal192"): onlyOriginal192=True elif opt in ("--complement"): complement=True elif opt in ("--subSample"): subSample=int(arg) elif opt in ("--subsampleTest"): subsampleTest = True l = arg.split(",") subSample=int(l[0]) numSubSamples=int(l[1]) elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue=float(arg) elif opt in 
("--onlyAboveLatidue"): onlyAboveLatidue=float(arg) elif opt in ("--subSampleLikePhenotype"): subSampleLikePhenotype=int(arg) elif opt in ("--callMethodID"): callMethodID=int(arg) elif opt in ("--comment"): comment=arg elif opt in ("-d", "--delim"): delim=arg elif opt in ("-m", "--missingval"): missingVal=arg elif opt in ("--sr"): sr = True elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permTest"): permTest = int(arg) elif opt in ("--savePermutations"): savePermutations = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile=args[0] phenotypeDataFile=args[1] print "Kruskal-Wallis is being set up with the following parameters:" print "phenotypeDataFile:",phenotypeDataFile print "snpsDataFile:",snpsDataFile print "parallel:",parallel print "parallelAll:",parallelAll print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "onlyAboveLatidue:",onlyAboveLatidue print "subSampleLikePhenotype:",subSampleLikePhenotype print "subsampleTest:",subsampleTest print "numSubSamples:",numSubSamples print "subSample:",subSample print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "permTest:",permTest print "savePermutations:",savePermutations print "permutationFilter:",permutationFilter print "testRobustness:",testRobustness def runParallel(phenotypeIndex,id=""): #Cluster specific parameters 
phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) phenName=phenName.replace("/", "_div_") phenName=phenName.replace("*", "_star_") outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr="""#!/bin/csh #PBS -l walltime=100:00:00 #PBS -l mem=4g #PBS -q cmb """ shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " shstr+=" -a "+str(withArrayIds)+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") if parallel: #Running on the cluster.. 
if parallelAll: phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) elif subsampleTest: phenotypeIndex=int(args[2]) for i in range(0,numSubSamples): runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i)) else: phenotypeIndex=int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex=int(args[2]) print "phenotypeIndex:",phenotypeIndex print "output:",outputFile print "\nStarting program now!\n" #Load phenotype file phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) 
phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) elif onlyAboveLatidue: print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if subSampleLikePhenotype: p_name = phed.getPhenotypeName(subSampleLikePhenotype) print "Picking sample as in",p_name ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype) print ecotypes phed.filterAccessions(ecotypes) print "len(phed.accessions)", len(phed.accessions) if subSample: sample_ecotypes = [] ecotypes = phed.getNonNAEcotypes(phenotypeIndex) sample_ecotypes = random.sample(ecotypes,subSample) phed.filterAccessions(sample_ecotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() #Load genotype file snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal, withArrayIds = withArrayIds) #Checking overlap between phenotype and genotype accessions. phenotype=phed.getPhenIndex(phenotypeIndex) accIndicesToKeep=[] phenAccIndicesToKeep=[] numAcc=len(snpsds[0].accessions) sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1=snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2=phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. 
for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all." print "Filtering phenotype data." phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping=[] i=0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i+=1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds=[] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Double check genotype file: problems = 0 for i in range(0,len(newSnpsds)): snpsd = newSnpsds[i] for j in range(0,len(snpsd.snps)): snp = snpsd.snps[j] sc = snp.count(0) if sc==0 or sc==len(snp): print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i] problems += 1 if problems >0: print "Genotype file appears to have potential problems" else: print "Genotype file appears to be good" if permTest: print "Starting a permutation test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" permTest = 100 _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter) sys.exit(0) if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" 
_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter) sys.exit(0) sys.stdout.flush() print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun if (not sr) or (sr and not srSkipFirstRun): #Writing files if env.user=="bjarni": tempfile.tempdir='/tmp' (fId, phenotypeTempFile)=tempfile.mkstemp() os.close(fId) (fId, genotypeTempFile)=tempfile.mkstemp() os.close(fId) phed.writeToFile(phenotypeTempFile, [phenotype]) sys.stdout.write("Phenotype file written\n") sys.stdout.flush() snpsDataset=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) decoder={1:1, 0:0,-1:'NA'} snpsDataset.writeToFile(genotypeTempFile, deliminator = delim, missingVal = missingVal, withArrayIds = 0, decoder = decoder) sys.stdout.write("Genotype file written\n") sys.stdout.flush() phenotypeName=phed.getPhenotypeName(phenotypeIndex) rDataFile=outputFile+".rData" pvalFile=outputFile+".pvals" #Is the phenotype binary? binary=phed.isBinary(phenotypeIndex) rstr=_generateRScript_(genotypeTempFile, phenotypeTempFile, rDataFile, pvalFile, name = phenotypeName, binary = binary) rFileName=outputFile+".r" f=open(rFileName, 'w') f.write(rstr) f.close() outRfile=rFileName+".out" errRfile=rFileName+".err" print "Running R file:" cmdStr="(R --vanilla < "+rFileName+" > "+outRfile+") >& "+errRfile sys.stdout.write(cmdStr+"\n") sys.stdout.flush() gc.collect() os.system(cmdStr) #print "Emma output saved in R format in", rDataFile print "Generating a GW plot." res = gwaResults.Result(pvalFile,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile else: print "Skipping first stage analysis." sys.stdout.flush() if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary) print "Generating second run GW plot." 
res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=", "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=", "secondRound", "minMAF=" ] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType = 1 impFile = None delim = "," missingVal = "NA" help = 0 withArrayIds = 1 parallel = None logTransform = False parallelAll = False chunkSize = 250000 round2Size = 5000 nTrees = 15000 nodeSize = None mem = "8g" skipSecondRound = True minMAF = 0.0 for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg) elif opt in ("-o", "--rFile"): impFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--parallelAll"): parallelAll = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--secondRound"): skipSecondRound = False elif opt in ("-d", "--delim"): delim = arg elif opt in ("--chunkSize"): chunkSize = int(arg) elif opt in ("--round2Size"): round2Size = int(arg) elif opt in ("--nTrees"): nTrees = int(arg) elif opt in ("--nodeSize"): nodeSize = int(arg) elif opt in ("--mem"): mem = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("-m", "--minMAF"): minMAF = float(arg) else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args) < 3 and not parallel: if help == 0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) def runParallel(phenotypeIndex): #Cluster specific parameters phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/", "_div_") phenName = 
phenName.replace("*", "_star_") impFileName = resultDir + "RF_" + parallel + "_" + phenName outFileName = impFileName shstr = """#!/bin/csh #PBS -l walltime=50:00:00 """ shstr += "#PBS -l mem=" + mem + "\n" shstr += """ #PBS -q cmb """ shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n" shstr += "(python " + programDir + "RandomForest.py -o " + impFileName + " --chunkSize " + str( chunkSize) + " --nTrees " + str(nTrees) + " --mem " + str( mem) + " --round2Size " + str(round2Size) + "" if nodeSize: shstr += " --nodeSize " + str(nodeSize) + " " if logTransform: shstr += " --logTransform " if not skipSecondRound: shstr += " --secondRound " shstr += " -a " + str(withArrayIds) + " " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") #Nested function ends snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. 
if parallelAll: phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex = int(args[2]) print "chunkSize:", chunkSize print "nTrees:", nTrees print "nodeSize:", nodeSize print "mem:", mem print "logTransform:", logTransform print "round2Size:", round2Size print "skipSecondRound:", skipSecondRound #Loading genotype data import dataParsers snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) #Load phenotype file sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len( accIndicesToKeep), "accessions in all." print "Filtering phenotype data." 
phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i += 1 phed.orderAccessions(accessionMapping) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotype) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Remove minor allele frequencies if minMAF != 0: sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Converting format to 01 import snpsdata newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" snpsds = newSnpsds #Writing files import tempfile if env.user == "bjarni": tempfile.tempdir = '/tmp' (fId, phenotypeTempFile) = tempfile.mkstemp() os.close(fId) (fId, genotypeTempFile) = tempfile.mkstemp() os.close(fId) phed.writeToFile(phenotypeTempFile, [phenotype]) sys.stdout.write("Phenotype file written\n") sys.stdout.flush() #Retain only the correct runchunk of data. chromasomes = [] positions = [] snps = [] for i in range(0, len(snpsds)): snpsd = snpsds[i] positions += snpsd.positions snps += snpsd.snps chrList = [i + 1] * len(snpsd.positions) chromasomes += chrList #Is the phenotype binary? binary = phed.isBinary(phenotypeIndex) import util impFile = impFile + ".imp" rDataFile = impFile + ".rData" rFile = impFile + ".r" outRfile = rFile + ".out" errRfile = rFile + ".err" topImpFile = impFile + "_top" + str(chunkSize) + ".imp" topRDataFile = impFile + "_top.rData" try: os.remove(impFile) #Removing file if it already exits. 
except Exception: print "Couldn't remove", impFile try: os.remove(topImpFile) #Removing file if it already exits. except Exception: print "Couldn't remove", topImpFile for startIndex in range(0, len(positions), chunkSize): if startIndex + chunkSize >= len(positions): endIndex = len(positions) else: endIndex = startIndex + chunkSize #Writing genotype data to file. tmpFile = open(genotypeTempFile, "w") for i in range(startIndex, endIndex): outStr = "" snp = util.valListToStrList(snps[i]) outStr += str(chromasomes[i]) + "," + str(positions[i]) + "," outStr += ",".join(snp) outStr += "\n" tmpFile.write(outStr) tmpFile.close() rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, binary=binary, nTrees=nTrees, nodeSize=nodeSize) f = open(rFile, 'w') f.write(rstr) f.close() #outRfile = rFile+"_"+str(startIndex/chunkSize)+".out" #errRfile = rFile+"_"+str(startIndex/chunkSize)+".err" print "Running model nr", startIndex / chunkSize, ":" cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile sys.stdout.write(cmdStr + "\n") sys.stdout.flush() os.system(cmdStr) print "Random forest output saved in", impFile if not skipSecondRound: #Run on the top 'chunkSize' number of hits. #loading the R output file. impF = open(impFile, "r") lines = impF.readlines() impF.close() impList = list() for i in range(1, len(lines)): line = lines[i] line.strip() l = line.split(",") impList.append((float(l[2]), l[0], l[1], snps[i])) impList.sort() impList.reverse() #Writing genotype data to file. 
tmpFile = open(genotypeTempFile, "w") for i in range(0, round2Size): outStr = "" snp = util.valListToStrList(impList[i][3]) outStr += str(impList[i][1]) + "," + str(impList[i][2]) + "," outStr += ",".join(snp) outStr += "\n" tmpFile.write(outStr) tmpFile.close() rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, binary=binary, nTrees=nTrees, nodeSize=nodeSize) f = open(rFile, 'w') f.write(rstr) f.close() print "Running randomForest on the top importance scores:" cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile sys.stdout.write(cmdStr + "\n") sys.stdout.flush() os.system(cmdStr)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = ["outputSNPsFile=","outputPhenotFile=", "filterMonomorphic", "rawDataFormat", "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=", "phenotypeName=", "calcKinshipMatrix=", "orderAccessions", "help"] try: opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None outputPhenotFile = None delim = "," missingVal = "NA" phenotypeFile = None kinshipMatrixFile = None phenotype = None phenotypeName = None rawDataFormat = False monomorphic = False help = 0 withArrayIds = 1 orderAccessions = False for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a","--withArrayId"): withArrayIds = int(arg) elif opt in ("-f","--phenotypeFile"): phenotypeFile = arg elif opt in ("calcKinshipMatrix"): kinshipMatrixFile = arg elif opt in ("--filterMonomorphic"): monomorphic = True elif opt in ("--rawDataFormat"): rawDataFormat = True elif opt in ("--minCallProb"): minCallProb = float(arg) elif opt in ("-p","--phenotype"): phenotype = int(arg) elif opt in ("-o","--outputSNPsFile"): output_fname = arg elif opt in ("--orderAccessions"): orderAccessions = True elif opt in ("-u","--phenotypeFile"): outputPhenotFile = arg elif opt in ("-d","--delim"): delim = arg elif opt in ("-m","--missingval"): missingVal = arg else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: print output_fname if help==0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds==1 or withArrayIds==2 waid2 = withArrayIds==2 import dataParsers snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) if phenotypeFile: import phenotypeData phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter='\t') #Get Phenotype data 
accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) if phenotype>=0: #Load phenotype file sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0,len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break elif phenotype==None: sys.stdout.write("Removing accessions which do not have any phenotype values.") sys.stdout.flush() for i in range(0,len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0,len(phed.accessions)): acc2 = phed.accessions[j] if acc1==acc2: accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter Accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep),"accessions removed, leaving",len(accIndicesToKeep),"accessions in all." if outputPhenotFile: print "Filtering phenotype data." 
phed.removeAccessions(phenAccIndicesToKeep) if orderAccessions: accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc),i)) i += 1 phed.orderAccessions(accessionMapping) if phenotype>=0: phed.writeToFile(outputPhenotFile, [phenotype]) else: phed.writeToFile(outputPhenotFile) #Filtering monomorphic if monomorphic: print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()),"Snps" import snpsdata newSnpsds = [] if not rawDataFormat: sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" waid1 = 0 snpsDataset = snpsdata.SnpsDataSet(newSnpsds,[1,2,3,4,5]) decoder = {1:1, 0:0, -1:'NA'} else: snpsDataset = snpsdata.SnpsDataSet(snpsds,[1,2,3,4,5]) decoder=None snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal = missingVal, withArrayIds = waid1, decoder=decoder)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=", "parallelAll", "useFloats" ] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType = 1 outputFile = None delim = "," missingVal = "NA" help = 0 withArrayIds = 1 parallel = None parallelAll = False sampleNum = None chromosomes = [1, 2, 3, 4, 5] useFloats = False for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", "--outputFile"): outputFile = arg elif opt in ("--parallel"): parallel = arg elif opt in ("--parallelAll"): parallelAll = True elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("n", "--sampleNum"): sampleNum = int(arg) elif opt in ("--useFloats"): useFloats = True else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args) < 3 and not parallel: if help == 0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile = args[0] phenotypeDataFile = args[1] print "CAMP is being set up with the following parameters:" print "phenotypeDataFile:", phenotypeDataFile if len(args) > 2: print "Phenotype_id:", args[2] print "snpsDataFile:", snpsDataFile print "parallel:", parallel print "parallelAll:", parallelAll print "sampleNum:", sampleNum def runParallel(phenotypeIndex, id=""): #Cluster specific parameters phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data phenName = phed.getPhenotypeName(phenotypeIndex) phenName = phenName.replace("/", "_div_") phenName = phenName.replace("*", "_star_") outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id shstr = """#!/bin/csh #PBS -l walltime=24:00:00 #PBS -l mem=6g #PBS -q cmb """ shstr += "#PBS -N C" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + 
parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " " if sampleNum: shstr += " -n " + str(sampleNum) + " " if useFloats: shstr += " --useFloats " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") if parallel: #Running on the cluster.. if parallelAll: phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex = int(args[2]) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data #Load genotype file snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds) #Checking overlap between phenotype and genotype accessions. phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len( accIndicesToKeep), "accessions in all." 
print "Filtering phenotype data." phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i += 1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Writing phenotype data to CAMP format. (fId, phenotypeFile) = tempfile.mkstemp() os.close(fId) phenVals = phed.getPhenVals(phenotypeIndex, asString=False) if not useFloats: phenVals = map(int, phenVals) phenFile = open(phenotypeFile, "w") for value in phenVals: phenFile.write(str(value) + "\n") phenFile.close() chromosome_list = [] positions_list = [] scores_list = [] interaction_positions_list = [] mafs = [] marfs = [] #Writing SNP data to CAMP format. 
for chromosome in chromosomes: (fId, snpsFile) = tempfile.mkstemp() os.close(fId) (fId, posFile) = tempfile.mkstemp() os.close(fId) sf = open(snpsFile, "w") pf = open(posFile, "w") snpsd = newSnpsds[chromosome - 1] for i in range(0, len(snpsd.snps)): snp = snpsd.snps[i] (marf, maf) = snpsdata.getMAF(snp) marfs.append(marf) mafs.append(maf) str_snp = map(str, snp) double_snp = [] for nt in str_snp: double_snp.append(nt) double_snp.append(nt) sf.write("".join(double_snp) + "\n") pf.write(str(snpsd.positions[i]) + "\n") sf.close() pf.close() outFile = outputFile + "_job_" + str(chromosome) + ".out" errFile = outputFile + "_job_" + str(chromosome) + ".err" resFile = outputFile + "_" + str(chromosome) + ".out" print "resFile,outFile,errFile,snpsFile,posFile,phenotypeFile:", resFile, outFile, errFile, snpsFile, posFile, phenotypeFile results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum) positions_list += results["positions"] scores_list += results["scores"] for (i, j) in results["snpIndices"]: if not (j < 0 or i < 0): marfs.append(0.5) #An ugly hack!!! mafs.append(0.5) chromosome_list.append(chromosome) scoreFile = outputFile + ".scores" f = open(scoreFile, "w") f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n") for i in range(0, len(positions_list)): chromosome = chromosome_list[i] (pos1, pos2) = positions_list[i] score = scores_list[i] marf = marfs[i] maf = mafs[i] l = map(str, [chromosome, pos1, score, marf, maf, pos2]) f.write(",".join(l) + "\n") f.close()
def _run_(): if len(sys.argv)==1: print __doc__ sys.exit(2) long_options_list=["outputFile=", "delim=", "missingval=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "addToDB", "callMethodID=", "comment=", "onlyOriginal192","onlyOriginal96", "subSample=" , "subSampleLikePhenotype=", "subsampleTest=", "complement", "onlyBelowLatidue=", "onlyAboveLatidue=", "srInput=", "sr","srOutput=", "srPar=","srSkipFirstRun", "permTest=", "savePermutations", "permutationFilter=", "testRobustness", "memReq=","walltimeReq=",] try: opts, args=getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeFileType=1 outputFile=None delim="," missingVal="NA" help=0 parallel=None parallelAll=False addToDB=False callMethodID=None comment="" subSample=None onlyOriginal96=False onlyOriginal192 = False subSampleLikePhenotype = None subsampleTest = False numSubSamples = None complement = False onlyBelowLatidue = None onlyAboveLatidue = None sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 permTest = None savePermutations = False permutationFilter = 1.0 testRobustness = False memReq = "5g" walltimeReq = "100:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help=1 print __doc__ elif opt in ("-o", "--outputFile"): outputFile=arg elif opt in ("--phenotypeFileType"): phenotypeFileType=int(arg) elif opt in ("--parallel"): parallel=arg elif opt in ("--parallelAll"): parallelAll=True elif opt in ("--addToDB"): addToDB=True elif opt in ("--onlyOriginal96"): onlyOriginal96=True elif opt in ("--onlyOriginal192"): onlyOriginal192=True elif opt in ("--complement"): complement=True elif opt in ("--subSample"): subSample=int(arg) elif opt in ("--subsampleTest"): subsampleTest = True l = arg.split(",") subSample=int(l[0]) numSubSamples=int(l[1]) elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue=float(arg) elif opt in ("--onlyAboveLatidue"): 
onlyAboveLatidue=float(arg) elif opt in ("--subSampleLikePhenotype"): subSampleLikePhenotype=int(arg) elif opt in ("--callMethodID"): callMethodID=int(arg) elif opt in ("--comment"): comment=arg elif opt in ("-d", "--delim"): delim=arg elif opt in ("-m", "--missingval"): missingVal=arg elif opt in ("--sr"): sr = True elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permTest"): permTest = int(arg) elif opt in ("--savePermutations"): savePermutations = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--memReq"): memReq=arg elif opt in ("--walltimeReq"): walltimeReq=arg else: if help==0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args)<3 and not parallel: if help==0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) snpsDataFile=args[0] phenotypeDataFile=args[1] print "Kruskal-Wallis is being set up with the following parameters:" print "phenotypeDataFile:",phenotypeDataFile print "snpsDataFile:",snpsDataFile print "parallel:",parallel print "parallelAll:",parallelAll print "onlyOriginal96:",onlyOriginal96 print "onlyOriginal192:",onlyOriginal192 print "onlyBelowLatidue:",onlyBelowLatidue print "onlyAboveLatidue:",onlyAboveLatidue print "complement:",complement print "subSampleLikePhenotype:",subSampleLikePhenotype print "subsampleTest:",subsampleTest print "numSubSamples:",numSubSamples print "subSample:",subSample print "sr:",sr print "srSkipFirstRun:",srSkipFirstRun print "srInput:",srInput print "srOutput:",srOutput print "srTopQuantile:",srTopQuantile print "srWindowSize:",srWindowSize print "permTest:",permTest print "savePermutations:",savePermutations print "permutationFilter:",permutationFilter print "testRobustness:",testRobustness 
print "walltimeReq:",walltimeReq print "memReq:",memReq def runParallel(phenotypeIndex,id=""): #Cluster specific parameters phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data phenName=phed.getPhenotypeName(phenotypeIndex) print phenName outputFile=resultDir+"KW_"+parallel+"_"+phenName+id shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime="+walltimeReq+"\n" shstr += "#PBS -l mem="+memReq+"\n" shstr +="#PBS -q cmb\n" shstr+="#PBS -N K"+phenName+"_"+parallel+"\n" shstr+="set phenotypeName="+parallel+"\n" shstr+="set phenotype="+str(phenotypeIndex)+"\n" shstr+="(python "+scriptDir+"KW.py -o "+outputFile+" " if subSample: shstr+=" --subSample="+str(subSample)+" " elif onlyOriginal96: shstr+=" --onlyOriginal96 " elif onlyOriginal192: shstr+=" --onlyOriginal192 " if onlyBelowLatidue: shstr+=" --onlyBelowLatidue="+str(onlyBelowLatidue)+" " elif onlyAboveLatidue: shstr+=" --onlyAboveLatidue="+str(onlyAboveLatidue)+" " if complement: shstr+=" --complement " if permTest: shstr+=" --permTest="+str(permTest)+" " if savePermutations: shstr+=" --savePermutations " shstr+=" --permutationFilter="+str(permutationFilter)+" " if testRobustness: shstr+=" --testRobustness " if sr: shstr += " --sr " if not srOutput: output = resultDir+"KW_"+parallel+"_"+phenName+".sr.pvals" shstr += " --srOutput="+str(output)+" " if srSkipFirstRun: if not srInput: output = resultDir+"KW_"+parallel+"_"+phenName+".pvals" shstr += " --srInput="+str(output)+" " shstr += " --srSkipFirstRun " shstr += " --srPar="+str(srTopQuantile)+","+str(srWindowSize)+" " shstr+=snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" " shstr+="> "+outputFile+"_job"+".out) >& "+outputFile+"_job"+".err\n" f=open(parallel+".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub "+parallel+".sh ") if parallel: #Running on the cluster.. 
if parallelAll: phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data for phenotypeIndex in phed.phenIds: runParallel(phenotypeIndex) elif subsampleTest: phenotypeIndex=int(args[2]) for i in range(0,numSubSamples): runParallel(phenotypeIndex,id="_r"+str(subSample)+"_"+str(i)) else: phenotypeIndex=int(args[2]) runParallel(phenotypeIndex) return else: phenotypeIndex=int(args[2]) print "phenotypeIndex:",phenotypeIndex print "output:",outputFile print "\nStarting program now!\n" #Load phenotype file phed=phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter = '\t') #Get Phenotype data #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str,original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str,original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude",onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]<onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) 
phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) elif onlyAboveLatidue: print "Filtering for the accessions which orginate above latitude",onlyAboveLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][2] and eiDict[acc][2]>onlyAboveLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and eiDict[acc][2]==None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if subSampleLikePhenotype: p_name = phed.getPhenotypeName(subSampleLikePhenotype) print "Picking sample as in",p_name ecotypes = phed.getNonNAEcotypes(subSampleLikePhenotype) print ecotypes phed.filterAccessions(ecotypes) print "len(phed.accessions)", len(phed.accessions) if subSample: sample_ecotypes = [] ecotypes = phed.getNonNAEcotypes(phenotypeIndex) sample_ecotypes = random.sample(ecotypes,subSample) phed.filterAccessions(sample_ecotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() #Load genotype file snpsds=dataParsers.parseCSVData(snpsDataFile, format = 1, deliminator = delim, missingVal = missingVal) #Checking overlap between phenotype and genotype accessions. phenotype=phed.getPhenIndex(phenotypeIndex) accIndicesToKeep=[] phenAccIndicesToKeep=[] numAcc=len(snpsds[0].accessions) sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1=snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2=phed.accessions[j] if acc1==acc2 and phed.phenotypeValues[j][phenotype]!='NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break #Filter accessions which do not have the phenotype value. 
for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc-len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all." print "Filtering phenotype data." phed.removeAccessions(phenAccIndicesToKeep) #Removing accessions that don't have genotypes or phenotype values #Ordering accessions according to the order of accessions in the genotype file accessionMapping=[] i=0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i+=1 phed.orderAccessions(accessionMapping) #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Converting format to 01 newSnpsds=[] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" #Double check genotype file: problems = 0 for i in range(0,len(newSnpsds)): snpsd = newSnpsds[i] for j in range(0,len(snpsd.snps)): snp = snpsd.snps[j] sc = snp.count(0) if sc==0 or sc==len(snp): print "Problem in file found at chr,pos",(i+1),",",snpsd.positions[i] problems += 1 if problems >0: print "Genotype file appears to have potential problems" else: print "Genotype file appears to be good" if permTest: print "Starting a permutation test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" permTest = 100 _perm_test_(allSNPs,phenVals,permTest,outputFile, test_type=test_type,savePermutations=savePermutations, filter=permutationFilter) sys.exit(0) if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in newSnpsds: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) test_type = "KW" if phed.isBinary(phenotypeIndex): test_type = "Fisher" 
_robustness_test_(allSNPs,phenVals,outputFile, test_type=test_type, filter=permutationFilter) sys.exit(0) sys.stdout.flush() print "sr:",sr, ", srSkipFirstRun:",srSkipFirstRun if (not sr) or (sr and not srSkipFirstRun): #Writing files #phed and phenotype sd=snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) phenotypeName=phed.getPhenotypeName(phenotypeIndex) if phed.isBinary(phenotypeIndex): pvals = run_fet(sd.getSnps(),phed.getPhenVals(phenotypeIndex)) else: snps = sd.getSnps() phen_vals = phed.getPhenVals(phenotypeIndex) try: kw_res = util.kruskal_wallis(snps,phen_vals) pvals = kw_res['ps'] except: print snps print phen_vals print len(snps),len(snps[0]),len(phen_vals) raise Exception res = gwaResults.Result(scores = pvals,name="KW_"+phenotypeName, snpsds=newSnpsds, load_snps=False) pvalFile=outputFile+".pvals" res.writeToFile(pvalFile) print "Generating a GW plot." res.negLogTransform() pngFile = pvalFile+".png" plotResults.plotResult(res,pngFile=pngFile,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", plotBonferroni=True,usePylab=False) srInput = pvalFile else: print "Skipping first stage analysis." sys.stdout.flush() if sr: _secondRun_(srOutput,srInput,srTopQuantile,srWindowSize,newSnpsds,phed,phenotypeIndex,binary=binary) print "Generating second run GW plot." res = gwaResults.Result(srInput,name="KW_"+phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() srRes = gwaResults.Result(srOutput,name="KW_SR_"+phenotypeName, phenotypeID=phenotypeIndex) srRes.negLogTransform() srPngFile = pvalFile+".sr.png" plotResults.plotResultWithSecondRun(res,srRes,pngFile=srPngFile,ylab="$-$log$_{10}(p)$", plotBonferroni=True)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "rFile=", "delim=", "missingval=", "crossExamine=", "statFile=", "debug", "report", "help", "withArrayId=", "strainIdentity", "heterozygous2NA" ] try: opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:s:c:vh", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) if len(args) > 2: print args raise Exception("Number of arguments isn't correct.") inputFile1 = args[0] inputFile2 = None crossExamineData = False if len(args) > 1: inputFile2 = args[1] else: crossExamineData = True rFile = None statFile = None verbose = False delim = "," missingVal = "NA" debug = None report = None withArrayIds = 0 fractionSnps = 0.05 heterozygous2NA = False for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", "--rFile"): rFile = arg elif opt in ("-s", "--statFile"): statFile = arg elif opt in ("-t", "--method"): version = arg elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg) elif opt in ("-v", "--verbose"): verbose = True elif opt in ("--heterozygous2NA"): heterozygous2NA = True elif opt in ("-c", "--crossExamine"): fractionSnps = float(arg) elif opt in ("--strainIdentity"): crossExamineData = True waid1 = withArrayIds == 1 or withArrayIds == 2 waid2 = withArrayIds == 2 snpsds1 = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) if inputFile2: snpsds2 = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid2) if crossExamineData: if inputFile2: findIdentities(snpsds1, snpsds2, withArrayIds) else: crossExamine(snpsds1, fractionSnps, waid1) return if len(snpsds1) != len(snpsds2): raise Exception("Unequal number of chromosomes in files.") res = [] naRate1 = 0 naRate2 = 0 numSNPs1 = 0 numSNPs2 = 0 for i in range(0, 
len(snpsds1)): res.append(snpsds1[i].compareWith(snpsds2[i], withArrayIds=withArrayIds, heterozygous2NA=heterozygous2NA)) naRate1 += snpsds1[i].countMissingSnps() * len(snpsds1[i].positions) naRate2 += snpsds2[i].countMissingSnps() * len(snpsds2[i].positions) numSNPs1 += len(snpsds1[i].positions) numSNPs2 += len(snpsds2[i].positions) naRate1 = naRate1 / float(numSNPs1) naRate2 = naRate2 / float(numSNPs2) import rfun totalCommonPos = 0 totalPos = [0, 0] commonAccessions = res[0][2] totalAccessionCounts = [0] * len(commonAccessions) accOverlappingCallRate = [[0] * len(commonAccessions), [0] * len(commonAccessions)] accCallRate = [[0] * len(commonAccessions), [0] * len(commonAccessions)] accErrorRate = [0] * len(commonAccessions) statstr = "#Common SNPs positions:\n" rstr = "#Snps error rates\n" rstr = "par(mfrow=c(5,1));\n" snpsErrorRate = [] totalCounts = 0 totalFails = 0 for i in range(0, len(res)): #for all chromosomes r = res[i] totalCounts += r[9][0] totalFails += r[9][1] snpsErrorRate += r[1] totalCommonPos += len(r[0]) totalPos[0] += len(snpsds1[i].positions) totalPos[1] += len(snpsds2[i].positions) statstr += "Chr. " + str(i + 1) + ":\n" statstr += str(r[0]) + "\n" xname = "commonPos_ch" + str(i + 1) ynames = ["errorRates_ch" + str(i + 1)] rstr += rfun.plotOverlayingVectors(r[0], [r[1]], xlab="Position, chr. " + str(i + 1), ylab="Error (red)", type="b", xname=xname, ynames=ynames) + "\n\n" for j in range(0, len(commonAccessions)): totalAccessionCounts[j] += r[6][j] accOverlappingCallRate[0][j] += r[4][0][j] * float(len(r[0])) accOverlappingCallRate[1][j] += r[4][1][j] * float(len(r[0])) accCallRate[0][j] += r[8][0][j] accCallRate[1][j] += r[8][1][j] accErrorRate[j] += r[3][j] * float(r[6][j]) statstr += "#Number of common SNPs positions:\n" statstr += str(totalCommonPos) + "\n" statstr += "#SNPs errors:\n" for i in range(0, len(res)): r = res[i] statstr += "Chr. 
" + str(i + 1) + ":\n" statstr += str(r[1]) + "\n" statstr += "#Average Snp Error:\n" statstr += str(sum(snpsErrorRate) / float(len(snpsErrorRate))) + "\n" statstr += "#Weighted Average Snp Error:\n" statstr += str(totalFails / float(totalCounts)) + "\n" statstr += "#Commmon accessions:\n" statstr += str(commonAccessions) + '\n' statstr += "#Number of commmon accessions:\n" statstr += str(len(commonAccessions)) + '\n' statstr += "#Number of accessions (1):\n" statstr += str(len(snpsds1[0].accessions)) + '\n' statstr += "#Number of accessions (2):\n" statstr += str(len(snpsds2[0].accessions)) + '\n' if withArrayIds: commonArrayIds = res[0][5] statstr += "#ArrayIds:\n" statstr += str(commonArrayIds) + '\n' if not verbose: print "In all", len(commonAccessions), "common accessions found" print "In all", totalCommonPos, "common snps found" print "Average Snp Error:", sum(snpsErrorRate) / float( len(snpsErrorRate)) print "NA rate (1) =", naRate1 print "NA rate (2) =", naRate2 for i in range(0, len(res)): r = res[i] xname = "commonPos_ch" + str(i + 1) ynames = [ "missingRates1_ch" + str(i + 1), "missingRates2_ch" + str(i + 1) ] rstr += rfun.plotOverlayingVectors(r[0], [r[7][0], r[7][1]], xlab="Position, chr. 
" + str(i + 1), ylab="Missing (red,green)", type="b", xname=xname, ynames=ynames) + "\n\n" for i in range(0, len(commonAccessions)): accOverlappingCallRate[0][i] = accOverlappingCallRate[0][i] / float( totalCommonPos) accOverlappingCallRate[1][i] = accOverlappingCallRate[1][i] / float( totalCommonPos) accCallRate[0][i] = accCallRate[0][i] / float(totalPos[0]) accCallRate[1][i] = accCallRate[1][i] / float(totalPos[1]) accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i]) accErrAndID = [] accMissAndID = [[], []] accOverlMissAndID = [[], []] if withArrayIds: for i in range(0, len(commonAccessions)): accErrAndID.append( (accErrorRate[i], commonAccessions[i], commonArrayIds[i])) accMissAndID[0].append( (accCallRate[0][i], commonAccessions[i], commonArrayIds[i])) accMissAndID[1].append( (accCallRate[1][i], commonAccessions[i], commonArrayIds[i])) accOverlMissAndID[0] = zip(accOverlappingCallRate[0], commonAccessions, commonArrayIds) accOverlMissAndID[1] = zip(accOverlappingCallRate[1], commonAccessions, commonArrayIds) else: for i in range(0, len(commonAccessions)): accErrAndID.append((accErrorRate[i], commonAccessions[i])) accMissAndID[0].append((accCallRate[0][i], commonAccessions[i])) accMissAndID[1].append((accCallRate[1][i], commonAccessions[i])) accOverlMissAndID[0] = zip(accOverlappingCallRate[0], commonAccessions) accOverlMissAndID[1] = zip(accOverlappingCallRate[1], commonAccessions) accErrAndID.sort( ) #05/10/08 yh. 
sort(reverse=True) is not available in python 2.3 accErrAndID.reverse() accMissAndID[0].sort() accMissAndID[0].reverse() accOverlMissAndID[1].sort() accOverlMissAndID[1].reverse() statstr += "#Sorted list, based on error rates (Error rate, ecotype id, array id):\n" for t in accErrAndID: statstr += str(t) + '\n' statstr += "#Sorted list, based on missing rates of 1st file, (Missing rate, ecotype id, array id):\n" for t in accMissAndID[0]: statstr += str(t) + '\n' statstr += "#Sorted list, based on missing rates of 2nd file, (Missing rate, ecotype id, array id):\n" for t in accMissAndID[1]: statstr += str(t) + '\n' statstr += "#Sorted list, based on (overlapping positions) missing rates of 1st file, (Missing rate, ecotype id, array id):\n" for t in accOverlMissAndID[0]: statstr += str(t) + '\n' statstr += "#Sorted list, based on (overlapping positions) missing rates of 2nd file, (Missing rate, ecotype id, array id):\n" for t in accOverlMissAndID[1]: statstr += str(t) + '\n' """ print "Sorted list, based on error rates: ",accErrAndID,'\n' accMissAndID[0].sort(reverse=True) print "Sorted list, based on missing rates (1st file): ",accMissAndID[0],'\n' accMissAndID[1].sort(reverse=True) print "Sorted list, based on missing rates (2nd file): ",accMissAndID[1],'\n' """ if withArrayIds: rstr += 'accessions<-c("' + str(r[2][0]) + "_ai" + str(r[5][0]) + '"' else: rstr += 'accessions<-c("' + str(r[2][0]) + '"' for i in range(1, len(r[2])): if withArrayIds: rstr += ',"' + str(r[2][i]) + "_ai" + str(r[5][i]) + '"' else: rstr += ',"' + str(r[2][i]) + '"' rstr += ")\n" rstr += rfun.plotVectors(accCallRate[0], [accErrorRate], xlab="Accession missing value rate", ylab="Accession error rate", xname="accMissingRate1", ynames=["accErrorRate"]) rstr += "text(accMissingRate1+0.0045,accErrorRate-0.0004,accessions)\n\n" rstr += rfun.plotVectors(accCallRate[1], [accErrorRate], xlab="Accession missing value rate", ylab="Accession error rate", xname="accMissingRate2", ynames=["accErrorRate"]) 
rstr += "text(accMissingRate2+0.0045,accErrorRate-0.0004,accessions)\n\n" if rFile: f = open(rFile, "w") f.write(rstr) f.close() if verbose: print statstr if statFile: f = open(statFile, "w") f.write(statstr) f.close()
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "rFile=", "chr=", "delim=", "missingval=", "BoundaryStart=", "removeOutliers=", "addConstant=", "logTransform", "BoundaryEnd=", "phenotypeFileType=", "help", "parallel=", "parallelAll", "LRT", "minMAF=", "kinshipDatafile=", "phenotypeRanks", "onlyMissing", "onlyOriginal96", "onlyOriginal192", "onlyBelowLatidue=", "complement", "negate", "srInput=", "sr", "srOutput=", "srPar=", "srSkipFirstRun", "testRobustness", "permutationFilter=", "useLinearRegress", "regressionCofactors=", "FriLerAsCofactor", "FriColAsCofactor", "memReq=", "walltimeReq=", ] try: opts, args = getopt.getopt(sys.argv[1:], "o:c:d:m:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) phenotypeRanks = False removeOutliers = None addConstant = -1 phenotypeFileType = 1 rFile = None delim = "," missingVal = "NA" help = 0 minMAF = 0.0 boundaries = [-1, -1] chr = None parallel = None logTransform = False negate = False parallelAll = False lrt = False kinshipDatafile = None onlyMissing = False onlyOriginal96 = False onlyOriginal192 = False onlyBelowLatidue = None complement = False sr = False srOutput = False srInput = False srSkipFirstRun = False srTopQuantile = 0.95 srWindowSize = 30000 testRobustness = False permutationFilter = 0.002 useLinearRegress = False regressionCofactors = None FriLerAsCofactor = False FriColAsCofactor = False memReq = "5g" walltimeReq = "150:00:00" for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-o", "--rFile"): rFile = arg elif opt in ("--phenotypeFileType"): phenotypeFileType = int(arg) elif opt in ("--BoundaryStart"): boundaries[0] = int(arg) elif opt in ("--BoundaryEnd"): boundaries[1] = int(arg) elif opt in ("--addConstant"): addConstant = float(arg) elif opt in ("--parallel"): parallel = arg elif opt in ("--minMAF"): minMAF = float(arg) elif opt in ("--parallelAll"): parallelAll = True elif opt in 
("--onlyMissing"): onlyMissing = True elif opt in ("--onlyOriginal96"): onlyOriginal96 = True elif opt in ("--onlyOriginal192"): onlyOriginal192 = True elif opt in ("--onlyBelowLatidue"): onlyBelowLatidue = float(arg) elif opt in ("--complement"): complement = True elif opt in ("--logTransform"): logTransform = True elif opt in ("--negate"): negate = True elif opt in ("--removeOutliers"): removeOutliers = float(arg) elif opt in ("--LRT"): lrt = True elif opt in ("-c", "--chr"): chr = int(arg) elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg elif opt in ("--kinshipDatafile"): kinshipDatafile = arg elif opt in ("--phenotypeRanks"): phenotypeRanks = True elif opt in ("--sr"): sr = True elif opt in ("--srSkipFirstRun"): srSkipFirstRun = True elif opt in ("--srInput"): srInput = arg elif opt in ("--srOutput"): srOutput = arg elif opt in ("--srPar"): vals = arg.split(",") srTopQuantile = float(vals[0]) srWindowSize = int(vals[1]) elif opt in ("--testRobustness"): testRobustness = True elif opt in ("--permutationFilter"): permutationFilter = float(arg) elif opt in ("--FriLerAsCofactor"): FriLerAsCofactor = True elif opt in ("--FriColAsCofactor"): FriColAsCofactor = True elif opt in ("--useLinearRegress"): useLinearRegress = True elif opt in ("--regressionCofactors"): regressionCofactors = arg elif opt in ("--memReq"): memReq = arg elif opt in ("--walltimeReq"): walltimeReq = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if len(args) < 3 and not parallel: if help == 0: print "Arguments are missing!!\n" print __doc__ sys.exit(2) print "Emma is being set up with the following parameters:" print "output:", rFile print "phenotypeRanks:", phenotypeRanks print "phenotypeFileType:", phenotypeFileType print "parallel:", parallel print "parallelAll:", parallelAll print "minMAF:", minMAF print "LRT:", lrt print "delim:", delim print "missingval:", missingVal print "kinshipDatafile:", kinshipDatafile print 
"chr:", chr print "boundaries:", boundaries print "onlyMissing:", onlyMissing print "onlyOriginal96:", onlyOriginal96 print "onlyOriginal192:", onlyOriginal192 print "onlyBelowLatidue:", onlyBelowLatidue print "complement:", complement print "negate:", negate print "logTransform:", logTransform print "addConstant:", addConstant print "removeOutliers:", removeOutliers print "sr:", sr print "srSkipFirstRun:", srSkipFirstRun print "srInput:", srInput print "srOutput:", srOutput print "srTopQuantile:", srTopQuantile print "srWindowSize:", srWindowSize print "testRobustness:", testRobustness print "permutationFilter:", permutationFilter print "useLinearRegress:", useLinearRegress print "regressionCofactors:", regressionCofactors print "FriLerAsCofactor:", FriLerAsCofactor print "FriColAsCofactor:", FriColAsCofactor print "walltimeReq:", walltimeReq print "memReq:", memReq def runParallel(phenotypeIndex, phed): #Cluster specific parameters print phenotypeIndex phenName = phed.getPhenotypeName(phenotypeIndex) outFileName = resultDir + "Emma_" + parallel + "_" + phenName shstr = "#!/bin/csh\n" shstr += "#PBS -l walltime=" + walltimeReq + "\n" shstr += "#PBS -l mem=" + memReq + "\n" shstr += "#PBS -q cmb\n" shstr += "#PBS -N E" + phenName + "_" + parallel + "\n" shstr += "set phenotypeName=" + parallel + "\n" shstr += "set phenotype=" + str(phenotypeIndex) + "\n" if useLinearRegress: outFileName = resultDir + "LR_" + parallel + "_" + phenName shstr += "(python " + emmadir + "Emma.py -o " + outFileName + " " if useLinearRegress: shstr += " --useLinearRegress " if regressionCofactors: shstr += " --regressionCofactors=" + str(regressionCofactors) + " " if FriLerAsCofactor: shstr += " --FriLerAsCofactor " if FriColAsCofactor: shstr += " --FriColAsCofactor " if onlyOriginal96: shstr += " --onlyOriginal96 " elif onlyOriginal192: shstr += " --onlyOriginal192 " if onlyBelowLatidue: shstr += " --onlyBelowLatidue=" + str(onlyBelowLatidue) + " " if logTransform: shstr += " 
--logTransform " if negate: shstr += " --negate " if removeOutliers: shstr += " --removeOutliers=" + str(removeOutliers) + " " if phenotypeRanks: shstr += " --phenotypeRanks " if testRobustness: shstr += " --testRobustness " shstr += " --permutationFilter=" + str(permutationFilter) + " " if sr: shstr += " --sr " if not srOutput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" shstr += " --srOutput=" + str(output) + " " if srSkipFirstRun: if not srInput: output = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" shstr += " --srInput=" + str(output) + " " shstr += " --srSkipFirstRun " shstr += " --srPar=" + str(srTopQuantile) + "," + str( srWindowSize) + " " if kinshipDatafile: shstr += " --kinshipDatafile=" + str(kinshipDatafile) + " " shstr += " --addConstant=" + str(addConstant) + " " shstr += snpsDataFile + " " + phenotypeDataFile + " " + str( phenotypeIndex) + " " shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n" f = open(parallel + ".sh", 'w') f.write(shstr) f.close() #Execute qsub script os.system("qsub " + parallel + ".sh ") snpsDataFile = args[0] phenotypeDataFile = args[1] if parallel: #Running on the cluster.. phed = phenotypeData.readPhenotypeFile( phenotypeDataFile, delimiter='\t') #Get Phenotype data if parallelAll: for phenotypeIndex in phed.phenIds: if onlyMissing: phenName = phed.getPhenotypeName(phenotypeIndex) pvalFile = resultDir + "Emma_" + parallel + "_" + phenName + ".pvals" res = None try: res = os.stat(pvalFile) except Exception: print "File", pvalFile, "does not exist." if res and res.st_size > 0: print "File", pvalFile, "already exists, and is non-empty." if sr: srInput = resultDir + "Emma_" + parallel + "_" + phenName + ".sr.pvals" srRes = None try: srRes = os.stat(srInput) except Exception: print "File", srInput, "does not exist." if srRes and srRes.st_size > 0: print "File", srInput, "already exists, and is non-empty." 
else: runParallel(phenotypeIndex, phed) else: print "Setting up the run." runParallel(phenotypeIndex, phed) else: runParallel(phenotypeIndex, phed) else: phenotypeIndex = int(args[2]) runParallel(phenotypeIndex, phed) return else: phenotypeIndex = int(args[2]) print "phenotypeIndex:", phenotypeIndex print "\nStarting program now!\n" snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal) #Load phenotype file phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t') #Get Phenotype data numAcc = len(snpsds[0].accessions) #Removing outliers if removeOutliers: print "Remoing outliers" phed.naOutliers(phenotypeIndex, removeOutliers) #If onlyOriginal96, then remove all other phenotypes.. if onlyOriginal96: print "Filtering for the first 96 accessions" original_96_ecotypes = phenotypeData._getFirst96Ecotypes_() original_96_ecotypes = map(str, original_96_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_96_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_96_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyOriginal192: print "Filtering for the first 192 accessions" original_192_ecotypes = phenotypeData._getFirst192Ecotypes_() original_192_ecotypes = map(str, original_192_ecotypes) keepEcotypes = [] if complement: for acc in phed.accessions: if not acc in original_192_ecotypes: keepEcotypes.append(acc) else: keepEcotypes = original_192_ecotypes phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) if onlyBelowLatidue: print "Filtering for the accessions which orginate below latitude", onlyBelowLatidue eiDict = phenotypeData._getEcotypeIdInfoDict_() print eiDict keepEcotypes = [] for acc in phed.accessions: acc = int(acc) if eiDict.has_key(acc) and eiDict[acc][ 2] and eiDict[acc][2] < onlyBelowLatidue: keepEcotypes.append(str(acc)) elif eiDict.has_key(acc) and 
eiDict[acc][2] == None: keepEcotypes.append(str(acc)) phed.filterAccessions(keepEcotypes) print "len(phed.accessions)", len(phed.accessions) sys.stdout.write("Finished prefiltering phenotype accessions.\n") sys.stdout.flush() phenotype = phed.getPhenIndex(phenotypeIndex) accIndicesToKeep = [] phenAccIndicesToKeep = [] #Checking which accessions to keep and which to remove . for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break print "\nFiltering accessions in genotype data:" #Filter accessions which do not have the phenotype value (from the genotype data). for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "\nNow filtering accessions in phenotype data:" phed.removeAccessions( phenAccIndicesToKeep ) #Removing accessions that don't have genotypes or phenotype values print "Verifying number of accessions: len(phed.accessions)==len(snpsds[0].accessions) is", len( phed.accessions) == len(snpsds[0].accessions) if len(phed.accessions) != len(snpsds[0].accessions): raise Exception #Filtering monomorphic print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" #Remove minor allele frequencies if minMAF != 0: sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.filterMinMAF(minMAF) #Removing SNPs which are outside of boundaries. if chr: print "\nRemoving SNPs which are outside of boundaries." 
snpsds[chr - 1].filterRegion(boundaries[0], boundaries[1]) snpsds = [snpsds[chr - 1]] #Ordering accessions in genotype data to fit phenotype data. print "Ordering genotype data accessions." accessionMapping = [] i = 0 for acc in phed.accessions: if acc in snpsds[0].accessions: accessionMapping.append((snpsds[0].accessions.index(acc), i)) i += 1 #print zip(accessionMapping,snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "\nGenotype data has been ordered." #Converting format to 01 newSnpsds = [] sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData(missingVal=missingVal)) print "" print "Checking kinshipfile:", kinshipDatafile if kinshipDatafile: #Is there a special kinship file? kinshipSnpsds = dataParsers.parseCSVData(kinshipDatafile, format=1, deliminator=delim, missingVal=missingVal) accIndicesToKeep = [] #Checking which accessions to keep and which to remove (genotype data). sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + ".") sys.stdout.flush() for i in range(0, len(kinshipSnpsds[0].accessions)): acc1 = kinshipSnpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA': accIndicesToKeep.append(i) break print accIndicesToKeep for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len( accIndicesToKeep ), "accessions removed from kinship genotype data, leaving", len( accIndicesToKeep), "accessions in all." print "Ordering kinship data accessions." 
accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in kinshipSnpsds[0].accessions: accessionMapping.append( (kinshipSnpsds[0].accessions.index(acc), i)) i += 1 print zip(accessionMapping, snpsds[0].accessions) print "len(snpsds[0].snps)", len(snpsds[0].snps) for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.orderAccessions(accessionMapping) print "Kinship genotype data has been ordered." newKinshipSnpsds = [] sys.stdout.write("Converting data format") for snpsd in kinshipSnpsds: sys.stdout.write(".") sys.stdout.flush() newKinshipSnpsds.append(snpsd.getSnpsData( missingVal=missingVal)) #This data might have NAs print "" kinshipSnpsds = newKinshipSnpsds else: kinshipSnpsds = newSnpsds print "Found kinship data." #Ordering accessions according to the order of accessions in the genotype file # accessionMapping = [] # i = 0 # for acc in snpsds[0].accessions: # if acc in phed.accessions: # accessionMapping.append((phed.accessions.index(acc),i)) # i += 1 # phed.orderAccessions(accessionMapping) #Negating phenotypic values if negate: phed.negateValues(phenotypeIndex) if logTransform and not phed.isBinary( phenotypeIndex) and phed.getMinValue(phenotypeIndex) <= 0: addConstant = 0 #Adding a constant. 
if addConstant != -1: if addConstant == 0: addConstant = math.sqrt(phed.getVariance(phenotypeIndex)) / 10 addConstant = addConstant - phed.getMinValue(phenotypeIndex) print "Adding a constant to phenotype:", addConstant phed.addConstant(phenotypeIndex, addConstant) #Log-transforming if logTransform: print "Log transforming phenotype" phed.logTransform(phenotypeIndex) #Converting phenotypes to Ranks elif phenotypeRanks: phed.transformToRanks(phenotypeIndex) if not chr: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [1, 2, 3, 4, 5]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [1, 2, 3, 4, 5]) else: snpsDataset = snpsdata.SNPsDataSet(newSnpsds, [chr]) kinshipSnpsDataset = snpsdata.SNPsDataSet(kinshipSnpsds, [chr]) phenotypeName = phed.getPhenotypeName(phenotypeIndex) sys.stdout.flush() if testRobustness: print "Starting a robustness test" allSNPs = [] for snpsd in snpsDataset.snpsDataList: allSNPs += snpsd.snps phenVals = phed.getPhenVals(phenotypeIndex) _robustness_test_(allSNPs, phenVals, rFile, filter=permutationFilter) sys.exit(0) if useLinearRegress: phenVals = phed.getPhenVals(phenotypeIndex) d0 = {} d0["phen"] = phenVals dh = {} dh["phen"] = phenVals import rpy, gc if regressionCofactors: #Adds ler and col as cofactors import pickle f = open(regressionCofactors, "r") co_factors = pickle.load(f) f.close() #inserting co factors into model for factor in co_factors: d[factor] = co_factors[factor] import analyzeHaplotype as ah (ler_factor, col_factor) = ah.getLerAndColAccessions(newSnpsds, True) if FriColAsCofactor: d0["col"] = col_factor dh["col"] = col_factor if FriLerAsCofactor: d0["ler"] = ler_factor dh["ler"] = ler_factor chr_pos_pvals = [] stats = [] sys.stdout.write("Applying the linear model") sys.stdout.flush() for i in range(0, len(newSnpsds)): #[3]:# snpsd = newSnpsds[i] sys.stdout.write("|") sys.stdout.flush() gc.collect( ) #Calling garbage collector, in an attempt to clean up memory.. 
for j in range(0, len(snpsd.snps)): if j % 5000 == 0: sys.stdout.write(".") sys.stdout.flush() #if snpsd.positions[j]>1700000: # break snp = snpsd.snps[j] d0["snp"] = snp try: rpy.set_default_mode(rpy.NO_CONVERSION) aov0 = rpy.r.aov(r("phen ~ ."), data=d0) aovh = rpy.r.aov(r("phen ~ ."), data=dh) rpy.set_default_mode(rpy.BASIC_CONVERSION) s0 = rpy.r.summary(aov0) sh = rpy.r.summary(aovh) #print s0,sh rss_0 = s0['Sum Sq'][-1] if type(sh['Sum Sq']) != float: rss_h = sh['Sum Sq'][-1] else: rss_h = sh['Sum Sq'] f = (rss_h - rss_0) / (rss_0 / (len(phenVals) - len(d0) + 1)) pval = rpy.r.pf(f, 1, len(phenVals), lower_tail=False) except Exception, err_str: print "Calculating p-value failed" #,err_str pval = 1.0 #print "dh:",dh #print "d0:",d0 #print "rss_h,rss_0:",rss_h,rss_0 #print "f,p:",f,pval chr_pos_pvals.append([i + 1, snpsd.positions[j], pval]) mafc = min(snp.count(snp[0]), len(snp) - snp.count(snp[0])) maf = mafc / float(len(snp)) stats.append([maf, mafc]) sys.stdout.write("\n") #Write out to a result file sys.stdout.write("Writing results to file\n") sys.stdout.flush() pvalFile = rFile + ".pvals" f = open(pvalFile, "w") f.write("Chromosome,position,p-value,marf,maf\n") for i in range(0, len(chr_pos_pvals)): chr_pos_pval = chr_pos_pvals[i] stat = stats[i] f.write( str(chr_pos_pval[0]) + "," + str(chr_pos_pval[1]) + "," + str(chr_pos_pval[2]) + "," + str(stat[0]) + "," + str(stat[1]) + "\n") f.close() #Plot results print "Generating a GW plot." phenotypeName = phed.getPhenotypeName(phenotypeIndex) res = gwaResults.Result(pvalFile, name="LM_" + phenotypeName, phenotypeID=phenotypeIndex) res.negLogTransform() pngFile = pvalFile + ".png" plotResults.plotResult(res, pngFile=pngFile, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$", plotBonferroni=True, usePylab=False)
def _run_(): if len(sys.argv) == 1: print __doc__ sys.exit(2) long_options_list = [ "outputSNPsFile=", "outputPhenotFile=", "filterMonomorphic", "rawDataFormat", "delim=", "missingval=", "withArrayId=", "phenotype=", "phenotypeFile=", "phenotypeName=", "calcKinshipMatrix=", "orderAccessions", "help", ] try: opts, args = getopt.getopt(sys.argv[1:], "o:u:d:m:a:f:p:h", long_options_list) except: traceback.print_exc() print sys.exc_info() print __doc__ sys.exit(2) inputFile = args[0] output_fname = None outputPhenotFile = None delim = "," missingVal = "NA" phenotypeFile = None kinshipMatrixFile = None phenotype = None phenotypeName = None rawDataFormat = False monomorphic = False help = 0 withArrayIds = 1 orderAccessions = False for opt, arg in opts: if opt in ("-h", "--help"): help = 1 print __doc__ elif opt in ("-a", "--withArrayId"): withArrayIds = int(arg) elif opt in ("-f", "--phenotypeFile"): phenotypeFile = arg elif opt in ("calcKinshipMatrix"): kinshipMatrixFile = arg elif opt in ("--filterMonomorphic"): monomorphic = True elif opt in ("--rawDataFormat"): rawDataFormat = True elif opt in ("--minCallProb"): minCallProb = float(arg) elif opt in ("-p", "--phenotype"): phenotype = int(arg) elif opt in ("-o", "--outputSNPsFile"): output_fname = arg elif opt in ("--orderAccessions"): orderAccessions = True elif opt in ("-u", "--phenotypeFile"): outputPhenotFile = arg elif opt in ("-d", "--delim"): delim = arg elif opt in ("-m", "--missingval"): missingVal = arg else: if help == 0: print "Unkown option!!\n" print __doc__ sys.exit(2) if not output_fname: print output_fname if help == 0: print "Output file missing!!\n" print __doc__ sys.exit(2) waid1 = withArrayIds == 1 or withArrayIds == 2 waid2 = withArrayIds == 2 import dataParsers snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1) if phenotypeFile: import phenotypeData phed = phenotypeData.readPhenotypeFile(phenotypeFile, delimiter="\t") # Get 
Phenotype data accIndicesToKeep = [] phenAccIndicesToKeep = [] numAcc = len(snpsds[0].accessions) if phenotype >= 0: # Load phenotype file sys.stdout.write( "Removing accessions which do not have a phenotype value for " + phed.phenotypeNames[phenotype] + "." ) sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != "NA": accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break elif phenotype == None: sys.stdout.write("Removing accessions which do not have any phenotype values.") sys.stdout.flush() for i in range(0, len(snpsds[0].accessions)): acc1 = snpsds[0].accessions[i] for j in range(0, len(phed.accessions)): acc2 = phed.accessions[j] if acc1 == acc2: accIndicesToKeep.append(i) phenAccIndicesToKeep.append(j) break # Filter Accessions which do not have the phenotype value. for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() snpsd.removeAccessionIndices(accIndicesToKeep) print "" print numAcc - len(accIndicesToKeep), "accessions removed, leaving", len(accIndicesToKeep), "accessions in all." if outputPhenotFile: print "Filtering phenotype data." 
phed.removeAccessions(phenAccIndicesToKeep) if orderAccessions: accessionMapping = [] i = 0 for acc in snpsds[0].accessions: if acc in phed.accessions: accessionMapping.append((phed.accessions.index(acc), i)) i += 1 phed.orderAccessions(accessionMapping) if phenotype >= 0: phed.writeToFile(outputPhenotFile, [phenotype]) else: phed.writeToFile(outputPhenotFile) # Filtering monomorphic if monomorphic: print "Filtering monomorphic SNPs" for snpsd in snpsds: print "Removed", str(snpsd.filterMonoMorphicSnps()), "Snps" import snpsdata newSnpsds = [] if not rawDataFormat: sys.stdout.write("Converting data format") for snpsd in snpsds: sys.stdout.write(".") sys.stdout.flush() newSnpsds.append(snpsd.getSnpsData()) print "" waid1 = 0 snpsDataset = snpsdata.SnpsDataSet(newSnpsds, [1, 2, 3, 4, 5]) decoder = {1: 1, 0: 0, -1: "NA"} else: snpsDataset = snpsdata.SnpsDataSet(snpsds, [1, 2, 3, 4, 5]) decoder = None snpsDataset.writeToFile(output_fname, deliminator=delim, missingVal=missingVal, withArrayIds=waid1, decoder=decoder)
def _run_():
    """
    Command-line entry point for comparing SNP data files.

    Reads options and one or two input file names from ``sys.argv``:

    * one input file: ``crossExamine`` is run on it and the function returns;
    * two input files plus ``--strainIdentity``: ``findIdentities`` is run on
      the pair and the function returns;
    * two input files otherwise: the files are compared chromosome by
      chromosome with ``compareWith``; an R plotting script is written to
      the ``-o/--rFile`` file and a statistics report to the
      ``-s/--statFile`` file (the report is printed instead of the short
      summary when ``-v/--verbose`` is given).

    Other options: ``-d/--delim``, ``-m/--missingval``, ``-a/--withArrayId``
    (0, 1 or 2), ``-c/--crossExamine`` (fraction of SNPs),
    ``--heterozygous2NA``, ``-h/--help``.  ``--debug`` and ``--report`` are
    accepted but have no effect.

    Exits with status 2, after printing the module docstring, when no
    arguments are given, when the options cannot be parsed, or when no input
    file is named.  Raises ``Exception`` when more than two input files are
    given or when the two files hold a different number of chromosomes.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["rFile=", "delim=", "missingval=", "crossExamine=", "statFile=",
                         "debug", "report", "help", "verbose", "withArrayId=",
                         "strainIdentity", "heterozygous2NA"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:s:c:vh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    if len(args) > 2:
        print(args)
        raise Exception("Number of arguments isn't correct.")
    if not args:
        # Only options were given (e.g. "-h"): there is nothing to compare.
        print(__doc__)
        sys.exit(2)

    inputFile1 = args[0]
    inputFile2 = None
    crossExamineData = False
    if len(args) > 1:
        inputFile2 = args[1]
    else:
        crossExamineData = True

    rFile = None
    statFile = None
    verbose = False
    delim = ","
    missingVal = "NA"
    withArrayIds = 0
    fractionSnps = 0.05
    heterozygous2NA = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(__doc__)
        elif opt in ("-o", "--rFile"):
            rFile = arg
        elif opt in ("-s", "--statFile"):
            statFile = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt == "--heterozygous2NA":
            heterozygous2NA = True
        elif opt in ("-c", "--crossExamine"):
            fractionSnps = float(arg)
        elif opt == "--strainIdentity":
            crossExamineData = True
        # --debug and --report fall through: accepted, no effect.

    # withArrayIds: 1 -> only the first file has array ids, 2 -> both files.
    waid1 = withArrayIds == 1 or withArrayIds == 2
    waid2 = withArrayIds == 2

    snpsds1 = dataParsers.parseCSVData(inputFile1, format=1, deliminator=delim,
                                       missingVal=missingVal, withArrayIds=waid1)
    if inputFile2:
        snpsds2 = dataParsers.parseCSVData(inputFile2, format=1, deliminator=delim,
                                           missingVal=missingVal, withArrayIds=waid2)

    if crossExamineData:
        if inputFile2:
            findIdentities(snpsds1, snpsds2, withArrayIds)
        else:
            crossExamine(snpsds1, fractionSnps, waid1)
        return

    if len(snpsds1) != len(snpsds2):
        raise Exception("Unequal number of chromosomes in files.")

    # One compareWith() result per chromosome.  As used below the result is
    # indexed as: [0] common positions, [1] per-SNP error rates, [2] common
    # accessions, [3] per-accession error rates, [4] per-accession rates on
    # overlapping positions (file 1, file 2), [5] array ids, [6] per-accession
    # counts, [7] per-SNP missing rates (file 1, file 2), [8] per-accession
    # rates (file 1, file 2), [9] (total counts, total fails).
    # NOTE(review): meanings inferred from how the values are used here --
    # confirm against compareWith().
    res = []
    naRate1 = 0
    naRate2 = 0
    numSNPs1 = 0
    numSNPs2 = 0
    for sd1, sd2 in zip(snpsds1, snpsds2):
        res.append(sd1.compareWith(sd2, withArrayIds=withArrayIds, heterozygous2NA=heterozygous2NA))
        naRate1 += sd1.countMissingSnps() * len(sd1.positions)
        naRate2 += sd2.countMissingSnps() * len(sd2.positions)
        numSNPs1 += len(sd1.positions)
        numSNPs2 += len(sd2.positions)
    naRate1 = naRate1 / float(numSNPs1)
    naRate2 = naRate2 / float(numSNPs2)

    import rfun
    totalCommonPos = 0
    totalPos = [0, 0]
    commonAccessions = res[0][2]
    numAccessions = len(commonAccessions)
    totalAccessionCounts = [0] * numAccessions
    accOverlappingCallRate = [[0] * numAccessions, [0] * numAccessions]
    accCallRate = [[0] * numAccessions, [0] * numAccessions]
    accErrorRate = [0] * numAccessions

    # Both outputs are collected as lists of chunks and joined at the end.
    stat = ["#Common SNPs positions:\n"]
    # The header used to be overwritten by the par() line ("=" instead of
    # "+="); both lines are now written to the R script.
    rparts = ["#Snps error rates\n", "par(mfrow=c(5,1));\n"]
    snpsErrorRate = []
    totalCounts = 0
    totalFails = 0
    for i, r in enumerate(res):  # for all chromosomes
        chrom = str(i + 1)
        numCommon = len(r[0])
        totalCounts += r[9][0]
        totalFails += r[9][1]
        snpsErrorRate += r[1]
        totalCommonPos += numCommon
        totalPos[0] += len(snpsds1[i].positions)
        totalPos[1] += len(snpsds2[i].positions)
        stat.append("Chr. " + chrom + ":\n")
        stat.append(str(r[0]) + "\n")
        rparts.append(rfun.plotOverlayingVectors(r[0], [r[1]], xlab="Position, chr. " + chrom,
                                                 ylab="Error (red)", type="b",
                                                 xname="commonPos_ch" + chrom,
                                                 ynames=["errorRates_ch" + chrom]) + "\n\n")
        # Accumulate per-accession values weighted by this chromosome's size;
        # they are normalised once all chromosomes have been seen.
        for j in range(numAccessions):
            totalAccessionCounts[j] += r[6][j]
            accOverlappingCallRate[0][j] += r[4][0][j] * float(numCommon)
            accOverlappingCallRate[1][j] += r[4][1][j] * float(numCommon)
            accCallRate[0][j] += r[8][0][j]
            accCallRate[1][j] += r[8][1][j]
            accErrorRate[j] += r[3][j] * float(r[6][j])

    avgSnpError = sum(snpsErrorRate) / float(len(snpsErrorRate))

    stat.append("#Number of common SNPs positions:\n")
    stat.append(str(totalCommonPos) + "\n")
    stat.append("#SNPs errors:\n")
    for i, r in enumerate(res):
        stat.append("Chr. " + str(i + 1) + ":\n")
        stat.append(str(r[1]) + "\n")
    stat.append("#Average Snp Error:\n")
    stat.append(str(avgSnpError) + "\n")
    stat.append("#Weighted Average Snp Error:\n")
    stat.append(str(totalFails / float(totalCounts)) + "\n")
    stat.append("#Commmon accessions:\n")
    stat.append(str(commonAccessions) + '\n')
    stat.append("#Number of commmon accessions:\n")
    stat.append(str(len(commonAccessions)) + '\n')
    stat.append("#Number of accessions (1):\n")
    stat.append(str(len(snpsds1[0].accessions)) + '\n')
    stat.append("#Number of accessions (2):\n")
    stat.append(str(len(snpsds2[0].accessions)) + '\n')
    if withArrayIds:
        commonArrayIds = res[0][5]
        stat.append("#ArrayIds:\n")
        stat.append(str(commonArrayIds) + '\n')

    if not verbose:
        print("In all %s common accessions found" % len(commonAccessions))
        print("In all %s common snps found" % totalCommonPos)
        print("Average Snp Error: %s" % avgSnpError)
        print("NA rate (1) = %s" % naRate1)
        print("NA rate (2) = %s" % naRate2)

    for i, r in enumerate(res):
        chrom = str(i + 1)
        rparts.append(rfun.plotOverlayingVectors(r[0], [r[7][0], r[7][1]],
                                                 xlab="Position, chr. " + chrom,
                                                 ylab="Missing (red,green)", type="b",
                                                 xname="commonPos_ch" + chrom,
                                                 ynames=["missingRates1_ch" + chrom,
                                                         "missingRates2_ch" + chrom]) + "\n\n")

    # Turn the accumulated sums into per-accession rates.
    for i in range(numAccessions):
        accOverlappingCallRate[0][i] = accOverlappingCallRate[0][i] / float(totalCommonPos)
        accOverlappingCallRate[1][i] = accOverlappingCallRate[1][i] / float(totalCommonPos)
        accCallRate[0][i] = accCallRate[0][i] / float(totalPos[0])
        accCallRate[1][i] = accCallRate[1][i] / float(totalPos[1])
        accErrorRate[i] = accErrorRate[i] / float(totalAccessionCounts[i])

    # (rate, ecotype id[, array id]) tuples for the ranked sections.
    if withArrayIds:
        idColumns = [commonAccessions, commonArrayIds]
    else:
        idColumns = [commonAccessions]
    accErrAndID = list(zip(accErrorRate, *idColumns))
    accMissAndID = [list(zip(accCallRate[0], *idColumns)),
                    list(zip(accCallRate[1], *idColumns))]
    accOverlMissAndID = [list(zip(accOverlappingCallRate[0], *idColumns)),
                         list(zip(accOverlappingCallRate[1], *idColumns))]

    rankedSections = [
        ("#Sorted list, based on error rates (Error rate, ecotype id, array id):\n",
         accErrAndID),
        ("#Sorted list, based on missing rates of 1st file, (Missing rate, ecotype id, array id):\n",
         accMissAndID[0]),
        ("#Sorted list, based on missing rates of 2nd file, (Missing rate, ecotype id, array id):\n",
         accMissAndID[1]),
        ("#Sorted list, based on (overlapping positions) missing rates of 1st file, (Missing rate, ecotype id, array id):\n",
         accOverlMissAndID[0]),
        ("#Sorted list, based on (overlapping positions) missing rates of 2nd file, (Missing rate, ecotype id, array id):\n",
         accOverlMissAndID[1]),
    ]
    for header, ranking in rankedSections:
        # Every section is announced as sorted, so every list is sorted
        # (previously two of them were left in accession order).
        # 05/10/08 yh. sort(reverse=True) is not available in python 2.3
        ranking.sort()
        ranking.reverse()
        stat.append(header)
        for t in ranking:
            stat.append(str(t) + '\n')

    # Accession labels for the R plots, taken from the last chromosome's result.
    lastResult = res[-1]
    if withArrayIds:
        labels = ['"' + str(acc) + "_ai" + str(arrayId) + '"'
                  for acc, arrayId in zip(lastResult[2], lastResult[5])]
    else:
        labels = ['"' + str(acc) + '"' for acc in lastResult[2]]
    rparts.append('accessions<-c(' + ",".join(labels) + ")\n")
    rparts.append(rfun.plotVectors(accCallRate[0], [accErrorRate], xlab="Accession missing value rate",
                                   ylab="Accession error rate", xname="accMissingRate1",
                                   ynames=["accErrorRate"]))
    rparts.append("text(accMissingRate1+0.0045,accErrorRate-0.0004,accessions)\n\n")
    rparts.append(rfun.plotVectors(accCallRate[1], [accErrorRate], xlab="Accession missing value rate",
                                   ylab="Accession error rate", xname="accMissingRate2",
                                   ynames=["accErrorRate"]))
    rparts.append("text(accMissingRate2+0.0045,accErrorRate-0.0004,accessions)\n\n")

    rstr = "".join(rparts)
    statstr = "".join(stat)

    if rFile:
        f = open(rFile, "w")
        try:
            f.write(rstr)
        finally:
            f.close()
    if verbose:
        print(statstr)
    if statFile:
        f = open(statFile, "w")
        try:
            f.write(statstr)
        finally:
            f.close()
def _run_():
    """
    Command-line entry point for running CAMP on one phenotype.

    Positional arguments: ``snpsDataFile phenotypeDataFile [phenotypeId]``.

    Options: ``-o/--outputFile``, ``-d/--delim``, ``-m/--missingval``,
    ``-n/--sampleNum``, ``--useFloats``, ``--parallel=<name>``,
    ``--parallelAll``, ``-h/--help``.

    With ``--parallel`` a csh/PBS script named ``<name>.sh`` is written and
    submitted with ``qsub`` (one job per phenotype id when ``--parallelAll``
    is also given) and the function returns.  Otherwise the genotype and
    phenotype files are loaded, restricted to accessions that have a value
    for the phenotype, monomorphic SNPs are filtered, the data is written to
    temporary files, ``_runCAMP_`` is called once per chromosome and the
    collected scores are written to ``<outputFile>.scores``.

    Exits with status 2, after printing a message and/or the module
    docstring, when the options cannot be parsed or required arguments are
    missing.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["outputFile=", "delim=", "missingval=", "sampleNum=", "parallel=",
                         "parallelAll", "useFloats", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:n:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    outputFile = None
    delim = ","
    missingVal = "NA"
    helpShown = 0
    withArrayIds = 1
    parallel = None
    parallelAll = False
    sampleNum = None
    chromosomes = [1, 2, 3, 4, 5]
    useFloats = False

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-o", "--outputFile"):
            outputFile = arg
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-n", "--sampleNum"):
            # Was ("n", ...): "-n" never matched and was reported as unknown,
            # although the generated job script passes "-n" itself.
            sampleNum = int(arg)
        elif opt == "--useFloats":
            useFloats = True
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # The phenotype id is only optional when jobs are submitted for every
    # phenotype in the file.
    if parallel and parallelAll:
        numRequiredArgs = 2
    else:
        numRequiredArgs = 3
    if len(args) < numRequiredArgs:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    print("CAMP is being set up with the following parameters:")
    print("phenotypeDataFile: %s" % phenotypeDataFile)
    if len(args) > 2:
        print("Phenotype_id: %s" % args[2])
    print("snpsDataFile: %s" % snpsDataFile)
    print("parallel: %s" % parallel)
    print("parallelAll: %s" % parallelAll)
    print("sampleNum: %s" % sampleNum)

    def runParallel(phenotypeIndex, id=""):
        """Write a PBS job script that reruns this script for one phenotype and submit it."""
        # Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # Make the phenotype name usable in file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        # resultDir and scriptDir are expected to be defined at module level.
        outputFile = resultDir + "CAMP_" + parallel + "_" + phenName + id

        shstr = ("#!/bin/csh\n"
                 "#PBS -l walltime=24:00:00\n"
                 "#PBS -l mem=6g\n"
                 "#PBS -q cmb\n")
        shstr += "#PBS -N C" + phenName + "_" + parallel + "\n"
        shstr += "set phenotypeName=" + parallel + "\n"
        shstr += "set phenotype=" + str(phenotypeIndex) + "\n"
        shstr += "(python " + scriptDir + "Camp.py -o " + outputFile + " "
        if sampleNum:
            shstr += " -n " + str(sampleNum) + " "
        if useFloats:
            shstr += " --useFloats "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(phenotypeIndex) + " "
        shstr += "> " + outputFile + "_job" + ".out) >& " + outputFile + "_job" + ".err\n"

        f = open(parallel + ".sh", "w")
        try:
            f.write(shstr)
        finally:
            f.close()

        # Execute qsub script
        os.system("qsub " + parallel + ".sh ")

    if parallel:  # Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            runParallel(int(args[2]))
        return

    phenotypeIndex = int(args[2])

    if outputFile is None:
        # Every result file name is derived from it; fail before the heavy work.
        print("Output file is missing!!\n")
        print(__doc__)
        sys.exit(2)

    # Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter="\t")  # Get Phenotype data

    # Load genotype file
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim,
                                      missingVal=missingVal, withArrayIds=withArrayIds)

    # Checking overlap between phenotype and genotype accessions.
    phenotype = phed.getPhenIndex(phenotypeIndex)
    numAcc = len(snpsds[0].accessions)
    sys.stdout.write("Removing accessions which do not have a phenotype value for "
                     + phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    # First phenotype row with a non-missing value, per accession.
    phenRowOfAccession = {}
    for j, acc in enumerate(phed.accessions):
        if acc not in phenRowOfAccession and phed.phenotypeValues[j][phenotype] != "NA":
            phenRowOfAccession[acc] = j
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    for i, acc in enumerate(snpsds[0].accessions):
        if acc in phenRowOfAccession:
            accIndicesToKeep.append(i)
            phenAccIndicesToKeep.append(phenRowOfAccession[acc])

    # Filter accessions which do not have the phenotype value.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed, leaving %s accessions in all."
          % (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    phed.removeAccessions(phenAccIndicesToKeep)  # Removing accessions that don't have genotypes or phenotype values

    # Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    # Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % str(snpsd.filterMonoMorphicSnps()))

    # Converting format to 01
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")

    # Writing phenotype data to CAMP format.
    (fId, phenotypeFile) = tempfile.mkstemp()
    os.close(fId)
    phenVals = phed.getPhenVals(phenotypeIndex, asString=False)
    if not useFloats:
        phenVals = [int(value) for value in phenVals]
    phenFile = open(phenotypeFile, "w")
    try:
        for value in phenVals:
            phenFile.write(str(value) + "\n")
    finally:
        phenFile.close()

    chromosome_list = []
    positions_list = []
    scores_list = []
    mafs = []
    marfs = []
    # Writing SNP data to CAMP format.
    for chromosome in chromosomes:
        (fId, snpsFile) = tempfile.mkstemp()
        os.close(fId)
        (fId, posFile) = tempfile.mkstemp()
        os.close(fId)
        snpsd = newSnpsds[chromosome - 1]
        sf = open(snpsFile, "w")
        pf = open(posFile, "w")
        try:
            for i, snp in enumerate(snpsd.snps):
                (marf, maf) = snpsdata.getMAF(snp)
                marfs.append(marf)
                mafs.append(maf)
                # Every allele is written twice in a row.
                sf.write("".join([nt + nt for nt in [str(allele) for allele in snp]]) + "\n")
                pf.write(str(snpsd.positions[i]) + "\n")
        finally:
            sf.close()
            pf.close()

        outFile = outputFile + "_job_" + str(chromosome) + ".out"
        errFile = outputFile + "_job_" + str(chromosome) + ".err"
        resFile = outputFile + "_" + str(chromosome) + ".out"
        print("resFile,outFile,errFile,snpsFile,posFile,phenotypeFile: %s %s %s %s %s %s"
              % (resFile, outFile, errFile, snpsFile, posFile, phenotypeFile))
        results = _runCAMP_(resFile, outFile, errFile, snpsFile, posFile, phenotypeFile, sampleNum)

        positions_list += results["positions"]
        scores_list += results["scores"]
        for (i, j) in results["snpIndices"]:
            if not (j < 0 or i < 0):
                marfs.append(0.5)  # An ugly hack!!!
                mafs.append(0.5)
            chromosome_list.append(chromosome)

    # NOTE(review): marfs/mafs hold one entry per input SNP plus the 0.5
    # placeholders appended above, but are indexed by result row below, so the
    # MARF/MAF columns look misaligned with the scores -- confirm before
    # relying on them.
    scoreFile = outputFile + ".scores"
    f = open(scoreFile, "w")
    try:
        f.write("Chromosome,Position,Score,MARF,MAF,Second_Position\n")
        for i in range(len(positions_list)):
            (pos1, pos2) = positions_list[i]
            row = [chromosome_list[i], pos1, scores_list[i], marfs[i], mafs[i], pos2]
            f.write(",".join([str(value) for value in row]) + "\n")
    finally:
        f.close()
def analyzeSNPs():
    """
    Run a Kruskal-Wallis scan over one chromosome-5 region on three SNP sets.

    Three data sets are loaded from hard-coded paths: the 250k array data,
    the imputed sequence SNPs and the imputed 2010 data.  The 250k and 2010
    sets are cut down to the region 3140000-3220000 on chromosome 5; the
    250k and sequence sets are restricted to their shared accessions and put
    in the same order.  For every phenotype in the FLC phenotype file
    ``KW.run_kw`` is run on a deep copy of each set, the three results are
    plotted together with ``regionPlotter``, and a box plot is drawn for the
    top-scoring marker of each set.

    Takes no arguments and returns nothing; all output is printed or written
    as PDF files under ``/Users/bjarnivilhjalmsson/tmp/``.
    """
    import copy

    import KW
    import phenotypeData
    import regionPlotter as rp
    # NOTE(review): the imports below are not used in this function; they are
    # kept in case importing them has side effects -- confirm and drop.
    import phenotype_parsers
    import Emma
    import analyzePhenotype as ap
    import analyzeSNPResult as asr

    result_id = "filtered_imputed"
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    tmp_dir = "/Users/bjarnivilhjalmsson/tmp/"
    # Region of interest: chromosome, start, end.
    region = (5, 3140000, 3220000)
    min_maf = 0.05

    # Imputed SNPs called from the sequence alignment.
    seq_snpsd = dataParsers.parseCSVData(
        data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0]
    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')

    # Imputed 2010 data.
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_na_snps()
    d2010_sd.convert_2_binary()
    d2010_sd.filter_maf_snps(min_maf)
    d2010_sd = d2010_sd.get_region_snpsd(*region)
    d2010_sd.remove_redundant_snps(w_missing=True)

    # 250k array data, restricted to the sequenced accessions.
    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv"
    snpsd = dataParsers.parse_snp_data(d250k_file)
    snpsd.filter_accessions(seq_snpsd.accessions)
    snpsd.convert_2_binary()
    snpsd.filter_maf_snps(min_maf)
    snpsd = snpsd.get_region_snpsd(*region)
    snpsd.remove_redundant_snps()

    # Give the sequence data the same accessions, in the same order, as the
    # 250k data.
    seq_snpsd.remove_accessions(snpsd.accessions)
    seq_snpsd.snpsFilterRare(min_maf)
    seq_snpsd.onlyBinarySnps()
    acc_map = [(i, snpsd.accessions.index(acc))
               for i, acc in enumerate(seq_snpsd.accessions)]
    seq_snpsd.orderAccessions(acc_map)
    seq_snpsd.remove_redundant_snps(w_missing=True)

    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")

    # One colour per data set, in the same order as snpsds.
    results_colors = ['blue', 'green', 'red']
    snpsds = [snpsd, seq_snpsd, d2010_sd]

    # NOTE: an EMMA-based variant of the loop below (with kinship matrices and
    # log-transformed phenotypes) used to sit here as commented-out code.
    for i in phend.phenIds:
        phen_name = str(phend.getPhenotypeName(i))
        results = []
        filtered_sds = []
        for sd in snpsds:
            # run_kw gets its own copy so the shared data sets stay untouched.
            res, f_sd = KW.run_kw(copy.deepcopy(sd), phend, i, 5)
            filtered_sds.append(f_sd)
            res.negLogTransform()
            print("Got %s %s p-values from KW." % (len(res.scores), len(res.positions)))
            results.append(res)

        reg_plotter = rp.RegionPlotter()
        reg_plotter.plot_small_result(
            results,
            results_colors=results_colors,
            pdf_file=tmp_dir + "seqences_250k_" + result_id + "_gwas_" + phen_name + ".pdf")

        for j, (r, sd) in enumerate(zip(results, filtered_sds)):
            if len(r.scores) != len(sd.snps):
                # Was `print "...%d, %d", (a, b)`: the values were never
                # formatted into the message.
                print("Lengths not equal? %d, %d" % (len(r.scores), len(sd.snps)))
            # Index of the best-scoring marker.
            r_i = r.scores.index(max(r.scores))
            phend.plot_marker_box_plot(
                i,
                sd,
                r_i,
                pdf_file=tmp_dir + "box_plot_kw_" + phen_name + "_" + results_colors[j] + ".pdf",
                marker_score=r.scores[r_i])
def run(self):
    """
    Load one or two SNP datasets and write their NA/mismatch rates.

    ``self.input_fname1_format`` may be 1 (read with ``read_data``), 2 or 3
    (read with ``dataParsers.parseCSVData`` without/with array ids);
    ``self.input_fname2_format`` may be 1 or 2.  Any other value writes an
    error to stderr and exits with status 2.

    ``self.run_type`` selects the comparison:

    * 1 -- row-wise and column-wise comparison of file 1 against file 2;
    * 2 -- file 1 against itself, column-wise, between rows that share an
      ecotype id but differ in array id;
    * 3 -- column-wise comparison restricted to the rows of file 1 whose
      ecotype id is in ``self.ecotype_id_ls`` (exits with status 3 when that
      list is empty).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    from variation.src.FilterStrainSNPMatrix import FilterStrainSNPMatrix

    def is_compound(row_id):
        # A row id is either a plain value or a sized, non-string sequence.
        return not isinstance(row_id, str) and hasattr(row_id, '__len__')

    def ecotype_of(row_id):
        # For a compound row id the ecotype id is its first element.
        if is_compound(row_id):
            return row_id[0]
        return row_id

    format1 = self.input_fname1_format
    format2 = self.input_fname2_format

    # to check whether two input file are in different orientation
    file_format2count = {}
    for file_format in (format1, format2):
        file_format2count[file_format] = file_format2count.get(file_format, 0) + 1
    # there's one and only one strain x snp format.
    one_strain_by_snp_file = file_format2count.get(1) == 1

    # 2008-05-15 TwoSNPData can handle character matrix/2D-list. but
    # transposeSNPData needs numeric matrix to transpose except when numpy is
    # installed. only numpy works on character matrix. not sure Numeric or
    # numarray is imported. so transform the input matrix to integer.
    use_nt2number = int(one_strain_by_snp_file)

    # ---- 1st file ----
    if format1 == 1:
        header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname1)
        snpData1 = SNPData(header=header, strain_acc_list=strain_acc_list,
                           category_list=category_list, data_matrix=data_matrix)
    elif format1 == 2 or format1 == 3:
        # format 3 differs from format 2 only in carrying array ids
        snpsd_ls = dataParsers.parseCSVData(self.input_fname1, withArrayIds=(format1 == 3),
                                            use_nt2number=use_nt2number)
        # already nt in number
        snpData1 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
        del snpsd_ls
    else:
        sys.stderr.write('Error: unsupported input_fname1 format, %s\n' % self.input_fname1_format)
        sys.exit(2)

    # ---- 2nd file and matching rules ----
    if self.run_type == 2:
        # 2008-10-12 pairwise mismatch between same data
        snpData2 = snpData1
        row_matching_by_which_value = None
        col_matching_by_which_value = None
    else:
        if format2 == 1:
            header, strain_acc_list, category_list, data_matrix = read_data(self.input_fname2)
            snpData2 = SNPData(header=header, strain_acc_list=strain_acc_list,
                               data_matrix=data_matrix)
        elif format2 == 2:
            snpsd_ls = dataParsers.parseCSVData(self.input_fname2, withArrayIds=False,
                                                use_nt2number=use_nt2number)
            snpData2 = RawSnpsData_ls2SNPData(snpsd_ls, report=self.report, use_nt2number=0)
            del snpsd_ls
        else:
            sys.stderr.write('Error: unsupported input_fname2 format, %s\n' % self.input_fname2_format)
            sys.exit(2)

        if one_strain_by_snp_file:
            # transpose the 2nd snpData
            snpData2 = transposeSNPData(snpData2, report=self.report)

        # (row_matching_by_which_value, col_matching_by_which_value) per format
        # of the 1st file:
        #   1: row_id for the 1st file = (ecotype_id, duplicate). for 2nd file, row_id=ecotype_id.
        #   2: col_id for the 1st file = accession. for 2nd file, col_id=accession.
        #   3: col_id for the 1st file = (array_id, accession). for 2nd file, col_id=accession.
        matching_by_format = {1: (0, None), 2: (None, None), 3: (None, 1)}
        row_matching_by_which_value, col_matching_by_which_value = matching_by_format[format1]

    twoSNPData = TwoSNPData(SNPData1=snpData1, SNPData2=snpData2,
                            row_matching_by_which_value=row_matching_by_which_value,
                            col_matching_by_which_value=col_matching_by_which_value,
                            debug=self.debug)

    if self.run_type == 3:
        # 2008-10-12 compare snpData1 and snpData2 only for designated entries from snpData1
        if not self.ecotype_id_ls:
            sys.stderr.write("Run_type %s: ecotype_id_ls (%s) is not specified.\n"
                             % (self.run_type, self.ecotype_id_ls))
            sys.exit(3)
        ecotype_id_set = Set(self.ecotype_id_ls)
        # rows of the 1st file to test against
        row_id_ls = [row_id for row_id in snpData1.row_id_ls
                     if ecotype_of(row_id) in ecotype_id_set]
        print('%s arrays' % len(row_id_ls))
        if self.ecotype_id_ls:
            for row_id in row_id_ls:
                col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id)
                if not col_id2NA_mismatch_rate:
                    continue
                if is_compound(row_id):
                    row_id_name = '_'.join(row_id)
                else:
                    row_id_name = row_id
                output_fname = '%s_%s' % (self.output_fname, row_id_name)
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate,
                                                                      output_fname)
    elif self.run_type == 2:
        # 2008-10-12 column-wise mismatch of snpData1 vs snpData1 between rows
        # with same ecotype_id but different array_id
        row_id_pair_set = Set()
        for row_id in snpData1.row_id_ls:
            ecotype_id = ecotype_of(row_id)
            for row_id2 in snpData2.row_id_ls:
                same_ecotype = row_id2[0] == ecotype_id
                if same_ecotype and row_id2[1] != row_id[1]:
                    row_id_pair_set.add((row_id, row_id2))

        print('%s arrays' % len(row_id_pair_set))
        for row_id1, row_id2 in row_id_pair_set:
            col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise(row_id=row_id1,
                                                              row_id12row_id2={row_id1: row_id2})
            if col_id2NA_mismatch_rate:
                output_fname = '%s_%s_vs_%s' % (self.output_fname, '_'.join(row_id1),
                                                '_'.join(row_id2))
                twoSNPData.output_col_id2NA_mismatch_rate_InGWRFormat(col_id2NA_mismatch_rate,
                                                                      output_fname)
    elif self.run_type == 1:
        row_id2NA_mismatch_rate = twoSNPData.cmp_row_wise()
        col_id2NA_mismatch_rate = twoSNPData.cmp_col_wise()
        # Both results go through the same writer; the column-wise one is
        # passed with file_1st_open=0.
        if row_id2NA_mismatch_rate:
            QC_250k.output_row_id2NA_mismatch_rate(row_id2NA_mismatch_rate, self.output_fname,
                                                   file_1st_open=1)
        if col_id2NA_mismatch_rate:
            QC_250k.output_row_id2NA_mismatch_rate(col_id2NA_mismatch_rate, self.output_fname,
                                                   file_1st_open=0)