def plotOverlayingVectors(x, vectorList, main="", xlab="", ylab="", type=None, pch='20', xname="x", ynames=None):
    """
    Build an R script (returned as a string) that draws every vector in
    vectorList against x in one and the same plot.

    x          -- sequence of x values, shared by all vectors.
    vectorList -- list of y-value sequences, one per overlaid series.
    main, xlab, ylab -- title and axis labels, inserted verbatim between
                  double quotes in the R code (they are not escaped).
    type       -- optional R plot type (e.g. "l"); omitted when falsy.
    pch        -- R plotting symbol, given either as a string or as a number;
                  omitted when falsy.
    xname      -- name of the R variable holding x.
    ynames     -- names of the R variables holding the vectors; defaults to
                  "y" for every vector (each assignment then overwrites the
                  previous one, which is fine since it is plotted right away).

    All series share the same axis limits (taken over x and over all vectors),
    so that the plots stacked with par(new=T) line up.  Series i is drawn in
    R colour i+2.

    If vectorList is empty only the assignment of x is returned.
    """
    if not ynames:
        ynames = ["y"] * len(vectorList)

    if not vectorList:
        # Nothing to overlay: there are no y-limits to compute and no plot calls.
        return xname + " <- c(" + ",".join(util.valListToStrList(x)) + ");\n"

    # Common axis limits for all the overlaid plots.
    yMax = max([max(v) for v in vectorList])
    yMin = min([min(v) for v in vectorList])
    xMax = max(x)
    xMin = min(x)

    xStrings = util.valListToStrList(x)
    parts = [xname + " <- c(" + ",".join(xStrings) + ");\n"]
    for i, vector in enumerate(vectorList):
        yStrings = util.valListToStrList(vector)
        parts.append(ynames[i] + " <- c(" + ",".join(yStrings) + ");\n")
        if i != 0:
            # Draw on top of the previous plot instead of starting a new one.
            parts.append("par(new=T);\n")
        plotCall = ('plot(' + xname + ',' + ynames[i] + ',main="' + main
                    + '",xlab="' + xlab + '",ylab="' + ylab
                    + '", xlim=c(' + str(xMin) + ',' + str(xMax)
                    + '), ylim=c(' + str(yMin) + ',' + str(yMax)
                    + '), col=' + str(i + 2))
        if type:
            plotCall += ', type="' + type + '"'
        if pch:
            # str() so that numeric symbol codes (e.g. pch=20) work as well as strings.
            plotCall += ', pch=' + str(pch)
        plotCall += ')\n'
        parts.append(plotCall)
    return "".join(parts)
def plotVectors(x, vectorList, main="", xlab="", ylab="", type=None, xname="x", ynames=None):
    """
    Build an R script (returned as a string) that plots each vector in
    vectorList against x in its own panel, the panels being stacked in a
    single column (par(mfrow=c(<number of vectors>,1))).

    main, xlab and ylab are inserted verbatim as the title and axis labels of
    every panel.  type, when given, is passed on as the R plot type.  xname
    and ynames are the R variable names used for x and for the vectors
    (ynames defaults to "y" for all of them).
    """
    if not ynames:
        ynames = ["y"] * len(vectorList)

    xStrings = util.valListToStrList(x)
    pieces = [
        "par(mfrow=c(" + str(len(vectorList)) + ",1));\n",
        xname + " <- c(" + ",".join(xStrings) + ");\n",
    ]
    for idx, vector in enumerate(vectorList):
        yName = ynames[idx]
        yStrings = util.valListToStrList(vector)
        pieces.append(yName + " <- c(" + ",".join(yStrings) + ");\n")

        plotCall = 'plot(' + xname + ',' + yName + ',pch=20, main="' + main + '",xlab="' + xlab + '",ylab="' + ylab + '"'
        if type:
            plotCall = plotCall + ', type="' + type + '"'
        pieces.append(plotCall + ')\n')
    return "".join(pieces)
def _run_():
    """
    Command line entry point for the random forest analysis.

    Usage (positional arguments): <snps data file> <phenotype data file> <phenotype index>

    Without --parallel the genotype and phenotype data are loaded, filtered
    and written to temporary files, and R is run on the SNPs in chunks of
    'chunkSize' SNPs.  With --secondRound, R is afterwards run once more on the
    'round2Size' SNPs with the highest importance scores.

    With --parallel=<run id> nothing is analysed locally; instead a csh/PBS
    job script is written to '<run id>.sh' and submitted with qsub, either for
    the given phenotype or (with --parallelAll) for every phenotype in the
    phenotype file.

    Exits with status 2 on usage errors.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=", "withArrayId=",
                         "logTransform", "phenotypeFileType=", "help", "parallel=", "parallelAll",
                         "nodeSize=", "mem=", "round2Size=", "secondRound", "minMAF="]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    phenotypeFileType = 1  # Parsed for command line compatibility, not used below.
    impFile = None
    delim = ","
    missingVal = "NA"
    helpShown = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    # NB: single long options are compared with '==' (or a real tuple).  A test such
    # as 'opt in ("--mem")' is a substring test on a string, which made the short
    # option '-m' (missing value) match "--mem" and overwrite the memory setting.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--rFile", "--impFile"):
            # "--impFile" is the long option actually registered with getopt.
            impFile = arg
        elif opt == "--phenotypeFileType":
            phenotypeFileType = int(arg)
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt == "--logTransform":
            logTransform = True
        elif opt == "--secondRound":
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt == "--chunkSize":
            chunkSize = int(arg)
        elif opt == "--round2Size":
            round2Size = int(arg)
        elif opt == "--nTrees":
            nTrees = int(arg)
        elif opt == "--nodeSize":
            nodeSize = int(arg)
        elif opt == "--mem":
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--minMAF":
            minMAF = float(arg)
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # The phenotype index is not needed when jobs are submitted for all phenotypes.
    if parallel and parallelAll:
        requiredArgs = 2
    else:
        requiredArgs = 3
    if len(args) < requiredArgs:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    def runParallel(phenotypeIndex):
        """Write a PBS job script that reruns this program for one phenotype, and submit it with qsub."""
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # The phenotype name becomes part of file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir+"RF_"+parallel+"_"+phenName
        outFileName = impFileName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=120:00:00\n"
        shstr += "#PBS -l mem="+mem+"\n"
        shstr += "\n#PBS -q cmb\n"
        shstr += "#PBS -N RF"+phenName+"_"+parallel+"\n"
        # NOTE(review): delim, missingVal and minMAF are not forwarded to the job,
        # which therefore runs with the defaults for those - confirm this is intended.
        shstr += "(python "+programDir+"RandomForest.py -o "+impFileName+" --chunkSize "+str(chunkSize)+" --nTrees "+str(nTrees)+" --mem "+str(mem)+" --round2Size "+str(round2Size)+""
        if nodeSize:
            shstr += " --nodeSize "+str(nodeSize)+" "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a "+str(withArrayIds)+" "
        shstr += snpsDataFile+" "+phenotypeDataFile+" "+str(phenotypeIndex)+" "
        shstr += "> "+outFileName+"_job"+".out) >& "+outFileName+"_job"+".err\n"

        f = open(parallel+".sh", 'w')
        try:
            f.write(shstr)
        finally:
            f.close()

        #Execute qsub script
        os.system("qsub "+parallel+".sh ")
    #Nested function ends

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            runParallel(int(args[2]))
        return

    phenotypeIndex = int(args[2])

    if impFile is None:
        # Fail with a usage message instead of a TypeError when the file names are built.
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    print("chunkSize: %s" % (chunkSize,))
    print("nTrees: %s" % (nTrees,))
    print("nodeSize: %s" % (nodeSize,))
    print("mem: %s" % (mem,))
    print("logTransform: %s" % (logTransform,))
    print("round2Size: %s" % (round2Size,))
    print("skipSecondRound: %s" % (skipSecondRound,))

    #Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=withArrayIds)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile, delimiter='\t')  #Get Phenotype data
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    sys.stdout.write("Removing accessions which do not have a phenotype value for "+phed.phenotypeNames[phenotype]+".")
    sys.stdout.flush()
    # Pair every genotyped accession with the first phenotype entry of the same
    # accession that has a value other than 'NA'.
    for i, acc1 in enumerate(snpsds[0].accessions):
        for j, acc2 in enumerate(phed.accessions):
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    # NOTE(review): despite its name, removeAccessionIndices is handed the indices
    # to keep - confirm against the snps data class.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed, leaving %s accessions in all." % (numAcc-len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    phed.removeAccessions(phenAccIndicesToKeep)  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Log-transforming
    if logTransform:
        print("Log transforming phenotype")
        phed.logTransform(phenotype)

    #Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<"+str(minMAF)+".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")
    snpsds = newSnpsds

    #Writing files
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    # mkstemp returns an open file descriptor; only the names are needed here.
    # The temporary files are not deleted afterwards (the generated R script refers to them).
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    #Retain only the correct runchunk of data.
    # Flatten the per-chromosome data sets into parallel lists; data set number i
    # is given chromosome number i+1.
    chromasomes = []
    positions = []
    snps = []
    for i, snpsd in enumerate(snpsds):
        positions += snpsd.positions
        snps += snpsd.snps
        chromasomes += [i+1]*len(snpsd.positions)

    #Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)

    import util
    impFile = impFile+".imp"
    rDataFile = impFile+".rData"
    rFile = impFile+".r"
    outRfile = rFile+".out"
    errRfile = rFile+".err"
    topImpFile = impFile+"_top"+str(chunkSize)+".imp"
    topRDataFile = impFile+"_top.rData"

    #Removing the output files if they already exist.
    for oldFile in (impFile, topImpFile):
        try:
            os.remove(oldFile)
        except OSError:
            print("Couldn't remove %s" % (oldFile,))

    for startIndex in range(0, len(positions), chunkSize):
        endIndex = min(startIndex+chunkSize, len(positions))

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(startIndex, endIndex):
                snp = util.valListToStrList(snps[i])
                tmpFile.write(str(chromasomes[i])+","+str(positions[i])+","+",".join(snp)+"\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, impFile, rDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()

        # Every chunk reuses the same R script, output and error file names.
        print("Running model nr %s :" % (startIndex//chunkSize,))
        cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
        sys.stdout.write(cmdStr+"\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print("Random forest output saved in %s" % (impFile,))

    if not skipSecondRound:
        #Run on the top 'round2Size' number of hits.

        #loading the R output file.
        impF = open(impFile, "r")
        try:
            lines = impF.readlines()
        finally:
            impF.close()

        impList = []
        for i in range(1, len(lines)):  #Skipping the first line.
            l = lines[i].split(",")
            # NOTE(review): line i is paired with snps[i] although line 0 is skipped;
            # if line 0 is a header and line i describes SNP i-1 this is off by one
            # (and the last line indexes past the end) - confirm against the file
            # written by the generated R script.
            impList.append((float(l[2]), l[0], l[1], snps[i]))
        #Highest importance score first.
        impList.sort()
        impList.reverse()

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            # There may be fewer scored SNPs than round2Size.
            for i in range(0, min(round2Size, len(impList))):
                snp = util.valListToStrList(impList[i][3])
                tmpFile.write(str(impList[i][1])+","+str(impList[i][2])+","+",".join(snp)+"\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile, phenotypeTempFile, topImpFile, topRDataFile, cluster=True, binary=binary, nTrees=nTrees, nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()

        print("Running randomForest on the top importance scores:")
        cmdStr = "(R --vanilla < "+rFile+" > "+outRfile+") >& "+errRfile
        sys.stdout.write(cmdStr+"\n")
        sys.stdout.flush()
        os.system(cmdStr)
def _run_():
    """
    Command line entry point: writes an R script that draws a histogram of the
    per-SNP NA rates of a SNPs data file.

    Usage (positional argument): <input snps data file>, together with the
    mandatory option -o <output R file>.

    Exits with status 2 on usage errors (unparsable options, missing input
    file or missing output file).
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = ["delim=", "missingval=", "withArrayId=", "comparisonFile=", "debug", "report", "help"]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:brh", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    output_fname = None
    delim = ", "
    missingVal = "NA"
    # comparisonFile, debug and report are accepted on the command line but are
    # not used by the code below (comparing against a second file is not implemented).
    comparisonFile = None
    debug = None
    report = None
    helpShown = 0
    withArrayIds = 0

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt == "--comparisonFile":
            comparisonFile = arg
        elif opt in ("-o",):
            output_fname = arg
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt in ("-b", "--debug"):
            debug = 1
        elif opt in ("-r", "--report"):
            report = 1
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # Validate before touching args[0], so that e.g. '-h' on its own exits
    # cleanly instead of raising an IndexError.
    if not args:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)
    inputFile = args[0]

    if not output_fname:
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    # Both 1 and 2 mean that the input file has array ids.
    waid1 = withArrayIds == 1 or withArrayIds == 2

    import dataParsers
    import snpsdata
    snpsds = dataParsers.parseCSVData(inputFile, format=1, deliminator=delim, missingVal=missingVal, withArrayIds=waid1)

    #Calculating NA rates..
    print("Calculating NA rates")
    snpsNARates = []
    for snpsd in snpsds:
        snpsNARates += snpsd.getSnpsNArates()

    import util
    rstr = "snpsNARates <- c("+",".join(util.valListToStrList(snpsNARates))+")\n"
    rstr += 'hist(snpsNARates, xlab="NA rates", ylab="SNP frequency", breaks=60)'
    f = open(output_fname, "w")
    try:
        f.write(rstr)
    finally:
        f.close()
def _run_():
    """
    Command line entry point for the random forest analysis.

    Usage (positional arguments): <snps data file> <phenotype data file> <phenotype index>

    Without --parallel the genotype and phenotype data are loaded, filtered
    and written to temporary files, and R is run on the SNPs in chunks of
    'chunkSize' SNPs.  With --secondRound, R is afterwards run once more on the
    'round2Size' SNPs with the highest importance scores.

    With --parallel=<run id> nothing is analysed locally; instead a csh/PBS
    job script is written to '<run id>.sh' and submitted with qsub, either for
    the given phenotype or (with --parallelAll) for every phenotype in the
    phenotype file.

    Exits with status 2 on usage errors.
    """
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(2)

    long_options_list = [
        "chunkSize=", "nTrees=", "impFile=", "delim=", "missingval=",
        "withArrayId=", "logTransform", "phenotypeFileType=", "help",
        "parallel=", "parallelAll", "nodeSize=", "mem=", "round2Size=",
        "secondRound", "minMAF="
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:d:m:a:h", long_options_list)
    except getopt.GetoptError:
        traceback.print_exc()
        print(sys.exc_info())
        print(__doc__)
        sys.exit(2)

    phenotypeFileType = 1  # Parsed for command line compatibility, not used below.
    impFile = None
    delim = ","
    missingVal = "NA"
    helpShown = 0
    withArrayIds = 1
    parallel = None
    logTransform = False
    parallelAll = False
    chunkSize = 250000
    round2Size = 5000
    nTrees = 15000
    nodeSize = None
    mem = "8g"
    skipSecondRound = True
    minMAF = 0.0

    # NB: single long options are compared with '==' (or a real tuple).  A test such
    # as 'opt in ("--mem")' is a substring test on a string, which made the short
    # option '-m' (missing value) match "--mem" and overwrite the memory setting.
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            helpShown = 1
            print(__doc__)
        elif opt in ("-a", "--withArrayId"):
            withArrayIds = int(arg)
        elif opt in ("-o", "--rFile", "--impFile"):
            # "--impFile" is the long option actually registered with getopt.
            impFile = arg
        elif opt == "--phenotypeFileType":
            phenotypeFileType = int(arg)
        elif opt == "--parallel":
            parallel = arg
        elif opt == "--parallelAll":
            parallelAll = True
        elif opt == "--logTransform":
            logTransform = True
        elif opt == "--secondRound":
            skipSecondRound = False
        elif opt in ("-d", "--delim"):
            delim = arg
        elif opt == "--chunkSize":
            chunkSize = int(arg)
        elif opt == "--round2Size":
            round2Size = int(arg)
        elif opt == "--nTrees":
            nTrees = int(arg)
        elif opt == "--nodeSize":
            nodeSize = int(arg)
        elif opt == "--mem":
            mem = arg
        elif opt in ("-m", "--missingval"):
            missingVal = arg
        elif opt == "--minMAF":
            minMAF = float(arg)
        else:
            if helpShown == 0:
                print("Unkown option!!\n")
                print(__doc__)
            sys.exit(2)

    # The phenotype index is not needed when jobs are submitted for all phenotypes.
    if parallel and parallelAll:
        requiredArgs = 2
    else:
        requiredArgs = 3
    if len(args) < requiredArgs:
        if helpShown == 0:
            print("Arguments are missing!!\n")
            print(__doc__)
        sys.exit(2)

    snpsDataFile = args[0]
    phenotypeDataFile = args[1]

    def runParallel(phenotypeIndex):
        """Write a PBS job script that reruns this program for one phenotype, and submit it with qsub."""
        #Cluster specific parameters
        phed = phenotypeData.readPhenotypeFile(
            phenotypeDataFile, delimiter='\t')  #Get Phenotype data
        phenName = phed.getPhenotypeName(phenotypeIndex)
        # The phenotype name becomes part of file and job names.
        phenName = phenName.replace("/", "_div_")
        phenName = phenName.replace("*", "_star_")
        impFileName = resultDir + "RF_" + parallel + "_" + phenName
        outFileName = impFileName

        shstr = "#!/bin/csh\n"
        shstr += "#PBS -l walltime=50:00:00\n"
        shstr += "#PBS -l mem=" + mem + "\n"
        shstr += "\n#PBS -q cmb\n"
        shstr += "#PBS -N RF" + phenName + "_" + parallel + "\n"
        # NOTE(review): delim, missingVal and minMAF are not forwarded to the job,
        # which therefore runs with the defaults for those - confirm this is intended.
        shstr += "(python " + programDir + "RandomForest.py -o " + impFileName + " --chunkSize " + str(
            chunkSize) + " --nTrees " + str(nTrees) + " --mem " + str(
                mem) + " --round2Size " + str(round2Size) + ""
        if nodeSize:
            shstr += " --nodeSize " + str(nodeSize) + " "
        if logTransform:
            shstr += " --logTransform "
        if not skipSecondRound:
            shstr += " --secondRound "
        shstr += " -a " + str(withArrayIds) + " "
        shstr += snpsDataFile + " " + phenotypeDataFile + " " + str(
            phenotypeIndex) + " "
        shstr += "> " + outFileName + "_job" + ".out) >& " + outFileName + "_job" + ".err\n"

        f = open(parallel + ".sh", 'w')
        try:
            f.write(shstr)
        finally:
            f.close()

        #Execute qsub script
        os.system("qsub " + parallel + ".sh ")
    #Nested function ends

    if parallel:  #Running on the cluster..
        if parallelAll:
            phed = phenotypeData.readPhenotypeFile(
                phenotypeDataFile, delimiter='\t')  #Get Phenotype data
            for phenotypeIndex in phed.phenIds:
                runParallel(phenotypeIndex)
        else:
            runParallel(int(args[2]))
        return

    phenotypeIndex = int(args[2])

    if impFile is None:
        # Fail with a usage message instead of a TypeError when the file names are built.
        if helpShown == 0:
            print("Output file missing!!\n")
            print(__doc__)
        sys.exit(2)

    print("chunkSize: %s" % (chunkSize,))
    print("nTrees: %s" % (nTrees,))
    print("nodeSize: %s" % (nodeSize,))
    print("mem: %s" % (mem,))
    print("logTransform: %s" % (logTransform,))
    print("round2Size: %s" % (round2Size,))
    print("skipSecondRound: %s" % (skipSecondRound,))

    #Loading genotype data
    import dataParsers
    snpsds = dataParsers.parseCSVData(snpsDataFile,
                                      format=1,
                                      deliminator=delim,
                                      missingVal=missingVal,
                                      withArrayIds=withArrayIds)

    #Load phenotype file
    phed = phenotypeData.readPhenotypeFile(phenotypeDataFile,
                                           delimiter='\t')  #Get Phenotype data
    phenotype = phed.getPhenIndex(phenotypeIndex)
    accIndicesToKeep = []
    phenAccIndicesToKeep = []
    numAcc = len(snpsds[0].accessions)

    sys.stdout.write(
        "Removing accessions which do not have a phenotype value for " +
        phed.phenotypeNames[phenotype] + ".")
    sys.stdout.flush()
    # Pair every genotyped accession with the first phenotype entry of the same
    # accession that has a value other than 'NA'.
    for i, acc1 in enumerate(snpsds[0].accessions):
        for j, acc2 in enumerate(phed.accessions):
            if acc1 == acc2 and phed.phenotypeValues[j][phenotype] != 'NA':
                accIndicesToKeep.append(i)
                phenAccIndicesToKeep.append(j)
                break

    #Filter accessions which do not have the phenotype value.
    # NOTE(review): despite its name, removeAccessionIndices is handed the indices
    # to keep - confirm against the snps data class.
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        snpsd.removeAccessionIndices(accIndicesToKeep)
    print("")
    print("%s accessions removed, leaving %s accessions in all." %
          (numAcc - len(accIndicesToKeep), len(accIndicesToKeep)))

    print("Filtering phenotype data.")
    phed.removeAccessions(
        phenAccIndicesToKeep
    )  #Removing accessions that don't have genotypes or phenotype values

    #Ordering accessions according to the order of accessions in the genotype file
    accessionMapping = []
    i = 0
    for acc in snpsds[0].accessions:
        if acc in phed.accessions:
            accessionMapping.append((phed.accessions.index(acc), i))
            i += 1
    phed.orderAccessions(accessionMapping)

    #Log-transforming
    if logTransform:
        print("Log transforming phenotype")
        phed.logTransform(phenotype)

    #Filtering monomorphic
    print("Filtering monomorphic SNPs")
    for snpsd in snpsds:
        print("Removed %s Snps" % (str(snpsd.filterMonoMorphicSnps()),))

    #Remove minor allele frequencies
    if minMAF != 0:
        sys.stdout.write("Filterting SNPs with MAF<" + str(minMAF) + ".")
        for snpsd in snpsds:
            sys.stdout.write(".")
            sys.stdout.flush()
            snpsd.filterMinMAF(minMAF)

    #Converting format to 01
    import snpsdata
    newSnpsds = []
    sys.stdout.write("Converting data format")
    for snpsd in snpsds:
        sys.stdout.write(".")
        sys.stdout.flush()
        newSnpsds.append(snpsd.getSnpsData())
    print("")
    snpsds = newSnpsds

    #Writing files
    import tempfile
    if env.user == "bjarni":
        tempfile.tempdir = '/tmp'
    # mkstemp returns an open file descriptor; only the names are needed here.
    # The temporary files are not deleted afterwards (the generated R script refers to them).
    (fId, phenotypeTempFile) = tempfile.mkstemp()
    os.close(fId)
    (fId, genotypeTempFile) = tempfile.mkstemp()
    os.close(fId)

    phed.writeToFile(phenotypeTempFile, [phenotype])
    sys.stdout.write("Phenotype file written\n")
    sys.stdout.flush()

    #Retain only the correct runchunk of data.
    # Flatten the per-chromosome data sets into parallel lists; data set number i
    # is given chromosome number i+1.
    chromasomes = []
    positions = []
    snps = []
    for i, snpsd in enumerate(snpsds):
        positions += snpsd.positions
        snps += snpsd.snps
        chromasomes += [i + 1] * len(snpsd.positions)

    #Is the phenotype binary?
    binary = phed.isBinary(phenotypeIndex)

    import util
    impFile = impFile + ".imp"
    rDataFile = impFile + ".rData"
    rFile = impFile + ".r"
    outRfile = rFile + ".out"
    errRfile = rFile + ".err"
    topImpFile = impFile + "_top" + str(chunkSize) + ".imp"
    topRDataFile = impFile + "_top.rData"

    #Removing the output files if they already exist.
    for oldFile in (impFile, topImpFile):
        try:
            os.remove(oldFile)
        except OSError:
            print("Couldn't remove %s" % (oldFile,))

    for startIndex in range(0, len(positions), chunkSize):
        endIndex = min(startIndex + chunkSize, len(positions))

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            for i in range(startIndex, endIndex):
                snp = util.valListToStrList(snps[i])
                tmpFile.write(
                    str(chromasomes[i]) + "," + str(positions[i]) + "," +
                    ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 impFile,
                                 rDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()

        # Every chunk reuses the same R script, output and error file names.
        print("Running model nr %s :" % (startIndex // chunkSize,))
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)
    print("Random forest output saved in %s" % (impFile,))

    if not skipSecondRound:
        #Run on the top 'round2Size' number of hits.

        #loading the R output file.
        impF = open(impFile, "r")
        try:
            lines = impF.readlines()
        finally:
            impF.close()

        impList = []
        for i in range(1, len(lines)):  #Skipping the first line.
            l = lines[i].split(",")
            # NOTE(review): line i is paired with snps[i] although line 0 is skipped;
            # if line 0 is a header and line i describes SNP i-1 this is off by one
            # (and the last line indexes past the end) - confirm against the file
            # written by the generated R script.
            impList.append((float(l[2]), l[0], l[1], snps[i]))
        #Highest importance score first.
        impList.sort()
        impList.reverse()

        #Writing genotype data to file.
        tmpFile = open(genotypeTempFile, "w")
        try:
            # There may be fewer scored SNPs than round2Size.
            for i in range(0, min(round2Size, len(impList))):
                snp = util.valListToStrList(impList[i][3])
                tmpFile.write(
                    str(impList[i][1]) + "," + str(impList[i][2]) + "," +
                    ",".join(snp) + "\n")
        finally:
            tmpFile.close()

        rstr = _generateRScript_(genotypeTempFile,
                                 phenotypeTempFile,
                                 topImpFile,
                                 topRDataFile,
                                 binary=binary,
                                 nTrees=nTrees,
                                 nodeSize=nodeSize)
        f = open(rFile, 'w')
        try:
            f.write(rstr)
        finally:
            f.close()

        print("Running randomForest on the top importance scores:")
        cmdStr = "(R --vanilla < " + rFile + " > " + outRfile + ") >& " + errRfile
        sys.stdout.write(cmdStr + "\n")
        sys.stdout.flush()
        os.system(cmdStr)