def __growthStats(self, project):
    """Compute two reference-to-sequence file-size ratios for a project.

    Returns ["", ""] when project is None.  Otherwise returns a two-item
    list:

      [0] size of the root experiment's reference file divided by the size
          of the largest leaf sequence file of project.mcTree;
      [1] the mean, over every experiment in project.expMap and every entry
          of that experiment's seqMap, of (reference file size divided by
          sequence file size).

    Raises whatever os.path.getsize / ET.parse raise for missing files, and
    ZeroDivisionError when a sequence file is empty or no ratio could be
    collected.
    """
    if project is None:
        return ["", ""]

    def fileSize(path):
        # sizes are kept as floats so every ratio below is a true division
        return float(os.path.getsize(path))

    def loadExperiment(xmlPath):
        return ExperimentWrapper(ET.parse(xmlPath).getroot())

    # ratio 1: root reference versus the biggest leaf sequence
    mcTree = project.mcTree
    rootExperiment = loadExperiment(project.expMap[mcTree.getRootName()])
    rootRefSize = fileSize(rootExperiment.getReferencePath())
    names = [mcTree.getName(leafId) for leafId in mcTree.getLeaves()]
    sizes = [fileSize(project.sequencePath(name)) for name in names]
    rootToBiggestLeaf = rootRefSize / max(sizes)

    # ratio 2: average of reference/sequence over every experiment input
    ratios = []
    for _, xmlPath in project.expMap.items():
        experiment = loadExperiment(xmlPath)
        refSize = fileSize(experiment.getReferencePath())
        for _, seqPath in experiment.seqMap.items():
            ratios.append(refSize / fileSize(seqPath))
    meanRatio = sum(ratios, 0.0) / len(ratios)

    return [rootToBiggestLeaf, meanRatio]
def __pollKtServers(self):
    """Refresh the set of reachable ktservers and track how long it is stable.

    Reads the project at self.projectPath, pings the primary database of
    every experiment and, when present, its secondary database, and stores
    an identifier for each server that answers in self.curKtservers.

    If the resulting set is non-empty and equal to self.prevKtservers,
    self.sameKtserversTime is increased by self.pollTime; otherwise
    self.prevKtservers is replaced by a copy of the current set and
    self.sameKtserversTime is reset to 0.

    Polling is best-effort: ordinary errors are swallowed.  Only Exception
    subclasses are caught, so KeyboardInterrupt and SystemExit still
    propagate instead of being silently discarded.
    """
    self.curKtservers = set()
    try:
        mc = MultiCactusProject()
        mc.readXML(self.projectPath)
        for eventName, expPath in mc.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            # primary database of this experiment
            try:
                if pingKtServer(exp):
                    self.curKtservers.add("%s_%s:%s" % (
                        eventName, exp.getDbHost(), str(exp.getDbPort())))
            except Exception:
                # a failure on one server must not stop the other checks
                pass
            # optional secondary database of this experiment
            try:
                secElem = exp.getSecondaryDBElem()
                if secElem is not None and pingKtServer(secElem):
                    self.curKtservers.add("%s_secondary_%s:%s" % (
                        eventName, secElem.getDbHost(),
                        str(secElem.getDbPort())))
            except Exception:
                pass
    except Exception:
        # the project could not be read at all: report no servers
        self.curKtservers = set()

    # equality with a non-empty current set implies the previous set is
    # non-empty as well, so one emptiness check is enough
    if self.curKtservers and self.curKtservers == self.prevKtservers:
        self.sameKtserversTime += self.pollTime
    else:
        self.prevKtservers = set(self.curKtservers)
        self.sameKtserversTime = 0
def processExperiment(self):
    """Create self.expWrapper from the sequence file and database options.

    The XML element returned by self.seqFile.toXMLElement() gets a
    <cactus_disk>/<st_kv_database_conf type=DB>/<DB> subtree, where DB is
    self.options.database, and is wrapped in an ExperimentWrapper.  For
    kyoto_tycoon the port, the in-memory/snapshot flags selected by
    self.options.ktType and the optional tuning strings are applied too.
    """
    options = self.options
    expXml = self.seqFile.toXMLElement()

    # create the cactus disk
    cdElem = ET.SubElement(expXml, "cactus_disk")
    database = options.database
    assert database in ("kyoto_tycoon", "tokyo_cabinet")
    confElem = ET.SubElement(cdElem, "st_kv_database_conf")
    confElem.attrib["type"] = database
    ET.SubElement(confElem, database)

    self.expWrapper = ExperimentWrapper(expXml)
    if database != "kyoto_tycoon":
        return

    wrapper = self.expWrapper
    wrapper.setDbPort(str(options.ktPort))

    # 'memory' and 'snapshot' both run in memory; only 'snapshot' snapshots;
    # anything else has to be 'disk'
    ktType = options.ktType
    inMemory = ktType in ('memory', 'snapshot')
    assert inMemory or ktType == 'disk'
    wrapper.setDbInMemory(inMemory)
    wrapper.setDbSnapshot(ktType == 'snapshot')

    # sonlib doesn't allow for spaces in attributes in the db conf, which
    # renders options.ktOpts useless, so it is deliberately not applied
    createTuning = options.ktCreateTuning
    if createTuning is not None:
        wrapper.setDbCreateTuningOptions(createTuning)
    openTuning = options.ktOpenTuning
    if openTuning is not None:
        wrapper.setDbReadTuningOptions(openTuning)
def runProgressive(self):
    """Run the progressive cactus workflow and collect results in outputDir.

    Nothing is done (beyond creating self.outputDir) when
    <outputDir>/progressiveCactusAlignment already exists.  Otherwise a
    scratch directory <outputDir>/tempProgressiveCactusAlignment is
    recreated and used to

      * write the workflow config (with self.params applied) and the
        experiment file,
      * create the multi-cactus project and run the progressive workflow,
      * run cactus_treeStats on the Anc0 experiment (spawning a ktserver
        for it when the experiment uses kyoto_tycoon),
      * move experiment, project, job tree stats and config to outputDir,
      * replace the scratch directory path by a symlink to the moved
        project.

    Fixes over the previous version: the config file is written through
    its path so no file handle can leak, and a spawned ktserver is killed
    even if cactus_treeStats fails.
    """
    logger.debug("Going to put the alignment in %s" % self.outputDir)
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)

    if not os.path.exists(os.path.join(self.outputDir,
                                       "progressiveCactusAlignment")):
        xmlTree = ET.parse(os.path.join(getRootPathString(), "lib",
                                        "cactus_workflow_config.xml"))

        # Start from an empty scratch directory
        tempLocalDir = os.path.join(self.outputDir,
                                    "tempProgressiveCactusAlignment")
        system("rm -rf %s" % tempLocalDir)
        os.mkdir(tempLocalDir)

        # Set the config parameters
        self.params.applyToXml(xmlTree)
        config = xmlTree.getroot()
        assert config is not None

        # Write the config file; ElementTree opens and closes the file
        # itself when it is given a path
        tempConfigFile = os.path.join(tempLocalDir, "config.xml")
        ET.ElementTree(config).write(tempConfigFile)

        # Make the experiment file
        tempExperimentFile = os.path.join(tempLocalDir, "experiment.xml")
        if self.params.kyotoTycoon == True:
            dbConfElem = ET.Element("st_kv_database_conf",
                                    type="kyoto_tycoon")
            ET.SubElement(dbConfElem, "kyoto_tycoon", host="localhost",
                          port="1978", database_dir="dummy")
        else:
            dbConfElem = None

        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences,
            newickTreeString=self.newickTree,
            databaseName="cactusAlignment",
            outputDir=tempLocalDir,
            configFile=tempConfigFile,
            databaseConf=dbConfElem)
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

        # The jobtree
        tempJobTreeDir = os.path.join(tempLocalDir, "jobTree")
        # The place to put the temporary experiment dir
        tempExperimentDir = os.path.join(tempLocalDir,
                                         "progressiveCactusAlignment")

        # The temporary experiment
        runCactusCreateMultiCactusProject(tempExperimentFile,
                                          tempExperimentDir)
        logger.info("Setup the cactus progressive experiment")
        runCactusProgressive(
            os.path.join(tempExperimentDir,
                         "progressiveCactusAlignment_project.xml"),
            tempJobTreeDir,
            buildMaf=True,
            joinMaf=True,
            jobTreeStats=True,
            maxThreads=4,
            logLevel="DEBUG")
        logger.info("Ran the progressive workflow")

        # Check if the jobtree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir for the progressive run")

        # Run the cactus tree stats
        expPath = os.path.join(tempExperimentDir, "Anc0",
                               "Anc0_experiment.xml")
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
        needsKtserver = exp.getDbType() == "kyoto_tycoon"
        if needsKtserver:
            ktserver = KtserverLauncher()
            ktserver.spawnServer(exp)
        try:
            system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" % (
                exp.getDiskDatabaseString(), treeStatsFile))
        finally:
            # never leave the server running, even when treeStats failed
            if needsKtserver:
                ktserver.killServer(exp)

        # Now copy the true assembly back to the output
        system("mv %s %s/experiment.xml" % (tempExperimentFile,
                                            self.outputDir))
        system("mv %s %s" % (tempExperimentDir, self.outputDir))
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (
            tempJobTreeDir, self.outputDir))
        system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

        # But keep a link to the multicactus project in its original path
        # so we can navigate the paths in the xml...
        actualResultsDir = os.path.join(os.path.abspath(self.outputDir),
                                        "progressiveCactusAlignment")
        tempResultsDir = os.path.join(self.outputDir,
                                      "tempProgressiveCactusAlignment")
        system("ln -s %s %s" % (actualResultsDir, tempResultsDir))
def runVanilla(self):
    """Run the plain (non-progressive) cactus workflow for comparison.

    Nothing is done (beyond creating self.outputDir) when
    <outputDir>/cactusAlignmentVanilla already exists.  Otherwise a scratch
    directory <outputDir>/tempVanillaCactusAlignment is recreated and used
    to write the config and experiment files, run the cactus workflow,
    generate <outputDir>/cactusVanilla.maf, collect tree and job tree
    statistics, and finally move the alignment directory and experiment
    file into outputDir.

    Fixes over the previous version: the config file is written through
    its path so no file handle can leak, and the duplicated, unused
    experiment-path local is gone.
    """
    logger.debug("Going to put the alignment in %s" % self.outputDir)
    if not os.path.isdir(self.outputDir):
        os.mkdir(self.outputDir)

    if not os.path.exists(os.path.join(self.outputDir,
                                       "cactusAlignmentVanilla")):
        xmlTree = ET.parse(os.path.join(getRootPathString(), "lib",
                                        "cactus_workflow_config.xml"))

        # Start from an empty scratch directory
        tempLocalDir = os.path.join(self.outputDir,
                                    "tempVanillaCactusAlignment")
        system("rm -rf %s" % tempLocalDir)
        os.mkdir(tempLocalDir)

        # Set the config parameters
        self.params.applyToXml(xmlTree)
        config = xmlTree.getroot()
        assert config is not None

        # Write the config file; ElementTree opens and closes the file
        # itself when it is given a path
        tempConfigFile = os.path.join(tempLocalDir, "config.xml")
        ET.ElementTree(config).write(tempConfigFile)

        # Make the experiment file
        tempExperimentFile2 = os.path.join(tempLocalDir, "experiment.xml")
        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences,
            newickTreeString=self.newickTree,
            databaseName="cactusAlignmentVanilla",
            outputDir=tempLocalDir,
            configFile=tempConfigFile)
        tempExperimentDir2 = os.path.join(tempLocalDir,
                                          "cactusAlignmentVanilla")
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile2)

        # apply naming to the event tree to be consistent with progressive
        exp = ExperimentWrapper(ET.parse(tempExperimentFile2).getroot())
        cleanEventTree(exp)
        exp.writeXML(tempExperimentFile2)

        # Run the vanilla cactus workflow
        tempJobTreeDir2 = os.path.join(tempLocalDir, "jobTreeVanilla")
        runCactusWorkflow(tempExperimentFile2, tempJobTreeDir2,
                          jobTreeStats=True,
                          setupAndBuildAlignments=True,
                          buildReference=True,
                          maxThreads=4)
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir2)
        logger.info("Checked the job tree dir for the vanilla run")

        runCactusMAFGenerator(
            os.path.join(self.outputDir, "cactusVanilla.maf"),
            getCactusDiskString(tempExperimentDir2))

        # Run the cactus tree stats
        treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
        system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" % (
            exp.getDiskDatabaseString(), treeStatsFile))

        # Collect job statistics, then move the results to the output
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (
            tempJobTreeDir2, self.outputDir))
        system("mv %s %s" % (tempExperimentDir2, self.outputDir))
        system("mv %s %s/experiment.xml" % (tempExperimentFile2,
                                            self.outputDir))
class ProjectWrapper:
    """Prepare the config, experiment template and project of an alignment.

    Construction builds self.configWrapper (processConfig) and
    self.expWrapper (processExperiment) from the given options and
    sequence file.  writeXml() then writes both to workingDir and creates
    the multi-cactus project there, or validates an existing one.
    """
    # name of the project directory created inside workingDir
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        """Store the inputs and build the config and experiment wrappers."""
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the workflow config and apply command-line options to it.

        Uses options.configFile when given, otherwise the default
        progressive workflow config shipped under cactusRootPath().
        """
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            progressiveDir = os.path.join(cactusRootPath(), "progressive")
            configPath = os.path.join(
                progressiveDir, "cactus_progressive_workflow_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)

        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)

        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
           self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # floor division keeps the value an integer on every
                # Python version
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, maxThreads // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # sys.maxint does not exist on Python 3; fall back to maxsize
            self.configWrapper.setSubtreeSize(
                getattr(sys, "maxint", sys.maxsize))

    def processExperiment(self):
        """Create self.expWrapper from the sequence file and db options.

        Adds a <cactus_disk>/<st_kv_database_conf type=DB>/<DB> subtree to
        the element returned by seqFile.toXMLElement(), where DB is
        options.database, and applies the kyoto_tycoon specific settings.
        """
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders options.ktOpts useless, so it is not applied
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

    def _createProjectCommand(self, expPath, projPath, fixNames):
        """Return the shell command that creates a project at projPath."""
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        return cmd

    def writeXml(self):
        """Write config and experiment XML and create or reuse the project.

        An existing project is erased first when options.overwrite is set.
        If a project still exists afterwards it must match the current
        input; a note is then appended to <workingDir>/cactus.log.

        Raises:
            RuntimeError: an existing project is not compatible with the
                current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)

        # NOTE(review): processConfig tests "outputMaf is not None" while
        # this tests "outputMaf is True"; confirm which one is intended.
        if self.options.outputMaf is True:
            fixNames = 1
        else:
            fixNames = 0

        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath +
                                   "compatible with current input.  Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            logPath = os.path.join(self.workingDir, 'cactus.log')
            with open(logPath, "a") as logFile:
                logFile.write("\nContinuing existing alignment.  Use "
                              "--overwrite or erase the working directory to "
                              "force restart from scratch.\n")
        else:
            system(self._createProjectCommand(expPath, projPath, fixNames))

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a fresh project would equal the one at projPath.

        A project is created in "<projPath>_temp" and its project XML is
        compared line by line with the existing one, after replacing the
        temporary path by the real one.  The temporary directory is always
        removed again.  Returns False when projPath does not exist.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        try:
            system(self._createProjectCommand(expPath, tempPath, fixNames))
            projFilePathNew = os.path.join(
                tempPath, '%s_temp_project.xml' % self.alignmentDirName)
            projFilePathOld = os.path.join(
                oldPath, '%s_project.xml' % self.alignmentDirName)
            with open(projFilePathNew, "r") as newHandle:
                newFile = newHandle.readlines()
            with open(projFilePathOld, "r") as oldHandle:
                oldFile = oldHandle.readlines()
        finally:
            # do not leave the scratch project behind, even on failure
            system("rm -rf %s" % tempPath)

        if len(newFile) != len(oldFile):
            return False
        return all(newLine.replace(tempPath, oldPath) == oldLine
                   for newLine, oldLine in zip(newFile, oldFile))
def main():
    """Build a HAL file by appending every experiment of a cactus project.

    The target HAL file is removed first (appending at the script level is
    not supported for now).  The project tree is then walked breadth-first,
    starting at the event given by args['event'] when there is one, and
    halAppendCactusSubtree is run for every node that has an experiment.
    Each command line and the final timings are printed.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    # for now we do not support appending at the script level
    removeCmd = 'rm -f {0}'.format(args['HAL_file_path'])
    print(removeCmd)
    system(removeCmd)

    # some quick stats
    startTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # optional pass-through flags, in the order they are appended
    optionalFlags = (
        ("cacheBytes", " --cacheBytes {0}"),
        ("cacheMDC", " --cacheMDC {0}"),
        ("cacheRDC", " --cacheRDC {0}"),
        ("cacheW0", " --cacheW0 {0}"),
        ("chunk", " --chunk {0}"),
        ("deflate", " --deflate {0}"),
    )

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experimentFilePath = myProj.expMap[genomeName]
        experiment = ExperimentWrapper(
            ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
            experiment.getHALPath(),
            experiment.getHALFastaPath(),
            expTreeString,
            args['HAL_file_path'])
        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for key, template in optionalFlags:
            if args[key] is not None:
                cmdline += template.format(args[key])

        print(cmdline)
        commandStart = time.time()
        system(cmdline)
        totalAppendTime += time.time() - commandStart

    elapsed = time.time() - startTime
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        elapsed, totalAppendTime))
def main():
    """Append all experiments of a cactus project into one HAL file.

    Deletes the output HAL file (appending at the script level is not
    supported for now), walks the project tree breadth-first from the
    event named by args['event'] if given, runs halAppendCactusSubtree for
    each node listed in the project's experiment map, and prints every
    command line followed by the overall timings.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    # for now we do not support appending at the script level
    cleanupCmd = 'rm -f {0}'.format(args['HAL_file_path'])
    print(cleanupCmd)
    system(cleanupCmd)

    # some quick stats
    wallStart = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    rootNode = None
    event = args['event']
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # optional pass-through flags, in the order they are appended
    passThrough = [
        ("cacheBytes", " --cacheBytes {0}"),
        ("cacheMDC", " --cacheMDC {0}"),
        ("cacheRDC", " --cacheRDC {0}"),
        ("cacheW0", " --cacheW0 {0}"),
        ("chunk", " --chunk {0}"),
        ("deflate", " --deflate {0}"),
    ]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            # nodes without an experiment contribute nothing
            continue

        experiment = ExperimentWrapper(
            ET.parse(myProj.expMap[genomeName]).getroot())
        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        pieces = ["time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
            experiment.getHALPath(), experiment.getHALFastaPath(),
            expTreeString, args['HAL_file_path'])]
        if len(outgroups) > 0:
            pieces.append(" --outgroups {0}".format(",".join(outgroups)))
        for optionName, template in passThrough:
            if args[optionName] is not None:
                pieces.append(template.format(args[optionName]))
        cmdline = "".join(pieces)

        print(cmdline)
        began = time.time()
        system(cmdline)
        totalAppendTime += time.time() - began

    wallTotal = time.time() - wallStart
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        wallTotal, totalAppendTime))