def loadProject(self, mcProject, fileStore = None):
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]
    expMap = None
    if fileStore:
        expMap = dict()
        for name in mcProject.expIDMap:
            expMap[name] = fileStore.readGlobalFile(mcProject.expIDMap[name])
    else:
        expMap = mcProject.expMap
    for name, expPath in list(expMap.items()):
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)

            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName not in leafEvents and tree.isLeaf(node):
                self.inGraph.add_edge(name, nodeName)
        if fileStore:
            configFile = fileStore.readGlobalFile(exp.getConfigID())
        else:
            # hack for running from cactus-prepare
            configFile = exp.getConfigPath()
        configElem = ET.parse(configFile).getroot()
        conf = ConfigWrapper(configElem)
        # load max parallel subtrees from the node's config
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
    assert NX.is_directed_acyclic_graph(self.inGraph)
def loadProject(self, mcProject):
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]
    for name, expPath in mcProject.expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            if not tree.isLeaf(node) and nodeName not in exp.getOutgroupEvents():
                # This node is just an internal node added while
                # creating the induced tree from the species
                # tree. None of the sequence is used, so skip it.
                continue
            assert tree.hasParent(node)
            if nodeName not in exp.getOutgroupEvents() and tree.getName(tree.getParent(node)) != name:
                # This leaf isn't an ingroup or an outgroup, it was
                # just added to make the species tree
                # binary. (Hopefully this will be unnecessary in
                # the future.)
                continue
            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName not in leafEvents:
                self.inGraph.add_edge(name, nodeName)
        configElem = ET.parse(exp.getConfig()).getroot()
        conf = ConfigWrapper(configElem)
        # load max parallel subtrees from the node's config
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
    assert NX.is_directed_acyclic_graph(self.inGraph)
def loadProject(self, mcProject, fileStore = None):
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]
    expMap = None
    if fileStore:
        expMap = dict()
        for name in mcProject.expIDMap:
            expMap[name] = fileStore.readGlobalFile(mcProject.expIDMap[name])
    else:
        expMap = mcProject.expMap
    for name, expPath in expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)

            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName not in leafEvents and nodeName in exp.getSequenceMap():
                self.inGraph.add_edge(name, nodeName)
        configFile = fileStore.readGlobalFile(exp.getConfigID())
        configElem = ET.parse(configFile).getroot()
        conf = ConfigWrapper(configElem)
        # load max parallel subtrees from the node's config
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
    assert NX.is_directed_acyclic_graph(self.inGraph)
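# --- Illustrative sketch (not part of Cactus) ---
# The loadProject variants above all reduce the project to a NetworkX DiGraph
# in which an edge A -> B means "event A depends on event B having been
# aligned".  A minimal standalone example, assuming only the networkx package,
# of how such a graph can be validated and turned into a leaves-first schedule
# (event names below are made up):

import networkx as NX

depGraph = NX.DiGraph()
depGraph.add_edge("Anc0", "Anc1")  # the root event depends on its child ancestors
depGraph.add_edge("Anc0", "Anc2")
depGraph.add_edge("Anc1", "Anc3")

assert NX.is_directed_acyclic_graph(depGraph)
# A reversed topological order runs every event only after the events it
# points to, which is the order a bottom-up progressive alignment needs.
bottomUpOrder = list(reversed(list(NX.topological_sort(depGraph))))
print(bottomUpOrder)  # e.g. ['Anc3', 'Anc2', 'Anc1', 'Anc0'] (ties in any order)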
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'
    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            dir = cactusRootPath()
            configPath = os.path.join(dir, "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it. Probably
        # need to do something for other 'combined' batch systems?
        if self.options.batchSystem == 'singleMachine' and \
           self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
               self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) / 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxint)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders this option useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(os.path.join(self.workingDir, "sequenceData"))

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is True:
            fixNames = 1
        else:
            fixNames = 0
        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath +
                                   "compatible with current input. Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            else:
                logPath = os.path.join(self.workingDir, 'cactus.log')
                logFile = open(logPath, "a")
                logFile.write("\nContinuing existing alignment. Use "
                              "--overwrite or erase the working directory to "
                              "force restart from scratch.\n")
                logFile.close()
        else:
            cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
                expPath, projPath, fixNames)
            if len(self.seqFile.outgroups) > 0:
                cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
            if self.options.rootOutgroupDist:
                cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
                cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
            system(cmd)

    # create a project in a dummy directory. check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, tempPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDist:
            cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
            cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
        system(cmd)
        projFilePathNew = os.path.join(tempPath,
                                       '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(oldPath,
                                       '%s_project.xml' % self.alignmentDirName)
        newFile = [line for line in open(projFilePathNew, "r")]
        oldFile = [line for line in open(projFilePathOld, "r")]
        areSame = True
        if len(newFile) != len(oldFile):
            areSame = False
        for newLine, oldLine in zip(newFile, oldFile):
            if newLine.replace(tempPath, oldPath) != oldLine:
                areSame = False
        system("rm -rf %s" % tempPath)
        return areSame
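# --- Illustrative sketch (not part of Cactus) ---
# isSameAsExisting() above regenerates the project XML in a throwaway
# "<dir>_temp" directory and compares it line-by-line against the existing
# file, treating the directory prefix as the only allowed difference.  The
# core comparison can be expressed as a small standalone helper (function
# and parameter names here are hypothetical, for illustration only):

def filesMatchModuloPrefix(newPath, oldPath, newPrefix, oldPrefix):
    """Return True if the two text files are identical once every
    occurrence of newPrefix in the new file is rewritten to oldPrefix."""
    with open(newPath) as newFile, open(oldPath) as oldFile:
        newLines = newFile.readlines()
        oldLines = oldFile.readlines()
    if len(newLines) != len(oldLines):
        return False
    return all(newLine.replace(newPrefix, oldPrefix) == oldLine
               for newLine, oldLine in zip(newLines, oldLines))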
def run(self): logger.info("Progressive Up: " + self.event) # open up the experiment # note that we copy the path into the options here self.options.experimentFile = self.project.expMap[self.event] expXml = ET.parse(self.options.experimentFile).getroot() experiment = ExperimentWrapper(expXml) configXml = ET.parse(experiment.getConfigPath()).getroot() configWrapper = ConfigWrapper(configXml) # need at least 3 processes for every event when using ktserver: # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server if experiment.getDbType() == "kyoto_tycoon": maxParallel = min(len(self.project.expMap), configWrapper.getMaxParallelSubtrees()) if self.options.batchSystem == "singleMachine": if int(self.options.maxThreads) < maxParallel * 3: raise RuntimeError("At least %d threads are required (only %d were specified) to handle up to %d events using kyoto tycoon. Either increase the number of threads using the --maxThreads option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, self.options.maxThreads, maxParallel, configWrapper.getMaxParallelSubtrees())) else: if int(self.options.maxCpus) < maxParallel * 3: raise RuntimeError("At least %d concurrent cpus are required to handle up to %d events using kyoto tycoon. Either increase the number of cpus using the --maxCpus option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, maxParallel, configWrapper.getMaxParallelSubtrees())) # take union of command line options and config options for hal and reference if self.options.buildReference == False: refNode = findRequiredNode(configXml, "reference") self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False) halNode = findRequiredNode(configXml, "hal") if self.options.buildHal == False: self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False) if self.options.buildFasta == False: self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False) # get parameters that cactus_workflow stuff wants workFlowArgs = CactusWorkflowArguments(self.options) # copy over the options so we don't trail them around workFlowArgs.buildReference = self.options.buildReference workFlowArgs.buildHal = self.options.buildHal workFlowArgs.buildFasta = self.options.buildFasta workFlowArgs.overwrite = self.options.overwrite workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet experiment = ExperimentWrapper(workFlowArgs.experimentNode) donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE") doneDone = os.path.isfile(donePath) refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath()) halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and os.path.isfile(experiment.getHALPath())) if not workFlowArgs.overwrite and doneDone and refDone and halDone: self.logToMaster("Skipping %s because it is already done and overwrite is disabled" % self.event) else: system("rm -f %s" % donePath) # delete database # and overwrite specified (or if reference not present) dbPath = os.path.join(experiment.getDbDir(), experiment.getDbName()) seqPath = os.path.join(experiment.getDbDir(), "sequences") system("rm -f %s* %s %s" % (dbPath, seqPath, experiment.getReferencePath())) if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None: # Use the trimming strategy to blast ingroups vs 
            # outgroups.
            self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                                         phaseName="trimBlast"))
        else:
            self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                 phaseName="setup"))
        logger.info("Going to create alignments and define the cactus tree")

    self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
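# --- Illustrative sketch (not part of Cactus) ---
# run() above decides whether an event can be skipped by combining a DONE
# marker file with checks that the requested outputs actually exist.  The
# same idempotency pattern, stripped of the Cactus-specific wrappers (the
# directory layout and file names below are hypothetical):

import os

def alreadyDone(eventDir, buildReference, buildHal):
    """Skip an event only if its DONE marker and every requested output exist."""
    donePath = os.path.join(eventDir, "DONE")
    refDone = not buildReference or os.path.isfile(os.path.join(eventDir, "reference.fa"))
    halDone = not buildHal or os.path.isfile(os.path.join(eventDir, "out.hal"))
    return os.path.isfile(donePath) and refDone and halDone

# A caller would rerun the event (after removing the stale DONE marker)
# whenever alreadyDone(...) is False or an --overwrite flag was given.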
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'
    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            dir = cactusRootPath()
            configPath = os.path.join(dir, "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it. Probably
        # need to do something for other 'combined' batch systems?
        if self.options.batchSystem == 'singleMachine' and \
           self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
               self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) / 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxint)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders this option useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(
            os.path.join(self.workingDir, "sequenceData"))

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)
        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is True:
            fixNames = 1
        else:
            fixNames = 0
        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath +
                                   "compatible with current input. Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            else:
                logPath = os.path.join(self.workingDir, 'cactus.log')
                logFile = open(logPath, "a")
                logFile.write("\nContinuing existing alignment. Use "
                              "--overwrite or erase the working directory to "
                              "force restart from scratch.\n")
                logFile.close()
        else:
            cmd = "cactus_createMultiCactusProject.py \"%s\" \"%s\" --fixNames=%d" % (
                expPath, projPath, fixNames)
            if len(self.seqFile.outgroups) > 0:
                cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
            if self.options.rootOutgroupDists:
                cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
                cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
            if self.options.root is not None:
                cmd += " --root %s" % self.options.root
            system(cmd)

    # create a project in a dummy directory. check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        # Fix for relative directories
        if oldPath[0:2] == './':
            oldPath = oldPath[2:]
        if tempPath[0:2] == './':
            tempPath = tempPath[2:]
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, tempPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDists:
            cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
            cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
        if self.options.root is not None:
            cmd += " --root %s" % self.options.root
        system(cmd)
        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)
        newFile = [line for line in open(projFilePathNew, "r")]
        oldFile = [line for line in open(projFilePathOld, "r")]
        areSame = True
        if len(newFile) != len(oldFile):
            areSame = False
        for newLine, oldLine in zip(newFile, oldFile):
            if newLine.replace(tempPath, oldPath) != oldLine:
                areSame = False
        system("rm -rf %s" % tempPath)
        return areSame
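# --- Illustrative sketch (not part of Cactus) ---
# Both ProjectWrapper.processConfig() and the kyoto_tycoon check in run()
# assume three processes per concurrently aligned event (1 for jobs, 1 for
# the ktserver, 1 for the secondary server).  The thread-budget arithmetic,
# written out on its own (the function name is hypothetical; floor division
# is what the Python 2 "/" in processConfig effectively does):

def clampParallelSubtrees(maxThreads, configuredParallelSubtrees):
    """Largest number of subtrees that can run at once given that each
    one needs 3 processes; never drops below 1."""
    affordable = max(1, maxThreads // 3)
    return min(configuredParallelSubtrees, affordable)

# e.g. clampParallelSubtrees(10, 8) == 3: 8 configured subtrees get
# throttled to 3 when only 10 threads are available.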