class ProgressiveNext(RoundedJob):
    """Job run after the dependency projects of an event are available.

    Copies the experiment IDs and reference IDs found in every dependency
    project into this job's project, adds a ProgressiveUp child for the
    event unless the schedule reports it as virtual, and chains a
    ProgressiveOut follow-on.
    """

    def __init__(self, options, project, event, schedule, depProjects, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule
        # Mapping whose values are projects exposing an ``expIDMap``.
        self.depProjects = depProjects

    def run(self, fileStore):
        """Merge dependency results and return the ProgressiveOut promise."""
        localConfig = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(localConfig).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" % len(self.depProjects))
        for depProject in self.depProjects.values():
            for expName, expID in depProject.expIDMap.items():
                expRoot = ET.parse(fileStore.readGlobalFile(expID)).getroot()
                referenceID = ExperimentWrapper(expRoot).getReferenceID()
                fileStore.logToMaster("Reference ID for experiment %s: %s" % (expName, referenceID))
                # Only experiments that already have a reference are merged.
                if referenceID:
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[expName] = referenceID

        logger.info("Progressive Next: " + self.event)
        defaultMemory = self.configWrapper.getDefaultMemory

        # Virtual events get no ProgressiveUp job; the wrapper stays None.
        eventExpWrapper = None
        if not self.schedule.isVirtual(self.event):
            upJob = ProgressiveUp(self.options, self.project, self.event,
                                  memory=defaultMemory())
            eventExpWrapper = self.addChild(upJob).rv()

        outJob = ProgressiveOut(self.options, self.project, self.event,
                                eventExpWrapper, self.schedule,
                                memory=defaultMemory())
        return self.addFollowOn(outJob).rv()
class RunCactusPreprocessorThenProgressiveDown2(RoundedJob):
    """Stage of the top-level workflow that follows the preprocessor.

    Optionally exports the preprocessed sequences, logs their assembly
    stats, runs ProgressiveDown for the given event and schedules
    ``exportHal`` on the project returned by it.
    """

    def __init__(self, options, project, event, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        """Return the promise of the ``exportHal`` follow-on job."""
        localConfig = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(localConfig).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Save preprocessed sequences
        if self.options.intermediateResultsUrl is not None:
            for genome, seqID in self.project.getOutputSequenceIDMap().items():
                target = self.options.intermediateResultsUrl + '-preprocessed-' + genome
                fileStore.exportFile(seqID, target)

        # Log the stats for the preprocessed assemblies
        for name, sequence in self.project.getOutputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "After preprocessing", name, sequence)

        downJob = ProgressiveDown(options=self.options,
                                  project=self.project,
                                  event=self.event,
                                  schedule=self.schedule,
                                  memory=self.configWrapper.getDefaultMemory())
        project = self.addChild(downJob).rv()

        # Combine the smaller HAL files from each experiment
        exportJob = self.addFollowOnJobFn(exportHal,
                                          project=project,
                                          memory=self.configWrapper.getDefaultMemory(),
                                          disk=self.configWrapper.getExportHalDisk(),
                                          preemptable=False)
        return exportJob.rv()
class ProgressiveDown(RoundedJob):
    """Recursively schedule the dependencies of an event.

    One ProgressiveDown child is added per dependency reported by the
    schedule; a ProgressiveNext follow-on then receives the promised
    results of those children.
    """

    def __init__(self, options, project, event, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        """Return the promise of the ProgressiveNext follow-on job."""
        localConfig = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(localConfig).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Down: " + self.event)

        dependencies = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" % len(dependencies))

        depProjects = {}
        for child in dependencies:
            fileStore.logToMaster("Adding dependent project %s" % child)
            # NOTE(review): these recursive jobs are created without an
            # explicit memory requirement, unlike the follow-on below --
            # confirm that this is intended.
            childJob = ProgressiveDown(self.options, self.project, child, self.schedule)
            depProjects[child] = self.addChild(childJob).rv()

        nextJob = ProgressiveNext(self.options, self.project, self.event,
                                  self.schedule, depProjects,
                                  memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(nextJob).rv()
class ProgressiveOut(RoundedJob):
    """Record the finished experiment of an event and continue the schedule.

    For non-virtual events the experiment wrapper is written to the file
    store and registered in the project's ``expIDMap``.  If the schedule
    names a follow-on event, a ProgressiveDown job is chained for it;
    otherwise the project itself is returned.
    """

    def __init__(self, options, project, event, eventExpWrapper, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.eventExpWrapper = eventExpWrapper
        self.schedule = schedule

    def run(self, fileStore):
        """Return the project, or the promise of the next ProgressiveDown."""
        localConfig = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(localConfig).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        if not self.schedule.isVirtual(self.event):
            # Persist the experiment XML and remember its file store ID.
            tmpExp = fileStore.getLocalTempFile()
            self.eventExpWrapper.writeXML(tmpExp)
            self.project.expIDMap[self.event] = fileStore.writeGlobalFile(tmpExp)

        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is None:
            return self.project

        logger.info("Adding follow-on event %s" % followOnEvent)
        downJob = ProgressiveDown(self.options, self.project, followOnEvent,
                                  self.schedule,
                                  memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(downJob).rv()
def setUp(self):
    """Prepare the test fixture.

    Selects the batch system ("singleMachine" unless getBatchSystem()
    returns something), resets the outgroup / self-alignment flags, and
    writes a copy of the progressive config with all modes turned on to
    ``tempConfig.xml`` inside a fresh temporary directory.
    """
    # Query the helper once and compare by identity, not with ``!= None``.
    batchSystem = getBatchSystem()
    self.batchSystem = batchSystem if batchSystem is not None else "singleMachine"
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    # Load the config file, turn on the checks.
    defaultConfigPath = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    configWrapper = ConfigWrapper(ET.parse(defaultConfigPath).getroot())
    configWrapper.turnAllModesOn()
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def processConfig(self):
    """Load the cactus config and apply command-line options to it.

    Reads ``options.configFile`` when given, otherwise the default
    ``cactus_progressive_config.xml`` under cactusRootPath(), and stores
    the resulting ConfigWrapper in ``self.configWrapper``.
    """
    # read in the default right out of cactus
    if self.options.configFile is not None:
        configPath = self.options.configFile
    else:
        # Local renamed from ``dir`` so the builtin is not shadowed.
        rootDir = cactusRootPath()
        configPath = os.path.join(rootDir, "cactus_progressive_config.xml")
    configXml = ET.parse(configPath).getroot()
    self.configWrapper = ConfigWrapper(configXml)

    # here we can go through the options and apply some to the config
    self.configWrapper.setBuildHal(True)
    self.configWrapper.setBuildFasta(True)
    if self.options.outputMaf is not None:
        self.configWrapper.setBuildMaf(True)
        self.configWrapper.setJoinMaf(True)

    # pre-emptively turn down maxParallelSubtree for singleMachine
    # mode if not enough threads are provided to support it.  Probably
    # need to do something for other ?combined? batch systems?
    if self.options.batchSystem == 'singleMachine' and \
       self.options.database == 'kyoto_tycoon':
        maxThreads = int(self.options.maxThreads)
        if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
            # Explicit floor division: the subtree count must stay an
            # integer even where ``/`` means true division.
            self.configWrapper.setMaxParallelSubtrees(max(1, maxThreads // 3))

    # this is a little hack to effectively toggle back to the
    # non-progressive version of cactus (as published in Gen. Res. 2011)
    # from the high-level interface.
    if self.options.legacy is True:
        # NOTE(review): sys.maxint exists only on Python 2.
        self.configWrapper.setSubtreeSize(sys.maxint)
def run(self, fileStore):
    """Schedule preprocessing, build the schedule and chain the next stage.

    Adds a CactusPreprocessor child for the input sequences, stores the
    promised output sequence IDs on the project, computes the Schedule
    and returns the promise of the
    RunCactusPreprocessorThenProgressiveDown2 follow-on.
    """
    localConfig = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(localConfig).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    fileStore.logToMaster("Using the following configuration:\n%s" % ET.tostring(self.configNode))

    # Log the stats for the un-preprocessed assemblies
    for name, sequence in self.project.getInputSequenceIDMap().items():
        self.addChildJobFn(logAssemblyStats, "Before preprocessing", name, sequence)

    # Create jobs to create the output sequences
    logger.info("Reading config file from: %s" % self.project.getConfigID())
    configFile = fileStore.readGlobalFile(self.project.getConfigID())
    configNode = ET.parse(configFile).getroot()
    ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..

    # Add the preprocessor child job. The output is a job promise value that will be
    # converted into a list of the IDs of the preprocessed sequences in the follow on job.
    preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), configNode))
    sequenceCount = len(self.project.getInputSequenceIDs())
    promisedIDs = [preprocessorJob.rv(index) for index in range(sequenceCount)]
    self.project.setOutputSequenceIDs(promisedIDs)

    # Now build the progressive-down job
    schedule = Schedule()
    schedule.loadProject(self.project, fileStore=fileStore)
    schedule.compute()

    mcTree = self.project.mcTree
    self.options.event = mcTree.getRootName()
    leafNames = [mcTree.getName(leaf) for leaf in mcTree.getLeaves()]
    fileStore.logToMaster("Leaf names = %s" % leafNames)
    self.options.globalLeafEventSet = set(leafNames)

    nextStage = RunCactusPreprocessorThenProgressiveDown2(
        options=self.options,
        project=self.project,
        event=self.options.event,
        schedule=schedule,
        memory=self.configWrapper.getDefaultMemory())
    return self.addFollowOn(nextStage).rv()
def run(self, fileStore):
    """Set up the workflow arguments for this event and start the blast phase.

    Reads the event's experiment from the file store, maps every leaf of
    its tree to the project's output sequence ID, merges the HAL /
    reference build flags from the config into the options, and returns
    the promise of the CactusTrimmingBlastPhase child.
    """
    localConfig = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(localConfig).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # open up the experiment
    # note that we copy the path into the options here
    experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    configPath = fileStore.readGlobalFile(experiment.getConfigID())
    configXml = ET.parse(configPath).getroot()

    # Collect the sequence IDs of the leaves of the experiment tree.
    tree = experiment.getTree()
    seqIDMap = {}
    seqNames = []
    for node in tree.postOrderTraversal():
        if not tree.isLeaf(node):
            continue
        leafName = tree.getName(node)
        seqIDMap[leafName] = self.project.outputSequenceIDMap[leafName]
        seqNames.append(leafName)
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # take union of command line options and config options for hal and reference
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # get parameters that cactus_workflow stuff wants
    configFile = fileStore.readGlobalFile(experiment.getConfigID())
    configNode = ET.parse(configFile).getroot()
    workFlowArgs = CactusWorkflowArguments(self.options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap = seqIDMap)

    # copy over the options so we don't trail them around
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
    if self.options.intermediateResultsUrl is not None:
        # Give the URL prefix a special name for this particular
        # subproblem (by suffixing it with the name of the
        # internal node in the guide tree)
        workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

    # Use the trimming strategy to blast ingroups vs outgroups.
    blastPhase = CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                          phaseName="trimBlast")
    finalExpWrapper = self.addChild(blastPhase).rv()
    logger.info("Going to create alignments and define the cactus tree")
    return finalExpWrapper
def loadProject(self, mcProject):
    """Build ``self.inGraph`` from the experiments of ``mcProject``.

    Every experiment becomes a node.  An edge is added from an experiment
    to each of its ingroup / outgroup events that is not a leaf of the
    global tree.  ``self.maxParallelSubtrees`` is read from the
    experiment configs, which are asserted to agree with each other.
    """
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    leafEvents = [globTree.getName(leaf) for leaf in globTree.getLeaves()]

    for name, expPath in mcProject.expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            isOutgroup = nodeName in exp.getOutgroupEvents()
            if not (isOutgroup or tree.isLeaf(node)):
                # This node is just an internal node added while
                # creating the induced tree from the species
                # tree. None of the sequence is used, so skip it.
                continue

            assert tree.hasParent(node)

            if not isOutgroup and tree.getName(tree.getParent(node)) != name:
                # This leaf isn't an ingroup or an outgroup, it was
                # just added to make the species tree
                # binary. (Hopefully this will be unnecessary in
                # the future.)
                continue

            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName not in leafEvents:
                self.inGraph.add_edge(name, nodeName)

        # load max parallel subtrees from the node's config
        conf = ConfigWrapper(ET.parse(exp.getConfig()).getroot())
        if self.maxParallelSubtrees is not None:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        else:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)
def loadProject(self, mcProject, fileStore = None):
    """Build ``self.inGraph`` from the experiments of ``mcProject``.

    Every experiment becomes a node.  An edge is added from an experiment
    to each event of its tree that appears in the experiment's sequence
    map and is not a leaf of the global tree.
    ``self.maxParallelSubtrees`` is read from the experiment configs,
    which are asserted to agree with each other.

    Args:
        mcProject: project providing ``mcTree`` and either ``expIDMap``
            (used with a file store) or ``expMap`` (local paths).
        fileStore: optional Toil file store.  When given, experiment and
            config files are fetched through it; when omitted, local
            paths are used.
    """
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    # A set gives constant-time membership tests in the inner loop.
    leafEvents = set(globTree.getName(leaf) for leaf in globTree.getLeaves())

    if fileStore is not None:
        expMap = dict((name, fileStore.readGlobalFile(expID))
                      for name, expID in mcProject.expIDMap.items())
    else:
        expMap = mcProject.expMap

    for name, expPath in expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        sequenceMap = exp.getSequenceMap()
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName not in leafEvents and nodeName in sequenceMap:
                self.inGraph.add_edge(name, nodeName)

        # The config used to be fetched through the file store
        # unconditionally, which raised AttributeError with the default
        # fileStore=None.
        if fileStore is not None:
            configFile = fileStore.readGlobalFile(exp.getConfigID())
        else:
            # NOTE(review): assumes the experiment records a local config
            # path when no file store is involved -- confirm.
            configFile = exp.getConfigPath()
        conf = ConfigWrapper(ET.parse(configFile).getroot())

        # load max parallel subtrees from the node's config
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)
class ProjectWrapper:
    """Builds the config / experiment templates and the multi-cactus project.

    The constructor loads the config and creates the experiment template;
    writeXml() writes both into the working directory and creates the
    project below it.
    """

    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options):
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config file into ``self.configWrapper``."""
        # read in the default right out of cactus
        configPath = self.options.configFile
        if configPath is None:
            configPath = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
        log.info("Using config from path %s." % configPath)
        self.configWrapper = ConfigWrapper(ET.parse(configPath).getroot())
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        """Create the experiment template and make sure the working dir exists."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        diskElem = ET.SubElement(expXml, "cactus_disk")
        confElem = ET.SubElement(diskElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        """Write config and experiment templates, then create the project."""
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir, ProjectWrapper.alignmentDirName)
        # No outgroups specified means: assume the default outgroup set
        outgroups = None
        if len(self.seqFile.outgroups) != 0:
            outgroups = self.seqFile.outgroups
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=outgroups,
                                    root=self.options.root)
def processConfig(self):
    """Load the cactus config into ``self.configWrapper``.

    ``options.configFile`` is used when set; otherwise the default
    ``cactus_progressive_config.xml`` under cactusRootPath() is read.
    HAL and FASTA building are always switched on.
    """
    # read in the default right out of cactus
    configPath = self.options.configFile
    if configPath is None:
        configPath = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    log.info("Using config from path %s." % configPath)
    self.configWrapper = ConfigWrapper(ET.parse(configPath).getroot())
    # here we can go through the options and apply some to the config
    self.configWrapper.setBuildHal(True)
    self.configWrapper.setBuildFasta(True)
def run(self, fileStore):
    """Store the event's experiment and continue with the follow-on event.

    Returns the project when the schedule has no follow-on event for
    ``self.event``; otherwise the promise of a ProgressiveDown follow-on
    job for that event.
    """
    localConfig = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(localConfig).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if not self.schedule.isVirtual(self.event):
        # Persist the experiment XML and remember its file store ID.
        tmpExp = fileStore.getLocalTempFile()
        self.eventExpWrapper.writeXML(tmpExp)
        self.project.expIDMap[self.event] = fileStore.writeGlobalFile(tmpExp)

    followOnEvent = self.schedule.followOn(self.event)
    if followOnEvent is None:
        return self.project

    logger.info("Adding follow-on event %s" % followOnEvent)
    downJob = ProgressiveDown(self.options, self.project, followOnEvent,
                              self.schedule,
                              memory=self.configWrapper.getDefaultMemory())
    return self.addFollowOn(downJob).rv()
def main():
    """Command-line entry point of progressive cactus.

    Parses the options, adjusts a few Toil defaults, then either restarts
    an existing workflow or builds the progressive project, imports the
    input sequences and the config into the job store and starts the
    workflow.  The resulting HAL file is exported to ``outputHal``.
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specifed then the root"
                        " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            # Parenthesised single-argument form: prints the same text on
            # Python 2 and is also valid syntax on Python 3.
            print("Importing %s sequences" % (len(project.getInputSequencePaths())))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    # A directory of sequence files is concatenated into
                    # one temporary file before it is imported.
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
class ProjectWrapper:
    """Builds the config / experiment templates and the multi-cactus project.

    The constructor loads the config and creates the experiment template;
    writeXml() writes both into the working directory and creates the
    project there, or verifies that an existing one is compatible.
    """

    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config and apply the command-line options to it."""
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            # Local renamed from ``dir`` so the builtin is not shadowed.
            rootDir = cactusRootPath()
            configPath = os.path.join(rootDir, "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)

        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)

        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
           self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # Explicit floor division: the subtree count must stay an
                # integer even where ``/`` means true division.
                self.configWrapper.setMaxParallelSubtrees(max(1, maxThreads // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # NOTE(review): sys.maxint exists only on Python 2.
            self.configWrapper.setSubtreeSize(sys.maxint)

    def processExperiment(self):
        """Create the experiment template and the sequence output directory."""
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders this options useless
            # if self.options.ktOpts is not None:
            #     self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        #set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def _createProjectCommand(self, expPath, projPath, fixNames):
        """Return the cactus_createMultiCactusProject.py command line."""
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDist:
            cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
            cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
        return cmd

    def writeXml(self):
        """Write the templates and create (or validate) the project.

        Raises:
            RuntimeError: if a project already exists in the working
                directory and it does not match the current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is True:
            fixNames=1
        else:
            fixNames=0
        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath+
                                   "compatible with current input.  Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            else:
                logPath = os.path.join(self.workingDir, 'cactus.log')
                # The context manager closes the log even if the write fails.
                with open(logPath, "a") as logFile:
                    logFile.write("\nContinuing existing alignment.  Use "
                                  "--overwrite or erase the working directory to "
                                  "force restart from scratch.\n")
        else:
            system(self._createProjectCommand(expPath, projPath, fixNames))

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a freshly created project matches the one at projPath.

        A temporary project is created next to the existing one, the two
        project XML files are compared line by line (with the temporary
        path mapped back to the existing path), and the temporary project
        is removed again.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        system(self._createProjectCommand(expPath, tempPath, fixNames))
        projFilePathNew = os.path.join(tempPath,'%s_temp_project.xml' %
                                       self.alignmentDirName)
        projFilePathOld = os.path.join(oldPath, '%s_project.xml' %
                                       self.alignmentDirName)
        try:
            # Context managers close both handles; the plain open() calls
            # used before left them to the garbage collector.
            with open(projFilePathNew, "r") as newHandle:
                newFile = newHandle.readlines()
            with open(projFilePathOld, "r") as oldHandle:
                oldFile = oldHandle.readlines()
            areSame = len(newFile) == len(oldFile) and all(
                newLine.replace(tempPath, oldPath) == oldLine
                for newLine, oldLine in zip(newFile, oldFile))
        finally:
            # Always remove the temporary project, also when reading fails.
            system("rm -rf %s" % tempPath)
        return areSame
def run(self):
    """Prepare the workflow arguments for this event and schedule its phases.

    Checks that enough threads / cpus are available for kyoto tycoon,
    merges the HAL / reference build flags from the config into the
    options, and, unless the event is already done and overwrite is off,
    removes stale outputs and adds the blast or setup phase as a child.
    A FinishUp follow-on is set in either case.

    Raises:
        RuntimeError: if fewer than three threads (singleMachine) or cpus
            per parallel event are available with kyoto tycoon.
    """
    logger.info("Progressive Up: " + self.event)

    # open up the experiment
    # note that we copy the path into the options here
    self.options.experimentFile = self.project.expMap[self.event]
    expXml = ET.parse(self.options.experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configXml = ET.parse(experiment.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configXml)

    # need at least 3 processes for every event when using ktserver:
    # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
    if experiment.getDbType() == "kyoto_tycoon":
        maxParallel = min(len(self.project.expMap),
                          configWrapper.getMaxParallelSubtrees())
        if self.options.batchSystem == "singleMachine":
            # Convert once: the raw option is not guaranteed to be an
            # int, and "%d" would raise TypeError for a string instead
            # of producing the intended RuntimeError.
            maxThreads = int(self.options.maxThreads)
            if maxThreads < maxParallel * 3:
                raise RuntimeError(
                    "At least %d threads are required (only %d were specified) "
                    "to handle up to %d events using kyoto tycoon. Either "
                    "increase the number of threads using the --maxThreads "
                    "option or decrease the number of parallel jobs (currently "
                    "%d) by adjusting max_parallel_subtrees in the config file"
                    % (maxParallel * 3, maxThreads, maxParallel,
                       configWrapper.getMaxParallelSubtrees()))
        else:
            if int(self.options.maxCpus) < maxParallel * 3:
                raise RuntimeError(
                    "At least %d concurrent cpus are required to handle up to "
                    "%d events using kyoto tycoon. Either increase the number "
                    "of cpus using the --maxCpus option or decrease the number "
                    "of parallel jobs (currently %d) by adjusting "
                    "max_parallel_subtrees in the config file"
                    % (maxParallel * 3, maxParallel,
                       configWrapper.getMaxParallelSubtrees()))

    # take union of command line options and config options for hal and reference
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # get parameters that cactus_workflow stuff wants
    workFlowArgs = CactusWorkflowArguments(self.options)
    # copy over the options so we don't trail them around
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.overwrite = self.options.overwrite
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet

    experiment = ExperimentWrapper(workFlowArgs.experimentNode)

    donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
    doneDone = os.path.isfile(donePath)
    refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
    halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                            os.path.isfile(experiment.getHALPath()))

    if not workFlowArgs.overwrite and doneDone and refDone and halDone:
        self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                         self.event)
    else:
        system("rm -f %s" % donePath)
        # delete database
        # and overwrite specified (or if reference not present)
        dbPath = os.path.join(experiment.getDbDir(),
                              experiment.getDbName())
        seqPath = os.path.join(experiment.getDbDir(), "sequences")
        system("rm -f %s* %s %s" % (dbPath, seqPath,
                                    experiment.getReferencePath()))

        if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
            # Use the trimming strategy to blast ingroups vs outgroups.
            self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
        else:
            self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                 phaseName="setup"))
    logger.info("Going to create alignments and define the cactus tree")

    self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project file plus one experiment and config per event.

    For every entry of ``mcProj.expMap`` a copy of ``expTemplate`` is
    specialised (database location, reference / HAL paths, spanning
    subtree of children and outgroups) and written to the entry's
    experiment path, together with a per-event copy of the config.
    """
    if not os.path.exists(options.path):
        os.makedirs(options.path)
    mcProj.writeXML(os.path.join(options.path, "%s_project.xml" % options.name))

    seqMap = expTemplate.seqMap
    # Every event's reference sequence lives at <path>/<name>/<name>.fa
    for name in mcProj.expMap:
        seqMap[name] = os.path.join(options.path, name, name + '.fa')

    portOffset = 0
    usesKyotoTycoon = lambda: expTemplate.getDbType() == "kyoto_tycoon"
    for name, expPath in mcProj.expMap.items():
        path = os.path.join(options.path, name)
        children = mcProj.entireTree.getChildNames(name)
        exp = copy.deepcopy(expTemplate)

        # Get outgroups
        outgroups = []
        if configTemplate.getOutgroupStrategy() != 'none' \
           and name in mcProj.outgroup.ogMap:
            for og, ogDist in mcProj.outgroup.ogMap[name]:
                if og not in seqMap:
                    seqMap[og] = os.path.join(options.path, og, refFileName(og))
                outgroups.append(og)

        # Get subtree connecting children + outgroups
        assert len(children) > 0
        subtree = mcProj.entireTree.extractSpanningTree(children + outgroups)

        # Database location: below the template's db dir when it has one.
        if expTemplate.getDbDir() is not None:
            dbBase = os.path.abspath(expTemplate.getDbDir())
        else:
            dbBase = path
        exp.setDbDir(os.path.join(dbBase, name, "%s_DB" % name))

        if usesKyotoTycoon() and os.path.splitext(name)[1] != ".kch":
            exp.setDbName("%s.kch" % name)
        else:
            exp.setDbName(name)

        if usesKyotoTycoon():
            # Each experiment gets its own port, counted up from the
            # template's port.
            exp.setDbPort(expTemplate.getDbPort() + portOffset)
            portOffset += 1
            host = expTemplate.getDbHost()
            if host is not None:
                exp.setDbHost(host)

        exp.setReferencePath(os.path.join(path, name + '.fa'))
        if configTemplate.getBuildHal() == True:
            exp.setHALPath(os.path.join(path, "%s_hal.c2h" % name))
        if configTemplate.getBuildFasta() == True:
            exp.setHALFastaPath(os.path.join(path, "%s_hal.fa" % name))
        exp.updateTree(subtree, seqMap, outgroups)
        exp.setConfigPath(os.path.join(path, "%s_config.xml" % name))

        if not os.path.exists(exp.getDbDir()):
            os.makedirs(exp.getDbDir())
        if not os.path.exists(path):
            os.makedirs(path)
        exp.writeXML(expPath)

        config = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        config.setReferenceName(name)
        config.verifyMinBlockDegree(exp)
        config.writeXML(exp.getConfigPath())