def run(self, fileStore):
    """Prepare the workflow arguments for ``self.event`` and add the trimming blast phase.

    Parses the project config, reads the experiment registered for
    ``self.event``, maps every leaf of the experiment tree to its entry in
    ``self.project.outputSequenceIDMap``, writes the experiment back out as
    a global file, merges the hal/reference build flags from the experiment
    config into ``self.options`` and finally adds a
    ``CactusTrimmingBlastPhase`` child.

    :param fileStore: object providing ``readGlobalFile``,
        ``writeGlobalFile`` and ``getLocalTempFile``.
    :return: the ``rv()`` of the added ``CactusTrimmingBlastPhase`` child.
    """
    projectConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(projectConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # Load the experiment for this event, then the config it points at.
    expPath = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    experiment = ExperimentWrapper(ET.parse(expPath).getroot())
    expConfigPath = fileStore.readGlobalFile(experiment.getConfigID())
    expConfigRoot = ET.parse(expConfigPath).getroot()

    # Names of the leaves of the experiment tree (post-order) and the
    # output sequence ID recorded in the project for each of them.
    tree = experiment.getTree()
    seqNames = [tree.getName(leaf)
                for leaf in tree.postOrderTraversal()
                if tree.isLeaf(leaf)]
    seqIDMap = {leafName: self.project.outputSequenceIDMap[leafName]
                for leafName in seqNames}
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    # Write the experiment to a fresh local file and publish it; the local
    # path is also what gets handed to the workflow arguments below.
    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # Union of command line options and config options for hal and
    # reference: an option that compares equal to False is replaced by the
    # value from the config. ``== False`` is kept (rather than ``not``) so
    # that a value of None is left untouched.
    options = self.options
    if options.buildReference == False:
        referenceNode = findRequiredNode(expConfigRoot, "reference")
        options.buildReference = getOptionalAttrib(referenceNode, "buildReference", bool, False)
    halNode = findRequiredNode(expConfigRoot, "hal")
    if options.buildHal == False:
        options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if options.buildFasta == False:
        options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # The workflow arguments receive their own freshly parsed config tree.
    workflowConfigPath = fileStore.readGlobalFile(experiment.getConfigID())
    workflowConfigRoot = ET.parse(workflowConfigPath).getroot()
    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=workflowConfigRoot,
                                           seqIDMap=seqIDMap)

    # Copy the options onto the arguments so they travel with them.
    workFlowArgs.buildReference = options.buildReference
    workFlowArgs.buildHal = options.buildHal
    workFlowArgs.buildFasta = options.buildFasta
    workFlowArgs.globalLeafEventSet = options.globalLeafEventSet
    if options.intermediateResultsUrl is not None:
        # Give the URL prefix a name specific to this subproblem by
        # suffixing it with the event name.
        workFlowArgs.intermediateResultsUrl = options.intermediateResultsUrl + '-' + self.event

    # Use the trimming strategy to blast ingroups vs outgroups.
    trimBlastPhase = CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                              phaseName="trimBlast")
    finalExpWrapper = self.addChild(trimBlastPhase).rv()
    logger.info("Going to create alignments and define the cactus tree")

    return finalExpWrapper
def run(self, fileStore):
    """Prepare the workflow arguments for ``self.event`` and add the trimming blast phase.

    Parses the project config, reads the experiment registered for
    ``self.event``, maps every leaf of the experiment tree to its entry in
    ``self.project.outputSequenceIDMap``, writes the experiment back out as
    a global file, merges the hal/reference build flags from the experiment
    config into ``self.options`` and finally adds a
    ``CactusTrimmingBlastPhase`` child.

    :param fileStore: object providing ``readGlobalFile``,
        ``writeGlobalFile`` and ``getLocalTempFile``.
    :return: the ``rv()`` of the added ``CactusTrimmingBlastPhase`` child.
    """
    projectConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(projectConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # Load the experiment for this event, then the config it points at.
    expPath = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    experiment = ExperimentWrapper(ET.parse(expPath).getroot())
    expConfigPath = fileStore.readGlobalFile(experiment.getConfigID())
    expConfigRoot = ET.parse(expConfigPath).getroot()

    # Names of the leaves of the experiment tree (post-order) and the
    # output sequence ID recorded in the project for each of them.
    tree = experiment.getTree()
    seqNames = [tree.getName(leaf)
                for leaf in tree.postOrderTraversal()
                if tree.isLeaf(leaf)]
    seqIDMap = {leafName: self.project.outputSequenceIDMap[leafName]
                for leafName in seqNames}
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    # Write the experiment to a fresh local file and publish it; the local
    # path is also what gets handed to the workflow arguments below.
    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # Union of command line options and config options for hal and
    # reference: an option that compares equal to False is replaced by the
    # value from the config. ``== False`` is kept (rather than ``not``) so
    # that a value of None is left untouched.
    options = self.options
    if options.buildReference == False:
        referenceNode = findRequiredNode(expConfigRoot, "reference")
        options.buildReference = getOptionalAttrib(referenceNode, "buildReference", bool, False)
    halNode = findRequiredNode(expConfigRoot, "hal")
    if options.buildHal == False:
        options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if options.buildFasta == False:
        options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # The workflow arguments receive their own freshly parsed config tree.
    workflowConfigPath = fileStore.readGlobalFile(experiment.getConfigID())
    workflowConfigRoot = ET.parse(workflowConfigPath).getroot()
    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=workflowConfigRoot,
                                           seqIDMap=seqIDMap)

    # Copy the options onto the arguments so they travel with them.
    workFlowArgs.buildReference = options.buildReference
    workFlowArgs.buildHal = options.buildHal
    workFlowArgs.buildFasta = options.buildFasta
    workFlowArgs.globalLeafEventSet = options.globalLeafEventSet
    if options.intermediateResultsUrl is not None:
        # Give the URL prefix a name specific to this subproblem by
        # suffixing it with the event name.
        workFlowArgs.intermediateResultsUrl = options.intermediateResultsUrl + '-' + self.event

    # Use the trimming strategy to blast ingroups vs outgroups.
    trimBlastPhase = CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                              phaseName="trimBlast")
    finalExpWrapper = self.addChild(trimBlastPhase).rv()
    logger.info("Going to create alignments and define the cactus tree")

    return finalExpWrapper
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=True, checkpointInfo=None):
    """Assemble ``tmp_alignment.hal`` from the project's experiments and store it globally.

    Walks ``project.mcTree`` breadth-first (from ``event`` when given) and,
    for every node whose name is in ``project.expMap``, runs
    ``halAppendCactusSubtree`` on that experiment's HAL and HAL-fasta files.
    Afterwards the ``CACTUS_COMMIT`` and ``CACTUS_CONFIG`` metadata entries
    are set on the result.

    :param job: job whose ``fileStore`` is used for all global file access.
    :param project: object providing ``mcTree``, ``expMap``, ``expIDMap``
        and ``configID``.
    :param event: optional name of an internal tree node to start from; it
        must be in ``tree.nameToId`` and must not be a leaf.
    :param cacheBytes, cacheMDC, cacheRDC, cacheW0, chunk, deflate: when not
        None, forwarded unchanged after the flag of the same name.
    :param inMemory: ``--inMemory`` is passed only when this is exactly True.
    :param checkpointInfo: when truthy, the HAL file is also passed to
        ``write_s3`` with element 1 as its second positional argument and
        element 0 as ``region``.
    :return: the result of ``job.fileStore.writeGlobalFile`` for the HAL file.
    """
    HALPath = "tmp_alignment.hal"
    fileStore = job.fileStore
    tree = project.mcTree

    # Restrict the traversal to the subtree rooted at `event`, if given.
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # These flags do not depend on the experiment, so collect them once.
    # They are appended after the per-experiment arguments, in this order.
    sharedFlags = []
    for flag, value in (("--cacheBytes", cacheBytes),
                        ("--cacheMDC", cacheMDC),
                        ("--cacheRDC", cacheRDC),
                        ("--cacheW0", cacheW0),
                        ("--chunk", chunk),
                        ("--deflate", deflate)):
        if value is not None:
            sharedFlags += [flag, value]
    if inMemory is True:
        sharedFlags += ["--inMemory"]

    # Breadth-first traversal of the tree.
    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        experimentFilePath = fileStore.readGlobalFile(project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupGenomes()
        experiment.setConfigPath(fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = fileStore.readGlobalFile(experiment.getHalFastaID())

        # Only the base names of the files are handed to the tool.
        args = [os.path.basename(subHALPath),
                os.path.basename(halFastaPath),
                expTreeString,
                os.path.basename(HALPath)]
        if len(outgroups) > 0:
            args += ["--outgroups", ",".join(outgroups)]
        args += sharedFlags

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    # Record the commit and the base64-encoded project config in the HAL.
    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read()).decode()
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", encodedConfig])

    if checkpointInfo:
        write_s3(HALPath, checkpointInfo[1], region=checkpointInfo[0])

    return fileStore.writeGlobalFile(HALPath)
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None, cacheW0=None, chunk=None, deflate=None, inMemory=False):
    """Assemble ``tmp_alignment.hal`` from the project's experiments and store it globally.

    Walks ``project.mcTree`` breadth-first (from ``event`` when given) and,
    for every node whose name is in ``project.expMap``, runs
    ``halAppendCactusSubtree`` on that experiment's HAL and HAL-fasta files.
    Afterwards the ``CACTUS_COMMIT`` and ``CACTUS_CONFIG`` metadata entries
    are set on the result.

    :param job: job whose ``fileStore`` is used for all global file access.
    :param project: object providing ``mcTree``, ``expMap``, ``expIDMap``
        and ``configID``.
    :param event: optional name of an internal tree node to start from; it
        must be in ``tree.nameToId`` and must not be a leaf.
    :param cacheBytes, cacheMDC, cacheRDC, cacheW0, chunk, deflate: when not
        None, forwarded unchanged after the flag of the same name.
    :param inMemory: ``--inMemory`` is passed only when this is exactly True.
    :return: the result of ``job.fileStore.writeGlobalFile`` for the HAL file.
    """
    HALPath = "tmp_alignment.hal"
    fileStore = job.fileStore
    tree = project.mcTree

    # Restrict the traversal to the subtree rooted at `event`, if given.
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # These flags do not depend on the experiment, so collect them once.
    # They are appended after the per-experiment arguments, in this order.
    sharedFlags = []
    for flag, value in (("--cacheBytes", cacheBytes),
                        ("--cacheMDC", cacheMDC),
                        ("--cacheRDC", cacheRDC),
                        ("--cacheW0", cacheW0),
                        ("--chunk", chunk),
                        ("--deflate", deflate)):
        if value is not None:
            sharedFlags += [flag, value]
    if inMemory is True:
        sharedFlags += ["--inMemory"]

    # Breadth-first traversal of the tree.
    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        experimentFilePath = fileStore.readGlobalFile(project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        experiment.setConfigPath(fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = fileStore.readGlobalFile(experiment.getHalFastaID())

        # Only the base names of the files are handed to the tool.
        args = [os.path.basename(subHALPath),
                os.path.basename(halFastaPath),
                expTreeString,
                os.path.basename(HALPath)]
        if len(outgroups) > 0:
            args += ["--outgroups", ",".join(outgroups)]
        args += sharedFlags

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    # Record the commit and the base64-encoded project config in the HAL.
    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with fileStore.readGlobalFileStream(project.configID) as configFile:
        # b64encode returns bytes on Python 3; decode it so the parameter
        # list holds text only instead of mixing str and bytes arguments.
        encodedConfig = b64encode(configFile.read()).decode()
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG", encodedConfig])

    return fileStore.writeGlobalFile(HALPath)
def loadProject(self, mcProject, fileStore = None):
    """Build ``self.inGraph``, the dependency graph between the project's experiments.

    Every experiment becomes a node. An edge ``experiment -> name`` is added
    for each leaf of the experiment's tree whose name is not a leaf of the
    project-wide tree. ``self.maxParallelSubtrees`` is taken from the first
    experiment's config, and every other config is asserted to agree.

    :param mcProject: project providing ``mcTree`` plus ``expIDMap`` (used
        with a file store) or ``expMap`` (used without one).
    :param fileStore: optional file store; when truthy, experiment and
        config files are fetched through ``readGlobalFile``, otherwise the
        paths recorded in the project and experiments are used directly.
    """
    self.inGraph = NX.DiGraph()
    self.maxParallelSubtrees = None

    globTree = mcProject.mcTree
    leafEvents = [globTree.getName(leaf) for leaf in globTree.getLeaves()]

    # Resolve each experiment name to a readable path.
    if fileStore:
        expMap = {expName: fileStore.readGlobalFile(mcProject.expIDMap[expName])
                  for expName in mcProject.expIDMap}
    else:
        expMap = mcProject.expMap

    for name, expPath in list(expMap.items()):
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)

        # Go through the species tree and add the correct dependencies
        # (i.e. to the outgroups and the ingroups, but not to the other
        # nodes that are just there because they are needed to form the
        # correct paths). Leaves of the global tree are input sequences and
        # do not form dependencies, so no edges are added for them. (It
        # would be clever to maybe do the same with existing references
        # when --overwrite is not specified, but for now only the leaves
        # are skipped.)
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            if nodeName not in leafEvents and tree.isLeaf(node):
                self.inGraph.add_edge(name, nodeName)

        # Without a file store (hack for running from cactus-prepare) the
        # experiment's own config path is used.
        configFile = (fileStore.readGlobalFile(exp.getConfigID())
                      if fileStore else exp.getConfigPath())
        conf = ConfigWrapper(ET.parse(configFile).getroot())

        # Load max parallel subtrees from the node's config; all configs
        # must report the same value.
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)
def loadProject(self, mcProject, fileStore = None):
    """Build ``self.inGraph``, the dependency graph between the project's experiments.

    Every experiment becomes a node. An edge ``experiment -> name`` is added
    for each node of the experiment's tree whose name is in the experiment's
    sequence map and is not a leaf of the project-wide tree.
    ``self.maxParallelSubtrees`` is taken from the first experiment's
    config, and every other config is asserted to agree.

    :param mcProject: project providing ``mcTree`` plus ``expIDMap`` (used
        with a file store) or ``expMap`` (used without one).
    :param fileStore: optional file store; when truthy, experiment and
        config files are fetched through ``readGlobalFile``, otherwise the
        paths recorded in the project and experiments are used directly.
    """
    self.inGraph = NX.DiGraph()
    self.maxParallelSubtrees = None

    globTree = mcProject.mcTree
    # A set, because it is only used for membership tests below.
    leafEvents = {globTree.getName(leaf) for leaf in globTree.getLeaves()}

    # Resolve each experiment name to a readable path.
    if fileStore:
        expMap = {expName: fileStore.readGlobalFile(mcProject.expIDMap[expName])
                  for expName in mcProject.expIDMap}
    else:
        expMap = mcProject.expMap

    for name, expPath in expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)

        # Fetched once per experiment: the experiment is not modified
        # inside the node loop below.
        sequenceMap = exp.getSequenceMap()

        # Go through the species tree and add the correct dependencies
        # (i.e. to the outgroups and the ingroups, but not to the other
        # nodes that are just there because they are needed to form the
        # correct paths). Leaves of the global tree are input sequences and
        # do not form dependencies, so no edges are added for them. (It
        # would be clever to maybe do the same with existing references
        # when --overwrite is not specified, but for now only the leaves
        # are skipped.)
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            if nodeName not in leafEvents and nodeName in sequenceMap:
                self.inGraph.add_edge(name, nodeName)

        # fileStore defaults to None, so it cannot be dereferenced
        # unconditionally; without one, use the experiment's own config
        # path, mirroring the expMap fallback above.
        if fileStore:
            configFile = fileStore.readGlobalFile(exp.getConfigID())
        else:
            configFile = exp.getConfigPath()
        conf = ConfigWrapper(ET.parse(configFile).getroot())

        # Load max parallel subtrees from the node's config; all configs
        # must report the same value.
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)