def __pollKtServers(self):
    """Refresh the set of reachable ktservers and track how long it is stable.

    Reads the project at self.projectPath, pings the primary database of
    every experiment (and its secondary database, when one is configured)
    and records a label for each server that answers in self.curKtservers.

    If the resulting non-empty set equals self.prevKtservers,
    self.sameKtserversTime grows by self.pollTime; otherwise the previous
    set is replaced by a copy of the current one and the timer is reset.

    Polling is best effort: a failure while pinging one server is ignored,
    and a failure while loading the project empties the current set. Only
    Exception subclasses are swallowed, so KeyboardInterrupt and SystemExit
    still propagate.
    """
    self.curKtservers = set()
    try:
        mc = MultiCactusProject()
        mc.readXML(self.projectPath)
        for eventName, expPath in mc.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            try:
                if pingKtServer(exp):
                    self.curKtservers.add(
                        "%s_%s:%s" % (eventName, exp.getDbHost(),
                                      str(exp.getDbPort())))
            except Exception:
                # An unreachable or misconfigured primary is simply not
                # recorded.
                pass
            try:
                secElem = exp.getSecondaryDBElem()
                if secElem is not None and pingKtServer(secElem):
                    self.curKtservers.add(
                        "%s_secondary_%s:%s" % (eventName,
                                                secElem.getDbHost(),
                                                str(secElem.getDbPort())))
            except Exception:
                pass
    except Exception:
        # The project could not be read or parsed: report no servers.
        self.curKtservers = set()

    # A non-empty current set equal to the previous one implies the
    # previous one is non-empty as well.
    if self.curKtservers and self.curKtservers == self.prevKtservers:
        self.sameKtserversTime += self.pollTime
    else:
        self.prevKtservers = set(self.curKtservers)
        self.sameKtserversTime = 0
def testChangingSequencePaths(self):
    """Check that a changed sequence ID survives re-wrapping the same XML."""
    genome, newPath = 'HUMAN', 'human2.txt'

    self.exp.setSequenceID(genome, newPath)
    self.assertEqual(self.exp.getSequenceID(genome), newPath)

    # Build a fresh wrapper over the same XML root and read the value back.
    self.exp = ExperimentWrapper(self.xmlRoot)
    self.assertEqual(self.exp.getSequenceID(genome), newPath)
def setUp(self):
    """Build the tree, dummy XML and ExperimentWrapper shared by the tests."""
    unittest.TestCase.setUp(self)
    newick = '((((HUMAN:0.006969,CHIMP:0.009727)anc2:0.025291,BABOON:0.044568)anc1:0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
    self.tree = NXNewick().parseString(newick)
    self.xmlRoot = self.__makeXmlDummy()
    self.exp = ExperimentWrapper(self.xmlRoot)
    self.exp.setTree(self.tree)

    # Every genome maps to "<lower-cased name>.txt".
    genomeNames = ['HUMAN', 'CHIMP', 'BABOON', 'MOUSE', 'RAT',
                   'DOG', 'CAT', 'PIG', 'COW']
    self.seqMap = {name: name.lower() + '.txt' for name in genomeNames}

    self.exp.setRootGenome('anc1')
    self.exp.setRootReconstructed(True)
    self.exp.setOutgroupGenomes(
        ['MOUSE', 'RAT', 'DOG', 'CAT', 'PIG', 'COW'])

    # These aren't real IDs, but should still work for our purposes.
    for genomeName in self.seqMap:
        self.exp.setSequenceID(genomeName, self.seqMap[genomeName])
def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                       batchSystem, buildAvgs,
                                       buildReference,
                                       buildHal,
                                       buildFasta,
                                       toilStats):
    """Run progressiveFunction on a randomly chosen subtree of the species tree.

    This function is necessary to keep runWorkflow_multipleExamples
    general (specifying a subtree root doesn't make sense for
    runCactusWorkflow).
    """
    speciesTree = ExperimentWrapper(
        ET.parse(experimentFile).getroot()).getTree()

    # Candidate roots are the named internal nodes (NB: the entire species
    # tree is a valid subtree).
    candidates = [speciesTree.getName(node)
                  for node in speciesTree.postOrderTraversal()
                  if speciesTree.hasName(node)
                  and not speciesTree.isLeaf(node)]

    subtreeRoot = random.choice(candidates)
    logger.info("Chose subtree root %s to test from species tree "
                "%s" % (subtreeRoot, NXNewick().writeString(speciesTree)))

    self.progressiveFunction(experimentFile, toilDir, batchSystem,
                             buildAvgs, buildReference, buildHal,
                             buildFasta, toilStats, subtreeRoot)
def __pollKtServers(self):
    """Poll every ktserver of the project and update the stability timer.

    The project is loaded from self.projectPath. For each experiment the
    primary database is pinged, then the secondary database if one is
    configured; a label is added to self.curKtservers for each one that
    responds.

    When the non-empty result matches self.prevKtservers,
    self.sameKtserversTime is increased by self.pollTime. Otherwise
    self.prevKtservers becomes a copy of the current set and the timer is
    reset to 0.

    Errors are tolerated on purpose (a failed ping skips that server, a
    failed project load yields an empty set), but only Exception subclasses
    are caught so interpreter-exit signals such as KeyboardInterrupt are
    not suppressed.
    """
    self.curKtservers = set()
    try:
        mc = MultiCactusProject()
        mc.readXML(self.projectPath)
        for eventName, expPath in mc.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            try:
                if pingKtServer(exp):
                    self.curKtservers.add("%s_%s:%s" % (
                        eventName, exp.getDbHost(), str(exp.getDbPort())))
            except Exception:
                # Best effort: a primary that cannot be pinged is skipped.
                pass
            try:
                secElem = exp.getSecondaryDBElem()
                if secElem is not None and pingKtServer(secElem):
                    self.curKtservers.add("%s_secondary_%s:%s" % (
                        eventName, secElem.getDbHost(),
                        str(secElem.getDbPort())))
            except Exception:
                pass
    except Exception:
        # Loading the project failed: treat it as "no servers running".
        self.curKtservers = set()

    # Equality with a non-empty current set already guarantees that the
    # previous set is non-empty.
    if self.curKtservers and self.curKtservers == self.prevKtservers:
        self.sameKtserversTime += self.pollTime
    else:
        self.prevKtservers = set(self.curKtservers)
        self.sameKtserversTime = 0
class ProjectWrapper:
    """Builds the config and experiment wrappers for a progressive project.

    Construction reads the seqfile named by options.seqFile, loads the
    config and creates the experiment template; writeXml() then writes both
    to the working directory and creates the multi-cactus project.
    """
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options):
        """Store the options and immediately build both wrappers."""
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML (user supplied or the cactus default)."""
        configPath = self.options.configFile
        if configPath is None:
            # read in the default right out of cactus
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        log.info("Using config from path %s." % configPath)
        self.configWrapper = ConfigWrapper(ET.parse(configPath).getroot())
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        """Create the experiment XML with its cactus_disk database element."""
        experimentXml = self.seqFile.toXMLElement()
        dbType = self.options.database
        diskElem = ET.SubElement(experimentXml, "cactus_disk")
        assert dbType in ("kyoto_tycoon", "tokyo_cabinet")
        kvConfElem = ET.SubElement(diskElem, "st_kv_database_conf")
        kvConfElem.attrib["type"] = dbType
        ET.SubElement(kvConfElem, dbType)
        self.expWrapper = ExperimentWrapper(experimentXml)

        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        """Write config and experiment template, then create the project."""
        assert os.path.isdir(self.workingDir)
        workDir = self.workingDir
        configPath = absSymPath(os.path.join(workDir, "config.xml"))
        expPath = absSymPath(os.path.join(workDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(workDir, ProjectWrapper.alignmentDirName)
        # No outgroups specified means: assume the default outgroup set.
        outgroups = None
        if len(self.seqFile.outgroups) != 0:
            outgroups = self.seqFile.outgroups
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=outgroups,
                                    root=self.options.root)
class ProjectWrapper:
    """Prepares the config and experiment XML for a progressive alignment.

    The constructor parses the seqfile given in options.seqFile and builds
    self.configWrapper and self.expWrapper; writeXml() persists them under
    the working directory and runs runCreateMultiCactusProject.
    """
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options):
        """Remember the options, then build the config and the experiment."""
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Parse the chosen config file into self.configWrapper."""
        if self.options.configFile is None:
            # read in the default right out of cactus
            cactusRoot = cactusRootPath()
            configPath = os.path.join(cactusRoot,
                                      "cactus_progressive_config.xml")
        else:
            configPath = self.options.configFile
        log.info("Using config from path %s." % configPath)
        configRoot = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configRoot)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        """Turn the seqfile into an experiment with a database conf element."""
        rootElem = self.seqFile.toXMLElement()
        # create the cactus disk
        cactusDisk = ET.SubElement(rootElem, "cactus_disk")
        backend = self.options.database
        assert backend in ("kyoto_tycoon", "tokyo_cabinet")
        confElem = ET.SubElement(cactusDisk, "st_kv_database_conf")
        confElem.attrib["type"] = backend
        ET.SubElement(confElem, backend)
        self.expWrapper = ExperimentWrapper(rootElem)

        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        """Write both XML files and create the multi-cactus project."""
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        hasOutgroups = len(self.seqFile.outgroups) != 0
        # With no outgroups specified, None selects the default outgroup set.
        outgroups = self.seqFile.outgroups if hasOutgroups else None
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=outgroups,
                                    root=self.options.root)
def sequencePath(self, eventName):
    """Return the sequence file recorded for eventName.

    The path is read from the experiment of the subtree root that
    self.mcTree reports for the event; it is asserted to be an existing
    file.
    """
    subtreeRoot = self.mcTree.getSubtreeRoot(eventName)
    experimentRoot = ET.parse(self.expMap[subtreeRoot]).getroot()
    seqPath = ExperimentWrapper(experimentRoot).getSequence(eventName)
    assert os.path.isfile(seqPath)
    return seqPath
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run a progressive alignment and check the headers in its c2h output.

    A multi-cactus project is created in a temporary directory and run with
    runCactusProgressive. Every sequence line of every produced c2h file
    that belongs to a non-outgroup input genome must carry a header that is
    present in that genome's input FASTA.

    buildReference, buildHal and buildFasta are accepted but not used by
    this body.
    """
    tempDir = getTempDirectory(os.getcwd())
    projectDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                projectDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % projectDir)

    runCactusProgressive(os.path.join(projectDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Collect the headers of every input genome (genome name -> headers),
    # to be compared with the output below (minus differences in
    # repeat-masking).
    inputExp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = inputExp.buildSequenceMap()
    headers = {}
    for genomeName, fastaPath in seqMap.items():
        if os.path.isdir(fastaPath):
            # Some "input sequence paths" are actually directories
            # containing multiple FASTAs: concatenate them first.
            mergedPath = getTempFile()
            system("cat %s/* > %s" % (fastaPath, mergedPath))
            fastaPath = mergedPath
        headers[genomeName] = [record[0] for record in fastaRead(fastaPath)]

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (projectDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as c2hFile:
            for line in c2hFile:
                fields = line.split('\t')
                if fields[0] != 's':
                    # Not a sequence line
                    continue
                # Drop the first and last character of each field.
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome not in headers or genome in outgroups:
                    # Only non-outgroup input genomes are checked
                    continue
                self.assertTrue(header in headers[genome],
                                'Header %s from output c2h %s not found in input fa %s'
                                ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def testSequenceMap(self):
    """Check the tree round-trip and that each sequence is keyed by its
    upper-cased, extension-less file name in the sequence map."""
    exp = ExperimentWrapper(self.__makeXmlDummy(self.tree, self.sequences))
    writtenTree = NXNewick().writeString(exp.getTree())
    assert writtenTree == self.tree

    seqMap = exp.buildSequenceMap()
    for seqPath in self.sequences.split():
        genomeKey = os.path.splitext(seqPath)[0].upper()
        assert seqMap[genomeKey] == seqPath
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    # Loads the experiment described by experimentFile and opens a fresh
    # temporary file for writing.
    # NOTE(review): only these opening statements are visible here; the
    # function presumably continues by writing a seqfile built from the
    # tree -- confirm against the full source.
    eW = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqFile = getTempFile()
    with open(seqFile, 'w') as f:
        # Species tree of the experiment, read while the file is open.
        tree = eW.getTree()
def get_leaves_and_outgroups(options, project, root):
    """Return (leaves, outgroups) for the experiment rooted at `root`.

    The leaves are the names of the children of the root of the subtree
    extracted at `root`; the outgroups are the experiment's outgroup
    genomes. `options` is accepted but not used by this body.
    """
    # open up the experiment (as we do in ProgressiveUp.run)
    experiment = ExperimentWrapper(ET.parse(project.expMap[root]).getroot())
    subtree = MultiCactusTree(experiment.getTree()).extractSubTree(root)
    leaves = subtree.getChildNames(subtree.getRootName())
    return leaves, experiment.getOutgroupGenomes()
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None,
              cacheRDC=None, cacheW0=None, chunk=None, deflate=None,
              inMemory=False):
    """Assemble a single HAL file from the project's experiments.

    The project tree is walked breadth-first (from `event` when given, which
    must name a non-leaf node) and halAppendCactusSubtree is invoked for
    every node that has an experiment. The commit and the base64-encoded
    config are then stored as HAL metadata and the file is written to the
    job's file store.

    Returns whatever job.fileStore.writeGlobalFile returns for the HAL file.
    """
    HALPath = "tmp_alignment.hal"

    # Optional halAppendCactusSubtree switches, in the order they are passed.
    valueFlags = [("--cacheBytes", cacheBytes),
                  ("--cacheMDC", cacheMDC),
                  ("--cacheRDC", cacheRDC),
                  ("--cacheW0", cacheW0),
                  ("--chunk", chunk),
                  ("--deflate", deflate)]

    # traverse tree to make sure we are going breadth-first
    tree = project.mcTree

    # find subtree if event specified
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        expFilePath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(expFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        experiment.setConfigPath(
            job.fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(
            experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

        args = [os.path.basename(subHALPath),
                os.path.basename(halFastaPath),
                expTreeString,
                os.path.basename(HALPath)]
        if len(outgroups) > 0:
            args += ["--outgroups", ",".join(outgroups)]
        for flag, value in valueFlags:
            if value is not None:
                args += [flag, value]
        if inMemory is True:
            args += ["--inMemory"]

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read())
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG",
                                encodedConfig])

    return job.fileStore.writeGlobalFile(HALPath)
def processExperiment(self):
    """Build self.expWrapper from the seqfile and ensure workingDir exists.

    A cactus_disk/st_kv_database_conf element typed with options.database
    (kyoto_tycoon or tokyo_cabinet) is appended to the experiment XML.
    """
    experimentXml = self.seqFile.toXMLElement()
    dbType = self.options.database
    # create the cactus disk
    diskElem = ET.SubElement(experimentXml, "cactus_disk")
    assert dbType in ("kyoto_tycoon", "tokyo_cabinet")
    kvConfElem = ET.SubElement(diskElem, "st_kv_database_conf")
    kvConfElem.attrib["type"] = dbType
    ET.SubElement(kvConfElem, dbType)
    self.expWrapper = ExperimentWrapper(experimentXml)

    if not os.path.exists(self.workingDir):
        os.makedirs(self.workingDir)
def processExperiment(self, ignoreSeqPaths):
    """Build self.expWrapper from the seqfile and ensure workingDir exists.

    ignoreSeqPaths is forwarded to SeqFile.toXMLElement. The database type
    (kyoto_tycoon or redis) is recorded in a cactus_disk element and the
    wrapper is pointed at self.configPath.
    """
    experimentXml = self.seqFile.toXMLElement(ignoreSeqPaths)
    dbType = self.options.database
    # create the cactus disk
    diskElem = ET.SubElement(experimentXml, "cactus_disk")
    assert dbType in ["kyoto_tycoon", "redis"]
    kvConfElem = ET.SubElement(diskElem, "st_kv_database_conf")
    kvConfElem.attrib["type"] = dbType
    ET.SubElement(kvConfElem, dbType)

    wrapper = ExperimentWrapper(experimentXml)
    self.expWrapper = wrapper
    wrapper.setConfigPath(self.configPath)

    if not os.path.exists(self.workingDir):
        os.makedirs(self.workingDir)
def run(self, fileStore):
    """Prepare the workflow arguments for self.event and start trimBlast.

    Loads the project config and this event's experiment from the file
    store, collects the sequence IDs the subproblem needs, rewrites the
    experiment to a local temp file and registers it globally, merges the
    hal/fasta build switches from the config, and adds a
    CactusTrimmingBlastPhase child.

    Returns the promised return value (rv) of that child.
    """
    projectConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(projectConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # open up the experiment
    # note that we copy the path into the options here
    experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    configXml = ET.parse(
        fileStore.readGlobalFile(experiment.getConfigID())).getroot()

    # Input sequences: every leaf, plus the root genome when the root is
    # not reconstructed.
    tree = experiment.getTree()
    seqIDMap = dict()
    seqNames = []
    for node in tree.postOrderTraversal():
        name = tree.getName(node)
        isInput = tree.isLeaf(node) or (name == experiment.getRootGenome() and experiment.isRootReconstructed() == False)
        if not isInput:
            continue
        seqIDMap[name] = self.project.outputSequenceIDMap[name]
        seqNames.append(name)
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # take union of command line options and config options for hal and reference
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # get parameters that cactus_workflow stuff wants
    configNode = ET.parse(
        fileStore.readGlobalFile(experiment.getConfigID())).getroot()
    workFlowArgs = CactusWorkflowArguments(self.options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=seqIDMap)

    # copy over the options so we don't trail them around
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
    if self.options.intermediateResultsUrl is not None:
        # Give the URL prefix a special name for this particular
        # subproblem (by suffixing it with the name of the
        # internal node in the guide tree)
        workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

    # Use the trimming strategy to blast ingroups vs outgroups.
    trimBlast = CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast")
    finalExpWrapper = self.addChild(trimBlast).rv()
    logger.info("Going to create alignments and define the cactus tree")

    return finalExpWrapper
def run(self):
    """Wait for the ktserver, record its conf string, then queue the child.

    For the primary database the experiment wrapper of the child's workflow
    arguments doubles as the database element; for a secondary database the
    element is parsed from the child's "secondaryDatabaseString" phase
    attribute. Once blockUntilKtserverIsRunnning returns, the conf string is
    written back, the child is added and a KtserverTargetKiller is set as
    follow-on.
    """
    isPrimary = self.isSecondary == False

    if isPrimary:
        wfArgs = self.newChild.cactusWorkflowArguments
        experiment = ExperimentWrapper(wfArgs.experimentNode)
        dbElem = experiment
    else:
        dbString = self.newChild.getOptionalPhaseAttrib(
            "secondaryDatabaseString")
        assert dbString is not None
        dbElem = DbElemWrapper(ET.fromstring(dbString))

    self.logToMaster("Blocking on ktserver %s with killPath %s" % (
        ET.tostring(dbElem.getDbElem()), self.killSwitchPath))

    blockUntilKtserverIsRunnning(dbElem, self.killSwitchPath,
                                 self.blockTimeout, self.blockTimestep)

    if isPrimary:
        experiment.writeXML(wfArgs.experimentFile)
        wfArgs.cactusDiskDatabaseString = dbElem.getConfString()
    else:
        phaseAttribs = self.newChild.phaseNode.attrib
        phaseAttribs["secondaryDatabaseString"] = dbElem.getConfString()
        # added on as a hack to get this into the experiment.xml
        etPath = self.newChild.phaseNode.attrib["experimentPath"]
        experiment = ExperimentWrapper(ET.parse(etPath).getroot())
        experiment.setSecondaryDBElem(dbElem)
        experiment.writeXML(etPath)

    self.addChildTarget(self.newChild)
    self.setFollowOnTarget(KtserverTargetKiller(dbElem,
                                                self.killSwitchPath,
                                                self.killTimeout))
def run(self):
    """Align the haplotypes and the assembly unless the output already exists.

    When <outputDir>/cactusAlignment is missing, the assembly is copied
    (and gunzipped if its name ends in '.gz'), an experiment is written,
    the cactus workflow is run on a single machine, job tree stats are
    computed and everything in the local temp dir is moved to outputDir.
    A MakeStats1 child is added in every case.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(cactusAlignment):
        # Prepare the assembly: first copy it, decompressing if needed.
        isGzipped = self.assemblyFile[-3:] == '.gz'
        tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                       suffix=".gz" if isGzipped else "")
        system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
        if isGzipped:
            system("gunzip %s" % tempAssemblyFile)
            tempAssemblyFile = tempAssemblyFile[:-3]
            assert os.path.exists(tempAssemblyFile)

        # Make the supporting temporary files
        tempExperimentFile = getTempFile(rootDir=self.getLocalTempDir())
        tempJobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

        # Make the experiment file
        experiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=self.getLocalTempDir(),
            configFile=self.configFile)
        experiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the thing gets put in the right
        # directory
        experiment.setDbDir(os.path.join(self.getLocalTempDir(),
                                         experiment.getDbName()))
        experiment.writeXML(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats
        cactusAlignmentDir = os.path.join(self.getLocalTempDir(),
                                          cactusAlignmentName)
        tempJobTreeStatsFile = os.path.join(self.getLocalTempDir(),
                                            "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s" % (
            tempJobTreeDir, tempJobTreeStatsFile))

        # Now copy the true assembly back to the output
        system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
        #system("mv %s %s/config.xml" % (tempExperimentFile, self.outputDir))
        #system("mv %s %s/" % (tempJobTreeStatsFile, self.outputDir))
        #system("mv %s %s/" % (cactusAlignmentDir, self.outputDir))
        assert os.path.exists(cactusAlignment)

    # We're done!
    self.addChildTarget(
        MakeStats1(self.outputDir, cactusAlignment, self.options))
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    """Write the project XML and one experiment XML per subproblem.

    For every entry of mcProj.expMap an ExperimentWrapper is created over
    the spanning tree of the event, its children and its outgroups, and is
    written to that entry's experiment path. The root is marked as
    reconstructed unless the template already has a sequence ID for it.
    """
    if not os.path.exists(options.path):
        os.makedirs(options.path)
    projectXmlPath = os.path.join(options.path, "%s_project.xml" % options.name)
    mcProj.writeXML(projectXmlPath)

    for name, expPath in list(mcProj.expMap.items()):
        path = os.path.join(options.path, name)
        children = mcProj.entireTree.getChildNames(name)

        # Get outgroups
        outgroups = []
        useOutgroups = (configTemplate.getOutgroupStrategy() != 'none'
                        and name in mcProj.outgroup.ogMap)
        if useOutgroups:
            # Outgroup name is the first element of the ogMap tuples
            outgroups.extend([og[0] for og in mcProj.outgroup.ogMap[name]])

        subtree = mcProj.entireTree.extractSpanningTree(
            children + [name] + outgroups)
        exp = ExperimentWrapper.createExperimentWrapper(
            NXNewick().writeString(subtree),
            children + [name] + outgroups,
            databaseConf=expTemplate.confElem)
        exp.setRootGenome(name)
        exp.setOutgroupGenomes(outgroups)

        if not os.path.exists(path):
            os.makedirs(path)

        # NOTE(review): this copy of the config is not used again in this
        # function.
        config = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))

        if not expTemplate.getSequenceID(name):
            exp.setRootReconstructed(True)
        else:
            exp.setRootReconstructed(False)
            exp.setSequenceID(name, expTemplate.getSequenceID(name))
        exp.writeXML(expPath)
def getCactusWorkflowExperimentForTest(sequences, newickTreeString,
                                       outputDir, configFile=None,
                                       constraints=None, progressive=False,
                                       reconstruct=True):
    """Create a test ExperimentWrapper using the globally set database conf.

    The leaf genomes of the tree (in post-order) are paired with
    `sequences` in order and each pair is registered as a sequence ID. The
    root genome is named "reference" and is marked as reconstructed when
    `reconstruct` is true.
    """
    if _GLOBAL_DATABASE_CONF_STRING is not None:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)
    else:
        databaseConf = None

    tree = NXNewick().parseString(newickTreeString, addImpliedRoots=False)
    genomes = []
    for nodeId in tree.postOrderTraversal():
        if tree.isLeaf(nodeId):
            genomes.append(tree.getName(nodeId))

    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString, genomes, outputDir,
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=os.path.join(outputDir, "test.hal"),
        fastaFile=os.path.join(outputDir, "test.fa"),
        constraints=constraints,
        progressive=progressive)

    for pair in zip(genomes, sequences):
        print(pair)
        exp.setSequenceID(pair[0], pair[1])

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
def main():
    """Command-line entry point: set up a multi-cactus project.

    Takes an experiment XML and an output project path, validates the
    output location, the input sequence paths, the tree and any requested
    outgroups, then creates the project and its file structure.

    Returns 0 on success; raises RuntimeError on bad arguments or input.
    """
    parser = OptionParser(
        usage="usage: %prog [options] <experiment> <output project path>",
        description="Setup a multi-cactus project using an experiment xml as template")
    parser.add_option("--fixNames", dest="fixNames", default = "True",
                      help="try to make sequence and event names MAF-compliant [default=true]")
    parser.add_option("--outgroupNames", dest="outgroupNames", default = None,
                      help="comma-separated names of high quality assemblies to use as outgroups [default=everything]")
    parser.add_option("--root", dest="root", type=str,
                      help="name of alignment root (must be labeled ancestral node in tree in input experiment).  Useful "
                      "for allowing the tree to contain nodes that won't be in the alignment but can still be used for "
                      "outgroups.",
                      default=None)
    parser.add_option("--overwrite", action="store_true", help="Overwrite existing experiment files", default=False)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")

    options.expFile, outputPath = args
    options.path = os.path.abspath(outputPath)
    options.name = os.path.basename(options.path)
    # Anything other than "false" (case-insensitive) enables name fixing.
    options.fixNames = options.fixNames.lower() != "false"

    existingDir = os.path.isdir(options.path) and not options.overwrite
    if existingDir or os.path.isfile(options.path):
        raise RuntimeError("Output project path %s exists\n" % options.path)

    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    confTemplate = ConfigWrapper(
        ET.parse(expTemplate.getConfigPath()).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    checkInputSequencePaths(expTemplate)
    tree = expTemplate.getTree()

    # Check that the tree is sensible (root has at least 1 child)
    if len(tree.getChildren(tree.getRootId())) == 0:
        raise RuntimeError("Input species tree has only one node.")

    if options.outgroupNames is not None:
        projNames = {tree.getName(leaf) for leaf in tree.getLeaves()}
        options.outgroupNames = set(options.outgroupNames.split(","))
        for outgroupName in options.outgroupNames:
            if outgroupName not in projNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % outgroupName)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    # Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    outputSequences = CactusPreprocessor.getOutputSequenceFiles(
        mcProj.inputSequences, expTemplate.getOutputSequenceDir())
    expTemplate.setSequences(outputSequences)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)

    # mcProj.check()
    return 0
def loadProject(self, mcProject):
    """Build the dependency graph self.inGraph from the project experiments.

    Each experiment becomes a node. An edge is added from the experiment to
    every ingroup or outgroup of its tree that is not a leaf of the global
    tree. Every experiment's config must report the same
    maxParallelSubtrees, and the resulting graph must be acyclic.
    """
    self.inGraph = NX.DiGraph()
    self.maxParallelSubtrees = None
    globalTree = mcProject.mcTree
    leafEvents = [globalTree.getName(leaf) for leaf in globalTree.getLeaves()]

    for name, expPath in mcProject.expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)

        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            if not (tree.isLeaf(node) or nodeName in exp.getOutgroupEvents()):
                # This node is just an internal node added while
                # creating the induced tree from the species
                # tree. None of the sequence is used, so skip it.
                continue

            assert tree.hasParent(node)

            # A node that is neither an outgroup nor a direct child of
            # this experiment's event was only added to make the species
            # tree binary, so it is skipped. (Hopefully this will be
            # unnecessary in the future.)
            if nodeName in exp.getOutgroupEvents() or tree.getName(tree.getParent(node)) == name:
                # we don't add edges for leaves (in the global tree)
                # as they are input sequences and do not form dependencies
                # (it would be clever to maybe do the same with existing
                # references when --overwrite is not specified but for now
                # we just do the leaves)
                if nodeName not in leafEvents:
                    self.inGraph.add_edge(name, nodeName)

        # load max parallel subtrees from the node's config
        conf = ConfigWrapper(ET.parse(exp.getConfig()).getroot())
        if self.maxParallelSubtrees is not None:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        else:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)
def runCreateMultiCactusProject(expFile, projectFile, fixNames=False,
                                outgroupNames=None, root=None,
                                overwrite=False):
    """Create a multi-cactus project from an experiment template file.

    Every requested outgroup name must be a leaf of the template's tree;
    otherwise RuntimeError is raised. The project is then created and its
    file structure is written.
    """
    options = CreateMultiCactusProjectOptions(expFile, projectFile,
                                              fixNames=fixNames,
                                              outgroupNames=outgroupNames,
                                              root=root,
                                              overwrite=overwrite)

    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    confTemplate = ConfigWrapper(
        ET.parse(expTemplate.getConfigPath()).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    tree = expTemplate.getTree()

    if options.outgroupNames is not None:
        options.outgroupNames = set(options.outgroupNames)
        leafNames = {tree.getName(leaf) for leaf in tree.getLeaves()}
        for outgroupName in options.outgroupNames:
            if outgroupName not in leafNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % outgroupName)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    # Replace the sequences with output sequences
    sequenceMap = expTemplate.buildSequenceMap()
    expTemplate.updateTree(mcProj.mcTree, sequenceMap)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
def syncToFileStore(self, toil):
    """Import every experiment and its config into the toil file store.

    Each experiment XML is rewritten in place with the ID of its imported
    config; the ID of the imported experiment is stored in self.expIDMap
    under the event name.
    """
    self.expIDMap = dict()
    for name, expPath in list(self.expMap.items()):
        experiment = ExperimentWrapper(ET.parse(expPath).getroot())
        configID = toil.importFile("file://" + experiment.getConfigPath())
        experiment.setConfigID(configID)
        experiment.writeXML(expPath)
        self.expIDMap[name] = toil.importFile("file://" + expPath)
def loadProject(self, mcProject, fileStore = None):
    """Build the dependency graph self.inGraph from the project experiments.

    With a file store, experiments and configs are read through it using
    the project's IDs; without one, the local paths are used. An edge is
    added from each experiment to every leaf of its own tree that is not a
    leaf of the global tree. All configs must agree on maxParallelSubtrees
    and the graph must be acyclic.
    """
    self.inGraph = NX.DiGraph()
    self.maxParallelSubtrees = None
    globalTree = mcProject.mcTree
    leafEvents = [globalTree.getName(leaf) for leaf in globalTree.getLeaves()]

    if fileStore:
        expMap = {name: fileStore.readGlobalFile(mcProject.expIDMap[name])
                  for name in mcProject.expIDMap}
    else:
        expMap = mcProject.expMap

    for name, expPath in list(expMap.items()):
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)

        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            # we don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves)
            if nodeName in leafEvents:
                continue
            if tree.isLeaf(node):
                self.inGraph.add_edge(name, nodeName)

        if not fileStore:
            # hack from running from cactus-prepare
            configFile = exp.getConfigPath()
        else:
            configFile = fileStore.readGlobalFile(exp.getConfigID())
        conf = ConfigWrapper(ET.parse(configFile).getroot())

        # load max parallel subtrees from the node's config
        if self.maxParallelSubtrees is not None:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        else:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()

    assert NX.is_directed_acyclic_graph(self.inGraph)
def processExperiment(self):
    """Build self.expWrapper, apply the ktserver options, prepare sequenceData.

    For kyoto_tycoon the port, optional host, storage mode (options.ktType:
    'memory', 'snapshot' or 'disk') and optional tuning strings are copied
    onto the wrapper. The <workingDir>/sequenceData directory is recreated
    when options.overwrite is set, created when missing, and registered as
    the output sequence directory.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source; the tuning options are assumed to apply only to kyoto_tycoon.
    """
    experimentXml = self.seqFile.toXMLElement()
    dbType = self.options.database
    # create the cactus disk
    diskElem = ET.SubElement(experimentXml, "cactus_disk")
    assert dbType in ("kyoto_tycoon", "tokyo_cabinet")
    kvConfElem = ET.SubElement(diskElem, "st_kv_database_conf")
    kvConfElem.attrib["type"] = dbType
    ET.SubElement(kvConfElem, dbType)
    self.expWrapper = ExperimentWrapper(experimentXml)

    if self.options.database == "kyoto_tycoon":
        self.expWrapper.setDbPort(str(self.options.ktPort))
        if self.options.ktHost is not None:
            self.expWrapper.setDbHost(self.options.ktHost)

        # Translate the storage mode into the (in memory, snapshot) pair.
        if self.options.ktType == 'memory':
            inMemory, snapshot = True, False
        elif self.options.ktType == 'snapshot':
            inMemory, snapshot = True, True
        else:
            assert self.options.ktType == 'disk'
            inMemory, snapshot = False, False
        self.expWrapper.setDbInMemory(inMemory)
        self.expWrapper.setDbSnapshot(snapshot)

        # sonlib doesn't allow for spaces in attributes in the db conf
        # which renders this options useless
        # if self.options.ktOpts is not None:
        #     self.expWrapper.setDbServerOptions(self.options.ktOpts)
        if self.options.ktCreateTuning is not None:
            self.expWrapper.setDbCreateTuningOptions(
                self.options.ktCreateTuning)
        if self.options.ktOpenTuning is not None:
            self.expWrapper.setDbReadTuningOptions(
                self.options.ktOpenTuning)

    # set the sequence output directory
    outSeqDir = os.path.join(self.workingDir, "sequenceData")
    if os.path.exists(outSeqDir) and self.options.overwrite:
        system("rm -rf %s" % outSeqDir)
    if not os.path.exists(outSeqDir):
        system("mkdir %s" % outSeqDir)
    self.expWrapper.setOutputSequenceDir(
        os.path.join(self.workingDir, "sequenceData"))
def getCactusWorkflowExperimentForTest(sequences, newickTreeString,
                                       outputDir, configFile=None,
                                       constraints=None, progressive=False):
    """Create a test ExperimentWrapper using the globally set database conf.

    The HAL and FASTA outputs are placed at <outputDir>/test.hal and
    <outputDir>/test.fa.
    """
    extraArgs = dict(
        databaseConf=_GLOBAL_DATABASE_CONF_STRING,
        configFile=configFile,
        halFile=os.path.join(outputDir, "test.hal"),
        fastaFile=os.path.join(outputDir, "test.fa"),
        constraints=constraints,
        progressive=progressive)
    return ExperimentWrapper.createExperimentWrapper(
        sequences, newickTreeString, outputDir, **extraArgs)
def testOutgroups(self):
    """Check that added outgroup sequences show up in the experiment.

    Two outgroups are added to a freshly built experiment; both must be
    reported by ``getOutgroupEvents`` (in insertion order) and mapped to
    their FASTA paths by ``buildSequenceMap``.
    """
    exp = ExperimentWrapper(self.__makeXmlDummy(self.tree, self.sequences))
    assert NXNewick().writeString(exp.getTree()) == self.tree

    # Positional arguments for each addOutgroupSequence call.
    outgroupArgs = (
        ("outgroup", 1.3, "outgroup.fa"),
        ("outgroup2", 2.6, "outgroup2.fa"),
    )
    for args in outgroupArgs:
        exp.addOutgroupSequence(*args)
    assert exp.getOutgroupEvents() == ["outgroup", "outgroup2"]

    seqMap = exp.buildSequenceMap()
    for args in outgroupArgs:
        eventName, fastaPath = args[0], args[2]
        assert eventName in seqMap
        assert seqMap[eventName] == fastaPath
def loadProject(self, mcProject, fileStore = None):
    """Build the dependency graph ``self.inGraph`` from a multi-cactus project.

    One node is added per experiment in the project. For every node of an
    experiment's tree, an edge ``experiment -> nodeName`` is added when the
    node is not a leaf of the global tree and appears in the experiment's
    sequence map. ``self.maxParallelSubtrees`` is read from the experiment
    configs, which must all agree.

    mcProject: project providing ``mcTree`` and ``expIDMap`` / ``expMap``.
    fileStore: optional file store. When given, experiment and config
        files are fetched with ``readGlobalFile``; otherwise the paths in
        ``mcProject.expMap`` and ``exp.getConfigPath()`` are used.

    Raises AssertionError if the configs disagree on the maximum number of
    parallel subtrees or if the resulting graph is not a DAG.
    """
    self.inGraph = NX.DiGraph()
    globTree = mcProject.mcTree
    self.maxParallelSubtrees = None
    # A set gives constant-time membership tests in the inner loop below.
    leafEvents = set(globTree.getName(i) for i in globTree.getLeaves())

    if fileStore:
        expMap = dict()
        for name in mcProject.expIDMap:
            expMap[name] = fileStore.readGlobalFile(mcProject.expIDMap[name])
    else:
        expMap = mcProject.expMap

    for name, expPath in expMap.items():
        exp = ExperimentWrapper(ET.parse(expPath).getroot())
        tree = exp.getTree()
        self.inGraph.add_node(name)
        # Go through the species tree and add the correct
        # dependencies (i.e. to the outgroups and the ingroups,
        # but not to the other nodes that are just there because
        # they are needed to form the correct paths).
        for node in tree.postOrderTraversal():
            nodeName = tree.getName(node)
            # We don't add edges for leaves (in the global tree)
            # as they are input sequences and do not form dependencies
            # (it would be clever to maybe do the same with existing
            # references when --overwrite is not specified but for now
            # we just do the leaves).
            if nodeName not in leafEvents and nodeName in exp.getSequenceMap():
                self.inGraph.add_edge(name, nodeName)

        # Without a file store there is nothing to download from: fall
        # back to the config path recorded in the experiment instead of
        # dereferencing None.
        if fileStore:
            configFile = fileStore.readGlobalFile(exp.getConfigID())
        else:
            configFile = exp.getConfigPath()
        configElem = ET.parse(configFile).getroot()
        conf = ConfigWrapper(configElem)
        # Load max parallel subtrees from the node's config; every
        # experiment has to agree on the value.
        if self.maxParallelSubtrees is None:
            self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
        else:
            assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
    assert NX.is_directed_acyclic_graph(self.inGraph)
def run(self, fileStore):
    """Record finished dependencies, then schedule this event's jobs.

    Loads the project config, copies the experiment ID and reference ID
    of every dependency experiment that already has a reference into
    ``self.project``, adds a ``ProgressiveUp`` child unless the event is
    virtual in the schedule, and returns the promise of a
    ``ProgressiveOut`` follow-on.
    """
    projectConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(projectConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    fileStore.logToMaster("Project has %i dependencies" % len(self.depProjects))
    for projName in self.depProjects:
        expIDMap = self.depProjects[projName].expIDMap
        for expName in expIDMap:
            expID = expIDMap[expName]
            expRoot = ET.parse(fileStore.readGlobalFile(expID)).getroot()
            experiment = ExperimentWrapper(expRoot)
            fileStore.logToMaster("Reference ID for experiment %s: %s"
                                  % (expName, experiment.getReferenceID()))
            if not experiment.getReferenceID():
                # No reference produced for this experiment: nothing to record.
                continue
            self.project.expIDMap[expName] = expID
            self.project.outputSequenceIDMap[expName] = experiment.getReferenceID()

    eventExpWrapper = None
    logger.info("Progressive Next: " + self.event)
    if not self.schedule.isVirtual(self.event):
        upJob = ProgressiveUp(self.options, self.project, self.event,
                              memory=self.configWrapper.getDefaultMemory())
        eventExpWrapper = self.addChild(upJob).rv()
    outJob = ProgressiveOut(self.options, self.project, self.event,
                            eventExpWrapper, self.schedule,
                            memory=self.configWrapper.getDefaultMemory())
    return self.addFollowOn(outJob).rv()
def run(self, fileStore):
    """Prepare the workflow arguments for this event and start its alignment.

    Loads the project config and this event's experiment, maps each leaf
    of the experiment tree to its sequence ID from
    ``self.project.outputSequenceIDMap``, rewrites the experiment to a
    local temp file and records its global file ID in
    ``self.options.experimentFileID``, merges the reference/HAL/FASTA
    build flags from the config into the options, and returns the promise
    of a ``CactusTrimmingBlastPhase`` child job.
    """
    projectConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
    self.configNode = ET.parse(projectConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # Open up the experiment for this event and the config it points to.
    experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
    configPath = fileStore.readGlobalFile(experiment.getConfigID())
    configXml = ET.parse(configPath).getroot()

    # Collect the sequence IDs for the leaves of the experiment tree.
    seqIDMap = {}
    seqNames = []
    tree = experiment.getTree()
    for node in tree.postOrderTraversal():
        if not tree.isLeaf(node):
            continue
        leafName = tree.getName(node)
        seqIDMap[leafName] = self.project.outputSequenceIDMap[leafName]
        seqNames.append(leafName)
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    # Write the experiment out again and publish it to the file store.
    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # Take the union of command line options and config options for
    # hal and reference.
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(
            refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(
            halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(
            halNode, "buildFasta", bool, False)

    # Get the parameters that the cactus_workflow code wants.
    configFile = fileStore.readGlobalFile(experiment.getConfigID())
    configNode = ET.parse(configFile).getroot()
    workFlowArgs = CactusWorkflowArguments(self.options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=seqIDMap)

    # Copy the options over so we don't have to trail them around.
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
    if self.options.intermediateResultsUrl is not None:
        # Give the URL prefix a special name for this particular
        # subproblem by suffixing it with this job's event name.
        workFlowArgs.intermediateResultsUrl = (
            self.options.intermediateResultsUrl + '-' + self.event)

    # Use the trimming strategy to blast ingroups vs outgroups.
    trimBlastJob = CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                            phaseName="trimBlast")
    finalExpWrapper = self.addChild(trimBlastJob).rv()
    logger.info("Going to create alignments and define the cactus tree")
    return finalExpWrapper
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildHal, buildFasta, toilStats, subtreeRoot=None,
                        logLevel=None):
    """Run progressive cactus on the experiment described by ``experimentFile``.

    Writes a temporary sequence file (the experiment's newick tree on the
    first line, then one ``genome sequenceID`` line per genome that has a
    sequence) and passes it, with the experiment's config path, to
    ``runCactusProgressive``. ``buildHal``, ``buildFasta`` and
    ``subtreeRoot`` are accepted but not used here.
    """
    expWrapper = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqFile = getTempFile()
    with open(seqFile, 'w') as handle:
        newickString = NXNewick().writeString(expWrapper.getTree())
        handle.write('%s\n' % newickString)
        for genome in expWrapper.getGenomesWithSequence():
            seqID = expWrapper.getSequenceID(genome)
            handle.write('%s %s\n' % (genome, seqID))

    # The sequence file is closed (and flushed) before cactus reads it.
    runCactusProgressive(seqFile,
                         expWrapper.getConfigPath(),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats,
                         logLevel=logLevel)
def progressiveWithSubtreeRootFunction(self, experimentFile, toilDir,
                                       batchSystem, buildAvgs, buildHal,
                                       buildFasta, toilStats, logLevel=None):
    """Run the progressive alignment on a randomly chosen subtree.

    Picks an arbitrary named internal node (never the species tree root)
    as the subtree root and delegates to ``progressiveFunction``. This
    function is necessary to keep runWorkflow_multipleExamples general
    (specifying a subtree root doesn't make sense for runCactusWorkflow).
    """
    tree = ExperimentWrapper(ET.parse(experimentFile).getroot()).getTree()

    # Candidate roots: named, internal, and with a parent (i.e. not the
    # root of the whole species tree).
    candidateRoots = [
        tree.getName(node)
        for node in tree.postOrderTraversal()
        if tree.hasName(node) and not tree.isLeaf(node) and tree.hasParent(node)
    ]
    subtreeRoot = random.choice(candidateRoots)

    self.progressiveFunction(experimentFile, toilDir, batchSystem, buildAvgs,
                             buildHal, buildFasta, toilStats, subtreeRoot,
                             logLevel=logLevel)
def run(self):
    """Build the cactus alignment for the assembly if needed, then queue stats.

    When ``<outputDir>/cactusAlignment`` does not exist yet, the assembly
    (gunzipped if its name ends in ``.gz``) is copied into the local temp
    directory, an experiment file is written, the cactus workflow is run
    there, and the temp directory's contents are moved into
    ``self.outputDir``. A ``MakeStats1`` child target is added in every case.
    """
    alignmentName = "cactusAlignment"
    alignmentPath = os.path.join(self.outputDir, alignmentName)

    if not os.path.exists(alignmentPath):
        # Prepare the assembly: copy it, decompressing when gzipped.
        isGzipped = self.assemblyFile.endswith('.gz')
        localAssembly = getTempFile(rootDir=self.getLocalTempDir(),
                                    suffix=".gz" if isGzipped else "")
        system("cp %s %s" % (self.assemblyFile, localAssembly))
        if isGzipped:
            system("gunzip %s" % localAssembly)
            localAssembly = localAssembly[:-3]
            assert os.path.exists(localAssembly)

        # Supporting temporary files.
        experimentPath = getTempFile(rootDir=self.getLocalTempDir())
        jobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

        # Make the experiment file.
        workflowExperiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [localAssembly],
            newickTreeString=self.newickTree,
            outputDir=self.getLocalTempDir(),
            configFile=self.configFile)
        workflowExperiment.setDbName(alignmentName)
        # The db dir needs to be set to ensure the database gets put in
        # the right directory.
        workflowExperiment.setDbDir(
            os.path.join(self.getLocalTempDir(), workflowExperiment.getDbName()))
        workflowExperiment.writeXML(experimentPath)

        # Run the cactus workflow.
        runCactusWorkflow(experimentFile=experimentPath,
                          jobTreeDir=jobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check that the job tree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(jobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the job tree stats.
        statsPath = os.path.join(self.getLocalTempDir(), "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, statsPath))

        # Move everything produced in the temp dir back to the output dir.
        system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
        assert os.path.exists(alignmentPath)

    # We're done: hand over to the stats target.
    self.addChildTarget(MakeStats1(self.outputDir, alignmentPath, self.options))
def processExperiment(self):
    """Assemble the experiment XML and store its wrapper on the instance.

    The XML produced by ``self.seqFile.toXMLElement()`` gains a
    ``cactus_disk`` section describing the database chosen in
    ``self.options.database``. The wrapper is kept in ``self.expWrapper``,
    Kyoto Tycoon settings are applied when that database is in use, and
    the ``sequenceData`` directory under ``self.workingDir`` is created
    (after being removed first when ``overwrite`` is set).
    """
    expXml = self.seqFile.toXMLElement()

    # Create the cactus disk description.
    diskElem = ET.SubElement(expXml, "cactus_disk")
    database = self.options.database
    assert database in ("kyoto_tycoon", "tokyo_cabinet")
    confElem = ET.SubElement(diskElem, "st_kv_database_conf")
    confElem.attrib["type"] = database
    ET.SubElement(confElem, database)

    self.expWrapper = ExperimentWrapper(expXml)
    wrapper = self.expWrapper

    if self.options.database == "kyoto_tycoon":
        wrapper.setDbPort(str(self.options.ktPort))
        if self.options.ktHost is not None:
            wrapper.setDbHost(self.options.ktHost)

        # Map the ktType option onto (in-memory, snapshot) flags.
        ktType = self.options.ktType
        if ktType == 'memory':
            inMemory, snapshot = True, False
        elif ktType == 'snapshot':
            inMemory, snapshot = True, True
        else:
            assert ktType == 'disk'
            inMemory, snapshot = False, False
        wrapper.setDbInMemory(inMemory)
        wrapper.setDbSnapshot(snapshot)

        # ktOpts is intentionally not forwarded: sonlib doesn't allow
        # spaces in attributes in the db conf, which makes that option
        # useless here.
        if self.options.ktCreateTuning is not None:
            wrapper.setDbCreateTuningOptions(self.options.ktCreateTuning)
        if self.options.ktOpenTuning is not None:
            wrapper.setDbReadTuningOptions(self.options.ktOpenTuning)

    # Set the sequence output directory.
    outSeqDir = os.path.join(self.workingDir, "sequenceData")
    if os.path.exists(outSeqDir) and self.options.overwrite:
        system("rm -rf %s" % outSeqDir)
    if not os.path.exists(outSeqDir):
        system("mkdir %s" % outSeqDir)
    wrapper.setOutputSequenceDir(outSeqDir)
def getCactusWorkflowExperimentForTest(sequences, newickTreeString, outputDir,
                                       configFile=None, constraints=None,
                                       progressive=False):
    """Create an experiment wrapper for tests using the global database conf.

    Thin wrapper around ``ExperimentWrapper.createExperimentWrapper``.
    When ``_GLOBAL_DATABASE_CONF_STRING`` is set it is parsed into an XML
    element and passed as the database configuration; otherwise ``None``
    is passed. HAL and FASTA outputs are placed at ``test.hal`` and
    ``test.fa`` inside ``outputDir``.
    """
    halFile = os.path.join(outputDir, "test.hal")
    fastaFile = os.path.join(outputDir, "test.fa")

    if _GLOBAL_DATABASE_CONF_STRING is not None:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)
    else:
        databaseConf = None

    return ExperimentWrapper.createExperimentWrapper(
        sequences,
        newickTreeString,
        outputDir,
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=halFile,
        fastaFile=fastaFile,
        constraints=constraints,
        progressive=progressive)
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, toilStats,
                        subtreeRoot=None):
    """Run progressive cactus and verify the sequence headers in its output.

    Creates a multi-cactus project in a temporary directory, runs the
    progressive aligner on it, and then checks that every sequence header
    found in the ``.c2h`` outputs for a non-outgroup input genome also
    occurs in that genome's input FASTA. The temporary directory is
    removed at the end of a successful run.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    projectPath = os.path.join(tempExperimentDir, "exp_project.xml")
    runCactusProgressive(projectPath, toilDir, batchSystem=batchSystem,
                         buildAvgs=buildAvgs, toilStats=toilStats)

    # Check that the headers and sequences in the output are the same as
    # the sequences in the input (minus differences in repeat-masking).
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()

    # Genome name -> list of headers found in its input FASTA.
    headers = {}
    for genomeName, fastaPath in seqMap.items():
        if os.path.isdir(fastaPath):
            # Some "input sequence paths" are actually directories
            # containing multiple FASTAs; concatenate them first.
            mergedPath = getTempFile()
            system("cat %s/* > %s" % (fastaPath, mergedPath))
            fastaPath = mergedPath
        headers[genomeName] = [record[0] for record in fastaRead(fastaPath)]

    # Check the headers inside each .c2h output.
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as c2hFile:
            for line in c2hFile:
                fields = line.split('\t')
                if fields[0] != 's':
                    # Not a sequence line.
                    continue
                # Drop the first and last character of each field.
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome not in headers or genome in outgroups:
                    # Only input (non-outgroup) genomes are checked.
                    continue
                self.assertTrue(
                    header in headers[genome],
                    'Header %s from output c2h %s not found in input fa %s'
                    ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
class ProjectWrapper:
    """Builds the config, experiment template and project for a progressive run.

    On construction the config (``self.configWrapper``) and experiment
    (``self.expWrapper``) are prepared from the command line options and
    the sequence file; ``writeXml`` then writes them into the working
    directory and creates (or validates) the multi-cactus project there.
    """
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML and apply the relevant command line options."""
        # Read in the default right out of cactus unless a config was given.
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # Here we can go through the options and apply some to the config.
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # Pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it. Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
                self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # Floor division keeps the value an integer under both
                # classic and true division.
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, maxThreads // 3))

        # This is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # sys.maxint only exists on Python 2; fall back to sys.maxsize.
            self.configWrapper.setSubtreeSize(
                getattr(sys, "maxint", sys.maxsize))

    def processExperiment(self):
        """Create the experiment XML, wrap it and prepare the sequence dir."""
        expXml = self.seqFile.toXMLElement()
        # Create the cactus disk.
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # ktOpts is intentionally not forwarded: sonlib doesn't allow
            # for spaces in attributes in the db conf, which renders that
            # option useless.
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # Set the sequence output directory.
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def _createProjectCommand(self, expPath, projPath, fixNames):
        """Return the cactus_createMultiCactusProject.py command line."""
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDist:
            cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
            cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
        return cmd

    def writeXml(self):
        """Write config and experiment XML, then create or reuse the project.

        Raises RuntimeError when a project already exists in the working
        directory, ``overwrite`` is off, and the existing project is not
        compatible with the current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        fixNames = 1 if self.options.outputMaf is True else 0

        if not os.path.exists(projPath):
            system(self._createProjectCommand(expPath, projPath, fixNames))
            return

        if not self.isSameAsExisting(expPath, projPath, fixNames):
            raise RuntimeError("Existing project %s not " % projPath +
                               "compatible with current input. Please "
                               "erase the working directory or rerun "
                               "with the --overwrite option to start "
                               "from scratch.")
        logPath = os.path.join(self.workingDir, 'cactus.log')
        # The with-block closes the log even if the write fails.
        with open(logPath, "a") as logFile:
            logFile.write("\nContinuing existing alignment. Use "
                          "--overwrite or erase the working directory to "
                          "force restart from scratch.\n")

    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a freshly generated project matches the existing one.

        A project is created in a temporary sibling directory and its
        project XML is compared line by line (after mapping the temporary
        path back to the existing one) with the existing project XML. This
        decides whether to start fresh or keep working with the existing
        project when the overwrite flag is off. The temporary directory is
        removed before returning.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        system(self._createProjectCommand(expPath, tempPath, fixNames))

        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)

        # Read both files through context managers so the handles are
        # closed deterministically.
        with open(projFilePathNew, "r") as newHandle:
            newFile = newHandle.readlines()
        with open(projFilePathOld, "r") as oldHandle:
            oldFile = oldHandle.readlines()

        areSame = len(newFile) == len(oldFile) and all(
            newLine.replace(tempPath, oldPath) == oldLine
            for newLine, oldLine in zip(newFile, oldFile))
        system("rm -rf %s" % tempPath)
        return areSame
def main():
    """Append every experiment of a cactus project to a HAL file.

    Walks the project tree breadth-first (optionally starting at the
    ``event`` given on the command line) and runs
    ``halAppendCactusSubtree`` once for each genome that has an
    experiment. Unless ``append`` is set, an existing HAL file is removed
    first. Each command line and the final timing summary are printed.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        removeCmd = 'rm -f {0}'.format(args['HAL_file_path'])
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    startTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options forwarded as "--name value" when they were given.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experimentFilePath = myProj.expMap[genomeName]
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())
        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree '{0}' '{1}' '{2}' '{3}'".format(
            experiment.getHALPath(), experiment.getHALFastaPath(),
            expTreeString, args['HAL_file_path'])
        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for optionName in valueOptions:
            if args[optionName] is not None:
                cmdline += " --{0} {1}".format(optionName, args[optionName])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(cmdline)
        appendStart = time.time()
        system(cmdline)
        totalAppendTime += time.time() - appendStart

    totalTime = time.time() - startTime
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
def run(self):
    """Set up and launch the cactus workflow for this event.

    Loads the event's experiment and config, checks that enough
    threads/cpus are available when kyoto tycoon is used, merges the
    reference/HAL/FASTA build flags from the config into the options and,
    unless the event is already done and overwrite is disabled, removes
    stale outputs and adds the first workflow phase as a child target. A
    ``FinishUp`` follow-on is always set.

    Raises RuntimeError when too few threads (singleMachine) or cpus
    (other batch systems) are configured for kyoto tycoon.
    """
    logger.info("Progressive Up: " + self.event)

    # Open up the experiment; note that the experiment path is copied
    # into the options here.
    self.options.experimentFile = self.project.expMap[self.event]
    expXml = ET.parse(self.options.experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configXml = ET.parse(experiment.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configXml)

    # Need at least 3 processes for every event when using ktserver:
    # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
    if experiment.getDbType() == "kyoto_tycoon":
        maxParallel = min(len(self.project.expMap),
                          configWrapper.getMaxParallelSubtrees())
        required = maxParallel * 3
        if self.options.batchSystem == "singleMachine":
            # maxThreads may arrive as a string; convert once so that the
            # %d in the message cannot fail with a TypeError.
            maxThreads = int(self.options.maxThreads)
            if maxThreads < required:
                raise RuntimeError(
                    "At least %d threads are required (only %d were specified) "
                    "to handle up to %d events using kyoto tycoon. Either "
                    "increase the number of threads using the --maxThreads "
                    "option or decrease the number of parallel jobs "
                    "(currently %d) by adjusting max_parallel_subtrees in "
                    "the config file"
                    % (required, maxThreads, maxParallel,
                       configWrapper.getMaxParallelSubtrees()))
        else:
            if int(self.options.maxCpus) < required:
                raise RuntimeError(
                    "At least %d concurrent cpus are required to handle up "
                    "to %d events using kyoto tycoon. Either increase the "
                    "number of cpus using the --maxCpus option or decrease "
                    "the number of parallel jobs (currently %d) by adjusting "
                    "max_parallel_subtrees in the config file"
                    % (required, maxParallel,
                       configWrapper.getMaxParallelSubtrees()))

    # Take union of command line options and config options for hal and
    # reference.
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(
            refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(
            halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(
            halNode, "buildFasta", bool, False)

    # Get parameters that cactus_workflow stuff wants.
    workFlowArgs = CactusWorkflowArguments(self.options)
    # Copy over the options so we don't trail them around.
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.overwrite = self.options.overwrite
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet

    # From here on use the experiment as seen by the workflow arguments.
    experiment = ExperimentWrapper(workFlowArgs.experimentNode)

    donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
    doneDone = os.path.isfile(donePath)
    refDone = (not workFlowArgs.buildReference
               or os.path.isfile(experiment.getReferencePath()))
    halDone = (not workFlowArgs.buildHal
               or (os.path.isfile(experiment.getHALFastaPath())
                   and os.path.isfile(experiment.getHALPath())))

    if not workFlowArgs.overwrite and doneDone and refDone and halDone:
        self.logToMaster("Skipping %s because it is already done and overwrite is disabled" % self.event)
    else:
        system("rm -f %s" % donePath)
        # Delete the database, the sequences and the reference left over
        # from a previous run.
        dbPath = os.path.join(experiment.getDbDir(), experiment.getDbName())
        seqPath = os.path.join(experiment.getDbDir(), "sequences")
        system("rm -f %s* %s %s" % (dbPath, seqPath, experiment.getReferencePath()))

        if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
            # Use the trimming strategy to blast ingroups vs outgroups.
            self.addChildTarget(CactusTrimmingBlastPhase(
                cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
        else:
            self.addChildTarget(CactusSetupPhase(
                cactusWorkflowArguments=workFlowArgs, phaseName="setup"))

    logger.info("Going to create alignments and define the cactus tree")
    self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
class TestCase(unittest.TestCase):
    """Tests for ExperimentWrapper built on a minimal dummy experiment XML."""

    def setUp(self):
        """Create an experiment with a nine-genome tree rooted at anc1."""
        unittest.TestCase.setUp(self)
        newickString = (
            '((((HUMAN:0.006969,CHIMP:0.009727)anc2:0.025291,BABOON:0.044568)anc1:0.11,(MOUSE:0.072818,RAT:0.081244):0.260342):0.02326,((DOG:0.07,CAT:0.07):0.087381,(PIG:0.06,COW:0.06):0.104728):0.04);'
        )
        self.tree = NXNewick().parseString(newickString)
        self.xmlRoot = self.__makeXmlDummy()
        self.exp = ExperimentWrapper(self.xmlRoot)
        self.exp.setTree(self.tree)
        self.seqMap = {
            'HUMAN': 'human.txt',
            'CHIMP': 'chimp.txt',
            'BABOON': 'baboon.txt',
            'MOUSE': 'mouse.txt',
            'RAT': 'rat.txt',
            'DOG': 'dog.txt',
            'CAT': 'cat.txt',
            'PIG': 'pig.txt',
            'COW': 'cow.txt'
        }
        self.exp.setRootGenome('anc1')
        self.exp.setRootReconstructed(True)
        outgroupGenomes = ['MOUSE', 'RAT', 'DOG', 'CAT', 'PIG', 'COW']
        self.exp.setOutgroupGenomes(outgroupGenomes)
        # These aren't real IDs, but should still work for our purposes.
        for genome, seq in self.seqMap.items():
            self.exp.setSequenceID(genome, seq)

    def testGetSequencePath(self):
        """Every genome set in setUp reports its ID; unknown genomes give None."""
        for genome, expectedSeq in self.seqMap.items():
            self.assertEqual(self.exp.getSequenceID(genome), expectedSeq)
        # Should not be any entries for genomes not in the tree
        self.assertEqual(self.exp.getSequenceID('DUCK'), None)

    def testChangingSequencePaths(self):
        """Tests that changing a sequence path persists correctly."""
        newID = 'human2.txt'
        self.exp.setSequenceID('HUMAN', newID)
        self.assertEqual(self.exp.getSequenceID('HUMAN'), newID)
        # Reload the wrapper from the same XML and check again.
        self.exp = ExperimentWrapper(self.xmlRoot)
        self.assertEqual(self.exp.getSequenceID('HUMAN'), newID)

    def testOutgroups(self):
        """Outgroup genomes can be read back and cleared."""
        expected = set(['MOUSE', 'RAT', 'DOG', 'CAT', 'PIG', 'COW'])
        self.assertEqual(set(self.exp.getOutgroupGenomes()), expected)
        self.exp.setOutgroupGenomes([])
        self.assertEqual(self.exp.getOutgroupGenomes(), [])

    def testRootGenome(self):
        """The root genome can be read back and changed."""
        self.assertEqual(self.exp.getRootGenome(), 'anc1')
        self.exp.setRootGenome('anc2')
        self.assertEqual(self.exp.getRootGenome(), 'anc2')

    def testSetTree(self):
        """Replacing the tree keeps sequences only for genomes still present."""
        # A modified version, with fewer genomes and a new one
        smallerTree = NXNewick().parseString(
            '((HUMAN:0.006969,CHIMP:0.009727):0.025291,BABOON:0.044568,ARMADILLO:1.0);'
        )
        self.exp.setTree(smallerTree)
        remaining = set(self.exp.getGenomesWithSequence())
        self.assertEqual(remaining, set(['HUMAN', 'CHIMP', 'BABOON']))

    def __makeXmlDummy(self):
        """Return a <dummy> root element containing only a cactus_disk."""
        rootElem = ET.Element("dummy")
        rootElem.append(self.__makeDiskElem())
        return rootElem

    def __makeDiskElem(self):
        """Return a cactus_disk element configured for kyoto_tycoon."""
        diskElem = ET.Element("cactus_disk")
        confElem = ET.SubElement(diskElem, "st_kv_database_conf",
                                 {'type': 'kyoto_tycoon'})
        ET.SubElement(confElem, 'kyoto_tycoon')
        return diskElem
class ProjectWrapper:
    """Prepares a progressive cactus project inside a working directory.

    Construction builds a ConfigWrapper (from ``options.configFile`` or the
    default ``cactus_progressive_config.xml``) and an ExperimentWrapper
    (from ``seqFile.toXMLElement()``). ``writeXml`` then writes both to the
    working directory and creates, or validates, the project directory.
    """
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        """Load the config XML and apply command-line options to it."""
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            maxThreads = int(self.options.maxThreads)
            if maxThreads < self.configWrapper.getMaxParallelSubtrees() * 3:
                # Floor division keeps the value an int on Python 3 as well
                # (plain "/" would hand a float to the config).
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, maxThreads // 3))
        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            # sys.maxint only exists on Python 2; fall back to sys.maxsize.
            self.configWrapper.setSubtreeSize(
                getattr(sys, "maxint", sys.maxsize))

    def processExperiment(self):
        """Build the experiment XML (cactus_disk + database settings) and
        make sure the sequence output directory exists."""
        expXml = self.seqFile.toXMLElement()
        # create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        # The child element is created for its presence in the XML only.
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders the ktOpts option useless, so it is not applied.
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        # set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system('rm -rf "%s"' % outSeqDir)
        if not os.path.exists(outSeqDir):
            system('mkdir "%s"' % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def writeXml(self):
        """Write config.xml and expTemplate.xml, then create the project.

        If the project directory already exists (and --overwrite was not
        given) it is reused only when it matches the current input.

        Raises:
            RuntimeError: an existing project is incompatible with the
                current input.
        """
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system('rm -rf "%s"' % projPath)
        if self.options.outputMaf is True:
            fixNames = 1
        else:
            fixNames = 0
        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath +
                                   "compatible with current input.  Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            logPath = os.path.join(self.workingDir, 'cactus.log')
            with open(logPath, "a") as logFile:
                logFile.write("\nContinuing existing alignment.  Use "
                              "--overwrite or erase the working directory to "
                              "force restart from scratch.\n")
        else:
            system(self._createProjectCmd(expPath, projPath, fixNames))

    def _createProjectCmd(self, expPath, projPath, fixNames):
        """Return the cactus_createMultiCactusProject.py command line.

        Shared by writeXml and isSameAsExisting so both create projects
        with identical (and identically quoted) arguments.
        """
        cmd = "cactus_createMultiCactusProject.py \"%s\" \"%s\" --fixNames=%d" % (
            expPath, projPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDists:
            cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
            cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
        if self.options.root is not None:
            cmd += " --root %s" % self.options.root
        return cmd

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        """Return True if a freshly generated project XML matches the one
        already in projPath (ignoring the temp-vs-real directory name).

        Returns False immediately when projPath does not exist.
        """
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        # Fix for relative directories
        if oldPath[0:2] == './':
            oldPath = oldPath[2:]
        if tempPath[0:2] == './':
            tempPath = tempPath[2:]
        if os.path.exists(tempPath):
            system('rm -rf "%s"' % tempPath)
        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)
        try:
            system(self._createProjectCmd(expPath, tempPath, fixNames))
            with open(projFilePathNew, "r") as newHandle:
                newFile = newHandle.readlines()
            with open(projFilePathOld, "r") as oldHandle:
                oldFile = oldHandle.readlines()
        finally:
            # Always discard the scratch project, even if the reads failed.
            system('rm -rf "%s"' % tempPath)

        if len(newFile) != len(oldFile):
            return False
        return all(newLine.replace(tempPath, oldPath) == oldLine
                   for newLine, oldLine in zip(newFile, oldFile))
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None,
              cacheRDC=None, cacheW0=None, chunk=None, deflate=None,
              inMemory=False):
    """Assemble one HAL file from the per-experiment HAL files of a project.

    Walks ``project.mcTree`` breadth-first (from ``event`` when given,
    which must name an internal node) and runs ``halAppendCactusSubtree``
    for every node that has an entry in ``project.expMap``. Afterwards the
    commit and the base64-encoded config are stored as HAL metadata.

    The cache*/chunk/deflate values are forwarded as the matching
    ``--<name>`` option when not None; ``inMemory`` adds ``--inMemory``
    only when it is exactly True.

    Returns the result of ``job.fileStore.writeGlobalFile`` for the HAL.
    """
    HALPath = "tmp_alignment.hal"

    # traverse tree to make sure we are going breadth-first
    tree = project.mcTree

    # find subtree if event specified
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Optional value-carrying flags, in the order they are appended.
    valueFlags = (
        ("--cacheBytes", cacheBytes),
        ("--cacheMDC", cacheMDC),
        ("--cacheRDC", cacheRDC),
        ("--cacheW0", cacheW0),
        ("--chunk", chunk),
        ("--deflate", deflate),
    )

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in project.expMap:
            continue

        localExpPath = job.fileStore.readGlobalFile(
            project.expIDMap[genomeName])
        experiment = ExperimentWrapper(ET.parse(localExpPath).getroot())

        outgroups = experiment.getOutgroupGenomes()
        experiment.setConfigPath(
            job.fileStore.readGlobalFile(experiment.getConfigID()))
        expTreeString = NXNewick().writeString(
            experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHalID() is not None
        assert experiment.getHalFastaID() is not None
        subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
        halFastaPath = job.fileStore.readGlobalFile(
            experiment.getHalFastaID())

        # Only base names are passed on the command line.
        args = [os.path.basename(subHALPath),
                os.path.basename(halFastaPath),
                expTreeString,
                os.path.basename(HALPath)]

        if len(outgroups) > 0:
            args.extend(["--outgroups", ",".join(outgroups)])
        for flag, value in valueFlags:
            if value is not None:
                args.extend([flag, value])
        if inMemory is True:
            args.append("--inMemory")

        cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath,
                            "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        encodedConfig = b64encode(configFile.read()).decode()
        cactus_call(parameters=["halSetMetadata", HALPath,
                                "CACTUS_CONFIG", encodedConfig])

    return job.fileStore.writeGlobalFile(HALPath)
def make_align_job(options, toil):
    """Build the toil job that runs ``run_cactus_align`` for ``options.root``.

    Creates a temporary cactus project from the options, imports the
    outgroup fragments / input sequences / config / alignment files into
    the toil file store, applies command-line config overrides, and wraps
    everything in a ``Job.wrapJobFn`` job, which is returned.

    Note: ``options`` is modified in place (cactusDir, seqFile, root).
    """
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for genome_name, new_path in zip(options.pathOverrideNames,
                                         options.pathOverrides):
            seqFile.pathMap[genome_name] = new_path
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    # no root given: default to the root of the (fully labelled) tree
    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    # --acyclic must name one of the leaves of the tree
    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    project_xml_name = '%s_project.xml' % ProjectWrapper.alignmentDirName
    pjPath = os.path.join(options.cactusDir,
                          ProjectWrapper.alignmentDirName,
                          project_xml_name)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    # NOTE(review): configXml, seqIDMap and genome_set are not read again
    # in this function; kept so the same calls are still made.
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        # An explicitly listed file ending in the suffix wins; otherwise
        # the suffix is appended to the listed path whose basename is a
        # prefix of the current candidate's basename.
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            candidate_name = os.path.basename(base_path)
            if candidate_name.startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for og_index, outgroup in enumerate(outgroups):
        try:
            fragment_url = makeURL(
                get_input_path('.og_fragment_{}'.format(og_index)))
            outgroupID = toil.importFile(fragment_url)
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        wanted = genome in leaves or (not outgroup_fragment_found
                                      and genome in outgroups)
        if not wanted:
            continue
        if os.path.isdir(seq):
            # a directory of sequence files is concatenated into one file
            tmpSeq = getTempFile()
            catFiles([os.path.join(seq, subSeq)
                      for subSeq in os.listdir(seq)], tmpSeq)
            seq = tmpSeq
        seq = makeURL(seq)

        logger.info("Importing {}".format(seq))
        experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup)
                       for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        pangenome_overrides = (
            # turn off the megablock filter as it ruins non-all-to-all alignments
            ("caf", "minimumBlockHomologySupport", "0"),
            ("caf", "minimumBlockDegreeToCheckSupport", "9999999999"),
            # turn off mapq filtering
            ("caf", "runMapQFiltering", "0"),
            # more iterations here helps quite a bit to reduce underalignment
            ("caf", "maxRecoverableChainsIterations", "50"),
            # turn down minimum block degree to get a fat ancestor
            ("bar", "minimumBlockDegree", "1"),
            # turn on POA
            ("bar", "partialOrderAlignment", "1"),
        )
        for node_name, attr_name, attr_value in pangenome_overrides:
            findRequiredNode(configWrapper.xmlRoot,
                             node_name).attrib[attr_name] = attr_value
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            saving_to_s3 = pg_file.startswith('s3://')
            # s3 targets are written locally first, then uploaded
            pg_temp_file = getTempFile() if saving_to_s3 else pg_file
            configWrapper.writeXML(pg_temp_file)
            if saving_to_s3:
                write_s3(pg_temp_file, pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        # the secondary alignments file is optional: ignore a failed import
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for ig_index in range(len(leaves)):
            coverage_url = makeURL(
                get_input_path('.ig_coverage_{}'.format(ig_index)))
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(coverage_url))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
def main():
    """Command-line entry point: append every experiment of a cactus
    project to one HAL file with ``halAppendCactusSubtree``.

    Options come from ``initParser()`` and are read by key. Unless
    ``append`` is set the target HAL file is removed first. The project
    tree is walked breadth-first (from ``event`` when given, which must be
    an internal node) and one command is run per node that has an
    experiment. Each command and the accumulated timings are printed.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        removeCmd = 'rm -f {0}'.format(args['HAL_file_path'])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    # Options forwarded verbatim as " --<name> <value>" when not None.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experimentFilePath = myProj.expMap[genomeName]
        print(experimentFilePath)
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(
            experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
            experiment.getHALPath(),
            experiment.getHALFastaPath(),
            expTreeString,
            args['HAL_file_path'])

        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for optName in valueOptions:
            if args[optName] is not None:
                cmdline += " --{0} {1}".format(optName, args[optName])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        print(cmdline)
        appendTime = time.time()
        system(cmdline)
        appendTime = time.time() - appendTime
        totalAppendTime += appendTime

    totalTime = time.time() - totalTime
    print("total time: {0:.2f} total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
def getConfigPath(self):
    """Return the config path recorded in the first experiment of expMap.

    "First" is the first value in the map's iteration order. The
    experiment XML at that path is parsed to read the config path.

    Raises:
        IndexError: ``self.expMap`` is empty.
    """
    # dict.values() is a non-subscriptable view on Python 3, so
    # materialise it before indexing (same result on Python 2).
    firstExpPath = list(self.expMap.values())[0]
    return ExperimentWrapper(ET.parse(firstExpPath).getroot()).getConfigPath()
def runCactusAfterBlastOnly(options):
    """Run (or restart) the cactus align workflow and export the HAL.

    On restart the workflow is simply resumed. Otherwise a temporary
    cactus project is created from the options, the outgroup fragments /
    input sequences / config / alignment files are imported into toil, and
    ``run_cactus_align`` is started. The resulting file is exported to
    ``options.outputHal`` in both cases.

    Note: ``options`` is modified in place (cactusDir, seqFile).
    """
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for genome_name, new_path in zip(options.pathOverrideNames,
                                                 options.pathOverrides):
                    seqFile.pathMap[genome_name] = new_path
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            project_xml_name = ('%s_project.xml'
                                % ProjectWrapper.alignmentDirName)
            pjPath = os.path.join(options.cactusDir,
                                  ProjectWrapper.alignmentDirName,
                                  project_xml_name)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            # NOTE(review): configXml, seqIDMap and genome_set are not read
            # again in this function; kept so the same calls are still made.
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                # An explicitly listed file ending in the suffix wins;
                # otherwise the suffix is appended to the listed path whose
                # basename is a prefix of the current candidate's basename.
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    candidate_name = os.path.basename(base_path)
                    if candidate_name.startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for og_index, outgroup in enumerate(outgroups):
                try:
                    fragment_url = makeURL(
                        get_input_path('.og_fragment_{}'.format(og_index)))
                    outgroupID = toil.importFile(fragment_url)
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                wanted = genome in leaves or (not outgroup_fragment_found
                                              and genome in outgroups)
                if not wanted:
                    continue
                if os.path.isdir(seq):
                    # a directory of sequence files is concatenated
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq)
                              for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)

                experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [experiment.getSequenceID(outgroup)
                               for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely
                # with star-like alignments either, but it is left enabled here

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                # the secondary alignments file is optional: ignore a
                # failed import
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for ig_index in range(len(leaves)):
                    coverage_url = makeURL(
                        get_input_path('.ig_coverage_{}'.format(ig_index)))
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(coverage_url))

            align_job = Job.wrapJobFn(run_cactus_align,
                                      configWrapper,
                                      workFlowArgs,
                                      project,
                                      doRenaming=options.nonCactusInput,
                                      pafInput=options.pafInput)
            halID = toil.start(align_job)

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
def updateProject(path):
    """Re-anchor a multi-cactus project under the directory of ``path``.

    For each experiment in the project: point the project at the
    experiment file under the new base directory, rewrite the experiment's
    db dir, reference, HAL and HAL-fasta paths to live under that
    directory, and set the kyoto tycoon host (when one is recorded) to the
    current hostname. Each experiment file is copied to ``<file>.old``
    before being rewritten; finally the project XML is written back.

    Raises:
        RuntimeError: an experiment file is missing at its new location.
    """
    mcProj = MultiCactusProject()
    mcProj.readXML(path)
    basePath = os.path.dirname(path)

    def rebase(oldValue, eventName):
        # Keep everything from the first occurrence of the event name
        # onwards and hang it under the new base directory.
        return os.path.join(basePath, oldValue[oldValue.find(eventName):])

    for name, oldPath in mcProj.expMap.items():
        # <basePath>/<last directory component of old path>/<file name>
        parentDirName = os.path.dirname(oldPath).rpartition('/')[2]
        newPath = os.path.join(basePath, parentDirName,
                               os.path.basename(oldPath))
        if not os.path.isfile(newPath):
            raise RuntimeError("Experiment file %s not found\n" % newPath)
        mcProj.expMap[name] = newPath
        exp = ExperimentWrapper(ET.parse(newPath).getroot())

        dbDir = exp.getDbDir()
        if dbDir is not None:
            exp.setDbDir(rebase(dbDir, name))

        refPath = exp.getReferencePath()
        if refPath is not None:
            exp.setReferencePath(rebase(refPath, name))

        halPath = exp.getHALPath()
        if halPath is not None:
            exp.setHALPath(rebase(halPath, name))

        halFastaPath = exp.getHALFastaPath()
        if halFastaPath is not None:
            exp.setHALFastaPath(rebase(halFastaPath, name))

        # The MAF path is not handled: it seems to have disappeared from
        # the experiment.

        if exp.getDbType() == "kyoto_tycoon":
            if exp.getDbHost() is not None:
                exp.setDbHost(socket.gethostname())

        # keep a backup of the experiment file before overwriting it
        system("cp %s %s.old" %(newPath, newPath))
        exp.writeXML(newPath)

    mcProj.writeXML(path)