def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run the progressive workflow on an experiment and check output headers.

    Creates a multi-cactus project for ``experimentFile`` in a fresh
    temporary directory (under the current working directory), calls
    ``runCactusProgressive`` on it, and then asserts that every sequence
    header recorded for a non-outgroup input genome in the per-experiment
    output files is also a header of that genome's input FASTA.

    ``buildReference``, ``buildHal`` and ``buildFasta`` are accepted for
    signature compatibility but are not used by this body.

    The temporary directory is removed only when every check passes, so a
    failing run leaves its files behind for inspection.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                tempExperimentDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in its input fasta. A set keeps the
    # per-line membership test below cheap for large inputs.
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        concatenatedPath = None
        try:
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(map(itemgetter(0),
                                          fastaRead(inputSequencePath)))
        finally:
            # The concatenated copy is only needed while reading headers;
            # remove it so repeated runs do not accumulate temp files.
            if concatenatedPath is not None and os.path.exists(concatenatedPath):
                os.remove(concatenatedPath)

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    continue
                # Sequence line. The first and last character of each field
                # are dropped (presumably surrounding quotes -- confirm
                # against the c2h format).
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(header in headers[genome],
                                    'Header %s from output c2h %s not found in input fa %s'
                                    ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def _rebasePath(oldPath, eventName, basePath):
    """Return oldPath re-rooted under basePath, keeping the part of oldPath
    that starts at the first occurrence of eventName."""
    start = oldPath.find(eventName)
    if start < 0:
        # str.find returns -1 when the name is absent; slicing with that
        # would keep only the last character and yield a bogus path.
        raise RuntimeError("Cannot relocate path %s: it does not contain event name %s\n"
                           % (oldPath, eventName))
    return os.path.join(basePath, oldPath[start:])


def updateProject(path):
    """Rewrite the paths stored in a multi-cactus project after it was moved.

    ``path`` is the project XML file. For every experiment in the project's
    ``expMap`` the experiment file is looked up next to the project file
    (``<project dir>/<experiment dir name>/<experiment file name>``), and the
    database directory, reference path, HAL path and HAL fasta path stored in
    it (when set) are re-rooted under the project directory. For
    ``kyoto_tycoon`` databases with a host set, the host is replaced by the
    current machine's host name. Each experiment file is copied to
    ``<file>.old`` before being overwritten, and the project XML is rewritten
    at the end.

    Raises RuntimeError if an experiment file is missing at its new location
    or if a stored path does not contain the event name.
    """
    mcProj = MultiCactusProject()
    mcProj.readXML(path)
    basePath = os.path.dirname(path)

    # Iterate over a snapshot because expMap entries are reassigned below.
    for name, oldPath in list(mcProj.expMap.items()):
        fileName = os.path.basename(oldPath)
        dirName = os.path.dirname(oldPath).rpartition('/')[2]
        newPath = os.path.join(basePath, dirName, fileName)
        if not os.path.isfile(newPath):
            raise RuntimeError("Experiment file %s not found\n" % newPath)
        mcProj.expMap[name] = newPath

        exp = ExperimentWrapper(ET.parse(newPath).getroot())

        # Each (getter, setter) pair holds a path to relocate; unset (None)
        # paths are left alone.
        pathAccessors = (
            (exp.getDbDir, exp.setDbDir),
            (exp.getReferencePath, exp.setReferencePath),
            (exp.getHALPath, exp.setHALPath),
            (exp.getHALFastaPath, exp.setHALFastaPath),
        )
        for getPath, setPath in pathAccessors:
            storedPath = getPath()
            if storedPath is not None:
                setPath(_rebasePath(storedPath, name, basePath))

        # NOTE: the MAF path used to be relocated here as well; that was
        # disabled because the accessor seems to have disappeared from the
        # experiment wrapper.

        if exp.getDbType() == "kyoto_tycoon":
            if exp.getDbHost() is not None:
                exp.setDbHost(socket.gethostname())

        # Keep a backup of the experiment file before overwriting it.
        system("cp %s %s.old" % (newPath, newPath))
        exp.writeXML(newPath)

    mcProj.writeXML(path)
def main():
    """Append every experiment of a cactus project to a single HAL file.

    Reads the project named by the ``cactus_project`` argument, optionally
    removes the existing output HAL file (unless ``append`` is set), then
    walks the project tree breadth-first (from ``event`` when given) and runs
    ``halAppendCactusSubtree`` once for every node that has an experiment.
    Each command line, and the total / append-only wall-clock times, are
    printed to stdout.

    Raises RuntimeError if ``event`` is given but is not an internal node of
    the project tree.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        removeCmd = 'rm -f {0}'.format(args['HAL_file_path'])
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        # Explicit check instead of assert so it is not stripped under -O.
        if event not in tree.nameToId or tree.isLeaf(tree.nameToId[event]):
            raise RuntimeError("Event %s is not an internal node of the project tree" % event)
        rootNode = tree.nameToId[event]

    # Options forwarded verbatim as "--<name> <value>" when they are set.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experimentFilePath = myProj.expMap[genomeName]
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree())
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
            experiment.getHALPath(),
            experiment.getHALFastaPath(),
            expTreeString,
            args['HAL_file_path'])

        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for option in valueOptions:
            if args[option] is not None:
                cmdline += " --{0} {1}".format(option, args[option])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        print(cmdline)
        appendStart = time.time()
        system(cmdline)
        totalAppendTime += time.time() - appendStart

    totalTime = time.time() - totalTime
    print("total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))
def run(self):
    """Schedule the alignment of this target's event.

    Loads the experiment for ``self.event`` and its config, checks that
    enough threads/cpus were requested when the database type is
    ``kyoto_tycoon``, merges the reference/HAL build flags from the command
    line with those in the config, and then either skips the event (when it
    is already done and overwrite is disabled) or deletes stale outputs and
    adds the first workflow phase as a child target. ``FinishUp`` is always
    set as the follow-on target.

    Raises RuntimeError if fewer than three threads/cpus per parallel
    subtree are available for a ``kyoto_tycoon`` database.
    """
    logger.info("Progressive Up: " + self.event)

    # open up the experiment
    # note that we copy the path into the options here
    self.options.experimentFile = self.project.expMap[self.event]
    expXml = ET.parse(self.options.experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configXml = ET.parse(experiment.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configXml)

    # need at least 3 processes for every event when using ktserver:
    # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
    if experiment.getDbType() == "kyoto_tycoon":
        maxParallelSubtrees = configWrapper.getMaxParallelSubtrees()
        maxParallel = min(len(self.project.expMap), maxParallelSubtrees)
        requiredProcs = maxParallel * 3
        if self.options.batchSystem == "singleMachine":
            # Convert once: the option may arrive as a string, and "%d"
            # below would raise TypeError on a non-numeric type instead of
            # producing the intended error message.
            maxThreads = int(self.options.maxThreads)
            if maxThreads < requiredProcs:
                raise RuntimeError(
                    "At least %d threads are required (only %d were specified) "
                    "to handle up to %d events using kyoto tycoon. "
                    "Either increase the number of threads using the --maxThreads option "
                    "or decrease the number of parallel jobs (currently %d) "
                    "by adjusting max_parallel_subtrees in the config file"
                    % (requiredProcs, maxThreads, maxParallel, maxParallelSubtrees))
        else:
            if int(self.options.maxCpus) < requiredProcs:
                raise RuntimeError(
                    "At least %d concurrent cpus are required "
                    "to handle up to %d events using kyoto tycoon. "
                    "Either increase the number of cpus using the --maxCpus option "
                    "or decrease the number of parallel jobs (currently %d) "
                    "by adjusting max_parallel_subtrees in the config file"
                    % (requiredProcs, maxParallel, maxParallelSubtrees))

    # take union of command line options and config options for hal and reference
    # (the "== False" comparisons are kept as-is so that a None option is
    # not treated as "disabled")
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # get parameters that cactus_workflow stuff wants
    workFlowArgs = CactusWorkflowArguments(self.options)
    # copy over the options so we don't trail them around
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.overwrite = self.options.overwrite
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet

    experiment = ExperimentWrapper(workFlowArgs.experimentNode)

    # The event counts as finished when the DONE marker exists and every
    # requested output file is present.
    donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
    doneDone = os.path.isfile(donePath)
    refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
    halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                            os.path.isfile(experiment.getHALPath()))

    if not workFlowArgs.overwrite and doneDone and refDone and halDone:
        self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                         self.event)
    else:
        system("rm -f %s" % donePath)
        # delete database
        # and overwrite specified (or if reference not present)
        dbPath = os.path.join(experiment.getDbDir(), experiment.getDbName())
        seqPath = os.path.join(experiment.getDbDir(), "sequences")
        system("rm -f %s* %s %s" % (dbPath, seqPath, experiment.getReferencePath()))

        if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
            # Use the trimming strategy to blast ingroups vs outgroups.
            self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                                         phaseName="trimBlast"))
        else:
            self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                 phaseName="setup"))

    logger.info("Going to create alignments and define the cactus tree")

    self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
def progressiveFunction(self, experimentFile, toilDir,
                        batchSystem, buildAvgs,
                        buildReference,
                        buildHal,
                        buildFasta,
                        toilStats,
                        subtreeRoot=None):
    """Run the progressive workflow on an experiment and check output headers.

    Creates a multi-cactus project for ``experimentFile`` in a fresh
    temporary directory (under the current working directory), calls
    ``runCactusProgressive`` on it, and then asserts that every sequence
    header recorded for a non-outgroup input genome in the per-experiment
    output files is also a header of that genome's input FASTA.

    ``buildReference``, ``buildHal`` and ``buildFasta`` are accepted for
    signature compatibility but are not used by this body.

    The temporary directory is removed only when every check passes, so a
    failing run leaves its files behind for inspection.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                tempExperimentDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in its input fasta. A set keeps the
    # per-line membership test below cheap for large inputs.
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        concatenatedPath = None
        try:
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(map(itemgetter(0),
                                          fastaRead(inputSequencePath)))
        finally:
            # The concatenated copy is only needed while reading headers;
            # remove it so repeated runs do not accumulate temp files.
            if concatenatedPath is not None and os.path.exists(concatenatedPath):
                os.remove(concatenatedPath)

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    continue
                # Sequence line. The first and last character of each field
                # are dropped (presumably surrounding quotes -- confirm
                # against the c2h format).
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(header in headers[genome],
                                    'Header %s from output c2h %s not found in input fa %s'
                                    ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def main():
    """Append every experiment of a cactus project to a single HAL file.

    Reads the project named by the ``cactus_project`` argument, optionally
    removes the existing output HAL file (unless ``append`` is set), then
    walks the project tree breadth-first (from ``event`` when given) and runs
    ``halAppendCactusSubtree`` once for every node that has an experiment,
    passing the experiment's tree restricted to its own subtree. The
    experiment file path, each command line, and the total / append-only
    wall-clock times are printed to stdout.

    Raises RuntimeError if ``event`` is given but is not an internal node of
    the project tree.
    """
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        removeCmd = 'rm -f {0}'.format(args['HAL_file_path'])
        print(removeCmd)
        system(removeCmd)

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        # Explicit check instead of assert so it is not stripped under -O.
        if event not in tree.nameToId or tree.isLeaf(tree.nameToId[event]):
            raise RuntimeError("Event %s is not an internal node of the project tree" % event)
        rootNode = tree.nameToId[event]

    # Options forwarded verbatim as "--<name> <value>" when they are set.
    valueOptions = ("cacheBytes", "cacheMDC", "cacheRDC", "cacheW0",
                    "chunk", "deflate")

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName not in myProj.expMap:
            continue

        experimentFilePath = myProj.expMap[genomeName]
        print(experimentFilePath)
        experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

        outgroups = experiment.getOutgroupEvents()
        expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
        assert len(expTreeString) > 1
        assert experiment.getHALPath() is not None
        assert experiment.getHALFastaPath() is not None

        cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
            experiment.getHALPath(),
            experiment.getHALFastaPath(),
            expTreeString,
            args['HAL_file_path'])

        if len(outgroups) > 0:
            cmdline += " --outgroups {0}".format(",".join(outgroups))
        for option in valueOptions:
            if args[option] is not None:
                cmdline += " --{0} {1}".format(option, args[option])
        if args["inMemory"] is True:
            cmdline += " --inMemory"

        print(cmdline)
        appendStart = time.time()
        system(cmdline)
        totalAppendTime += time.time() - appendStart

    totalTime = time.time() - totalTime
    print("total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime))