Example 1
class ProgressiveOut(RoundedJob):
    def __init__(self, options, project, event, eventExpWrapper, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.eventExpWrapper = eventExpWrapper
        self.schedule = schedule
        
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        if not self.schedule.isVirtual(self.event):
            tmpExp = fileStore.getLocalTempFile()
            self.eventExpWrapper.writeXML(tmpExp)
            self.project.expIDMap[self.event] = fileStore.writeGlobalFile(tmpExp)
        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is not None:
            logger.info("Adding follow-on event %s" % followOnEvent)
            return self.addFollowOn(ProgressiveDown(self.options, self.project, followOnEvent,
                                                    self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()

        return self.project
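Examples 1 and 2 chain Toil jobs through promises: addFollowOn(...).rv() returns a promise, and returning that promise makes the follow-on's eventual result the calling job's own return value. A minimal, self-contained sketch of the pattern (not from the Cactus source; the job names and the "./promiseJobStore" path are invented):

from toil.common import Toil
from toil.job import Job

class Down(Job):
    def run(self, fileStore):
        # Stands in for ProgressiveDown; its return value is what the
        # caller's promise ultimately resolves to.
        return "updated-project"

class Out(Job):
    def run(self, fileStore):
        # Returning a follow-on's promise forwards its eventual value,
        # exactly as ProgressiveOut does above.
        return self.addFollowOn(Down()).rv()

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./promiseJobStore")
    with Toil(options) as toil:
        print(toil.start(Out()))  # prints "updated-project"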
Example 2
class ProgressiveDown(RoundedJob):
    def __init__(self, options, project, event, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule
    
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        logger.info("Progressive Down: " + self.event)

        depProjects = dict()
        deps = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" % len(deps))
        for child in deps:
            fileStore.logToMaster("Adding dependent project %s" % child)
            depProjects[child] = self.addChild(ProgressiveDown(self.options,
                                                               self.project, child,
                                                               self.schedule)).rv()

        return self.addFollowOn(ProgressiveNext(self.options, self.project, self.event,
                                                self.schedule, depProjects,
                                                memory=self.configWrapper.getDefaultMemory())).rv()
Example 3
class RunCactusPreprocessorThenProgressiveDown2(RoundedJob):
    def __init__(self, options, project, event, schedule, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule

    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Save preprocessed sequences
        if self.options.intermediateResultsUrl is not None:
            preprocessedSequences = self.project.getOutputSequenceIDMap()
            for genome, seqID in preprocessedSequences.items():
                fileStore.exportFile(seqID, self.options.intermediateResultsUrl + '-preprocessed-' + genome)

        # Log the stats for the preprocessed assemblies
        for name, sequence in self.project.getOutputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "After preprocessing", name, sequence)

        project = self.addChild(ProgressiveDown(options=self.options, project=self.project, event=self.event, schedule=self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()

        #Combine the smaller HAL files from each experiment
        return self.addFollowOnJobFn(exportHal, project=project, memory=self.configWrapper.getDefaultMemory(),
                                     disk=self.configWrapper.getExportHalDisk(),
                                     preemptable=False).rv()
Example 4
class ProgressiveNext(RoundedJob):
    def __init__(self, options, project, event, schedule, depProjects, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project
        self.event = event
        self.schedule = schedule
        self.depProjects = depProjects
    
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" % len(self.depProjects))
        for projName in self.depProjects:
            depProject = self.depProjects[projName]
            for expName in depProject.expIDMap: 
                expID = depProject.expIDMap[expName]
                experiment = ExperimentWrapper(ET.parse(fileStore.readGlobalFile(expID)).getroot())
                fileStore.logToMaster("Reference ID for experiment %s: %s" % (expName, experiment.getReferenceID()))
                if experiment.getReferenceID():
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[expName] = experiment.getReferenceID()
                        
        eventExpWrapper = None
        logger.info("Progressive Next: " + self.event)
        if not self.schedule.isVirtual(self.event):
            eventExpWrapper = self.addChild(ProgressiveUp(self.options, self.project, self.event, memory=self.configWrapper.getDefaultMemory())).rv()
        return self.addFollowOn(ProgressiveOut(self.options, self.project, self.event, eventExpWrapper, self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()
Example 5
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
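The setUp above round-trips the config through ConfigWrapper: parse the XML, mutate it, write it back out for the test to use. A stdlib-only sketch of that round trip, with an inline XML string standing in for cactus_progressive_config.xml (the element and attribute names here are invented):

import os
import tempfile
import xml.etree.ElementTree as ET

root = ET.fromstring('<cactusWorkflowConfig><caf checks="0"/></cactusWorkflowConfig>')
root.find("caf").set("checks", "1")       # analogous to turnAllModesOn()
tempDir = tempfile.mkdtemp()
configFile = os.path.join(tempDir, "tempConfig.xml")
ET.ElementTree(root).write(configFile)    # analogous to writeXML()
assert ET.parse(configFile).getroot().find("caf").get("checks") == "1"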
Example 6
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Using the following configuration:\n%s" % ET.tostring(self.configNode))

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in self.project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", name, sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" % self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()  # This is necessary.
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), configNode))
        self.project.setOutputSequenceIDs([preprocessorJob.rv(i) for i in range(len(self.project.getInputSequenceIDs()))])

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [ self.project.mcTree.getName(i) for i in self.project.mcTree.getLeaves() ]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(RunCactusPreprocessorThenProgressiveDown2(options=self.options, project=self.project, event=self.options.event, schedule=schedule, memory=self.configWrapper.getDefaultMemory())).rv()
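The preprocessor block above registers promises for list elements that don't exist yet: preprocessorJob.rv(i) is a promise for the i-th element of the job's eventual return value. A hedged, self-contained sketch of that indexing pattern (the job-function names and the store path are invented):

from toil.common import Toil
from toil.job import Job

def produce(job):
    return ["seq0-id", "seq1-id", "seq2-id"]

def consume(job, item):
    return "processed-" + item

def gather(job, items):
    return list(items)

def root(job):
    producer = job.addChildJobFn(produce)
    # rv(i) indexes into produce's future return value.
    fanout = [producer.addFollowOnJobFn(consume, producer.rv(i)).rv()
              for i in range(3)]
    return job.addFollowOnJobFn(gather, fanout).rv()

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./indexedPromiseStore")
    with Toil(options) as toil:
        print(toil.start(Job.wrapJobFn(root)))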
Example 7
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            if tree.isLeaf(node):
                name = tree.getName(node)
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))
            
        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap = seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
Example 8
    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtrees for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other "combined" batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface. 
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxsize)
Example 9
    def loadProject(self, mcProject):
        self.inGraph = NX.DiGraph()
        globTree = mcProject.mcTree
        self.maxParallelSubtrees = None
        leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]
        for name, expPath in mcProject.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            tree = exp.getTree()
            self.inGraph.add_node(name)
            # Go through the species tree and add the correct
            # dependencies (i.e. to the outgroups and the ingroups,
            # but not to the other nodes that are just there because
            # they are needed to form the correct paths).
            for node in tree.postOrderTraversal():
                nodeName = tree.getName(node)
                if not tree.isLeaf(node) and nodeName not in exp.getOutgroupEvents():
                    # This node is just an internal node added while
                    # creating the induced tree from the species
                    # tree. None of the sequence is used, so skip it.
                    continue

                assert tree.hasParent(node)

                if nodeName not in exp.getOutgroupEvents() and tree.getName(tree.getParent(node)) != name:
                    # This leaf isn't an ingroup or an outgroup, it was
                    # just added to make the species tree
                    # binary. (Hopefully this will be unnecessary in
                    # the future.)
                    continue

                # we don't add edges for leaves (in the global tree)
                # as they are input sequences and do not form dependencies
                # (it would be clever to maybe do the same with existing
                # references when --overwrite is not specified but for now
                # we just do the leaves)
                if nodeName not in leafEvents:
                    self.inGraph.add_edge(name, nodeName)
            configElem = ET.parse(exp.getConfig()).getroot()
            conf = ConfigWrapper(configElem)
            # load max parallel subtrees from the node's config
            if self.maxParallelSubtrees is None:
                self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
            else:
                assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        assert NX.is_directed_acyclic_graph(self.inGraph)
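loadProject builds a dependency DAG with NetworkX: each event points at the events whose reconstructed references it consumes, and the final assert guards against cyclic schedules. A small stand-alone illustration (the event names are invented):

import networkx as NX

inGraph = NX.DiGraph()
inGraph.add_edge("AncRoot", "Anc1")  # AncRoot needs Anc1's reference
inGraph.add_edge("AncRoot", "Anc2")
inGraph.add_edge("Anc1", "Anc2")     # e.g. Anc2 serves as Anc1's outgroup
assert NX.is_directed_acyclic_graph(inGraph)
# A reversed topological order is one valid bottom-up schedule.
print(list(reversed(list(NX.topological_sort(inGraph)))))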
Example 10
    def loadProject(self, mcProject, fileStore=None):
        self.inGraph = NX.DiGraph()
        globTree = mcProject.mcTree
        self.maxParallelSubtrees = None
        leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]

        expMap = None
        if fileStore:
            expMap = dict()
            for name in mcProject.expIDMap:
                expMap[name] = fileStore.readGlobalFile(mcProject.expIDMap[name])
        else:
            expMap = mcProject.expMap
            
        for name, expPath in expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            tree = exp.getTree()
            self.inGraph.add_node(name)
            # Go through the species tree and add the correct
            # dependencies (i.e. to the outgroups and the ingroups,
            # but not to the other nodes that are just there because
            # they are needed to form the correct paths).
            for node in tree.postOrderTraversal():
                nodeName = tree.getName(node)

                # we don't add edges for leaves (in the global tree)
                # as they are input sequences and do not form dependencies
                # (it would be clever to maybe do the same with existing
                # references when --overwrite is not specified but for now
                # we just do the leaves)
                if nodeName not in leafEvents and nodeName in exp.getSequenceMap():
                    self.inGraph.add_edge(name, nodeName)
            if fileStore:
                configFile = fileStore.readGlobalFile(exp.getConfigID())
            else:
                # with no fileStore, the config is a local path (cf. Example 9)
                configFile = exp.getConfig()
            configElem = ET.parse(configFile).getroot()
            conf = ConfigWrapper(configElem)
            # load max parallel subtrees from the node's config
            if self.maxParallelSubtrees is None:
                self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
            else:
                assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        assert NX.is_directed_acyclic_graph(self.inGraph)
Example 11
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'
    def __init__(self, options):
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        log.info("Using config from path %s." % configPath)
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if len(self.seqFile.outgroups) == 0:
            # No outgroups specified, assume the default outgroup set
            outgroups = None
        else:
            outgroups = self.seqFile.outgroups
        runCreateMultiCactusProject(expPath, projPath, fixNames=0,
                                    outgroupNames=outgroups,
                                    root=self.options.root)
Example 12
 def processConfig(self):
     # read in the default right out of cactus
     if self.options.configFile is not None:
         configPath = self.options.configFile
     else:
         configPath = os.path.join(cactusRootPath(),
                                   "cactus_progressive_config.xml")
     log.info("Using config from path %s." % configPath)
     configXml = ET.parse(configPath).getroot()
     self.configWrapper = ConfigWrapper(configXml)
     # here we can go through the options and apply some to the config
     self.configWrapper.setBuildHal(True)
     self.configWrapper.setBuildFasta(True)
Example 13
    def run(self, fileStore):
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Project has %i dependencies" %
                              len(self.depProjects))
        for projName in self.depProjects:
            depProject = self.depProjects[projName]
            for expName in depProject.expIDMap:
                expID = depProject.expIDMap[expName]
                experiment = ExperimentWrapper(
                    ET.parse(fileStore.readGlobalFile(expID)).getroot())
                fileStore.logToMaster("Reference ID for experiment %s: %s" %
                                      (expName, experiment.getReferenceID()))
                if experiment.getReferenceID():
                    self.project.expIDMap[expName] = expID
                    self.project.outputSequenceIDMap[
                        expName] = experiment.getReferenceID()

        eventExpWrapper = None
        logger.info("Progressive Next: " + self.event)
        if not self.schedule.isVirtual(self.event):
            eventExpWrapper = self.addChild(
                ProgressiveUp(
                    self.options,
                    self.project,
                    self.event,
                    memory=self.configWrapper.getDefaultMemory())).rv()
        return self.addFollowOn(
            ProgressiveOut(self.options,
                           self.project,
                           self.event,
                           eventExpWrapper,
                           self.schedule,
                           memory=self.configWrapper.getDefaultMemory())).rv()
Example 14
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(
         ET.parse(
             os.path.join(cactusRootPath(),
                          "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     configWrapper.turnOffHeaderChecks()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
Example 15
    def run(self, fileStore):
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
        logger.info("Progressive Down: " + self.event)

        depProjects = dict()
        deps = self.schedule.deps(self.event)
        fileStore.logToMaster("There are %i dependent projects" % len(deps))
        for child in deps:
            fileStore.logToMaster("Adding dependent project %s" % child)
            depProjects[child] = self.addChild(
                ProgressiveDown(self.options, self.project, child,
                                self.schedule)).rv()

        return self.addFollowOn(
            ProgressiveNext(
                self.options,
                self.project,
                self.event,
                self.schedule,
                depProjects,
                memory=self.configWrapper.getDefaultMemory())).rv()
Example 16
def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False, outputSequences=None):
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)
    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if not restart:
        inputSequenceIDs = [toil.importFile(makeURL(seq)) for seq in inputSequences]
        outputSequenceIDs = toil.start(CactusPreprocessor(inputSequenceIDs, configNode))
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        toil.exportFile(seqID, makeURL(path))
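stageWorkflow shows the standard Toil staging cycle: importFile pulls inputs into the job store, start runs the workflow on file IDs, and exportFile writes results back out. A minimal sketch, assuming makeURL simply builds a file:// URL from a local path (the helper name makeFileURL and the paths are invented):

import os
from toil.common import Toil
from toil.job import Job

def makeFileURL(path):
    # hypothetical stand-in for the makeURL helper used above
    return "file://" + os.path.abspath(path)

def passThrough(job, seqID):
    return seqID  # a do-nothing "preprocessor"

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./stageJobStore")
    with Toil(options) as toil:
        seqID = toil.importFile(makeFileURL("genome.fa"))  # assumes genome.fa exists
        outID = toil.start(Job.wrapJobFn(passThrough, seqID))
        toil.exportFile(outID, makeFileURL("genome.preprocessed.fa"))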
Example 17
    def run(self, fileStore):
        self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        if not self.schedule.isVirtual(self.event):
            tmpExp = fileStore.getLocalTempFile()
            self.eventExpWrapper.writeXML(tmpExp)
            self.project.expIDMap[self.event] = fileStore.writeGlobalFile(tmpExp)
        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is not None:
            logger.info("Adding follow-on event %s" % followOnEvent)
            return self.addFollowOn(ProgressiveDown(self.options, self.project, followOnEvent,
                                                    self.schedule, memory=self.configWrapper.getDefaultMemory())).rv()

        return self.project
Example 18
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, configPath, ignoreSeqPaths=[]):
        self.options = options
        self.seqFile = SeqFile(options.seqFile)
        self.workingDir = options.cactusDir
        self.configWrapper = None
        self.configPath = configPath
        self.expWrapper = None
        self.processConfig()
        self.processExperiment(ignoreSeqPaths)

    def processConfig(self):
        log.info("Using config from path %s." % self.configPath)
        configXml = ET.parse(self.configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)

    def processExperiment(self, ignoreSeqPaths):
        expXml = self.seqFile.toXMLElement(ignoreSeqPaths)
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)
        self.expWrapper.setConfigPath(self.configPath)
        if not os.path.exists(self.workingDir):
            os.makedirs(self.workingDir)

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if len(self.seqFile.outgroups) == 0:
            # No outgroups specified, assume the default outgroup set
            outgroups = None
        else:
            outgroups = self.seqFile.outgroups
        runCreateMultiCactusProject(expPath,
                                    projPath,
                                    outgroupNames=outgroups,
                                    root=self.options.root)
Example 19
def runCreateMultiCactusProject(expFile, projectFile,
            outgroupNames=None, root=None, overwrite=False):
    options = CreateMultiCactusProjectOptions(expFile, projectFile,
            outgroupNames=outgroupNames, root=root, overwrite=overwrite)

    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    configPath = expTemplate.getConfigPath()
    confTemplate = ConfigWrapper(ET.parse(configPath).getroot())
    tree = expTemplate.getTree()
    if options.outgroupNames is not None:
        options.outgroupNames = set(options.outgroupNames)
        projNames = set([tree.getName(x) for x in tree.getLeaves()])
        for outgroupName in options.outgroupNames:
            if outgroupName not in projNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % outgroupName)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    expTemplate.setTree(mcProj.mcTree)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
Example 20
def validateInput(workDir, outputHalFile, options):
    try:
        if ' ' in workDir:
            raise RuntimeError(
                "Cactus does not support spaces in pathnames: %s" % workDir)
        if not os.path.isdir(workDir):
            os.makedirs(workDir)
        if not os.path.isdir(workDir) or not os.access(workDir, os.W_OK):
            raise OSError("workDir is not a writable directory: %s" % workDir)
    except Exception:
        raise RuntimeError("Can't write to workDir: %s" % workDir)
    try:
        open(outputHalFile, "w").close()
    except Exception:
        raise RuntimeError("Unable to write to hal: %s" % outputHalFile)
    if options.database != "tokyo_cabinet" and\
        options.database != "kyoto_tycoon":
        raise RuntimeError("Invalid database type: %s" % options.database)
    if options.outputMaf is not None:
        try:
            open(options.outputMaf, "w")
        except:
            raise RuntimeError("Unable to write to maf: %s" %
                               options.outputMaf)
    if options.configFile is not None:
        try:
            ConfigWrapper(ET.parse(options.configFile).getroot())
        except Exception:
            raise RuntimeError("Unable to read config: %s" %
                               options.configFile)
    if options.database == 'kyoto_tycoon':
        if options.ktType.lower() not in ('memory', 'snapshot', 'disk'):
            raise RuntimeError("Invalid ktserver type specified: %s. Must be "
                               "memory, snapshot or disk" % options.ktType)
Example 21
def main():
    options = get_options()

    with Toil(options) as workflow:
        setupBinaries(options)

        importSingularityImage(options)

        ## Preprocessing:
        if options.pathOverrides or options.pathOverrideNames:
            if not options.pathOverrides or not options.pathOverrideNames or \
                    len(options.pathOverrideNames) != len(options.pathOverrides):
                raise RuntimeError('same number of values must be passed to '
                                   '--pathOverrides and --pathOverrideNames')

        # Apply path overrides. This was necessary for WDL, which doesn't take
        # kindly to text files of local paths (i.e. the seqfile). One way to fix
        # this would be to add support for s3 paths and force WDL to use them; a
        # better way would be a more fundamental interface shift away from files
        # of paths throughout all of cactus.
        if options.pathOverrides:
            seqFile = SeqFile(options.seqFile)
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            tree = MultiCactusTree(seqFile.tree)
            tree.nameUnlabeledInternalNodes(
                prefix=config.getDefaultInternalNodePrefix())
            for name, override in zip(options.pathOverrideNames,
                                      options.pathOverrides):
                seqFile.pathMap[name] = override
            override_seq = os.path.join(options.cactusDir, 'seqFile.override')
            with open(override_seq, 'w') as out_sf:
                out_sf.write(str(seqFile))
            options.seqFile = override_seq

        # Import asms; by default, this prepends unique IDs using the same technique as cactus-blast.
        asms = get_asms_from_seqfile(options.seqFile, workflow)

        ## Perform alignments:
        if not workflow.options.restart:
            alignments = workflow.start(
                Job.wrapJobFn(run_cactus_reference_align, asms, options.refID,
                              options.debug_export, options.dipcall_bed_filter,
                              options.dipcall_vcf_filter))

        else:
            alignments = workflow.restart()

        if options.debug_export:
            # first, ensure the debug dir exists.
            if not os.path.isdir(options.debug_export_dir):
                os.mkdir(options.debug_export_dir)

            print(alignments)
            # The return value is: (all_primary, all_secondary, ref_mappings, primary_mappings, secondary_mappings)
            for asm, mapping_file in alignments[2].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".paf"))
            for asm, mapping_file in alignments[3].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".cigar"))
            for asm, mapping_file in alignments[4].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".cigar.secondry"))

        ## Save alignments:
        if options.dipcall_vcf_filter:  # this is substantially less restrictive than the dipcall_bed_filter.
            dipcall_filtered = workflow.start(
                Job.wrapJobFn(apply_dipcall_vcf_filter, alignments[0]))
            workflow.exportFile(dipcall_filtered, makeURL(options.outputFile))
            workflow.exportFile(
                alignments[1],
                makeURL(options.outputFile + ".unfiltered.secondary"))
        else:
            workflow.exportFile(alignments[0], makeURL(options.outputFile))
            workflow.exportFile(alignments[1],
                                makeURL(options.outputFile + ".secondary"))
Example 22
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'
    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            configPath = os.path.join(cactusRootPath(),
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtrees for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other "combined" batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface. 
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxsize)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf,
            # which renders this option useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)
        
        #set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is not None:
            fixNames = 1
        else:
            fixNames = 0
        if os.path.exists(projPath):
           if not self.isSameAsExisting(expPath, projPath, fixNames):
               raise RuntimeError("Existing project %s not " % projPath+
                                  "compatible with current input.  Please "
                                  "erase the working directory or rerun "
                                  "with the --overwrite option to start "
                                  "from scratch.")
           else:
               logPath = os.path.join(self.workingDir, 'cactus.log')
               logFile = open(logPath, "a")
               logFile.write("\nContinuing existing alignment.  Use "
                             "--overwrite or erase the working directory to "
                             "force restart from scratch.\n")
               logFile.close()
        else:
            cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
                expPath, projPath, fixNames)
            if len(self.seqFile.outgroups) > 0: 
                cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
            if self.options.rootOutgroupDist:
                cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
                cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
            system(cmd)

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, tempPath, fixNames)
        if len(self.seqFile.outgroups) > 0: 
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDist:
            cmd += " --rootOutgroupDist %f" % self.options.rootOutgroupDist
            cmd += " --rootOutgroupPath %s" % self.options.rootOutgroupPath
        system(cmd)
        projFilePathNew = os.path.join(tempPath,'%s_temp_project.xml' %
                                       self.alignmentDirName)
        projFilePathOld = os.path.join(oldPath, '%s_project.xml' %
                                       self.alignmentDirName)
        
        with open(projFilePathNew) as newHandle:
            newFile = newHandle.readlines()
        with open(projFilePathOld) as oldHandle:
            oldFile = oldHandle.readlines()
        areSame = True
        if len(newFile) != len(oldFile):
            areSame = False
        for newLine, oldLine in zip(newFile, oldFile):
            if newLine.replace(tempPath, oldPath) != oldLine:
                areSame = False
        system("rm -rf %s" % tempPath)
        return areSame
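The tail of isSameAsExisting compares the freshly generated project file against the existing one, treating lines as equal when they differ only by the temp-versus-real directory prefix. That comparison, distilled into a stand-alone helper (the function name is invented):

def filesMatchModuloPrefix(newPath, oldPath, tempPrefix, realPrefix):
    with open(newPath) as newFile, open(oldPath) as oldFile:
        newLines = newFile.readlines()
        oldLines = oldFile.readlines()
    if len(newLines) != len(oldLines):
        return False
    # lines match if rewriting the temp prefix makes them identical
    return all(newLine.replace(tempPrefix, realPrefix) == oldLine
               for newLine, oldLine in zip(newLines, oldLines))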
Example 23
def createFileStructure(mcProj, expTemplate, configTemplate, options):
    if not os.path.exists(options.path):
        os.makedirs(options.path)
    mcProj.writeXML(os.path.join(options.path, "%s_project.xml" % options.name))
    seqMap = expTemplate.seqMap
    portOffset = 0
    for name, expPath in mcProj.expMap.items():
        path = os.path.join(options.path, name)
        seqMap[name] = os.path.join(path, name + '.fa')
    for name, expPath in mcProj.expMap.items():
        path = os.path.join(options.path, name)
        children = mcProj.entireTree.getChildNames(name)
        exp = copy.deepcopy(expTemplate)

        # Get outgroups
        outgroups = []
        if configTemplate.getOutgroupStrategy() != 'none' \
                and name in mcProj.outgroup.ogMap:
            for og, ogDist in mcProj.outgroup.ogMap[name]:
                if og in seqMap:
                    ogPath = seqMap[og]
                else:
                    ogPath = os.path.join(options.path, og)
                    ogPath = os.path.join(ogPath, refFileName(og))
                    seqMap[og] = ogPath
                outgroups += [og]

        # Get subtree connecting children + outgroups
        assert len(children) > 0
        subtree = mcProj.entireTree.extractSpanningTree(children + outgroups)
        dbBase = path
        if expTemplate.getDbDir() is not None:
            dbBase = os.path.abspath(expTemplate.getDbDir())
        exp.setDbDir(os.path.join(dbBase, name, "%s_DB" % name))
        if expTemplate.getDbType() == "kyoto_tycoon" and \
            os.path.splitext(name)[1] != ".kch":
            exp.setDbName("%s.kch" % name)
        else:
            exp.setDbName(name)
        if expTemplate.getDbType() == "kyoto_tycoon":
            exp.setDbPort(expTemplate.getDbPort() + portOffset)
            portOffset += 1
            host = expTemplate.getDbHost()
            if host is not None:
                exp.setDbHost(host)
        exp.setReferencePath(os.path.join(path, name + '.fa'))
        if configTemplate.getBuildHal():
            exp.setHALPath(os.path.join(path, "%s_hal.c2h" % name))
        if configTemplate.getBuildFasta():
            exp.setHALFastaPath(os.path.join(path, "%s_hal.fa" % name))
        exp.updateTree(subtree, seqMap, outgroups)
        exp.setConfigPath(os.path.join(path, "%s_config.xml" % name))
        if not os.path.exists(exp.getDbDir()):
            os.makedirs(exp.getDbDir())
        if not os.path.exists(path):
            os.makedirs(path)
        exp.writeXML(expPath)
        config = ConfigWrapper(copy.deepcopy(configTemplate.xmlRoot))
        config.setReferenceName(name)
        config.verifyMinBlockDegree(exp)
        config.writeXML(exp.getConfigPath())
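createFileStructure stamps out one experiment per internal node from shared templates, so each node gets its own deep copy before per-node edits (db name, ports, reference path). A minimal stdlib illustration of why the copy matters (element and node names invented):

import copy
import xml.etree.ElementTree as ET

template = ET.fromstring('<config referenceName="TEMPLATE"/>')
perNode = {}
for name in ["Anc0", "Anc1"]:
    node = copy.deepcopy(template)  # isolate per-node edits
    node.set("referenceName", name)
    perNode[name] = node
assert template.get("referenceName") == "TEMPLATE"  # template untouched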
Example 24
    def run(self):
        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        self.options.experimentFile = self.project.expMap[self.event]
        expXml = ET.parse(self.options.experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configXml = ET.parse(experiment.getConfigPath()).getroot()
        configWrapper = ConfigWrapper(configXml)

        # need at least 3 processes for every event when using ktserver:
        # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
        if experiment.getDbType() == "kyoto_tycoon":            
            maxParallel = min(len(self.project.expMap),
                             configWrapper.getMaxParallelSubtrees()) 
            if self.options.batchSystem == "singleMachine":
                if int(self.options.maxThreads) < maxParallel * 3:
                    raise RuntimeError("At least %d threads are required (only %d were specified) to handle up to %d events using kyoto tycoon. Either increase the number of threads using the --maxThreads option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, self.options.maxThreads, maxParallel, configWrapper.getMaxParallelSubtrees()))
            else:
                if int(self.options.maxCpus) < maxParallel * 3:
                    raise RuntimeError("At least %d concurrent cpus are required to handle up to %d events using kyoto tycoon. Either increase the number of cpus using the --maxCpus option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, maxParallel, configWrapper.getMaxParallelSubtrees()))
                    
        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        workFlowArgs = CactusWorkflowArguments(self.options)
        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.overwrite = self.options.overwrite
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        
        experiment = ExperimentWrapper(workFlowArgs.experimentNode)

        donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
        doneDone = os.path.isfile(donePath)
        refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
        halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                                os.path.isfile(experiment.getHALPath()))
                                                               
        if not workFlowArgs.overwrite and doneDone and refDone and halDone:
            self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                             self.event)
        else:
            system("rm -f %s" % donePath)
            # delete the database if overwrite was specified (or if the
            # reference is not present)
            dbPath = os.path.join(experiment.getDbDir(), 
                                  experiment.getDbName())
            seqPath = os.path.join(experiment.getDbDir(), "sequences")
            system("rm -f %s* %s %s" % (dbPath, seqPath, 
                                        experiment.getReferencePath()))

            if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
                # Use the trimming strategy to blast ingroups vs outgroups.
                self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            else:
                self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                     phaseName="setup"))
        logger.info("Going to create alignments and define the cactus tree")

        self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
Example 25
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            paf_id, gfa_fa_id = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Apply path overrides. This was necessary for WDL, which doesn't
            # take kindly to text files of local paths (i.e. the seqfile). One
            # way to fix this would be to add support for s3 paths and force WDL
            # to use them; a better way would be a more fundamental interface
            # shift away from files of paths throughout all of cactus.
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virutal" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "refgraph"),
                                            "assemblyName",
                                            default="__MINIGRAPH_SEQUENCES__")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # update the input seqfile (in place!)
        add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta),
                              graph_event)
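When a genome's sequence is a directory, the loop above concatenates its contents into one temporary FASTA before import. A stdlib sketch of that flattening step, standing in for the catFiles/getTempFile helpers used here (the function name is invented):

import os
import shutil
import tempfile

def concatenateSequenceDir(seqDir):
    fd, tmpPath = tempfile.mkstemp(suffix=".fa")
    with os.fdopen(fd, "wb") as out:
        for name in sorted(os.listdir(seqDir)):
            with open(os.path.join(seqDir, name), "rb") as part:
                shutil.copyfileobj(part, out)  # append each piece in order
    return tmpPath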
Example 26
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database",
                        dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument("--latest",
                        dest="latest",
                        action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([
                        os.path.join(seq, subSeq) for subSeq in os.listdir(seq)
                    ], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
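
The restart/start split above is the standard Toil resume idiom. A self-contained sketch of the same pattern, assuming options came from a parser set up with Job.Runner.addToilOptions:

from toil.common import Toil

def run_or_resume(options, root_job):
    with Toil(options) as toil:
        # --restart resumes the existing job store; otherwise launch root_job
        return toil.restart() if options.restart else toil.start(root_job)
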
Example 27
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'

    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            rootDir = cactusRootPath()
            configPath = os.path.join(rootDir, "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtrees for singleMachine
        # mode if not enough threads are provided to support it.  Other
        # "combined" batch systems probably need similar handling.
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxsize)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktHost is not None:
                self.expWrapper.setDbHost(self.options.ktHost)
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow spaces in attributes in the db conf,
            # which renders this option useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

        #set the sequence output directory
        outSeqDir = os.path.join(self.workingDir, "sequenceData")
        if os.path.exists(outSeqDir) and self.options.overwrite:
            system("rm -rf %s" % outSeqDir)
        if not os.path.exists(outSeqDir):
            system("mkdir %s" % outSeqDir)
        self.expWrapper.setOutputSequenceDir(outSeqDir)

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = absSymPath(os.path.join(self.workingDir, "config.xml"))
        expPath = absSymPath(os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is True:
            fixNames = 1
        else:
            fixNames = 0
        if os.path.exists(projPath):
            if not self.isSameAsExisting(expPath, projPath, fixNames):
                raise RuntimeError("Existing project %s not " % projPath +
                                   "compatible with current input.  Please "
                                   "erase the working directory or rerun "
                                   "with the --overwrite option to start "
                                   "from scratch.")
            else:
                logPath = os.path.join(self.workingDir, 'cactus.log')
                with open(logPath, "a") as logFile:
                    logFile.write("\nContinuing existing alignment.  Use "
                                  "--overwrite or erase the working directory "
                                  "to force restart from scratch.\n")
        else:
            cmd = "cactus_createMultiCactusProject.py \"%s\" \"%s\" --fixNames=%d" % (
                expPath, projPath, fixNames)
            if len(self.seqFile.outgroups) > 0:
                cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
            if self.options.rootOutgroupDists:
                cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
                cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
            if self.options.root is not None:
                cmd += " --root %s" % self.options.root
            system(cmd)

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        # Fix for relative directories
        if oldPath[0:2] == './':
            oldPath = oldPath[2:]
        if tempPath[0:2] == './':
            tempPath = tempPath[2:]
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, tempPath, fixNames)
        if len(self.seqFile.outgroups) > 0:
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        if self.options.rootOutgroupDists:
            cmd += " --rootOutgroupDists %s" % self.options.rootOutgroupDists
            cmd += " --rootOutgroupPaths %s" % self.options.rootOutgroupPaths
        if self.options.root is not None:
            cmd += " --root %s" % self.options.root
        system(cmd)
        projFilePathNew = os.path.join(
            tempPath, '%s_temp_project.xml' % self.alignmentDirName)
        projFilePathOld = os.path.join(
            oldPath, '%s_project.xml' % self.alignmentDirName)

        with open(projFilePathNew, "r") as fh:
            newFile = fh.readlines()
        with open(projFilePathOld, "r") as fh:
            oldFile = fh.readlines()
        areSame = True
        if len(newFile) != len(oldFile):
            areSame = False
        for newLine, oldLine in zip(newFile, oldFile):
            if newLine.replace(tempPath, oldPath) != oldLine:
                areSame = False
        system("rm -rf %s" % tempPath)
        return areSame
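
A hypothetical construction of the wrapper above, assuming options carries the attributes that processConfig and processExperiment read (configFile, database, outputMaf, overwrite, legacy, batchSystem, ...):

# seqFile = SeqFile("examples/evolverMammals.txt")
# wrapper = ProjectWrapper(options, seqFile, workingDir=options.cactusDir)
# wrapper.writeXml()  # writes config.xml, expTemplate.xml and the project dir
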
Example 28
    def run(self, fileStore):
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        experimentFile = fileStore.readGlobalFile(
            self.project.expIDMap[self.event])
        expXml = ET.parse(experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configPath = fileStore.readGlobalFile(experiment.getConfigID())
        configXml = ET.parse(configPath).getroot()

        seqIDMap = dict()
        tree = experiment.getTree()
        seqNames = []
        for node in tree.postOrderTraversal():
            name = tree.getName(node)
            if tree.isLeaf(node) or (name == experiment.getRootGenome()
                                     and not experiment.isRootReconstructed()):
                seqIDMap[name] = self.project.outputSequenceIDMap[name]
                seqNames.append(name)
        logger.info("Sequences in progressive, %s: %s" %
                    (self.event, seqNames))

        experimentFile = fileStore.getLocalTempFile()
        experiment.writeXML(experimentFile)
        self.options.experimentFileID = fileStore.writeGlobalFile(
            experimentFile)

        # take union of command line options and config options for hal and reference
        halNode = findRequiredNode(configXml, "hal")
        if not self.options.buildHal:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal",
                                                      bool, False)
        if not self.options.buildFasta:
            self.options.buildFasta = getOptionalAttrib(
                halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        configFile = fileStore.readGlobalFile(experiment.getConfigID())
        configNode = ET.parse(configFile).getroot()
        workFlowArgs = CactusWorkflowArguments(self.options,
                                               experimentFile=experimentFile,
                                               configNode=configNode,
                                               seqIDMap=seqIDMap)

        # copy over the options so we don't trail them around
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        if self.options.intermediateResultsUrl is not None:
            # Give the URL prefix a special name for this particular
            # subproblem (by suffixing it with the name of the
            # internal node in the guide tree)
            workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

        # Use the trimming strategy to blast ingroups vs outgroups.
        finalExpWrapper = self.addChild(
            CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs,
                                     phaseName="trimBlast")).rv()
        logger.info("Going to create alignments and define the cactus tree")

        return finalExpWrapper
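
A standalone illustration (not cactus code) of the option/config union applied to the hal node above: a command-line flag that is already set wins; otherwise the XML attribute supplies the value.

import xml.etree.ElementTree as ET

halNode = ET.fromstring('<hal buildHal="1" buildFasta="0"/>')
buildHal = False
if not buildHal:
    # fall back to the config attribute, as getOptionalAttrib does above
    buildHal = halNode.get("buildHal") == "1"   # -> True, taken from config
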
Example 29
class RunCactusPreprocessorThenProgressiveDown(RoundedJob):
    def __init__(self, options, project, memory=None, cores=None):
        RoundedJob.__init__(self, memory=memory, cores=cores, preemptable=True)
        self.options = options
        self.project = project

    def run(self, fileStore):
        self.configNode = ET.parse(
            fileStore.readGlobalFile(self.project.getConfigID())).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        fileStore.logToMaster("Using the following configuration:\n%s" %
                              ET.tostring(self.configNode, encoding='unicode'))

        # Log the stats for the un-preprocessed assemblies
        for name, sequence in list(self.project.inputSequenceIDMap.items()):
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", name,
                               sequence)

        # Create jobs to create the output sequences
        logger.info("Reading config file from: %s" %
                    self.project.getConfigID())
        configFile = fileStore.readGlobalFile(self.project.getConfigID())
        configNode = ET.parse(configFile).getroot()
        # this substitution is necessary before the config node is used below
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
        #Add the preprocessor child job. The output is a job promise value that will be
        #converted into a list of the IDs of the preprocessed sequences in the follow on job.
        preprocessorJob = self.addChild(
            CactusPreprocessor(list(self.project.inputSequenceIDMap.values()),
                               configNode))
        rvs = [
            preprocessorJob.rv(i)
            for i in range(len(self.project.inputSequenceIDMap))
        ]
        fileStore.logToMaster('input sequence IDs: %s' %
                              self.project.inputSequenceIDMap)
        for genome, rv in zip(list(self.project.inputSequenceIDMap.keys()),
                              rvs):
            self.project.outputSequenceIDMap[genome] = rv

        #Now build the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()
        self.options.event = self.project.mcTree.getRootName()
        leafNames = [
            self.project.mcTree.getName(i)
            for i in self.project.mcTree.getLeaves()
        ]
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        return self.addFollowOn(
            RunCactusPreprocessorThenProgressiveDown2(
                options=self.options,
                project=self.project,
                event=self.options.event,
                schedule=schedule,
                memory=self.configWrapper.getDefaultMemory())).rv()
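
The rv(i) fan-out above relies on Toil promise indexing: a promise can be indexed before the child runs and resolves to that element of the child's return value. A minimal standalone sketch with a hypothetical ListJob:

from toil.job import Job

class ListJob(Job):
    def run(self, fileStore):
        return ["first.fa", "second.fa"]

# from a parent job's run() method:
#   child = self.addChild(ListJob())
#   first = child.rv(0)   # resolves to "first.fa" once ListJob has run
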
Example 30
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "graphmap"),
                                            "assemblyName",
                                            default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = {seqFile.tree.getName(node)
                      for node in seqFile.tree.getLeaves()}

            if graph_event not in leaves:
                raise RuntimeError(
                    "Minigraph name {} not found in seqfile".format(
                        graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError(
                    "Name given with --reference {} not found in seqfile".
                    format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(
                Job.wrapJobFn(graphmap_split_workflow, options, config,
                              seqIDMap, gfa_id, options.minigraphGFA, paf_id,
                              options.graphmapPAF, ref_contigs,
                              options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:],
                          options.outDir, config)
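
For reference, the contig-list parsing above (one contig per line, first whitespace-delimited token, blank lines skipped) as a standalone helper:

def read_contigs(path):
    contigs = set()
    with open(path) as rc_file:
        for line in rc_file:
            if line.strip():
                # keep only the first token, as the loop above does
                contigs.add(line.strip().split()[0])
    return contigs
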
Example 31
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # the directory may already exist; the checks below handle the rest
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)
        
    # pass through the config file to the options
    # todo: avoid this second hard-coded check of the default .xml path
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
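
A worked illustration of the path rewrite above, with hypothetical names: leaves keep their basename but move under outDir, and unlabeled ancestors get "<name>.fa" there.

import os

outDir = "/out"
pathMap = {"human": "/data/hg38.fa"}
print(os.path.join(outDir, os.path.basename(pathMap["human"])))  # /out/hg38.fa
print(os.path.join(outDir, "Anc0.fa"))                           # /out/Anc0.fa
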
Example 32
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(options.blastOutput) +
                        '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except Exception:
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except Exception:
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(options.blastOutput) +
                            '.ig_coverage_{}'.format(i)))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                              project, cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
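
For reference, the cactus-blast side products probed above follow a simple suffix scheme relative to the primary alignments file (names as used in the code above):

# <blastOutput>                    primary alignments
# <blastOutput>.secondary          optional secondary alignments
# <blastOutput>.og_fragment_<i>    trimmed fragment for outgroup i
# <blastOutput>.ig_coverage_<i>    ingroup coverage for leaf i
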
Example 33
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # Apply path overrides.  This was necessary for WDL, which doesn't take
    # kindly to text files of local paths (i.e. the seqfile).  One way to fix
    # this would be to add support for s3 paths and force WDL to use them; a
    # better way would be a more fundamental interface shift away from files
    # of paths throughout all of cactus.
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except Exception:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except Exception:
            # no secondary alignments alongside the primary input
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
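
A worked example of get_input_path's suffix lookup, with hypothetical file names:

# options.cigarsFile = ["aln.paf", "aln.paf.secondary"]
# get_input_path()              -> "aln.paf"
# get_input_path('.secondary')  -> "aln.paf.secondary"
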
Example 34
def stageWorkflow(outputSequenceDir,
                  configFile,
                  inputSequences,
                  toil,
                  restart=False,
                  outputSequences=None,
                  maskAlpha=False,
                  clipAlpha=None,
                  maskPAF=None,
                  inputEventNames=None,
                  brnnCores=None):
    #Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(
            inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    loadDnaBrnnModel(toil, configNode, maskAlpha=maskAlpha)

    if configNode.find("constants") is not None:
        ConfigWrapper(
            configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        ConfigWrapper(configNode).setPreprocessorActive(
            "lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                if clipAlpha:
                    node.attrib["action"] = "clip"
    if brnnCores is not None:
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                node.attrib["cpu"] = brnnCores

    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        if maskPAF:
            inputPAFID = toil.importFile(makeURL(maskPAF))
        else:
            inputPAFID = None
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences,
                                  inputSequenceIDs, inputEventNames, maskPAF,
                                  inputPAFID)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files.  we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except Exception:
            toil.exportFile(seqID, makeURL(path))
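
The try/except around iter() above duck-types the preprocessor output: with dna-brnn active each entry is a triple of file IDs (sequence plus two bed files); otherwise it is a single file ID, the indexed export fails, and the handler falls back to a single export. A hypothetical standalone analogue of the same fallback:

def export_outputs(toil, seqID, path):
    try:
        seq, bed, mask_bed = seqID   # triple produced by dna-brnn
        toil.exportFile(seq, makeURL(path))
        toil.exportFile(bed, makeURL(path) + '.bed')
        toil.exportFile(mask_bed, makeURL(path) + '.mask.bed')
    except (TypeError, ValueError):
        # a single file ID cannot be unpacked into three parts
        toil.exportFile(seqID, makeURL(path))
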
Example 35
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                      help="Database type: tokyo_cabinet or kyoto_tycoon"
                      " [default: %(default)s]",
                      default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                      help="Specify cactus configuration file",
                      default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)   
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project 
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            print "Importing %s sequences" % (len(project.getInputSequencePaths()))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()


            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
Example 36
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Apply path overrides.  This was necessary for WDL, which doesn't
            # take kindly to text files of local paths (i.e. the seqfile).  One
            # way to fix this would be to add support for s3 paths and force
            # WDL to use them; a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus.
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            logger.info("Input sequence map: %s" % project.inputSequenceMap)

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as
        # the sequence names were changed in the alignments above
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
Example 37
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Apply path overrides.  This was necessary for WDL, which doesn't
            # take kindly to text files of local paths (i.e. the seqfile).  One
            # way to fix this would be to add support for s3 paths and force
            # WDL to use them; a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus.
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except Exception:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break
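            # note: the '.og_fragment_{i}' suffix probed above matches the
            # files that runCactusBlastOnly (next example) exports alongside
            # its primary output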

            # Import the sequences we need to align for the given event
            # (i.e. the leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
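                        # e.g. (hypothetical layout) seq = 'seqs/human/' holding
                        # several per-chromosome sequence files gets concatenated
                        # into one temporary file before import into the jobstore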
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            # Import the cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                # configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            # Import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except Exception:
                    # no secondary alignments file was produced; that's fine
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))
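            # note: this imports one '.ig_coverage_{i}' file per leaf genome,
            # matching the ingroup-coverage exports at the end of
            # runCactusBlastOnly (next example)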

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
Example n. 38
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            # Import the sequences we need to align for the given event
            # (i.e. the leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            # Import the cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))
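
        # outWorkFlowArgs now carries the file-store IDs produced by the blast
        # phase (alignmentsID, secondaryAlignmentsID, outgroupFragmentIDs,
        # ingroupCoverageIDs), which are exported below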

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align,
        # because the sequence names were changed in the alignments above
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we export it here since we already have it
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))
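
For reference, here is a minimal self-contained sketch (hypothetical paths and counts, not part of the cactus API) of the file-naming contract between the two examples above: runCactusBlastOnly exports suffixed files next to options.outputFile, and the cactus-align example's get_input_path helper resolves them back from options.cigarsFile.

import os

def blast_outputs(output_file, n_outgroups, n_leaves):
    # Mirrors the toil.exportFile calls at the end of runCactusBlastOnly.
    names = [output_file, output_file + '.secondary']
    names += [output_file + '.og_fragment_{}'.format(i)
              for i in range(n_outgroups)]
    names += [output_file + '.ig_coverage_{}'.format(i)
              for i in range(n_leaves)]
    return names

def get_input_path(cigars_file, suffix=''):
    # Same suffix-lookup logic as the helper in the cactus-align example.
    base_path = cigars_file[0]
    for input_path in cigars_file:
        if suffix and input_path.endswith(suffix):
            return input_path
        if os.path.basename(base_path).startswith(
                os.path.basename(input_path)):
            base_path = input_path
    return base_path + suffix

cigars = blast_outputs('work/anc0.cigar', n_outgroups=1, n_leaves=2)
assert get_input_path(cigars) == 'work/anc0.cigar'
assert get_input_path(cigars, '.secondary') == 'work/anc0.cigar.secondary'
assert get_input_path(cigars, '.og_fragment_0') == 'work/anc0.cigar.og_fragment_0'
assert get_input_path(cigars, '.ig_coverage_1') == 'work/anc0.cigar.ig_coverage_1'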