Python ExperimentWrapper.getConfigPath Examples

Programming Language: Python

Namespace/Package Name: cactus.shared.experimentWrapper

Method/Function: getConfigPath

Examples at hotexamples.com: 11

Python ExperimentWrapper.getConfigPath - 11 examples found. These are the top rated real world Python examples of cactus.shared.experimentWrapper.ExperimentWrapper.getConfigPath extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ExperimentWrapper(26)

getTree(16)

getConfigPath(9)

getOutgroupGenomes(8)

writeXML(7)

getSequenceID(5)

setSequenceID(4)

createExperimentWrapper(4)

getOutgroupEvents(3)

setConfigPath(3)

buildSequenceMap(3)

getConfigID(3)

getDbType(2)

getReferencePath(2)

getGenomesWithSequence(2)

getDbDir(2)

getRootGenome(2)

getHALPath(2)

setTree(2)

setSequences(1)

setHALPath(1)

updateTree(1)

setDbReadTuningOptions(1)

setDbSnapshot(1)

setHALFastaPath(1)

setOutputSequenceDir(1)

setOutgroupGenomes(1)

getConstraintsFilePath(1)

setReferencePath(1)

setDbInMemory(1)

setRootGenome(1)

setRootReconstructed(1)

setSecondaryDBElem(1)

setDbPort(1)

setConstraintsID(1)

setDbHost(1)

getReferenceID(1)

getHALFastaPath(1)

getHalFastaID(1)

getHalID(1)

getDbPort(1)

getDbName(1)

getOutputSequenceDir(1)

getSecondaryDBElem(1)

setDbDir(1)

getSequence(1)

addOutgroupSequence(1)

getSequenceMap(1)

getDbHost(1)

getConfig(1)

Example #1

Show file

File: cactus_createMultiCactusProject.py Project: zqingyuan/cactus

def runCreateMultiCactusProject(expFile,
                                projectFile,
                                fixNames=False,
                                outgroupNames=None,
                                root=None,
                                overwrite=False):

    options = CreateMultiCactusProjectOptions(expFile,
                                              projectFile,
                                              fixNames=fixNames,
                                              outgroupNames=outgroupNames,
                                              root=root,
                                              overwrite=overwrite)

    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    configPath = expTemplate.getConfigPath()
    confTemplate = ConfigWrapper(ET.parse(configPath).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    tree = expTemplate.getTree()
    if options.outgroupNames is not None:
        options.outgroupNames = set(options.outgroupNames)
        projNames = set([tree.getName(x) for x in tree.getLeaves()])
        for outgroupName in options.outgroupNames:
            if outgroupName not in projNames:
                raise RuntimeError("Specified outgroup %s not found in tree" %
                                   outgroupName)
    mcProj = createMCProject(tree, expTemplate, confTemplate, options)
    #Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    #Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)

Example #2

Show file

 def progressiveFunction(self,
                         experimentFile,
                         toilDir,
                         batchSystem,
                         buildAvgs,
                         buildHal,
                         buildFasta,
                         toilStats,
                         subtreeRoot=None,
                         logLevel=None):
     eW = ExperimentWrapper(ET.parse(experimentFile).getroot())
     seqFile = getTempFile()
     with open(seqFile, 'w') as f:
         tree = eW.getTree()
         newick = NXNewick().writeString(tree)
         f.write('%s\n' % newick)
         for genome in eW.getGenomesWithSequence():
             f.write('%s %s\n' % (genome, eW.getSequenceID(genome)))
     config = eW.getConfigPath()
     runCactusProgressive(seqFile,
                          config,
                          toilDir,
                          batchSystem=batchSystem,
                          buildAvgs=buildAvgs,
                          toilStats=toilStats,
                          logLevel=logLevel)

Example #3

Show file

File: multiCactusProject.py Project: zoumingr/cactus

 def syncToFileStore(self, toil):
     self.expIDMap = dict()
     for name, expPath in list(self.expMap.items()):
         expWrapper = ExperimentWrapper(ET.parse(expPath).getroot())
         expWrapper.setConfigID(toil.importFile("file://" + expWrapper.getConfigPath()))
         expWrapper.writeXML(expPath)
         self.expIDMap[name] = toil.importFile("file://" + expPath)

Example #4

Show file

def main():
    usage = "usage: %prog [options] <experiment> <output project path>"
    description = "Setup a multi-cactus project using an experiment xml as template"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--fixNames", dest="fixNames",  default = "True", 
                      help="try to make sequence and event names MAF-compliant [default=true]")
    parser.add_option("--outgroupNames", dest="outgroupNames",  default = None, 
                      help="comma-separated names of high quality assemblies to use as outgroups [default=everything]")
    parser.add_option("--root", dest="root", type=str,
                      help="name of alignment root (must be labeled ancestral node in tree in input experiment).  Useful "
                      "for allowing the tree to contain nodes that won't be in the alignment but can still be used for "
                      "outgroups.",
                      default=None)
    parser.add_option("--overwrite", action="store_true", help="Overwrite existing experiment files", default=False)

    options, args = parser.parse_args()
    
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")

    options.expFile = args[0]    
    options.path = os.path.abspath(args[1])
    options.name = os.path.basename(options.path)
    options.fixNames = not options.fixNames.lower() == "false"

    if (os.path.isdir(options.path) and not options.overwrite) or os.path.isfile(options.path):
        raise RuntimeError("Output project path %s exists\n" % options.path)
    
    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    configPath = expTemplate.getConfigPath()
    confTemplate = ConfigWrapper(ET.parse(configPath).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    checkInputSequencePaths(expTemplate)
    tree = expTemplate.getTree()

    # Check that the tree is sensible (root has at least 1 child)
    if len(tree.getChildren(tree.getRootId())) == 0:
        raise RuntimeError("Input species tree has only one node.")

    if options.outgroupNames is not None:
        projNames = set([tree.getName(x) for x in tree.getLeaves()])
        options.outgroupNames = set(options.outgroupNames.split(","))
        for outgroupName in options.outgroupNames:
            if outgroupName not in projNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % outgroupName)
    mcProj = createMCProject(tree, expTemplate, confTemplate, options)
    #Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    expTemplate.setSequences(CactusPreprocessor.getOutputSequenceFiles(mcProj.inputSequences, expTemplate.getOutputSequenceDir()))

    #Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
   # mcProj.check()
    return 0

Example #5

Show file

File: schedule.py Project: yangchou0916/cactus

    def loadProject(self, mcProject, fileStore = None):
        self.inGraph = NX.DiGraph()
        globTree = mcProject.mcTree
        self.maxParallelSubtrees = None
        leafEvents = [globTree.getName(i) for i in globTree.getLeaves()]

        expMap = None
        if fileStore:
            expMap = dict()
            for name in mcProject.expIDMap:
                expMap[name] = fileStore.readGlobalFile(mcProject.expIDMap[name])
        else:
            expMap = mcProject.expMap

        for name, expPath in list(expMap.items()):
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            tree = exp.getTree()
            self.inGraph.add_node(name)
            # Go through the species tree and add the correct
            # dependencies (i.e. to the outgroups and the ingroups,
            # but not to the other nodes that are just there because
            # they are needed to form the correct paths).
            for node in tree.postOrderTraversal():
                nodeName = tree.getName(node)

                # we don't add edges for leaves (in the global tree)
                # as they are input sequences and do not form dependencies
                # (it would be clever to maybe do the same with existing
                # references when --overwrite is not specified but for now
                # we just do the leaves)
                if nodeName not in leafEvents and tree.isLeaf(node):
                    self.inGraph.add_edge(name, nodeName)
            if fileStore:
                configFile = fileStore.readGlobalFile(exp.getConfigID())
            else:
                # hack from running from cactus-prepare
                configFile = exp.getConfigPath()
            configElem = ET.parse(configFile).getroot()
            conf = ConfigWrapper(configElem)
            # load max parellel subtrees from the node's config
            if self.maxParallelSubtrees is None:
                self.maxParallelSubtrees = conf.getMaxParallelSubtrees()
            else:
                assert self.maxParallelSubtrees == conf.getMaxParallelSubtrees()
        assert NX.is_directed_acyclic_graph(self.inGraph)

Example #6

Show file

File: cactus_align.py Project: xiangyupan/cactus

def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))

Example #7

Show file

    def run(self):
        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        self.options.experimentFile = self.project.expMap[self.event]
        expXml = ET.parse(self.options.experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configXml = ET.parse(experiment.getConfigPath()).getroot()
        configWrapper = ConfigWrapper(configXml)

        # need at least 3 processes for every event when using ktserver:
        # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
        if experiment.getDbType() == "kyoto_tycoon":            
            maxParallel = min(len(self.project.expMap),
                             configWrapper.getMaxParallelSubtrees()) 
            if self.options.batchSystem == "singleMachine":
                if int(self.options.maxThreads) < maxParallel * 3:
                    raise RuntimeError("At least %d threads are required (only %d were specified) to handle up to %d events using kyoto tycoon. Either increase the number of threads using the --maxThreads option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, self.options.maxThreads, maxParallel, configWrapper.getMaxParallelSubtrees()))
            else:
                if int(self.options.maxCpus) < maxParallel * 3:
                    raise RuntimeError("At least %d concurrent cpus are required to handle up to %d events using kyoto tycoon. Either increase the number of cpus using the --maxCpus option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, maxParallel, configWrapper.getMaxParallelSubtrees()))
                    
        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        workFlowArgs = CactusWorkflowArguments(self.options)
        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.overwrite = self.options.overwrite
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        
        experiment = ExperimentWrapper(workFlowArgs.experimentNode)

        donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
        doneDone = os.path.isfile(donePath)
        refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
        halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                                os.path.isfile(experiment.getHALPath()))
                                                               
        if not workFlowArgs.overwrite and doneDone and refDone and halDone:
            self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                             self.event)
        else:
            system("rm -f %s" % donePath)
            # delete database 
            # and overwrite specified (or if reference not present)
            dbPath = os.path.join(experiment.getDbDir(), 
                                  experiment.getDbName())
            seqPath = os.path.join(experiment.getDbDir(), "sequences")
            system("rm -f %s* %s %s" % (dbPath, seqPath, 
                                        experiment.getReferencePath()))

            if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
                # Use the trimming strategy to blast ingroups vs outgroups.
                self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            else:
                self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                     phaseName="setup"))
        logger.info("Going to create alignments and define the cactus tree")

        self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))

Example #8

Show file

File: cactus_align.py Project: shanwai1234/cactus

def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job

Example #9

Show file

File: cactus_align.py Project: masri2019/cactus

def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(options.blastOutput) +
                        '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except:
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not cactus_blast_input:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except:
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(options.blastOutput) +
                            '.ig_coverage_{}'.format(i)))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                              project, cactus_blast_input))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))

Example #10

Show file

def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            print(str(project.inputSequenceMap))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignemnts
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))

Example #11

Show file

def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options,
                                         options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignemnts
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))