コード例 #1
0
    def progressiveFunction(self, experimentFile, toilDir,
                            batchSystem, buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers and sequences in the output are the
        # same as the sequences in the input (minus differences in
        # repeat-masking)
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> headers in fasta
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = list(map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(header in headers[genome],
                                            'Header %s from output c2h %s not found in input fa %s'
                                            ' for genome %s' % (header, c2hPath, seqMap[genome], genome))


        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
コード例 #2
0
def updateProject(path):
    mcProj = MultiCactusProject()
    mcProj.readXML(path)
    basePath, name = os.path.split(path)
    
    for name,oldPath in mcProj.expMap.items():
        fileName = os.path.basename(oldPath)
        dirName = os.path.dirname(oldPath).rpartition('/')[2] 
        newPath = os.path.join(basePath, dirName, fileName)
        
        if not os.path.isfile(newPath):
            raise RuntimeError("Experiment file %s not found\n" % newPath)
        
        mcProj.expMap[name] = newPath   
        
        exp = ExperimentWrapper(ET.parse(newPath).getroot())
        
        oldDbDir = exp.getDbDir()
        if oldDbDir is not None:
            dbDirName = oldDbDir[oldDbDir.find(name):]
            newDbDir = os.path.join(basePath, dbDirName)
            exp.setDbDir(newDbDir)
        
        oldRefPath = exp.getReferencePath()
        if oldRefPath is not None:
            refName = oldRefPath[oldRefPath.find(name):]
            newRefPath = os.path.join(basePath, refName)
            exp.setReferencePath(newRefPath)
    
        oldHalPath = exp.getHALPath()
        if oldHalPath is not None:
            halName = oldHalPath[oldHalPath.find(name):]
            newHalPath = os.path.join(basePath, halName)
            exp.setHALPath(newHalPath)

        oldHalFastaPath = exp.getHALFastaPath()
        if oldHalFastaPath is not None:
            halFastaName = oldHalFastaPath[oldHalFastaPath.find(name):]
            newHalFastaPath = os.path.join(basePath, halFastaName)
            exp.setHALFastaPath(newHalFastaPath)

        # seems to have dissappeared from experiment?
        #oldMafPath = exp.getMAFPath()
        #if oldMafPath is not None:
        #    mafName = oldMafPath[oldMafPath.find(name):]
        #    newMafPath = os.path.join(basePath, mafName)
        #    exp.setMAFPath(newMafPath)

        if exp.getDbType() == "kyoto_tycoon":
            oldHostName = exp.getDbHost()
            if oldHostName is not None:
                newHostName = socket.gethostname()
                exp.setDbHost(newHostName)
        
        system("cp %s %s.old" %(newPath, newPath))
        exp.writeXML(newPath)
    
    mcProj.writeXML(path)
コード例 #3
0
ファイル: cactus2hal.py プロジェクト: diekhans/cactus2hal
def main():
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        print 'rm -f {0}'.format(args['HAL_file_path'])
        system('rm -f {0}'.format(args['HAL_file_path']))

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            experiment = ExperimentWrapper(
                ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree())
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])

            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])
            if args["inMemory"] is True:
                cmdline += " --inMemory"

            print cmdline
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime


#            print "time of above command: {0:.2f}".format(appendTime)

    totalTime = time.time() - totalTime
    print "total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime)
コード例 #4
0
    def run(self):
        logger.info("Progressive Up: " + self.event)

        # open up the experiment
        # note that we copy the path into the options here
        self.options.experimentFile = self.project.expMap[self.event]
        expXml = ET.parse(self.options.experimentFile).getroot()
        experiment = ExperimentWrapper(expXml)
        configXml = ET.parse(experiment.getConfigPath()).getroot()
        configWrapper = ConfigWrapper(configXml)

        # need at least 3 processes for every event when using ktserver:
        # 1 proc to run jobs, 1 proc to run server, 1 proc to run 2ndary server
        if experiment.getDbType() == "kyoto_tycoon":            
            maxParallel = min(len(self.project.expMap),
                             configWrapper.getMaxParallelSubtrees()) 
            if self.options.batchSystem == "singleMachine":
                if int(self.options.maxThreads) < maxParallel * 3:
                    raise RuntimeError("At least %d threads are required (only %d were specified) to handle up to %d events using kyoto tycoon. Either increase the number of threads using the --maxThreads option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, self.options.maxThreads, maxParallel, configWrapper.getMaxParallelSubtrees()))
            else:
                if int(self.options.maxCpus) < maxParallel * 3:
                    raise RuntimeError("At least %d concurrent cpus are required to handle up to %d events using kyoto tycoon. Either increase the number of cpus using the --maxCpus option or decrease the number of parallel jobs (currently %d) by adjusting max_parallel_subtrees in the config file" % (maxParallel * 3, maxParallel, configWrapper.getMaxParallelSubtrees()))
                    
        # take union of command line options and config options for hal and reference
        if self.options.buildReference == False:
            refNode = findRequiredNode(configXml, "reference")
            self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
        halNode = findRequiredNode(configXml, "hal")
        if self.options.buildHal == False:
            self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
        if self.options.buildFasta == False:
            self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

        # get parameters that cactus_workflow stuff wants
        workFlowArgs = CactusWorkflowArguments(self.options)
        # copy over the options so we don't trail them around
        workFlowArgs.buildReference = self.options.buildReference
        workFlowArgs.buildHal = self.options.buildHal
        workFlowArgs.buildFasta = self.options.buildFasta
        workFlowArgs.overwrite = self.options.overwrite
        workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
        
        experiment = ExperimentWrapper(workFlowArgs.experimentNode)

        donePath = os.path.join(os.path.dirname(workFlowArgs.experimentFile), "DONE")
        doneDone = os.path.isfile(donePath)
        refDone = not workFlowArgs.buildReference or os.path.isfile(experiment.getReferencePath())
        halDone = not workFlowArgs.buildHal or (os.path.isfile(experiment.getHALFastaPath()) and
                                                os.path.isfile(experiment.getHALPath()))
                                                               
        if not workFlowArgs.overwrite and doneDone and refDone and halDone:
            self.logToMaster("Skipping %s because it is already done and overwrite is disabled" %
                             self.event)
        else:
            system("rm -f %s" % donePath)
            # delete database 
            # and overwrite specified (or if reference not present)
            dbPath = os.path.join(experiment.getDbDir(), 
                                  experiment.getDbName())
            seqPath = os.path.join(experiment.getDbDir(), "sequences")
            system("rm -f %s* %s %s" % (dbPath, seqPath, 
                                        experiment.getReferencePath()))

            if workFlowArgs.configWrapper.getDoTrimStrategy() and workFlowArgs.outgroupEventNames is not None:
                # Use the trimming strategy to blast ingroups vs outgroups.
                self.addChildTarget(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            else:
                self.addChildTarget(CactusSetupPhase(cactusWorkflowArguments=workFlowArgs,
                                                     phaseName="setup"))
        logger.info("Going to create alignments and define the cactus tree")

        self.setFollowOnTarget(FinishUp(workFlowArgs, self.project))
コード例 #5
0
    def progressiveFunction(self,
                            experimentFile,
                            toilDir,
                            batchSystem,
                            buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        tempDir = getTempDirectory(os.getcwd())
        tempExperimentDir = os.path.join(tempDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    tempExperimentDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % tempExperimentDir)

        runCactusProgressive(os.path.join(tempExperimentDir,
                                          "exp_project.xml"),
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Check that the headers and sequences in the output are the
        # same as the sequences in the input (minus differences in
        # repeat-masking)
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # Maps genome name -> headers in fasta
        headers = {}
        for genomeName, inputSequencePath in seqMap.items():
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = list(
                map(itemgetter(0), fastaRead(inputSequencePath)))

        # check headers inside .c2h output
        for expPath in glob.glob('%s/*/*_experiment.xml' %
                                 (tempExperimentDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as f:
                for line in f:
                    fields = line.split('\t')
                    if fields[0] == 's':
                        # Sequence line
                        genome = fields[1][1:-1]
                        header = fields[2][1:-1]
                        if genome in headers and genome not in outgroups:
                            # This genome is an input genome
                            self.assertTrue(
                                header in headers[genome],
                                'Header %s from output c2h %s not found in input fa %s'
                                ' for genome %s' %
                                (header, c2hPath, seqMap[genome], genome))

        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % tempDir)
コード例 #6
0
ファイル: cactus2hal.py プロジェクト: glennhickey/cactus2hal
def main():
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    if not args['append']:
        # Overwrite existing hal
        print 'rm -f {0}'.format(args['HAL_file_path'])
        system('rm -f {0}'.format(args['HAL_file_path']))

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            print experimentFilePath
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(experiment.getHALPath(), experiment.getHALFastaPath(), expTreeString, args['HAL_file_path'])
            
            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])
            if args["inMemory"] is True:
                cmdline += " --inMemory"

            
            print cmdline
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime
#            print "time of above command: {0:.2f}".format(appendTime)
 
    totalTime = time.time() - totalTime
    print "total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(totalTime, totalAppendTime)