def __growthStats(self, project):
        if project is None:
            return ["", ""]
        results = []
        tree = project.mcTree
        rootName = tree.getRootName()
        rootExpPath = project.expMap[rootName]
        rootExp = ExperimentWrapper(ET.parse(rootExpPath).getroot())
        rootPath = rootExp.getReferencePath()
        rootSize = float(os.path.getsize(rootPath))
        leafNames = [tree.getName(i) for i in tree.getLeaves()]
        leafSizes = []
        for leaf in leafNames:
            leafPath = project.sequencePath(leaf)
            leafSize = float(os.path.getsize(leafPath))
            leafSizes.append(leafSize)
        results.append(rootSize / max(leafSizes))

        ratioSum = 0.0
        ratioCount = 0
        for expName, expPath in project.expMap.items():
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            rootPath = exp.getReferencePath()
            rootSize = float(os.path.getsize(rootPath))
            for leafName, leafPath in exp.seqMap.items():
                leafSize = float(os.path.getsize(leafPath))
                ratio = rootSize / leafSize
                ratioSum += ratio
                ratioCount += 1
        avgRatio = ratioSum / ratioCount
        results.append(avgRatio)
        return results
    def __pollKtServers(self):
        self.curKtservers = set()
        try:
            mc = MultiCactusProject()
            mc.readXML(self.projectPath)
            for eventName,expPath in mc.expMap.items():
                exp = ExperimentWrapper(ET.parse(expPath).getroot())
                try:
                    if pingKtServer(exp):
                        self.curKtservers.add("%s_%s:%s" % (
                            eventName, exp.getDbHost(), str(exp.getDbPort())))
                except:
                    pass
                try:
                    secElem = exp.getSecondaryDBElem()
                    if secElem is not None and pingKtServer(secElem):
                        self.curKtservers.add("%s_secondary_%s:%s" % (
                            eventName, secElem.getDbHost(),
                            str(secElem.getDbPort())))
                except:
                    pass

        except:
            self.curKtservers = set()
        if len(self.prevKtservers) > 0 and len(self.curKtservers) > 0 and\
               self.curKtservers == self.prevKtservers:
            self.sameKtserversTime += self.pollTime
        else:
            self.prevKtservers = set(self.curKtservers)
            self.sameKtserversTime = 0
    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders this options useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)
    def runProgressive(self):
        logger.debug("Going to put the alignment in %s" % self.outputDir)
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)

        if not os.path.exists(os.path.join(self.outputDir, "progressiveCactusAlignment")):
            xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))
            
            #Set the parameters
            tempLocalDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
            system("rm -rf %s" % tempLocalDir)
            os.mkdir(tempLocalDir)
            
            #Set the config parameters
            self.params.applyToXml(xmlTree)
            config = xmlTree.getroot()
            assert config is not None
            
            #Write the config file
            tempConfigFile = os.path.join(tempLocalDir, "config.xml")
            fileHandle = open(tempConfigFile, 'w')
            assert fileHandle is not None
            tree = ET.ElementTree(config)
            tree.write(fileHandle)
            fileHandle.close()
         
            #Make the experiment file
            tempExperimentFile = os.path.join(tempLocalDir, "experiment.xml")
            
            if self.params.kyotoTycoon == True:
                dbConfElem = ET.Element("st_kv_database_conf", type="kyoto_tycoon")
                ktElem = ET.SubElement(dbConfElem, "kyoto_tycoon", host="localhost", port="1978", database_dir="dummy")
            else:
                dbConfElem = None
            
            cactusWorkflowExperiment = CactusWorkflowExperiment(
                                                 sequences=self.sequences, 
                                                 newickTreeString=self.newickTree, 
                                                 #requiredSpecies=self.requiredSpecies,
                                                 #singleCopySpecies=self.singleCopySpecies,
                                                 databaseName="cactusAlignment",
                                                 outputDir=tempLocalDir,
                                                 configFile=tempConfigFile,
                                                 databaseConf = dbConfElem)
            cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)
            
            #The jobtree
            tempJobTreeDir = os.path.join(tempLocalDir, "jobTree")
            
            #The place to put the temporary experiment dir
            tempExperimentDir = os.path.join(tempLocalDir, "progressiveCactusAlignment")
            
      
            #The temporary experiment 
            runCactusCreateMultiCactusProject(tempExperimentFile, 
                                              tempExperimentDir)
            logger.info("Setup the cactus progressive experiment")
            
            runCactusProgressive(os.path.join(tempExperimentDir, "progressiveCactusAlignment_project.xml"), 
                                 tempJobTreeDir, 
                                 #batchSystem=batchSystem, 
                                 buildMaf=True,
                                 joinMaf=True,
                                 #buildTrees=buildTrees, buildFaces=buildFaces, buildReference=buildReference,
                                 jobTreeStats=True,
                                 maxThreads=4,
                                 logLevel="DEBUG")
            logger.info("Ran the progressive workflow")
            
            #Check if the jobtree completed sucessively.
            runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
            logger.info("Checked the job tree dir for the progressive run")
            
            #Run the cactus tree stats
            expPath = os.path.join(tempExperimentDir, "Anc0", "Anc0_experiment.xml")
            exp = ExperimentWrapper(ET.parse(expPath).getroot())
            if exp.getDbType() == "kyoto_tycoon":
                ktserver = KtserverLauncher()
                ktserver.spawnServer(exp) 
            treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
            system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" %(exp.getDiskDatabaseString(),
                                                                                        treeStatsFile))
            if exp.getDbType() == "kyoto_tycoon":
                ktserver.killServer(exp)
                
            #Now copy the true assembly back to the output
            system("mv %s %s/experiment.xml" % (tempExperimentFile, self.outputDir))
            system("mv %s %s" % (tempExperimentDir, self.outputDir))
            system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir, self.outputDir))
            system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))
            
            #But keep a link to the multicactus project in its original path so we can navigate
            # the paths in the xml...
            actualResultsDir = os.path.join(os.path.abspath(self.outputDir), "progressiveCactusAlignment")
            tempResultsDir = os.path.join(self.outputDir, "tempProgressiveCactusAlignment")
            system("ln -s %s %s" % (actualResultsDir, tempResultsDir))
    def runVanilla(self):
        logger.debug("Going to put the alignment in %s" % self.outputDir)
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)

        if not os.path.exists(os.path.join(self.outputDir, "cactusAlignmentVanilla")):
            xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))
            
            #Set the parameters
            tempLocalDir = os.path.join(self.outputDir, "tempVanillaCactusAlignment")
            system("rm -rf %s" % tempLocalDir)
            os.mkdir(tempLocalDir)
            
            #Set the config parameters
            self.params.applyToXml(xmlTree)
            config = xmlTree.getroot()
            assert config is not None
        
            #Write the config file
            tempConfigFile = os.path.join(tempLocalDir, "config.xml")
            fileHandle = open(tempConfigFile, 'w')
            assert fileHandle is not None
            tree = ET.ElementTree(config)
            tree.write(fileHandle)
            fileHandle.close()
         
            #Make the experiment file
            tempExperimentFile = os.path.join(tempLocalDir, "experiment.xml")
            #Now do standard cactus..
            #Make the experiment file
            tempExperimentFile2 = os.path.join(tempLocalDir, "experiment.xml")

            cactusWorkflowExperiment = CactusWorkflowExperiment(
                                                 sequences=self.sequences, 
                                                 newickTreeString=self.newickTree, 
                                                 #requiredSpecies=self.requiredSpecies,
                                                 #singleCopySpecies=self.singleCopySpecies,
                                                 databaseName="cactusAlignmentVanilla",
                                                 outputDir=tempLocalDir,
                                                 configFile=tempConfigFile)
            tempExperimentDir2 = os.path.join(tempLocalDir, "cactusAlignmentVanilla")
            cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile2)
           
            # apply naming to the event tree to be consistent with progressive
            exp = ExperimentWrapper(ET.parse(tempExperimentFile2).getroot())
            cleanEventTree(exp)
            exp.writeXML(tempExperimentFile2)
            
            #We're done with the progressive, now run the vanilla cactus for comparison
            tempJobTreeDir2 = os.path.join(tempLocalDir, "jobTreeVanilla")
            runCactusWorkflow(tempExperimentFile2, tempJobTreeDir2,
                              jobTreeStats=True,
                              setupAndBuildAlignments=True,
                              buildReference=True,
                              maxThreads=4)
            
            runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir2)
            logger.info("Checked the job tree dir for the vanilla run")
            
            runCactusMAFGenerator(os.path.join(self.outputDir, "cactusVanilla.maf"), getCactusDiskString(tempExperimentDir2))
            
            #Run the cactus tree stats
            treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
            system("cactus_treeStats --cactusDisk \'%s\' --flowerName 0 --outputFile %s" %(exp.getDiskDatabaseString(),
                                                                                        treeStatsFile))
            
            system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir2, self.outputDir))
            system("mv %s %s" % (tempExperimentDir2, self.outputDir))
            system("mv %s %s/experiment.xml" % (tempExperimentFile2, self.outputDir))
class ProjectWrapper:
    alignmentDirName = 'progressiveAlignment'
    def __init__(self, options, seqFile, workingDir):
        self.options = options
        self.seqFile = seqFile
        self.workingDir = workingDir
        self.configWrapper = None
        self.expWrapper = None
        self.processConfig()
        self.processExperiment()

    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            dir = os.path.join(cactusRootPath(), "progressive")
            configPath = os.path.join(dir,
                                      "cactus_progressive_workflow_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other ?combined? batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) / 3)) 

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface. 
        if self.options.legacy is True:
            self.configWrapper.setSubtreeSize(sys.maxint)

    def processExperiment(self):
        expXml = self.seqFile.toXMLElement()
        #create the cactus disk
        cdElem = ET.SubElement(expXml, "cactus_disk")
        database = self.options.database
        assert database == "kyoto_tycoon" or database == "tokyo_cabinet"
        confElem = ET.SubElement(cdElem, "st_kv_database_conf")
        confElem.attrib["type"] = database
        dbElem = ET.SubElement(confElem, database)
        self.expWrapper = ExperimentWrapper(expXml)

        if self.options.database == "kyoto_tycoon":
            self.expWrapper.setDbPort(str(self.options.ktPort))
            if self.options.ktType == 'memory':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(False)
            elif self.options.ktType == 'snapshot':
                self.expWrapper.setDbInMemory(True)
                self.expWrapper.setDbSnapshot(True)
            else:
                assert self.options.ktType == 'disk'
                self.expWrapper.setDbInMemory(False)
                self.expWrapper.setDbSnapshot(False)
            # sonlib doesn't allow for spaces in attributes in the db conf
            # which renders this options useless
            # if self.options.ktOpts is not None:
            #    self.expWrapper.setDbServerOptions(self.options.ktOpts)
            if self.options.ktCreateTuning is not None:
                self.expWrapper.setDbCreateTuningOptions(
                    self.options.ktCreateTuning)
            if self.options.ktOpenTuning is not None:
                self.expWrapper.setDbReadTuningOptions(
                    self.options.ktOpenTuning)

    def writeXml(self):
        assert os.path.isdir(self.workingDir)
        configPath = os.path.abspath(
            os.path.join(self.workingDir, "config.xml"))
        expPath = os.path.abspath(
            os.path.join(self.workingDir, "expTemplate.xml"))
        self.expWrapper.setConfigPath(configPath)
        self.configWrapper.writeXML(configPath)
        self.expWrapper.writeXML(expPath)

        projPath = os.path.join(self.workingDir,
                                ProjectWrapper.alignmentDirName)
        if os.path.exists(projPath) and self.options.overwrite:
            system("rm -rf %s" % projPath)
        if self.options.outputMaf is True:
            fixNames=1
        else:
            fixNames=0
        if os.path.exists(projPath):
           if not self.isSameAsExisting(expPath, projPath, fixNames):
               raise RuntimeError("Existing project %s not " % projPath+
                                  "compatible with current input.  Please "
                                  "erase the working directory or rerun "
                                  "with the --overwrite option to start "
                                  "from scratch.")
           else:
               logPath = os.path.join(self.workingDir, 'cactus.log')
               logFile = open(logPath, "a")
               logFile.write("\nContinuing existing alignment.  Use "
                             "--overwrite or erase the working directory to "
                             "force restart from scratch.\n")
               logFile.close()
        else:
            cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
                expPath, projPath, fixNames)
            if len(self.seqFile.outgroups) > 0: 
                cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
            system(cmd)

    # create a project in a dummy directory.  check if the
    # project xml is the same as the current project.
    # we do this to see if we should start fresh or try to
    # work with the existing project when the overwrite flag is off
    def isSameAsExisting(self, expPath, projPath, fixNames):
        if not os.path.exists(projPath):
            return False
        oldPath = os.path.dirname(projPath + "/")
        tempPath = "%s_temp" % oldPath
        if os.path.exists(tempPath):
            system("rm -rf %s" % tempPath)
        cmd = "cactus_createMultiCactusProject.py %s %s --fixNames=%d" % (
            expPath, tempPath, fixNames)
        if len(self.seqFile.outgroups) > 0: 
            cmd += " --outgroupNames " + ",".join(self.seqFile.outgroups)
        system(cmd)
        projFilePathNew = os.path.join(tempPath,'%s_temp_project.xml' %
                                       self.alignmentDirName)
        projFilePathOld = os.path.join(oldPath, '%s_project.xml' %
                                       self.alignmentDirName)
        
        newFile = [line for line in open(projFilePathNew, "r")]
        oldFile = [line for line in open(projFilePathOld, "r")]
        areSame = True
        if len(newFile) != len(oldFile):
            areSame = False
        for newLine, oldLine in zip(newFile, oldFile):
            if newLine.replace(tempPath, oldPath) != oldLine:
                areSame = False
        system("rm -rf %s" % tempPath)
        return areSame
Esempio n. 7
0
def main():
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    # for now we do not support appending at the script level
    print 'rm -f {0}'.format(args['HAL_file_path'])
    system('rm -f {0}'.format(args['HAL_file_path']))

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            experiment = ExperimentWrapper(
                ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree())
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(
                experiment.getHALPath(), experiment.getHALFastaPath(),
                expTreeString, args['HAL_file_path'])

            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])

            print cmdline
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime


#            print "time of above command: {0:.2f}".format(appendTime)

    totalTime = time.time() - totalTime
    print "total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(
        totalTime, totalAppendTime)
Esempio n. 8
0
def main():
    args = initParser()
    myProj = MultiCactusProject()
    myProj.readXML(args['cactus_project'])

    # for now we do not support appending at the script level
    print 'rm -f {0}'.format(args['HAL_file_path'])
    system('rm -f {0}'.format(args['HAL_file_path']))

    # some quick stats
    totalTime = time.time()
    totalAppendTime = 0

    # traverse tree to make sure we are going breadth-first
    tree = myProj.mcTree

    # find subtree if event specified
    event = args['event']
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in myProj.expMap:
            experimentFilePath = myProj.expMap[genomeName]
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            expTreeString = NXNewick().writeString(experiment.getTree())
            assert len(expTreeString) > 1
            assert experiment.getHALPath() is not None
            assert experiment.getHALFastaPath() is not None

            cmdline = "time halAppendCactusSubtree \'{0}\' \'{1}\' \'{2}\' \'{3}\'".format(experiment.getHALPath(), experiment.getHALFastaPath(), expTreeString, args['HAL_file_path'])
            
            if len(outgroups) > 0:
                cmdline += " --outgroups {0}".format(",".join(outgroups))
            if args["cacheBytes"] is not None:
                cmdline += " --cacheBytes {0}".format(args["cacheBytes"])
            if args["cacheMDC"] is not None:
                cmdline += " --cacheMDC {0}".format(args["cacheMDC"])
            if args["cacheRDC"] is not None:
                cmdline += " --cacheRDC {0}".format(args["cacheRDC"])
            if args["cacheW0"] is not None:
                cmdline += " --cacheW0 {0}".format(args["cacheW0"])
            if args["chunk"] is not None:
                cmdline += " --chunk {0}".format(args["chunk"])
            if args["deflate"] is not None:
                cmdline += " --deflate {0}".format(args["deflate"])
            
            print cmdline
            appendTime = time.time()
            system(cmdline)
            appendTime = time.time() - appendTime
            totalAppendTime += appendTime
#            print "time of above command: {0:.2f}".format(appendTime)
 
    totalTime = time.time() - totalTime
    print "total time: {0:.2f}  total halAppendCactusSubtree time: {1:.2f}".format(totalTime, totalAppendTime)