Exemple #1
0
 def run(self):
     """Build the cactus alignment for the assembly and queue MakeStats1.

     When ``<outputDir>/cactusAlignment`` does not exist yet, this copies
     ``self.assemblyFile`` into the local temp dir (gunzipping it if its
     name ends in ``.gz``), writes an experiment file for the haplotype
     sequences plus that copy, runs the cactus workflow, writes job tree
     stats, and moves the whole content of the local temp dir into
     ``self.outputDir``. In every case a ``MakeStats1`` child target is
     added afterwards.
     """
     cactusAlignmentName = "cactusAlignment"
     cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
     if not os.path.exists(cactusAlignment):
         # Everything below is staged in the local temp dir and moved to the
         # output dir in one go at the end, so look the directory up once.
         localTempDir = self.getLocalTempDir()

         # Prepare the assembly: work on a private, uncompressed copy.
         if self.assemblyFile.endswith('.gz'):
             tempAssemblyFile = getTempFile(rootDir=localTempDir,
                                            suffix=".gz")
             system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
             system("gunzip %s" % tempAssemblyFile)
             # The uncompressed file is expected under the name without ".gz".
             tempAssemblyFile = tempAssemblyFile[:-3]
             assert os.path.exists(tempAssemblyFile)
         else:
             tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix="")
             system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))

         # Make the supporting temporary files
         tempExperimentFile = getTempFile(rootDir=localTempDir)
         tempJobTreeDir = os.path.join(localTempDir, "jobTree")

         # Make the experiment file
         cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
             sequences=self.haplotypeSequences + [tempAssemblyFile],
             newickTreeString=self.newickTree,
             outputDir=localTempDir,
             configFile=self.configFile)
         cactusWorkflowExperiment.setDbName(cactusAlignmentName)
         # This needs to be set to ensure the database gets put in the right
         # directory.
         cactusWorkflowExperiment.setDbDir(
             os.path.join(localTempDir,
                          cactusWorkflowExperiment.getDbName()))
         cactusWorkflowExperiment.writeXML(tempExperimentFile)

         # Now run cactus workflow
         runCactusWorkflow(experimentFile=tempExperimentFile,
                           jobTreeDir=tempJobTreeDir,
                           buildAvgs=False,
                           buildReference=True,
                           batchSystem="single_machine",
                           maxThreads=1,
                           jobTreeStats=True)
         logger.info("Ran the workflow")

         # Check that the job tree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
         logger.info("Checked the job tree dir")

         # Compute the stats
         tempJobTreeStatsFile = os.path.join(localTempDir, "jobTreeStats.xml")
         system("jobTreeStats --jobTree %s --outputFile %s" %
                (tempJobTreeDir, tempJobTreeStatsFile))

         # Move every staged file (experiment file, stats, alignment
         # database, ...) from the local temp dir into the output dir.
         system("mv %s/* %s" % (localTempDir, self.outputDir))
         assert os.path.exists(cactusAlignment)
         # We're done!
     self.addChildTarget(
         MakeStats1(self.outputDir, cactusAlignment, self.options))
 def run(self):
     """Align the assembly against the haplotypes, then add a MakeStats1 child.

     The alignment step is skipped when ``<outputDir>/cactusAlignment``
     already exists. Otherwise the assembly is copied into the local temp
     dir (and gunzipped if its name ends in ``.gz``), an experiment file is
     written, the cactus workflow and ``jobTreeStats`` are run, and the
     content of the local temp dir is moved into ``self.outputDir``.
     """
     cactusAlignmentName = "cactusAlignment"
     cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
     if not os.path.exists(cactusAlignment):
         # All intermediate files live in the local temp dir until the final
         # "mv" below; bind it once instead of re-querying it on each use.
         localTempDir = self.getLocalTempDir()

         # Prepare the assembly
         # First copy it (and uncompress the copy if needed).
         if self.assemblyFile.endswith('.gz'):
             tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix=".gz")
             system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
             system("gunzip %s" % tempAssemblyFile)
             # Drop the ".gz" suffix to get the name of the unpacked file.
             tempAssemblyFile = tempAssemblyFile[:-3]
             assert os.path.exists(tempAssemblyFile)
         else:
             tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix="")
             system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))

         # Make the supporting temporary files
         tempExperimentFile = getTempFile(rootDir=localTempDir)
         tempJobTreeDir = os.path.join(localTempDir, "jobTree")

         # Make the experiment file
         cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
             sequences=self.haplotypeSequences + [tempAssemblyFile],
             newickTreeString=self.newickTree,
             outputDir=localTempDir,
             configFile=self.configFile)
         cactusWorkflowExperiment.setDbName(cactusAlignmentName)
         # This needs to be set to ensure the thing gets put in the right directory
         dbDir = os.path.join(localTempDir, cactusWorkflowExperiment.getDbName())
         cactusWorkflowExperiment.setDbDir(dbDir)
         cactusWorkflowExperiment.writeXML(tempExperimentFile)

         # Now run cactus workflow
         runCactusWorkflow(experimentFile=tempExperimentFile,
                           jobTreeDir=tempJobTreeDir,
                           buildAvgs=False, buildReference=True,
                           batchSystem="single_machine", maxThreads=1,
                           jobTreeStats=True)
         logger.info("Ran the workflow")

         # Check that the job tree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
         logger.info("Checked the job tree dir")

         # Compute the stats
         tempJobTreeStatsFile = os.path.join(localTempDir, "jobTreeStats.xml")
         system("jobTreeStats --jobTree %s --outputFile %s" % (tempJobTreeDir, tempJobTreeStatsFile))

         # Now move all staged results from the local temp dir to the output
         system("mv %s/* %s" % (localTempDir, self.outputDir))
         assert os.path.exists(cactusAlignment)
         # We're done!
     self.addChildTarget(MakeStats1(self.outputDir, cactusAlignment, self.options))
    def runVanilla(self):
        """Run the vanilla cactus workflow and collect its results in self.outputDir.

        Creates ``self.outputDir`` if it is missing. If
        ``<outputDir>/cactusAlignmentVanilla`` already exists nothing else is
        done. Otherwise a fresh ``tempVanillaCactusAlignment`` scratch
        directory is created inside the output dir, the workflow config (with
        ``self.params`` applied) and the experiment file are written there,
        the workflow is run, and the MAF, tree stats, job tree stats,
        alignment database and experiment file are placed in the output dir.
        """
        logger.debug("Going to put the alignment in %s" % self.outputDir)
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)

        # Nothing to do when a previous run already produced the alignment.
        if os.path.exists(os.path.join(self.outputDir, "cactusAlignmentVanilla")):
            return

        xmlTree = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml"))

        # Start from an empty scratch directory.
        tempLocalDir = os.path.join(self.outputDir, "tempVanillaCactusAlignment")
        system("rm -rf %s" % tempLocalDir)
        os.mkdir(tempLocalDir)

        # Set the config parameters
        self.params.applyToXml(xmlTree)
        config = xmlTree.getroot()
        assert config is not None

        # Write the config file; the "with" block closes the handle even if
        # serialisation fails.
        tempConfigFile = os.path.join(tempLocalDir, "config.xml")
        with open(tempConfigFile, 'w') as fileHandle:
            ET.ElementTree(config).write(fileHandle)

        # Now do standard cactus..
        # Make the experiment file
        tempExperimentFile2 = os.path.join(tempLocalDir, "experiment.xml")
        cactusWorkflowExperiment = CactusWorkflowExperiment(
            sequences=self.sequences,
            newickTreeString=self.newickTree,
            databaseName="cactusAlignmentVanilla",
            outputDir=tempLocalDir,
            configFile=tempConfigFile)
        tempExperimentDir2 = os.path.join(tempLocalDir, "cactusAlignmentVanilla")
        cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile2)

        # apply naming to the event tree to be consistent with progressive
        exp = ExperimentWrapper(ET.parse(tempExperimentFile2).getroot())
        cleanEventTree(exp)
        exp.writeXML(tempExperimentFile2)

        # Run the vanilla cactus workflow (kept for comparison with the
        # progressive run).
        tempJobTreeDir2 = os.path.join(tempLocalDir, "jobTreeVanilla")
        runCactusWorkflow(tempExperimentFile2, tempJobTreeDir2,
                          jobTreeStats=True,
                          setupAndBuildAlignments=True,
                          buildReference=True,
                          maxThreads=4)

        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir2)
        logger.info("Checked the job tree dir for the vanilla run")

        runCactusMAFGenerator(os.path.join(self.outputDir, "cactusVanilla.maf"),
                              getCactusDiskString(tempExperimentDir2))

        # Run the cactus tree stats
        treeStatsFile = os.path.join(self.outputDir, "treeStats.xml")
        system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s" %
               (exp.getDiskDatabaseString(), treeStatsFile))

        # Job tree stats, then move the database and experiment file out of
        # the scratch directory.
        system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir2, self.outputDir))
        system("mv %s %s" % (tempExperimentDir2, self.outputDir))
        system("mv %s %s/experiment.xml" % (tempExperimentFile2, self.outputDir))
Exemple #4
0
    def run(self):
        """Configure and run a cactus alignment, then queue MakeStats.

        Creates ``self.outputDir`` if needed. Unless
        ``<outputDir>/cactusAlignment`` already exists, the stock
        ``cactus_workflow_config.xml`` is loaded and adjusted from this
        target's attributes (reference algorithm, block degrees, blast
        parameters, base level, chain limits, permutations, simulated
        annealing, theta), written to the local temp dir together with an
        experiment file, and the cactus workflow is run. The experiment file,
        config file and alignment database are then moved to the output dir
        and job tree stats are written there. A ``MakeStats`` child target is
        added in every case.
        """
        if not os.path.isdir(self.outputDir):
            os.mkdir(self.outputDir)
        cactusAlignmentName = "cactusAlignment"
        outputFile = os.path.join(self.outputDir, cactusAlignmentName)
        if not os.path.exists(outputFile):
            config = ET.parse(
                os.path.join(getRootPathString(), "lib",
                             "cactus_workflow_config.xml")).getroot()

            # Set the reference algorithm
            reference = config.find("reference")
            reference.attrib["matching_algorithm"] = self.referenceAlgorithm

            # Do the minimum block degree configuration. The first
            # <iteration> is treated as the blast iteration and the second
            # as the base iteration.
            iterations = config.find("alignment").find("iterations")
            iterationElements = iterations.findall("iteration")
            blastIteration = iterationElements[0]
            baseIteration = iterationElements[1]

            # The blast iteration never gets a block degree below 2.
            minimumBlastBlockDegree = self.minimumBlockDegree
            if minimumBlastBlockDegree <= 1:
                minimumBlastBlockDegree = 2
            blastIteration.find("core").attrib["minimumBlockDegree"] = str(
                minimumBlastBlockDegree)
            baseIteration.attrib["minimumBlockDegree"] = str(
                self.minimumBlockDegree)
            baseIteration.attrib["prune_out_stub_alignments"] = str(
                int(self.pruneOutStubAlignments))
            baseIteration.attrib["gap_gamma"] = str(float(self.gapGamma))

            # Set the blast string: substitute the PARAMETERS placeholder in
            # both blast command templates.
            blast = blastIteration.find("blast")
            for attribName in ("blastString", "selfBlastString"):
                blast.attrib[attribName] = blast.attrib[attribName].replace(
                    "PARAMETERS", self.blastAlignmentString)

            # Get rid of the base level, if needed
            if not self.baseLevel:
                iterations.remove(baseIteration)

            # Set the number of chains to allow in a level, during promotion
            config.find("normal").attrib["max_number_of_chains"] = str(
                self.maxNumberOfChains)

            # Set the number of chains to order per round of the matching algorithm
            reference.attrib["permutations"] = str(self.permutations)

            # Set the chain weight function
            if self.useSimulatedAnnealing:
                reference.attrib["useSimulatedAnnealing"] = "1"

            reference.attrib["theta"] = str(self.theta)

            # Write the config file; the "with" block closes the handle even
            # if serialisation fails.
            localTempDir = self.getLocalTempDir()
            tempConfigFile = os.path.join(localTempDir, "config.xml")
            with open(tempConfigFile, 'w') as fileHandle:
                ET.ElementTree(config).write(fileHandle)

            # Make the supporting temporary files
            tempExperimentFile = os.path.join(localTempDir, "experiment.xml")
            tempJobTreeDir = os.path.join(localTempDir, "jobTree")

            # Make the experiment file
            cactusWorkflowExperiment = CactusWorkflowExperiment(
                sequences=self.sequences.split(),
                newickTreeString=self.options.newickTree,
                requiredSpecies=[(1, self.requiredSpecies.split())],
                singleCopySpecies=self.singleCopySpecies,
                outgroupEvent=self.options.outgroupEvent,
                databaseName=cactusAlignmentName,
                outputDir=localTempDir,
                configFile=tempConfigFile)
            cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

            # Now run cactus workflow
            runCactusWorkflow(experimentFile=tempExperimentFile,
                              jobTreeDir=tempJobTreeDir,
                              setupAndBuildAlignments=True,
                              buildTrees=False,
                              buildFaces=False,
                              buildReference=True,
                              batchSystem="single_machine",
                              maxThreads=1,
                              jobTreeStats=True)
            logger.info("Ran the workflow")

            # Check that the job tree completed successfully.
            runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
            logger.info("Checked the job tree dir")

            # Move the experiment and config files to the output dir
            system("mv %s %s/experiment.xml" %
                   (tempExperimentFile, self.outputDir))
            system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

            # Move the final db
            localCactusDisk = os.path.join(localTempDir, cactusAlignmentName)
            system("mv %s %s" % (localCactusDisk, outputFile))

            # Compute the stats
            system(
                "jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" %
                (tempJobTreeDir, self.outputDir))
            # We're done!
        self.addChildTarget(MakeStats(outputFile, self.outputDir,
                                      self.options))
 def run(self):
     """Build the cactus alignment with a customised config and add MakeStats.

     ``self.outputDir`` is created when missing. If
     ``<outputDir>/cactusAlignment`` is absent, the default
     ``cactus_workflow_config.xml`` is loaded, its reference, iteration,
     blast and chain settings are overwritten from this target's
     attributes, and the config plus an experiment file are written to the
     local temp dir. The cactus workflow is then run, its outputs are moved
     to the output dir, and job tree stats are generated. A ``MakeStats``
     child target is always added at the end.
     """
     if not os.path.isdir(self.outputDir):
         os.mkdir(self.outputDir)
     cactusAlignmentName = "cactusAlignment"
     outputFile = os.path.join(self.outputDir, cactusAlignmentName)
     if not os.path.exists(outputFile):
         config = ET.parse(os.path.join(getRootPathString(), "lib", "cactus_workflow_config.xml")).getroot()

         # Set the reference algorithm
         referenceElement = config.find("reference")
         referenceElement.attrib["matching_algorithm"] = self.referenceAlgorithm

         # Do the minimum block degree configuration; the first two
         # <iteration> elements are used as blast and base iteration.
         iterations = config.find("alignment").find("iterations")
         allIterations = iterations.findall("iteration")
         blastIteration = allIterations[0]
         baseIteration = allIterations[1]

         # Values of 1 or less are raised to 2 for the blast iteration only.
         minimumBlastBlockDegree = self.minimumBlockDegree
         if minimumBlastBlockDegree <= 1:
             minimumBlastBlockDegree = 2
         blastIteration.find("core").attrib["minimumBlockDegree"] = str(minimumBlastBlockDegree)
         baseIteration.attrib["minimumBlockDegree"] = str(self.minimumBlockDegree)
         baseIteration.attrib["prune_out_stub_alignments"] = str(int(self.pruneOutStubAlignments))
         baseIteration.attrib["gap_gamma"] = str(float(self.gapGamma))

         # Set the blast string (fill in the PARAMETERS placeholder)
         blastElement = blastIteration.find("blast")
         for attribName in ("blastString", "selfBlastString"):
             template = blastElement.attrib[attribName]
             blastElement.attrib[attribName] = template.replace("PARAMETERS", self.blastAlignmentString)

         # Get rid of the base level, if needed
         if not self.baseLevel:
             iterations.remove(baseIteration)

         # Set the number of chains to allow in a level, during promotion
         config.find("normal").attrib["max_number_of_chains"] = str(self.maxNumberOfChains)

         # Set the number of chains to order per round of the matching algorithm
         referenceElement.attrib["permutations"] = str(self.permutations)

         # Set the chain weight function
         if self.useSimulatedAnnealing:
             referenceElement.attrib["useSimulatedAnnealing"] = "1"

         referenceElement.attrib["theta"] = str(self.theta)

         # Write the config file. Using "with" guarantees the handle is
         # closed even when writing the tree raises.
         localTempDir = self.getLocalTempDir()
         tempConfigFile = os.path.join(localTempDir, "config.xml")
         with open(tempConfigFile, 'w') as fileHandle:
             ET.ElementTree(config).write(fileHandle)

         # Make the supporting temporary files
         tempExperimentFile = os.path.join(localTempDir, "experiment.xml")
         tempJobTreeDir = os.path.join(localTempDir, "jobTree")

         # Make the experiment file
         cactusWorkflowExperiment = CactusWorkflowExperiment(
             sequences=self.sequences.split(),
             newickTreeString=self.options.newickTree,
             requiredSpecies=[(1, self.requiredSpecies.split())],
             singleCopySpecies=self.singleCopySpecies,
             outgroupEvent=self.options.outgroupEvent,
             databaseName=cactusAlignmentName,
             outputDir=localTempDir,
             configFile=tempConfigFile)
         cactusWorkflowExperiment.writeExperimentFile(tempExperimentFile)

         # Now run cactus workflow
         runCactusWorkflow(experimentFile=tempExperimentFile, jobTreeDir=tempJobTreeDir,
                           setupAndBuildAlignments=True,
                           buildTrees=False, buildFaces=False, buildReference=True,
                           batchSystem="single_machine", maxThreads=1, jobTreeStats=True)
         logger.info("Ran the workflow")

         # Check that the job tree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
         logger.info("Checked the job tree dir")

         # Move the experiment and config files to the output dir
         system("mv %s %s/experiment.xml" % (tempExperimentFile, self.outputDir))
         system("mv %s %s/config.xml" % (tempConfigFile, self.outputDir))

         # Move the final db
         localCactusDisk = os.path.join(localTempDir, cactusAlignmentName)
         system("mv %s %s" % (localCactusDisk, outputFile))

         # Compute the stats
         system("jobTreeStats --jobTree %s --outputFile %s/jobTreeStats.xml" % (tempJobTreeDir, self.outputDir))
         # We're done!
     self.addChildTarget(MakeStats(outputFile, self.outputDir, self.options))
Exemple #6
0
 def testCactusWorkflow_Blanchette(self):
     """Runs the workflow on blanchette's simulated (colinear) regions.

     Does nothing unless the SON_TRACE_DATASETS environment variable is set.
     For each of ``self.testNo`` rounds a random window of up to 5000
     columns is cut from the true alignment, the ungapped sequences of the
     first 9 rows are written to fasta files, and the cactus workflow is run
     on them. If the external comparison tools are installed, the resulting
     MAF is compared against the true alignment with mafComparator.
     """
     if "SON_TRACE_DATASETS" not in os.environ:
         return

     # These inputs are identical for every round, so load them only once.
     trueAlignment = os.path.join(TestStatus.getPathToDataSets(), "blanchettesSimulation", "00.job", "true.mfa")
     columnAlignment = list(fastaAlignmentRead(trueAlignment))
     fastaHeaders = list(fastaReadHeaders(trueAlignment))
     sequenceNumber = 9

     #The tree
     newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"

     alignmentLength = 5000
     # Number of window start positions to choose from. It is <= 0 when the
     # alignment is not longer than the window; the whole alignment is used
     # then instead of failing on an empty range.
     startChoices = len(columnAlignment) - alignmentLength

     comparisonTools = ("mfaToMaf", "cactus_MAFGenerator", "mafComparator", "cactus_treeStats")

     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())

         #Get random dir
         testDir = getTempDirectory(tempDir)

         #random alignment
         randomStart = random.randrange(startChoices) if startChoices > 0 else 0
         subAlignment = columnAlignment[randomStart:randomStart+alignmentLength]
         logger.info("Got a sub alignment, it is %i columns long" % len(subAlignment))

         #Get sequences (gap characters are dropped)
         sequences = [ (fastaHeaders[seqNo],
                        "".join([ column[seqNo] for column in subAlignment if column[seqNo] != '-' ]))
                       for seqNo in xrange(sequenceNumber) ]
         logger.info("Got the sequences")

         #Write sequences into temp files
         tempFastaFiles = []
         for seqNo, (header, sequence) in enumerate(sequences):
             logger.info("Making temp file for header: %s, seq: %s" % (header, sequence))
             tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
             tempFastaFiles.append(tempFastaFile)
             # "with" closes the file even if fastaWrite raises.
             with open(tempFastaFile, "w") as fileHandle:
                 fastaWrite(fileHandle, header, sequence)
         logger.info("Got the temp sequence files")

         experiment = getCactusWorkflowExperimentForTest(tempFastaFiles, newickTreeString, testDir)
         experimentFile = os.path.join(testDir, "experiment.xml")
         experiment.writeXML(experimentFile)
         cactusDiskDatabaseString = experiment.getDiskDatabaseString()

         jobTree = os.path.join(testDir, "jobTree")

         runCactusWorkflow(experimentFile, jobTree)
         logger.info("Ran the the workflow")

         #Check the output alignment
         runJobTreeStatusAndFailIfNotComplete(jobTree)
         logger.info("Checked the job tree dir")

         #Output the 'TRUE' alignment file, but only when every external
         #tool needed for the comparison can be invoked.
         if all(os.system("%s --help > /dev/null 2>&1" % tool) == 0 for tool in comparisonTools):
             trueMFAFile = os.path.join(testDir, "true.mfa")
             fastaAlignmentWrite(subAlignment, fastaHeaders, len(fastaHeaders), trueMFAFile)
             trueMAFFile = os.path.join(testDir, "true.maf")
             system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s" % (trueMFAFile, trueMAFFile, getLogLevelString()))
             system("cat %s" % trueMAFFile)

             #Now get mafs for the region.
             mAFFile = os.path.join(testDir, "flower.maf")
             system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, mAFFile, getLogLevelString()))
             logger.info("Got the MAFs from the flower disk")
             system("cat %s" % mAFFile)

             statsFile = os.path.join(testDir, "stats.xml")
             system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, statsFile, getLogLevelString()))
             system("cat %s" % statsFile)
             logger.info("Got the cactus tree stats")

             #Now compare the mafs to the output.
             resultsFile = os.path.join(testDir, "results.xml")
             system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s" % (trueMAFFile, mAFFile, resultsFile, getLogLevelString()))
             logger.info("Ran the maf comparator")

             system("cat %s" % resultsFile)

             #Cleanup
             # NOTE(review): the database is only cleaned up on this branch;
             # confirm whether cleanupDb() should also run when the tools
             # are missing.
             experiment.cleanupDb()
             system("rm -rf %s" % testDir)
             logger.info("Successfully ran test for the problem")

         system("rm -rf %s" % tempDir)