Exemple #1
0
 def run(self):
     """Build the cactus alignment for the assembly, then schedule the stats.

     If ``<outputDir>/cactusAlignment`` does not exist yet, the assembly is
     copied into the local temp directory (and gunzipped when its name ends in
     ".gz"), an experiment file is written, the cactus workflow is run on a
     single machine, job-tree stats are collected and everything in the local
     temp directory is moved into ``self.outputDir``. Whether or not the
     alignment had to be built, a ``MakeStats1`` child target is added.
     """
     cactusAlignmentName = "cactusAlignment"
     cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
     if not os.path.exists(cactusAlignment):
         # Prepare the assembly: work on a private copy so the original
         # file is never modified.
         isGzipped = self.assemblyFile.endswith('.gz')
         tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                        suffix=".gz" if isGzipped else "")
         system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
         if isGzipped:
             system("gunzip %s" % tempAssemblyFile)
             # gunzip drops the ".gz" suffix from the expanded file
             tempAssemblyFile = tempAssemblyFile[:-3]
             assert os.path.exists(tempAssemblyFile)
         # Make the supporting temporary files
         tempExperimentFile = getTempFile(rootDir=self.getLocalTempDir())
         tempJobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")
         # Make the experiment file
         cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
             sequences=self.haplotypeSequences + [tempAssemblyFile],
             newickTreeString=self.newickTree,
             outputDir=self.getLocalTempDir(),
             configFile=self.configFile)
         cactusWorkflowExperiment.setDbName(cactusAlignmentName)
         # This needs to be set to ensure the database gets put in the
         # right directory
         cactusWorkflowExperiment.setDbDir(
             os.path.join(self.getLocalTempDir(),
                          cactusWorkflowExperiment.getDbName()))
         cactusWorkflowExperiment.writeXML(tempExperimentFile)
         # Now run cactus workflow
         runCactusWorkflow(experimentFile=tempExperimentFile,
                           jobTreeDir=tempJobTreeDir,
                           buildAvgs=False,
                           buildReference=True,
                           batchSystem="single_machine",
                           maxThreads=1,
                           jobTreeStats=True)
         logger.info("Ran the workflow")
         # Check that the job tree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
         logger.info("Checked the job tree dir")
         # Compute the stats
         tempJobTreeStatsFile = os.path.join(self.getLocalTempDir(),
                                             "jobTreeStats.xml")
         system("jobTreeStats --jobTree %s --outputFile %s" %
                (tempJobTreeDir, tempJobTreeStatsFile))
         # Move everything produced in the local temp dir (alignment
         # database, experiment file, stats, ...) to the output directory.
         system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
         assert os.path.exists(cactusAlignment)
         # We're done!
     self.addChildTarget(
         MakeStats1(self.outputDir, cactusAlignment, self.options))
Exemple #2
0
    def testCactusRealign(self):
        """Runs cactus realign using the default parameters and checks that the realigned output cigars align
        the same subsequences.

        For every sequence pair, lastz alignments are generated and realigned;
        the i-th realigned cigar must report the same coordinates as the i-th
        lastz cigar.
        """
        for seqFile1, seqFile2 in seqFilePairGenerator():
            lastzOutput = getTempFile(rootDir=self.tempDir)
            runLastz(seqFile1,
                     seqFile2,
                     alignmentsFile=lastzOutput,
                     lastzArguments=self.defaultLastzArguments,
                     work_dir=self.tempDir)
            realignOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(seqFile1,
                             seqFile2,
                             inputAlignmentsFile=lastzOutput,
                             outputAlignmentsFile=realignOutput,
                             realignArguments=self.defaultRealignArguments,
                             work_dir=self.tempDir)

            # Read both files inside a ``with`` so the handles are closed even
            # when an assertion fails. Each loop variable is bound to the file
            # its name refers to.
            with open(lastzOutput, 'r') as lastzFile, \
                    open(realignOutput, 'r') as realignFile:
                for lastzLine, realignLine in zip(lastzFile, realignFile):
                    realignCigar = cigarReadFromString(realignLine)
                    lastzCigar = cigarReadFromString(lastzLine)
                    self.assertTrue(realignCigar.sameCoordinates(lastzCigar))
Exemple #3
0
    def testCactusRealignRescoreByIdentityAndProb(self):
        """Runs cactus realign with each of the rescoring options
        (--rescoreByIdentity, --rescoreByPosteriorProb and
        --rescoreByIdentityIgnoringGaps) and checks that every rescored
        cigar has a score in the range [0, 100].
        """
        for seqFile1, seqFile2 in seqFilePairGenerator():
            lastzOutput = getTempFile(rootDir=self.tempDir)
            runLastz(seqFile1,
                     seqFile2,
                     alignmentsFile=lastzOutput,
                     lastzArguments=self.defaultLastzArguments,
                     work_dir=self.tempDir)

            realignByIdentityOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(seqFile1,
                             seqFile2,
                             inputAlignmentsFile=lastzOutput,
                             outputAlignmentsFile=realignByIdentityOutput,
                             realignArguments=self.defaultRealignArguments +
                             " --rescoreByIdentity",
                             work_dir=self.tempDir)

            realignByPosteriorProbOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(seqFile1,
                             seqFile2,
                             inputAlignmentsFile=lastzOutput,
                             outputAlignmentsFile=realignByPosteriorProbOutput,
                             realignArguments=self.defaultRealignArguments +
                             " --rescoreByPosteriorProb",
                             work_dir=self.tempDir)

            realignByIdentityIgnoringGapsOutput = getTempFile(
                rootDir=self.tempDir)
            runCactusRealign(
                seqFile1,
                seqFile2,
                inputAlignmentsFile=lastzOutput,
                outputAlignmentsFile=realignByIdentityIgnoringGapsOutput,
                realignArguments=self.defaultRealignArguments +
                " --rescoreByIdentityIgnoringGaps",
                work_dir=self.tempDir)

            # Load each output through a context manager so no file handle
            # is left open.
            lineLists = []
            for path in (realignByIdentityOutput,
                         realignByPosteriorProbOutput,
                         realignByIdentityIgnoringGapsOutput,
                         lastzOutput):
                with open(path, 'r') as fileHandle:
                    lineLists.append(
                        [line for line in fileHandle if line != ''])

            # The lastz lines stay in the zip so that the number of checked
            # records is still bounded by the number of input alignments.
            for byIdentityLine, byPosteriorProbLine, byIdentityIgnoringGapsLine, _lastzLine in \
                    zip(*lineLists):
                rescoredCigars = [
                    cigarReadFromString(line)
                    for line in (byIdentityLine, byPosteriorProbLine,
                                 byIdentityIgnoringGapsLine)
                ]
                # Check scores are as expected
                for cigar in rescoredCigars:
                    self.assertGreaterEqual(cigar.score, 0)
                    self.assertLessEqual(cigar.score, 100.0)
def down(target, inputFile, fileStart, fileEnd, N, outputFile):
    """Sort the byte range [fileStart, fileEnd) of inputFile into outputFile.

    A range longer than the threshold N is split at a mid point into two
    child ``down`` targets writing to temporary files, with an ``up``
    follow-on that combines them into outputFile. A range of at most N is
    copied to outputFile and sorted in place.
    """
    # Randomly injected failure: this is a test error, it does not mean the
    # tests have failed.
    if random.random() > 0.5:
        raise RuntimeError()

    span = fileEnd - fileStart
    target.logToMaster(
        "Am running a down target with length: %i from input file: %s" %
        (span, inputFile))
    assert span >= 0

    if span <= N:
        # Small enough: sort this bit of the file directly.
        copySubRangeOfFile(inputFile, fileStart, fileEnd, outputFile)
        sort(outputFile)
        return

    # Too large: subdivide the range around the mid point.
    pivot = getMidPoint(inputFile, fileStart, fileEnd)
    assert pivot >= fileStart
    assert pivot + 1 < fileEnd
    lowerSorted = getTempFile(rootDir=target.getGlobalTempDir())
    upperSorted = getTempFile(rootDir=target.getGlobalTempDir())
    # The split is at pivot + 1 to avoid the newline.
    boundary = pivot + 1
    target.addChildTargetFn(
        down, (inputFile, fileStart, boundary, N, lowerSorted))
    target.addChildTargetFn(
        down, (inputFile, boundary, fileEnd, N, upperSorted))
    target.setFollowOnTargetFn(up, (lowerSorted, upperSorted, outputFile))
Exemple #5
0
 def run(self):
     """Blast the ingroups against each other, then against the outgroups.

     Ensures the outgroup-fragments directory (and the ingroup-coverage
     directory, when one is configured) exists, adds a
     ``BlastSequencesAllAgainstAll`` child target for the ingroups and sets
     ``BlastFirstOutgroup`` as the follow-on that produces
     ``self.finalResultsFile``.
     """
     self.logToMaster("Blasting ingroups vs outgroups to file %s" % (self.finalResultsFile))
     requiredDirs = [self.outgroupFragmentsDir]
     if self.ingroupCoverageDir is not None:
         requiredDirs.append(self.ingroupCoverageDir)
     for requiredDir in requiredDirs:
         try:
             os.makedirs(requiredDir)
         except OSError:
             # Only "directory already exists" is expected here; any other
             # failure (permissions, a file in the way, ...) must surface.
             if not os.path.isdir(requiredDir):
                 raise

     ingroupResultsFile = getTempFile("ingroupResults",
                                      rootDir=self.getGlobalTempDir())
     self.addChildTarget(BlastSequencesAllAgainstAll(self.ingroupSequenceFiles,
                                                     ingroupResultsFile,
                                                     self.blastOptions))
     outgroupResultsFile = getTempFile("outgroupResults",
                                       rootDir=self.getGlobalTempDir())
     # NOTE(review): the ingroup sequence files are passed twice on purpose
     # here as far as this block shows; confirm against BlastFirstOutgroup's
     # signature.
     self.setFollowOnTarget(BlastFirstOutgroup(self.ingroupSequenceFiles,
                                               self.ingroupSequenceFiles,
                                               self.outgroupSequenceFiles,
                                               self.outgroupFragmentsDir,
                                               ingroupResultsFile,
                                               outgroupResultsFile,
                                               self.finalResultsFile,
                                               self.blastOptions, 1,
                                               self.ingroupCoverageDir))
Exemple #6
0
    def testFastaReadWriteC(self):
        """Tests consistency with C version of this function.

        Random records are written with fastaWrite, passed through the
        ``sonLib_fastaCTest`` program, and the records read back from its
        output must equal the ones written, in the same order and number.
        """
        tempFile = getTempFile()
        self.tempFiles.append(tempFile)
        tempFile2 = getTempFile()
        self.tempFiles.append(tempFile2)
        for test in range(0, self.testNo):
            fastaNumber = random.choice(range(10))
            expectedRecords = [getRandomSequence() for i in range(fastaNumber)]
            with open(tempFile, 'w') as fileHandle:
                for name, seq in expectedRecords:
                    fastaWrite(fileHandle, name, seq)

            command = "sonLib_fastaCTest %s %s" % (tempFile, tempFile2)

            print(command)

            system(command)

            # Reversed so that pop() hands the records back in written order.
            pending = list(reversed(expectedRecords))
            with open(tempFile2, 'r') as fileHandle, io.StringIO() as outFh:
                for record in fastaRead(fileHandle):
                    name, seq = record
                    self.assertEqual(record, pending.pop())
                    fastaWrite(outFh, name, seq)
            # Every record that was written must have come back.
            self.assertEqual(pending, [])
Exemple #7
0
 def testCPecanRealignSplitSequences(self):
     """Runs cPecanRealign, splitting indels longer than 100bp, and check
     that the coverage from the results is the same as the coverage from
     realigning with no arguments.."""
     for seqFile1, seqFile2 in seqFilePairGenerator():
         # Drop the lastz command since it's not needed. But this
         # is still convenient to use the same parameters as all
         # the other tests
         realignCommand, _ = getCommands(seqFile1, seqFile2)
         splitRealignCommand = realignCommand + " --splitIndelsLongerThanThis 100"
         realignOutput = getTempFile()
         splitRealignOutput = getTempFile()
         try:
             system(realignCommand + " > %s" % realignOutput)
             system(splitRealignCommand + " > %s" % splitRealignOutput)

             #The following will fail until we refactor.

             # Coverage must match on both sequences, seqFile1 first.
             for seqFile in (seqFile1, seqFile2):
                 splitRealignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile, splitRealignOutput))
                 realignCoverage = popenCatch("cactus_coverage %s %s" % (seqFile, realignOutput))
                 self.assertEqual(splitRealignCoverage, realignCoverage)
         finally:
             # These files are not under a test-managed temp dir, so remove
             # them even when a command or an assertion fails.
             for leftover in (realignOutput, splitRealignOutput):
                 if os.path.exists(leftover):
                     os.remove(leftover)
Exemple #8
0
 def testKeepingCoverageOnIngroups(self):
     """Tests whether the --ingroupCoverageDir option works as
     advertised.

     cactus_blast.py is run with --ingroupCoverageDir; for each ingroup the
     kept coverage bed file must be identical to the coverage computed
     independently with cactus_coverage from the outgroup fragments.
     """
     encodeRegion = "ENm001"
     ingroups = ["human", "cow"]
     outgroups = ["macaque", "rabbit", "dog"]
     regionPath = os.path.join(self.encodePath, encodeRegion)
     # Real lists rather than map(): the paths are joined for the command
     # line and then iterated again below, which a one-shot iterator
     # cannot support.
     ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in ingroups]
     outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in outgroups]
     # Run blast in "ingroup vs outgroups" mode, requesting to keep
     # the bed files that show outgroup coverage on the ingroup.
     system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree --outgroupFragmentsDir %s/outgroupFragments --ingroupCoverageDir %s/ingroupCoverages" % (",".join(ingroupPaths), ",".join(outgroupPaths), self.tempOutputFile, self.tempDir, self.tempDir, self.tempDir))
     for ingroupPath in ingroupPaths:
         # Get the coverage from the outgroups independently and
         # check that it's the same as the file in
         # ingroupCoverageDir
         # To filter out alignments from the other ingroup and
         # self-alignments we need to create a fasta with all the
         # outgroup fragments in it.
         outgroupsCombined = getTempFile(rootDir=self.tempDir)
         for outgroupPath in outgroupPaths:
             system("cat %s/outgroupFragments/%s >> %s" % (self.tempDir, os.path.basename(outgroupPath), outgroupsCombined))
         independentCoverageFile = getTempFile(rootDir=self.tempDir)
         system("cactus_coverage --from %s %s %s > %s" % (outgroupsCombined, ingroupPath, self.tempOutputFile, independentCoverageFile))
         # find the coverage file cactus_blast kept (should be
         # named according to the basename of the ingroup path
         # file)
         keptCoverageFile = os.path.join("%s/ingroupCoverages" % self.tempDir, os.path.basename(ingroupPath) + ".bed")
         print(independentCoverageFile)
         self.assertTrue(os.path.isfile(keptCoverageFile))
         self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
    def testCactusRealignDummy(self):
        """Runs cactus realign using the "rescoreOriginalAlignment" mode
        and checks the output is equivalent to what you'd get by just running lastz.
        """
        for seqFile1, seqFile2 in seqFilePairGenerator():

            lastzOutput = getTempFile(rootDir=self.tempDir)
            runLastz(seqFile1,
                     seqFile2,
                     alignmentsFile=lastzOutput,
                     lastzArguments=self.defaultLastzArguments)
            realignOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(seqFile1,
                             seqFile2,
                             inputAlignmentsFile=lastzOutput,
                             outputAlignmentsFile=realignOutput,
                             realignArguments=self.defaultRealignArguments +
                             " --rescoreOriginalAlignment")

            # Each loop variable is bound to the file its name refers to, so
            # the not-None check really applies to the realigned cigar. The
            # ``with`` closes both handles even when an assertion fails.
            with open(lastzOutput, 'r') as lastzFile, \
                    open(realignOutput, 'r') as realignFile:
                for lastzLine, realignLine in zip(lastzFile, realignFile):
                    realignCigar = cigarReadFromString(realignLine)
                    lastzCigar = cigarReadFromString(lastzLine)
                    self.assertIsNotNone(realignCigar)
                    self.assertEqual(realignCigar, lastzCigar)
Exemple #10
0
    def testKeepingCoverageOnIngroups(self):
        """Tests whether the --ingroupCoverageDir option works as
        advertised.

        Blast is run in "ingroups vs outgroups" mode with coverage files
        requested; for each ingroup the kept coverage file must be identical
        to the coverage computed independently from the outgroup fragments.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "cow"]
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        # Real lists rather than map(): the paths are handed to the blast
        # run and then iterated again below, which a one-shot iterator
        # cannot support.
        ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in ingroups]
        outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in outgroups]
        # Run blast in "ingroup vs outgroups" mode, requesting to keep
        # the bed files that show outgroup coverage on the ingroup.
        toilDir = os.path.join(self.tempDir, "tmp_toil")
        outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
        ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
        runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths, outgroups=outgroupPaths, alignmentsFile=self.tempOutputFile, outgroupFragmentPaths=outgroupFragmentPaths, ingroupCoveragePaths=ingroupCoveragePaths, toilDir=toilDir)
        for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
            # Get the coverage from the outgroups independently and
            # check that it's the same as the coverage file that
            # cactus_blast kept for this ingroup.
            # To filter out alignments from the other ingroup and
            # self-alignments we need to create a fasta with all the
            # outgroup fragments in it.
            outgroupsCombined = getTempFile(rootDir=self.tempDir)
            for outgroupFragmentPath in outgroupFragmentPaths:
                system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
            independentCoverageFile = getTempFile(rootDir=self.tempDir)
            calculateCoverage(fromGenome=outgroupsCombined, sequenceFile=ingroupPath, cigarFile=self.tempOutputFile, outputFile=independentCoverageFile)

            self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def down(target, inputFile, fileStart, fileEnd, N, outputFile):
    """Recursively sort a range of a file.

    Takes an input file, the [fileStart, fileEnd) range within it to sort,
    and the location to write the sorted result to. When the range is longer
    than the threshold N it is halved: each half is handed to a child
    ``down`` target and an ``up`` follow-on target is registered with the two
    partial results and outputFile. Otherwise the range is copied to
    outputFile and sorted there.
    """
    injectFailure = random.random() > 0.5
    if injectFailure:
        # This error is a test error, it does not mean the tests have failed.
        raise RuntimeError()

    length = fileEnd - fileStart
    logMessage = "Am running a down target with length: %i from input file: %s" % (length, inputFile)
    target.logToMaster(logMessage)
    assert length >= 0

    if length > N:
        midPoint = getMidPoint(inputFile, fileStart, fileEnd)
        assert midPoint >= fileStart
        assert midPoint+1 < fileEnd
        # One temporary output per half of the range
        partFiles = [getTempFile(rootDir=target.getGlobalTempDir()) for _ in range(2)]
        # The second half starts one past the mid point to avoid the newline
        halves = ((fileStart, midPoint+1), (midPoint+1, fileEnd))
        for (start, end), partFile in zip(halves, partFiles):
            target.addChildTargetFn(down, (inputFile, start, end, N, partFile))
        target.setFollowOnTargetFn(up, (partFiles[0], partFiles[1], outputFile))
    else:
        # The range is small enough to sort directly
        copySubRangeOfFile(inputFile, fileStart, fileEnd, outputFile)
        sort(outputFile)
Exemple #12
0
    def testKeepingCoverageOnIngroups(self):
        """Tests whether the --ingroupCoverageDir option works as
        advertised.

        Blast is run in "ingroups vs outgroups" mode with coverage files
        requested; for each ingroup the kept coverage file must be identical
        to the coverage computed independently from the outgroup fragments.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "cow"]
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        # Real lists rather than map(): the paths are handed to the blast
        # run and then iterated again below, which a one-shot iterator
        # cannot support.
        ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in ingroups]
        outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa") for x in outgroups]
        # Run blast in "ingroup vs outgroups" mode, requesting to keep
        # the bed files that show outgroup coverage on the ingroup.
        toilDir = os.path.join(self.tempDir, "tmp_toil")
        outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
        ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
        runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths, outgroups=outgroupPaths, alignmentsFile=self.tempOutputFile, outgroupFragmentPaths=outgroupFragmentPaths, ingroupCoveragePaths=ingroupCoveragePaths, toilDir=toilDir)
        for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
            # Get the coverage from the outgroups independently and
            # check that it's the same as the coverage file that
            # cactus_blast kept for this ingroup.
            # To filter out alignments from the other ingroup and
            # self-alignments we need to create a fasta with all the
            # outgroup fragments in it.
            outgroupsCombined = getTempFile(rootDir=self.tempDir)
            for outgroupFragmentPath in outgroupFragmentPaths:
                system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
            independentCoverageFile = getTempFile(rootDir=self.tempDir)
            coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
            calculateCoverage(work_dir=coverageWorkDir, fromGenome=outgroupsCombined, sequenceFile=ingroupPath, cigarFile=self.tempOutputFile, outputFile=independentCoverageFile)

            self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
    def testCactusRealignSplitSequences(self):
        """Realigns lastz output twice, once with the default arguments and
        once with indels longer than 100bp split, and checks that both
        results give the same coverage on each of the two sequences."""
        for firstSeq, secondSeq in seqFilePairGenerator():
            alignments = getTempFile(rootDir=self.tempDir)
            runLastz(firstSeq,
                     secondSeq,
                     alignmentsFile=alignments,
                     lastzArguments=self.defaultLastzArguments)

            # Baseline: realign with the default arguments only
            plainOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(firstSeq,
                             secondSeq,
                             inputAlignmentsFile=alignments,
                             outputAlignmentsFile=plainOutput,
                             realignArguments=self.defaultRealignArguments)

            # Same input, additionally splitting long indels
            splitOutput = getTempFile(rootDir=self.tempDir)
            splitArguments = self.defaultRealignArguments + " --splitIndelsLongerThanThis 100"
            runCactusRealign(firstSeq,
                             secondSeq,
                             inputAlignmentsFile=alignments,
                             outputAlignmentsFile=splitOutput,
                             realignArguments=splitArguments)

            # Coverage has to agree on the first sequence, then on the second
            for sequence in (firstSeq, secondSeq):
                splitCoverage = runCactusCoverage(sequence, splitOutput)
                plainCoverage = runCactusCoverage(sequence, plainOutput)
                self.assertTrue(splitCoverage == plainCoverage)

            for finishedFile in (plainOutput, splitOutput):
                os.remove(finishedFile)
    def testMatchGraph(self):
        """ Tests matchGraph.py program using randGraph.py input

        The output is expected to start with a "<vertexNum> <edgeNum>" header
        line followed by one "<i> <j>" line per matched edge. The test checks
        that the vertex count is even, that there are vertexNum/2 edges, that
        every index is in range and that each vertex is in exactly one edge.
        """

        for test in range(self.testNo):
            tempInputFile = getTempFile()
            tempOutputFile = getTempFile()

            self.tempFiles.append(tempInputFile)
            self.tempFiles.append(tempOutputFile)

            # Create sample/test input graph file
            system("blossom_randGraph.py > %s" % tempInputFile)

            # Run matchGraph.py
            system("matchGraph.py -e %s -w %s" % (tempInputFile, tempOutputFile))

            # Now check if output is valid
            vertexNum = edgeNum = None
            vertexArray = []
            lineIdx = 0
            with open(tempOutputFile, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    if lineIdx == 0:
                        (vertexNum, edgeNum) = line.split()
                        vertexNum = int(vertexNum)
                        edgeNum = int(edgeNum)
                        vertexArray = [0] * vertexNum

                        # Number of vertices must be even
                        self.assertEqual(vertexNum % 2, 0)

                        # Number of edges is half the number of vertices
                        self.assertEqual(vertexNum // 2, edgeNum)
                    else:
                        (vertexI, vertexJ,) = line.split()
                        vertexI = int(vertexI)
                        vertexJ = int(vertexJ)

                        # Vertex indices must be 0<= i,j < V. Checked before
                        # indexing so that a negative index cannot silently
                        # wrap around and a too-large one fails as an
                        # assertion rather than an IndexError.
                        self.assertTrue(0 <= vertexI < vertexNum)
                        self.assertTrue(0 <= vertexJ < vertexNum)

                        vertexArray[vertexI] += 1
                        vertexArray[vertexJ] += 1
                    lineIdx += 1

            # An empty output file has no header to validate against
            self.assertTrue(lineIdx > 0, "matchGraph.py produced no output")

            # Must have the correct number of edges
            self.assertEqual(edgeNum, lineIdx - 1)

            # Each vertex must be only in one edge
            badCount = sum(1 for count in vertexArray if count != 1)
            self.assertEqual(badCount, 0)

            logger.info("Ran the test(s) of the matchGraph program okay")
    def testBlossom(self):
        """ Tests blossom5 program using randGraph.py input

        The output is expected to start with a "<vertexNum> <edgeNum>" header
        line followed by one "<i> <j>" line per matched edge. The test checks
        that the vertex count is even, that there are vertexNum/2 edges, that
        every index is in range and that each vertex is in exactly one edge.
        """

        for test in range(self.testNo):
            tempInputFile = getTempFile()
            tempOutputFile = getTempFile()

            self.tempFiles.append(tempInputFile)
            self.tempFiles.append(tempOutputFile)

            # Create sample/test input graph file
            system("blossom_randGraph.py > %s" % tempInputFile)

            # Run blossom5, discarding its stdout and stderr. The portable
            # "> file 2>&1" form is used because ">&" is a csh/bash
            # extension that a POSIX /bin/sh such as dash rejects.
            system("blossom5 -e %s -w %s > /dev/null 2>&1" % (tempInputFile, tempOutputFile))

            # Now check if output is valid
            vertexNum = edgeNum = None
            vertexArray = []
            lineIdx = 0
            with open(tempOutputFile, 'r') as f:
                for line in f:
                    line = line.rstrip()
                    if lineIdx == 0:
                        (vertexNum, edgeNum) = line.split()
                        vertexNum = int(vertexNum)
                        edgeNum = int(edgeNum)
                        vertexArray = [0] * vertexNum

                        # Number of vertices must be even
                        self.assertEqual(vertexNum % 2, 0)

                        # Number of edges is half the number of vertices
                        self.assertEqual(vertexNum // 2, edgeNum)
                    else:
                        (vertexI, vertexJ,) = line.split()
                        vertexI = int(vertexI)
                        vertexJ = int(vertexJ)

                        # Vertex indices must be 0<= i,j < V. Checked before
                        # indexing so that a negative index cannot silently
                        # wrap around and a too-large one fails as an
                        # assertion rather than an IndexError.
                        self.assertTrue(0 <= vertexI < vertexNum)
                        self.assertTrue(0 <= vertexJ < vertexNum)

                        vertexArray[vertexI] += 1
                        vertexArray[vertexJ] += 1
                    lineIdx += 1

            # An empty output file has no header to validate against
            self.assertTrue(lineIdx > 0, "blossom5 produced no output")

            # Must have the correct number of edges
            self.assertEqual(edgeNum, lineIdx - 1)

            # Each vertex must be only in one edge
            badCount = sum(1 for count in vertexArray if count != 1)
            self.assertEqual(badCount, 0)

            logger.info("Ran the test(s) of the blossom program okay")
Exemple #16
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any

        For each region, blast is run with 1, 2, 3 and 4 outgroups; coverage
        diagnostics are printed, and each run with more outgroups must reach
        a sensitivity of at least 0.99 against the single-outgroup run.

        NOTE(review): this block is Python 2 only (print statements, xrange,
        and indexing/slicing of map() results).
        """
        # xrange(1,2) yields only 1, so the single region used is "ENm001"
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        # (sorting the sampled indices keeps the original relative order)
        outgroups = [outgroups[i] for i in sorted(random.sample(xrange(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), ingroups)
            outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
            # results[k] is the alignments file produced with k + 1 outgroups
            results = []
            for numOutgroups in xrange(1,5):
                # Align w/ increasing numbers of outgroups
                # NOTE(review): these temp files are created without a rootDir
                # and are only removed at the end of the region loop, so they
                # are left behind if anything above that point fails.
                subResults = getTempFile()
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
                # The same toil directory path is passed on every iteration
                tmpToil = os.path.join(self.tempDir, "outgroupToil")
                runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)
                results.append(subResults)

            # Print diagnostics about coverage
            # (nothing is asserted in this loop)
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    ingroupCoverage = getTempFile(rootDir=self.tempDir)
                    coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                    calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                    # Sum (column 3 - column 2) over all lines of the coverage file
                    coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                    print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

            resultsSets = map(lambda x : loadResults(x), results)
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results from (n+1) outgroups are
                # (very nearly) a superset of the results from n outgroups
                # NOTE(review): every run is compared against resultsSets[0]
                # (the single-outgroup run), not against the run immediately
                # before it.
                print "Using %d addl outgroup(s):" % (i + 1)
                comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
                print comparator
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Ensure that the new alignments don't cover more than
            # x% of already existing alignments to human
            # NOTE(review): this loop only prints the re-covered fraction; no
            # threshold is asserted.
            for i in xrange(1, len(resultsSets)):
                # presumably element 0 of each loadResults() value is a set of
                # alignment tuples (difference() is called on it) - confirm
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                # For each alignment touching "human", keep the human side:
                # fields (0, 1) when "human" is in field 0, else fields (2, 3)
                prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
                print "addl outgroup %d:" % i
                # Raises ZeroDivisionError if prevResultsHumanPos is empty
                print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
            for subResult in results:
                os.remove(subResult)
Exemple #17
0
    def testAddingOutgroupsImprovesResult(self):
        """Check that aligning against more outgroups does not lose alignments.

        The ENm001 human and macaque sequences are aligned against a growing
        prefix of a random, order-preserving selection of outgroups. Coverage
        diagnostics are printed for every run, and each run with additional
        outgroups must reach a comparator sensitivity of at least 0.99
        against the single-outgroup run.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "macaque"]
        candidateOutgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        MAX_NUM_OUTGROUPS = 3
        # Pick a random subset of the candidates while keeping their order.
        chosenIndices = sorted(random.sample(xrange(len(candidateOutgroups)), MAX_NUM_OUTGROUPS))
        outgroups = [candidateOutgroups[index] for index in chosenIndices]
        regionPath = os.path.join(self.encodePath, encodeRegion)

        def fastaPath(species):
            # Path of the FASTA file holding this species' copy of the region.
            return os.path.join(regionPath, species + "." + encodeRegion + ".fa")

        def humanPositions(alignments):
            # (name, position) of the human side of every alignment entry
            # that involves a human sequence on either side.
            found = set()
            for entry in alignments:
                if "human" in entry[0]:
                    found.add((entry[0], entry[1]))
                elif "human" in entry[2]:
                    found.add((entry[2], entry[3]))
            return found

        ingroupPaths = [fastaPath(species) for species in ingroups]
        outgroupPaths = [fastaPath(species) for species in outgroups]

        # Align with 1, 2, ... len(outgroups) outgroups, one result file each.
        results = []
        for numOutgroups in xrange(1, len(outgroups) + 1):
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
            tmpToil = os.path.join(self.tempDir, "outgroupToil")
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)
            results.append(subResults)

        # Print diagnostics about coverage
        for numUsed, subResults in enumerate(results, 1):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                calculateCoverage(sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                print("covered bases on %s using %d outgroups: %s" % (ingroup, numUsed, coveredBases))

        resultsSets = [loadResults(resultPath) for resultPath in results]

        # Every run with extra outgroups is compared against the first
        # (single-outgroup) run and must recover (very nearly) all of it.
        baseline = resultsSets[0]
        for numExtra, moreOutgroupsResults in enumerate(resultsSets[1:], 1):
            print("Using %d addl outgroup(s):" % numExtra)
            comparator = ResultComparator(baseline, moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Report how many human positions that were already aligned get
        # covered again by the alignments each additional outgroup adds.
        for i in xrange(1, len(resultsSets)):
            prevResults = resultsSets[i - 1][0]
            curResults = resultsSets[i][0]
            prevResultsHumanPos = humanPositions(prevResults)
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = humanPositions(newAlignments)
            print("addl outgroup %d:" % i)
            numRecovered = len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))
            print("bases re-covered: %f (%d)" % (numRecovered / float(len(prevResultsHumanPos)), numRecovered))

        for subResult in results:
            os.remove(subResult)
Exemple #18
0
    def liftover(self, bedLine):
        """Lift a bedLine over to the target genome, parse the PSL output, and
        return a map from target sequence -> [(query block, [target
        block(s)])]

        Blocks are (start, end, strand) where start < end

        The bed line is written to a scratch file, halLiftover is run with
        PSL output, and every PSL record is split into per-block
        (query block, target block) pairs keyed by target sequence name. The
        collected pairs are handed to self.mergeBlocks(), whose result is
        returned. Both scratch files are removed even if halLiftover fails.
        """
        tempSrc = getTempFile("ContiguousRegions.tempSrc.bed",
                              rootDir=self.tempRoot)
        tempDest = getTempFile("ContiguousRegions.tempDest.psl",
                               rootDir=self.tempRoot)
        try:
            with open(tempSrc, 'w') as srcHandle:
                srcHandle.write("%s\n" % bedLine)
            cmd = "halLiftover --outPSL %s %s %s %s %s" % (self.alignment,
                                                           self.srcGenome,
                                                           tempSrc,
                                                           self.destGenome,
                                                           tempDest)
            bioio.system(cmd)
            with open(tempDest) as destHandle:
                rawPslLines = destHandle.read().split("\n")
        finally:
            # Clean up the scratch files on success and on failure alike.
            for scratchPath in (tempSrc, tempDest):
                if os.path.exists(scratchPath):
                    os.remove(scratchPath)

        # Get target blocks for every query block. All adjacencies
        # within a block are by definition preserved. Adjacencies
        # between target blocks (and query blocks with the commandline
        # option) are what determine if the structure is preserved.
        # dict is to keep blocks separated by target sequence & strand
        blocks = defaultdict(list)
        for rawLine in rawPslLines:
            pslLine = rawLine.split()
            if not pslLine:
                # Blank line (e.g. the trailing newline of the file).
                continue
            # Field 8 holds the query strand, optionally followed by the
            # target strand; a missing target strand means '+'.
            strandField = pslLine[8]
            qStrand = strandField[0]
            assert(qStrand == '+')
            if len(strandField) != 1:
                assert(len(strandField) == 2)
                tStrand = strandField[1]
            else:
                tStrand = '+'
            tName = pslLine[13]
            tSize = int(pslLine[14])
            blockSizes = [int(i) for i in pslLine[18].split(",") if i != '']
            qStarts = [int(i) for i in pslLine[19].split(",") if i != '']
            tStarts = [int(i) for i in pslLine[20].split(",") if i != '']
            assert(len(blockSizes) == len(qStarts) and
                   len(qStarts) == len(tStarts))
            for blockLen, qStart, tStart in zip(blockSizes, qStarts, tStarts):
                qBlock = (qStart, qStart + blockLen, qStrand)
                if tStrand == '+':
                    tBlock = (tStart, tStart + blockLen, tStrand)
                else:
                    # Mirror the coordinates against the target size so that
                    # start < end still holds for the other strand.
                    tBlock = (tSize - tStart - blockLen, tSize - tStart, tStrand)
                blocks[tName].append((qBlock, tBlock))

        # Sort & merge query blocks in cases of duplication
        return self.mergeBlocks(blocks)
Exemple #19
0
    def liftover(self, bedLine):
        """Lift a bedLine over to the target genome, parse the PSL output, and
        return a map from target sequence -> [(query block, [target
        block(s)])]

        Blocks are (start, end, strand) where start < end

        halLiftover is run on a scratch copy of bedLine with PSL output. Each
        PSL record contributes one (query block, target block) pair per
        aligned block, grouped by target sequence name, and the grouped pairs
        are passed through self.mergeBlocks() before being returned. The
        scratch files are deleted whether or not halLiftover succeeds.
        """
        tempSrc = getTempFile("ContiguousRegions.tempSrc.bed",
                              rootDir=self.tempRoot)
        tempDest = getTempFile("ContiguousRegions.tempDest.psl",
                               rootDir=self.tempRoot)
        try:
            with open(tempSrc, 'w') as bedHandle:
                bedHandle.write("%s\n" % bedLine)
            cmd = "halLiftover --outPSL %s %s %s %s %s" % (self.alignment,
                                                           self.srcGenome,
                                                           tempSrc,
                                                           self.destGenome,
                                                           tempDest)
            bioio.system(cmd)
            with open(tempDest) as pslHandle:
                pslLines = [x.split() for x in pslHandle.read().split("\n")]
        finally:
            # Remove both scratch files even when the liftover command fails.
            for scratchPath in (tempSrc, tempDest):
                if os.path.exists(scratchPath):
                    os.remove(scratchPath)

        # Get target blocks for every query block. All adjacencies
        # within a block are by definition preserved. Adjacencies
        # between target blocks (and query blocks with the commandline
        # option) are what determine if the structure is preserved.
        # dict is to keep blocks separated by target sequence & strand
        blocks = defaultdict(list)
        for pslLine in pslLines:
            if not pslLine:
                # Skip blank lines such as the one after the final newline.
                continue
            # Field 8 is the query strand plus an optional target strand.
            strands = pslLine[8]
            qStrand = strands[0]
            assert(qStrand == '+')
            if len(strands) == 1:
                tStrand = '+'
            else:
                assert(len(strands) == 2)
                tStrand = strands[1]
            tName = pslLine[13]
            tSize = int(pslLine[14])
            blockSizes = [int(i) for i in pslLine[18].split(",") if i != '']
            qStarts = [int(i) for i in pslLine[19].split(",") if i != '']
            tStarts = [int(i) for i in pslLine[20].split(",") if i != '']
            assert(len(blockSizes) == len(qStarts) and
                   len(qStarts) == len(tStarts))
            for blockLen, qStart, tStart in zip(blockSizes, qStarts, tStarts):
                qBlock = (qStart, qStart + blockLen, qStrand)
                if tStrand == '+':
                    tBlock = (tStart, tStart + blockLen, tStrand)
                else:
                    # Flip the interval around the target size so start < end.
                    tBlock = (tSize - tStart - blockLen, tSize - tStart, tStrand)
                blocks[tName].append((qBlock, tBlock))

        # Sort & merge query blocks in cases of duplication
        return self.mergeBlocks(blocks)
Exemple #20
0
 def testRepeatBed(self):
     """Run getRepeatBed on a small FASTA file and check its intervals.

     A two-sequence FASTA file is written to the current directory,
     getRepeatBed is run on it, and each whitespace-separated output line
     is parsed as (name, start, end) and compared against the expected
     intervals. Both temporary files are removed even if the check fails.
     """
     tempFile = getTempFile(rootDir=os.getcwd())
     tempFile2 = getTempFile(rootDir=os.getcwd())
     try:
         with open(tempFile, 'w') as fileHandle:
             fileHandle.write(">hello boo\nacTGACCCCgtcgAAcAAccc\n>foo\nAaaAAAAAAA")
         system("getRepeatBed %s %s" % (tempFile, tempFile2))
         intervals = []
         with open(tempFile2, 'r') as fileHandle:
             for bedLine in fileHandle:
                 # Each line must have exactly three fields; anything else
                 # raises ValueError on unpacking.
                 name, start, end = bedLine.split()
                 intervals.append((name, int(start), int(end)))
         print(intervals)
         assert intervals == [ ("hello", 0, 2), ("hello", 9, 13), ("hello", 15, 16), ("hello", 18, 21), ("foo", 1, 3) ]
     finally:
         os.remove(tempFile)
         os.remove(tempFile2)
Exemple #21
0
 def testCopySubRangeOfFile(self):
     """Check copySubRangeOfFile against a plain Python slice of the file.

     For each of self.testNo rounds a random file is generated, a random
     [fileStart, fileEnd) range inside it is copied with
     copySubRangeOfFile, and the copy is compared with the same slice read
     directly. The per-round temporary directory is removed even when a
     check fails.
     """
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         try:
             tempFile = getTempFile(rootDir=tempDir)
             outputFile = getTempFile(rootDir=tempDir)
             makeFileToSort(tempFile)
             fileSize = os.path.getsize(tempFile)
             assert fileSize > 0
             fileStart = random.choice(xrange(0, fileSize))
             fileEnd = random.choice(xrange(fileStart, fileSize))
             copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
             with open(outputFile, 'r') as copiedHandle:
                 copied = copiedHandle.read()
             with open(tempFile, 'r') as sourceHandle:
                 expected = sourceHandle.read()[fileStart:fileEnd]
             checkEqual(copied, expected)
         finally:
             system("rm -rf %s" % tempDir)
 def testInvariants(self):
     """Check that cactus_coverage emits sorted, non-overlapping intervals.

     Two random non-chimp encode sequences are aligned with cPecanLastz,
     cactus_coverage is run on the second one, and every output line is
     checked: each interval must be at least one base long, and an interval
     on the same sequence as the previous line must start after the previous
     start and not before the previous end.
     """
     (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
     # Chimp encode input has duplicate header names.
     seqs = [i for i in seqs if 'chimp' not in i]
     seqs = random.sample(seqs, 2)
     cigarPath = getTempFile()
     try:
         cactus_call(parameters=[
             "cPecanLastz", "--format=cigar",
             "%s[multiple]" % seqs[0],
             "%s[multiple]" % seqs[1]
         ],
                     outfile=cigarPath)
         bed = cactus_call(parameters=["cactus_coverage", seqs[1], cigarPath],
                           check_output=True)
         prevChrom = None
         prevStart = None
         prevEnd = None
         # Check that everything is sorted and there are no overlaps
         for line in bed.split("\n"):
             # strip() returns a new string; keep it so whitespace-only
             # lines are skipped rather than parsed.
             line = line.strip()
             if line == "":
                 continue
             fields = line.split()
             chrom = fields[0]
             start = int(fields[1])
             end = int(fields[2])
             self.assertTrue(end - start >= 1)
             if chrom == prevChrom:
                 self.assertTrue(start > prevStart)
                 self.assertTrue(start >= prevEnd)
             # Remember this interval; without this the comparison above
             # would never run.
             prevChrom, prevStart, prevEnd = chrom, start, end
     finally:
         os.remove(cigarPath)
    def run(self):
        """Prepare the input sequence and either copy it or schedule preprocessing.

        A directory input is first concatenated into one temporary file.
        Assembly statistics for the input are logged to the master. If the
        config has no "preprocessor" elements the input is copied to the
        output path; otherwise a BatchPreprocessor child target is added.
        """
        sourcePath = self.inputSequenceFileOrDirectory
        if os.path.isdir(sourcePath):
            # If the files are in a sub-dir then rip them out.
            inputSequenceFile = getTempFile(rootDir=self.getGlobalTempDir())
            memberPaths = [os.path.join(sourcePath, entry)
                           for entry in os.listdir(sourcePath)]
            catFiles(memberPaths, inputSequenceFile)
        else:
            inputSequenceFile = sourcePath

        assert inputSequenceFile != self.outputSequenceFile

        prepXmlElems = self.configNode.findall("preprocessor")

        analysisString = runCactusAnalyseAssembly(inputSequenceFile)
        statsMessage = (
            "Before running any preprocessing on the assembly: %s got following stats (assembly may be listed as temp file if input sequences from a directory): %s"
            % (self.inputSequenceFileOrDirectory, analysisString)
        )
        self.logToMaster(statsMessage)

        if len(prepXmlElems) > 0:
            logger.info("Adding child batch_preprocessor target")
            preprocessor = BatchPreprocessor(prepXmlElems, inputSequenceFile,
                                             self.outputSequenceFile, 0)
            self.addChildTarget(preprocessor)
        else:
            # Nothing to preprocess: just cp the file to the output file.
            system("cp %s %s" % (inputSequenceFile, self.outputSequenceFile))
def setup(target, inputFile, N):
    """Sets up the sort.

    Schedules `down` as a child over the byte range [0, size of inputFile)
    writing into a fresh temporary file in the target's global temp dir,
    and `cleanup` as the follow-on with that temporary file and inputFile.
    """
    sortedOutputFile = getTempFile(rootDir=target.getGlobalTempDir())
    wholeFileArgs = (inputFile, 0, os.path.getsize(inputFile), N,
                     sortedOutputFile)
    target.addChildTargetFn(down, wholeFileArgs)
    target.setFollowOnFn(cleanup, (sortedOutputFile, inputFile))
Exemple #25
0
 def progressiveFunction(self, experimentFile, toilDir, batchSystem,
                         buildAvgs, buildHal, buildFasta, toilStats,
                         subtreeRoot=None, logLevel=None):
     """Convert an experiment XML into a seqFile and run cactus progressive.

     The seqFile is a temporary file holding the experiment's tree in
     Newick form on the first line, followed by one "<genome>
     <sequence id>" line per genome with sequence. buildHal, buildFasta
     and subtreeRoot are accepted but not used by this body.
     """
     experiment = ExperimentWrapper(ET.parse(experimentFile).getroot())
     seqFile = getTempFile()
     with open(seqFile, 'w') as seqHandle:
         speciesTree = experiment.getTree()
         seqHandle.write('%s\n' % NXNewick().writeString(speciesTree))
         for genome in experiment.getGenomesWithSequence():
             sequenceEntry = (genome, experiment.getSequenceID(genome))
             seqHandle.write('%s %s\n' % sequenceEntry)
     configPath = experiment.getConfigPath()
     runCactusProgressive(seqFile, configPath, toilDir,
                          batchSystem=batchSystem,
                          buildAvgs=buildAvgs,
                          toilStats=toilStats,
                          logLevel=logLevel)
Exemple #26
0
 def getFastaDict(self):
     """Dump self.genome from the HAL file and return {header: sequence}.

     hal2fasta output is redirected into a temporary file in the global
     temp dir, which is then read back with fastaRead. When a header occurs
     more than once the last sequence read wins.
     """
     fastaPath = getTempFile(rootDir=self.getGlobalTempDir())
     system("hal2fasta %s %s > %s" % (self.halPath, self.genome, fastaPath))
     return dict(fastaRead(fastaPath))
Exemple #27
0
def getRandomConfigFile():
    """Write a randomised copy of cactus_config.xml and return its path.

    The "caf" node's attributes are replaced with random values, the
    "check" node gets runCheck="1" and the "normal" node gets
    iterations="2". The result is written to a temporary .xml file under
    "./" and echoed with cat when the log level is DEBUG.
    """
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1

    def randomBelow(limit):
        # Random integer in [0, limit), drawn with a single random.random().
        return int(random.random() * limit)

    cafAttrs = cafNode.attrib
    annealingRounds = 1 + randomBelow(10)
    cafAttrs["annealingRounds"] = " ".join(
        str(1 + randomBelow(10)) for _ in xrange(annealingRounds))
    # Deannealing rounds are distinct values listed in ascending order.
    deannealingRounds = sorted(
        set(1 + randomBelow(10) for _ in xrange(randomBelow(10))))
    cafAttrs["deannealingRounds"] = " ".join(str(rounds) for rounds in deannealingRounds)
    cafAttrs["trim"] = " ".join(
        str(1 + randomBelow(5)) for _ in xrange(annealingRounds))

    cafAttrs["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)

    cafAttrs["minimumTreeCoverage"] = str(random.random())
    cafAttrs["blockTrim"] = str(randomBelow(5))
    cafAttrs["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafAttrs["minimumBlockDegree"] = str(random.choice([0, 5]))

    config.find("check").attrib["runCheck"] = "1"
    config.find("normal").attrib["iterations"] = "2"

    # Now write the file out.
    with open(tempConfigFile, 'w') as configHandle:
        ET.ElementTree(config).write(configHandle)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
Exemple #28
0
 def testInvariants(self):
     """Check that cactus_coverage emits sorted, non-overlapping intervals.

     Two random non-chimp encode sequences are aligned with cPecanLastz
     into a temporary cigar file, cactus_coverage is run on the second
     sequence, and each output line is checked: the interval must span at
     least one base, and an interval on the same sequence as the previous
     line must start after the previous start and at or after the previous
     end.
     """
     (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
     # Chimp encode input has duplicate header names.
     seqs = [i for i in seqs if 'chimp' not in i]
     seqs = random.sample(seqs, 2)
     cigarPath = getTempFile()
     try:
         system("cPecanLastz --format=cigar %s[multiple] %s[multiple] > %s" % \
                (seqs[0], seqs[1], cigarPath))
         bed = popenCatch("cactus_coverage %s %s" % (seqs[1], cigarPath))
         prevChrom = None
         prevStart = None
         prevEnd = None
         # Check that everything is sorted and there are no overlaps
         for line in bed.split("\n"):
             # Keep the stripped value so whitespace-only lines are skipped.
             line = line.strip()
             if line == "":
                 continue
             fields = line.split()
             chrom = fields[0]
             start = int(fields[1])
             end = int(fields[2])
             self.assertTrue(end - start >= 1)
             if chrom == prevChrom:
                 self.assertTrue(start > prevStart)
                 self.assertTrue(start >= prevEnd)
             # Record this interval so the next line is compared with it.
             prevChrom, prevStart, prevEnd = chrom, start, end
     finally:
         os.remove(cigarPath)
Exemple #29
0
    def wrap(self):
        """Run fn(self) with stdout/stderr captured, replaying them on failure.

        File descriptors 1 and 2 are redirected to a temporary log file while
        fn (taken from the enclosing scope) runs. If fn raises, the captured
        log is copied to the original stdout and the exception is re-raised.
        In every case the original descriptors are restored, the saved
        duplicates are closed and the log file is deleted.
        """
        # Pretty much ripped from the toil worker.py setup.
        tempPath = getTempFile()
        oldStdout = os.dup(1)
        oldStderr = os.dup(2)

        #Open the file to send stdout/stderr to.
        logFh = os.open(tempPath, os.O_RDWR | os.O_CREAT | os.O_APPEND)

        #Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        #Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)
        try:
            fn(self)
        except:
            # Bare except is intentional: whatever was raised is re-raised
            # below once the captured output has been replayed.
            os.lseek(logFh, 0, os.SEEK_SET)
            chunk = os.read(logFh, 65536)
            while chunk:
                # os.write may write only part of the buffer.
                while chunk:
                    written = os.write(oldStdout, chunk)
                    chunk = chunk[written:]
                chunk = os.read(logFh, 65536)
            raise
        finally:
            # Close the descriptor we used to open the file
            os.close(logFh)
            # Reset stdout and stderr
            os.dup2(oldStdout, 1)
            os.dup2(oldStderr, 2)
            # The saved duplicates are no longer needed; without closing
            # them every call would leak two descriptors.
            os.close(oldStdout)
            os.close(oldStderr)
            os.remove(tempPath)
Exemple #30
0
    def wrap(self):
        """Run fn(self) with stdout/stderr captured, replaying them on failure.

        While fn (from the enclosing scope) runs, file descriptors 1 and 2
        point at a temporary log file. On an exception the log contents are
        written to the original stdout before the exception is re-raised.
        The original descriptors are always restored, the saved duplicates
        closed and the log file removed.
        """
        # Pretty much ripped from the toil worker.py setup.
        tempPath = getTempFile()
        oldStdout = os.dup(1)
        oldStderr = os.dup(2)

        #Open the file to send stdout/stderr to.
        logFh = os.open(tempPath, os.O_RDWR | os.O_CREAT | os.O_APPEND)

        #Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        #Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)
        try:
            fn(self)
        except:
            # Deliberately bare: the exception is always re-raised after the
            # captured output has been copied to the real stdout.
            os.lseek(logFh, 0, os.SEEK_SET)
            chunk = os.read(logFh, 65536)
            while chunk:
                # Loop because os.write can return after a partial write.
                while chunk:
                    written = os.write(oldStdout, chunk)
                    chunk = chunk[written:]
                chunk = os.read(logFh, 65536)
            raise
        finally:
            # Close the descriptor we used to open the file
            os.close(logFh)
            # Reset stdout and stderr
            os.dup2(oldStdout, 1)
            os.dup2(oldStderr, 2)
            # Release the saved duplicates so descriptors do not leak.
            os.close(oldStdout)
            os.close(oldStderr)
            os.remove(tempPath)
    def run(self):
        """Sample reference positions and fan them out to ScoreColumns jobs.

        Up to opts.numSamples distinct positions are drawn (duplicates are
        dropped, not redrawn). They are cut into batches of
        opts.samplesPerJob, each scored by a ScoreColumns child writing to
        its own temporary file, and a Summarize follow-on combines the
        outputs. Batches are cut over range(numSamples), so when duplicates
        were dropped the trailing batches can be short or empty.
        """
        opts = self.opts
        speciesTree = popenCatch("halStats --tree %s" % (opts.halFile)).strip()
        chromSizes = getChromSizes(opts.halFile, opts.refGenome)

        positions = []
        # For ensuring that a column isn't counted multiple times from
        # different reference positions.
        positionSet = set()
        for _ in xrange(opts.numSamples):
            # Have to sample the columns here since otherwise it can
            # be difficult to independently seed several RNGs
            pos = samplePosition(chromSizes)
            if pos in positionSet:
                continue
            positions.append(pos)
            positionSet.add(pos)

        outputs = []
        batchSize = opts.samplesPerJob
        for batchStart in xrange(0, opts.numSamples, batchSize):
            batch = positions[batchStart:batchStart + batchSize]
            outputFile = getTempFile(rootDir=self.getGlobalTempDir())
            outputs.append(outputFile)
            scorer = ScoreColumns(opts, batch, outputFile, speciesTree,
                                  positionSet)
            self.addChildTarget(scorer)
        summary = Summarize(opts, outputs, opts.outputFile,
                            opts.writeMismatchesToFile)
        self.setFollowOnTarget(summary)
Exemple #32
0
 def getFastaDict(self):
     """Return a {header: sequence} dict for self.genome in the HAL file.

     The genome is exported with hal2fasta into a temporary file in the
     global temp dir and parsed with fastaRead. A repeated header keeps
     the sequence that was read last.
     """
     exportedFasta = getTempFile(rootDir=self.getGlobalTempDir())
     exportCommand = "hal2fasta %s %s > %s" % (self.halPath, self.genome,
                                               exportedFasta)
     system(exportCommand)
     return dict((header, seq) for header, seq in fastaRead(exportedFasta))
Exemple #33
0
def scriptTree_SortTest(testNo, batchSystem, lines=10000, maxLineLength=10, N=10000):
    """Tests scriptTree/jobTree by sorting a file in parallel.

    For each of testNo rounds a random file is generated and sorted in
    memory for reference, then scriptTreeTest_Sort.py is run on it and
    re-run until jobTreeStatus reports the job tree as complete. The file
    on disk must then match the reference ordering.

    testNo -- number of rounds to run.
    batchSystem -- value passed to the script's --batchSystem option.
    lines, maxLineLength -- forwarded to makeFileToSort.
    N -- value passed to the script's --N option.
    """
    for test in xrange(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "testJobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        #First make our own sorted version
        with open(tempFile, 'r') as fileHandle:
            expectedLines = fileHandle.readlines()
        expectedLines.sort()
        #Sort the file
        command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxCpus 20 --retryCount 2" % (jobTreeDir, tempFile, N, batchSystem) #, retryCount)
        while True:
            system(command)
            try:
                system("jobTreeStatus --jobTree %s --failIfNotComplete" % jobTreeDir)
            except Exception:
                # Only ordinary failures trigger a restart; KeyboardInterrupt
                # and SystemExit must still be able to stop the loop.
                print("The jobtree failed and will be restarted")
            else:
                break

        #Now check the file is properly sorted..
        #Now get the sorted file
        with open(tempFile, 'r') as fileHandle:
            sortedLines = fileHandle.readlines()
        checkEqual(expectedLines, sortedLines)
        system("rm -rf %s" % tempDir)
Exemple #34
0
 def testMerge(self):
     """Check merge() against sorting the concatenation of both inputs.

     Each of self.testNo rounds generates two random files, sorts each
     with sort(), merges them into a third file, and compares the merged
     lines with the sorted concatenation of the two inputs.
     """
     for test in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         firstInput, secondInput, mergedOutput = [
             getTempFile(rootDir=tempDir) for _ in xrange(3)]
         inputs = (firstInput, secondInput)
         for inputPath in inputs:
             makeFileToSort(inputPath)
         for inputPath in inputs:
             sort(inputPath)
         merge(firstInput, secondInput, mergedOutput)
         expectedLines = loadFile(firstInput) + loadFile(secondInput)
         expectedLines.sort()
         mergedLines = loadFile(mergedOutput)
         checkEqual(expectedLines, mergedLines)
         system("rm -rf %s" % tempDir)
def parasolRestart():
    """Function starts the parasol hub and node.

    Parasol is stopped, then the node and hub are started repeatedly until
    "parasol status" reports a "Nodes dead" count of 0. After each failed
    attempt parasol is stopped again and the function sleeps 5 seconds.
    """
    parasolStop()
    while True:
        machineList = os.path.join(workflowRootPath(), "jobTree", "machineList")
        os.system("paraNode start -hub=localhost")
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        allNodesAlive = False
        try:
            popen("parasol status", tempFile)
            statusHandle = open(tempFile, 'r')
            for statusLine in statusHandle:
                if "Nodes dead" not in statusLine:
                    continue
                print(statusLine)
                # The dead-node count is the last token on the line.
                if int(statusLine.split()[-1]) == 0:
                    allNodesAlive = True
            statusHandle.close()
        except RuntimeError:
            # Treated the same as "nodes still dead": retry below.
            pass
        os.remove(tempFile)
        if allNodesAlive:
            break
        logger.info("Tried to restart the parasol process, but failed, will try again")
        parasolStop()
        time.sleep(5)
    logger.info("Restarted the parasol process")
def killMasterAndParasol():
    """Method to destroy master process

    Scans "ps -a" output line by line. Each paraNode/paraHub line and each
    jobTreeMaster.py line is killed with probability one half; the scan
    stops after the first kill attempt. Parasol is restarted afterwards
    regardless of what was killed.
    """
    tempFile = getTempFile()
    popen("ps -a", tempFile)
    psHandle = open(tempFile, 'r')
    #Example parasol state lines:
    #67401 ttys002    0:00.06 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    #67403 ttys002    0:00.65 /Users/benedictpaten/kent/src/parasol/bin/paraHub -log=/tmp/hub.2009-07-08.log machineList subnet=127.0.0
    for psLine in psHandle:
        if 'paraNode' in psLine or 'paraHub' in psLine:
            report = "Tried to kill parasol process: %i, line: %s, exit value: %i"
        elif 'jobTreeMaster.py' in psLine:
            logger.info("Have job tree master line")
            report = "Tried to kill master process: %i, line: %s, exit value: %i"
        else:
            continue
        if random.random() > 0.5:
            # The process id is the first column of the ps output.
            pid = int(psLine.split()[0])
            exitValue = os.system("kill %i" % pid)
            logger.info(report % (pid, psLine, exitValue))
            break
    psHandle.close()
    os.remove(tempFile)
    parasolRestart()
 def run(self):
     """Build the cactus alignment if it is missing, then schedule MakeStats1.

     When <outputDir>/cactusAlignment does not exist, the assembly is
     copied (and gunzipped if its name ends in ".gz") into the local temp
     dir, an experiment file is written, the cactus workflow is run on a
     single machine, job tree stats are computed, and everything in the
     local temp dir is moved into the output dir. A MakeStats1 child is
     added in all cases.
     """
     cactusAlignmentName = "cactusAlignment"
     cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
     if not os.path.exists(cactusAlignment):
         # Prepare the assembly: copy it locally, unpacking it if gzipped.
         isGzipped = self.assemblyFile.endswith('.gz')
         if isGzipped:
             assemblySuffix = ".gz"
         else:
             assemblySuffix = ""
         tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                        suffix=assemblySuffix)
         system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
         if isGzipped:
             system("gunzip %s" % tempAssemblyFile)
             # gunzip drops the ".gz" extension from the file name.
             tempAssemblyFile = tempAssemblyFile[:-3]
             assert os.path.exists(tempAssemblyFile)
         # Make the supporting temporary files
         tempExperimentFile = getTempFile(rootDir=self.getLocalTempDir())
         tempJobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")
         # Make the experiment file
         allSequences = self.haplotypeSequences + [tempAssemblyFile]
         cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
             sequences=allSequences,
             newickTreeString=self.newickTree,
             outputDir=self.getLocalTempDir(),
             configFile=self.configFile)
         cactusWorkflowExperiment.setDbName(cactusAlignmentName)
         # This needs to be set to ensure the thing gets put in the right
         # directory
         dbDir = os.path.join(self.getLocalTempDir(),
                              cactusWorkflowExperiment.getDbName())
         cactusWorkflowExperiment.setDbDir(dbDir)
         cactusWorkflowExperiment.writeXML(tempExperimentFile)
         # Now run cactus workflow
         runCactusWorkflow(experimentFile=tempExperimentFile,
                           jobTreeDir=tempJobTreeDir,
                           buildAvgs=False,
                           buildReference=True,
                           batchSystem="single_machine",
                           maxThreads=1,
                           jobTreeStats=True)
         logger.info("Ran the workflow")
         # Check that the job tree completed successfully.
         runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
         logger.info("Checked the job tree dir")
         # Compute the stats
         # cactusAlignmentDir is not used below.
         cactusAlignmentDir = os.path.join(self.getLocalTempDir(),
                                           cactusAlignmentName)
         tempJobTreeStatsFile = os.path.join(self.getLocalTempDir(),
                                             "jobTreeStats.xml")
         system("jobTreeStats --jobTree %s --outputFile %s" % (tempJobTreeDir, tempJobTreeStatsFile))
         # Now move everything from the local temp dir to the output dir
         system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
         assert os.path.exists(cactusAlignment)
         # We're done!
     self.addChildTarget(MakeStats1(self.outputDir, cactusAlignment, self.options))
def main():
    """First-stage command of the job tree test: spawn one child per tree child.

    Reads the job XML (--job) and the tree pointer XML (--treePointer),
    loads the tree file named by the pointer, and for every child of the
    tree adds a child element to the job that runs
    jobTreeTest_CommandFirst.py on a new tree pointer. The job's own
    command is then replaced by jobTreeTest_CommandSecond.py and the job
    file is rewritten. The process exits with status 1 with probability
    0.1 and with status 0 otherwise.
    """
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--job", dest="jobFile", 
                      help="Job file containing command to run",
                      default="None")
    
    parser.add_option("--treePointer", dest="treePointerFile", 
                      help="File containing pointer to the tree data",
                      default="None")
    
    options, args = parseBasicOptions(parser)
    
    logger.info("Parsed the input arguments")
    
    job = ET.parse(options.jobFile).getroot() 
    setLogLevel(job.attrib["log_level"])
    
    logger.info("Parsed the job XML")
    
    treePointer = ET.parse(options.treePointerFile).getroot() 
    
    logger.info("Parsed the tree pointer XML")
    
    # The pointer's "file" attribute names the XML file holding the tree.
    tree = ET.parse(treePointer.attrib["file"]).getroot()
    
    logger.info("Parsed the tree XML")
    
    for child in tree.find("children").findall("child"):
        #Make the child tree pointer
        childTreePointerFile = makeTreePointer(child.attrib["file"], getTempFile(rootDir=job.attrib["global_temp_dir"]))
        #Make the child command
        unbornChild = ET.SubElement(job.find("children"), "child")
        command = "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % \
        (childTreePointerFile,)
        unbornChild.attrib["command"] = command
        # With probability 0.8 give the child a random "time" in [0, 10).
        if random.random() > 0.2:
            unbornChild.attrib["time"] = str(random.random() * 10)
        #Make the child tree pointer
        # NOTE(review): treePointer is only updated in memory here; this
        # function never writes it back to options.treePointerFile.
        ET.SubElement(treePointer.find("children"), "child", { "file":childTreePointerFile })
    
    # The job's command is switched to the second-stage script.
    job.attrib["command"] = "jobTreeTest_CommandSecond.py --treePointer %s --job JOB_FILE" % \
    (options.treePointerFile,)
    logger.info("Made new command")

    fileHandle = open(options.jobFile, 'w')
    ET.ElementTree(job).write(fileHandle)
    fileHandle.close()
    
    logger.info("Updated the job file")
    
    print >>sys.stderr, "Checking that we can report to std err" #These lines should end up in the logs
    print "Checking that we can report to std out"

    # Fail at random (probability 0.1).
    if random.random() > 0.9:
        logger.info("Going to fail the job")
        sys.exit(1)
    logger.info("Going to pass the job done okay")
    sys.exit(0)
Exemple #39
0
def getCactusInputs_randomWithConstraints(regionNumber=0, tempDir=None):
    """Return random cactus inputs plus a file of random constraints.

    The sequences and tree come from getCactusInputs_random. Random
    pairwise alignments over the sequences' fastas are written in cigar
    format, without probabilities, to a temporary file under tempDir.

    Returns (sequenceDirs, newickTreeString, constraints), where
    constraints is the path of the cigar file.
    """
    sequenceDirs, newickTreeString = getCactusInputs_random(regionNumber=regionNumber, tempDir=tempDir)
    constraints = getTempFile(rootDir=tempDir)
    # The with-block closes the handle even if writing an alignment fails.
    with open(constraints, 'w') as fileHandle:
        for pairwiseAlignment in makeRandomConstraints(getFastasFromSequence(sequenceDirs)):
            cigarWrite(fileHandle, pairwiseAlignment, withProbs=False)
    return sequenceDirs, newickTreeString, constraints
Exemple #40
0
def getConfigFile(matchingAlgorithm="greedy"):
    """Write a copy of the progressive config with the given matching algorithm.

    Loads "cactus_progressive_config.xml" from the cactus root path, sets the
    "matching_algorithm" attribute of its <reference> element, and writes the
    result to a new temporary .xml file in the current directory.

    Returns the absolute path of the written config file.
    """
    outputPath = getTempFile(rootDir="./", suffix=".xml")
    templatePath = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    configRoot = ET.parse(templatePath).getroot()
    referenceElem = configRoot.find("reference")
    referenceElem.attrib["matching_algorithm"] = matchingAlgorithm
    ET.ElementTree(configRoot).write(outputPath)
    return os.path.abspath(outputPath)
Exemple #41
0
def getCactusInputs_randomWithConstraints(regionNumber=0, tempDir=None):
    """Build random cactus inputs plus a file of random pairwise constraints.

    The sequences and tree come from getCactusInputs_random; the constraints
    are the alignments yielded by makeRandomConstraints over the FASTAs of
    those sequences, written with cigarWrite (without probabilities) to a
    fresh temporary file under tempDir.

    Returns a (sequenceDirs, newickTreeString, constraintsPath) tuple.
    """
    sequenceDirs, newickTreeString = getCactusInputs_random(regionNumber=regionNumber, tempDir=tempDir)
    constraints = getTempFile(rootDir=tempDir)
    # The context manager guarantees the handle is closed (and flushed) even
    # if generating or writing a constraint raises part-way through.
    with open(constraints, 'w') as fileHandle:
        for pairwiseAlignment in makeRandomConstraints(getFastasFromSequence(sequenceDirs)):
            cigarWrite(fileHandle, pairwiseAlignment, withProbs=False)
    return sequenceDirs, newickTreeString, constraints
def sortCigarByContigAndPos(cigarPath, contigNum):
    """Sort a cigar file by one contig's name and then its start position.

    contigNum == 1 selects columns 2 (name) and 3 (start); any other value
    selects columns 6 and 7. The start column is sorted numerically.

    Returns the path of a new temporary file holding the sorted output.
    """
    # (name column, start column) as 1-based `sort -k` field numbers.
    nameCol, startCol = (2, 3) if contigNum == 1 else (6, 7)
    sortedPath = getTempFile()
    sortArgs = (nameCol, nameCol, startCol, startCol, cigarPath, sortedPath)
    system("sort -k %d,%d -k %d,%dn %s > %s" % sortArgs)
    return sortedPath
    def progressiveFunction(self, experimentFile, toilDir,
                            batchSystem, buildAvgs,
                            buildReference,
                            buildHal,
                            buildFasta,
                            toilStats,
                            subtreeRoot=None):
        """Run progressive cactus on an experiment and sanity-check the output headers.

        Creates a multi-cactus project from experimentFile in a temporary
        directory under the current working directory, runs the progressive
        workflow, then checks that every sequence header found in the
        per-subproblem .c2h files belongs to the corresponding input FASTA.
        Finally verifies that the toil run completed and deletes the temporary
        directory.

        buildReference, buildHal and buildFasta are accepted but not used in
        this body.
        """
        workDir = getTempDirectory(os.getcwd())
        projectDir = os.path.join(workDir, "exp")
        runCreateMultiCactusProject(experimentFile,
                                    projectDir,
                                    fixNames=False,
                                    root=subtreeRoot)
        logger.info("Put the temporary files in %s" % projectDir)

        projectXml = os.path.join(projectDir, "exp_project.xml")
        runCactusProgressive(projectXml,
                             toilDir,
                             batchSystem=batchSystem,
                             buildAvgs=buildAvgs,
                             toilStats=toilStats)

        # Collect the FASTA headers of every input genome so they can be
        # compared against the headers recorded in the output (the sequences
        # themselves may differ in repeat-masking).
        exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
        seqMap = exp.buildSequenceMap()
        # genome name -> list of headers in its input fasta
        headers = {}
        for genomeName, sequencePath in seqMap.items():
            fastaPath = sequencePath
            if os.path.isdir(sequencePath):
                # A directory of FASTAs: concatenate them into one temp file.
                fastaPath = getTempFile()
                system("cat %s/* > %s" % (sequencePath, fastaPath))
            headers[genomeName] = [record[0] for record in fastaRead(fastaPath)]

        # Walk the sequence lines of each subproblem's .c2h output.
        for expPath in glob.glob('%s/*/*_experiment.xml' % (projectDir)):
            subExp = ExperimentWrapper(ET.parse(expPath).getroot())
            outgroups = subExp.getOutgroupEvents()
            c2hPath = subExp.getHALPath()
            with open(c2hPath) as c2hFile:
                for line in c2hFile:
                    fields = line.split('\t')
                    if fields[0] != 's':
                        # Not a sequence line
                        continue
                    # Drop the first and last character of each field.
                    genome = fields[1][1:-1]
                    header = fields[2][1:-1]
                    if genome not in headers or genome in outgroups:
                        # Only non-outgroup input genomes are checked
                        continue
                    self.assertTrue(header in headers[genome],
                                    'Header %s from output c2h %s not found in input fa %s'
                                    ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

        runToilStatusAndFailIfNotComplete(toilDir)
        system("rm -rf %s" % workDir)
 def setUp(self):
     """Write the small FASTA and cigar fixture files used by the tests.

     Creates four temporary FASTA files (A with two sequences; B, C and D
     with one sequence each) and one temporary cigar file, storing their
     paths on self. Some cigar records reference sequence ids
     (id=12|simpleSeqZ1, id=303|simpleSeqNonExistent) that are not present
     in any of the FASTA files written here.

     Each file is written inside a ``with`` block so the data is flushed and
     the handle closed before any test reads it.
     """
     unittest.TestCase.setUp(self)
     # simple test data -- not an actual alignment, but to test if
     # coverage is correct. no overlap on B, but overlap on A.
     self.simpleFastaPathA = getTempFile()
     with open(self.simpleFastaPathA, 'w') as fastaHandle:
         fastaHandle.write(
             dedent('''\
     >id=0|simpleSeqA1 otherTokens thatDon'tMatter
     ACTAGAGTAGGAGAGAGAGGGGGG
     CATGCATGCATGCATGCATGCATG
     >id=1|simpleSeqA2 otherTokens thatDon'tMatter
     AAAAAAAAAAAAAAAACTCGTGAG
     CATGCATGCATGCATGCATGCATG'''))
     self.simpleFastaPathB = getTempFile()
     with open(self.simpleFastaPathB, 'w') as fastaHandle:
         fastaHandle.write(
             dedent('''\
     >id=2|simpleSeqB1 otherTokens
     CATGCATGCATGCATGCATGCATG
     CATGCATGCATGCATGCATGCATG'''))
     self.simpleFastaPathC = getTempFile()
     with open(self.simpleFastaPathC, 'w') as fastaHandle:
         fastaHandle.write(
             dedent('''\
     >id=3|simpleSeqC1 otherTokens thatDon'tMatter
     CATGCATGCATGCATGCATGCATG
     CATGCATGCATGCATGCATGCATG'''))
     self.simpleFastaPathD = getTempFile()
     with open(self.simpleFastaPathD, 'w') as fastaHandle:
         fastaHandle.write(
             dedent('''\
     >id=4|simpleSeqD otherTokens thatDon'tMatter
     CATGCATGCATGCATGCATGCATG
     CATGCATGCATGCATGCATGCATG'''))
     self.simpleCigarPath = getTempFile()
     with open(self.simpleCigarPath, 'w') as cigarHandle:
         cigarHandle.write(
             dedent('''\
     cigar: id=2|simpleSeqB1 0 9 + id=0|simpleSeqA1 10 0 - 0 M 8 D 1 M 1
     cigar: id=2|simpleSeqB1 9 18 + id=0|simpleSeqA1 2 6 + 0 M 3 I 5 M 1
     cigar: id=2|simpleSeqB1 18 28 + id=1|simpleSeqA2 0 10 + 0 M 1 I 2 M 2 D 2 M 5
     cigar: id=2|simpleSeqB1 28 30 + id=1|simpleSeqA2 6 8 + 0 M 2
     cigar: id=2|simpleSeqB1 30 32 + id=1|simpleSeqA2 7 9 + 0 M 2
     cigar: id=12|simpleSeqZ1 0 1 + id=0|simpleSeqA1 6 7 + 0 M 1
     cigar: id=3|simpleSeqC1 0 5 + id=4|simpleSeqD 0 5 + 0 M 5
     cigar: id=4|simpleSeqD 5 10 + id=3|simpleSeqC1 5 10 + 0 M 5
     cigar: id=3|simpleSeqC1 10 15 + id=3|simpleSeqC1 15 20 + 0 M 5
     cigar: id=303|simpleSeqNonExistent 0 10 + id=3|simpleSeqC1 0 10 + 0 M 10
     '''))
 def testCactusRealign(self):
     """Runs cactus realign using the default parameters and checks that the realigned output cigars align
     the same subsequences.

     For every sequence pair, lastz output is realigned and the two cigar
     files are compared line by line with sameCoordinates. The comparison
     stops at the end of the shorter file (zip semantics).
     """
     for seqFile1, seqFile2 in seqFilePairGenerator():
         lastzOutput = getTempFile(rootDir=self.tempDir)
         runLastz(seqFile1, seqFile2, alignmentsFile=lastzOutput,
                  lastzArguments=self.defaultLastzArguments)
         realignOutput = getTempFile(rootDir=self.tempDir)
         runCactusRealign(seqFile1, seqFile2, inputAlignmentsFile = lastzOutput,
                          outputAlignmentsFile = realignOutput,
                          realignArguments=self.defaultRealignArguments)

         # Read both files up front inside context managers so the handles
         # are closed deterministically rather than left to the garbage
         # collector.
         with open(lastzOutput, 'r') as lastzHandle:
             lastzLines = [ line for line in lastzHandle if line != '' ]
         with open(realignOutput, 'r') as realignHandle:
             realignLines = [ line for line in realignHandle if line != '' ]

         for lastzLine, realignLine in zip(lastzLines, realignLines):
             lastzCigar = cigarReadFromString(lastzLine)
             realignCigar = cigarReadFromString(realignLine)
             # The lastz cigar is the receiver, matching the object the
             # original (mis-named) code invoked sameCoordinates on.
             self.assertTrue(lastzCigar.sameCoordinates(realignCigar))
 def run(self):
     """Sort the byte range [fileStart, fileEnd) of inputFile into outputFile.

     Ranges no longer than N are copied out and sorted directly. Longer
     ranges are split at a mid point into two child Down targets writing to
     temporary files, with an Up follow-on target that receives both
     temporary files and outputFile.
     """
     length = self.fileEnd - self.fileStart
     self.logToMaster("Am running a down target with length: %i from input file: %s" % (length, self.inputFile))
     assert length >= 0

     if length <= self.N:
         #Small enough: sort this bit of the file directly
         copySubRangeOfFile(self.inputFile, self.fileStart, self.fileEnd, self.outputFile)
         sort(self.outputFile)
         return

     #Too big: subdivide the file around the mid point
     midPoint = getMidPoint(self.inputFile, self.fileStart, self.fileEnd)
     assert midPoint >= self.fileStart
     assert midPoint+1 < self.fileEnd
     splitPoint = midPoint + 1 #Add one to avoid the newline
     lowerHalf = getTempFile(rootDir=self.getGlobalTempDir())
     upperHalf = getTempFile(rootDir=self.getGlobalTempDir())
     self.addChildTarget(Down(self.inputFile, self.fileStart, splitPoint, self.N, lowerHalf))
     self.addChildTarget(Down(self.inputFile, splitPoint, self.fileEnd, self.N, upperHalf))
     self.setFollowOnTarget(Up(lowerHalf, upperHalf, self.outputFile))
Exemple #47
0
 def run(self):
     """Split each genome's bed file into chunks and schedule one child per chunk.

     Every bed file in bedFileDict is cut into pieces of
     ceil(numLines / jobsPerGenome) lines (at least one line per piece).
     Each piece gets a RunAncestorsML child target writing to its own
     temporary output; a WriteNucleotides follow-on receives the outputs
     grouped by genome.
     """
     outputsPerGenome = {}
     for genome, bedFile in self.bedFileDict.items():
         genomeOutputs = []
         outputsPerGenome[genome] = genomeOutputs
         numLines = int(popenCatch("wc -l %s | cut -d' ' -f 1" % bedFile))
         # A chunk size of zero is bumped to one line.
         linesPerJob = int(math.ceil(float(numLines) / self.jobsPerGenome)) or 1
         for start in xrange(0, numLines, linesPerJob):
             headCount = start + linesPerJob
             end = min(headCount, numLines)
             chunkBed = getTempFile(rootDir=self.getGlobalTempDir())
             # head takes everything up to the chunk end, tail keeps the chunk.
             system("head -n %d %s | tail -n %d > %s" % (headCount, bedFile, end - start, chunkBed))
             chunkOutput = getTempFile(rootDir=self.getGlobalTempDir())
             self.addChildTarget(RunAncestorsML(self.halFile, genome, chunkBed, self.phyloPModel, chunkOutput))
             genomeOutputs.append(chunkOutput)
     self.setFollowOnTarget(WriteNucleotides(outputsPerGenome, self.halFile))
def writeSequenceData(target, genome, hal, hubDir):
    """Write the .2bit and chrom.sizes for a genome.

    Creates hubDir/genome if needed, extracts the genome from the HAL file
    with hal2fasta into a temporary FASTA, converts it to
    hubDir/genome/<genome>.2bit with faToTwoBit and writes
    hubDir/genome/chrom.sizes with twoBitInfo.

    The temporary FASTA is removed even when one of the external commands
    fails. `target` is accepted but not used in this body.
    """
    genomeDir = os.path.join(hubDir, genome)
    if not os.path.isdir(genomeDir):
        os.makedirs(genomeDir)
    twoBitPath = os.path.join(genomeDir, genome + '.2bit')
    fasta = getTempFile()
    try:
        system("hal2fasta %s %s > %s" % (hal, genome, fasta))
        system("faToTwoBit %s %s" % (fasta, twoBitPath))
        system("twoBitInfo %s %s" % (twoBitPath, os.path.join(genomeDir, 'chrom.sizes')))
    finally:
        # Do not leave the (potentially large) FASTA behind on failure.
        os.remove(fasta)
Exemple #49
0
def runReferenceMedianProblemTest(medianHistory, greedyIterations, theta):
    """Runs the reference problem for a given median history

    Builds a weighted adjacency list from the transitive adjacencies of the
    leaf genomes (weights from weightFn(distance, theta), summed per node
    pair), writes it as a single tab-separated record to a temporary file,
    and feeds it on stdin to the "referenceMedianProblemTest2" binary found
    in the "testBin" directory next to matchingAndOrdering.tests.simulatedGenome.
    The whitespace-separated integers in the binary's output are translated
    back to signed elements and collected into a one-chromosome Genome.

    Asserts that the resulting genome has the same elements as the median
    genome of medianHistory, and returns it.
    """
    #Make adjacencies
    stubNumber = 2
    nodeNumber = len(
        medianHistory.getMedianGenome().getElements()) * 2 + stubNumber
    weights = {}
    for genome in medianHistory.getLeafGenomes():
        for node1, node2, distance in genome.getTransitiveAdjacencies():
            if (node1, node2) in weights:
                weights[(node1, node2)] += weightFn(distance, theta)
            else:
                weights[(node1, node2)] = weightFn(distance, theta)

    def translateLeftSideOfElementToNode(element):
        # Negative elements map to even nodes, positive elements to odd ones.
        assert element != 0
        if element < 0:
            return abs(element) * 2
        return element * 2 + 1

    def translateLeftNodeToElement(node):
        assert node >= stubNumber
        assert node < nodeNumber
        # Floor division keeps the element an int on Python 3 as well as
        # Python 2 (plain "/" would produce a float under Python 3).
        element = node // 2
        if (node % 2) == 0:
            element *= -1
        return element

    #Now build the input record for the binary
    adjacencyFields = "\t".join([
        "%i\t%i\t%f" %
        (translateLeftSideOfElementToNode(-node1),
         translateLeftSideOfElementToNode(node2), weight)
        for (node1, node2), weight in weights.items()
    ])
    problemInput = "%i\t%i\t%i\t%i\t%s" % (
        greedyIterations, nodeNumber, stubNumber, len(weights),
        adjacencyFields)
    tempPath = getTempFile()
    try:
        with open(tempPath, 'w') as tempFile:
            tempFile.write(problemInput)
        #Command
        command = os.path.join(
            os.path.split(
                os.path.abspath(
                    matchingAndOrdering.tests.simulatedGenome.__file__))[0],
            "testBin", "referenceMedianProblemTest2")
        output = popenCatch(command + " < %s" % tempPath)
    finally:
        # Remove the input file even if the binary fails.
        os.remove(tempPath)
    medianChromosome = Chromosome()
    for adjacency in output.split():
        medianChromosome.append(translateLeftNodeToElement(int(adjacency)))
    medianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    medianGenome.addChromosome(medianChromosome)
    assert medianGenome.getElements() == medianHistory.getMedianGenome(
    ).getElements()
    return medianGenome
Exemple #50
0
 def testJobTreeStats_SortSimple(self):
     """Exercise jobTreeStats on a job tree produced by the scriptTree sort example.

     Repeated self.testNo times: generate a file to sort, run
     scriptTreeTest_Sort.py on it with --stats, run jobTreeStats against the
     resulting job tree, then delete the working directory.
     """
     lineCount = 100000
     longestLine = 10
     chunkSize = 1000
     for _ in xrange(self.testNo):
         workDir = getTempDirectory(os.getcwd())
         unsortedFile = getTempFile(rootDir=workDir)
         statsFile = getTempFile(rootDir=workDir)
         jobTreeDir = os.path.join(workDir, "jobTree")
         makeFileToSort(unsortedFile, lineCount, longestLine)
         #Sort the file
         sortArgs = (jobTreeDir, unsortedFile, chunkSize)
         system("scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %s --stats --jobTime 0.5" % sortArgs)
         #Collect the stats
         system("jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, statsFile))
         #Cleanup
         system("rm -rf %s" % workDir)
Exemple #51
0
 def makeRunnable(self, tempDir):
     """Pickle this object to a temp file and return the command that runs it.

     The object is dumped with cPickle.HIGHEST_PROTOCOL into a file obtained
     from getTempFile(".pickle", tempDir). The returned command has the form
     "scriptTree <pickleFile> <importStrings...>", where the import strings
     are the de-duplicated entries of self.target.importStrings joined by
     spaces (in set iteration order).
     """
     pickleFile = getTempFile(".pickle", tempDir)
     # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
     # binary mode; the context manager also closes it if dump() raises.
     with open(pickleFile, 'wb') as fileHandle:
         cPickle.dump(self, fileHandle, cPickle.HIGHEST_PROTOCOL)
     classNames = " ".join(set(self.target.importStrings))
     return "scriptTree %s %s" % (pickleFile, classNames)
Exemple #52
0
    def testRandom(self):
        """Makes random sequences and tests that Ortheus can align them and produce a valid output.
        """
        outputFile = getTempFile()
        self.tempFiles.append(outputFile)

        MAX_SEQS = 20

        for i in xrange(MAX_SEQS):
            self.tempFiles.append(getTempFile())

        for test in xrange(0, self.testNo):
            print "test no : %i " % test
            #seqNo
            binaryTree = randomTree()
            middleSeq = getRandomSequence(250)[1]
            seqs = []
            getTreeSeqs(binaryTree, middleSeq, seqs)

            if len(seqs) <= MAX_SEQS and len(seqs) > 2:
                seqFiles = []
                for i in xrange(0, len(seqs)):
                    seqFiles.append(self.tempFiles[1 + i])
                    fileHandle = open(seqFiles[i], 'w')
                    fastaWrite(fileHandle, "%i" % i, seqs[i])
                    fileHandle.close()
                print "Have seq files ", seqFiles

                treeString = printBinaryTree(binaryTree, True)
                print "For tree ", treeString

                #align seqs and check no failure
                command = "ortheus_core -a %s -b '%s' -d %s -e" % (
                    " ".join(seqFiles), treeString, outputFile)
                print "command to call", command
                system(command)

                #check alignment is complete
                alignment = [i[:] for i in fastaAlignmentRead(outputFile)]
                #print "alignment", alignment
                checkAlignment(alignment, seqs)

                print "test no is finished : %i " % test
Exemple #53
0
 def progressiveFunction(self, experimentFile, toilDir,
                         batchSystem, buildAvgs,
                         buildHal,
                         buildFasta,
                         toilStats,
                         subtreeRoot=None):
     """Start preparing a sequence file from the given experiment file.

     NOTE(review): this body appears to be truncated. As written it only
     parses the experiment XML, creates a temporary file, opens it for
     writing and fetches the experiment's tree; nothing is written to the
     file and no parameter other than experimentFile is used.
     """
     # Wrap the parsed experiment XML root.
     eW = ExperimentWrapper(ET.parse(experimentFile).getroot())
     seqFile = getTempFile()
     with open(seqFile, 'w') as f:
         # The tree is fetched but not used further in the visible code.
         tree = eW.getTree()
Exemple #54
0
 def runScript(self, binaryName, outputFile, specialOptions):
     """Run a binary from the root "bin" directory unless its output already exists.

     The binary writes to a temporary path in the local temp directory,
     which is then moved to outputFile. Nothing is done when outputFile is
     already present.
     """
     if os.path.exists(outputFile):
         return
     tempOutputFile = getTempFile(rootDir=self.getLocalTempDir())
     # Only the path is wanted; delete the placeholder file before the run.
     os.remove(tempOutputFile)
     binaryPath = os.path.join(getRootPathString(), "bin", binaryName)
     commandArgs = (binaryPath,
                    getCactusDiskString(self.alignment),
                    tempOutputFile,
                    self.options.minimumNsForScaffoldGap,
                    self.options.sampleNumber,
                    specialOptions)
     system("%s --cactusDisk '%s' --outputFile %s --minimumNsForScaffoldGap %s --sampleNumber %s %s" % commandArgs)
     system("mv %s %s" % (tempOutputFile, outputFile))
Exemple #55
0
 def testCactusCallPipes(self):
     """Check that cactus_call chains several commands and returns the final output.

     Writes "foobar" to a temp file, pipes it through cat, sed and awk via
     cactus_call, and expects "quuxbazbar" back.
     """
     inputFile = getTempFile(rootDir=self.tempDir)
     with open(inputFile, 'w') as handle:
         handle.write('foobar\n')
     # using 'cat' here rather than infile is intentional; it tests
     # whether the directory is mounted into containers correctly.
     pipeline = [
         ['cat', inputFile],
         ['sed', 's/foo/baz/g'],
         ['awk', '{ print "quux" $0 }'],
     ]
     result = cactus_call(parameters=pipeline, check_output=True)
     self.assertEqual(result, 'quuxbazbar\n')