Ejemplo n.º 1
0
 def test_hello_world(self):
     """
     Run the helloWorld example as a module with the current interpreter,
     passing 'file:./toilTest' as its positional argument and selecting the
     Mesos batch system with the master at localhost:5050.
     """
     command = [sys.executable]
     command += ['-m', helloWorld.__name__]
     command.append('file:./toilTest')
     command += ['--batchSystem', 'mesos']
     command += ['--mesosMaster', 'localhost:5050']
     # Forward the log level reported by getLogLevelString() to the child.
     command += ['--logLevel', getLogLevelString()]
     system(command)
Ejemplo n.º 2
0
 def testAWSProvisionerUtils(self):
     """
     Launch and destroy an AWS cluster, then launch a second cluster under the
     same name, open an ssh session to it and destroy it again.

     Each destroy runs in a ``finally`` clause, so it is attempted even when
     the preceding launch or ssh step raises.
     """
     clusterName = 'cluster-utils-test' + str(uuid.uuid4())
     awsFlag = '--provisioner=aws'
     keyPairFlag = '--keyPairName=jenkins@jenkins-master'

     try:
         firstLaunch = [self.toilMain, 'launch-cluster', '--nodeType=t2.micro']
         firstLaunch += [keyPairFlag, clusterName, awsFlag]
         system(firstLaunch)
     finally:
         system([self.toilMain, 'destroy-cluster', awsFlag, clusterName])

     try:
         # launch preemptable master with same name
         secondLaunch = [self.toilMain, 'launch-cluster',
                         '--nodeType=m3.medium:0.2']
         secondLaunch += [keyPairFlag, clusterName, awsFlag, '--logLevel=DEBUG']
         system(secondLaunch)
         system([self.toilMain, 'ssh-cluster', awsFlag, clusterName])
     finally:
         system([self.toilMain, 'destroy-cluster', awsFlag, clusterName])
Ejemplo n.º 3
0
    def testUtilsStatsSort(self):
        """
        Tests the stats command on a complete run of the sort example.

        Runs the sort script with --stats, checks that self.toilDir exists
        afterwards, runs 'toil stats --pretty' on it and finally compares the
        lines of self.tempFile (passed as --fileToSort) with self.correctSort.
        """
        # Get the sort command to run. Only attributes of self are interpolated,
        # so pass self explicitly instead of exposing every local via locals().
        toilCommandString = ("{self.sort} "
                             "{self.toilDir} "
                             "--logLevel=DEBUG "
                             "--fileToSort={self.tempFile} "
                             "--N {self.N} --stats "
                             "--retryCount 99".format(self=self))

        # Run the script for the first time
        system(toilCommandString)
        self.assertTrue(os.path.exists(self.toilDir))

        # Check we can run 'toil stats'
        toilStatsString = ("{self.toilMain} stats "
                           "{self.toilDir} --pretty".format(self=self))
        system(toilStatsString)

        # Check the file is properly sorted
        with open(self.tempFile, 'r') as fileHandle:
            sortedLines = fileHandle.readlines()
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(self.correctSort, sortedLines)
Ejemplo n.º 4
0
    def testUtilsStatsSort(self):
        """
        Tests the stats command on a complete run of the sort example.

        The sort workflow is run as a module with --stats and the --badWorker
        options, after which self.statsCommand is run and the lines of
        self.tempFile are compared with self.correctSort.
        """
        # Get the sort command to run
        toilCommand = [
            sys.executable,
            "-m",
            toil.test.sort.sort.__name__,
            self.toilDir,
            "--logLevel=DEBUG",
            "--fileToSort",
            self.tempFile,
            "--N",
            str(self.N),
            "--stats",
            "--retryCount=99",
            "--badWorker=0.5",
            "--badWorkerFailInterval=0.01",
        ]

        # Run the script for the first time
        system(toilCommand)
        self.assertTrue(os.path.exists(self.toilDir))

        # Check we can run 'toil stats'
        system(self.statsCommand)

        # Check the file is properly sorted
        with open(self.tempFile, "r") as fileHandle:
            sortedLines = fileHandle.readlines()
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(self.correctSort, sortedLines)
Ejemplo n.º 5
0
    def testUtilsStatsSort(self):
        """
        Tests the stats command on a complete run of the sort example.

        The sort workflow is run as a module with --stats and the --badWorker
        options, writing its result to self.outputFile. Afterwards
        self.statsCommand is run, the lines of self.outputFile are compared
        with self.correctSort, and the output file is deleted.
        """
        # Get the sort command to run
        toilCommand = [sys.executable,
                       '-m', toil.test.sort.sort.__name__,
                       self.toilDir,
                       '--logLevel=DEBUG',
                       '--fileToSort', self.tempFile,
                       '--outputFile', self.outputFile,
                       '--N', str(self.N),
                       '--stats',
                       '--retryCount=99',
                       '--badWorker=0.5',
                       '--badWorkerFailInterval=0.01']

        # Run the script for the first time
        system(toilCommand)
        self.assertTrue(os.path.exists(self.toilDir))

        # Check we can run 'toil stats'
        system(self.statsCommand)

        # Check the file is properly sorted
        with open(self.outputFile, 'r') as fileHandle:
            sortedLines = fileHandle.readlines()
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(self.correctSort, sortedLines)

        # Delete output file
        os.remove(self.outputFile)
Ejemplo n.º 6
0
    def testUtilsStatsSort(self):
        """
        Run the sort example to completion with statistics enabled, then check
        that the stats command works and that the output file is sorted.
        """
        # Assemble the sort invocation piece by piece.
        toilCommand = [sys.executable, '-m']
        toilCommand.append(toil.test.sort.sort.__name__)
        toilCommand.append(self.toilDir)
        toilCommand.append('--logLevel=DEBUG')
        toilCommand.extend(['--fileToSort', self.tempFile])
        toilCommand.extend(['--outputFile', self.outputFile])
        toilCommand.extend(['--N', str(self.N)])
        toilCommand.extend(['--stats', '--retryCount=99'])
        toilCommand.extend(['--badWorker=0.5', '--badWorkerFailInterval=0.01'])

        # First (and only) run of the workflow; it must leave self.toilDir
        # behind.
        system(toilCommand)
        toilDirExists = os.path.exists(self.toilDir)
        self.assertTrue(toilDirExists)

        # 'toil stats' must succeed on the finished run.
        system(self.statsCommand)

        # The output must match the expected sorted lines.
        with open(self.outputFile, 'r') as sortedFile:
            self.assertEqual(self.correctSort, sortedFile.readlines())

        # Remove the output file now that it has been verified.
        os.remove(self.outputFile)
Ejemplo n.º 7
0
 def test_hello_world(self):
     """
     Run the helloWorld example as a module with the current interpreter,
     passing './toilTest' as its positional argument and selecting the Mesos
     batch system.
     """
     command = [sys.executable]
     command += ["-m", helloWorld.__name__]
     command += ["./toilTest", "--batchSystem=mesos"]
     # Forward the log level reported by getLogLevelString() to the child.
     command += ["--logLevel", getLogLevelString()]
     system(command)
Ejemplo n.º 8
0
    def testAWSProvisionerUtils(self):
        """
        Launches an AWS cluster with custom tags and verifies the leader's tags.

        Launches a cluster with three custom tags, looks the cluster up via
        cluster_factory, checks that the leader carries the custom tags as
        well as the expected 'Name' and 'Owner' tags, and destroys the cluster
        in a ``finally`` clause whether or not the checks passed.
        """
        # TODO: Run these for the other clouds.
        clusterName = f'cluster-utils-test{uuid.uuid4()}'
        # Default to '' so that an unset TOIL_AWS_KEYNAME falls back to
        # 'id_rsa' instead of raising AttributeError on None.strip().
        keyName = os.getenv('TOIL_AWS_KEYNAME', '').strip() or 'id_rsa'

        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
            logger.debug("Found AWSProvisioner: %s.", AWSProvisioner.__module__)

            # launch master with an assortment of custom tags
            system([
                self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t',
                'key2=value2', '--tag', 'key3=value3',
                '--leaderNodeType=m3.medium', '--keyPairName=' + keyName,
                clusterName, '--provisioner=aws', '--zone=us-west-2a',
                '--logLevel=DEBUG'
            ])

            # Call the imported factory directly rather than through the
            # toil.provisioners module path, which needs a separate `toil`
            # binding in scope.
            from toil.provisioners import cluster_factory
            cluster = cluster_factory(provisioner='aws',
                                      clusterName=clusterName)
            leader = cluster.getLeader()

            # check that the leader carries the appropriate tags
            tags = {
                'key1': 'value1',
                'key2': 'value2',
                'key3': 'value3',
                'Name': clusterName,
                'Owner': keyName
            }
            for key, expected in tags.items():
                self.assertEqual(expected, leader.tags.get(key))
        finally:
            system([
                self.toilMain, 'destroy-cluster', '--provisioner=aws',
                clusterName
            ])
Ejemplo n.º 9
0
    def testAWSProvisionerUtils(self):
        """
        Exercise cluster launch, ssh and destroy against AWS.

        A first cluster is launched and immediately destroyed. A second one is
        then launched under the same name, reached with 'ssh-cluster', and used
        to check that AWSProvisioner.sshLeader hands awkward argument strings
        to the remote command unchanged and raises RuntimeError for a failing
        remote command. The second cluster is destroyed in a ``finally`` clause.
        """
        clusterName = 'cluster-utils-test' + str(uuid.uuid4())
        destroyArgs = ['destroy-cluster', '--provisioner=aws', clusterName]

        try:
            system([self.toilMain, 'launch-cluster', '--nodeType=t2.micro',
                    '--keyPairName=jenkins@jenkins-master', clusterName,
                    '--provisioner=aws'])
        finally:
            system([self.toilMain] + destroyArgs)

        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
            # launch preemptable master with same name
            system([self.toilMain, 'launch-cluster',
                    '--nodeType=m3.medium:0.2',
                    '--keyPairName=jenkins@jenkins-master', clusterName,
                    '--provisioner=aws', '--logLevel=DEBUG'])
            system([self.toilMain, 'ssh-cluster', '--provisioner=aws',
                    clusterName])

            # Each string is sent as argv[1] of a remote python process whose
            # stdin program asserts that it arrived unmodified.
            specialStrings = ("'foo'", '"foo"', '  foo', '$PATH', '"', "'",
                              '\\', '| cat', '&& cat', '; cat')
            for special in specialStrings:
                logger.info('Testing SSH with special string: %s', special)
                AWSProvisioner.sshLeader(
                    clusterName=clusterName,
                    args=['python', '-', special],
                    input="import sys; assert sys.argv[1]==%r" % special)

            # A command that cannot run must surface as RuntimeError.
            try:
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['nonsenseShouldFail'])
            except RuntimeError:
                remoteFailureRaised = True
            else:
                remoteFailureRaised = False
            if not remoteFailureRaised:
                self.fail('The remote command failed silently where it should '
                          'have raised an error')
        finally:
            system([self.toilMain] + destroyArgs)
Ejemplo n.º 10
0
def parasolIsInstalled():
    """
    Report whether parasol is available by running 'parasol status'.

    Returns True when system() returns 0 for the command, and False when it
    returns anything else or raises CalledProcessError.
    """
    try:
        exitStatus = system("parasol status")
    except CalledProcessError:
        return False
    return exitStatus == 0
Ejemplo n.º 11
0
def gridEngineIsInstalled():
    """
    Report whether grid-engine is available by running 'qstat -help'.

    Returns True when system() returns 0 for the command, and False when it
    returns anything else or raises CalledProcessError.
    """
    installed = False
    try:
        installed = system("qstat -help") == 0
    except CalledProcessError:
        # The probe command failed, so treat grid-engine as absent.
        pass
    return installed
Ejemplo n.º 12
0
def parasolIsInstalled():
    """
    Probe for parasol with 'parasol status'.

    :return: True if system() returns 0 for the probe command; False if it
             returns another value or raises CalledProcessError.
    """
    try:
        status = system("parasol status")
    except CalledProcessError:
        return False
    else:
        return status == 0
Ejemplo n.º 13
0
def gridEngineIsInstalled():
    """
    Probe for grid-engine with 'qstat -help'.

    :return: True if system() returns 0 for the probe command; False if it
             returns another value or raises CalledProcessError.
    """
    try:
        status = system("qstat -help")
    except CalledProcessError:
        return False
    else:
        return status == 0
Ejemplo n.º 14
0
    def testAWSProvisionerUtils(self):
        """
        Launch, ssh into and destroy AWS clusters through the toil entry point.

        Launches and destroys one cluster, then launches a second cluster with
        the same name and checks that AWSProvisioner.sshLeader delivers
        arguments containing quotes and shell metacharacters unchanged and
        raises RuntimeError when the remote command fails. Both clusters are
        destroyed in ``finally`` clauses.
        """
        def runToil(*arguments):
            # Invoke the toil entry point with the given sub-command and options.
            system([self.toilMain] + list(arguments))

        clusterName = 'cluster-utils-test' + str(uuid.uuid4())

        try:
            runToil('launch-cluster', '--nodeType=t2.micro',
                    '--keyPairName=jenkins@jenkins-master',
                    clusterName, '--provisioner=aws')
        finally:
            runToil('destroy-cluster', '--provisioner=aws', clusterName)

        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
            # launch preemptable master with same name
            runToil('launch-cluster', '--nodeType=m3.medium:0.2',
                    '--keyPairName=jenkins@jenkins-master',
                    clusterName, '--provisioner=aws', '--logLevel=DEBUG')
            runToil('ssh-cluster', '--provisioner=aws', clusterName)

            quotingCases = ["'foo'", '"foo"', '  foo', '$PATH', '"', "'"]
            quotingCases += ['\\', '| cat', '&& cat', '; cat']
            for case in quotingCases:
                logger.info('Testing SSH with special string: %s', case)
                # The remote python reads this program from stdin and asserts
                # that its first argument arrived intact.
                remoteCheck = "import sys; assert sys.argv[1]==%r" % case
                sshArgs = ['python', '-', case]
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=sshArgs,
                                         input=remoteCheck)

            try:
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['nonsenseShouldFail'])
            except RuntimeError:
                # Expected: the bogus remote command was reported as an error.
                pass
            else:
                failureMessage = ('The remote command failed silently where '
                                  'it should have raised an error')
                self.fail(failureMessage)
        finally:
            runToil('destroy-cluster', '--provisioner=aws', clusterName)
Ejemplo n.º 15
0
 def testToilStats_SortSimple(self):
     """
     Tests the toilStats utility using the scriptTree_sort example.

     For each of self.testNo rounds a file to sort is generated, sorted by
     sort.py with --stats enabled, and toilStats is then run against the
     resulting toil directory. The per-round temporary directory is removed
     even when one of the commands fails.
     """
     import shutil

     for _ in xrange(self.testNo):
         tempDir = getTempDirectory(os.getcwd())
         try:
             tempFile = getTempFile(rootDir=tempDir)
             outputFile = getTempFile(rootDir=tempDir)
             toilDir = os.path.join(tempDir, "testToil")
             lines = 10000
             maxLineLength = 10
             N = 1000
             makeFileToSort(tempFile, lines, maxLineLength)
             # Sort the file
             rootPath = os.path.join(toilPackageDirPath(), "test", "sort")
             system("{rootPath}/sort.py "
                    "--toil {toilDir} "
                    "--logLevel=DEBUG "
                    "--fileToSort={tempFile} "
                    "--N {N} --stats "
                    "--jobTime 0.5 "
                    "--retryCount 99".format(rootPath=rootPath,
                                             toilDir=toilDir,
                                             tempFile=tempFile,
                                             N=N))
             # Now get the stats
             toilStats = self.getScriptPath('toilStats')
             system("{toilStats} "
                    "--toil {toilDir} "
                    "--outputFile {outputFile}".format(toilStats=toilStats,
                                                       toilDir=toilDir,
                                                       outputFile=outputFile))
         finally:
             # Cleanup. Done in finally so a failing command does not leak the
             # directory, and without a shell so unusual characters in the
             # path cannot break the removal.
             shutil.rmtree(tempDir)
Ejemplo n.º 16
0
    def testAWSProvisionerUtils(self):
        """
        Runs a number of the cluster utilities in sequence.

        Launches a cluster with custom tags.
        Verifies the tags exist.
        ssh's into the cluster.
        Checks that arguments with quotes and shell metacharacters survive ssh.
        Makes certain that TOIL_WORKDIR is set as expected in the ssh'ed cluster.
        Rsyncs a file to the leader and back and verifies the round trip.
        Destroys the cluster.
        """
        # TODO: Run these for the other clouds.
        clusterName = 'cluster-utils-test' + str(uuid.uuid4())
        keyName = os.getenv('TOIL_AWS_KEYNAME')
        # Bound up front so the cleanup in ``finally`` can tell whether the
        # download directory was ever created.
        tmpDir = None

        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner

            # launch master with an assortment of custom tags
            system([
                self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t',
                'key2=value2', '--tag', 'key3=value3',
                '--leaderNodeType=m3.medium', '--keyPairName=' + keyName,
                clusterName, '--provisioner=aws', '--zone=us-west-2a',
                '--logLevel=DEBUG'
            ])

            cluster = clusterFactory(provisioner='aws',
                                     clusterName=clusterName)
            leader = cluster.getLeader()

            # check that the leader carries the appropriate tags
            tags = {
                'key1': 'value1',
                'key2': 'value2',
                'key3': 'value3',
                'Name': clusterName,
                'Owner': keyName
            }
            for key, expected in tags.items():
                self.assertEqual(expected, leader.tags.get(key))

            # Test strict host key checking
            # Doesn't work when run locally.
            if keyName == 'jenkins@jenkins-master':
                try:
                    leader.sshAppliance(strict=True)
                except RuntimeError:
                    pass
                else:
                    self.fail(
                        "Host key verification passed where it should have failed"
                    )

            # Add the host key to known_hosts so that the rest of the tests can
            # pass without choking on the verification prompt.
            leader.sshAppliance('bash',
                                strict=True,
                                sshOptions=['-oStrictHostKeyChecking=no'])

            system([
                self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName
            ])

            testStrings = [
                "'foo'", '"foo"', '  foo', '$PATH', '"', "'", '\\', '| cat',
                '&& cat', '; cat'
            ]
            for test in testStrings:
                logger.debug('Testing SSH with special string: %s', test)
                # The remote python reads this program from stdin and asserts
                # that its first argument arrived unchanged.
                compareTo = "import sys; assert sys.argv[1]==%r" % test
                leader.sshAppliance('python', '-', test, input=compareTo)

            try:
                leader.sshAppliance('nonsenseShouldFail')
            except RuntimeError:
                pass
            else:
                self.fail(
                    'The remote command failed silently where it should have raised an error'
                )

            leader.sshAppliance(
                'python', '-c',
                "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"
            )

            # `toil rsync-cluster`
            # Testing special characters - string.punctuation
            # The backslash before the space is escaped explicitly; the
            # resulting string is the same as with the former invalid '\ '
            # escape sequence.
            fname = '!"#$%&\'()*+,-.;<=>:\\ ?@[\\\\]^_`{|}~'
            testData = os.urandom(3 * (10**6))
            with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
                relpath = os.path.basename(tmpFile.name)
                tmpFile.write(testData)
                tmpFile.flush()
                # Upload file to leader
                leader.coreRsync(args=[tmpFile.name, ":"])
                # Ensure file exists
                leader.sshAppliance("test", "-e", relpath)
            tmpDir = tempfile.mkdtemp()
            # Download the file again and make sure it's the same file
            # `--protect-args` needed because remote bash chokes on special characters
            leader.coreRsync(args=["--protect-args", ":" + relpath, tmpDir])
            # Read in binary mode: testData is random bytes, which cannot be
            # decoded as text and would never compare equal to a str.
            with open(os.path.join(tmpDir, relpath), "rb") as f:
                self.assertEqual(
                    f.read(), testData,
                    "Downloaded file does not match original file")
        finally:
            try:
                system([
                    self.toilMain, 'destroy-cluster', '--provisioner=aws',
                    clusterName
                ])
            finally:
                # Remove the download directory even if destroy-cluster failed.
                if tmpDir is not None:
                    shutil.rmtree(tmpDir)
Ejemplo n.º 17
0
def decompressFastaFile(fileName, tempFileName):
    """Decompresses the bzip2-compressed file fileName into tempFileName.

    The data is streamed through the standard-library bz2 module rather than
    through a shell pipeline, so file names containing spaces or shell
    metacharacters are handled correctly and cannot inject commands.

    NOTE(review): failures now surface as IOError/OSError from bz2/open rather
    than as the error raised by system(); confirm no caller depends on that.

    :param fileName: path of the .bz2 file to read.
    :param tempFileName: path of the file to write the decompressed data to;
           it is created or truncated.
    :return: tempFileName.
    """
    import bz2
    import shutil

    with bz2.BZ2File(fileName, "rb") as compressed:
        with open(tempFileName, "wb") as decompressed:
            # Copy in chunks so large files are never held in memory at once.
            shutil.copyfileobj(compressed, decompressed)
    return tempFileName
Ejemplo n.º 18
0
    def testUtilsSort(self):
        """
        Tests the status and stats commands of the toil command line utility using the
        sort example with the --restart flag.

        The workflow is started, restarted until it finishes (at most 17
        restarts), and at each stage the test checks whether the sort, status,
        stats and clean commands succeed or raise CalledProcessError as
        expected.
        """
        # Get the sort command to run
        toilCommand = [
            sys.executable, '-m', toil.test.sort.sort.__name__, self.toilDir,
            '--logLevel=DEBUG', '--fileToSort', self.tempFile, '--N',
            str(self.N), '--stats', '--retryCount=2', '--badWorker=0.5',
            '--badWorkerFailInterval=0.05'
        ]
        restartCommand = toilCommand + ['--restart']

        # Try restarting it to check that a JobStoreException is thrown
        self.assertRaises(CalledProcessError, system, restartCommand)
        # Check that trying to run it in restart mode does not create the jobStore
        self.assertFalse(os.path.exists(self.toilDir))

        # Status command
        # Run the script for the first time
        try:
            system(toilCommand)
            finished = True
        except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
            self.assertRaises(CalledProcessError, system, self.statusCommand)
            finished = False
        self.assertTrue(os.path.exists(self.toilDir))

        # Try running it without restart and check an exception is thrown
        self.assertRaises(CalledProcessError, system, toilCommand)

        # Now restart it until done
        totalTrys = 1
        while not finished:
            try:
                system(restartCommand)
                finished = True
            except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
                self.assertRaises(CalledProcessError, system,
                                  self.statusCommand)
                if totalTrys > 16:
                    self.fail('Exceeded a reasonable number of restarts')
                totalTrys += 1

        # Check the toil status command does not issue an exception
        system(self.statusCommand)

        # Check if we try to launch after its finished that we get a JobException
        self.assertRaises(CalledProcessError, system, restartCommand)

        # Check we can run 'toil stats'
        system(self.statsCommand)

        # Check the file is properly sorted
        with open(self.tempFile, 'r') as fileHandle:
            sortedLines = fileHandle.readlines()
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(self.correctSort, sortedLines)

        # Check we can run 'toil clean'
        system(self.cleanCommand)
Ejemplo n.º 19
0
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will randomly kill the worker process at a random time 
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        t.daemon = True
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    tempRootDir = config.workDir
    if tempRootDir is not None and not os.path.exists(tempRootDir):
        raise RuntimeError("The temporary directory specified by workDir: %s does not exist" % tempRootDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in. If tempRootDir is None, tempdir looks at environment variables to determine
    # where to put the tempDir.
    localWorkerTempDir = tempfile.mkdtemp(dir=tempRootDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    messages = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            while len(jobWrapper.stack) > 0:
                jobs = jobWrapper.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run
                #, as jobs are, as far as possible, read only in the leader.
                jobWrapper.stack.pop()
                
        #This cleans the old log file which may 
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        jobWrapper.logJobStoreFileID = None
        jobStore.update(jobWrapper) #Update first, before deleting the file
        if oldLogFile != None:
            jobStore.delete(oldLogFile)
            
        #Make a temporary file directory for the jobWrapper
        localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        startTime = time.time() 
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command != None:
                if jobWrapper.command.startswith( "_toil " ):
                    #Load the job
                    job = Job._loadJob(jobWrapper.command, jobStore)
                    
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)
                    
                    #Create a fileStore object for the job
                    fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, 
                                              blockFn)
                    #Get the next block function and list that will contain any messages
                    blockFn = fileStore._blockFn
                    messages = fileStore.loggingMessages

                    job._execute(jobWrapper=jobWrapper,
                                           stats=statsDict if config.stats else None,
                                           localTempDir=localTempDir,
                                           jobStore=jobStore,
                                           fileStore=fileStore)

                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir
                    
                else: #Is another command (running outside of jobs may be deprecated)
                    #Cleanup the cache from the previous job
                    cleanCacheFn(0)
                    
                    system(jobWrapper.command)
                    #Set a dummy clean cache fn
                    cleanCacheFn = lambda x : None
            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if Job.FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #No more jobs to run so quit
            if len(jobWrapper.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor jobWrapper.
            #We load the successor jobWrapper and transplant its command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()
            
            #Load the successor jobWrapper
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cores == successorCores
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJob.command
            jobWrapper.stack += successorJob.stack
            assert jobWrapper.memory >= successorJob.memory
            assert jobWrapper.cores >= successorJob.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJob to those to be deleted
            fileStore.jobsToDelete.add(successorJob.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)
            statsDict.workers.log = messages
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        Job.FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if Job.FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdOut, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        jobWrapper.logJobStoreFileID = jobStore.writeFile( tempWorkerLogPath, jobWrapper.jobStoreID )
        os.remove(tempWorkerLogPath)
        jobStore.update(jobWrapper)
    elif debugging: # write log messages
        truncateFile(tempWorkerLogPath)
        with open(tempWorkerLogPath, 'r') as logFile:
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID,text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or messages) and not workerFailed: # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0:
        #We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
Ejemplo n.º 20
0
 def test_hello_world(self):
     """Run the helloWorld module as a script with the mesos batch system.

     './toilTest' is passed as the positional argument and the log level is
     taken from getLogLevelString().
     """
     # Assemble the command line piece by piece so the call itself stays short.
     command = [sys.executable, '-m', helloWorld.__name__]
     command.append('./toilTest')
     command.append('--batchSystem=mesos')
     command.extend(['--logLevel', getLogLevelString()])
     system(command)
Ejemplo n.º 21
0
 def tearDown(self):
     """Run the ToilTest teardown, then remove self.tempDir with ``rm -rf``."""
     ToilTest.tearDown(self)
     # Build the shell command separately from running it.
     removeCommand = "rm -rf %s" % self.tempDir
     system(removeCommand)
Ejemplo n.º 22
0
 def tearDown(self):
     """Exit the context manager, remove self.testToil, then run the parent teardown."""
     # Leave the context manager with "no exception" arguments.
     self.contextManager.__exit__(None, None, None)
     removeCommand = "rm -rf %s" % self.testToil
     system(removeCommand)
     super(JobTest, self).tearDown()
Ejemplo n.º 23
0
def compressFastaFile(fileName):
    """Compress a fasta file with bzip2, keeping the original file.

    The file name is shell-quoted before it is interpolated into the command
    string, so a path containing spaces or shell metacharacters is passed to
    bzip2 as a single argument instead of being split or interpreted.

    :param fileName: path of the file to compress.
    :return: the path of the compressed file, i.e. ``fileName + ".bz2"``.
    """
    # Imported locally so the block does not depend on the file's import
    # header; pipes.quote is the Python 2 spelling of shlex.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote
    # NOTE(review): assumes system() hands string commands to a shell -- confirm.
    system("bzip2 --keep --fast %s" % quote(fileName))
    return fileName + ".bz2"
Ejemplo n.º 24
0
    def testAWSProvisionerUtils(self):
        """
        Exercises the cluster utilities against the AWS provisioner: launching and
        destroying a cluster, leader tags, SSH to the leader (including arguments with
        shell-special characters) and an rsync upload/download round trip.

        The key pair name is read from the TOIL_AWS_KEYNAME environment variable.
        """
        clusterName = 'cluster-utils-test' + str(uuid.uuid4())
        keyName = os.getenv('TOIL_AWS_KEYNAME')
        # Fail with a clear message instead of a TypeError from `str + None` below.
        self.assertIsNotNone(keyName, 'The TOIL_AWS_KEYNAME environment variable must be set')
        # Bound up front so the cleanup can tell whether the download directory
        # was ever created, without relying on catching NameError.
        tmpDir = None

        try:
            # --provisioner flag should default to aws, so we're not explicitly
            # specifying that here
            system([self.toilMain, 'launch-cluster', '--nodeType=t2.micro',
                    '--keyPairName=' + keyName, clusterName])
        finally:
            system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner

            userTags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
            tags = {'Name': clusterName, 'Owner': keyName}
            tags.update(userTags)

            # launch preemptable master with same name
            system([self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t', 'key2=value2', '--tag', 'key3=value3',
                    '--nodeType=m3.medium:0.2', '--keyPairName=' + keyName, clusterName,
                    '--provisioner=aws', '--logLevel=DEBUG'])

            # test leader tags
            leaderTags = AWSProvisioner._getLeader(clusterName).tags
            self.assertEqual(tags, leaderTags)

            # Test strict host key checking
            # Doesn't work when run locally.
            if keyName == 'jenkins@jenkins-master':
                try:
                    AWSProvisioner.sshLeader(clusterName=clusterName, strict=True)
                except RuntimeError:
                    pass
                else:
                    self.fail("Host key verification passed where it should have failed")

            # Add the host key to known_hosts so that the rest of the tests can
            # pass without choking on the verification prompt.
            AWSProvisioner.sshLeader(clusterName=clusterName, strict=True, sshOptions=['-oStrictHostKeyChecking=no'])

            system([self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName])

            testStrings = ["'foo'",
                           '"foo"',
                           '  foo',
                           '$PATH',
                           '"',
                           "'",
                           '\\',
                           '| cat',
                           '&& cat',
                           '; cat'
                           ]
            for test in testStrings:
                logger.info('Testing SSH with special string: %s', test)
                # The remote python reads this script from stdin and checks
                # that the argument arrived unmodified.
                compareTo = "import sys; assert sys.argv[1]==%r" % test
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['python', '-', test],
                                         input=compareTo)

            try:
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['nonsenseShouldFail'])
            except RuntimeError:
                pass
            else:
                self.fail('The remote command failed silently where it should have '
                          'raised an error')

            AWSProvisioner.sshLeader(clusterName=clusterName,
                                     args=['python', '-c', "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"])

            # `toil rsync-cluster`
            # Testing special characters - string.punctuation
            # (the backslash before the space is written as an explicit `\\`;
            # the resulting string is unchanged)
            fname = '!"#$%&\'()*+,-.;<=>:\\ ?@[\\\\]^_`{|}~'
            testData = os.urandom(3 * (10**6))
            with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
                relpath = os.path.basename(tmpFile.name)
                tmpFile.write(testData)
                tmpFile.flush()
                # Upload file to leader
                AWSProvisioner.rsyncLeader(clusterName=clusterName, args=[tmpFile.name, ":"])
                # Ensure file exists
                AWSProvisioner.sshLeader(clusterName=clusterName, args=["test", "-e", relpath])
            tmpDir = tempfile.mkdtemp()
            # Download the file again and make sure it's the same file
            # `--protect-args` needed because remote bash chokes on special characters
            AWSProvisioner.rsyncLeader(clusterName=clusterName, args=["--protect-args", ":" + relpath, tmpDir])
            # The payload is random bytes, so it must be read back in binary mode
            # to be comparable with testData.
            with open(os.path.join(tmpDir, relpath), "rb") as f:
                self.assertEqual(f.read(), testData, "Downloaded file does not match original file")
        finally:
            try:
                system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
            finally:
                # Remove the download directory even if destroying the cluster failed.
                if tmpDir is not None:
                    shutil.rmtree(tmpDir)
Ejemplo n.º 25
0
 def tearDown(self):
     """Exit the context manager, remove self.testToil, then run the parent teardown."""
     # Leave the context manager with "no exception" arguments.
     self.contextManager.__exit__(None, None, None)
     removeCommand = "rm -rf %s" % self.testToil
     system(removeCommand)
     super(JobTest, self).tearDown()
Ejemplo n.º 26
0
    def testUtilsSort(self):
        """
        Tests the restart, status and stats commands of the toil command line utility using the
        sort example.
        """
        # Get the sort command to run
        toilCommandString = ("{self.sort} "
                             "{self.toilDir} "
                             "--logLevel=DEBUG "
                             "--fileToSort={self.tempFile} "
                             "--N {self.N} --stats "
                             "--retryCount 2".format(self=self))

        # Try restarting it to check that a JobStoreException is thrown
        self.assertRaises(CalledProcessError, system, toilCommandString + " --restart")
        # Check that trying to run it in restart mode does not create the jobStore
        self.assertFalse(os.path.exists(self.toilDir))

        # Status command
        toilStatusCommandString = ("{self.toilMain} status "
                                   "{self.toilDir} "
                                   "--failIfNotComplete".format(self=self))

        # Run the script for the first time
        try:
            system(toilCommandString)
            finished = True
        except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
            self.assertRaises(CalledProcessError, system, toilStatusCommandString)
            finished = False
        self.assertTrue(os.path.exists(self.toilDir))

        # Try running it without restart and check an exception is thrown
        self.assertRaises(CalledProcessError, system, toilCommandString)

        # Now restart it until done. The number of restarts is capped so that a
        # workflow which never completes fails the test instead of hanging it.
        totalTrys = 1
        while not finished:
            try:
                system(toilCommandString + " --restart")
                finished = True
            except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
                self.assertRaises(CalledProcessError, system, toilStatusCommandString)
                if totalTrys > 16:
                    self.fail("Exceeded a reasonable number of restarts")
                totalTrys += 1

        # Check the toil status command does not issue an exception
        system(toilStatusCommandString)

        # Check if we try to launch after its finished that we get a JobException
        self.assertRaises(CalledProcessError, system, toilCommandString + " --restart")

        # Check we can run 'toil stats'
        toilStatsString = ("{self.toilMain} stats "
                           "{self.toilDir} "
                           "--pretty".format(self=self))
        system(toilStatsString)

        # Check the file is properly sorted
        with open(self.tempFile, 'r') as fileHandle:
            l2 = fileHandle.readlines()
            # assertEqual rather than the deprecated assertEquals alias.
            self.assertEqual(self.correctSort, l2)

        # Check we can run 'toil clean'
        toilCleanString = ("{self.toilMain} clean "
                           "{self.toilDir}".format(self=self))
        system(toilCleanString)
Ejemplo n.º 27
0
    def testAWSProvisionerUtils(self):
        """
        Exercises the cluster utilities against the AWS provisioner: launching and
        destroying a cluster, leader tags, SSH to the leader (including arguments with
        shell-special characters) and an rsync upload/download round trip.

        The key pair name is read from the TOIL_AWS_KEYNAME environment variable.
        """
        clusterName = 'cluster-utils-test' + str(uuid.uuid4())
        keyName = os.getenv('TOIL_AWS_KEYNAME')
        # Fail with a clear message instead of a TypeError from `str + None` below.
        self.assertIsNotNone(
            keyName, 'The TOIL_AWS_KEYNAME environment variable must be set')
        # Bound up front so the cleanup can tell whether the download directory
        # was ever created, without relying on catching NameError.
        tmpDir = None

        try:
            # --provisioner flag should default to aws, so we're not explicitly
            # specifying that here
            system([
                self.toilMain, 'launch-cluster', '--leaderNodeType=t2.micro',
                '--keyPairName=' + keyName, clusterName
            ])
        finally:
            system([
                self.toilMain, 'destroy-cluster', '--provisioner=aws',
                clusterName
            ])
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner

            userTags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
            tags = {'Name': clusterName, 'Owner': keyName}
            tags.update(userTags)

            # launch preemptable master with same name
            system([
                self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t',
                'key2=value2', '--tag', 'key3=value3',
                '--leaderNodeType=m3.medium:0.2', '--keyPairName=' + keyName,
                clusterName, '--provisioner=aws', '--logLevel=DEBUG'
            ])

            # test leader tags
            leaderTags = AWSProvisioner._getLeader(clusterName).tags
            self.assertEqual(tags, leaderTags)

            # Test strict host key checking
            # Doesn't work when run locally.
            if keyName == 'jenkins@jenkins-master':
                try:
                    AWSProvisioner.sshLeader(clusterName=clusterName,
                                             strict=True)
                except RuntimeError:
                    pass
                else:
                    self.fail(
                        "Host key verification passed where it should have failed"
                    )

            # Add the host key to known_hosts so that the rest of the tests can
            # pass without choking on the verification prompt.
            AWSProvisioner.sshLeader(clusterName=clusterName,
                                     strict=True,
                                     sshOptions=['-oStrictHostKeyChecking=no'])

            system([
                self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName
            ])

            testStrings = [
                "'foo'", '"foo"', '  foo', '$PATH', '"', "'", '\\', '| cat',
                '&& cat', '; cat'
            ]
            for test in testStrings:
                logger.info('Testing SSH with special string: %s', test)
                # The remote python reads this script from stdin and checks
                # that the argument arrived unmodified.
                compareTo = "import sys; assert sys.argv[1]==%r" % test
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['python', '-', test],
                                         input=compareTo)

            try:
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=['nonsenseShouldFail'])
            except RuntimeError:
                pass
            else:
                self.fail(
                    'The remote command failed silently where it should have '
                    'raised an error')

            AWSProvisioner.sshLeader(
                clusterName=clusterName,
                args=[
                    'python', '-c',
                    "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"
                ])

            # `toil rsync-cluster`
            # Testing special characters - string.punctuation
            # (the backslash before the space is written as an explicit `\\`;
            # the resulting string is unchanged)
            fname = '!"#$%&\'()*+,-.;<=>:\\ ?@[\\\\]^_`{|}~'
            testData = os.urandom(3 * (10**6))
            with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
                relpath = os.path.basename(tmpFile.name)
                tmpFile.write(testData)
                tmpFile.flush()
                # Upload file to leader
                AWSProvisioner.rsyncLeader(clusterName=clusterName,
                                           args=[tmpFile.name, ":"])
                # Ensure file exists
                AWSProvisioner.sshLeader(clusterName=clusterName,
                                         args=["test", "-e", relpath])
            tmpDir = tempfile.mkdtemp()
            # Download the file again and make sure it's the same file
            # `--protect-args` needed because remote bash chokes on special characters
            AWSProvisioner.rsyncLeader(
                clusterName=clusterName,
                args=["--protect-args", ":" + relpath, tmpDir])
            # The payload is random bytes, so it must be read back in binary mode
            # to be comparable with testData.
            with open(os.path.join(tmpDir, relpath), "rb") as f:
                self.assertEqual(
                    f.read(), testData,
                    "Downloaded file does not match original file")
        finally:
            try:
                system([
                    self.toilMain, 'destroy-cluster', '--provisioner=aws',
                    clusterName
                ])
            finally:
                # Remove the download directory even if destroying the cluster failed.
                if tmpDir is not None:
                    shutil.rmtree(tmpDir)
Ejemplo n.º 28
0
    def testUtilsSort(self):
        """
        Tests the status and stats commands of the toil command line utility using the
        sort example with the --restart flag.
        """
        # Get the sort command to run
        toilCommand = [
            sys.executable,
            "-m",
            toil.test.sort.sort.__name__,
            self.toilDir,
            "--logLevel=DEBUG",
            "--fileToSort",
            self.tempFile,
            "--N",
            str(self.N),
            "--stats",
            "--retryCount=2",
            "--badWorker=0.5",
            "--badWorkerFailInterval=0.05",
        ]
        # Try restarting it to check that a JobStoreException is thrown
        self.assertRaises(CalledProcessError, system, toilCommand + ["--restart"])
        # Check that trying to run it in restart mode does not create the jobStore
        self.assertFalse(os.path.exists(self.toilDir))

        # Run the script for the first time
        try:
            system(toilCommand)
            finished = True
        except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
            # While the workflow is incomplete the status command must fail too
            self.assertRaises(CalledProcessError, system, self.statusCommand)
            finished = False
        self.assertTrue(os.path.exists(self.toilDir))

        # Try running it without restart and check an exception is thrown
        self.assertRaises(CalledProcessError, system, toilCommand)

        # Now restart it until done, giving up after a bounded number of attempts
        totalTrys = 1
        while not finished:
            try:
                system(toilCommand + ["--restart"])
                finished = True
            except CalledProcessError:  # This happens when the script fails due to having unfinished jobs
                self.assertRaises(CalledProcessError, system, self.statusCommand)
                if totalTrys > 16:
                    self.fail("Exceeded a reasonable number of restarts")
                totalTrys += 1

        # Check the toil status command does not issue an exception
        system(self.statusCommand)

        # Check if we try to launch after its finished that we get a JobException
        self.assertRaises(CalledProcessError, system, toilCommand + ["--restart"])

        # Check we can run 'toil stats'
        system(self.statsCommand)

        # Check the file is properly sorted
        with open(self.tempFile, "r") as fileHandle:
            l2 = fileHandle.readlines()
            # assertEqual rather than the deprecated assertEquals alias.
            self.assertEqual(self.correctSort, l2)

        # Check we can run 'toil clean'
        system(self.cleanCommand)
Ejemplo n.º 29
0
def main():
    ########################################## 
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        # FIXME: prepending to sys.path should fix #103
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import getTempDirectory
    from toil.lib.bioio import makeSubDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config

    ##########################################
    #Load the environment for the batchjob
    ##########################################
    
    #First load the environment for the batchjob.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.attrib["log_level"])

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localWorkerTempDir = getTempDirectory()
    
    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?    
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)
    
    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)
    
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the batchjob
        ##########################################
        
        batchjob = jobStore.load(jobStoreID)
        logger.debug("Parsed batchjob")
        
        ##########################################
        #Cleanup from any earlier invocation of the batchjob
        ##########################################
        
        if batchjob.command == None:
            while len(batchjob.stack) > 0:
                jobs = batchjob.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run
                #, as jobs are, as far as possible, read only in the leader.
                batchjob.stack.pop()
                
                
        #This cleans the old log file which may 
        #have been left if the batchjob is being retried after a batchjob failure.
        if batchjob.logJobStoreFileID != None:
            batchjob.clearLogFile(jobStore)
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.attrib.has_key("stats"):
            startTime = time.time()
            startClock = getTotalCpuTime()
            stats = ET.Element("worker")
        else:
            stats = None

        startTime = time.time() 
        while True:
            ##########################################
            #Run the batchjob, if there is one
            ##########################################
            
            if batchjob.command != None:
                if batchjob.command[:11] == "scriptTree ":
                    #Make a temporary file directory for the job
                    localTempDir = makeSubDir(os.path.join(localWorkerTempDir, "localTempDir"))
                    
                    #Is a job command
                    messages = loadJob(batchjob.command, jobStore)._execute(batchjob=batchjob,
                                    stats=stats, localTempDir=localTempDir, 
                                    jobStore=jobStore)
                    
                    #Remove the temporary file directory
                    shutil.rmtree(localTempDir)
    
                else: #Is another command (running outside of jobs may be deprecated)
                    system(batchjob.command)
                    messages = []
            else:
                #The command may be none, in which case
                #the batchjob is just a shell ready to be deleted
                assert len(batchjob.stack) == 0
                messages = []
                break
            
            ##########################################
            #Establish if we can run another batchjob within the worker
            ##########################################
            
            #Exceeded the amount of time the worker is allowed to run for so quit
            if time.time() - startTime > float(config.attrib["job_time"]):
                logger.debug("We are breaking because the maximum time the batchjob should run for has been exceeded")
                break

            #No more jobs to run so quit
            if len(batchjob.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = batchjob.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the batchjob to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCpu, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > batchjob.memory:
                logger.debug("We need more memory for the next batchjob, so finishing")
                break
            if successorCpu > batchjob.cpu:
                logger.debug("We need more cpus for the next batchjob, so finishing")
                break
            if successorsDisk > batchjob.disk:
                logger.debug("We need more disk for the next batchjob, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The batchjob has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor batchjob.
            #We load the successor batchjob and transplant its command and stack
            #into the current batchjob so that it can be run
            #as if it were a command that were part of the current batchjob.
            #We can then delete the successor batchjob in the jobStore, as it is
            #wholly incorporated into the current batchjob.
            ##########################################
            
            #Remove the successor batchjob
            batchjob.stack.pop()
            
            #Load the successor batchjob
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cpu == successorCpu
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current batchjob
            batchjob.command = successorJob.command
            batchjob.stack += successorJob.stack
            assert batchjob.memory >= successorJob.memory
            assert batchjob.cpu >= successorJob.cpu
            
            #Checkpoint the batchjob and delete the successorJob
            batchjob.jobsToDelete = [ successorJob.jobStoreID ]
            jobStore.update(batchjob)
            jobStore.delete(successorJob.jobStoreID)
            
            logger.debug("Starting the next batchjob")
        
        ##########################################
        #Finish up the stats
        ##########################################

        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            m = ET.SubElement(stats, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(stats))
        elif len(messages) > 0: #No stats, but still need to report log messages
            l = ET.Element("worker")
            m = ET.SubElement(l, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(l))
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed batchjob on host %s", socket.gethostname())
        batchjob = jobStore.load(jobStoreID)
        batchjob.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdOut, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        batchjob.setLogFile(tempWorkerLogPath, jobStore)
        os.remove(tempWorkerLogPath)
        jobStore.update(batchjob)

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and batchjob.command == None and len(batchjob.stack) == 0:
        #We can now safely get rid of the batchjob
        jobStore.delete(batchjob.jobStoreID)
Ejemplo n.º 30
0
def decompressFastaFile(fileName, tempFileName):
    """Decompress a bzip2-compressed file into a temporary file.

    Runs ``bunzip2 --stdout`` on fileName through the shell and redirects
    the decompressed output to tempFileName. The original file is left
    in place.

    :param fileName: path of the bzip2-compressed input file.
    :param tempFileName: path the decompressed data is written to.
    :return: tempFileName, unchanged.
    """
    # Imported locally so the block does not depend on the file's import list.
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2
    # Quote both paths: the command is interpreted by the shell, so an
    # unquoted path containing spaces or metacharacters would be split or
    # executed as shell syntax. Plain paths are passed through unchanged.
    system("bunzip2 --stdout %s > %s" % (quote(fileName), quote(tempFileName)))
    return tempFileName
Ejemplo n.º 31
0
 def test_hello_world(self):
     """Run the helloWorld module as a script using the mesos batch system.

     The module is launched with the current interpreter (``python -m``),
     with './toilTest' as its positional argument and the log level
     returned by getLogLevelString().
     """
     interpreterArgs = [sys.executable, '-m', helloWorld.__name__]
     workflowArgs = ['./toilTest',
                     '--batchSystem=mesos',
                     '--logLevel', getLogLevelString()]
     system(interpreterArgs + workflowArgs)
Ejemplo n.º 32
0
 def tearDown(self):
     """Run the ToilTest teardown, then delete self.tempDir with ``rm -rf``."""
     ToilTest.tearDown(self)
     removeCommand = "rm -rf %s" % self.tempDir
     system(removeCommand)
Ejemplo n.º 33
0
def main():
    """Worker entry point: run one batchjob and any successors that can be chained.

    Command line arguments:
        sys.argv[1]: the job store string handed to loadJobStore().
        sys.argv[2]: the job store ID of the batchjob to run.

    The function:
      1. loads the job store, its config and the pickled environment,
      2. redirects OS-level stdout/stderr (file descriptors 1 and 2) to a
         log file in a fresh temporary directory,
      3. runs the batchjob's command and keeps running single successors in
         this process while their requirements fit and the time limit
         (config.attrib["job_time"]) has not been exceeded,
      4. writes stats and/or log messages to the job store,
      5. restores the original stdout/stderr, and on failure attaches the
         (truncated) worker log to the batchjob,
      6. removes the temporary directory and, if the batchjob completed with
         nothing left to do, deletes it from the job store.

    Any exception raised while running the batchjob is caught: the traceback
    is printed to the worker log, the batchjob is reloaded from the job store
    and setupJobAfterFailure(config) is called on it.
    """
    ##########################################
    #Import necessary modules
    ##########################################

    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        # FIXME: prepending to sys.path should fix #103
        sys.path.append(sourcePath)

    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import getTempDirectory
    from toil.lib.bioio import makeSubDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore

    ##########################################
    #Input args
    ##########################################

    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config

    ##########################################
    #Load the environment for the batchjob
    ##########################################

    #First load the environment for the batchjob.
    # NOTE(review): unpickling runs arbitrary code if the shared file can be
    # written by an untrusted party; presumably the job store is trusted -
    # confirm.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    #These variables are host specific, so the values of this host are kept.
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.attrib["log_level"])

    ##########################################
    #Setup the temporary directories.
    ##########################################

    #Dir to put all the temp files in.
    localWorkerTempDir = getTempDirectory()

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)

    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)

    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...

    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers):  #Remove old handlers
        logger.removeHandler(handler)

    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    try:

        #Put a message at the top of the log, just to make sure it's working.
        #(Parenthesised single argument: same output with the print statement
        #and the print function.)
        print("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        ##########################################
        #Load the batchjob
        ##########################################

        batchjob = jobStore.load(jobStoreID)
        logger.debug("Parsed batchjob")

        ##########################################
        #Cleanup from any earlier invocation of the batchjob
        ##########################################

        if batchjob.command is None:
            while len(batchjob.stack) > 0:
                jobs = batchjob.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run
                #, as jobs are, as far as possible, read only in the leader.
                batchjob.stack.pop()

        #This cleans the old log file which may
        #have been left if the batchjob is being retried after a batchjob failure.
        if batchjob.logJobStoreFileID is not None:
            batchjob.clearLogFile(jobStore)

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if "stats" in config.attrib:
            startClock = getTotalCpuTime()
            stats = ET.Element("worker")
        else:
            stats = None

        #The start time is needed for the job_time limit even without stats.
        startTime = time.time()
        while True:
            ##########################################
            #Run the batchjob, if there is one
            ##########################################

            if batchjob.command is not None:
                if batchjob.command[:11] == "scriptTree ":
                    #Make a temporary file directory for the job
                    localTempDir = makeSubDir(
                        os.path.join(localWorkerTempDir, "localTempDir"))

                    #Is a job command
                    messages = loadJob(batchjob.command, jobStore)._execute(
                        batchjob=batchjob,
                        stats=stats,
                        localTempDir=localTempDir,
                        jobStore=jobStore)

                    #Remove the temporary file directory
                    shutil.rmtree(localTempDir)

                else:  #Is another command (running outside of jobs may be deprecated)
                    system(batchjob.command)
                    messages = []
            else:
                #The command may be none, in which case
                #the batchjob is just a shell ready to be deleted
                assert len(batchjob.stack) == 0
                messages = []
                break

            ##########################################
            #Establish if we can run another batchjob within the worker
            ##########################################

            #Exceeded the amount of time the worker is allowed to run for so quit
            if time.time() - startTime > float(config.attrib["job_time"]):
                logger.debug(
                    "We are breaking because the maximum time the batchjob should run for has been exceeded"
                )
                break

            #No more jobs to run so quit
            if len(batchjob.stack) == 0:
                break

            #Get the next set of jobs to run
            jobs = batchjob.stack[-1]
            assert len(jobs) > 0

            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug(
                    "No more jobs can run in series by this worker,"
                    " it's got %i children",
                    len(jobs) - 1)
                break

            #We check the requirements of the batchjob to see if we can run it
            #within the current worker
            (successorJobStoreID, successorMemory, successorCpu,
             successorsDisk, successorPredecessorID) = jobs[0]
            if successorMemory > batchjob.memory:
                logger.debug(
                    "We need more memory for the next batchjob, so finishing")
                break
            if successorCpu > batchjob.cpu:
                logger.debug(
                    "We need more cpus for the next batchjob, so finishing")
                break
            if successorsDisk > batchjob.disk:
                logger.debug(
                    "We need more disk for the next batchjob, so finishing")
                break
            if successorPredecessorID is not None:
                logger.debug(
                    "The batchjob has multiple predecessors, we must return to the leader."
                )
                break

            ##########################################
            #We have a single successor batchjob.
            #We load the successor batchjob and transplant its command and stack
            #into the current batchjob so that it can be run
            #as if it were a command that were part of the current batchjob.
            #We can then delete the successor batchjob in the jobStore, as it is
            #wholly incorporated into the current batchjob.
            ##########################################

            #Remove the successor batchjob
            batchjob.stack.pop()

            #Load the successor batchjob
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cpu == successorCpu
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command is not None
            assert successorJobStoreID == successorJob.jobStoreID

            #Transplant the command and stack to the current batchjob
            batchjob.command = successorJob.command
            batchjob.stack += successorJob.stack
            assert batchjob.memory >= successorJob.memory
            assert batchjob.cpu >= successorJob.cpu

            #Checkpoint the batchjob and delete the successorJob
            batchjob.jobsToDelete = [successorJob.jobStoreID]
            jobStore.update(batchjob)
            jobStore.delete(successorJob.jobStoreID)

            logger.debug("Starting the next batchjob")

        ##########################################
        #Finish up the stats
        ##########################################

        if stats is not None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            m = ET.SubElement(stats, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(stats))
        elif len(messages) > 0:  #No stats, but still need to report log messages
            l = ET.Element("worker")
            m = ET.SubElement(l, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(l))

        logger.info(
            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
            time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except:  #Case that something goes wrong in worker
        #The bare except is kept on purpose: any failure at all must be
        #recorded against the batchjob before the worker exits.
        traceback.print_exc()
        logger.error(
            "Exiting the worker because of a failed batchjob on host %s",
            socket.gethostname())
        #Reload the batchjob so in-memory changes made before the failure are dropped
        batchjob = jobStore.load(jobStoreID)
        batchjob.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################

    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    #(This must use origStdErr; restoring from origStdOut would leave stderr
    #pointing at the original stdout.)
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        batchjob.setLogFile(tempWorkerLogPath, jobStore)
        os.remove(tempWorkerLogPath)
        jobStore.update(batchjob)

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and batchjob.command is None and len(
            batchjob.stack) == 0:
        #We can now safely get rid of the batchjob
        jobStore.delete(batchjob.jobStoreID)
Ejemplo n.º 34
0
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will randomly kill the worker process at a random time 
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        t.daemon = True
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    tempRootDir = config.workDir
    if tempRootDir is not None and not os.path.exists(tempRootDir):
        raise RuntimeError("The temporary directory specified by workDir: %s does not exist" % tempRootDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in. If tempRootDir is None, tempdir looks at environment variables to determine
    # where to put the tempDir.
    localWorkerTempDir = tempfile.mkdtemp(dir=tempRootDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    messages = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            while len(jobWrapper.stack) > 0:
                jobs = jobWrapper.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run
                #, as jobs are, as far as possible, read only in the leader.
                jobWrapper.stack.pop()
                
        #This cleans the old log file which may 
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        jobWrapper.logJobStoreFileID = None
        jobStore.update(jobWrapper) #Update first, before deleting the file
        if oldLogFile != None:
            jobStore.delete(oldLogFile)
            
        #Make a temporary file directory for the jobWrapper
        localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        startTime = time.time() 
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command != None:
                if jobWrapper.command.startswith( "_toil " ):
                    #Load the job
                    job = Job._loadJob(jobWrapper.command, jobStore)
                    
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)
                    
                    #Create a fileStore object for the job
                    fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, 
                                              blockFn)
                    #Get the next block function and list that will contain any messages
                    blockFn = fileStore._blockFn
                    messages = fileStore.loggingMessages

                    job._execute(jobWrapper=jobWrapper,
                                           stats=statsDict if config.stats else None,
                                           localTempDir=localTempDir,
                                           jobStore=jobStore,
                                           fileStore=fileStore)

                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir
                    
                else: #Is another command (running outside of jobs may be deprecated)
                    #Cleanup the cache from the previous job
                    cleanCacheFn(0)
                    
                    system(jobWrapper.command)
                    #Set a dummy clean cache fn
                    cleanCacheFn = lambda x : None
            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if Job.FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #No more jobs to run so quit
            if len(jobWrapper.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor jobWrapper.
            #We load the successor jobWrapper and transplant its command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()
            
            #Load the successor jobWrapper
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cores == successorCores
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJob.command
            jobWrapper.stack += successorJob.stack
            assert jobWrapper.memory >= successorJob.memory
            assert jobWrapper.cores >= successorJob.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJob to those to be deleted
            fileStore.jobsToDelete.add(successorJob.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)
            statsDict.workers.log = messages
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        Job.FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if Job.FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdOut, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        jobWrapper.logJobStoreFileID = jobStore.writeFile( tempWorkerLogPath, jobWrapper.jobStoreID )
        os.remove(tempWorkerLogPath)
        jobStore.update(jobWrapper)
    elif debugging: # write log messages
        truncateFile(tempWorkerLogPath)
        with open(tempWorkerLogPath, 'r') as logFile:
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID,text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or messages) and not workerFailed: # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0:
        #We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
Ejemplo n.º 35
0
def compressFastaFile(fileName):
    """Compress a fasta file with bzip2, keeping the uncompressed original.

    Runs ``bzip2 --keep --fast`` on the file through the ``system`` helper.

    :param fileName: path of the fasta file to compress.
    :return: ``fileName`` with ".bz2" appended (the name bzip2 gives to the
        compressed output).
    """
    # Imported locally so the block does not depend on file-level imports.
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback
    # Quote the path so that spaces or shell metacharacters in the file name
    # are passed to bzip2 as one literal argument. Paths made only of safe
    # characters are left untouched, so the command string is unchanged for
    # them.
    # NOTE(review): assumes system() hands a string command to a shell --
    # confirm against its definition.
    system("bzip2 --keep --fast %s" % quote(fileName))
    return fileName + ".bz2"