Example #1
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = Toil.loadOrCreateJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will kill the worker process after a random delay
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        # Ideally this would be a daemon thread but that causes an intermittent (but benign)
        # exception similar to the one described here:
        # http://stackoverflow.com/questions/20596918/python-exception-in-thread-thread-1-most-likely-raised-during-interpreter-shutd
        # Our exception is:
        #    Exception in thread Thread-1 (most likely raised during interpreter shutdown):
        #    <type 'exceptions.AttributeError'>: 'NoneType' object has no attribute 'kill'
        # This attribute error is caused by the call os.kill() and apparently unavoidable with a
        # daemon
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        # Setup the caching variable now in case of an exception during loading of jobwrapper, etc
        # Flag to identify if the run is cached or not.
        FileStore = Job.FileStore if config.disableSharedCache else Job.CachedFileStore

        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            # Cleanup jobs already finished
            f = lambda jobs : filter(lambda x : len(x) > 0, map(lambda x :
                                    filter(lambda y : jobStore.exists(y[0]), x), jobs))
            jobWrapper.stack = f(jobWrapper.stack)
            jobWrapper.services = f(jobWrapper.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may 
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        if oldLogFile != None:
            jobWrapper.logJobStoreFileID = None
            jobStore.update(jobWrapper) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobWrapper.checkpoint != None:
            logger.debug("Job is a checkpoint")
            if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0 or jobWrapper.command != None:
                if jobWrapper.command != None:
                    assert jobWrapper.command == jobWrapper.checkpoint
                    logger.debug("Checkpoint job already has command set to run")
                else:
                    jobWrapper.command = jobWrapper.checkpoint

                # Reduce the retry count
                assert jobWrapper.remainingRetryCount >= 0
                jobWrapper.remainingRetryCount = max(0, jobWrapper.remainingRetryCount - 1)

                jobStore.update(jobWrapper) # Update immediately to ensure that checkpoint
                # is made before deleting any remaining successors

                if len(jobWrapper.stack) > 0 or len(jobWrapper.services) > 0:
                    # If the subtree of successors is not complete, restart everything
                    logger.debug("Checkpoint job has unfinished successor jobs, deleting the jobs on the stack: %s, services: %s " %
                                 (jobWrapper.stack, jobWrapper.services))

                    # Delete everything on the stack, as these represent successors to clean
                    # up as we restart the queue
                    def recursiveDelete(jobWrapper2):
                        # Recursively walk the stack to delete all remaining jobs
                        for jobs in jobWrapper2.stack + jobWrapper2.services:
                            for jobTuple in jobs:
                                if jobStore.exists(jobTuple[0]):
                                    recursiveDelete(jobStore.load(jobTuple[0]))
                                else:
                                    logger.debug("Job %s has already been deleted", jobTuple[0])
                        if jobWrapper2 != jobWrapper:
                            logger.debug("Checkpoint is deleting old successor job: %s", jobWrapper2.jobStoreID)
                            jobStore.delete(jobWrapper2.jobStoreID)
                    recursiveDelete(jobWrapper)

                    jobWrapper.stack = [ [], [] ] # Initialise the job to mimic the state of a job
                    # that has been previously serialised but which as yet has no successors

                    jobWrapper.services = [] # Empty the services

                    # Update the jobStore to avoid doing this twice on failure and make this clean.
                    jobStore.update(jobWrapper)

            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files
                map(jobStore.deleteFile, filter(jobStore.fileExists, jobWrapper.checkpointFilesToDelete))

        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        #Make a temporary file directory for the jobWrapper
        #localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command is not None:
                assert jobWrapper.command.startswith( "_toil " )
                logger.debug("Got a command to run: %s" % jobWrapper.command)
                #Load the job
                job = Job._loadJob(jobWrapper.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobWrapper.checkpoint = jobWrapper.command

                # Need to fix all this for non shared cache runs
                if config.disableSharedCache:
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)

                # Create a fileStore object for the job
                fileStore = FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)
                with job._executor(jobWrapper=jobWrapper,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobWrapper=jobWrapper, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages
                if config.disableSharedCache:
                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir

            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #If no more jobs to run or services not finished, quit
            if len(jobWrapper.stack) == 0 or len(jobWrapper.services) > 0 or jobWrapper.checkpoint != None:
                logger.debug("Stopping running chain of jobs: length of stack: %s, services: %s, checkpoint: %s",
                             len(jobWrapper.stack), len(jobWrapper.services), jobWrapper.checkpoint != None)
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorsPreemptable, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break

            # Load the successor jobWrapper
            successorJobWrapper = jobStore.load(successorJobStoreID)

            # Somewhat ugly, but check if job is a checkpoint job and quit if
            # so
            if successorJobWrapper.command.startswith( "_toil " ):
                #Load the job
                successorJob = Job._loadJob(successorJobWrapper.command, jobStore)

                # Check it is not a checkpoint
                if successorJob.checkpoint:
                    logger.debug("Next job is checkpoint, so finishing")
                    break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobWrappers command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()

            #These should all match up
            assert successorJobWrapper.memory == successorMemory
            assert successorJobWrapper.cores == successorCores
            assert successorJobWrapper.predecessorsFinished == set()
            assert successorJobWrapper.predecessorNumber == 1
            assert successorJobWrapper.command != None
            assert successorJobStoreID == successorJobWrapper.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJobWrapper.command
            jobWrapper.stack += successorJobWrapper.stack
            assert jobWrapper.memory >= successorJobWrapper.memory
            assert jobWrapper.cores >= successorJobWrapper.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localWorkerTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJobWrapper to those to be deleted
            fileStore.jobsToDelete.add(successorJobWrapper.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        jobWrapper.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobWrapper.jobStoreID)
        with jobStore.updateFileStream(jobWrapper.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                    f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                w.write(f.read())
        jobStore.update(jobWrapper)

    elif debugging:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit:
                logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID, text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0 and len(jobWrapper.services) == 0:
        # We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
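
The core trick in this worker is redirecting standard output and standard error at the file-descriptor level, so that output from child processes is captured as well. Below is a minimal, standalone sketch of that save/redirect/restore pattern; the helper name and its wrapper shape are illustrative and not part of Toil.

import os
import sys

def run_with_fds_redirected(log_path, body):
    # Save the original stdout/stderr by duplicating their descriptors.
    orig_stdout = os.dup(1)
    orig_stderr = os.dup(2)
    # Open the log file once; descriptors duped from it share the file offset.
    log_fd = os.open(log_path, os.O_WRONLY | os.O_CREAT | os.O_APPEND)
    os.dup2(log_fd, 1)   # FD 1 (stdout) now points at the log file
    os.dup2(log_fd, 2)   # FD 2 (stderr) now points at the log file
    os.close(log_fd)     # the duped descriptors keep the file open
    try:
        body()           # anything this process or its children write to FDs 1/2 lands in the log
    finally:
        sys.stdout.flush()
        sys.stderr.flush()
        os.fsync(1)
        os.fsync(2)
        os.dup2(orig_stdout, 1)   # restore the original stdout
        os.dup2(orig_stderr, 2)   # restore the original stderr
        os.close(orig_stdout)
        os.close(orig_stderr)
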
Example #2
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #This will kill the worker process after a random delay
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL) #signal.SIGINT)
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        t.daemon = True
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    tempRootDir = config.workDir
    if tempRootDir is not None and not os.path.exists(tempRootDir):
        raise RuntimeError("The temporary directory specified by workDir: %s does not exist" % tempRootDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in. If tempRootDir is None, tempdir looks at environment variables to determine
    # where to put the tempDir.
    localWorkerTempDir = tempfile.mkdtemp(dir=tempRootDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    messages = []
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            while len(jobWrapper.stack) > 0:
                jobs = jobWrapper.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run,
                #as jobs are, as far as possible, read only in the leader.
                jobWrapper.stack.pop()
                
        #This cleans the old log file which may 
        #have been left if the jobWrapper is being retried after a jobWrapper failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        jobWrapper.logJobStoreFileID = None
        jobStore.update(jobWrapper) #Update first, before deleting the file
        if oldLogFile != None:
            jobStore.delete(oldLogFile)
            
        #Make a temporary file directory for the jobWrapper
        localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startTime = time.time()
            startClock = getTotalCpuTime()

        startTime = time.time() 
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command != None:
                if jobWrapper.command.startswith( "_toil " ):
                    #Load the job
                    job = Job._loadJob(jobWrapper.command, jobStore)
                    
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)
                    
                    #Create a fileStore object for the job
                    fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, 
                                              blockFn)
                    #Get the next block function and list that will contain any messages
                    blockFn = fileStore._blockFn
                    messages = fileStore.loggingMessages

                    job._execute(jobWrapper=jobWrapper,
                                           stats=statsDict if config.stats else None,
                                           localTempDir=localTempDir,
                                           jobStore=jobStore,
                                           fileStore=fileStore)

                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir
                    
                else: #Is another command (running outside of jobs may be deprecated)
                    #Cleanup the cache from the previous job
                    cleanCacheFn(0)
                    
                    system(jobWrapper.command)
                    #Set a dummy clean cache fn
                    cleanCacheFn = lambda x : None
            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if Job.FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #No more jobs to run so quit
            if len(jobWrapper.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor jobWrapper.
            #We load the successor jobWrapper and transplant its command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that were part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()
            
            #Load the successor jobWrapper
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cores == successorCores
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJob.command
            jobWrapper.stack += successorJob.stack
            assert jobWrapper.memory >= successorJob.memory
            assert jobWrapper.cores >= successorJob.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJob to those to be deleted
            fileStore.jobsToDelete.add(successorJob.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)
            statsDict.workers.log = messages
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        Job.FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if Job.FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        jobWrapper.logJobStoreFileID = jobStore.writeFile( tempWorkerLogPath, jobWrapper.jobStoreID )
        os.remove(tempWorkerLogPath)
        jobStore.update(jobWrapper)
    elif debugging: # write log messages
        truncateFile(tempWorkerLogPath)
        with open(tempWorkerLogPath, 'r') as logFile:
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID,text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or messages) and not workerFailed: # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0:
        #We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
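
The chaining logic above boils down to a small test on the single entry at the top of the stack. Below is a condensed, illustrative restatement of that test for this example's five-field stack entries; the function name and parameters are hypothetical and not Toil API.

def can_chain_successor(jobWrapper, jobs):
    # Only a single child can be chained; siblings must go back to the leader.
    if len(jobs) != 1:
        return False
    jobStoreID, memory, cores, disk, predecessorID = jobs[0]
    # The successor must fit within the resources already granted to this worker.
    if memory > jobWrapper.memory or cores > jobWrapper.cores or disk > jobWrapper.disk:
        return False
    # A successor that waits on other predecessors must be scheduled by the leader.
    if predecessorID is not None:
        return False
    return True
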
Example #3
def executor():
    """
    Main function of the _toil_kubernetes_executor entrypoint.

    Runs inside the Toil container.

    Responsible for setting up the user script and running the command for the
    job (which may in turn invoke the Toil worker entrypoint).

    """

    configureRootLogger()
    setLogLevel("DEBUG")
    logger.debug("Starting executor")

    # If we don't manage to run the child, what should our exit code be?
    exit_code = EXIT_STATUS_UNAVAILABLE_VALUE

    if len(sys.argv) != 2:
        logger.error('Executor requires exactly one base64-encoded argument')
        sys.exit(exit_code)

    # Take in a base64-encoded pickled dict as our first argument and decode it
    try:
        # Make sure to encode the text arguments to bytes before base 64 decoding
        job = pickle.loads(base64.b64decode(sys.argv[1].encode('utf-8')))
    except:
        exc_info = sys.exc_info()
        logger.error('Exception while unpickling task: ', exc_info=exc_info)
        sys.exit(exit_code)

    if 'environment' in job:
        # Adopt the job environment into the executor.
        # This lets us use things like TOIL_WORKDIR when figuring out how to talk to other executors.
        logger.debug('Adopting environment: %s',
                     str(job['environment'].keys()))
        for var, value in job['environment'].items():
            os.environ[var] = value

    # Set JTRES_ROOT and other global state needed for resource
    # downloading/deployment to work.
    # TODO: Every worker downloads resources independently.
    # We should have a way to share a resource directory.
    logger.debug('Preparing system for resource download')
    Resource.prepareSystem()
    try:
        if 'userScript' in job:
            job['userScript'].register()

        # We need to tell other workers in this workflow not to do cleanup now that
        # we are here, or else wait for them to finish. So get the cleanup info
        # that knows where the work dir is.
        cleanupInfo = job['workerCleanupInfo']

        # Join a Last Process Standing arena, so we know which process should be
        # responsible for cleanup.
        # We need to use the real workDir, not just the override from cleanupInfo.
        # This needs to happen after the environment is applied.
        arena = LastProcessStandingArena(
            Toil.getToilWorkDir(cleanupInfo.workDir),
            cleanupInfo.workflowID + '-kube-executor')
        arena.enter()
        try:

            # Start the child process
            logger.debug("Invoking command: '%s'", job['command'])
            child = subprocess.Popen(job['command'],
                                     preexec_fn=lambda: os.setpgrp(),
                                     shell=True)

            # Reproduce child's exit code
            exit_code = child.wait()

        finally:
            for _ in arena.leave():
                # We are the last concurrent executor to finish.
                # Do batch system cleanup.
                logger.debug('Cleaning up worker')
                BatchSystemSupport.workerCleanup(cleanupInfo)
    finally:
        logger.debug('Cleaning up resources')
        # TODO: Change resource system to use a shared resource directory for everyone.
        # Then move this into the last-process-standing cleanup
        Resource.cleanSystem()
        logger.debug('Shutting down')
        sys.exit(exit_code)
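
The executor expects a single base64-encoded, pickled dict as its argument. The sketch below shows the corresponding encode step (and the decode step it mirrors); note that the executor above also reads 'workerCleanupInfo', and optionally 'userScript' and 'environment', from that dict, which this sketch does not construct.

import base64
import pickle

def encode_executor_payload(job):
    # job is a plain dict; pickle it, then base64-encode it so it survives as a
    # single command-line argument.
    return base64.b64encode(pickle.dumps(job)).decode('utf-8')

def decode_executor_payload(arg):
    # Mirrors the decode step in executor(): text argument -> bytes -> dict.
    return pickle.loads(base64.b64decode(arg.encode('utf-8')))
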
Example #4
def workerScript(jobStore, config, jobName, jobStoreID, redirectOutputToLogFile=True):
    """
    Worker process script, runs a job. 
    
    :param jobStore: The job store to load the job from and write results to
    :param config: The loaded configuration for this workflow run
    :param str jobName: The "job name" (a user friendly name) of the job to be run
    :param str jobStoreID: The job store ID of the job to be run
    :param bool redirectOutputToLogFile: Redirect standard out and standard error to a log file
    """
    logging.basicConfig()
    setLogLevel(config.logLevel)

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    logFileByteReportLimit = config.maxLogFileSize

    if config.badWorker > 0 and random.random() < config.badWorker:
        # We need to kill the process we are currently in, to simulate worker
        # failure. We don't want to just send SIGKILL, because we can't tell
        # that from a legitimate OOM on our CI runner. We're going to send
        # SIGUSR1 so our terminations are distinctive, and then SIGKILL if that
        # didn't stick. We definitely don't want to do this from *within* the
        # process we are trying to kill, so we fork off. TODO: We can still
        # leave the killing code running after the main Toil flow is done, but
        # since it's now in a process instead of a thread, the main Python
        # process won't wait around for its timeout to expire. I think this is
        # better than the old thread-based way where all of Toil would wait
        # around to be killed.
        
        killTarget = os.getpid()
        sleepTime = config.badWorkerFailInterval * random.random()
        if os.fork() == 0:
            # We are the child
            # Let the parent run some amount of time
            time.sleep(sleepTime)
            # Kill it gently
            os.kill(killTarget, signal.SIGUSR1)
            # Wait for that to stick
            time.sleep(0.01)
            try:
                # Kill it harder. Hope the PID hasn't already been reused.
                # If we succeeded the first time, this will OSError
                os.kill(killTarget, signal.SIGKILL)
            except OSError:
                pass
            # Exit without doing any of Toil's cleanup
            os._exit(0)
            
        # We don't need to reap the child. Either it kills us, or we finish
        # before it does. Either way, init will have to clean it up for us.

    ##########################################
    #Load the environment for the jobGraph
    ##########################################
    
    #First load the environment for the jobGraph.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = safeUnpickleFromStream(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    toilWorkflowDir = Toil.getWorkflowDir(config.workflowID, config.workDir)
    
    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    if redirectOutputToLogFile:
        # Announce that we are redirecting logging, and where it will now go.
        # This is important if we are trying to manually trace a faulty worker invocation.
        logger.info("Redirecting logging to %s", tempWorkerLogPath)
        sys.stdout.flush()
        sys.stderr.flush()
        
        # Save the original stdout and stderr (by opening new file descriptors
        # to the same files)
        origStdOut = os.dup(1)
        origStdErr = os.dup(2)

        # Open the file to send stdout/stderr to.
        logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

        # Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        # Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)

        # Since we only opened the file once, all the descriptors duped from
        # the original will share offset information, and won't clobber each
        # others' writes. See <http://stackoverflow.com/a/5284108/402891>. This
        # shouldn't matter, since O_APPEND seeks to the end of the file before
        # every write, but maybe there's something odd going on...

        # Close the descriptor we used to open the file
        os.close(logFh)

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda : True
    listOfJobs = [jobName]
    job = None
    try:

        #Put a message at the top of the log, just to make sure it's working.
        logger.info("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()
        
        logProcessContext(config)

        ##########################################
        #Load the jobGraph
        ##########################################
        
        jobGraph = jobStore.load(jobStoreID)
        listOfJobs[0] = str(jobGraph)
        logger.debug("Parsed job wrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobGraph
        ##########################################
        
        if jobGraph.command == None:
            logger.debug("Wrapper has no user job to run.")
            # Cleanup jobs already finished
            f = lambda jobs : [z for z in [[y for y in x if jobStore.exists(y.jobStoreID)] for x in jobs] if len(z) > 0]
            jobGraph.stack = f(jobGraph.stack)
            jobGraph.services = f(jobGraph.services)
            logger.debug("Cleaned up any references to completed successor jobs")

        #This cleans the old log file which may 
        #have been left if the job is being retried after a job failure.
        oldLogFile = jobGraph.logJobStoreFileID
        if oldLogFile != None:
            jobGraph.logJobStoreFileID = None
            jobStore.update(jobGraph) #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        # The job is a checkpoint, and is being restarted after previously completing
        if jobGraph.checkpoint != None:
            logger.debug("Job is a checkpoint")
            # If the checkpoint still has extant jobs in its
            # (flattened) stack and services, its subtree didn't
            # complete properly. We handle the restart of the
            # checkpoint here, removing its previous subtree.
            if len([i for l in jobGraph.stack for i in l]) > 0 or len(jobGraph.services) > 0:
                logger.debug("Checkpoint has failed.")
                # Reduce the retry count
                assert jobGraph.remainingRetryCount >= 0
                jobGraph.remainingRetryCount = max(0, jobGraph.remainingRetryCount - 1)
                jobGraph.restartCheckpoint(jobStore)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug("The checkpoint jobs seems to have completed okay, removing any checkpoint files to delete.")
                #Delete any remnant files
                list(map(jobStore.deleteFile, list(filter(jobStore.fileExists, jobGraph.checkpointFilesToDelete))))

        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startClock = getTotalCpuTime()

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobGraph, if there is one
            ##########################################
            
            if jobGraph.command is not None:
                assert jobGraph.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobGraph.command)
                #Load the job
                job = Job._loadJob(jobGraph.command, jobStore)
                # If it is a checkpoint job, save the command
                if job.checkpoint:
                    jobGraph.checkpoint = jobGraph.command

                # Create a fileStore object for the job
                fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                      caching=not config.disableCaching)
                with job._executor(jobGraph=jobGraph,
                                   stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with fileStore.open(job):
                        # Get the next block function and list that will contain any messages
                        blockFn = fileStore._blockFn

                        job._runner(jobGraph=jobGraph, jobStore=jobStore, fileStore=fileStore)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

            else:
                #The command may be none, in which case
                #the jobGraph is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                logger.debug("No user job to run, so finishing")
                break
            
            if FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobGraph within the worker
            ##########################################
            successorJobGraph = nextChainableJobGraph(jobGraph, jobStore)
            if successorJobGraph is None or config.disableChaining:
                # Can't chain any more jobs.
                break

            ##########################################
            #We have a single successor job that is not a checkpoint job.
            #We transplant the successor jobGraph command and stack
            #into the current jobGraph object so that it can be run
            #as if it were a command that were part of the current jobGraph.
            #We can then delete the successor jobGraph in the jobStore, as it is
            #wholly incorporated into the current jobGraph.
            ##########################################

            # add the successor to the list of jobs run
            listOfJobs.append(str(successorJobGraph))

            #Clone the jobGraph and its stack
            jobGraph = copy.deepcopy(jobGraph)
            
            #Remove the successor jobGraph
            jobGraph.stack.pop()

            #Transplant the command and stack to the current jobGraph
            jobGraph.command = successorJobGraph.command
            jobGraph.stack += successorJobGraph.stack
            # include some attributes for better identification of chained jobs in
            # logging output
            jobGraph.unitName = successorJobGraph.unitName
            jobGraph.jobName = successorJobGraph.jobName
            assert jobGraph.memory >= successorJobGraph.memory
            assert jobGraph.cores >= successorJobGraph.cores
            
            #Build a fileStore to update the job
            fileStore = FileStore.createFileStore(jobStore, jobGraph, localWorkerTempDir, blockFn,
                                                  caching=not config.disableCaching)

            #Update blockFn
            blockFn = fileStore._blockFn

            #Add successorJobGraph to those to be deleted
            fileStore.jobsToDelete.add(successorJobGraph.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobGraph and its stack again, so that updates to it do
            #not interfere with this update
            jobGraph = copy.deepcopy(jobGraph)
            
            logger.debug("Starting the next job")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        if redirectOutputToLogFile:
            logger.info("Worker log can be found at %s. Set --cleanWorkDir to retain this log", localWorkerTempDir)
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s", socket.gethostname())
        FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if FileStore._terminateEvent.isSet():
        jobGraph = jobStore.load(jobStoreID)
        jobGraph.setupJobAfterFailure(config)
        workerFailed = True
        if job and jobGraph.remainingRetryCount == 0:
            job._succeeded = False

    ##########################################
    #Cleanup
    ##########################################

    # Close the worker logging
    # Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    if redirectOutputToLogFile:
        # Flush at the OS level
        os.fsync(1)
        os.fsync(2)

        # Close redirected stdout and replace with the original standard output.
        os.dup2(origStdOut, 1)

        # Close redirected stderr and replace with the original standard error.
        os.dup2(origStdErr, 2)

        # sys.stdout and sys.stderr don't need to be modified at all. We don't
        # need to call redirectLoggerStreamHandlers since they still log to
        # sys.stderr

        # Close our extra handles to the original standard output and standard
        # error streams, so we don't leak file handles.
        os.close(origStdOut)
        os.close(origStdErr)

    # Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed and redirectOutputToLogFile:
        jobGraph.logJobStoreFileID = jobStore.getEmptyFileStoreID(jobGraph.jobStoreID)
        jobGraph.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobGraph.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, "r") as f:
                if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
                w.write(f.read().encode('utf-8')) # TODO load file using a buffer
        jobStore.update(jobGraph)

    elif debugging and redirectOutputToLogFile:  # write log messages
        with open(tempWorkerLogPath, 'r') as logFile:
            if os.path.getsize(tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit, 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit, 0)  # seek to first tooBig bytes of file
            logMessages = logFile.read().splitlines()
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster) and not workerFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict, ensure_ascii=True))

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and not workerFailed) or (cleanUp == 'onError' and workerFailed):
        shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobGraph.command == None and len(jobGraph.stack) == 0 and len(jobGraph.services) == 0:
        # We can now safely get rid of the jobGraph
        jobStore.delete(jobGraph.jobStoreID)
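
All of these worker variants capture output for the worker process and every child it spawns by swapping the OS-level file descriptors 1 and 2, rather than reassigning sys.stdout/sys.stderr. Below is a minimal, self-contained sketch of that pattern; the function names (capture_all_output, restore_output) and the log path are illustrative only and are not Toil functions.

import os
import subprocess
import sys

def capture_all_output(log_path):
    """Point FDs 1 and 2 (for this process and all children) at log_path.

    Returns the saved original descriptors so the caller can restore them later."""
    sys.stdout.flush()
    sys.stderr.flush()
    saved_out, saved_err = os.dup(1), os.dup(2)   # keep the originals alive
    log_fd = os.open(log_path, os.O_WRONLY | os.O_CREAT | os.O_APPEND)
    os.dup2(log_fd, 1)                            # FD 1 now writes to the log
    os.dup2(log_fd, 2)                            # FD 2 now writes to the log
    os.close(log_fd)                              # the duplicated FDs keep the file open
    return saved_out, saved_err

def restore_output(saved_out, saved_err):
    """Undo capture_all_output and close the saved descriptors."""
    sys.stdout.flush()
    sys.stderr.flush()
    os.dup2(saved_out, 1)
    os.dup2(saved_err, 2)
    os.close(saved_out)
    os.close(saved_err)

if __name__ == "__main__":
    saved = capture_all_output("worker_log.txt")
    print("captured via Python")                          # lands in worker_log.txt
    subprocess.call(["echo", "captured from a child"])    # child output lands there too
    restore_output(*saved)
    print("back on the original stdout")
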
Example #5
0
def main():
    ##########################################
    #Import necessary modules
    ##########################################

    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        # FIXME: prepending to sys.path should fix #103
        sys.path.append(sourcePath)

    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import getTempDirectory
    from toil.lib.bioio import makeSubDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore

    ##########################################
    #Input args
    ##########################################

    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]

    ##########################################
    #Load the jobStore/config file
    ##########################################

    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config

    ##########################################
    #Load the environment for the batchjob
    ##########################################

    #First load the environment for the batchjob.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.attrib["log_level"])

    ##########################################
    #Setup the temporary directories.
    ##########################################

    #Dir to put all the temp files in.
    localWorkerTempDir = getTempDirectory()

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)

    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)

    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...

    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers):  #Remove old handlers
        logger.removeHandler(handler)

    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()

        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))

        ##########################################
        #Load the batchjob
        ##########################################

        batchjob = jobStore.load(jobStoreID)
        logger.debug("Parsed batchjob")

        ##########################################
        #Cleanup from any earlier invocation of the batchjob
        ##########################################

        if batchjob.command == None:
            while len(batchjob.stack) > 0:
                jobs = batchjob.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run,
                #as jobs are, as far as possible, read only in the leader.
                batchjob.stack.pop()

        #This cleans the old log file which may
        #have been left if the batchjob is being retried after a batchjob failure.
        if batchjob.logJobStoreFileID != None:
            batchjob.clearLogFile(jobStore)

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.attrib.has_key("stats"):
            startTime = time.time()
            startClock = getTotalCpuTime()
            stats = ET.Element("worker")
        else:
            stats = None

        startTime = time.time()
        while True:
            ##########################################
            #Run the batchjob, if there is one
            ##########################################

            if batchjob.command != None:
                if batchjob.command[:11] == "scriptTree ":
                    #Make a temporary file directory for the job
                    localTempDir = makeSubDir(
                        os.path.join(localWorkerTempDir, "localTempDir"))

                    #Is a job command
                    messages = loadJob(batchjob.command, jobStore)._execute(
                        batchjob=batchjob,
                        stats=stats,
                        localTempDir=localTempDir,
                        jobStore=jobStore)

                    #Remove the temporary file directory
                    shutil.rmtree(localTempDir)

                else:  #Is another command (running outside of jobs may be deprecated)
                    system(batchjob.command)
                    messages = []
            else:
                #The command may be none, in which case
                #the batchjob is just a shell ready to be deleted
                assert len(batchjob.stack) == 0
                messages = []
                break

            ##########################################
            #Establish if we can run another batchjob within the worker
            ##########################################

            #Exceeded the amount of time the worker is allowed to run for so quit
            if time.time() - startTime > float(config.attrib["job_time"]):
                logger.debug(
                    "We are breaking because the maximum time the batchjob should run for has been exceeded"
                )
                break

            #No more jobs to run so quit
            if len(batchjob.stack) == 0:
                break

            #Get the next set of jobs to run
            jobs = batchjob.stack[-1]
            assert len(jobs) > 0

            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug(
                    "No more jobs can run in series by this worker,"
                    " it's got %i children",
                    len(jobs) - 1)
                break

            #We check the requirements of the batchjob to see if we can run it
            #within the current worker
            (successorJobStoreID, successorMemory, successorCpu,
             successorsDisk, successorPredecessorID) = jobs[0]
            if successorMemory > batchjob.memory:
                logger.debug(
                    "We need more memory for the next batchjob, so finishing")
                break
            if successorCpu > batchjob.cpu:
                logger.debug(
                    "We need more cpus for the next batchjob, so finishing")
                break
            if successorsDisk > batchjob.disk:
                logger.debug(
                    "We need more disk for the next batchjob, so finishing")
                break
            if successorPredecessorID != None:
                logger.debug(
                    "The batchjob has multiple predecessors, we must return to the leader."
                )
                break

            ##########################################
            #We have a single successor batchjob.
            #We load the successor batchjob and transplant its command and stack
            #into the current batchjob so that it can be run
            #as if it were a command that was part of the current batchjob.
            #We can then delete the successor batchjob in the jobStore, as it is
            #wholly incorporated into the current batchjob.
            ##########################################

            #Remove the successor batchjob
            batchjob.stack.pop()

            #Load the successor batchjob
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cpu == successorCpu
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID

            #Transplant the command and stack to the current batchjob
            batchjob.command = successorJob.command
            batchjob.stack += successorJob.stack
            assert batchjob.memory >= successorJob.memory
            assert batchjob.cpu >= successorJob.cpu

            #Checkpoint the batchjob and delete the successorJob
            batchjob.jobsToDelete = [successorJob.jobStoreID]
            jobStore.update(batchjob)
            jobStore.delete(successorJob.jobStoreID)

            logger.debug("Starting the next batchjob")

        ##########################################
        #Finish up the stats
        ##########################################

        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            m = ET.SubElement(stats, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(stats))
        elif len(messages) > 0:  #No stats, but still need to report log messages
            l = ET.Element("worker")
            m = ET.SubElement(l, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(l))

        logger.info(
            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
            time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except:  #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error(
            "Exiting the worker because of a failed batchjob on host %s",
            socket.gethostname())
        batchjob = jobStore.load(jobStoreID)
        batchjob.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################

    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)

    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)

    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)

    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr

    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)

    #Now our file handles are in exactly the state they were in before.

    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        batchjob.setLogFile(tempWorkerLogPath, jobStore)
        os.remove(tempWorkerLogPath)
        jobStore.update(batchjob)

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and batchjob.command == None and len(
            batchjob.stack) == 0:
        #We can now safely get rid of the batchjob
        jobStore.delete(batchjob.jobStoreID)
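
The while-loop in the listing above only chains to the next job when it is the sole successor, has no additional predecessors, fits within the resources already granted to the worker, and the worker has time left. The following is a hedged sketch of that decision as a standalone predicate; the Successor record and its field names are illustrative stand-ins for the tuples kept on the job stack, not Toil types.

from collections import namedtuple

# Illustrative stand-in for one entry on the job stack.
Successor = namedtuple("Successor", "jobStoreID memory cpu disk predecessorID")

def can_chain(current, successors, elapsed, max_worker_time):
    """Decide whether the worker may run the next job itself.

    `current` is anything with memory/cpu/disk attributes (e.g. the job just run)."""
    if elapsed > max_worker_time:
        return False                  # worker has used up its time budget
    if len(successors) != 1:
        return False                  # nothing left, or siblings that must run in parallel
    nxt = successors[0]
    if nxt.predecessorID is not None:
        return False                  # jobs with several predecessors go back to the leader
    # The successor must fit inside the resources already granted to this worker.
    return (nxt.memory <= current.memory and
            nxt.cpu <= current.cpu and
            nxt.disk <= current.disk)
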
Example #6
0
def main():
    ########################################## 
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        # FIXME: prepending to sys.path should fix #103
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import getTempDirectory
    from toil.lib.bioio import makeSubDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config

    ##########################################
    #Load the environment for the batchjob
    ##########################################
    
    #First load the environment for the batchjob.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.attrib["log_level"])

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in.
    localWorkerTempDir = getTempDirectory()
    
    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?    
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)
    
    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)
    
    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the batchjob
        ##########################################
        
        batchjob = jobStore.load(jobStoreID)
        logger.debug("Parsed batchjob")
        
        ##########################################
        #Cleanup from any earlier invocation of the batchjob
        ##########################################
        
        if batchjob.command == None:
            while len(batchjob.stack) > 0:
                jobs = batchjob.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been run,
                #as jobs are, as far as possible, read only in the leader.
                batchjob.stack.pop()
                
                
        #This cleans the old log file which may 
        #have been left if the batchjob is being retried after a batchjob failure.
        if batchjob.logJobStoreFileID != None:
            batchjob.clearLogFile(jobStore)
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.attrib.has_key("stats"):
            startTime = time.time()
            startClock = getTotalCpuTime()
            stats = ET.Element("worker")
        else:
            stats = None

        startTime = time.time() 
        while True:
            ##########################################
            #Run the batchjob, if there is one
            ##########################################
            
            if batchjob.command != None:
                if batchjob.command[:11] == "scriptTree ":
                    #Make a temporary file directory for the job
                    localTempDir = makeSubDir(os.path.join(localWorkerTempDir, "localTempDir"))
                    
                    #Is a job command
                    messages = loadJob(batchjob.command, jobStore)._execute(batchjob=batchjob,
                                    stats=stats, localTempDir=localTempDir, 
                                    jobStore=jobStore)
                    
                    #Remove the temporary file directory
                    shutil.rmtree(localTempDir)
    
                else: #Is another command (running outside of jobs may be deprecated)
                    system(batchjob.command)
                    messages = []
            else:
                #The command may be none, in which case
                #the batchjob is just a shell ready to be deleted
                assert len(batchjob.stack) == 0
                messages = []
                break
            
            ##########################################
            #Establish if we can run another batchjob within the worker
            ##########################################
            
            #Exceeded the amount of time the worker is allowed to run for so quit
            if time.time() - startTime > float(config.attrib["job_time"]):
                logger.debug("We are breaking because the maximum time the batchjob should run for has been exceeded")
                break

            #No more jobs to run so quit
            if len(batchjob.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = batchjob.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the batchjob to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCpu, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > batchjob.memory:
                logger.debug("We need more memory for the next batchjob, so finishing")
                break
            if successorCpu > batchjob.cpu:
                logger.debug("We need more cpus for the next batchjob, so finishing")
                break
            if successorsDisk > batchjob.disk:
                logger.debug("We need more disk for the next batchjob, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The batchjob has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor batchjob.
            #We load the successor batchjob and transplant its command and stack
            #into the current batchjob so that it can be run
            #as if it were a command that was part of the current batchjob.
            #We can then delete the successor batchjob in the jobStore, as it is
            #wholly incorporated into the current batchjob.
            ##########################################
            
            #Remove the successor batchjob
            batchjob.stack.pop()
            
            #Load the successor batchjob
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cpu == successorCpu
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current batchjob
            batchjob.command = successorJob.command
            batchjob.stack += successorJob.stack
            assert batchjob.memory >= successorJob.memory
            assert batchjob.cpu >= successorJob.cpu
            
            #Checkpoint the batchjob and delete the successorJob
            batchjob.jobsToDelete = [ successorJob.jobStoreID ]
            jobStore.update(batchjob)
            jobStore.delete(successorJob.jobStoreID)
            
            logger.debug("Starting the next batchjob")
        
        ##########################################
        #Finish up the stats
        ##########################################

        if stats != None:
            totalCpuTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            stats.attrib["time"] = str(time.time() - startTime)
            stats.attrib["clock"] = str(totalCpuTime - startClock)
            stats.attrib["memory"] = str(totalMemoryUsage)
            m = ET.SubElement(stats, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(stats))
        elif len(messages) > 0: #No stats, but still need to report log messages
            l = ET.Element("worker")
            m = ET.SubElement(l, "messages")
            for message in messages:
                ET.SubElement(m, "message").text = message
            jobStore.writeStatsAndLogging(ET.tostring(l))
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed batchjob on host %s", socket.gethostname())
        batchjob = jobStore.load(jobStoreID)
        batchjob.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        batchjob.setLogFile(tempWorkerLogPath, jobStore)
        os.remove(tempWorkerLogPath)
        jobStore.update(batchjob)

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and batchjob.command == None and len(batchjob.stack) == 0:
        #We can now safely get rid of the batchjob
        jobStore.delete(batchjob.jobStoreID)
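
When a worker ships its log back to the job store (see the failure path in the first listing above and in the next listing), it caps the amount of log text using a byte limit: a positive limit keeps only the tail of the file, a negative one keeps only the head, and zero disables the cap. Here is a small sketch of that convention; the helper name read_capped_log is assumed for illustration and does not exist in Toil.

import os

def read_capped_log(path, limit):
    """Return at most abs(limit) bytes of a worker log.

    limit > 0: keep the last `limit` bytes; limit < 0: keep the first
    abs(limit) bytes; limit == 0: no cap. Bytes are used (not characters)
    so we can seek relative to the end of the file."""
    with open(path, "rb") as f:
        if limit > 0 and os.path.getsize(path) > limit:
            f.seek(-limit, os.SEEK_END)   # jump to the last `limit` bytes
            return f.read()
        if limit < 0:
            return f.read(-limit)         # just read the first abs(limit) bytes
        return f.read()
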
Example #7
0
def workerScript(jobStore,
                 config,
                 jobName,
                 jobStoreID,
                 redirectOutputToLogFile=True):
    """
    Worker process script, runs a job. 
    
    :param str jobName: The "job name" (a user friendly name) of the job to be run
    :param str jobStoreLocator: Specifies the job store to use
    :param str jobStoreID: The job store ID of the job to be run
    
    :return int: 1 if a job failed, or 0 if all jobs succeeded
    """

    configureRootLogger()
    setLogLevel(config.logLevel)

    ##########################################
    #Create the worker killer, if requested
    ##########################################

    logFileByteReportLimit = config.maxLogFileSize

    if config.badWorker > 0 and random.random() < config.badWorker:
        # We need to kill the process we are currently in, to simulate worker
        # failure. We don't want to just send SIGKILL, because we can't tell
        # that from a legitimate OOM on our CI runner. We're going to send
        # SIGUSR1 so our terminations are distinctive, and then SIGKILL if that
        # didn't stick. We definitely don't want to do this from *within* the
        # process we are trying to kill, so we fork off. TODO: We can still
        # leave the killing code running after the main Toil flow is done, but
        # since it's now in a process instead of a thread, the main Python
        # process won't wait around for its timeout to expire. I think this is
        # better than the old thread-based way where all of Toil would wait
        # around to be killed.

        killTarget = os.getpid()
        sleepTime = config.badWorkerFailInterval * random.random()
        if os.fork() == 0:
            # We are the child
            # Let the parent run some amount of time
            time.sleep(sleepTime)
            # Kill it gently
            os.kill(killTarget, signal.SIGUSR1)
            # Wait for that to stick
            time.sleep(0.01)
            try:
                # Kill it harder. Hope the PID hasn't already been reused.
                # If we succeeded the first time, this will OSError
                os.kill(killTarget, signal.SIGKILL)
            except OSError:
                pass
            # Exit without doing any of Toil's cleanup
            os._exit(0)

        # We don't need to reap the child. Either it kills us, or we finish
        # before it does. Either way, init will have to clean it up for us.

    ##########################################
    #Load the environment for the job
    ##########################################

    #First load the environment for the job.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = safeUnpickleFromStream(fileHandle)
    env_reject = {
        "TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE", "HOME", "LOGNAME", "USER",
        "DISPLAY", "JAVA_HOME"
    }
    for i in environment:
        if i == "PATH":
            # Handle path specially. Sometimes e.g. leader may not include
            # /bin, but the Toil appliance needs it.
            if i in os.environ and os.environ[i] != '':
                # Use the provided PATH and then the local system's PATH
                os.environ[i] = environment[i] + ':' + os.environ[i]
            else:
                # Use the provided PATH only
                os.environ[i] = environment[i]
        elif i not in env_reject:
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    toilWorkflowDir = Toil.getLocalWorkflowDir(config.workflowID,
                                               config.workDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################

    # Dir to put all this worker's temp files in.
    localWorkerTempDir = tempfile.mkdtemp(dir=toilWorkflowDir)
    os.chmod(localWorkerTempDir, 0o755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>

    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    # Do we even want to redirect output? Let the config make us not do it.
    redirectOutputToLogFile = redirectOutputToLogFile and not config.disableWorkerOutputCapture

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")

    if redirectOutputToLogFile:
        # Announce that we are redirecting logging, and where it will now go.
        # This is important if we are trying to manually trace a faulty worker invocation.
        logger.info("Redirecting logging to %s", tempWorkerLogPath)
        sys.stdout.flush()
        sys.stderr.flush()

        # Save the original stdout and stderr (by opening new file descriptors
        # to the same files)
        origStdOut = os.dup(1)
        origStdErr = os.dup(2)

        # Open the file to send stdout/stderr to.
        logFh = os.open(tempWorkerLogPath,
                        os.O_WRONLY | os.O_CREAT | os.O_APPEND)

        # Replace standard output with a descriptor for the log file
        os.dup2(logFh, 1)

        # Replace standard error with a descriptor for the log file
        os.dup2(logFh, 2)

        # Since we only opened the file once, all the descriptors duped from
        # the original will share offset information, and won't clobber each
        # others' writes. See <http://stackoverflow.com/a/5284108/402891>. This
        # shouldn't matter, since O_APPEND seeks to the end of the file before
        # every write, but maybe there's something odd going on...

        # Close the descriptor we used to open the file
        os.close(logFh)

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    jobAttemptFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    statsDict.workers.logsToMaster = []
    blockFn = lambda: True
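    # blockFn is called later to wait for the previous job's asynchronous commit
    # to finish; it starts as a no-op because nothing has been committed yet.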
    listOfJobs = [jobName]
    job = None
    try:

        #Put a message at the top of the log, just to make sure it's working.
        logger.info("---TOIL WORKER OUTPUT LOG---")
        sys.stdout.flush()

        logProcessContext(config)

        ##########################################
        #Connect to the deferred function system
        ##########################################
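        # Deferred functions are cleanup callbacks that jobs register so they still
        # run if the worker dies unexpectedly; the manager is set up using the
        # shared workflow directory passed in here.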
        deferredFunctionManager = DeferredFunctionManager(toilWorkflowDir)

        ##########################################
        #Load the JobDescription
        ##########################################

        jobDesc = jobStore.load(jobStoreID)
        listOfJobs[0] = str(jobDesc)
        logger.debug("Parsed job description")

        ##########################################
        #Cleanup from any earlier invocation of the job
        ##########################################

        if jobDesc.command == None:
            logger.debug("Job description has no body to run.")
            # Cleanup jobs already finished
            predicate = lambda jID: jobStore.exists(jID)
            jobDesc.filterSuccessors(predicate)
            jobDesc.filterServiceHosts(predicate)
            logger.debug(
                "Cleaned up any references to completed successor jobs")

        # This cleans the old log file which may
        # have been left if the job is being retried after a job failure.
        oldLogFile = jobDesc.logJobStoreFileID
        if oldLogFile != None:
            jobDesc.logJobStoreFileID = None
            jobStore.update(jobDesc)  #Update first, before deleting any files
            jobStore.deleteFile(oldLogFile)

        ##########################################
        # If a checkpoint exists, restart from the checkpoint
        ##########################################

        if isinstance(
                jobDesc,
                CheckpointJobDescription) and jobDesc.checkpoint is not None:
            # The job is a checkpoint, and is being restarted after previously completing
            logger.debug("Job is a checkpoint")
            # If the checkpoint still has extant successors or services, its
            # subtree didn't complete properly. We handle the restart of the
            # checkpoint here, removing its previous subtree.
            if next(jobDesc.successorsAndServiceHosts(), None) is not None:
                logger.debug("Checkpoint has failed; restoring")
                # Reduce the try count
                assert jobDesc.remainingTryCount >= 0
                jobDesc.remainingTryCount = max(0,
                                                jobDesc.remainingTryCount - 1)
                jobDesc.restartCheckpoint(jobStore)
            # Otherwise, the job and successors are done, and we can cleanup stuff we couldn't clean
            # because of the job being a checkpoint
            else:
                logger.debug(
                    "The checkpoint job seems to have completed okay; removing any leftover checkpoint files marked for deletion."
                )
                #Delete any remnant files
                list(
                    map(
                        jobStore.deleteFile,
                        list(
                            filter(jobStore.fileExists,
                                   jobDesc.checkpointFilesToDelete))))

        ##########################################
        #Setup the stats, if requested
        ##########################################

        if config.stats:
            startClock = getTotalCpuTime()

        startTime = time.time()
        while True:
            ##########################################
            #Run the job body, if there is one
            ##########################################

            logger.info("Working on job %s", jobDesc)

            if jobDesc.command is not None:
                assert jobDesc.command.startswith("_toil ")
                logger.debug("Got a command to run: %s" % jobDesc.command)
                # Load the job. It will use the same JobDescription we have been using.
                job = Job.loadJob(jobStore, jobDesc)
                if isinstance(jobDesc, CheckpointJobDescription):
                    # If it is a checkpoint job, save the command
                    jobDesc.checkpoint = jobDesc.command

                logger.info("Loaded body %s from description %s", job, jobDesc)

                # Create a fileStore object for the job
                fileStore = AbstractFileStore.createFileStore(
                    jobStore,
                    jobDesc,
                    localWorkerTempDir,
                    blockFn,
                    caching=not config.disableCaching)
                with job._executor(stats=statsDict if config.stats else None,
                                   fileStore=fileStore):
                    with deferredFunctionManager.open() as defer:
                        with fileStore.open(job):
                            # Get the next block function to wait on committing this job
                            blockFn = fileStore.waitForCommit

                            # Run the job, save new successors, and set up
                            # locally (but don't commit) successor
                            # relationships and job completion.
                            # Pass everything as name=value because Cactus
                            # likes to override _runner when it shouldn't and
                            # it needs some hope of finding the arguments it
                            # wants across multiple Toil versions. We also
                            # still pass a jobGraph argument to placate old
                            # versions of Cactus.
                            job._runner(jobGraph=None,
                                        jobStore=jobStore,
                                        fileStore=fileStore,
                                        defer=defer)

                # Accumulate messages from this job & any subsequent chained jobs
                statsDict.workers.logsToMaster += fileStore.loggingMessages

                logger.info("Completed body for %s", jobDesc)

            else:
                #The command may be none, in which case
                #the JobDescription is either a shell ready to be deleted or has
                #been scheduled after a failure to cleanup
                logger.debug("No user job to run, so finishing")
                break

            if AbstractFileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another job within the worker
            ##########################################
            successor = nextChainable(jobDesc, jobStore, config)
            if successor is None or config.disableChaining:
                # Can't chain any more jobs. We are going to stop.

                logger.info("Not chaining from job %s", jobDesc)

                # TODO: Somehow the commit happens even if we don't start it here.

                break

            logger.info("Chaining from %s to %s", jobDesc, successor)

            ##########################################
            # We have a single successor job that is not a checkpoint job. We
            # reassign the ID of the current JobDescription to the successor.
            # We can then delete the successor JobDescription (under its old
            # ID) in the jobStore, as it is wholly incorporated into the
            # current one.
            ##########################################

            # Make sure nothing has gone wrong and we can really chain
            assert jobDesc.memory >= successor.memory
            assert jobDesc.cores >= successor.cores

            # Save the successor's original ID, so we can clean it (and its
            # body) up after we finish executing it.
            successorID = successor.jobStoreID

            # add the successor to the list of jobs run
            listOfJobs.append(str(successor))

            # Now we need to become that successor, under the original ID.
            successor.replace(jobDesc)
            jobDesc = successor

            # Problem: successor's job body is a file that will be cleaned up
            # when we delete the successor job by ID. We can't just move it. So
            # we need to roll up the deletion of the successor job by ID with
            # the deletion of the job ID we're currently working on.
            jobDesc.jobsToDelete.append(successorID)

            # Clone the now-current JobDescription (which used to be the successor).
            # TODO: Why??? Can we not?
            jobDesc = copy.deepcopy(jobDesc)

            # Build a fileStore to update the job and commit the replacement.
            # TODO: can we have a commit operation without an entire FileStore???
            fileStore = AbstractFileStore.createFileStore(
                jobStore,
                jobDesc,
                localWorkerTempDir,
                blockFn,
                caching=not config.disableCaching)

            # Update blockFn to wait for that commit operation.
            blockFn = fileStore.waitForCommit

            # This will update the job once the previous job is done updating
            fileStore.startCommit(jobState=True)

            # Clone the current job description again, so that further updates
            # to it (such as new successors being added when it runs) occur
            # after the commit process we just kicked off, and aren't committed
            # early or partially.
            jobDesc = copy.deepcopy(jobDesc)

            logger.debug("Starting the next job")

        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)

        # log the worker log path here so that if the file is truncated the path can still be found
        if redirectOutputToLogFile:
            logger.info(
                "Worker log can be found at %s. Set --cleanWorkDir to retain this log",
                localWorkerTempDir)

        logger.info(
            "Finished running the chain of jobs on this node, we ran for a total of %f seconds",
            time.time() - startTime)

    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except:  #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed job on host %s",
                     socket.gethostname())
        AbstractFileStore._terminateEvent.set()

    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ##########################################

    blockFn()

    ##########################################
    #All the asynchronous worker/update threads must be finished now,
    #so safe to test if they completed okay
    ##########################################

    if AbstractFileStore._terminateEvent.isSet():
        # Something has gone wrong.

        # Clobber any garbage state we have for this job from failing with
        # whatever good state is still stored in the JobStore
        jobDesc = jobStore.load(jobStoreID)
        # Remember that we failed
        jobAttemptFailed = True

    ##########################################
    #Cleanup
    ##########################################

    # Close the worker logging
    # Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    if redirectOutputToLogFile:
        # Flush at the OS level
        os.fsync(1)
        os.fsync(2)

        # Close redirected stdout and replace with the original standard output.
        os.dup2(origStdOut, 1)

        # Close redirected stderr and replace with the original standard error.
        os.dup2(origStdErr, 2)

        # sys.stdout and sys.stderr don't need to be modified at all. We don't
        # need to call redirectLoggerStreamHandlers since they still log to
        # sys.stderr

        # Close our extra handles to the original standard output and standard
        # error streams, so we don't leak file handles.
        os.close(origStdOut)
        os.close(origStdErr)

    # Now our file handles are in exactly the state they were in before.

    # Copy back the log file to the global dir, if needed.
    # Note that we work with bytes instead of characters so we can seek
    # relative to the end (since Python won't decode Unicode backward, or even
    # interpret seek offsets in characters for us). TODO: We may get invalid or
    # just different Unicode by breaking up a character at the boundary!
    if jobAttemptFailed and redirectOutputToLogFile:
        jobDesc.logJobStoreFileID = jobStore.getEmptyFileStoreID(
            jobDesc.jobStoreID, cleanup=True)
        jobDesc.chainedJobs = listOfJobs
        with jobStore.updateFileStream(jobDesc.logJobStoreFileID) as w:
            with open(tempWorkerLogPath, 'rb') as f:
                if os.path.getsize(
                        tempWorkerLogPath) > logFileByteReportLimit != 0:
                    if logFileByteReportLimit > 0:
                        f.seek(-logFileByteReportLimit,
                               2)  # seek to last tooBig bytes of file
                    elif logFileByteReportLimit < 0:
                        f.seek(logFileByteReportLimit,
                               0)  # seek to first tooBig bytes of file
                # Dump the possibly-invalid-Unicode bytes into the log file
                w.write(f.read())  # TODO load file using a buffer
        # Commit log file reference back to JobStore
        jobStore.update(jobDesc)

    elif ((debugging or (config.writeLogsFromAllJobs
                         and not jobName.startswith(CWL_INTERNAL_JOBS)))
          and redirectOutputToLogFile):  # write log messages
        with open(tempWorkerLogPath, 'rb') as logFile:
            if os.path.getsize(
                    tempWorkerLogPath) > logFileByteReportLimit != 0:
                if logFileByteReportLimit > 0:
                    logFile.seek(-logFileByteReportLimit,
                                 2)  # seek to last tooBig bytes of file
                elif logFileByteReportLimit < 0:
                    logFile.seek(logFileByteReportLimit,
                                 0)  # seek to first tooBig bytes of file
            # Make sure lines are Unicode so they can be JSON serialized as part of the dict.
            # We may have damaged the Unicode text by cutting it at an arbitrary byte, so drop
            # any undecodable characters.
            logMessages = [
                line.decode('utf-8', errors='ignore')
                for line in logFile.read().splitlines()
            ]
        statsDict.logs.names = listOfJobs
        statsDict.logs.messages = logMessages

    if (debugging or config.stats or statsDict.workers.logsToMaster
        ) and not jobAttemptFailed:  # We have stats/logging to report back
        jobStore.writeStatsAndLogging(
            json.dumps(statsDict, ensure_ascii=True).encode())

    #Remove the temp dir
    cleanUp = config.cleanWorkDir
    if cleanUp == 'always' or (cleanUp == 'onSuccess' and
                               not jobAttemptFailed) or (cleanUp == 'onError'
                                                         and jobAttemptFailed):
        shutil.rmtree(localWorkerTempDir)

    #This must happen after the log file is done with, else there is no place to put the log
    if (not jobAttemptFailed) and jobDesc.command == None and next(
            jobDesc.successorsAndServiceHosts(), None) is None:
        # We can now safely get rid of the JobDescription, and all jobs it chained up
        for otherID in jobDesc.jobsToDelete:
            jobStore.delete(otherID)
        jobStore.delete(jobDesc.jobStoreID)

    if jobAttemptFailed:
        return 1
    else:
        return 0
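
The listing above simulates worker failure by forking a helper process that signals the worker (SIGUSR1 first, so the death is distinguishable from an OOM kill, then SIGKILL), instead of the daemon-thread approach used in the older listing below. The following is a self-contained sketch of that fork-and-signal pattern; the function name and default delay are illustrative assumptions.

import os
import random
import signal
import time

def schedule_random_kill(max_delay=10.0):
    """Fork a child that kills the calling process at a random moment."""
    target = os.getpid()
    delay = random.random() * max_delay
    if os.fork() == 0:
        # We are the child: sleep, signal the parent, then vanish.
        time.sleep(delay)
        os.kill(target, signal.SIGUSR1)       # distinctive, catchable signal first
        time.sleep(0.01)
        try:
            os.kill(target, signal.SIGKILL)   # make sure the kill sticks
        except OSError:
            pass                              # parent already exited
        os._exit(0)                           # skip any Python-level cleanup in the child
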
Example #8
0
def main():
    logging.basicConfig()

    ##########################################
    #Import necessary modules 
    ##########################################
    
    # This is assuming that worker.py is at a path ending in "/toil/worker.py".
    sourcePath = os.path.dirname(os.path.dirname(__file__))
    if sourcePath not in sys.path:
        sys.path.append(sourcePath)
    
    #Now we can import all the necessary functions
    from toil.lib.bioio import setLogLevel
    from toil.lib.bioio import getTotalCpuTime
    from toil.lib.bioio import getTotalCpuTimeAndMemoryUsage
    from toil.lib.bioio import makePublicDir
    from toil.lib.bioio import system
    from toil.common import loadJobStore
    from toil.job import Job
    
    ########################################## 
    #Input args
    ##########################################
    
    jobStoreString = sys.argv[1]
    jobStoreID = sys.argv[2]
    
    ##########################################
    #Load the jobStore/config file
    ##########################################
    
    jobStore = loadJobStore(jobStoreString)
    config = jobStore.config
    
    ##########################################
    #Create the worker killer, if requested
    ##########################################

    if config.badWorker > 0 and random.random() < config.badWorker:
        def badWorker():
            #Kill the worker process at a random point within the fail interval
            time.sleep(config.badWorkerFailInterval * random.random())
            os.kill(os.getpid(), signal.SIGKILL)  # or signal.SIGINT
            #TODO: FIX OCCASIONAL DEADLOCK WITH SIGINT (tested on single machine)
        t = Thread(target=badWorker)
        t.daemon = True
        t.start()

    ##########################################
    #Load the environment for the jobWrapper
    ##########################################
    
    #First load the environment for the jobWrapper.
    with jobStore.readSharedFileStream("environment.pickle") as fileHandle:
        environment = cPickle.load(fileHandle)
    for i in environment:
        if i not in ("TMPDIR", "TMP", "HOSTNAME", "HOSTTYPE"):
            os.environ[i] = environment[i]
    # sys.path is used by __import__ to find modules
    if "PYTHONPATH" in environment:
        for e in environment["PYTHONPATH"].split(':'):
            if e != '':
                sys.path.append(e)

    setLogLevel(config.logLevel)

    tempRootDir = config.workDir
    if tempRootDir is not None and not os.path.exists(tempRootDir):
        raise RuntimeError("The temporary directory specified by workDir: %s does not exist" % tempRootDir)

    ##########################################
    #Setup the temporary directories.
    ##########################################
        
    #Dir to put all the temp files in. If tempRootDir is None, the tempfile module
    #consults environment variables to determine where to put the temp dir.
    localWorkerTempDir = tempfile.mkdtemp(dir=tempRootDir)
    os.chmod(localWorkerTempDir, 0755)

    ##########################################
    #Setup the logging
    ##########################################

    #This is mildly tricky because we don't just want to
    #redirect stdout and stderr for this Python process; we want to redirect it
    #for this process and all children. Consequently, we can't just replace
    #sys.stdout and sys.stderr; we need to mess with the underlying OS-level
    #file descriptors. See <http://stackoverflow.com/a/11632982/402891>
    
    #When we start, standard input is file descriptor 0, standard output is
    #file descriptor 1, and standard error is file descriptor 2.

    #What file do we want to point FDs 1 and 2 to?
    tempWorkerLogPath = os.path.join(localWorkerTempDir, "worker_log.txt")
    
    #Save the original stdout and stderr (by opening new file descriptors to the
    #same files)
    origStdOut = os.dup(1)
    origStdErr = os.dup(2)

    #Open the file to send stdout/stderr to.
    logFh = os.open(tempWorkerLogPath, os.O_WRONLY | os.O_CREAT | os.O_APPEND)

    #Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)
    
    #Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    
    #Since we only opened the file once, all the descriptors duped from the
    #original will share offset information, and won't clobber each others'
    #writes. See <http://stackoverflow.com/a/5284108/402891>. This shouldn't
    #matter, since O_APPEND seeks to the end of the file before every write, but
    #maybe there's something odd going on...
    
    #Close the descriptor we used to open the file
    os.close(logFh)

    for handler in list(logger.handlers): #Remove old handlers
        logger.removeHandler(handler)
    
    #Add the new handler. The sys.stderr stream has been redirected by swapping
    #the file descriptor out from under it.
    logger.addHandler(logging.StreamHandler(sys.stderr))

    debugging = logging.getLogger().isEnabledFor(logging.DEBUG)
    ##########################################
    #Worker log file trapped from here on in
    ##########################################

    workerFailed = False
    statsDict = MagicExpando()
    statsDict.jobs = []
    messages = []
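    # blockFn waits for the previous job's asynchronous writes/updates to finish and
    # cleanCacheFn cleans the previous job's local cache; both start as no-ops because
    # no job has run yet.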
    blockFn = lambda : True
    cleanCacheFn = lambda x : True
    try:

        #Put a message at the top of the log, just to make sure it's working.
        print "---TOIL WORKER OUTPUT LOG---"
        sys.stdout.flush()
        
        #Log the number of open file descriptors so we can tell if we're leaking
        #them.
        logger.debug("Next available file descriptor: {}".format(
            nextOpenDescriptor()))
    
        ##########################################
        #Load the jobWrapper
        ##########################################
        
        jobWrapper = jobStore.load(jobStoreID)
        logger.debug("Parsed jobWrapper")
        
        ##########################################
        #Cleanup from any earlier invocation of the jobWrapper
        ##########################################
        
        if jobWrapper.command == None:
            while len(jobWrapper.stack) > 0:
                jobs = jobWrapper.stack[-1]
                #If the jobs still exist they have not been run, so break
                if jobStore.exists(jobs[0][0]):
                    break
                #However, if they are gone then we can remove them from the stack.
                #This is the only way to flush successors that have previously been
                #run, as jobs are, as far as possible, read-only in the leader.
                jobWrapper.stack.pop()
                
        #This cleans up the old log file, which may have been left behind if the
        #jobWrapper is being retried after a failure.
        oldLogFile = jobWrapper.logJobStoreFileID
        jobWrapper.logJobStoreFileID = None
        jobStore.update(jobWrapper) #Update first, before deleting the file
        if oldLogFile != None:
            jobStore.delete(oldLogFile)
            
        #Make a temporary file directory for the jobWrapper
        localTempDir = makePublicDir(os.path.join(localWorkerTempDir, "localTempDir"))
    
        ##########################################
        #Setup the stats, if requested
        ##########################################
        
        if config.stats:
            startClock = getTotalCpuTime()

        startTime = time.time()
        while True:
            ##########################################
            #Run the jobWrapper, if there is one
            ##########################################
            
            if jobWrapper.command != None:
                if jobWrapper.command.startswith( "_toil " ):
                    #Load the job
                    job = Job._loadJob(jobWrapper.command, jobStore)
                    
                    #Cleanup the cache from the previous job
                    cleanCacheFn(job.effectiveRequirements(jobStore.config).cache)
                    
                    #Create a fileStore object for the job
                    fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, 
                                              blockFn)
                    #Get the next block function and list that will contain any messages
                    blockFn = fileStore._blockFn
                    messages = fileStore.loggingMessages

                    job._execute(jobWrapper=jobWrapper,
                                 stats=statsDict if config.stats else None,
                                 localTempDir=localTempDir,
                                 jobStore=jobStore,
                                 fileStore=fileStore)

                    #Set the clean cache function
                    cleanCacheFn = fileStore._cleanLocalTempDir
                    
                else: #Is another command (running outside of jobs may be deprecated)
                    #Cleanup the cache from the previous job
                    cleanCacheFn(0)
                    
                    system(jobWrapper.command)
                    #Set a dummy clean cache fn
                    cleanCacheFn = lambda x : None
            else:
                #The command may be none, in which case
                #the jobWrapper is either a shell ready to be deleted or has 
                #been scheduled after a failure to cleanup
                break
            
            if Job.FileStore._terminateEvent.isSet():
                raise RuntimeError("The termination flag is set")

            ##########################################
            #Establish if we can run another jobWrapper within the worker
            ##########################################
            
            #No more jobs to run so quit
            if len(jobWrapper.stack) == 0:
                break
            
            #Get the next set of jobs to run
            jobs = jobWrapper.stack[-1]
            assert len(jobs) > 0
            
            #If there are 2 or more jobs to run in parallel we quit
            if len(jobs) >= 2:
                logger.debug("No more jobs can run in series by this worker,"
                            " it's got %i children", len(jobs)-1)
                break
            
            #We check the requirements of the jobWrapper to see if we can run it
            #within the current worker
            successorJobStoreID, successorMemory, successorCores, successorsDisk, successorPredecessorID = jobs[0]
            if successorMemory > jobWrapper.memory:
                logger.debug("We need more memory for the next jobWrapper, so finishing")
                break
            if successorCores > jobWrapper.cores:
                logger.debug("We need more cores for the next jobWrapper, so finishing")
                break
            if successorsDisk > jobWrapper.disk:
                logger.debug("We need more disk for the next jobWrapper, so finishing")
                break
            if successorPredecessorID != None: 
                logger.debug("The jobWrapper has multiple predecessors, we must return to the leader.")
                break
          
            ##########################################
            #We have a single successor jobWrapper.
            #We load the successor jobWrapper and transplant its command and stack
            #into the current jobWrapper so that it can be run
            #as if it were a command that was part of the current jobWrapper.
            #We can then delete the successor jobWrapper in the jobStore, as it is
            #wholly incorporated into the current jobWrapper.
            ##########################################
            
            #Clone the jobWrapper and its stack
            jobWrapper = copy.deepcopy(jobWrapper)
            
            #Remove the successor jobWrapper
            jobWrapper.stack.pop()
            
            #Load the successor jobWrapper
            successorJob = jobStore.load(successorJobStoreID)
            #These should all match up
            assert successorJob.memory == successorMemory
            assert successorJob.cores == successorCores
            assert successorJob.predecessorsFinished == set()
            assert successorJob.predecessorNumber == 1
            assert successorJob.command != None
            assert successorJobStoreID == successorJob.jobStoreID
            
            #Transplant the command and stack to the current jobWrapper
            jobWrapper.command = successorJob.command
            jobWrapper.stack += successorJob.stack
            assert jobWrapper.memory >= successorJob.memory
            assert jobWrapper.cores >= successorJob.cores
            
            #Build a fileStore to update the job
            fileStore = Job.FileStore(jobStore, jobWrapper, localTempDir, blockFn)
            
            #Update blockFn
            blockFn = fileStore._blockFn
            
            #Add successorJob to those to be deleted
            fileStore.jobsToDelete.add(successorJob.jobStoreID)
            
            #This will update the job once the previous job is done
            fileStore._updateJobWhenDone()            
            
            #Clone the jobWrapper and its stack again, so that updates to it do 
            #not interfere with this update
            jobWrapper = copy.deepcopy(jobWrapper)
            
            logger.debug("Starting the next jobWrapper")
        
        ##########################################
        #Finish up the stats
        ##########################################
        if config.stats:
            totalCPUTime, totalMemoryUsage = getTotalCpuTimeAndMemoryUsage()
            statsDict.workers.time = str(time.time() - startTime)
            statsDict.workers.clock = str(totalCPUTime - startClock)
            statsDict.workers.memory = str(totalMemoryUsage)
            statsDict.workers.log = messages
        
        logger.info("Finished running the chain of jobs on this node, we ran for a total of %f seconds", time.time() - startTime)
    
    ##########################################
    #Trapping where worker goes wrong
    ##########################################
    except: #Case that something goes wrong in worker
        traceback.print_exc()
        logger.error("Exiting the worker because of a failed jobWrapper on host %s", socket.gethostname())
        Job.FileStore._terminateEvent.set()
    
    ##########################################
    #Wait for the asynchronous chain of writes/updates to finish
    ########################################## 
       
    blockFn() 
    
    ##########################################
    #All the asynchronous worker/update threads must be finished now, 
    #so safe to test if they completed okay
    ########################################## 
    
    if Job.FileStore._terminateEvent.isSet():
        jobWrapper = jobStore.load(jobStoreID)
        jobWrapper.setupJobAfterFailure(config)
        workerFailed = True

    ##########################################
    #Cleanup
    ##########################################
    
    #Close the worker logging
    #Flush at the Python level
    sys.stdout.flush()
    sys.stderr.flush()
    #Flush at the OS level
    os.fsync(1)
    os.fsync(2)
    
    #Close redirected stdout and replace with the original standard output.
    os.dup2(origStdOut, 1)
    
    #Close redirected stderr and replace with the original standard error.
    os.dup2(origStdErr, 2)
    
    #sys.stdout and sys.stderr don't need to be modified at all. We don't need
    #to call redirectLoggerStreamHandlers since they still log to sys.stderr
    
    #Close our extra handles to the original standard output and standard error
    #streams, so we don't leak file handles.
    os.close(origStdOut)
    os.close(origStdErr)
    
    #Now our file handles are in exactly the state they were in before.
    
    #Copy back the log file to the global dir, if needed
    if workerFailed:
        truncateFile(tempWorkerLogPath)
        jobWrapper.logJobStoreFileID = jobStore.writeFile(tempWorkerLogPath, jobWrapper.jobStoreID)
        os.remove(tempWorkerLogPath)
        jobStore.update(jobWrapper)
    elif debugging: # write log messages
        truncateFile(tempWorkerLogPath)
        with open(tempWorkerLogPath, 'r') as logFile:
            logMessages = logFile.read().splitlines()
        statsDict.logs = [Expando(jobStoreID=jobStoreID, text=logMessage) for logMessage in logMessages]

    if (debugging or config.stats or messages) and not workerFailed: # We have stats/logging to report back
        jobStore.writeStatsAndLogging(json.dumps(statsDict))

    #Remove the temp dir
    shutil.rmtree(localWorkerTempDir)
    
    #This must happen after the log file is done with, else there is no place to put the log
    if (not workerFailed) and jobWrapper.command == None and len(jobWrapper.stack) == 0:
        #We can now safely get rid of the jobWrapper
        jobStore.delete(jobWrapper.jobStoreID)
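
The stdout/stderr capture in the examples above swaps the OS-level file descriptors 1 and 2 rather than reassigning sys.stdout/sys.stderr, so that output from child processes is captured too, and it restores the originals afterwards. Here is a minimal, self-contained sketch of that pattern; the log path and variable names are illustrative.

import os
import sys

LOG_PATH = "worker_log.txt"  # illustrative path

# Keep the original stdout/stderr by duplicating their descriptors.
orig_stdout = os.dup(1)
orig_stderr = os.dup(2)

# Point descriptors 1 and 2 at the log file; child processes inherit them,
# so their output is captured as well.
log_fd = os.open(LOG_PATH, os.O_WRONLY | os.O_CREAT | os.O_APPEND)
os.dup2(log_fd, 1)
os.dup2(log_fd, 2)
os.close(log_fd)  # descriptors 1 and 2 still refer to the open log file

try:
    print("this line lands in worker_log.txt")
finally:
    sys.stdout.flush()  # flush Python-level buffers before swapping back
    sys.stderr.flush()
    os.dup2(orig_stdout, 1)  # restore the original streams
    os.dup2(orig_stderr, 2)
    os.close(orig_stdout)    # drop the saved copies so no descriptors leak
    os.close(orig_stderr)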
Example #9
0
def main():
    configureRootLogger()
    setLogLevel("DEBUG")

    if not os.environ.get("MESOS_AGENT_ENDPOINT"):
        # Some Mesos setups in our tests somehow lack this variable. Provide a
        # fake one to maybe convince the executor driver to work.
        os.environ["MESOS_AGENT_ENDPOINT"] = os.environ.get("MESOS_SLAVE_ENDPOINT", "127.0.0.1:5051")
        log.warning("Had to fake MESOS_AGENT_ENDPOINT as %s" % os.environ["MESOS_AGENT_ENDPOINT"])

    # Set this to True manually to toggle the Mesos agent log level when debugging Jenkins;
    # may be useful: https://github.com/DataBiosphere/toil/pull/2338#discussion_r223854931
    if False:
        try:
            urlopen("http://%s/logging/toggle?level=1&duration=15mins" % os.environ["MESOS_AGENT_ENDPOINT"]).read()
            log.debug("Toggled agent log level")
        except Exception as e:
            log.debug("Failed to toggle agent log level: %s", e)
        
    # Parse the agent state
    agent_state = json.loads(urlopen("http://%s/state" % os.environ["MESOS_AGENT_ENDPOINT"]).read())
    if 'completed_frameworks' in agent_state:
        # Drop the completed frameworks which grow over time
        del agent_state['completed_frameworks']
    log.debug("Agent state: %s", str(agent_state))
    
    log.debug("Virtual memory info in executor: %s" % repr(psutil.virtual_memory()))
    
    if os.path.exists('/sys/fs/cgroup/memory'):
        # Mesos can limit memory with a cgroup, so we should report on that.
        for (dirpath, dirnames, filenames) in os.walk('/sys/fs/cgroup/memory', followlinks=True):
            for filename in filenames:
                if 'limit_in_bytes' not in filename:
                    continue
                log.debug('cgroup memory info from %s:' % os.path.join(dirpath, filename))
                try:
                    for line in open(os.path.join(dirpath, filename)):
                        log.debug(line.rstrip())
                except Exception as e:
                    log.debug("Failed to read %s: %s", os.path.join(dirpath, filename), e)
                    
    # Mesos can also impose rlimit limits, including on things that really
    # ought to not be limited, like virtual address space size.
    log.debug('DATA rlimit: %s', str(resource.getrlimit(resource.RLIMIT_DATA)))
    log.debug('STACK rlimit: %s', str(resource.getrlimit(resource.RLIMIT_STACK)))
    log.debug('RSS rlimit: %s', str(resource.getrlimit(resource.RLIMIT_RSS)))
    log.debug('AS rlimit: %s', str(resource.getrlimit(resource.RLIMIT_AS)))
    
                    
    executor = MesosExecutor()
    log.debug('Made executor')
    driver = MesosExecutorDriver(executor, use_addict=True)
    
    old_on_event = driver.on_event
    
    def patched_on_event(event):
        """
        Intercept and log all pymesos events.
        """
        log.debug("Event: %s", repr(event))
        old_on_event(event)
        
    driver.on_event = patched_on_event
    
    log.debug('Made driver')
    driver.start()
    log.debug('Started driver')
    driver_result = driver.join()
    log.debug('Joined driver')
    
    # Tolerate a None in addition to the code the docs suggest we should receive from join()
    exit_value = 0 if (driver_result is None or driver_result == 'DRIVER_STOPPED') else 1
    assert len(executor.runningTasks) == 0
    sys.exit(exit_value)
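
The patched_on_event hook above is an instance of a general trick: wrap an object's callback so every invocation is logged before the original handler runs. Below is a generic sketch of that pattern, independent of pymesos; the helper name and label are illustrative.

import functools
import logging

log = logging.getLogger(__name__)

def logged(callback, label):
    # Return a wrapper that logs every call and then delegates to `callback`.
    @functools.wraps(callback)
    def wrapper(*args, **kwargs):
        log.debug("%s called with args=%r kwargs=%r", label, args, kwargs)
        return callback(*args, **kwargs)
    return wrapper

# Usage, mirroring the driver patching above:
#   driver.on_event = logged(driver.on_event, "Event")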