def checkOnJobs(self):
    """
    Poll every running job once and queue an update for each one that ended.

    Respects ``statePollingWait``: if the previous poll happened fewer than
    that many seconds ago, the scheduler is not contacted and the cached
    result of the previous poll is returned instead.

    :return: True if this poll (or the cached one) saw at least one job end,
        False otherwise.
    """
    lastPoll = self._checkOnJobsTimestamp
    if lastPoll:
        sinceLastPoll = (datetime.now() - lastPoll).total_seconds()
        if sinceLastPoll < self.boss.config.statePollingWait:
            # Too soon to talk to the scheduler again; reuse the last answer.
            return self._checkOnJobsCache

    sawUpdate = False
    # forgetJob() is called inside the loop, so walk a snapshot of the IDs.
    for jobID in list(self.runningJobs):
        batchJobID = self.getBatchSystemID(jobID)
        outcome = self.boss.with_retries(self.getJobExitCode, batchJobID)
        if outcome is None:
            # Nothing to report for this job on this poll.
            continue
        if isinstance(outcome, int):
            # A plain exit code.
            exitStatus, exitReason = outcome, None
        elif isinstance(outcome, BatchJobExitReason):
            # No exit code, only a reason: report a generic failure status.
            exitStatus, exitReason = 1, outcome
        else:
            # Any other type is ignored, and the job stays tracked.
            continue
        sawUpdate = True
        self.updatedJobsQueue.put(
            UpdatedBatchJobInfo(jobID=jobID,
                                exitStatus=exitStatus,
                                exitReason=exitReason,
                                wallTime=None))
        self.forgetJob(jobID)

    # Remember the outcome so calls inside the polling window can reuse it.
    self._checkOnJobsCache = sawUpdate
    self._checkOnJobsTimestamp = datetime.now()
    return sawUpdate
def jobEnded(_exitStatus, wallTime=None, exitReason=None):
    """
    Notify external observers of the job ending.

    Queues an UpdatedBatchJobInfo for the job, then removes the job from the
    running-job and per-host bookkeeping, and finally records the kill as
    done if one had been requested. ``self`` and ``jobID`` come from the
    enclosing scope.

    :param _exitStatus: exit status to report for the job.
    :param wallTime: wall-clock runtime to report, if known.
    :param exitReason: exit reason to report, if known.
    """
    update = UpdatedBatchJobInfo(jobID=jobID,
                                 exitStatus=_exitStatus,
                                 wallTime=wallTime,
                                 exitReason=exitReason)
    self.updatedJobsQueue.put(update)

    agentIP = None
    try:
        agentIP = self.runningJobMap[jobID].agentIP
    except KeyError:
        log.warning("Job %i returned exit code %i but isn't tracked as running.",
                    jobID, _exitStatus)
    else:
        # Mark the job as no longer running. This MUST happen BEFORE we say
        # the job was killed; otherwise another thread could kill a job and
        # then still see it as running.
        del self.runningJobMap[jobID]

    try:
        # With agentIP still None (untracked job) this lookup is expected to
        # fail and produce the warning below.
        self.hostToJobIDs[agentIP].remove(jobID)
    except KeyError:
        log.warning("Job %i returned exit code %i from unknown host.",
                    jobID, _exitStatus)

    try:
        self.killJobIds.remove(jobID)
    except KeyError:
        # Nobody asked for this job to be killed; nothing more to do.
        return

    # We were asked to kill this job, so say that we have done so. This is
    # done LAST, after all other status updates for the job, so that other
    # threads see a consistent view of the scheduler state.
    self.killedJobIds.add(jobID)
def _runDebugJob(self, jobCommand, jobID, environment):
    """
    Execute ``jobCommand`` synchronously in the calling thread.

    Only valid in debug-worker mode, and assumes the resources the job needs
    are already available.

    :param jobCommand: command line of the job. Commands starting with
        ``"_toil_worker "`` are run in-process; anything else is run through
        the shell.
    :param jobID: ID under which the job is tracked in ``self.runningJobs``.
    :param environment: extra environment variables, layered over
        ``os.environ`` for shell commands.
    :raises subprocess.CalledProcessError: if a shell command exits non-zero.
    """
    assert self.debugWorker
    # TODO: It is not possible to kill running jobs in forkless mode,
    # because they are run immediately in the main thread.
    info = Info(time.time(), None, None, killIntended=False)
    self.runningJobs[jobID] = info

    if not jobCommand.startswith("_toil_worker "):
        # Run synchronously. If starting or running the command fails, the
        # exception is allowed to propagate and stop us.
        childEnv = dict(os.environ, **environment)
        subprocess.check_call(jobCommand, shell=True, env=childEnv)
    else:
        # A worker command can be run directly in this thread.
        # Parse the command: "_toil_worker <name> <locator> <store ID>".
        _, jobName, jobStoreLocator, jobStoreID = jobCommand.split()
        jobStore = Toil.resumeJobStore(jobStoreLocator)
        # Call the worker
        toil_worker.workerScript(jobStore,
                                 jobStore.config,
                                 jobName,
                                 jobStoreID,
                                 redirectOutputToLogFile=not self.debugWorker)

    self.runningJobs.pop(jobID)
    if info.killIntended:
        # A job that was deliberately killed is not reported.
        return
    self.outputQueue.put(
        UpdatedBatchJobInfo(jobID=jobID,
                            exitStatus=0,
                            wallTime=time.time() - info.time,
                            exitReason=None))
def _handle_job_status(self, job_id: int,
                       status: Union[int, "BatchJobExitReason", None],
                       activity: bool) -> bool:
    """
    Helper method for checkOnJobs to handle job statuses.

    If ``status`` is None, nothing is reported and the incoming ``activity``
    flag is passed through unchanged. Otherwise an UpdatedBatchJobInfo is
    queued for the job, the job is forgotten, and True is returned.

    :param job_id: ID of the job the status belongs to.
    :param status: None, an exit code, or a BatchJobExitReason.
    :param activity: whether any earlier job in the current poll already
        produced an update.
    :return: True if this job produced an update, otherwise ``activity``.
    """
    if status is None:
        return activity

    # The exit-reason check has to come before the generic "any non-None
    # status" handling. Otherwise a BatchJobExitReason would be reported as
    # if it were the exit code and its reason would be lost.
    if isinstance(status, BatchJobExitReason):
        exit_status, exit_reason = 1, status
    else:
        exit_status, exit_reason = status, None

    self.updatedJobsQueue.put(
        UpdatedBatchJobInfo(jobID=job_id,
                            exitStatus=exit_status,
                            exitReason=exit_reason,
                            wallTime=None))
    self.forgetJob(job_id)
    return True
def _handleChild(self, pid):
    """
    Handle a child process PID that has finished.

    The PID must be for a child job we started. Not thread safe to run at the
    same time as we are making more children.

    Remove the child from our bookkeeping structures and free its resources.

    :param pid: PID of the finished child; it must be a key of both
        ``self.children`` and ``self.childToJob``.
    :raises KeyError: if the PID, or the job it maps to, is not tracked.
    """
    # Look up the child
    popen = self.children[pid]
    jobID = self.childToJob[pid]
    info = self.runningJobs[jobID]

    # Unpack the job resources
    (coreFractions, jobMemory, jobDisk) = info.resources

    # Clean up our records of the job.
    self.runningJobs.pop(jobID)
    self.childToJob.pop(pid)
    self.children.pop(pid)

    # See how the child did, and reap it.
    statusCode = popen.wait()

    if not info.killIntended:
        if statusCode != 0:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        # Report the job only if we didn't kill it. If we killed it then it
        # shouldn't show up in the queue.
        self.outputQueue.put(
            UpdatedBatchJobInfo(jobID=jobID,
                                exitStatus=statusCode,
                                wallTime=time.time() - info.time,
                                exitReason=None))

    # Free up the job's resources.
    self.coreFractions.release(coreFractions)
    self.memory.release(jobMemory)
    self.disk.release(jobDisk)

    # Only claim success when the child really exited with status 0.
    if statusCode == 0:
        log.debug('Child %d for job %s succeeded', pid, jobID)
    else:
        log.debug('Child %d for job %s finished with exit code %s',
                  pid, jobID, statusCode)
def checkOnJobs(self):
    """
    Poll the running jobs once and queue an update for each finished one.

    Respects ``statePollingWait``: if the previous poll happened fewer than
    that many seconds ago, the scheduler is not contacted and the cached
    result of the previous poll is returned instead.

    :return: True if this poll (or the cached one) saw at least one job end,
        False otherwise.
    """
    lastPoll = self._checkOnJobsTimestamp
    if lastPoll:
        sinceLastPoll = (datetime.now() - lastPoll).total_seconds()
        if sinceLastPoll < self.boss.config.statePollingWait:
            # Too soon to talk to the scheduler again; reuse the last answer.
            return self._checkOnJobsCache

    sawUpdate = False
    # One bulk query for the IDs of jobs that have not finished, so that an
    # exit code is only requested for jobs missing from that collection.
    stillPending = self.boss.with_retries(self._getNotFinishedIDs)
    for jobID in list(self.runningJobs):
        batchJobID = self.getBatchSystemID(jobID)
        if int(batchJobID) in stillPending:
            logger.debug("bjobs detected unfinished job %s", batchJobID)
            continue

        exitCode = self.boss.with_retries(self._customGetJobExitCode,
                                          batchJobID, jobID)
        if exitCode is None:
            # No exit code available yet; look again on a later poll.
            continue

        sawUpdate = True
        self.updatedJobsQueue.put(
            UpdatedBatchJobInfo(jobID=jobID,
                                exitStatus=exitCode,
                                exitReason=None,
                                wallTime=None))
        self.forgetJob(jobID)

    # Remember the outcome so calls inside the polling window can reuse it.
    self._checkOnJobsCache = sawUpdate
    self._checkOnJobsTimestamp = datetime.now()
    return sawUpdate
def updatedJobWorker(self):
    """
    Tail the parasol results files and turn each result line into a job update.

    Runs until ``self.running`` becomes false. Every result line has the
    following whitespace-separated fields (thanks Mark D!):

    int status;          /* Job status - wait() return format. 0 is good. */
    char *host;          /* Machine job ran on. */
    char *jobId;         /* Job queuing system job ID */
    char *exe;           /* Job executable file (no path) */
    int usrTicks;        /* 'User' CPU time in ticks. */
    int sysTicks;        /* 'System' CPU time in ticks. */
    unsigned submitTime; /* Job submission time in seconds since 1/1/1970 */
    unsigned startTime;  /* Job start time in seconds since 1/1/1970 */
    unsigned endTime;    /* Job end time in seconds since 1/1/1970 */
    char *user;          /* User who ran job */
    char *errFile;       /* Location of stderr file on host */

    followed by the command that was run.
    """
    seenFileNames = set()
    openHandles = []
    try:
        while self.running:
            # Open any results files that appeared since the previous pass.
            currentNames = set(os.listdir(self.parasolResultsDir))
            for fileName in currentNames.difference(seenFileNames):
                filePath = os.path.join(self.parasolResultsDir, fileName)
                openHandles.append(open(filePath, 'r'))
                seenFileNames.add(fileName)

            for handle in openHandles:
                # Drain whatever new lines this file has right now.
                while self.running:
                    line = handle.readline()
                    if not line:
                        break
                    assert line[-1] == '\n'
                    fields = line[:-1].split(None, 11)
                    (status, host, jobId, exe, usrTicks, sysTicks, submitTime,
                     startTime, endTime, user, errFile, command) = fields

                    status = int(status)
                    jobId = int(jobId)
                    # A normal exit is reported as its exit code; anything
                    # else is reported as the negated raw wait() status.
                    status = os.WEXITSTATUS(status) if os.WIFEXITED(status) else -status
                    self.cpuUsageQueue.put(jobId)

                    startTime = int(startTime)
                    endTime = int(endTime)
                    if endTime != startTime:
                        wallTime = float(endTime - startTime)
                    else:
                        # Start and end times are whole seconds, so for
                        # sub-second accuracy we approximate with the ticks
                        # reported by Parasol. This isn't documented, but what
                        # Parasol calls "ticks" is actually a hundredth of a
                        # second; Parasol does the unit conversion early on
                        # after a job finishes. Search paraNode.c for
                        # ticksToHundreths. We also cheat a little by always
                        # reporting at least one hundredth of a second.
                        usrTicks = int(usrTicks)
                        sysTicks = int(sysTicks)
                        wallTime = float(max(1, usrTicks + sysTicks)) * 0.01

                    self.updatedJobsQueue.put(
                        UpdatedBatchJobInfo(jobID=jobId,
                                            exitStatus=status,
                                            wallTime=wallTime,
                                            exitReason=None))
            time.sleep(1)
    except:
        logger.warning(
            "Error occurred while parsing parasol results files.")
        raise
    finally:
        # Close every results file we opened, however we leave.
        for handle in openHandles:
            handle.close()
def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
    """
    Return one finished job, polling for up to ``maxWait`` seconds.

    Locally-run jobs are checked first on every pass. Otherwise the described
    jobs are scanned for one in the SUCCEEDED or FAILED state. Every finished
    job seen during a pass is dropped from the ID maps afterwards, including
    killed jobs, which are never reported.

    :param maxWait: seconds to keep polling; a falsy value means poll
        exactly once.
    :return: the update for one finished job, or None if none was found.
    """
    # Remember when we started, for respecting the timeout
    started = datetime.datetime.now()
    while ((datetime.datetime.now() - started).total_seconds() < maxWait
           or not maxWait):
        local_update = self.getUpdatedLocalJob(0)
        if local_update:
            return local_update

        # (AWS ID, batch system ID) pairs for tasks we are acknowledging and
        # don't care about anymore.
        finished = []
        try:
            for job_detail in self._describe_jobs_in_batches():
                state = job_detail.get('status')
                if state not in ['SUCCEEDED', 'FAILED']:
                    continue

                # This job is done! Acknowledge it.
                aws_id = job_detail['jobId']
                bs_id = self.aws_id_to_bs_id[aws_id]
                finished.append((aws_id, bs_id))

                if aws_id in self.killed_job_aws_ids:
                    # Killed jobs aren't allowed to appear as updated.
                    logger.debug('Job %s was killed so skipping it', bs_id)
                    continue

                # Otherwise, it stopped running and it wasn't our fault.
                runtime = self._get_runtime(job_detail)
                exit_reason = STATE_TO_EXIT_REASON[state]
                exit_code = self._get_exit_code(job_detail)

                if state == 'FAILED' and 'statusReason' in job_detail:
                    # AWS knows why the job failed, so log the error
                    logger.error('Job %s failed because: %s', bs_id,
                                 job_detail['statusReason'])

                # Compose a result
                return UpdatedBatchJobInfo(jobID=bs_id,
                                           exitStatus=exit_code,
                                           wallTime=runtime,
                                           exitReason=exit_reason)
        finally:
            # Runs even when we return a result above: drop all the records
            # for the tasks we acknowledged.
            for (done_aws_id, done_bs_id) in finished:
                del self.aws_id_to_bs_id[done_aws_id]
                del self.bs_id_to_aws_id[done_bs_id]
                if done_aws_id in self.killed_job_aws_ids:
                    # We don't need to remember that we killed this job anymore.
                    self.killed_job_aws_ids.remove(done_aws_id)

        if not maxWait:
            # Only poll once
            break
        # Wait a bit and poll again
        time.sleep(min(maxWait / 2, 1.0))

    # If we get here we got nothing
    return None
def _handleChild(self, pid: int) -> None:
    """
    Handle a child process PID that has finished.

    The PID must be for a child job we started. Not thread safe to run at the
    same time as we are making more children.

    Remove the child from our bookkeeping structures, make sure its whole
    process group is dead, and free its resources.

    :param pid: PID of the finished child; it must be a key of both
        ``self.children`` and ``self.childToJob``. It is also used as the
        process group ID when signalling the child's group.
    :raises KeyError: if the PID, or the job it maps to, is not tracked.
    """
    # Look up the child
    popen = self.children[pid]
    jobID = self.childToJob[pid]
    info = self.runningJobs[jobID]

    # Unpack the job resources
    (coreFractions, jobMemory, jobDisk) = info.resources

    # Clean up our records of the job.
    self.runningJobs.pop(jobID)
    self.childToJob.pop(pid)
    self.children.pop(pid)

    if popen.returncode is None or not callable(getattr(os, 'waitid', None)):
        # It isn't reaped yet, or we have to reap all children to see if
        # they're done. Before we reap it (if possible), kill its PID as a
        # PGID to make sure it isn't leaving children behind.
        # TODO: This is a PGID re-use risk on Mac because the process is
        # reaped already and the PGID may have been reused.
        try:
            os.killpg(pid, signal.SIGKILL)
        except ProcessLookupError:
            # It is dead already
            pass
        except PermissionError:
            # It isn't ours actually. Ours is dead.
            pass

    # See how the child did, and reap it.
    statusCode = popen.wait()

    if not info.killIntended:
        if statusCode != 0:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        # Report the job only if we didn't kill it. If we killed it then it
        # shouldn't show up in the queue.
        self.outputQueue.put(
            UpdatedBatchJobInfo(jobID=jobID,
                                exitStatus=statusCode,
                                wallTime=time.time() - info.time,
                                exitReason=None))

    # Make absolutely sure all processes in the group have received their
    # kill signals and been cleaned up.
    # TODO: this opens a PGID reuse risk; we reaped the process and its
    # PGID may have been re-used. But it probably hasn't been and we
    # definitely want to make sure all its children died before saying the
    # job is done. Some might not be dead yet if we don't do this.
    # TODO: can we safely do this before reaping? Or would we sit forever
    # signaling a dead but unreaped process?
    try:
        while True:
            # Send a kill to the group again, to see if anything in it is
            # still alive. Our first kill might not have been delivered yet.
            os.killpg(pid, signal.SIGKILL)
            # If that worked it is still alive, so wait for the kernel to
            # stop fooling around and kill it.
            log.warning(
                'Sent redundant job completion kill to surviving process group %s known to batch system %s',
                pid, id(self))
            time.sleep(0.1)
    except ProcessLookupError:
        # It is dead already
        pass
    except PermissionError:
        # It isn't ours actually. Ours is dead.
        pass

    # Free up the job's resources.
    self.coreFractions.release(coreFractions)
    self.memory.release(jobMemory)
    self.disk.release(jobDisk)

    # Only claim success when the child really exited with status 0.
    if statusCode == 0:
        log.debug('Child %d for job %s succeeded', pid, jobID)
    else:
        log.debug('Child %d for job %s finished with exit code %s',
                  pid, jobID, statusCode)
def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk,
                environment):
    """
    Start a child process for the given job.

    Allocate its required resources and save it in our bookkeeping
    structures.

    :param jobCommand: shell command line to run for the job.
    :param jobID: ID of the job being started.
    :param coreFractions: amount to acquire from ``self.coreFractions``.
    :param jobMemory: amount to acquire from ``self.memory``.
    :param jobDisk: amount to acquire from ``self.disk``.
    :param environment: extra environment variables, layered over
        ``os.environ`` for the child.
    :return: the child's PID if the job was started; False if the job failed
        to start (it is then reported as failed on the output queue); None if
        the job could not get the resources it needs right now.
    """
    # This is when we started working on the job.
    startTime = time.time()

    # See if we can fit the job in our resource pools right now. Resources
    # are taken in a fixed order (cores, memory, disk); whenever a later one
    # is unavailable, the ones already taken are given back.
    if not self.coreFractions.acquireNow(coreFractions):
        self._setSchedulingStatusMessage('Not enough cores to run job %s' % jobID)
        return None

    if not self.memory.acquireNow(jobMemory):
        # Free cores, since we can't get memory
        self.coreFractions.release(coreFractions)
        self._setSchedulingStatusMessage('Not enough memory to run job %s' % jobID)
        return None

    if not self.disk.acquireNow(jobDisk):
        # We can't get disk, so free cores and memory
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self._setSchedulingStatusMessage('Not enough disk to run job %s' % jobID)
        return None

    # We got everything, so actually run the job. When it finishes we will
    # release what it was using, so it is important to not lose track of the
    # child process.
    try:
        # Launch the job.
        # Make sure it is in its own session (and thus its own process
        # group) so that, if the user signals the workflow, Toil will be
        # responsible for killing the job. This also makes sure that we can
        # signal the job and all its children together. We assume that the
        # process group ID will equal the PID of the process we are starting.
        popen = subprocess.Popen(jobCommand,
                                 shell=True,
                                 env=dict(os.environ, **environment),
                                 start_new_session=True)
    except Exception:
        log.error('Could not start job %s: %s', jobID, traceback.format_exc())

        # The job can't start, so give its resources back. Each pool is
        # released exactly once; releasing twice would credit the pools with
        # more than was acquired.
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self.disk.release(jobDisk)

        # Report as failed.
        self.outputQueue.put(
            UpdatedBatchJobInfo(jobID=jobID,
                                exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
                                wallTime=0,
                                exitReason=None))

        # Complain it broke.
        return False

    # The job did start, so record it.
    self.children[popen.pid] = popen
    # Make sure we can look it up by PID later
    self.childToJob[popen.pid] = jobID
    # Record that the job is running, and the resources it is using
    info = Info(startTime, popen, (coreFractions, jobMemory, jobDisk),
                killIntended=False)
    self.runningJobs[jobID] = info

    log.debug('Launched job %s as child %d', jobID, popen.pid)

    # Report success starting the job.
    # Note that if a PID were somehow 0 it would look like False.
    assert popen.pid != 0
    return popen.pid
def _getUpdatedBatchJobImmediately(self):
    """
    Return one updated (completed, failed, or stuck) batch job, without waiting.

    A locally-run job update is returned first if one is available.
    Otherwise Kubernetes is queried for, in order, a succeeded job, a job
    with a non-zero failure count, or a job whose pod looks stuck
    (``ImagePullBackOff`` or stuck near out-of-memory). The chosen job is
    deleted from Kubernetes before its result is returned.

    :return: None if no updated job is currently available; otherwise the
        local job update, or an UpdatedBatchJobInfo carrying the job ID,
        exit code and runtime of the job that was found.
    """
    # See if a local batch job has updated and is available immediately
    local_tuple = self.getUpdatedLocalJob(0)
    if local_tuple:
        # If so, use it
        return local_tuple

    # Otherwise we didn't get a local job. Go looking for other jobs.
    # Everybody else does this with a queue and some other thread that is
    # responsible for populating it. But we can just ask kubernetes now.

    # Find a job that is done, failed, or stuck
    jobObject = None
    # Put 'done', 'failed', or 'stuck' here
    chosenFor = ''

    for j in self._ourJobObjects(onlySucceeded=True, limit=1):
        # Look for succeeded jobs because that's the only filter Kubernetes has
        jobObject = j
        chosenFor = 'done'

    if jobObject is None:
        # If there aren't any succeeded jobs, scan all jobs
        for j in self._ourJobObjects():
            # See how many times each failed
            failCount = getattr(j.status, 'failed', 0)
            if failCount is None:
                # Make sure it is an int
                failCount = 0
            if failCount > 0:
                # Take the first failed one you find
                jobObject = j
                chosenFor = 'failed'
                break

    if jobObject is None:
        # If no jobs are failed, look for jobs with pods that are stuck for
        # various reasons.
        for j in self._ourJobObjects():
            pod = self._getPodForJob(j)

            if pod is None:
                # Skip jobs with no pod
                continue

            # Containers can get stuck in Waiting with reason ImagePullBackOff.
            # Get the statuses of the pod's containers
            containerStatuses = pod.status.container_statuses
            if containerStatuses is None or len(containerStatuses) == 0:
                # Pod exists but has no container statuses. This happens when
                # the pod is just "Scheduled" ("PodScheduled" status event)
                # and isn't actually starting to run yet.
                # Can't be stuck in ImagePullBackOff
                continue

            waitingInfo = getattr(
                getattr(containerStatuses[0], 'state', None), 'waiting', None)
            if waitingInfo is not None and waitingInfo.reason == 'ImagePullBackOff':
                # Assume it will never finish, even if the registry comes
                # back or whatever. We can get into this state when we send
                # in a non-existent image.
                # See https://github.com/kubernetes/kubernetes/issues/58384
                jobObject = j
                chosenFor = 'stuck'
                logger.warning(
                    'Failing stuck job; did you try to run a non-existent Docker image?'
                    ' Check TOIL_APPLIANCE_SELF.')
                break

            # Pods can also get stuck nearly but not quite out of memory,
            # if their memory limits are high and they try to exhaust them.
            if self._isPodStuckOOM(pod):
                # We found a job that probably should be OOM! Report it as
                # stuck. Polling function takes care of the logging.
                jobObject = j
                chosenFor = 'stuck'
                break

    if jobObject is None:
        # Say we couldn't find anything
        return None

    # Otherwise we got something.

    # Work out what the job's ID was (whatever came after our name prefix)
    jobID = int(jobObject.metadata.name[len(self.jobPrefix):])

    # Work out when the job was submitted. If the pod fails before actually
    # running, this is the basis for our runtime.
    jobSubmitTime = getattr(jobObject.status, 'start_time', None)
    if jobSubmitTime is None:
        # If somehow this is unset, say it was just now.
        jobSubmitTime = utc_now()

    # Grab the pod
    pod = self._getPodForJob(jobObject)

    if pod is not None:
        if chosenFor == 'done' or chosenFor == 'failed':
            # The job actually finished or failed

            # Get the statuses of the pod's containers
            containerStatuses = pod.status.container_statuses

            # Get when the pod started (reached the Kubelet) as a datetime
            startTime = getattr(pod.status, 'start_time', None)
            if startTime is None:
                # If the pod never made it to the kubelet to get a
                # start_time, say it was when the job was submitted.
                startTime = jobSubmitTime

            if containerStatuses is None or len(containerStatuses) == 0:
                # No statuses available.
                # This happens when a pod is "Scheduled". But how could a
                # 'done' or 'failed' pod be merely "Scheduled"?
                # Complain so we can find out.
                logger.warning(
                    'Exit code and runtime unavailable; pod has no container statuses'
                )
                logger.warning('Pod: %s', str(pod))
                exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
                # Say it stopped now and started when it was
                # scheduled/submitted. We still need a strictly positive
                # runtime.
                runtime = slow_down((utc_now() - startTime).total_seconds())
            else:
                # Get the termination info from the pod's main (only) container
                terminatedInfo = getattr(
                    getattr(containerStatuses[0], 'state', None),
                    'terminated', None)
                if terminatedInfo is None:
                    logger.warning(
                        'Exit code and runtime unavailable; pod stopped without container terminating'
                    )
                    logger.warning('Pod: %s', str(pod))
                    exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
                    # Say it stopped now and started when it was
                    # scheduled/submitted. We still need a strictly positive
                    # runtime.
                    runtime = slow_down(
                        (utc_now() - startTime).total_seconds())
                else:
                    # Extract the exit code
                    exitCode = terminatedInfo.exit_code

                    # Compute how long the job actually ran for (subtract
                    # datetimes). We need to look at the pod's start time
                    # because the job's start time is just when the job is
                    # created. And we need to look at the pod's end time
                    # because the job only gets a completion time if
                    # successful. startTime is the pod's start time, falling
                    # back to the job's submit time when the pod has none, so
                    # the subtraction never sees a missing start time.
                    runtime = slow_down(
                        (terminatedInfo.finished_at - startTime).total_seconds())

                    if chosenFor == 'failed':
                        # Warn the user with the failed pod's log
                        # TODO: cut this down somehow?
                        logger.warning('Log from failed pod: %s',
                                       self._getLogForPod(pod))
        else:
            # The job has gotten stuck
            assert chosenFor == 'stuck'

            # Synthesize an exit code
            exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
            # Say it ran from when the job was submitted to when the pod got stuck
            runtime = slow_down((utc_now() - jobSubmitTime).total_seconds())
    else:
        # The pod went away from under the job.
        logger.warning('Exit code and runtime unavailable; pod vanished')
        exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
        # Say it ran from when the job was submitted to when the pod vanished
        runtime = slow_down((utc_now() - jobSubmitTime).total_seconds())

    try:
        # Delete the job and all dependents (pods)
        self._api('batch').delete_namespaced_job(
            jobObject.metadata.name,
            self.namespace,
            propagation_policy='Foreground')

        # That just kicks off the deletion process. Foreground doesn't
        # actually block. See
        # https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/#foreground-cascading-deletion
        # We have to either wait until the deletion is done and we can't
        # see the job anymore, or ban the job from being "updated" again if
        # we see it. If we don't block on deletion, we can't use limit=1
        # on our query for succeeded jobs. So we poll for the job's
        # non-existence.
        self._waitForJobDeath(jobObject.metadata.name)
    except kubernetes.client.rest.ApiException as e:
        # TODO: check to see if this is a 404 on the thing we tried to delete
        # If so, it is gone already and we don't need to delete it again.
        # Deletion stays best-effort, but leave a trace of what went wrong.
        logger.debug('Ignoring API error while deleting job %s: %s',
                     jobObject.metadata.name, e)

    # Return the one finished job we found
    return UpdatedBatchJobInfo(jobID=jobID,
                               exitStatus=exitCode,
                               wallTime=runtime,
                               exitReason=None)
def getUpdatedBatchJob(self, maxWait):
    """
    Return one updated job, waiting for one to appear if necessary.

    An immediate check is made first. If it finds nothing and ``maxWait`` is
    non-zero, either pod events are watched (when ``self.enableWatching`` is
    set) or the immediate check is repeated until ``maxWait`` seconds have
    elapsed.

    :param maxWait: seconds to wait for an update; 0 means check once.
    :return: an UpdatedBatchJobInfo (or local job update) for one finished
        job, or None if nothing finished in time.
    """
    entry = datetime.datetime.now()

    result = self._getUpdatedBatchJobImmediately()

    if result is not None or maxWait == 0:
        # We got something on the first try, or we only get one try
        return result

    # Otherwise we need to maybe wait.
    if self.enableWatching:
        # Try watching for something to happen and use that.
        w = kubernetes.watch.Watch()

        # NOTE(review): the outer loop variable is never used, so a fresh
        # watch (each allowed up to maxWait seconds) is opened once per
        # existing job, and nothing is watched when there are no jobs.
        # Kept as-is; confirm whether a single watch was intended.
        for j in self._ourJobObjects():
            for event in w.stream(self._api('core').list_namespaced_pod,
                                  self.namespace,
                                  timeout_seconds=maxWait):
                pod = event['object']
                if not pod.metadata.name.startswith(self.jobPrefix):
                    # Not one of our pods
                    continue
                if pod.status.phase not in ('Failed', 'Succeeded'):
                    # Still running, or otherwise not finished
                    continue

                logger.debug("FINISHED")

                # The job owning the pod carries our job ID after the prefix.
                jobName = pod.metadata.owner_references[0].name
                jobID = int(jobName[len(self.jobPrefix):])

                containerStatuses = pod.status.container_statuses
                if containerStatuses is None or len(containerStatuses) == 0:
                    logger.debug("No job container statuses for job %s",
                                 jobName)
                    return UpdatedBatchJobInfo(
                        jobID=jobID,
                        exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
                        wallTime=0,
                        exitReason=None)

                # Get termination information from the pod
                terminated = getattr(
                    getattr(containerStatuses[0], 'state', None),
                    'terminated', None)
                if terminated is None:
                    # The pod is in a final phase but its container has no
                    # termination record. Dereferencing it would raise
                    # AttributeError, so skip this event and keep watching.
                    logger.debug(
                        'Pod for job %s finished without container termination info',
                        jobName)
                    continue

                logger.info("REASON: %s Exit Code: %s", terminated.reason,
                            terminated.exit_code)

                if terminated.exit_code != 0:
                    # The pod failed. Dump information about it.
                    logger.debug('Failed pod information: %s', str(pod))
                    logger.warning('Log from failed pod: %s',
                                   self._getLogForPod(pod))

                runtime = slow_down(
                    (terminated.finished_at -
                     terminated.started_at).total_seconds())
                result = UpdatedBatchJobInfo(jobID=jobID,
                                             exitStatus=terminated.exit_code,
                                             wallTime=runtime,
                                             exitReason=None)

                # Delete the finished job and wait until it is really gone
                # before reporting it.
                self._api('batch').delete_namespaced_job(
                    jobName,
                    self.namespace,
                    propagation_policy='Foreground')
                self._waitForJobDeath(jobName)
                return result
    else:
        # Try polling instead
        while result is None and (datetime.datetime.now() -
                                  entry).total_seconds() < maxWait:
            # We still have nothing and we haven't hit the timeout.

            # Poll
            result = self._getUpdatedBatchJobImmediately()

            if result is None:
                # Still nothing. Wait a second, or some fraction of our max
                # wait time.
                time.sleep(min(maxWait / 2, 1.0))

    # When we get here, either we found something or we ran out of time
    return result
def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
    """
    Return one finished job, polling for up to ``maxWait`` seconds.

    Locally-run jobs are checked first on every pass. Otherwise every issued
    task is polled until one is found in a stopped state. Every stopped task
    seen during a pass is dropped from the ID maps afterwards, including
    canceled tasks, which are never reported.

    :param maxWait: seconds to keep polling; a falsy value means poll
        exactly once.
    :return: the update for one finished job, or None if none was found.
    """
    # Remember when we started, for respecting the timeout
    started = datetime.datetime.now()
    # This is the updated job we have found, if any
    result = None
    while result is None and (
            (datetime.datetime.now() - started).total_seconds() < maxWait
            or not maxWait):
        result = self.getUpdatedLocalJob(0)
        if result:
            return result

        # (TES ID, batch system ID) pairs for tasks we are acknowledging and
        # don't care about anymore.
        stopped = []

        for tes_id, bs_id in self.tes_id_to_bs_id.items():
            # Immediately poll all the jobs we issued.
            # TODO: There's no way to acknowledge a finished job, so there's no
            # faster way to find the newly finished jobs than polling
            task = self.tes.get_task(tes_id, view="MINIMAL")
            if task.state not in [
                    "COMPLETE", "CANCELED", "EXECUTOR_ERROR", "SYSTEM_ERROR"
            ]:
                continue

            # This task is done!
            logger.debug("Found stopped task: %s", task)

            # Acknowledge it
            stopped.append((tes_id, bs_id))

            if task.state == "CANCELED":
                # Killed jobs aren't allowed to appear as updated.
                continue

            # Otherwise, it stopped running and it wasn't our fault.

            # Fetch the task's full info, including logs.
            task = self.tes.get_task(tes_id, view="FULL")

            runtime = self._get_runtime(task)
            exit_reason = STATE_TO_EXIT_REASON[task.state]
            exit_code = self._get_exit_code(task)

            if task.state == "EXECUTOR_ERROR":
                # The task failed, so report executor logs.
                logger.warning('Log from failed executor: %s',
                               self.__get_log_text(task))

            # Compose a result
            result = UpdatedBatchJobInfo(jobID=bs_id,
                                         exitStatus=exit_code,
                                         wallTime=runtime,
                                         exitReason=exit_reason)

            # No more iteration needed, we found a result.
            break

        # Only now, with the iteration over the map finished, drop all the
        # records for tasks we acknowledged.
        for (done_tes_id, done_bs_id) in stopped:
            del self.tes_id_to_bs_id[done_tes_id]
            del self.bs_id_to_tes_id[done_bs_id]

        if not maxWait:
            # Don't wait at all
            break
        if result is None:
            # Wait a bit and poll again
            time.sleep(min(maxWait / 2, 1.0))

    # When we get here we have all the result we can get
    return result