Example #1
        def checkOnJobs(self):
            """Check and update status of all running jobs.

            Respects statePollingWait and will return cached results if not within
            time period to talk with the scheduler.
            """
            if (self._checkOnJobsTimestamp and
                 (datetime.now() - self._checkOnJobsTimestamp).total_seconds() < self.boss.config.statePollingWait):
                return self._checkOnJobsCache

            activity = False
            for jobID in list(self.runningJobs):
                batchJobID = self.getBatchSystemID(jobID)
                status = self.boss.with_retries(self.getJobExitCode, batchJobID)
                if status is not None and isinstance(status, int):
                    activity = True
                    self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=status, exitReason=None, wallTime=None))
                    self.forgetJob(jobID)
                elif status is not None and isinstance(status, BatchJobExitReason):
                    activity = True
                    self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=1, exitReason=status, wallTime=None))
                    self.forgetJob(jobID)
            self._checkOnJobsCache = activity
            self._checkOnJobsTimestamp = datetime.now()
            return activity
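
Every example on this page constructs the same record and pushes it onto a queue for the leader to consume. Below is a minimal sketch of the assumed shape of UpdatedBatchJobInfo and BatchJobExitReason, for reference while reading the examples; it is illustrative only, the real definitions live in toil.batchSystems.abstractBatchSystem.

from enum import Enum
from typing import NamedTuple, Optional, Union

class BatchJobExitReason(Enum):
    # Hypothetical subset of members; the real enum has more.
    FINISHED = 1
    FAILED = 2
    LOST = 3
    KILLED = 4

class UpdatedBatchJobInfo(NamedTuple):
    jobID: int
    exitStatus: int                        # exit code, or a sentinel value when unknown
    wallTime: Optional[Union[int, float]]  # seconds of runtime, if measured
    exitReason: Optional[BatchJobExitReason]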
Example #2
        def jobEnded(_exitStatus, wallTime=None, exitReason=None):
            """
            Notify external observers of the job ending.
            """
            self.updatedJobsQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=_exitStatus, wallTime=wallTime, exitReason=exitReason))
            agentIP = None
            try:
                agentIP = self.runningJobMap[jobID].agentIP
            except KeyError:
                log.warning("Job %i returned exit code %i but isn't tracked as running.",
                            jobID, _exitStatus)
            else:
                # Mark the job as no longer running. We MUST do this BEFORE
                # saying we killed the job, or it will be possible for another
                # thread to kill a job and then see it as running.
                del self.runningJobMap[jobID]

            try:
                self.hostToJobIDs[agentIP].remove(jobID)
            except KeyError:
                log.warning("Job %i returned exit code %i from unknown host.",
                            jobID, _exitStatus)

            try:
                self.killJobIds.remove(jobID)
            except KeyError:
                pass
            else:
                # We were asked to kill this job, so say that we have done so.
                # We do this LAST, after all status updates for the job have
                # been handled, to ensure a consistent view of the scheduler
                # state from other threads.
                self.killedJobIds.add(jobID)
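
The ordering comments above only make sense next to the thread that performs the kill. The following is a hypothetical sketch of that other side (the attribute names mirror those used above; this is not the project's actual kill method), showing why a job must leave runningJobMap before it enters killedJobIds.

import time

def killAndWait(batchSystem, jobID, poll=0.1):
    # Hypothetical helper: request the kill, then wait for jobEnded() above
    # to acknowledge it by moving the ID into killedJobIds.
    batchSystem.killJobIds.add(jobID)
    while jobID not in batchSystem.killedJobIds:
        time.sleep(poll)
    # Because jobEnded() removes the job from runningJobMap BEFORE adding it
    # to killedJobIds, a killed job can never still appear to be running here.
    assert jobID not in batchSystem.runningJobMap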
Example #3
    def _runDebugJob(self, jobCommand, jobID, environment):
        """
        Run the jobCommand right now, in the current thread.
        May only be called in debug-worker mode.
        Assumes resources are available.
        """
        assert self.debugWorker
        # TODO: It is not possible to kill running jobs in forkless mode,
        # because they are run immediately in the main thread.
        info = Info(time.time(), None, None, killIntended=False)
        self.runningJobs[jobID] = info

        if jobCommand.startswith("_toil_worker "):
            # We can actually run in this thread
            jobName, jobStoreLocator, jobStoreID = jobCommand.split()[1:] # Parse command
            jobStore = Toil.resumeJobStore(jobStoreLocator)
            toil_worker.workerScript(jobStore, jobStore.config, jobName, jobStoreID,
                                     redirectOutputToLogFile=not self.debugWorker) # Call the worker
        else:
            # Run synchronously. If starting or running the command fails, let the exception stop us.
            subprocess.check_call(jobCommand,
                                  shell=True,
                                  env=dict(os.environ, **environment))

        self.runningJobs.pop(jobID)
        if not info.killIntended:
            self.outputQueue.put(UpdatedBatchJobInfo(jobID=jobID, exitStatus=0, wallTime=time.time() - info.time, exitReason=None))
Example #4
    def _handle_job_status(self, job_id: int,
                           status: Union[int, BatchJobExitReason, None],
                           activity: bool) -> bool:
        """
        Helper method for checkOnJobs to handle job statuses
        """
        if isinstance(status, BatchJobExitReason):
            # Handle the more specific exit-reason case first; otherwise the
            # generic "status is not None" branch below would swallow it.
            self.updatedJobsQueue.put(
                UpdatedBatchJobInfo(jobID=job_id,
                                    exitStatus=1,
                                    exitReason=status,
                                    wallTime=None))
            self.forgetJob(job_id)
            return True
        if status is not None:
            self.updatedJobsQueue.put(
                UpdatedBatchJobInfo(jobID=job_id,
                                    exitStatus=status,
                                    exitReason=None,
                                    wallTime=None))
            self.forgetJob(job_id)
            return True
        return activity
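
For context, a sketch of how this helper is presumably driven from checkOnJobs, mirroring the polling loop in Example #1:

activity = False
for jobID in list(self.runningJobs):
    batchJobID = self.getBatchSystemID(jobID)
    status = self.boss.with_retries(self.getJobExitCode, batchJobID)
    # The helper reports the job if it finished and otherwise passes the
    # current activity flag through unchanged.
    activity = self._handle_job_status(jobID, status, activity)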
Example #5
    def _handleChild(self, pid):
        """
        Handle a child process PID that has finished.
        The PID must be for a child job we started.
        Not thread safe to run at the same time as we are making more children.

        Remove the child from our bookkeeping structures and free its resources.
        """

        # Look up the child
        popen = self.children[pid]
        jobID = self.childToJob[pid]
        info = self.runningJobs[jobID]

        # Unpack the job resources
        (coreFractions, jobMemory, jobDisk) = info.resources

        # Clean up our records of the job.
        self.runningJobs.pop(jobID)
        self.childToJob.pop(pid)
        self.children.pop(pid)

        # See how the child did, and reap it.
        statusCode = popen.wait()
        if statusCode != 0 and not info.killIntended:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        if not info.killIntended:
            # Report if the job failed and we didn't kill it.
            # If we killed it then it shouldn't show up in the queue.
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=statusCode,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))

        # Free up the job's resources.
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self.disk.release(jobDisk)

        log.debug('Child %d for job %s succeeded', pid, jobID)
Example #6
        def checkOnJobs(self):
            """
            Check and update status of all running jobs.

            Respects statePollingWait and will return cached results if not within
            time period to talk with the scheduler.
            """
            if (
                self._checkOnJobsTimestamp
                and (datetime.now() - self._checkOnJobsTimestamp).total_seconds()
                < self.boss.config.statePollingWait
            ):
                return self._checkOnJobsCache

            activity = False
            not_finished = self.boss.with_retries(self._getNotFinishedIDs)  # Added

            for jobID in list(self.runningJobs):
                batchJobID = self.getBatchSystemID(jobID)

                if int(batchJobID) in not_finished:
                    logger.debug("bjobs detected unfinished job %s", batchJobID)
                else:
                    status = self.boss.with_retries(
                        self._customGetJobExitCode, batchJobID, jobID
                    )
                    if status is not None:
                        activity = True
                        self.updatedJobsQueue.put(
                            UpdatedBatchJobInfo(
                                jobID=jobID,
                                exitStatus=status,
                                exitReason=None,
                                wallTime=None,
                            )
                        )
                        self.forgetJob(jobID)

            self._checkOnJobsCache = activity
            self._checkOnJobsTimestamp = datetime.now()
            return activity
Example #7
    def updatedJobWorker(self):
        """
        We use the parasol results to update the status of jobs, adding them
        to the list of updated jobs.

        Results have the following structure (thanks Mark D!):

        int status;    /* Job status - wait() return format. 0 is good. */
        char *host;    /* Machine job ran on. */
        char *jobId;    /* Job queuing system job ID */
        char *exe;    /* Job executable file (no path) */
        int usrTicks;    /* 'User' CPU time in ticks. */
        int sysTicks;    /* 'System' CPU time in ticks. */
        unsigned submitTime;    /* Job submission time in seconds since 1/1/1970 */
        unsigned startTime;    /* Job start time in seconds since 1/1/1970 */
        unsigned endTime;    /* Job end time in seconds since 1/1/1970 */
        char *user;    /* User who ran job */
        char *errFile;    /* Location of stderr file on host */

        Plus you finally have the command name.
        """
        resultsFiles = set()
        resultsFileHandles = []
        try:
            while self.running:
                # Look for any new results files that have been created, and open them
                newResultsFiles = set(os.listdir(
                    self.parasolResultsDir)).difference(resultsFiles)
                for newFile in newResultsFiles:
                    newFilePath = os.path.join(self.parasolResultsDir, newFile)
                    resultsFileHandles.append(open(newFilePath, 'r'))
                    resultsFiles.add(newFile)
                for fileHandle in resultsFileHandles:
                    while self.running:
                        line = fileHandle.readline()
                        if not line:
                            break
                        assert line[-1] == '\n'
                        (status, host, jobId, exe, usrTicks, sysTicks,
                         submitTime, startTime, endTime, user, errFile,
                         command) = line[:-1].split(None, 11)
                        status = int(status)
                        jobId = int(jobId)
                        if os.WIFEXITED(status):
                            status = os.WEXITSTATUS(status)
                        else:
                            status = -status
                        self.cpuUsageQueue.put(jobId)
                        startTime = int(startTime)
                        endTime = int(endTime)
                        if endTime == startTime:
                            # Both start and end times are integers, so to get sub-second
                            # accuracy we use the ticks reported by Parasol as an approximation.
                            # This isn't documented but what Parasol calls "ticks" is actually a
                            # hundredth of a second. Parasol does the unit conversion early on
                            # after a job finished. Search paraNode.c for ticksToHundreths. We
                            # also cheat a little by always reporting at least one hundredth of a
                            # second.
                            usrTicks = int(usrTicks)
                            sysTicks = int(sysTicks)
                            wallTime = float(max(1,
                                                 usrTicks + sysTicks)) * 0.01
                        else:
                            wallTime = float(endTime - startTime)
                        self.updatedJobsQueue.put(
                            UpdatedBatchJobInfo(jobID=jobId,
                                                exitStatus=status,
                                                wallTime=wallTime,
                                                exitReason=None))
                time.sleep(1)
        except:
            logger.warning(
                "Error occurred while parsing parasol results files.")
            raise
        finally:
            for fileHandle in resultsFileHandles:
                fileHandle.close()
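
The status field is in wait() format, so the loop above decodes it with os.WIFEXITED and os.WEXITSTATUS. Here is a small worked example of that decoding on a Unix platform (illustrative values, not real parasol output):

import os

raw = 0x0200                # normal exit with code 2: low byte is zero
assert os.WIFEXITED(raw)
print(os.WEXITSTATUS(raw))  # -> 2, reported as-is

raw = 0x0009                # killed by signal 9: WIFEXITED is false
assert not os.WIFEXITED(raw)
print(-raw)                 # -> -9, the negated status the loop reports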
Example #8
    def getUpdatedBatchJob(self,
                           maxWait: int) -> Optional[UpdatedBatchJobInfo]:
        # Remember when we started, for respecting the timeout
        entry = datetime.datetime.now()
        while ((datetime.datetime.now() - entry).total_seconds() < maxWait
               or not maxWait):
            result = self.getUpdatedLocalJob(0)
            if result:
                return result

            try:
                # Collect together the list of AWS and batch system IDs for tasks we
                # are acknowledging and don't care about anymore.
                acknowledged = []

                for job_detail in self._describe_jobs_in_batches():
                    if job_detail.get('status') in ['SUCCEEDED', 'FAILED']:
                        # This job is done!
                        aws_id = job_detail['jobId']
                        bs_id = self.aws_id_to_bs_id[aws_id]

                        # Acknowledge it
                        acknowledged.append((aws_id, bs_id))

                        if aws_id in self.killed_job_aws_ids:
                            # Killed jobs aren't allowed to appear as updated.
                            logger.debug('Job %s was killed so skipping it',
                                         bs_id)
                            continue

                        # Otherwise, it stopped running and it wasn't our fault.

                        # Record runtime
                        runtime = self._get_runtime(job_detail)

                        # Determine if it succeeded
                        exit_reason = STATE_TO_EXIT_REASON[
                            job_detail['status']]

                        # Get its exit code
                        exit_code = self._get_exit_code(job_detail)

                        if job_detail[
                                'status'] == 'FAILED' and 'statusReason' in job_detail:
                            # AWS knows why the job failed, so log the error
                            logger.error('Job %s failed because: %s', bs_id,
                                         job_detail['statusReason'])

                        # Compose a result
                        return UpdatedBatchJobInfo(jobID=bs_id,
                                                   exitStatus=exit_code,
                                                   wallTime=runtime,
                                                   exitReason=exit_reason)

            finally:
                # Drop all the records for tasks we acknowledged
                for (aws_id, bs_id) in acknowledged:
                    del self.aws_id_to_bs_id[aws_id]
                    del self.bs_id_to_aws_id[bs_id]
                    if aws_id in self.killed_job_aws_ids:
                        # We don't need to remember that we killed this job anymore.
                        self.killed_job_aws_ids.remove(aws_id)

            if maxWait:
                # Wait a bit and poll again
                time.sleep(min(maxWait / 2, 1.0))
            else:
                # Only poll once
                break
        # If we get here we got nothing
        return None
Example #9
    def _handleChild(self, pid: int) -> None:
        """
        Handle a child process PID that has finished.
        The PID must be for a child job we started.
        Not thread safe to run at the same time as we are making more children.

        Remove the child from our bookkeeping structures and free its resources.
        """

        # Look up the child
        popen = self.children[pid]
        jobID = self.childToJob[pid]
        info = self.runningJobs[jobID]

        # Unpack the job resources
        (coreFractions, jobMemory, jobDisk) = info.resources

        # Clean up our records of the job.
        self.runningJobs.pop(jobID)
        self.childToJob.pop(pid)
        self.children.pop(pid)

        if popen.returncode is None or not callable(getattr(
                os, 'waitid', None)):
            # It isn't reaped yet, or we have to reap all children to see if they're done.
            # Before we reap it (if possible), kill its PID as a PGID to make sure
            # it isn't leaving children behind.
            # TODO: This is a PGID re-use risk on Mac because the process is
            # reaped already and the PGID may have been reused.
            try:
                os.killpg(pid, signal.SIGKILL)
            except ProcessLookupError:
                # It is dead already
                pass
            except PermissionError:
                # It isn't ours actually. Ours is dead.
                pass

        # See how the child did, and reap it.
        statusCode = popen.wait()
        if statusCode != 0 and not info.killIntended:
            log.error("Got exit code %i (indicating failure) "
                      "from job %s.", statusCode, self.jobs[jobID])
        if not info.killIntended:
            # Report if the job failed and we didn't kill it.
            # If we killed it then it shouldn't show up in the queue.
            self.outputQueue.put(
                UpdatedBatchJobInfo(jobID=jobID,
                                    exitStatus=statusCode,
                                    wallTime=time.time() - info.time,
                                    exitReason=None))

        # Make absolutely sure all processes in the group have received their
        # kill signals and been cleaned up.
        # TODO: this opens a PGID reuse risk; we reaped the process and its
        # PGID may have been re-used. But it probably hasn't been and we
        # definitely want to make sure all its children died before saying the
        # job is done. Some might not be dead yet if we don't do this.
        # TODO: can we safely do this before reaping? Or would we sit forever
        # signaling a dead but unreaped process?
        try:
            while True:
                # Send a kill to the group again, to see if anything in it
                # is still alive. Our first kill might not have been
                # delivered yet.
                os.killpg(pid, signal.SIGKILL)
                # If that worked it is still alive, so wait for the kernel
                # to stop fooling around and kill it.
                log.warning(
                    'Sent redundant job completion kill to surviving process group %s known to batch system %s',
                    pid, id(self))
                time.sleep(0.1)
        except ProcessLookupError:
            # It is dead already
            pass
        except PermissionError:
            # It isn't ours actually. Ours is dead.
            pass

        # Free up the job's resources.
        self.coreFractions.release(coreFractions)
        self.memory.release(jobMemory)
        self.disk.release(jobDisk)

        log.debug('Child %d for job %s succeeded', pid, jobID)
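
The process-group kills above rely on each child leading its own process group. A tiny illustrative check of that assumption (see also the start_new_session=True launch in Example #10): on Unix, setsid() makes the child's PGID equal its PID, so os.killpg(pid, ...) reaches the child and everything it spawned.

import os
import signal
import subprocess

child = subprocess.Popen(["sleep", "30"], start_new_session=True)
# start_new_session runs setsid() in the child, so it leads a fresh group.
assert os.getpgid(child.pid) == child.pid
os.killpg(child.pid, signal.SIGKILL)  # signals the whole group, like the code above
child.wait()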
Example #10
    def _startChild(self, jobCommand, jobID, coreFractions, jobMemory, jobDisk,
                    environment):
        """
        Start a child process for the given job.

        Allocate its required resources and save it in our bookkeeping structures.

        If the job is started, returns its PID.
        If the job fails to start, reports it as failed and returns False.
        If the job cannot get the resources it needs to start, returns None.
        """

        # We fill this in if we manage to actually start the child.
        popen = None

        # This is when we started working on the job.
        startTime = time.time()

        # See if we can fit the job in our resource pools right now.
        if self.coreFractions.acquireNow(coreFractions):
            # We got some cores
            if self.memory.acquireNow(jobMemory):
                # We got some memory
                if self.disk.acquireNow(jobDisk):
                    # We got the final resource, disk.
                    # Actually run the job.
                    # When it finishes we will release what it was using.
                    # So it is important to not lose track of the child process.

                    try:
                        # Launch the job.
                        # Make sure it is in its own session (and thus its own
                        # process group) so that, if the user signals the
                        # workflow, Toil will be responsible for killing the
                        # job. This also makes sure that we can signal the job
                        # and all its children together. We assume that the
                        # process group ID will equal the PID of the process we
                        # are starting.
                        popen = subprocess.Popen(jobCommand,
                                                 shell=True,
                                                 env=dict(
                                                     os.environ,
                                                     **environment),
                                                 start_new_session=True)
                    except Exception:
                        # If the job can't start, make sure we release resources now
                        self.coreFractions.release(coreFractions)
                        self.memory.release(jobMemory)
                        self.disk.release(jobDisk)

                        log.error('Could not start job %s: %s', jobID,
                                  traceback.format_exc())

                        # Report as failed.
                        self.outputQueue.put(
                            UpdatedBatchJobInfo(
                                jobID=jobID,
                                exitStatus=EXIT_STATUS_UNAVAILABLE_VALUE,
                                wallTime=0,
                                exitReason=None))

                        # Complain it broke.
                        return False
                    else:
                        # If the job did start, record it
                        self.children[popen.pid] = popen
                        # Make sure we can look it up by PID later
                        self.childToJob[popen.pid] = jobID
                        # Record that the job is running, and the resources it is using
                        info = Info(startTime,
                                    popen, (coreFractions, jobMemory, jobDisk),
                                    killIntended=False)
                        self.runningJobs[jobID] = info

                        log.debug('Launched job %s as child %d', jobID,
                                  popen.pid)

                        # Report success starting the job
                        # Note that if a PID were somehow 0 it would look like False
                        assert popen.pid != 0
                        return popen.pid
                else:
                    # We can't get disk, so free cores and memory
                    self.coreFractions.release(coreFractions)
                    self.memory.release(jobMemory)
                    self._setSchedulingStatusMessage(
                        'Not enough disk to run job %s' % jobID)
            else:
                # Free cores, since we can't get memory
                self.coreFractions.release(coreFractions)
                self._setSchedulingStatusMessage(
                    'Not enough memory to run job %s' % jobID)
        else:
            self._setSchedulingStatusMessage('Not enough cores to run job %s' %
                                             jobID)

        # If we get here, we didn't succeed or fail starting the job.
        # We didn't manage to get the resources.
        # Report that.
        return None
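
The coreFractions/memory/disk pools acquired here and released in Example #5 are assumed to expose a non-blocking acquireNow plus a release. A minimal sketch of that interface follows (assumed semantics, not the project's actual implementation):

import threading

class ResourcePool:
    """A counter of available resource units guarded by a lock."""

    def __init__(self, initial: int) -> None:
        self.value = initial
        self.lock = threading.Lock()

    def acquireNow(self, amount: int) -> bool:
        # Take `amount` if it is available right now; never block.
        with self.lock:
            if amount > self.value:
                return False
            self.value -= amount
            return True

    def release(self, amount: int) -> None:
        # Give `amount` back so other jobs can be scheduled.
        with self.lock:
            self.value += amount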
Example #11
    def _getUpdatedBatchJobImmediately(self):
        """
        Return None if no updated (completed or failed) batch job is currently
        available, and jobID, exitCode, runtime if such a job can be found.
        """

        # See if a local batch job has updated and is available immediately
        local_tuple = self.getUpdatedLocalJob(0)
        if local_tuple:
            # If so, use it
            return local_tuple

        # Otherwise we didn't get a local job.

        # Go looking for other jobs

        # Everybody else does this with a queue and some other thread that
        # is responsible for populating it.
        # But we can just ask kubernetes now.

        # Find a job that is done, failed, or stuck
        jobObject = None
        # Put 'done', 'failed', or 'stuck' here
        chosenFor = ''
        for j in self._ourJobObjects(onlySucceeded=True, limit=1):
            # Look for succeeded jobs because that's the only filter Kubernetes has
            jobObject = j
            chosenFor = 'done'

        if jobObject is None:
            for j in self._ourJobObjects():
                # If there aren't any succeeded jobs, scan all jobs
                # See how many times each failed
                failCount = getattr(j.status, 'failed', 0)
                if failCount is None:
                    # Make sure it is an int
                    failCount = 0
                if failCount > 0:
                    # Take the first failed one you find
                    jobObject = j
                    chosenFor = 'failed'
                    break

        if jobObject is None:
            # If no jobs are failed, look for jobs with pods that are stuck for various reasons.
            for j in self._ourJobObjects():
                pod = self._getPodForJob(j)

                if pod is None:
                    # Skip jobs with no pod
                    continue

                # Containers can get stuck in Waiting with reason ImagePullBackOff

                # Get the statuses of the pod's containers
                containerStatuses = pod.status.container_statuses
                if containerStatuses is None or len(containerStatuses) == 0:
                    # Pod exists but has no container statuses
                    # This happens when the pod is just "Scheduled"
                    # ("PodScheduled" status event) and isn't actually starting
                    # to run yet.
                    # Can't be stuck in ImagePullBackOff
                    continue

                waitingInfo = getattr(
                    getattr(pod.status.container_statuses[0], 'state', None),
                    'waiting', None)
                if waitingInfo is not None and waitingInfo.reason == 'ImagePullBackOff':
                    # Assume it will never finish, even if the registry comes back or whatever.
                    # We can get into this state when we send in a non-existent image.
                    # See https://github.com/kubernetes/kubernetes/issues/58384
                    jobObject = j
                    chosenFor = 'stuck'
                    logger.warning(
                        'Failing stuck job; did you try to run a non-existent Docker image?'
                        ' Check TOIL_APPLIANCE_SELF.')
                    break

                # Pods can also get stuck nearly but not quite out of memory,
                # if their memory limits are high and they try to exhaust them.

                if self._isPodStuckOOM(pod):
                    # We found a job that probably should be OOM! Report it as stuck.
                    # Polling function takes care of the logging.
                    jobObject = j
                    chosenFor = 'stuck'
                    break

        if jobObject is None:
            # Say we couldn't find anything
            return None

        # Otherwise we got something.

        # Work out what the job's ID was (whatever came after our name prefix)
        jobID = int(jobObject.metadata.name[len(self.jobPrefix):])

        # Work out when the job was submitted. If the pod fails before actually
        # running, this is the basis for our runtime.
        jobSubmitTime = getattr(jobObject.status, 'start_time', None)
        if jobSubmitTime is None:
            # If somehow this is unset, say it was just now.
            jobSubmitTime = utc_now()

        # Grab the pod
        pod = self._getPodForJob(jobObject)

        if pod is not None:
            if chosenFor == 'done' or chosenFor == 'failed':
                # The job actually finished or failed

                # Get the statuses of the pod's containers
                containerStatuses = pod.status.container_statuses

                # Get when the pod started (reached the Kubelet) as a datetime
                startTime = getattr(pod.status, 'start_time', None)
                if startTime is None:
                    # If the pod never made it to the kubelet to get a
                    # start_time, say it was when the job was submitted.
                    startTime = jobSubmitTime

                if containerStatuses is None or len(containerStatuses) == 0:
                    # No statuses available.
                    # This happens when a pod is "Scheduled". But how could a
                    # 'done' or 'failed' pod be merely "Scheduled"?
                    # Complain so we can find out.
                    logger.warning(
                        'Exit code and runtime unavailable; pod has no container statuses'
                    )
                    logger.warning('Pod: %s', str(pod))
                    exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
                    # Say it stopped now and started when it was scheduled/submitted.
                    # We still need a strictly positive runtime.
                    runtime = slow_down((utc_now() - startTime).total_seconds())
                else:
                    # Get the termination info from the pod's main (only) container
                    terminatedInfo = getattr(
                        getattr(containerStatuses[0], 'state', None),
                        'terminated', None)
                    if terminatedInfo is None:
                        logger.warning(
                            'Exit code and runtime unavailable; pod stopped without container terminating'
                        )
                        logger.warning('Pod: %s', str(pod))
                        exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
                        # Say it stopped now and started when it was scheduled/submitted.
                        # We still need a strictly positive runtime.
                        runtime = slow_down(
                            (utc_now() - startTime).total_seconds())
                    else:
                        # Extract the exit code
                        exitCode = terminatedInfo.exit_code

                        # Compute how long the job actually ran for (subtract
                        # datetimes). We need to look at the pod's start time
                        # because the job's start time is just when the job is
                        # created. And we need to look at the pod's end time
                        # because the job only gets a completion time if
                        # successful.
                        runtime = slow_down(
                            (terminatedInfo.finished_at -
                             pod.status.start_time).total_seconds())

                        if chosenFor == 'failed':
                            # Warn the user with the failed pod's log
                            # TODO: cut this down somehow?
                            logger.warning('Log from failed pod: %s',
                                           self._getLogForPod(pod))

            else:
                # The job has gotten stuck

                assert chosenFor == 'stuck'

                # Synthesize an exit code
                exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
                # Say it ran from when the job was submitted to when the pod got stuck
                runtime = slow_down((utc_now() - jobSubmitTime).total_seconds())
        else:
            # The pod went away from under the job.
            logging.warning('Exit code and runtime unavailable; pod vanished')
            exitCode = EXIT_STATUS_UNAVAILABLE_VALUE
            # Say it ran from when the job was submitted to when the pod vanished
            runtime = slow_down((utc_now() - jobSubmitTime).total_seconds())

        try:
            # Delete the job and all dependents (pods)
            self._api('batch').delete_namespaced_job(
                jobObject.metadata.name,
                self.namespace,
                propagation_policy='Foreground')

            # That just kicks off the deletion process. Foreground doesn't
            # actually block. See
            # https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/#foreground-cascading-deletion
            # We have to either wait until the deletion is done and we can't
            # see the job anymore, or ban the job from being "updated" again if
            # we see it. If we don't block on deletion, we can't use limit=1
            # on our query for succeeded jobs. So we poll for the job's
            # non-existence.
            self._waitForJobDeath(jobObject.metadata.name)

        except kubernetes.client.rest.ApiException:
            # TODO: check to see if this is a 404 on the thing we tried to delete
            # If so, it is gone already and we don't need to delete it again.
            pass

        # Return the one finished job we found
        return UpdatedBatchJobInfo(jobID=jobID,
                                   exitStatus=exitCode,
                                   wallTime=runtime,
                                   exitReason=None)
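
slow_down is applied wherever the comments above demand a strictly positive runtime. A guess at its behavior based on those comments (the floor value here is hypothetical; the real helper lives in Toil):

def slow_down(seconds: float, floor: float = 0.1) -> float:
    # Clamp a measured runtime so it is always strictly positive, as the
    # comments above require.
    return max(seconds, floor)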
Example #12
    def getUpdatedBatchJob(self, maxWait):

        entry = datetime.datetime.now()

        result = self._getUpdatedBatchJobImmediately()

        if result is not None or maxWait == 0:
            # We got something on the first try, or we only get one try
            return result

        # Otherwise we need to maybe wait.

        if self.enableWatching:
            # Try watching for something to happen and use that.

            w = kubernetes.watch.Watch()

            if self.enableWatching:
                for j in self._ourJobObjects():
                    for event in w.stream(
                            self._api('core').list_namespaced_pod,
                            self.namespace,
                            timeout_seconds=maxWait):
                        pod = event['object']
                        if pod.metadata.name.startswith(self.jobPrefix):
                            if pod.status.phase == 'Failed' or pod.status.phase == 'Succeeded':
                                containerStatuses = pod.status.container_statuses
                                logger.debug("FINISHED")
                                if containerStatuses is None or len(
                                        containerStatuses) == 0:
                                    logger.debug(
                                        "No job container statuses for job %s"
                                        %
                                        (pod.metadata.owner_references[0].name)
                                    )
                                    return UpdatedBatchJobInfo(
                                        jobID=int(
                                            pod.metadata.owner_references[0].
                                            name[len(self.jobPrefix):]),
                                        exitStatus=
                                        EXIT_STATUS_UNAVAILABLE_VALUE,
                                        wallTime=0,
                                        exitReason=None)

                                # Get termination information from the pod
                                termination = pod.status.container_statuses[
                                    0].state.terminated
                                logger.info("REASON: %s Exit Code: %s",
                                            termination.reason,
                                            termination.exit_code)

                                if termination.exit_code != 0:
                                    # The pod failed. Dump information about it.
                                    logger.debug('Failed pod information: %s',
                                                 str(pod))
                                    logger.warning('Log from failed pod: %s',
                                                   self._getLogForPod(pod))
                                jobID = int(pod.metadata.owner_references[0].
                                            name[len(self.jobPrefix):])
                                terminated = pod.status.container_statuses[
                                    0].state.terminated
                                runtime = slow_down(
                                    (terminated.finished_at -
                                     terminated.started_at).total_seconds())
                                result = UpdatedBatchJobInfo(
                                    jobID=jobID,
                                    exitStatus=terminated.exit_code,
                                    wallTime=runtime,
                                    exitReason=None)
                                self._api('batch').delete_namespaced_job(
                                    pod.metadata.owner_references[0].name,
                                    self.namespace,
                                    propagation_policy='Foreground')

                                self._waitForJobDeath(
                                    pod.metadata.owner_references[0].name)
                                return result
                            else:
                                continue
        else:
            # Try polling instead
            while result is None and (datetime.datetime.now() -
                                      entry).total_seconds() < maxWait:
                # We still have nothing and we haven't hit the timeout.

                # Poll
                result = self._getUpdatedBatchJobImmediately()

                if result is None:
                    # Still nothing. Wait a second, or some fraction of our max wait time.
                    time.sleep(min(maxWait / 2, 1.0))

            # When we get here, either we found something or we ran out of time
            return result
Example #13
File: tes.py Project: tmooney/toil
    def getUpdatedBatchJob(self,
                           maxWait: int) -> Optional[UpdatedBatchJobInfo]:
        # Remember when we started, for respecting the timeout
        entry = datetime.datetime.now()
        # This is the updated job we have found, if any
        result = None
        while result is None and (
            (datetime.datetime.now() - entry).total_seconds() < maxWait
                or not maxWait):
            result = self.getUpdatedLocalJob(0)

            if result:
                return result

            # Collect together the list of TES and batch system IDs for tasks we
            # are acknowledging and don't care about anymore.
            acknowledged = []

            for tes_id, bs_id in self.tes_id_to_bs_id.items():
                # Immediately poll all the jobs we issued.
                # TODO: There's no way to acknowledge a finished job, so there's no
                # faster way to find the newly finished jobs than polling
                task = self.tes.get_task(tes_id, view="MINIMAL")
                if task.state in [
                        "COMPLETE", "CANCELED", "EXECUTOR_ERROR",
                        "SYSTEM_ERROR"
                ]:
                    # This task is done!
                    logger.debug("Found stopped task: %s", task)

                    # Acknowledge it
                    acknowledged.append((tes_id, bs_id))

                    if task.state == "CANCELED":
                        # Killed jobs aren't allowed to appear as updated.
                        continue

                    # Otherwise, it stopped running and it wasn't our fault.

                    # Fetch the task's full info, including logs.
                    task = self.tes.get_task(tes_id, view="FULL")

                    # Record runtime
                    runtime = self._get_runtime(task)

                    # Determine if it succeeded
                    exit_reason = STATE_TO_EXIT_REASON[task.state]

                    # Get its exit code
                    exit_code = self._get_exit_code(task)

                    if task.state == "EXECUTOR_ERROR":
                        # The task failed, so report executor logs.
                        logger.warning('Log from failed executor: %s',
                                       self.__get_log_text(task))

                    # Compose a result
                    result = UpdatedBatchJobInfo(jobID=bs_id,
                                                 exitStatus=exit_code,
                                                 wallTime=runtime,
                                                 exitReason=exit_reason)

                    # No more iteration needed, we found a result.
                    break

            # After the iteration, drop all the records for tasks we acknowledged
            for (tes_id, bs_id) in acknowledged:
                del self.tes_id_to_bs_id[tes_id]
                del self.bs_id_to_tes_id[bs_id]

            if not maxWait:
                # Don't wait at all
                break
            elif result is None:
                # Wait a bit and poll again
                time.sleep(min(maxWait / 2, 1.0))

        # When we get here we have all the result we can get
        return result