Beispiel #1
0
 def finish_job(self, job_state):
     """Run the parent ``finish_job`` and then clean up the matching k8s job.

     Jobs are looked up with the selector ``app=<job_state.job_id>`` in the
     namespace given by ``runner_params['k8s_namespace']``. Cleanup only
     happens when exactly one job matches.
     """
     super(KubernetesJobRunner, self).finish_job(job_state)
     query = Job.objects(self._pykube_api)
     selector = "app=" + job_state.job_id
     matches = query.filter(selector=selector,
                            namespace=self.runner_params['k8s_namespace'])
     items = matches.response['items']
     if len(items) != 1:
         # Anything other than a single match is left untouched; several
         # matches for one selector indicate a configuration error.
         return
     self.__cleanup_k8s_job(Job(self._pykube_api, items[0]))
Beispiel #2
0
 def finish_job(self, job_state):
     """Run the parent ``finish_job`` and then clean up the matching k8s job.

     Jobs are looked up with the selector ``app=<job_state.job_id>`` in the
     namespace given by ``runner_params['k8s_namespace']``.

     * No match: a warning is logged and nothing is cleaned up (previously
       this raised ``IndexError`` when indexing the empty result).
     * More than one match: a warning is logged and the first match is
       cleaned up, as before.
     * Exactly one match: that job is cleaned up.
     """
     super(KubernetesJobRunner, self).finish_job(job_state)
     jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id,
                                                 namespace=self.runner_params['k8s_namespace'])
     items = jobs.response['items']
     if not items:
         # Nothing to clean up; bail out instead of indexing an empty list.
         log.warning("No job matches selector. Nothing to clean up"
                     " for job id '%s'", job_state.job_id)
         return
     if len(items) > 1:
         log.warning("More than one job matches selector. Possible configuration error"
                     " in job id '%s'", job_state.job_id)
     job = Job(self._pykube_api, items[0])
     self.__cleanup_k8s_job(job)
Beispiel #3
0
 def _active_kubernetes_jobs(self):
     """Return the summed ``active`` status counts of this instance's k8s jobs.

     A job is counted when ``self.instance_id`` occurs in its metadata name;
     jobs whose status has no ``active`` entry contribute 0.
     """
     api = pykube_client_from_dict({})
     # TODO: namespace.
     return sum(
         k8s_job.obj["status"].get("active", 0)
         for k8s_job in Job.objects(api).filter()
         if self.instance_id in k8s_job.obj["metadata"]["name"]
     )
Beispiel #4
0
 def stop_job(self, job_wrapper):
     """Try to remove the k8s job that was dispatched for ``job_wrapper``.

     The k8s job is located via the selector ``app=<unique k8s job name>``
     in ``runner_params['k8s_namespace']``; the first match, if any, is
     cleaned up. Any exception raised along the way is caught and only
     reported through a debug log entry.
     """
     job = job_wrapper.get_job()
     try:
         query = Job.objects(self._pykube_api)
         k8s_job_name = self.__produce_unique_k8s_job_name(job.get_id_tag())
         found = query.filter(
             selector="app=" + k8s_job_name,
             namespace=self.runner_params['k8s_namespace']).response['items']
         if len(found) > 0:
             self.__cleanup_k8s_job(Job(self._pykube_api, found[0]))
         # TODO assert whether job parallelism == 0
         # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
         success_message = "(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id)
         log.debug(success_message)
     except Exception as e:
         failure_message = "(%s/%s) User killed running job, but error encountered during termination: %s" % (
             job.id, job.job_runner_external_id, e)
         log.debug(failure_message)
Beispiel #5
0
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState

        The k8s job is looked up with the selector ``app=<job_state.job_id>``
        in ``runner_params['k8s_namespace']``.

        Returns ``None`` when the job should be dropped from the watched jobs
        (it succeeded, or it was deleted via ``stop_job``). Returns
        ``job_state`` when the job is still pending/running, or after it has
        been marked as failed because zero or several k8s jobs matched.
        Otherwise returns whatever ``self._handle_job_failure`` returns.
        """
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + job_state.job_id,
            namespace=self.runner_params['k8s_namespace'])
        items = jobs.response['items']
        if len(items) == 1:
            job = Job(self._pykube_api, items[0])
            job_destination = job_state.job_wrapper.job_destination

            # Resolve the retry limit: destination params take precedence over
            # runner params, and the current spelling over the legacy one.
            if 'max_pod_retries' in job_destination.params:
                max_pod_retries = int(
                    job_destination.params['max_pod_retries'])
            elif 'k8s_pod_retries' in self.runner_params:
                max_pod_retries = int(self.runner_params['k8s_pod_retries'])
            elif 'max_pod_retrials' in job_destination.params:
                # For backward compatibility
                max_pod_retries = int(
                    job_destination.params['max_pod_retrials'])
            elif 'k8s_pod_retrials' in self.runner_params:
                # For backward compatibility. Read the same key that was
                # tested; the previous code read 'max_pod_retrials' here,
                # which raised KeyError.
                max_pod_retries = int(self.runner_params['k8s_pod_retrials'])
            else:
                max_pod_retries = 1

            # Check if job.obj['status'] is empty,
            # return job_state unchanged if this is the case
            # as probably this means that the k8s API server hasn't
            # had time to fill in the object status since the
            # job was created only too recently.
            status = job.obj['status']
            if not status:
                return job_state
            succeeded = status.get('succeeded', 0)
            active = status.get('active', 0)
            failed = status.get('failed', 0)

            # This assumes jobs dependent on a single pod, single container
            if succeeded > 0:
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retries:
                if not job_state.running:
                    job_state.running = True
                    job_state.job_wrapper.change_state(
                        model.Job.states.RUNNING)
                return job_state
            elif job_state.job_wrapper.get_job(
            ).state == model.Job.states.DELETED:
                # Job has been deleted via stop_job and job has not been deleted,
                # remove from watched_jobs by returning `None`
                if job_state.job_wrapper.cleanup_job in ("always",
                                                         "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            else:
                return self._handle_job_failure(job, job_state)

        elif len(items) == 0:
            if job_state.job_wrapper.get_job(
            ).state == model.Job.states.DELETED:
                # Job has been deleted via stop_job and job has been deleted,
                # cleanup and remove from watched_jobs by returning `None`
                if job_state.job_wrapper.cleanup_job in ("always",
                                                         "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            # there is no job responding to this job_id, it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=%s",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app=%s\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated to the expected unique job id used as selector.
            log.error("More than one Kubernetes Job associated to job id '%s'",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "More than one Kubernetes Job associated with job id '%s'\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state