def queue_job(self, job_wrapper):
    """Build the Kubernetes Job for a Galaxy job, submit it and queue it for monitoring.

    Nothing is submitted when ``prepare_job`` reports failure. If a Kubernetes
    Job with the same name already exists it is either left in place (a Galaxy
    instance id is set, so the match is trusted) or deleted and re-created
    (no instance id, so the match is not trusted). If the deletion does not
    complete within ``k8s_timeout_seconds_job_deletion`` the Galaxy job is not
    queued and the method returns early.

    Returns ``None`` in every case (no external run-job script is used).
    """
    # include_metadata / include_work_dir_outputs are not needed: the working
    # directory is the same place where Galaxy will expect the results.
    log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
    prepared = self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False)
    if not prepared:
        return

    destination = job_wrapper.job_destination

    # The Kubernetes Job object is laid out following
    # http://kubernetes.io/docs/user-guide/persistent-volumes/
    k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
    api_version = self.runner_params['k8s_job_api_version']
    # metadata.name is the name of the pod resource created, and must be unique
    # http://kubernetes.io/docs/user-guide/configuring-containers/
    metadata = {
        "name": k8s_job_name,
        "namespace": self.runner_params['k8s_namespace'],
        "labels": {"app": k8s_job_name},
    }
    manifest = {
        "apiVersion": api_version,
        "kind": "Job",
        "metadata": metadata,
        "spec": self.__get_k8s_job_spec(job_wrapper),
    }

    # Decide whether an already existing job can be reused or must be re-created.
    job = Job(self._pykube_api, manifest)
    if job.exists() and not self._galaxy_instance_id:
        # Without a galaxy instance id a matching job cannot be trusted, so it
        # is deleted and created again from scratch.
        log.debug("Matching job exists, but Job is not trusted, so it will be deleted and a new one created.")
        job.delete()
        waited_seconds = 0
        while job.exists():
            sleep(3)
            waited_seconds += 3
            if waited_seconds > self.runner_params['k8s_timeout_seconds_job_deletion']:
                log.debug("Timed out before k8s could delete existing untrusted job " + k8s_job_name +
                          ", not queuing associated Galaxy job.")
                return
            log.debug("Waiting for job to be deleted " + k8s_job_name)

        Job(self._pykube_api, manifest).create()
    elif job.exists() and self._galaxy_instance_id:
        # The job exists and the identifier is trusted: nothing to create, the
        # existing k8s job is picked up later when watched items are checked.
        log.debug("Matching job exists, but Job is trusted, so we simply use the existing one for " + k8s_job_name)
    else:
        # No such Kubernetes Job yet, create it.
        job.create()

    # Record what is needed to follow the job up asynchronously.
    async_state = AsynchronousJobState(
        files_dir=job_wrapper.working_directory,
        job_wrapper=job_wrapper,
        job_id=k8s_job_name,
        job_destination=destination,
    )
    self.monitor_queue.put(async_state)

    # external_runJob_script can be None, in which case it's not used.
    return None
def queue_job(self, job_wrapper):
    """Submit a Galaxy job to Kubernetes as a Job object and queue it for monitoring.

    Nothing is submitted when ``prepare_job`` reports failure. An existing
    Kubernetes Job with the same name is deleted before the new one is created.

    Returns ``None`` in every case (no external run-job script is used).
    """
    # include_metadata / include_work_dir_outputs are not needed: the working
    # directory is the same place where Galaxy will expect the results.
    log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
    prepared = self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False)
    if not prepared:
        return

    destination = job_wrapper.job_destination

    # The Kubernetes Job object is laid out following
    # http://kubernetes.io/docs/user-guide/persistent-volumes/
    k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
    # metadata.name is the name of the pod resource created, and must be unique
    # http://kubernetes.io/docs/user-guide/configuring-containers/
    metadata = {
        "name": k8s_job_name,
        "namespace": "default",  # TODO this should be set
        "labels": {"app": k8s_job_name},
    }
    manifest = {
        "apiVersion": "extensions/v1beta1",
        "kind": "Job",
        "metadata": metadata,
        "spec": self.__get_k8s_job_spec(job_wrapper),
    }

    # Remove any job already registered under the same name.
    existing = Job(self._pykube_api, manifest)
    if existing.exists():
        existing.delete()

    # Create the Kubernetes Job.
    # TODO if a job with that ID exists, what should we do?
    # TODO do we trust that this is the same job and use that?
    # TODO or create a new job as we cannot make sure
    Job(self._pykube_api, manifest).create()

    # Record what is needed to follow the job up asynchronously.
    async_state = AsynchronousJobState(
        files_dir=job_wrapper.working_directory,
        job_wrapper=job_wrapper,
        job_id=k8s_job_name,
        job_destination=destination,
    )
    self.monitor_queue.put(async_state)

    # external_runJob_script can be None, in which case it's not used.
    return None
def run(self):
    """Render the Kubernetes Job definition for this task, submit it and track it.

    The job is labelled with ``spawned_by=luigi`` and ``luigi_task_id=<job_uuid>``
    plus any user labels from ``self.labels``. A ``restartPolicy`` of ``Never``
    is added to the pod spec when ``self.spec_schema`` does not define one.
    """
    self._init_kubernetes()

    # Render the job, section by section.
    metadata = {
        "name": self.uu_name,
        "labels": {
            "spawned_by": "luigi",
            "luigi_task_id": self.job_uuid,
        },
    }
    pod_template = {
        "metadata": {"name": self.uu_name},
        "spec": self.spec_schema,
    }
    job_json = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": metadata,
        "spec": {"template": pod_template},
    }

    # Merge the user supplied labels on top of the default ones.
    metadata["labels"].update(self.labels)

    # Default the restart policy when the pod spec does not set it.
    if "restartPolicy" not in self.spec_schema:
        pod_template["spec"]["restartPolicy"] = "Never"

    # Submit the job.
    self.__logger.info("Submitting Kubernetes Job: " + self.uu_name)
    Job(self.__kube_api, job_json).create()

    # Track the job (wait while active).
    self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
    self.__track_job()
def check_watched_item(self, job_state):
    """Check the state of a job already submitted to k8s.

    ``job_state`` is an AsynchronousJobState. The Kubernetes Job is looked up
    by the ``app=<job_id>`` label inside the configured namespace.

    Returns ``None`` once the job has succeeded, whatever
    ``_handle_job_failure`` returns when the job failed, and ``job_state`` in
    all other cases (still active, unclassified, no matching job, or more
    than one matching job; the last two are marked as failed first).
    """
    query = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id,
                                                 namespace=self.runner_params['k8s_namespace'])
    matches = query.response['items']
    match_count = len(matches)

    if match_count == 0:
        # No job answers to this job_id: it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=" + job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write("No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
        self.mark_as_failed(job_state)
        return job_state

    if match_count != 1:
        # More than one job is associated to the supposedly unique job id used as selector.
        log.error("There is more than one Kubernetes Job associated to job id " + job_state.job_id)
        self.__produce_log_file(job_state)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write("There is more than one Kubernetes Job associated to job id " + job_state.job_id + "\n")
        self.mark_as_failed(job_state)
        return job_state

    job = Job(self._pykube_api, matches[0])
    job_destination = job_state.job_wrapper.job_destination

    # Allowed pod retrials: default 1, overridden by the runner parameters and
    # then by the job destination parameters.
    max_pod_retrials = 1
    if 'k8s_pod_retrials' in self.runner_params:
        max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
    if 'max_pod_retrials' in job_destination.params:
        max_pod_retrials = int(job_destination.params['max_pod_retrials'])

    # Counters missing from the status are treated as zero.
    status = job.obj['status']
    succeeded = status.get('succeeded', 0)
    active = status.get('active', 0)
    failed = status.get('failed', 0)

    # This assumes jobs dependent on a single pod, single container
    if succeeded > 0:
        self.__produce_log_file(job_state)
        # Opening for writing leaves an empty error file behind.
        with open(job_state.error_file, 'w'):
            pass
        job_state.running = False
        self.mark_as_finished(job_state)
        return None

    if failed > 0 and self.__job_failed_due_to_low_memory(job_state):
        return self._handle_job_failure(job, job_state, reason="OOM")

    if active > 0 and failed <= max_pod_retrials:
        job_state.running = True
        return job_state

    if failed > max_pod_retrials:
        return self._handle_job_failure(job, job_state)

    # We should not get here
    log.debug(
        "Reaching unexpected point for Kubernetes job name %s where it is not classified as succ., active nor failed.",
        job.name)
    log.debug("k8s full job object:\n%s", job.obj)
    return job_state
def run(self):
    """Run the task locally, or submit it as a Kubernetes Job and track it.

    When ``self.runlocal`` is set the work is delegated to ``Task.run``.
    Otherwise the job definition is rendered through ``_create_job_json`` and
    ``_add_env_variables``, optionally given an ``activeDeadlineSeconds``,
    labelled with the user labels, defaulted to ``restartPolicy: Never`` when
    the pod spec does not set one, then submitted and tracked.
    """
    if self.runlocal:
        Task.run(self)
        return

    # Equivalent of KubernetesJobTask.run()
    self._init_kubernetes()

    # Render the job.
    job_json = self._add_env_variables(self._create_job_json())

    if self.active_deadline_seconds is not None:
        job_json['spec']['activeDeadlineSeconds'] = self.active_deadline_seconds

    # Merge the user supplied labels into the job labels.
    job_labels = job_json['metadata']['labels']
    job_labels.update(self.labels)

    # Default the restart policy when the pod spec does not set it.
    if "restartPolicy" not in self.spec_schema:
        pod_spec = job_json["spec"]["template"]["spec"]
        pod_spec["restartPolicy"] = "Never"

    # Submit the job.
    self.__logger.info("Submitting Kubernetes Job: " + self.uu_name)
    Job(self.__kube_api, job_json).create()

    # Track the job (wait while active).
    self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
    self.__track_job()
def __get_job(self):
    """Return the Kubernetes Job labelled with this task's ``luigi_task_id``.

    The lookup is restricted to ``self.job_namespace``. An ``AssertionError``
    is raised unless exactly one job matches.
    """
    query = Job.objects(self.__kube_api).filter(
        namespace=self.job_namespace,
        selector="luigi_task_id=" + self.job_uuid,
    )
    matches = query.response['items']
    # The message is only built when the assertion fails.
    assert len(matches) == 1, (
        "Kubernetes job " + self.job_namespace + "/" + self.uu_name
        + " not found(job_uuid= " + self.job_uuid + ")"
    )
    return Job(self.__kube_api, matches[0])
def test_fail_job(self):
    """A failing job raises RuntimeError and is retried more than ``max_retrials`` times.

    Requires a reachable cluster configured in ``~/.kube/config`` (assumes minikube).
    """
    fail = FailJob()
    self.assertRaises(RuntimeError, fail.run)

    # Check for retrials
    kube_api = HTTPClient(KubeConfig.from_file("~/.kube/config"))  # assumes minikube
    jobs = Job.objects(kube_api).filter(selector="luigi_task_id=" + fail.job_uuid)
    items = jobs.response["items"]
    self.assertEqual(len(items), 1)

    job = Job(kube_api, items[0])
    status = job.obj["status"]
    # assertIn / assertGreater report the actual values when they fail,
    # unlike assertTrue on a pre-evaluated boolean.
    self.assertIn("failed", status)
    self.assertGreater(status["failed"], fail.max_retrials)
def stop_job(self, job_wrapper):
    """Attempt to stop a job dispatched to the k8s cluster.

    The Kubernetes Job is looked up by its ``app=<unique job name>`` label and,
    when found, scaled down to zero replicas. This is best effort: any error
    raised while doing so is logged and not propagated.
    """
    job = job_wrapper.get_job()
    try:
        selector = "app=" + self.__produce_unique_k8s_job_name(job.get_id_tag())
        jobs = Job.objects(self._pykube_api).filter(selector=selector)
        items = jobs.response['items']
        # The original check was `len(items) >= 0`, which is always true and
        # made an empty result fail with an IndexError on items[0].
        if items:
            job_to_delete = Job(self._pykube_api, items[0])
            job_to_delete.scale(replicas=0)
            # TODO assert whether job parallelism == 0
            # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
            log.debug("(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id))
        else:
            log.debug("(%s/%s) User killed job, but no Kubernetes job was found under selector %s" % (
                job.id, job.job_runner_external_id, selector))
    except Exception as e:
        log.debug("(%s/%s) User killed running job, but error encountered during termination: %s" % (
            job.id, job.job_runner_external_id, e))
def run(self):
    """Render the Kubernetes Job definition for this task, submit it and track it.

    The job carries ``self.backoff_limit`` as ``backoffLimit``, an optional
    namespace and ``activeDeadlineSeconds``, and the user labels on both the
    job and its pod template. A ``restartPolicy`` of ``Never`` is added when
    ``self.spec_schema`` does not define one. If tracking raises an
    ``AssertionError`` the pod logs are printed before it is re-raised.
    """
    self._init_kubernetes()

    # Render the job, section by section.
    metadata = {
        "name": self.uu_name,
        "labels": {
            "spawned_by": "luigi",
            "luigi_task_id": self.job_uuid,
        },
    }
    job_spec = {
        "backoffLimit": self.backoff_limit,
        "template": {
            "metadata": {
                "name": self.uu_name,
                "labels": {},
            },
            "spec": self.spec_schema,
        },
    }
    job_json = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": metadata,
        "spec": job_spec,
    }

    # Optional settings are only written when they are configured.
    if self.kubernetes_namespace is not None:
        metadata['namespace'] = self.kubernetes_namespace
    if self.active_deadline_seconds is not None:
        job_spec['activeDeadlineSeconds'] = self.active_deadline_seconds

    # Apply the user labels to the job and to the pod template.
    metadata['labels'].update(self.labels)
    job_spec['template']['metadata']['labels'].update(self.labels)

    # Default the restart policy when the pod spec does not set it.
    if "restartPolicy" not in self.spec_schema:
        job_spec["template"]["spec"]["restartPolicy"] = "Never"

    # Submit the job.
    self.__logger.info("Submitting Kubernetes Job: " + self.uu_name)
    Job(self.__kube_api, job_json).create()

    # Track the job (wait while active).
    self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
    try:
        self.__track_job()
    except AssertionError:
        self.__print_pod_logs()
        raise
def __get_job_status(self):
    """Return the Kubernetes job status as ``"succeeded"``, ``"failed"`` or ``"running"``.

    The job is looked up by its ``luigi_task_id`` label; a ``RuntimeError`` is
    raised when no job matches. A job counts as failed only once its failure
    count exceeds ``self.max_retrials``. Succeeded and failed jobs are scaled
    down to zero replicas before returning.
    """
    # Look for the required job
    query = Job.objects(self.__kube_api).filter(selector="luigi_task_id=" + self.job_uuid)
    matches = query.response["items"]

    # Raise an exception if no such job found
    if len(matches) == 0:
        raise RuntimeError("Kubernetes job " + self.uu_name + " not found")

    # Figure out status and return it
    job = Job(self.__kube_api, matches[0])
    status = job.obj["status"]

    if status.get("succeeded", 0) > 0:
        # Downscale the job, but keep it for logging
        job.scale(replicas=0)
        return "succeeded"

    if "failed" in status:
        failed_cnt = status["failed"]
        self.__logger.debug("Kubernetes job " + self.uu_name
                            + " status.failed: " + str(failed_cnt))
        if failed_cnt > self.max_retrials:
            # avoid more retrials
            job.scale(replicas=0)
            return "failed"

    return "running"
def check_watched_item(self, job_state):
    """Check the state of a job already submitted to k8s.

    ``job_state`` is an AsynchronousJobState. The Kubernetes Job is looked up
    by the ``app=<job_id>`` label.

    Returns ``None`` once the job has succeeded or has exceeded the allowed
    number of pod retrials (it is then marked as failed and scaled down to
    zero replicas). Returns ``job_state`` in all other cases: still active,
    unclassified, no matching job, or more than one matching job (the last two
    are marked as failed first).
    """
    jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id)
    items = jobs.response['items']

    if len(items) == 1:
        job = Job(self._pykube_api, items[0])
        job_destination = job_state.job_wrapper.job_destination

        # Allowed pod retrials: default 1, overridden by the runner parameters
        # and then by the job destination parameters.
        max_pod_retrials = 1
        if 'k8s_pod_retrials' in self.runner_params:
            max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
        if 'max_pod_retrials' in job_destination.params:
            max_pod_retrials = int(job_destination.params['max_pod_retrials'])

        # Counters missing from the status are treated as zero.
        status = job.obj['status']
        succeeded = status.get('succeeded', 0)
        active = status.get('active', 0)
        failed = status.get('failed', 0)

        # This assumes jobs dependent on a single pod, single container
        if succeeded > 0:
            self.__produce_log_file(job_state)
            # Opening for writing leaves an empty error file behind; the
            # context manager closes the handle even if something goes wrong.
            with open(job_state.error_file, 'w'):
                pass
            job_state.running = False
            self.mark_as_finished(job_state)
            return None
        elif active > 0 and failed <= max_pod_retrials:
            job_state.running = True
            return job_state
        elif failed > max_pod_retrials:
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("Exceeded max number of Kubernetes pod retrials allowed for job\n")
            job_state.running = False
            job_state.fail_message = "More pods failed than allowed. See stdout for pods details."
            self.mark_as_failed(job_state)
            # Scale down to zero replicas after marking the job as failed.
            job.scale(replicas=0)
            return None

        # We should not get here
        log.debug(
            "Reaching unexpected point for Kubernetes job, where it is not classified as succ., active nor failed."
        )
        return job_state

    elif len(items) == 0:
        # there is no job responding to this job_id, it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=" + job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
        self.mark_as_failed(job_state)
        return job_state

    else:
        # there is more than one job associated to the expected unique job id used as selector.
        log.error("There is more than one Kubernetes Job associated to job id " + job_state.job_id)
        self.__produce_log_file(job_state)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "There is more than one Kubernetes Job associated to job id " + job_state.job_id + "\n")
        self.mark_as_failed(job_state)
        return job_state
def check_watched_item(self, job_state):
    """Check the state of a job already submitted to k8s.

    ``job_state`` is an AsynchronousJobState. The Kubernetes Job is looked up
    by the ``app=<job_id>`` label inside the configured namespace.

    Returns ``None`` when the job succeeded, when the Galaxy job is in the
    DELETED state, or when the Kubernetes job cannot be classified; returns
    whatever ``_handle_job_failure`` returns when the job failed; returns
    ``job_state`` while the job is active, when no job matches, or when more
    than one job matches (the last two are marked as failed first).
    """
    query = Job.objects(self._pykube_api).filter(
        selector="app=" + job_state.job_id,
        namespace=self.runner_params['k8s_namespace'])
    matches = query.response['items']
    match_count = len(matches)

    if match_count == 0:
        # No job answers to this job_id: it is either lost or something happened.
        log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "No Kubernetes Jobs are available under expected selector app=%s\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state

    if match_count != 1:
        # More than one job is associated to the supposedly unique job id used as selector.
        log.error("More than one Kubernetes Job associated to job id '%s'", job_state.job_id)
        with open(job_state.error_file, 'w') as error_file:
            error_file.write(
                "More than one Kubernetes Job associated to job id '%s'\n" % job_state.job_id)
        self.mark_as_failed(job_state)
        return job_state

    job = Job(self._pykube_api, matches[0])
    job_destination = job_state.job_wrapper.job_destination

    # Allowed pod retrials: default 1, overridden by the runner parameters and
    # then by the job destination parameters.
    max_pod_retrials = 1
    if 'k8s_pod_retrials' in self.runner_params:
        max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
    if 'max_pod_retrials' in job_destination.params:
        max_pod_retrials = int(job_destination.params['max_pod_retrials'])

    # Counters missing from the status are treated as zero.
    status = job.obj['status']
    succeeded = status.get('succeeded', 0)
    active = status.get('active', 0)
    failed = status.get('failed', 0)

    # This assumes jobs dependent on a single pod, single container
    if succeeded > 0:
        job_state.running = False
        self.mark_as_finished(job_state)
        return None

    if failed > 0 and self.__job_failed_due_to_low_memory(job_state):
        return self._handle_job_failure(job, job_state, reason="OOM")

    if active > 0 and failed <= max_pod_retrials:
        # Only switch the Galaxy job to RUNNING the first time it is seen active.
        if not job_state.running:
            job_state.running = True
            job_state.job_wrapper.change_state(model.Job.states.RUNNING)
        return job_state

    if failed > max_pod_retrials:
        return self._handle_job_failure(job, job_state)

    if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
        # Job has been deleted via stop_job, cleanup and remove from watched_jobs by returning `None`
        if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
            job_state.job_wrapper.cleanup()
        return None

    # We really shouldn't reach this point, but we might if the job has been killed by the kubernetes admin
    log.info(
        "Kubernetes job '%s' not classified as succ., active or failed. Full Job object: \n%s",
        job.name, job.obj)
    return None