Example #1
0
    def queue_job(self, job_wrapper):
        """Build the Kubernetes Job for a Galaxy job, submit it and start monitoring it.

        :param job_wrapper: wrapper of the Galaxy job to dispatch.
        :returns: always ``None`` -- either early (job preparation failed, or an
            untrusted pre-existing k8s job could not be deleted before the
            configured timeout) or after the job state was put on the monitor queue.
        """
        # include_metadata / include_work_dir_outputs are not required here, because the
        # working directory is the very place where Galaxy expects to find the results.
        log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
        prepared = self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False)
        if not prepared:
            return

        job_destination = job_wrapper.job_destination

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
        api_version = self.runner_params['k8s_job_api_version']
        # metadata.name is the name of the pod resource created, and must be unique
        # http://kubernetes.io/docs/user-guide/configuring-containers/
        k8s_metadata = {
            "name": k8s_job_name,
            "namespace": self.runner_params['k8s_namespace'],
            "labels": {"app": k8s_job_name},
        }
        k8s_job_obj = {
            "apiVersion": api_version,
            "kind": "Job",
            "metadata": k8s_metadata,
            "spec": self.__get_k8s_job_spec(job_wrapper),
        }

        # Three cases: an untrusted match (delete + re-create), a trusted match (reuse), no match (create).
        job = Job(self._pykube_api, k8s_job_obj)
        if job.exists() and not self._galaxy_instance_id:
            # Without a galaxy instance id a matching job cannot be trusted: drop it and start over.
            log.debug("Matching job exists, but Job is not trusted, so it will be deleted and a new one created.")
            job.delete()
            poll_seconds = 3
            waited_seconds = 0
            while job.exists():
                sleep(poll_seconds)
                waited_seconds += poll_seconds
                if waited_seconds > self.runner_params['k8s_timeout_seconds_job_deletion']:
                    log.debug("Timed out before k8s could delete existing untrusted job " + k8s_job_name +
                              ", not queuing associated Galaxy job.")
                    return
                log.debug("Waiting for job to be deleted " + k8s_job_name)

            replacement = Job(self._pykube_api, k8s_job_obj)
            replacement.create()
        elif job.exists() and self._galaxy_instance_id:
            # Trusted identifier: the existing k8s job is left alone and will be
            # picked up later by the watched-items check.
            log.debug("Matching job exists, but Job is trusted, so we simply use the existing one for " + k8s_job_name)
        else:
            # Nothing there yet, so the Kubernetes Job gets created.
            job.create()

        # Register the job attributes in an AsynchronousJobState for follow-up
        self.monitor_queue.put(
            AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                 job_wrapper=job_wrapper,
                                 job_id=k8s_job_name,
                                 job_destination=job_destination))

        # The external run-job script may be None, in which case it is not used.
        return None
Example #2
0
    def queue_job(self, job_wrapper):
        """Build the Kubernetes Job for a Galaxy job, (re)create it and start monitoring it.

        Any already existing k8s job with the same name is deleted before the
        new one is created.

        :param job_wrapper: wrapper of the Galaxy job to dispatch.
        :returns: always ``None`` (early when job preparation fails).
        """
        # include_metadata / include_work_dir_outputs are not required here, because the
        # working directory is the very place where Galaxy expects to find the results.
        log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
        prepared = self.prepare_job(job_wrapper, include_metadata=False, modify_command_for_container=False)
        if not prepared:
            return

        job_destination = job_wrapper.job_destination

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
        # metadata.name is the name of the pod resource created, and must be unique
        # http://kubernetes.io/docs/user-guide/configuring-containers/
        k8s_metadata = {
            "name": k8s_job_name,
            "namespace": "default",  # TODO this should be set
            "labels": {"app": k8s_job_name},
        }
        k8s_job_obj = {
            "apiVersion": "extensions/v1beta1",
            "kind": "Job",
            "metadata": k8s_metadata,
            "spec": self.__get_k8s_job_spec(job_wrapper),
        }

        # Remove a pre-existing job of the same name, if any
        existing = Job(self._pykube_api, k8s_job_obj)
        if existing.exists():
            existing.delete()
        # Creates the Kubernetes Job
        # TODO if a job with that ID exists, what should we do?
        # TODO do we trust that this is the same job and use that?
        # TODO or create a new job as we cannot make sure
        fresh = Job(self._pykube_api, k8s_job_obj)
        fresh.create()

        # Register the job attributes in an AsynchronousJobState for follow-up
        self.monitor_queue.put(
            AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                 job_wrapper=job_wrapper,
                                 job_id=k8s_job_name,
                                 job_destination=job_destination))

        # The external run-job script may be None, in which case it is not used.
        return None
Example #3
0
 def run(self):
     """Render the Kubernetes Job definition, submit it and track it."""
     self._init_kubernetes()

     # Render job, piece by piece
     job_name = self.uu_name
     job_labels = {
         "spawned_by": "luigi",
         "luigi_task_id": self.job_uuid,
     }
     pod_spec = self.spec_schema
     pod_template = {
         "metadata": {"name": job_name},
         "spec": pod_spec,
     }
     job_json = {
         "apiVersion": "batch/v1",
         "kind": "Job",
         "metadata": {"name": job_name, "labels": job_labels},
         "spec": {"template": pod_template},
     }

     # Merge the user supplied labels into the job labels
     job_labels.update(self.labels)

     # Default restartPolicy, applied only when the schema does not set one
     if "restartPolicy" not in self.spec_schema:
         pod_spec["restartPolicy"] = "Never"

     # Submit job
     self.__logger.info("Submitting Kubernetes Job: " + job_name)
     Job(self.__kube_api, job_json).create()

     # Track the Job (wait while active)
     self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
     self.__track_job()
Example #4
0
    def check_watched_item(self, job_state):
        """Poll Kubernetes for the job behind ``job_state`` (an AsynchronousJobState).

        :returns: ``None`` once the job succeeded (after marking it finished);
            ``job_state`` while it is still active, when it cannot be classified,
            or after it was marked as failed because zero or several k8s jobs
            matched; otherwise the result of ``_handle_job_failure``.
        """
        jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id,
                                                    namespace=self.runner_params['k8s_namespace'])
        items = jobs.response['items']
        matches = len(items)

        if matches == 0:
            # Nothing answers to this job_id: the job is lost or something happened to it.
            log.error("No Jobs are available under expected selector app=" + job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("No Kubernetes Jobs are available under expected selector app=" + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state

        if matches != 1:
            # The selector is expected to be unique, yet several jobs carry it.
            log.error("There is more than one Kubernetes Job associated to job id " + job_state.job_id)
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write("There is more than one Kubernetes Job associated to job id " + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state

        # Exactly one matching job from here on.
        job = Job(self._pykube_api, items[0])
        job_destination = job_state.job_wrapper.job_destination

        # Retrial limit: default, overridden by the runner, overridden by the destination.
        max_pod_retrials = 1
        if 'k8s_pod_retrials' in self.runner_params:
            max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
        if 'max_pod_retrials' in job_destination.params:
            max_pod_retrials = int(job_destination.params['max_pod_retrials'])

        # Counters missing from the status are treated as zero.
        status = job.obj['status']
        succeeded = status.get('succeeded', 0)
        active = status.get('active', 0)
        failed = status.get('failed', 0)

        # This assumes jobs dependent on a single pod, single container
        if succeeded > 0:
            self.__produce_log_file(job_state)
            # Opening in 'w' mode leaves an empty error file behind.
            with open(job_state.error_file, 'w'):
                pass
            job_state.running = False
            self.mark_as_finished(job_state)
            return None

        if failed > 0 and self.__job_failed_due_to_low_memory(job_state):
            return self._handle_job_failure(job, job_state, reason="OOM")

        if active > 0 and failed <= max_pod_retrials:
            job_state.running = True
            return job_state

        if failed > max_pod_retrials:
            return self._handle_job_failure(job, job_state)

        # We should not get here
        log.debug(
            "Reaching unexpected point for Kubernetes job name %s where it is not classified as succ., active nor failed.", job.name)
        log.debug("k8s full job object:\n%s", job.obj)
        return job_state
Example #5
0
    def run(self):
        """Run the task locally when ``runlocal`` is set, otherwise as a Kubernetes Job."""
        if self.runlocal:
            Task.run(self)
            return

        # KubernetesJobTask.run()
        self._init_kubernetes()

        # Render job
        job_json = self._add_env_variables(self._create_job_json())

        deadline = self.active_deadline_seconds
        if deadline is not None:
            job_json['spec']['activeDeadlineSeconds'] = deadline

        # Merge the user supplied labels into the job labels
        job_json['metadata']['labels'].update(self.labels)

        # Default restartPolicy, applied only when the schema does not set one
        if "restartPolicy" not in self.spec_schema:
            pod_spec = job_json["spec"]["template"]["spec"]
            pod_spec["restartPolicy"] = "Never"

        # Submit job
        self.__logger.info("Submitting Kubernetes Job: " + self.uu_name)
        Job(self.__kube_api, job_json).create()

        # Track the Job (wait while active)
        self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
        self.__track_job()
Example #6
0
    def __get_job(self):
        """Return the single Kubernetes Job labelled with this task's ``job_uuid``.

        The lookup is restricted to ``self.job_namespace`` and uses the
        ``luigi_task_id`` label as selector.

        :returns: a ``Job`` built from the only matching item.
        :raises AssertionError: when the selector matches zero or several jobs.
        """
        jobs = Job.objects(self.__kube_api) \
            .filter(namespace=self.job_namespace, selector="luigi_task_id=" + self.job_uuid) \
            .response['items']

        # Kept as an assertion so the exception type seen by callers is unchanged.
        # The message reports the match count, since the check also fails when
        # more than one job matches.
        assert len(jobs) == 1, (
            "Kubernetes job %s/%s not found or not unique (job_uuid=%s, matches=%d)"
            % (self.job_namespace, self.uu_name, self.job_uuid, len(jobs)))
        return Job(self.__kube_api, jobs[0])
Example #7
0
 def test_fail_job(self):
     """A failing job raises RuntimeError and leaves one k8s job with more failures than max_retrials."""
     fail = FailJob()
     self.assertRaises(RuntimeError, fail.run)

     # Check for retrials
     kube_config = KubeConfig.from_file("~/.kube/config")  # assumes minikube
     kube_api = HTTPClient(kube_config)
     matching = Job.objects(kube_api).filter(selector="luigi_task_id=" + fail.job_uuid)
     items = matching.response["items"]
     self.assertEqual(len(items), 1)

     status = Job(kube_api, items[0]).obj["status"]
     self.assertTrue("failed" in status)
     self.assertTrue(status["failed"] > fail.max_retrials)
Example #8
0
 def stop_job(self, job_wrapper):
     """Attempts to delete a dispatched job to the k8s cluster

     The k8s job is looked up through its ``app=<unique job name>`` label and,
     when found, scaled down to zero replicas. This is best effort: any error
     raised on the way is logged and swallowed.

     :param job_wrapper: wrapper of the Galaxy job the user asked to stop.
     """
     job = job_wrapper.get_job()
     try:
         k8s_job_name = self.__produce_unique_k8s_job_name(job.get_id_tag())
         jobs = Job.objects(self._pykube_api).filter(selector="app=" + k8s_job_name)
         items = jobs.response['items']
         # Only touch the first item when there is one; an unconditional items[0]
         # raises IndexError on an empty result.
         if len(items) > 0:
             job_to_delete = Job(self._pykube_api, items[0])
             job_to_delete.scale(replicas=0)
             # TODO assert whether job parallelism == 0
             # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
             log.debug("(%s/%s) Terminated at user's request", job.id, job.job_runner_external_id)
         else:
             log.debug("(%s/%s) User killed job, but no Kubernetes job matches selector app=%s",
                       job.id, job.job_runner_external_id, k8s_job_name)
     except Exception as e:
         # Deliberately broad: stopping must never propagate an error to the caller.
         log.debug("(%s/%s) User killed running job, but error encountered during termination: %s",
                   job.id, job.job_runner_external_id, e)
Example #9
0
    def run(self):
        """Render the Kubernetes Job definition, submit it and track it.

        Pod logs are printed before an AssertionError raised while tracking is
        re-raised.
        """
        self._init_kubernetes()

        # Render job, piece by piece
        job_metadata = {
            "name": self.uu_name,
            "labels": {
                "spawned_by": "luigi",
                "luigi_task_id": self.job_uuid,
            },
        }
        job_spec = {"backoffLimit": self.backoff_limit}
        pod_spec = self.spec_schema
        pod_metadata = {"name": self.uu_name, "labels": {}}
        job_spec["template"] = {"metadata": pod_metadata, "spec": pod_spec}
        job_json = {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": job_metadata,
            "spec": job_spec,
        }

        # Optional settings are only written when configured
        namespace = self.kubernetes_namespace
        if namespace is not None:
            job_metadata['namespace'] = namespace
        deadline = self.active_deadline_seconds
        if deadline is not None:
            job_spec['activeDeadlineSeconds'] = deadline

        # User labels go on both the job and the pod template
        job_metadata['labels'].update(self.labels)
        pod_metadata['labels'].update(self.labels)

        # Default restartPolicy, applied only when the schema does not set one
        if "restartPolicy" not in self.spec_schema:
            pod_spec["restartPolicy"] = "Never"

        # Submit job
        self.__logger.info("Submitting Kubernetes Job: " + self.uu_name)
        Job(self.__kube_api, job_json).create()

        # Track the Job (wait while active)
        self.__logger.info("Start tracking Kubernetes Job: " + self.uu_name)
        try:
            self.__track_job()
        except AssertionError:
            self.__print_pod_logs()
            raise
Example #10
0
 def __get_job_status(self):
     """Report the Kubernetes job status as "succeeded", "failed" or "running".

     The job is scaled down to zero replicas when it is reported as succeeded
     or as failed. A RuntimeError is raised when no job carries this task's
     ``luigi_task_id`` label.
     """
     # Look for the required job
     selector = "luigi_task_id=" + self.job_uuid
     items = Job.objects(self.__kube_api).filter(selector=selector).response["items"]
     if len(items) == 0:
         raise RuntimeError("Kubernetes job " + self.uu_name + " not found")

     # Only the first match is inspected
     job = Job(self.__kube_api, items[0])
     status = job.obj["status"]

     if "succeeded" in status and status["succeeded"] > 0:
         job.scale(replicas=0)  # Downscale the job, but keep it for logging
         return "succeeded"

     if "failed" in status:
         failed_cnt = status["failed"]
         self.__logger.debug("Kubernetes job " + self.uu_name +
                             " status.failed: " + str(failed_cnt))
         if failed_cnt > self.max_retrials:
             job.scale(replicas=0)  # avoid more retrials
             return "failed"

     return "running"
Example #11
0
    def check_watched_item(self, job_state):
        """Checks the state of a job already submitted on k8s. Job state is a AsynchronousJobState

        The k8s job is looked up through its ``app=<job_id>`` label.

        :param job_state: the AsynchronousJobState being watched.
        :returns: ``None`` once the job succeeded or exceeded its allowed pod
            retrials (it has been marked as finished / failed respectively);
            ``job_state`` while the job is still active, when it cannot be
            classified, or after it was marked as failed because zero or
            several k8s jobs matched the selector.
        """
        jobs = Job.objects(self._pykube_api).filter(selector="app=" +
                                                    job_state.job_id)
        if len(jobs.response['items']) == 1:
            job = Job(self._pykube_api, jobs.response['items'][0])
            job_destination = job_state.job_wrapper.job_destination

            # Retrial limit: default, overridden by the runner, overridden by the destination.
            max_pod_retrials = 1
            if 'k8s_pod_retrials' in self.runner_params:
                max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
            if 'max_pod_retrials' in job_destination.params:
                max_pod_retrials = int(
                    job_destination.params['max_pod_retrials'])

            # Counters missing from the status are treated as zero.
            status = job.obj['status']
            succeeded = status.get('succeeded', 0)
            active = status.get('active', 0)
            failed = status.get('failed', 0)

            # This assumes jobs dependent on a single pod, single container
            if succeeded > 0:
                self.__produce_log_file(job_state)
                # Opening in 'w' mode leaves an empty error file; the context
                # manager closes the handle even if something goes wrong.
                with open(job_state.error_file, 'w'):
                    pass
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retrials:
                job_state.running = True
                return job_state
            elif failed > max_pod_retrials:
                self.__produce_log_file(job_state)
                with open(job_state.error_file, 'w') as error_file:
                    error_file.write(
                        "Exceeded max number of Kubernetes pod retrials allowed for job\n"
                    )
                job_state.running = False
                job_state.fail_message = "More pods failed than allowed. See stdout for pods details."
                self.mark_as_failed(job_state)
                # Scale down to zero replicas once the job is marked as failed.
                job.scale(replicas=0)
                return None
            # We should not get here
            log.debug(
                "Reaching unexpected point for Kubernetes job, where it is not classified as succ., active nor failed."
            )
            return job_state

        elif len(jobs.response['items']) == 0:
            # there is no job responding to this job_id, it is either lost or something happened.
            log.error("No Jobs are available under expected selector app=" +
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app="
                    + job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
        else:
            # there is more than one job associated to the expected unique job id used as selector.
            log.error(
                "There is more than one Kubernetes Job associated to job id " +
                job_state.job_id)
            self.__produce_log_file(job_state)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "There is more than one Kubernetes Job associated to job id " +
                    job_state.job_id + "\n")
            self.mark_as_failed(job_state)
            return job_state
Example #12
0
    def check_watched_item(self, job_state):
        """Poll Kubernetes for the job behind ``job_state`` (an AsynchronousJobState).

        :returns: ``job_state`` while the job is active, or after it was marked
            as failed because zero or several k8s jobs matched the selector;
            the result of ``_handle_job_failure`` for failed jobs; ``None`` when
            the job succeeded, was deleted on the Galaxy side, or could not be
            classified at all.
        """
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + job_state.job_id,
            namespace=self.runner_params['k8s_namespace'])
        items = jobs.response['items']
        matches = len(items)

        if matches == 0:
            # Nothing answers to this job_id: the job is lost or something happened to it.
            log.error("No Jobs are available under expected selector app=%s",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app=%s\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state

        if matches != 1:
            # The selector is expected to be unique, yet several jobs carry it.
            log.error("More than one Kubernetes Job associated to job id '%s'",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "More than one Kubernetes Job associated to job id '%s'\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            return job_state

        # Exactly one matching job from here on.
        job = Job(self._pykube_api, items[0])
        job_destination = job_state.job_wrapper.job_destination

        # Retrial limit: default, overridden by the runner, overridden by the destination.
        max_pod_retrials = 1
        if 'k8s_pod_retrials' in self.runner_params:
            max_pod_retrials = int(self.runner_params['k8s_pod_retrials'])
        if 'max_pod_retrials' in job_destination.params:
            max_pod_retrials = int(job_destination.params['max_pod_retrials'])

        # Counters missing from the status are treated as zero.
        status = job.obj['status']
        succeeded = status.get('succeeded', 0)
        active = status.get('active', 0)
        failed = status.get('failed', 0)

        # This assumes jobs dependent on a single pod, single container
        if succeeded > 0:
            job_state.running = False
            self.mark_as_finished(job_state)
            return None

        if failed > 0 and self.__job_failed_due_to_low_memory(job_state):
            return self._handle_job_failure(job, job_state, reason="OOM")

        if active > 0 and failed <= max_pod_retrials:
            # The state change is only issued on the first transition to running.
            if not job_state.running:
                job_state.running = True
                job_state.job_wrapper.change_state(model.Job.states.RUNNING)
            return job_state

        if failed > max_pod_retrials:
            return self._handle_job_failure(job, job_state)

        if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
            # Job has been deleted via stop_job, cleanup and remove from watched_jobs by returning `None`
            if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_state.job_wrapper.cleanup()
            return None

        # We really shouldn't reach this point, but we might if the job has been killed by the kubernetes admin
        log.info(
            "Kubernetes job '%s' not classified as succ., active or failed. Full Job object: \n%s",
            job.name, job.obj)
        # NOTE(review): nothing is returned for the unclassified case, i.e. None -- confirm that is intended.
        return None