Beispiel #1
0
 def finish_job(self, job_state):
     """Finish the Galaxy job, then remove its k8s Job if the match is unambiguous.

     The parent runner's ``finish_job`` runs first. Kubernetes Jobs matching
     the selector ``app=<job_state.job_id>`` are then looked up in the
     configured namespace, and cleanup happens only when exactly one is found.
     """
     super(KubernetesJobRunner, self).finish_job(job_state)
     query = Job.objects(self._pykube_api).filter(
         selector="app=" + job_state.job_id,
         namespace=self.runner_params['k8s_namespace'])
     matches = query.response['items']
     # Anything other than a single match is left intact; several matches
     # point to a configuration error.
     if len(matches) != 1:
         return
     self.__cleanup_k8s_job(Job(self._pykube_api, matches[0]))
Beispiel #2
0
 def finish_job(self, job_state):
     """Finish the Galaxy job and clean up the Kubernetes Job behind it.

     The parent runner's ``finish_job`` runs first. Kubernetes Jobs matching
     the selector ``app=<job_state.job_id>`` are then looked up in the
     configured namespace:

     * no match: a warning is logged and nothing is cleaned up;
     * several matches: a warning is logged and the first match is cleaned up;
     * one match: it is cleaned up.

     :param job_state: state object of the finished job; its ``job_id`` is
         used to build the selector.
     """
     super(KubernetesJobRunner, self).finish_job(job_state)
     jobs = Job.objects(self._pykube_api).filter(selector="app=" + job_state.job_id,
                                                 namespace=self.runner_params['k8s_namespace'])
     items = jobs.response['items']
     if not items:
         # Without this guard, items[0] below raises IndexError.
         log.warning("No job matches selector. Nothing to clean up"
                     " for job id '%s'", job_state.job_id)
         return
     if len(items) > 1:
         log.warning("More than one job matches selector. Possible configuration error"
                     " in job id '%s'", job_state.job_id)
     # The first match is cleaned up even when several were found.
     job = Job(self._pykube_api, items[0])
     self.__cleanup_k8s_job(job)
Beispiel #3
0
 def stop_job(self, job_wrapper):
     """Try to delete the k8s Job that was dispatched for a Galaxy job.

     The first Job matching the selector built from the Galaxy job's id tag
     is cleaned up. Any exception raised on the way is logged at debug level
     and swallowed.
     """
     job = job_wrapper.get_job()
     try:
         query = Job.objects(self._pykube_api).filter(
             selector="app=" + self.__produce_unique_k8s_job_name(job.get_id_tag()),
             namespace=self.runner_params['k8s_namespace'])
         matches = query.response['items']
         if len(matches) > 0:
             self.__cleanup_k8s_job(Job(self._pykube_api, matches[0]))
         # TODO assert whether job parallelism == 0
         # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
         message = "(%s/%s) Terminated at user's request" % (job.id, job.job_runner_external_id)
         log.debug(message)
     except Exception as e:
         failure = "(%s/%s) User killed running job, but error encountered during termination: %s" % (
             job.id, job.job_runner_external_id, e)
         log.debug(failure)
Beispiel #4
0
 def finish_job(self, job_state):
     """Finish the Galaxy job and clean up the Kubernetes Job behind it.

     The parent runner's ``finish_job`` runs first. The k8s Job is then looked
     up by ``job_state.job_id`` in the configured namespace:

     * no match: a warning is logged and nothing is cleaned up;
     * several matches: a warning is logged and the first match is cleaned up;
     * one match: it is cleaned up.

     :param job_state: state object of the finished job; its ``job_id`` is
         used for the lookup.
     """
     super().finish_job(job_state)
     jobs = find_job_object_by_name(self._pykube_api, job_state.job_id,
                                    self.runner_params['k8s_namespace'])
     items = jobs.response['items']
     if not items:
         # Without this guard, items[0] below raises IndexError.
         log.warning(
             "No job matches selector. Nothing to clean up"
             " for job id '%s'", job_state.job_id)
         return
     if len(items) > 1:
         log.warning(
             "More than one job matches selector. Possible configuration error"
             " in job id '%s'", job_state.job_id)
     # The first match is cleaned up even when several were found.
     job = Job(self._pykube_api, items[0])
     self.__cleanup_k8s_job(job)
Beispiel #5
0
    def queue_job(self, job_wrapper):
        """Write the job script and submit a new Kubernetes Job for it.

        Returns early (``None``) when job preparation fails or when the job
        script cannot be written. Otherwise the k8s Job is created, its name
        is stored as the external id, and the job state is put on the monitor
        queue.
        """
        # include_metadata / include_work_dir_outputs are not needed: the
        # working directory is the same one where galaxy will expect results.
        log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
        ajs = AsynchronousJobState(
            files_dir=job_wrapper.working_directory,
            job_wrapper=job_wrapper,
            job_destination=job_wrapper.job_destination,
        )

        prepared = self.prepare_job(
            job_wrapper,
            include_metadata=False,
            modify_command_for_container=False,
            stdout_file=ajs.output_file,
            stderr_file=ajs.error_file,
        )
        if not prepared:
            return

        job_script = self.get_job_file(
            job_wrapper,
            exit_code_path=ajs.exit_code_file,
            shell=job_wrapper.shell,
            galaxy_virtual_env=None,
        )
        try:
            self.write_executable_script(ajs.job_file, job_script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception("(%s) failure writing job script" % job_wrapper.get_id_tag())
            return

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        prefix = self.__produce_k8s_job_prefix()
        job_spec = self.__get_k8s_job_spec(ajs)
        k8s_job = Job(self._pykube_api, job_object_dict(self.runner_params, prefix, job_spec))
        k8s_job.create()
        external_id = k8s_job.metadata['name']

        # Keep the k8s job name on the job state for follow-up ...
        ajs.job_id = external_id
        # ... and on the job wrapper so it can be tracked if Galaxy restarts.
        job_wrapper.set_external_id(external_id)
        self.monitor_queue.put(ajs)
Beispiel #6
0
 def _active_kubernetes_jobs(self):
     """Sum the ``active`` status counts of k8s Jobs belonging to this instance.

     Only Jobs whose metadata name contains ``self.instance_id`` are counted;
     a Job whose status has no ``active`` entry contributes 0.
     """
     pykube_api = pykube_client_from_dict({})
     # TODO: namespace.
     return sum(
         k8s_job.obj["status"].get("active", 0)
         for k8s_job in Job.objects(pykube_api).filter()
         if self.instance_id in k8s_job.obj["metadata"]["name"]
     )
Beispiel #7
0
 def finish_job(self, job_state):
     """Handle metadata, finish the Galaxy job, then clean up its k8s Job.

     Cleanup (guest ports first, when the job has any, then the Job itself)
     happens only when exactly one k8s Job matches ``job_state.job_id``.
     Zero or several matches are logged as warnings and left untouched.
     """
     self._handle_metadata_externally(job_state.job_wrapper, resolve_requirements=True)
     super().finish_job(job_state)
     found = find_job_object_by_name(
         self._pykube_api, job_state.job_id, self.runner_params['k8s_namespace']
     ).response['items']

     if len(found) > 1:
         log.warning("More than one job matches selector: %s. Possible configuration error"
                     " in job id '%s'" % (found, job_state.job_id))
         return
     if len(found) == 0:
         log.warning("No k8s job found which matches job id '%s'. Ignoring...", job_state.job_id)
         return

     k8s_job = Job(self._pykube_api, found[0])
     if self.__has_guest_ports(job_state.job_wrapper):
         self.__cleanup_k8s_guest_ports(job_state.job_wrapper, k8s_job)
     self.__cleanup_k8s_job(k8s_job)
Beispiel #8
0
    def stop_job(self, job_wrapper):
        """Try to delete the k8s Job that was dispatched for a Galaxy job.

        The Job is looked up by the Galaxy job's runner external id. Guest
        ports are cleaned up first when the job has any. Exceptions raised on
        the way are logged with a traceback and swallowed.
        """
        job = job_wrapper.get_job()
        try:
            query = find_job_object_by_name(
                self._pykube_api,
                job.get_job_runner_external_id(),
                self.runner_params['k8s_namespace'],
            )
            found = query.response['items'] if query else []
            if len(found) > 0:
                k8s_job = Job(self._pykube_api, found[0])
                if self.__has_guest_ports(job_wrapper):
                    self.__cleanup_k8s_guest_ports(job_wrapper, k8s_job)
                self.__cleanup_k8s_job(k8s_job)
            # TODO assert whether job parallelism == 0
            # assert not job_to_delete.exists(), "Could not delete job,"+job.job_runner_external_id+" it still exists"
            log.debug(f"({job.id}/{job.job_runner_external_id}) Terminated at user's request")

        except Exception as e:
            failure = "({}/{}) User killed running job, but error encountered during termination: {}".format(
                job.id, job.get_job_runner_external_id(), e)
            log.exception(failure)
Beispiel #9
0
    def check_watched_item(self, job_state):
        """Check the state of a job already submitted on k8s.

        :param job_state: the ``AsynchronousJobState`` of the watched job; its
            ``job_id`` is used in the ``app=<job_id>`` selector.
        :returns: ``job_state`` when the k8s Job has no status yet, is still
            active, or could not be identified (none or several matches, in
            which case it is also marked as failed); ``None`` when the job
            succeeded or the Galaxy job is in the DELETED state; otherwise
            whatever ``self._handle_job_failure`` returns.
        """
        jobs = Job.objects(self._pykube_api).filter(
            selector="app=" + job_state.job_id,
            namespace=self.runner_params['k8s_namespace'])
        items = jobs.response['items']

        if len(items) == 1:
            job = Job(self._pykube_api, items[0])
            job_destination = job_state.job_wrapper.job_destination

            # The destination parameter wins over the runner-wide parameter;
            # the "retrials" spellings are kept for backward compatibility.
            if 'max_pod_retries' in job_destination.params:
                max_pod_retries = int(job_destination.params['max_pod_retries'])
            elif 'k8s_pod_retries' in self.runner_params:
                max_pod_retries = int(self.runner_params['k8s_pod_retries'])
            elif 'max_pod_retrials' in job_destination.params:
                max_pod_retries = int(job_destination.params['max_pod_retrials'])
            elif 'k8s_pod_retrials' in self.runner_params:
                # Read the same key that was tested; reading
                # 'max_pod_retrials' from runner_params raised KeyError.
                max_pod_retries = int(self.runner_params['k8s_pod_retrials'])
            else:
                max_pod_retries = 1

            # An empty status probably means the k8s API server has not had
            # time to fill it in since the job was created; check again later.
            status = job.obj['status']
            if len(status) == 0:
                return job_state
            succeeded = status.get('succeeded', 0)
            active = status.get('active', 0)
            failed = status.get('failed', 0)

            # This assumes jobs dependent on a single pod, single container
            if succeeded > 0:
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retries:
                if not job_state.running:
                    job_state.running = True
                    job_state.job_wrapper.change_state(model.Job.states.RUNNING)
                return job_state
            elif job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
                # Galaxy job was deleted via stop_job but the k8s Job still
                # exists; remove from watched_jobs by returning `None`.
                if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            else:
                return self._handle_job_failure(job, job_state)

        elif len(items) == 0:
            if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
                # Galaxy job was deleted via stop_job and the k8s Job is gone;
                # clean up and remove from watched_jobs by returning `None`.
                if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            # There is no job responding to this job_id: it is either lost or
            # something happened.
            log.error("No Jobs are available under expected selector app=%s",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "No Kubernetes Jobs are available under expected selector app=%s\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            # NOTE(review): returning job_state presumably keeps a failed job
            # on the watch list - confirm against the caller.
            return job_state
        else:
            # More than one job is associated with the expected unique job id
            # used as selector.
            log.error("More than one Kubernetes Job associated to job id '%s'",
                      job_state.job_id)
            with open(job_state.error_file, 'w') as error_file:
                error_file.write(
                    "More than one Kubernetes Job associated with job id '%s'\n"
                    % job_state.job_id)
            self.mark_as_failed(job_state)
            # NOTE(review): see the note in the branch above about returning
            # job_state after marking it failed.
            return job_state
Beispiel #10
0
    def queue_job(self, job_wrapper):
        """Create job script and submit it to Kubernetes cluster.

        The k8s Job name is produced from the Galaxy job's id tag, so a Job
        with that name may already exist. An existing Job is reused when
        ``self._galaxy_instance_id`` is set, and deleted and re-created
        otherwise.

        :param job_wrapper: wrapper of the Galaxy job to submit.
        :returns: ``None``. Returns early, without putting anything on the
            monitor queue, when job preparation fails, when the job script
            cannot be written, or when an existing untrusted k8s Job is still
            present after ``k8s_timeout_seconds_job_deletion`` seconds.
        """
        # prepare the job
        # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
        # where galaxy will expect results.
        log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_destination=job_wrapper.job_destination)

        if not self.prepare_job(job_wrapper,
                                include_metadata=False,
                                modify_command_for_container=False,
                                stdout_file=ajs.output_file,
                                stderr_file=ajs.error_file):
            return

        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ajs.exit_code_file,
                                   shell=job_wrapper.shell,
                                   galaxy_virtual_env=None)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception("(%s) failure writing job script" %
                          job_wrapper.get_id_tag())
            return

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        k8s_job_name = self.__produce_unique_k8s_job_name(
            job_wrapper.get_id_tag())
        k8s_job_obj = job_object_dict(self.runner_params, k8s_job_name,
                                      self.__get_k8s_job_spec(ajs))

        # Checks if job exists and is trusted, or if it needs re-creation.
        job = Job(self._pykube_api, k8s_job_obj)
        if job.exists() and not self._galaxy_instance_id:
            # if galaxy instance id is not set, then we don't trust matching jobs and we simply delete and
            # re-create the job
            log.debug(
                "Matching job exists, but Job is not trusted, so it will be deleted and a new one created."
            )
            job.delete()
            # Poll every 3 seconds until the job no longer exists, giving up
            # once the configured deletion timeout has been exceeded.
            elapsed_seconds = 0
            while job.exists():
                sleep(3)
                elapsed_seconds += 3
                if elapsed_seconds > self.runner_params[
                        'k8s_timeout_seconds_job_deletion']:
                    log.debug(
                        "Timed out before k8s could delete existing untrusted job "
                        + k8s_job_name +
                        ", not queuing associated Galaxy job.")
                    # NOTE(review): job_wrapper.fail is not called on this
                    # path - confirm that leaving the Galaxy job as-is is intended.
                    return
                log.debug("Waiting for job to be deleted " + k8s_job_name)

            # The old job is gone: create a fresh one from the same object dict.
            Job(self._pykube_api, k8s_job_obj).create()
        elif job.exists() and self._galaxy_instance_id:
            # The job exists and we trust the identifier.
            log.debug(
                "Matching job exists, but Job is trusted, so we simply use the existing one for "
                + k8s_job_name)
            # We simply leave the k8s job to be handled later on by the check watched-items.
        else:
            # Creates the Kubernetes Job if it doesn't exist.
            job.create()

        # define job attributes in the AsyncronousJobState for follow-up
        ajs.job_id = k8s_job_name
        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination(job_wrapper.job_destination,
                                        k8s_job_name)
        self.monitor_queue.put(ajs)
Beispiel #11
0
    def check_watched_item(self, job_state):
        """Check the state of a job already submitted on k8s.

        :param job_state: the ``AsynchronousJobState`` of the watched job; its
            ``job_id`` is used to look up the k8s Job.
        :returns: ``job_state`` while the k8s Job has no status yet or is
            still active; ``None`` when the job succeeded, was stopped or
            deleted, or could not be identified (none or several matches, in
            which case it is also marked as failed); otherwise whatever
            ``self._handle_unschedulable_job`` or ``self._handle_job_failure``
            returns.
        """
        jobs = find_job_object_by_name(self._pykube_api, job_state.job_id,
                                       self.runner_params['k8s_namespace'])
        items = jobs.response['items']

        if len(items) == 1:
            job = Job(self._pykube_api, items[0])
            job_destination = job_state.job_wrapper.job_destination

            # The destination parameter wins over the runner-wide parameter.
            if 'max_pod_retries' in job_destination.params:
                max_pod_retries = int(job_destination.params['max_pod_retries'])
            elif 'k8s_pod_retries' in self.runner_params:
                max_pod_retries = int(self.runner_params['k8s_pod_retries'])
            else:
                max_pod_retries = 1

            # An empty status probably means the k8s API server has not had
            # time to fill it in since the job was created; check again later.
            status = job.obj['status']
            if len(status) == 0:
                return job_state
            succeeded = status.get('succeeded', 0)
            active = status.get('active', 0)
            failed = status.get('failed', 0)

            job_persisted_state = job_state.job_wrapper.get_state()

            # This assumes jobs dependent on a single pod, single container.
            # The persisted Galaxy state is what must be compared to STOPPED;
            # comparing the job_state object itself to a state constant does
            # not test the job's state.
            if succeeded > 0 or job_persisted_state == model.Job.states.STOPPED:
                job_state.running = False
                self.mark_as_finished(job_state)
                return None
            elif active > 0 and failed <= max_pod_retries:
                if not job_state.running:
                    if self.__job_pending_due_to_unschedulable_pod(job_state):
                        # Pod is pending: give up only when a walltime limit
                        # is configured and has been exceeded.
                        walltime_limit = self.runner_params.get(
                            'k8s_unschedulable_walltime_limit')
                        if walltime_limit:
                            creation_time_str = job.obj['metadata'].get(
                                'creationTimestamp')
                            creation_time = datetime.strptime(
                                creation_time_str, '%Y-%m-%dT%H:%M:%SZ')
                            elapsed_seconds = (datetime.utcnow() -
                                               creation_time).total_seconds()
                            if elapsed_seconds > walltime_limit:
                                return self._handle_unschedulable_job(
                                    job, job_state)
                    else:
                        job_state.running = True
                        job_state.job_wrapper.change_state(
                            model.Job.states.RUNNING)
                return job_state
            elif job_persisted_state == model.Job.states.DELETED:
                # Galaxy job was deleted via stop_job but the k8s Job still
                # exists; remove from watched_jobs by returning `None`.
                if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            else:
                return self._handle_job_failure(job, job_state)

        elif len(items) == 0:
            if job_state.job_wrapper.get_job().state == model.Job.states.DELETED:
                # Galaxy job was deleted via stop_job and the k8s Job is gone;
                # clean up and remove from watched_jobs by returning `None`.
                if job_state.job_wrapper.cleanup_job in ("always", "onsuccess"):
                    job_state.job_wrapper.cleanup()
                return None
            # There is no job responding to this job_id: it is either lost or
            # something happened.
            log.error("No Jobs are available under expected selector app=%s",
                      job_state.job_id)
            self.mark_as_failed(job_state)
            # job is no longer viable - remove from watched jobs
            return None
        else:
            # More than one job is associated with the expected unique job id
            # used as selector.
            log.error("More than one Kubernetes Job associated to job id '%s'",
                      job_state.job_id)
            self.mark_as_failed(job_state)
            # job is no longer viable - remove from watched jobs
            return None
Beispiel #12
0
    def queue_job(self, job_wrapper):
        """Create job script and submit it to Kubernetes cluster.

        :param job_wrapper: wrapper of the Galaxy job to submit.
        :returns: ``None``. Returns early, without putting anything on the
            monitor queue, when job preparation fails, when the job script
            cannot be written, when port routing or Job creation raises
            ``HTTPError``, or when the created Job has an empty name. In the
            last three cases the job state is marked as failed.
        """
        # prepare the job
        # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
        # where galaxy will expect results.
        log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_destination=job_wrapper.job_destination)

        if not self.prepare_job(job_wrapper,
                                include_metadata=False,
                                modify_command_for_container=False,
                                stdout_file=ajs.output_file,
                                stderr_file=ajs.error_file):
            return

        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ajs.exit_code_file,
                                   shell=job_wrapper.shell,
                                   galaxy_virtual_env=None)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception(
                f"({job_wrapper.get_id_tag()}) failure writing job script")
            return

        # Construction of Kubernetes objects follow: https://kubernetes.io/docs/concepts/workloads/controllers/job/
        if self.__has_guest_ports(job_wrapper):
            try:
                self.__configure_port_routing(ajs)
            except HTTPError:
                log.exception(
                    "Kubernetes failed to expose tool ports as services, HTTP exception encountered"
                )
                ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
                ajs.fail_message = "Kubernetes failed to export tool ports as services."
                self.mark_as_failed(ajs)
                return

        k8s_job_prefix = self.__produce_k8s_job_prefix()
        k8s_job_obj = job_object_dict(self.runner_params, k8s_job_prefix,
                                      self.__get_k8s_job_spec(ajs))

        job = Job(self._pykube_api, k8s_job_obj)
        # NOTE(review): on the failure paths below, whatever port routing set
        # up above is not undone here - confirm it is cleaned up elsewhere.
        try:
            job.create()
        except HTTPError:
            log.exception(
                "Kubernetes failed to create job, HTTP exception encountered")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        if not job.name:
            # No exception is being handled here, so log.error is used;
            # log.exception is only meant to be called from an except block.
            log.error(
                f"Kubernetes failed to create job, empty name encountered: [{job.obj}]"
            )
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        job_id = job.name

        # define job attributes in the AsyncronousJobState for follow-up
        ajs.job_id = job_id
        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(job_id)
        self.monitor_queue.put(ajs)
Beispiel #13
0
    def queue_job(self, job_wrapper):
        """Create job script and submit it to Kubernetes cluster.

        When the job wrapper declares guest ports, entry points are configured
        and a Service and an Ingress are created before the Job itself.

        :param job_wrapper: wrapper of the Galaxy job to submit.
        :returns: ``None``. Returns early, without putting anything on the
            monitor queue, when job preparation fails, when the job script
            cannot be written, when creating the Service, Ingress or Job
            raises ``HTTPError``, or when the created Job has an empty name.
            In the last three cases the job state is marked as failed.
        """
        # prepare the job
        # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
        # where galaxy will expect results.
        log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_destination=job_wrapper.job_destination)

        if not self.prepare_job(job_wrapper,
                                include_metadata=False,
                                modify_command_for_container=False,
                                stdout_file=ajs.output_file,
                                stderr_file=ajs.error_file):
            return

        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ajs.exit_code_file,
                                   shell=job_wrapper.shell,
                                   galaxy_virtual_env=None)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception(
                f"({job_wrapper.get_id_tag()}) failure writing job script")
            return

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        k8s_job_prefix = self.__produce_k8s_job_prefix()
        guest_ports = ajs.job_wrapper.guest_ports
        eps = None
        # The port mapping is built only after the truthiness check, so an
        # empty or None guest_ports is never iterated.
        if guest_ports:
            ports_dict = {
                str(guest_port): dict(host='manual',
                                      port=guest_port,
                                      protocol="https")
                for guest_port in guest_ports
            }
            k8s_job_name = self.__get_k8s_job_name(k8s_job_prefix,
                                                   ajs.job_wrapper)
            # NOTE(review): ajs.job_id is not assigned in this method until
            # after the Job is created - confirm what this message prints.
            log.debug(
                f'Configuring entry points and deploying service/ingress for job with ID {ajs.job_id}'
            )
            k8s_service_obj = service_object_dict(
                self.runner_params, k8s_job_name,
                self.__get_k8s_service_spec(ajs))
            eps = self.app.interactivetool_manager.configure_entry_points(
                ajs.job_wrapper.get_job(), ports_dict)
            k8s_ingress_obj = ingress_object_dict(
                self.runner_params, k8s_job_name,
                self.__get_k8s_ingress_spec(ajs, eps))
            # Handle API failures here the same way as for Job creation
            # below, instead of letting HTTPError escape from queue_job.
            try:
                service = Service(self._pykube_api, k8s_service_obj)
                service.create()
                ingress = Ingress(self._pykube_api, k8s_ingress_obj)
                ingress.create()
            except HTTPError:
                log.exception(
                    "Kubernetes failed to expose tool ports as services, HTTP exception encountered"
                )
                ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
                ajs.fail_message = "Kubernetes failed to export tool ports as services."
                self.mark_as_failed(ajs)
                return
        k8s_job_obj = job_object_dict(self.runner_params, k8s_job_prefix,
                                      self.__get_k8s_job_spec(ajs, eps))
        job = Job(self._pykube_api, k8s_job_obj)
        try:
            job.create()
        except HTTPError:
            log.exception(
                "Kubernetes failed to create job, HTTP exception encountered")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        if not job.name:
            # No exception is being handled here, so log.error is used;
            # log.exception is only meant to be called from an except block.
            log.error(
                f"Kubernetes failed to create job, empty name encountered: [{job.obj}]"
            )
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        job_id = job.name

        # define job attributes in the AsyncronousJobState for follow-up
        ajs.job_id = job_id
        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(job_id)
        self.monitor_queue.put(ajs)