Example #1
 def test_transform_ujs_err(self):
     code = 1000
     message = "some error message"
     name = "UJSError"
     ujs_err = UJSServerError(name, code, message)
     nar_err = transform_job_exception(ujs_err)
     self.assertEqual(nar_err.code, code)
     self.assertEqual(nar_err.message, message)
     self.assertEqual(nar_err.name, name)
     self.assertEqual(nar_err.source, 'ujs')
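These tests pin down the contract of transform_job_exception without showing its body. A minimal sketch consistent with them, assuming the UJSServerError/EEServerError classes imported by the tests and a NarrativeException result type (names and structure here are illustrative, not the actual KBase source):

    class NarrativeException(Exception):
        # Hedged sketch only -- inferred from the tests on this page.
        def __init__(self, code, message, name, source, error=None):
            super().__init__(message)
            self.code = code        # numeric error code
            self.message = message  # human-readable message
            self.name = name        # error class name, e.g. "UJSError"
            self.source = source    # "ujs", "ee2", "network", ...
            self.error = error      # optional context string

    def transform_job_exception(e, error=None):
        # Server errors already carry code/message/name attributes;
        # tag them with the service they came from.
        if isinstance(e, UJSServerError):
            return NarrativeException(e.code, e.message, e.name, "ujs", error)
        if isinstance(e, EEServerError):
            return NarrativeException(e.code, e.message, e.name, "ee2", error)
        ...  # HTTPError handling is sketched after the network tests below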
Example #2
 def test_transform_ee2_err(self):
     code = 1000
     message = ERROR_MSG
     name = "EEError"
     ee2_err = EEServerError(name, code, message)
     nar_err = transform_job_exception(ee2_err)
     self.assertEqual(nar_err.code, code)
     self.assertEqual(nar_err.message, message)
     self.assertEqual(nar_err.name, name)
     self.assertEqual(nar_err.source, "ee2")
     self.assertIsNone(nar_err.error)
Example #3
 def test_transform_http_err_internal(self):
     code = 500
     message = "An internal error occurred in the KBase service."
     name = "HTTPError"
     res = requests.Response()
     res.status_code = code
     err = HTTPError("http error", response=res)
     nar_err = transform_job_exception(err)
     self.assertEqual(nar_err.code, code)
     self.assertEqual(nar_err.message, message)
     self.assertEqual(nar_err.name, name)
     self.assertEqual(nar_err.source, 'network')
Example #4
 def test_transform_http_err_unknown(self):
     code = 666
     message = "An untracked error occurred."
     name = "HTTPError"
     res = requests.Response()
     res.status_code = code
     err = HTTPError("http error", response=res)
     nar_err = transform_job_exception(err)
     self.assertEqual(nar_err.code, code)
     self.assertEqual(nar_err.message, message)
     self.assertEqual(nar_err.name, name)
     self.assertEqual(nar_err.source, 'network')
Example #5
    def cancel_job(self, job_id, parent_job_id=None):
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        Raises an exception if the current user doesn't have permission to cancel the job.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if not parent_job_id and job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {
                'job_id': job_id,
                'source': 'cancel_job'
            })
            return

        try:
            state = self._get_job_state(job_id, parent_job_id=parent_job_id)
            if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise ValueError('Unable to get Job state') from e

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        if not parent_job_id:
            is_refreshing = self._running_jobs[job_id].get('refresh', 0)
            self._running_jobs[job_id]['refresh'] = 0
            self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('job_service').cancel_job({'job_id': job_id})
        except Exception as e:
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to cancel job',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'request_type': 'cancel_job',
                'job_id': job_id
            }
            self._send_comm_message('job_comm_error', error)
            raise e
        finally:
            if not parent_job_id:
                self._running_jobs[job_id]['refresh'] = is_refreshing
                del self._running_jobs[job_id]['canceling']

        # Rather than a separate message, how about triggering a job-status message:
        self._lookup_job_status(job_id, parent_job_id=parent_job_id)
Example #6
    def retry_jobs(self, job_id_list: List[str]) -> dict:
        """
        Returns
        [
            {
                "job_id": job_id,
                "job": {"state": {"job_id": job_id, "status": status, ...} ...},
                "retry_id": retry_id,
                "retry": {"state": {"job_id": retry_id, "status": status, ...} ...}
            },
            {
                "job": {"state": {"job_id": job_id, "status": status, ...} ...},
                "error": "..."
            }
            ...
            {
                "job": {"state": {"job_id": job_id, "status": DOES_NOT_EXIST}},
                "error": f"Cannot find job with ID {job_id}",
            }
        ]
        where the innermost dictionaries are raw job states from ee2, wrapped inside
        the job states produced by job.output_state()
        """
        job_ids, error_ids = self._check_job_list(job_id_list)
        try:
            retry_results = clients.get("execution_engine2").retry_jobs(
                {"job_ids": job_ids}
            )
        except Exception as e:
            raise transform_job_exception(e, "Unable to retry job(s)")
        # for each retry result, refresh the state of the retried and new jobs
        orig_ids = [result["job_id"] for result in retry_results]
        retry_ids = [
            result["retry_id"] for result in retry_results if "retry_id" in result
        ]
        orig_states = self._construct_job_output_state_set(orig_ids)
        retry_states = self._construct_job_output_state_set(
            retry_ids, self._create_jobs(retry_ids)  # add to self._running_jobs index
        )
        job_states = {**orig_states, **retry_states}

        results_by_job_id = {}
        # fill in the job state details
        for result in retry_results:
            job_id = result["job_id"]
            results_by_job_id[job_id] = {"job_id": job_id, "job": job_states[job_id]}
            if "retry_id" in result:
                retry_id = result["retry_id"]
                results_by_job_id[job_id]["retry_id"] = retry_id
                results_by_job_id[job_id]["retry"] = job_states[retry_id]
            if "error" in result:
                results_by_job_id[job_id]["error"] = result["error"]
        return self.add_errors_to_results(results_by_job_id, error_ids)
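A hypothetical caller, assuming `jm` is a JobManager instance and that add_errors_to_results returns the job_id-keyed mapping built above:

    results = jm.retry_jobs(["job_id_1", "job_id_2"])
    for job_id, entry in results.items():
        if "error" in entry:
            print(f"{job_id}: retry failed: {entry['error']}")
        elif "retry_id" in entry:
            print(f"{job_id}: retried as {entry['retry_id']}")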
Example #7
 def test_transform_ee2_err__with_error(self):
     code = 1000
     message = ERROR_MSG
     name = "EEError"
     error = "Unable to perform some request"
     ee2_err = EEServerError(name, code, message)
     nar_err = transform_job_exception(ee2_err, error)
     self.assertEqual(nar_err.code, code)
     self.assertEqual(nar_err.message, message)
     self.assertEqual(nar_err.name, name)
     self.assertEqual(nar_err.source, "ee2")
     self.assertEqual(nar_err.error, error)
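The optional second argument surfaces as nar_err.error; the retry_jobs code in Example #6 above uses it to attach a short context string when re-raising the transformed error:

    try:
        retry_results = clients.get("execution_engine2").retry_jobs({"job_ids": job_ids})
    except Exception as e:
        raise transform_job_exception(e, "Unable to retry job(s)")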
Example #8
 def test_transform_http_err_unavailable(self):
     codes = [404, 502, 503]
     message = "A KBase service is currently unavailable."
     name = "HTTPError"
     for c in codes:
         res = requests.Response()
         res.status_code = c
         err = HTTPError("http error", response=res)
         nar_err = transform_job_exception(err)
         self.assertEqual(nar_err.code, c)
         self.assertEqual(nar_err.message, message)
         self.assertEqual(nar_err.name, name)
         self.assertEqual(nar_err.source, 'network')
Example #9
 def test_transform_http_err_timeout(self):
     codes = [504, 598, 599]
     message = "There was a temporary network connection error."
     name = "HTTPError"
     for c in codes:
         res = requests.Response()
         res.status_code = c
         err = HTTPError("http error", response=res)
         nar_err = transform_job_exception(err)
         self.assertEqual(nar_err.code, c)
         self.assertEqual(nar_err.message, message)
         self.assertEqual(nar_err.name, name)
         self.assertEqual(nar_err.source, 'network')
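Taken together, Examples #3, #4, #8, and #9 pin down how HTTP status codes map to user-facing messages. A sketch of that branch inferred from the tests alone, reusing the hypothetical NarrativeException from earlier (the real implementation may match on ranges rather than these exact code lists):

    def _transform_http_error(err):
        # Hedged sketch -- mapping inferred from the tests above, not the KBase source.
        code = err.response.status_code
        if code == 500:
            message = "An internal error occurred in the KBase service."
        elif code in (404, 502, 503):
            message = "A KBase service is currently unavailable."
        elif code in (504, 598, 599):
            message = "There was a temporary network connection error."
        else:
            message = "An untracked error occurred."
        return NarrativeException(code, message, "HTTPError", "network")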
Example #10
    def cancel_job(self, job_id: str, parent_job_id: str = None) -> None:
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        If the job_id is None or not found in this Narrative, a ValueError is raised.
        This method then checks whether the job is already canceled/finished,
        and if not, attempts to cancel it.
        If either of those steps fails, a NarrativeException is raised.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if not parent_job_id and job_id not in self._running_jobs:
            raise ValueError(f"No job present with id {job_id}")

        try:
            cancel_status = clients.get("execution_engine2").check_job_canceled(
                {"job_id": job_id}
            )
            if cancel_status.get("finished", 0) == 1 or cancel_status.get("canceled", 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise transform_job_exception(e)

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        if not parent_job_id:
            is_refreshing = self._running_jobs[job_id].get('refresh', 0)
            self._running_jobs[job_id]['refresh'] = 0
            self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('execution_engine2').cancel_job({'job_id': job_id})
        except Exception as e:
            raise transform_job_exception(e)
        finally:
            if not parent_job_id:
                self._running_jobs[job_id]['refresh'] = is_refreshing
                del self._running_jobs[job_id]['canceling']
Example #11
    def cancel_job(self, job_id):
        """
        Cancels a running job, placing it in a canceled state.
        Does NOT delete the job.
        Raises an exception if the current user doesn't have permission to cancel the job.
        """

        if job_id is None:
            raise ValueError('Job id required for cancellation!')
        if job_id not in self._running_jobs:
            self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'cancel_job'})
            return

        try:
            job = self.get_job(job_id)
            state = job.state()
            if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
                # It's already finished, don't try to cancel it again.
                return
        except Exception as e:
            raise ValueError('Unable to get Job state') from e

        # Stop updating the job status while we try to cancel.
        # Also, set it to have a special state of 'canceling' while we're doing the cancel
        is_refreshing = self._running_jobs[job_id].get('refresh', False)
        self._running_jobs[job_id]['refresh'] = False
        self._running_jobs[job_id]['canceling'] = True
        try:
            clients.get('job_service').cancel_job({'job_id': job_id})
        except Exception as e:
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to cancel job',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'request_type': 'cancel_job',
                'job_id': job_id
            }
            self._send_comm_message('job_comm_error', error)
            raise e
        finally:
            self._running_jobs[job_id]['refresh'] = is_refreshing
            del self._running_jobs[job_id]['canceling']

        #
        # self._send_comm_message('job_canceled', {'job_id': job_id})
        # Rather than a separate message, how about triggering a job-status message:
        self._lookup_job_status(job_id)
Example #12
 def _cancel_job(self, job_id: str) -> Optional[Exception]:
     # Stop updating the job status while we try to cancel.
     # Set the job to a special state of 'canceling' while we're doing the cancel
     is_refreshing = self._running_jobs[job_id].get("refresh", False)
     self._running_jobs[job_id]["refresh"] = False
     self._running_jobs[job_id]["canceling"] = True
     error = None
     try:
         clients.get("execution_engine2").cancel_job({"job_id": job_id})
     except Exception as e:
         error = transform_job_exception(e, "Unable to cancel job")
     self._running_jobs[job_id]["refresh"] = is_refreshing
     del self._running_jobs[job_id]["canceling"]
     return error
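Note that, unlike the cancel_job variants above, this helper returns the transformed error rather than raising it, which lets a caller aggregate failures over many jobs. A hypothetical caller under that assumption:

    errors = {}
    for job_id in job_ids_to_cancel:  # hypothetical list of job ids
        err = self._cancel_job(job_id)  # None on success
        if err is not None:
            errors[job_id] = err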
Example #13
    def initialize_jobs(self):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
        4. start the status lookup loop.
        """
        ws_id = system_variable("workspace_id")
        job_states = dict()
        kblogging.log_event(self._log, "JobManager.initialize_jobs",
                            {"ws_id": ws_id})
        try:
            job_states = clients.get("execution_engine2").check_workspace_jobs(
                {
                    "workspace_id": ws_id,
                    "return_list": 0
                })
            self._running_jobs = dict()
        except Exception as e:
            kblogging.log_event(self._log, "init_error", {"err": str(e)})
            new_e = transform_job_exception(e)
            raise new_e

        for job_id, job_state in job_states.items():
            job_input = job_state.get("job_input", {})
            job_meta = job_input.get("narrative_cell_info", {})
            status = job_state.get("status")
            job = Job.from_state(
                job_id,
                job_input,
                job_state.get("user"),
                app_id=job_input.get("app_id"),
                tag=job_meta.get("tag", "release"),
                cell_id=job_meta.get("cell_id", None),
                run_id=job_meta.get("run_id", None),
                token_id=job_meta.get("token_id", None),
                meta=job_meta,
            )
            self._running_jobs[job_id] = {
                "refresh": 1 if status not in ["completed", "errored", "terminated"] else 0,
                "job": job,
            }
Example #14
    def get_job_logs(
        self,
        job_id: str,
        parent_job_id: str = None,
        first_line: int = 0,
        num_lines: int = None,
        latest_only: bool = False,
    ) -> tuple:
        """
        Raises a ValueError if the job_id doesn't exist or is not present.
        :param job_id: str - the job id from the execution engine
        :param parent_job_id: if the job is a child job, this is its parent (optional)
        :param first_line: int - the first line to be requested by the log. 0-indexed. If < 0,
            this will be set to 0
        :param num_lines: int - the maximum number of lines to return.
            if < 0, will be reset to 0.
            if None, then will not be considered, and just return all the lines.
        :param latest_only: bool - if True, will only return the most recent max_lines
            of logs. This overrides the first_line parameter if set to True. So if the call made
            is get_job_logs(id, first_line=0, num_lines=5, latest_only=True), and there are 100
            log lines available, then lines 96-100 will be returned.
        :returns: 3-tuple. elements in order:
            int - the first line returned
            int - the number of log lines currently available for that job
            list - the lines themselves, fresh from the server. These are all tiny dicts
                with key "is_error" (either 0 or 1) and "line" - the log line string

        """
        job = self.get_job(job_id)

        if first_line < 0:
            first_line = 0
        if num_lines is not None and num_lines < 0:
            num_lines = 0

        try:
            if latest_only:
                (max_lines, logs) = job.log()
                if num_lines is not None and max_lines > num_lines:
                    first_line = max_lines - num_lines
                    logs = logs[first_line:]
            else:
                (max_lines, logs) = job.log(first_line=first_line,
                                            num_lines=num_lines)

            return (first_line, max_lines, logs)
        except Exception as e:
            raise transform_job_exception(e)
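A hypothetical call fetching only the five most recent log lines, using the return shape described in the docstring (`jm` is a JobManager instance):

    first, total, lines = jm.get_job_logs("some_job_id", num_lines=5, latest_only=True)
    print(f"showing lines {first}..{total - 1} of {total}")
    for entry in lines:
        stream = "stderr" if entry["is_error"] else "stdout"
        print(f"[{stream}] {entry['line']}")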
Example #15
    def initialize_jobs(self, cell_ids: List[str] = None) -> None:
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. gets the current workspace ID from app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from ee2 (also gets tag, cell_id, run_id)
        3. initialize the Job objects and add them to the running jobs list
        4. start the status lookup loop.
        """
        ws_id = system_variable("workspace_id")
        job_states = dict()
        kblogging.log_event(self._log, "JobManager.initialize_jobs", {"ws_id": ws_id})
        try:
            job_states = clients.get("execution_engine2").check_workspace_jobs(
                {
                    "workspace_id": ws_id,
                    "return_list": 0,  # do not remove
                    "exclude_fields": JOB_INIT_EXCLUDED_JOB_STATE_FIELDS,
                }
            )
        except Exception as e:
            kblogging.log_event(self._log, "init_error", {"err": str(e)})
            new_e = transform_job_exception(e, "Unable to initialize jobs")
            raise new_e

        self._running_jobs = dict()
        job_states = self._reorder_parents_children(job_states)
        for job_state in job_states.values():
            child_jobs = None
            if job_state.get("batch_job"):
                child_jobs = [
                    self.get_job(child_id)
                    for child_id in job_state.get("child_jobs", [])
                ]

            job = Job(job_state, children=child_jobs)

            # Set to refresh when job is not in terminal state
            # and when job is present in cells (if given)
            # and when it is not part of a batch
            refresh = not job.was_terminal() and not job.batch_id
            if cell_ids is not None:
                refresh = refresh and job.in_cells(cell_ids)

            self.register_new_job(job, refresh)
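A hypothetical startup call (the cell ids are illustrative): every workspace job is registered, but only non-terminal, non-batch jobs in the listed cells are marked for refresh:

    jm = JobManager()
    jm.initialize_jobs(cell_ids=["cell-uuid-1", "cell-uuid-2"])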
Example #16
    def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id, **kwargs):
        """
        Attempts to run the app, returns a Job with the running app info.
        Should *hopefully* also inject that app into the Narrative's metadata.
        Probably need some kind of JavaScript-foo to get that to work.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - the dictionary of parameters.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules have
                  versions, so if the tag is not 'release', and a version is given,
                  a ValueError will be raised.
        **kwargs - these are the set of parameters to be used with the app.
                   They can be found by using the app_usage function. If any
                   non-optional parameters are missing, a ValueError will be raised.

        Example:
        --------
        my_job = mm.run_app('MegaHit/run_megahit', version=">=1.0.0", read_library_name="My_PE_Library", output_contigset_name="My_Contig_Assembly")
        """

        ### TODO: this needs restructuring so that we can send back validation failure
        ### messages. Perhaps a separate function and catch the errors, or return an
        ### error structure.

        # Intro tests:
        self.spec_manager.check_app(app_id, tag, raise_exception=True)

        if version is not None and tag != "release":
            if re.match(r'\d+\.\d+\.\d+', version) is not None:
                raise ValueError("Semantic versions only apply to released app modules. You can use a Git commit hash instead to specify a version.")

        # Get the spec & params
        spec = self.spec_manager.get_spec(app_id, tag)

        # There's some branching to do here.
        # Cases:
        # app has behavior.kb_service_input_mapping -- is a valid long-running app.
        # app only has behavior.output_mapping - not kb_service_input_mapping or script_module - it's a viewer and should return immediately
        # app has other things besides kb_service_input_mapping -- not a valid app.
        if 'behavior' not in spec:
            raise Exception("This app appears invalid - it has no defined behavior")

        if 'kb_service_input_mapping' not in spec['behavior']:
            raise Exception("This app does not appear to be a long-running job! Please use 'run_local_app' to start this instead.")

        # Preflight check the params - all required ones are present, all values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        (params, ws_input_refs) = self._validate_parameters(app_id, tag, spec_params, params)

        ws_id = system_variable('workspace_id')
        if ws_id is None:
            raise ValueError('Unable to retrieve current Narrative workspace information!')

        input_vals = self._map_inputs(spec['behavior']['kb_service_input_mapping'], params)

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)
        service_url = spec['behavior']['kb_service_url']


        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + '.' + service_method
        job_meta = {'tag': tag}
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # This is the input set for NJSW.run_job. Now we need the workspace id and whatever fits in the metadata.
        job_runner_inputs = {
            'method': function_name,
            'service_ver': service_ver,
            'params': input_vals,
            'app_id': app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        if len(ws_input_refs) > 0:
            job_runner_inputs['source_ws_objects'] = ws_input_refs

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = self.njs.run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      app_id,
                      [params],
                      system_variable('user_id'),
                      tag=tag,
                      app_version=service_ver,
                      cell_id=cell_id,
                      run_id=run_id)

        self._send_comm_message('run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
        JobManager().register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job
Example #17
    def initialize_jobs(self, start_lookup_thread=True):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, and naturally linked to a workspace.
        So it does the following steps.
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
        4. start the status lookup loop.
        """

        the_time = int(round(time.time() * 1000))

        self._send_comm_message('start', {'time': the_time})

        ws_id = system_variable('workspace_id')
        try:
            nar_jobs = clients.get('user_and_job_state').list_jobs2({
                'authstrat': 'kbaseworkspace',
                'authparams': [str(ws_id)]
            })
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get initial jobs list',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'user_and_job_state'
            }
            self._send_comm_message('job_init_err', error)
            raise new_e

        job_ids = [j[0] for j in nar_jobs]
        job_states = clients.get('job_service').check_jobs({
            'job_ids': job_ids, 'with_job_params': 1
        })
        job_param_info = job_states.get('job_params', {})
        job_check_error = job_states.get('check_error', {})
        error_jobs = dict()
        for info in nar_jobs:
            job_id = info[0]
            user_info = info[1]
            job_meta = info[10]
            try:
                if job_id in job_param_info:
                    job_info = job_param_info[job_id]

                    job = Job.from_state(job_id,
                                         job_info,
                                         user_info[0],
                                         app_id=job_info.get('app_id'),
                                         tag=job_meta.get('tag', 'release'),
                                         cell_id=job_meta.get('cell_id', None),
                                         run_id=job_meta.get('run_id', None),
                                         token_id=job_meta.get('token_id', None),
                                         meta=job_meta)

                    # Note that when jobs for this narrative are initially loaded,
                    # they are set to not be refreshed. Rather, if a client requests
                    # updates via the start_job_update message, the refresh flag will
                    # be set to True.
                    self._running_jobs[job_id] = {
                        'refresh': 0,
                        'job': job
                    }
                elif job_id in job_check_error:
                    job_err_state = {
                        'job_state': 'error',
                        'error': {
                            'error': 'KBase execution engine returned an error while looking up this job.',
                            'message': job_check_error[job_id].get('message', 'No error message available'),
                            'name': 'Job Error',
                            'code': job_check_error[job_id].get('code', -999),
                            'exception': {
                                'error_message': 'Job lookup in execution engine failed',
                                'error_type': job_check_error[job_id].get('name', 'unknown'),
                                'error_stacktrace': job_check_error[job_id].get('error', '')
                            }
                        },
                        'cell_id': job_meta.get('cell_id', None),
                        'run_id': job_meta.get('run_id', None),
                    }
                    error_jobs[job_id] = job_err_state

            except Exception as e:
                kblogging.log_event(self._log, 'init_error', {'err': str(e)})
                new_e = transform_job_exception(e)
                error = {
                    'error': 'Unable to get job info on initial lookup',
                    'job_id': job_id,
                    'message': getattr(new_e, 'message', 'Unknown reason'),
                    'code': getattr(new_e, 'code', -1),
                    'source': getattr(new_e, 'source', 'jobmanager'),
                    'name': getattr(new_e, 'name', type(e).__name__),
                    'service': 'job_service'
                }
                self._send_comm_message('job_init_lookup_err', error)
                raise new_e  # should crash and burn on any of these.

        if len(job_check_error):
            err_str = 'Unable to find info for some jobs on initial lookup'
            err_type = 'job_init_partial_err'
            if len(job_check_error) == len(nar_jobs):
                err_str = 'Unable to get info for any job on initial lookup'
                err_type = 'job_init_lookup_err'
            error = {
                'error': err_str,
                'job_errors': error_jobs,
                'message': 'Job information was unavailable from the server',
                'code': -2,
                'source': 'jobmanager',
                'name': 'jobmanager',
                'service': 'job_service',
            }
            self._send_comm_message(err_type, error)

        if not self._running_lookup_loop and start_lookup_thread:
            # only keep one loop at a time in case this gets called again!
            if self._lookup_timer is not None:
                self._lookup_timer.cancel()
            self._running_lookup_loop = True
            self._lookup_job_status_loop()
        else:
            self._lookup_all_job_status()
Example #18
    def _construct_job_status(self, job, state):
        """
        Creates a Job status dictionary with structure:
        {
            owner: string (username),
            spec: app_spec (from NMS, via biokbase.narrative.jobs.specmanager)
            widget_info: (if not finished, None, else...) job.get_viewer_params result
            state: {
                job_state: string,
                error (if present): dict of error info,
                cell_id: string/None,
                run_id: string/None,
                awe_job_id: string/None,
                canceled: 0/1
                creation_time: epoch second
                exec_start_time: epoch/none,
                finish_time: epoch/none,
                finished: 0/1,
                job_id: string,
                status: (from UJS) [
                    timestamp(last_update, string),
                    stage (string),
                    status (string),
                    progress (string/None),
                    est_complete (string/None),
                    complete (0/1),
                    error (0/1)
                ],
                ujs_url: string
            }
        }
        """
        widget_info = None
        app_spec = {}

        if job is None:
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                    'message': 'Job does not exist',
                    'name': 'Job Error',
                    'code': -1,
                    'exception': {
                        'error_message': 'job not found in JobManager',
                        'error_type': 'ValueError',
                        'error_stacktrace': ''
                    }
                },
                'cell_id': None,
                'run_id': None,
            }
            return {
                'state': state,
                'app_spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        # try:
        #     app_spec = job.app_spec()
        # except Exception as e:
        #     kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

        if state is None:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': 'Unable to get job state for job {}'.format(job.job_id)})

            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                    'message': 'Unable to return job state',
                    'name': 'Job Error',
                    'code': -1,
                    'source': 'JobManager._construct_job_status',
                    'exception': {
                        'error_message': 'No state provided during lookup',
                        'error_type': 'null-state',
                        'error_stacktrace': '',
                    }
                },
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job.job_id
            }

        elif 'lookup_error' in state:
            kblogging.log_event(self._log, "lookup_job_status.error", {
                'err': 'Problem while getting state for job {}'.format(job.job_id),
                'info': str(state['lookup_error'])
            })
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to fetch current state. Please try again later, or contact KBase.',
                    'message': 'Error while looking up job state',
                    'name': 'Job Error',
                    'code': -1,
                    'source': 'JobManager._construct_job_status',
                    'exception': {
                        'error_message': 'Error while fetching job state',
                        'error_type': 'failed-lookup',
                    },
                    'error_response': state['lookup_error'],
                },
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job.job_id
            }
        if state.get('finished', 0) == 1:
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
                state['job_state'] = 'error'
                state['error'] = {
                    'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                    'message': 'Unable to build output viewer parameters!',
                    'name': 'App Error',
                    'code': getattr(new_e, "code", -1),
                    'source': getattr(new_e, "source", "JobManager")
                }

        if 'canceling' in self._running_jobs[job.job_id]:
            state['job_state'] = 'canceling'

        state.update({
            'child_jobs': self._child_job_states(
                state.get('sub_jobs', []),
                job.meta.get('batch_app'),
                job.meta.get('batch_tag')
            )
        })
        if 'batch_size' in job.meta:
            state.update({'batch_size': job.meta['batch_size']})
        return {'state': state,
                'spec': app_spec,
                'widget_info': widget_info,
                'owner': job.owner,
                'listener_count': self._running_jobs[job.job_id]['refresh']}
Example #19
    def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id,
                          dry_run):
        """
        Attempts to run the app, returns a Job with the running app info.
        Should *hopefully* also inject that app into the Narrative's metadata.
        Probably need some kind of JavaScript-foo to get that to work.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - a dictionary of parameters.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.
        cell_id - optional, the id of the Narrative cell from which this app
                  was run.
        run_id - optional, a unique id for this particular run of the app.
        dry_run - if True, returns the set of inputs that would be sent to
                  the execution engine instead of starting the job.
        """
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        spec_params_map = {p['id']: p for p in spec_params}
        ws_input_refs = extract_ws_refs(app_id, tag, spec_params, params)
        input_vals = self._map_inputs(
            spec['behavior']['kb_service_input_mapping'], params,
            spec_params_map)

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + '.' + service_method
        job_meta = {'tag': tag}
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': function_name,
            'service_ver': service_ver,
            'params': input_vals,
            'app_id': app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        if len(ws_input_refs) > 0:
            job_runner_inputs['source_ws_objects'] = ws_input_refs
        if dry_run:
            return job_runner_inputs

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(),
                                           token_name=token_name)
        job_runner_inputs['meta']['token_id'] = agent_token['id']

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2",
                token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      app_id,
                      input_vals,
                      system_variable('user_id'),
                      tag=tag,
                      app_version=service_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'])

        self._send_comm_message(
            'run_status', {
                'event': 'launched_job',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'job_id': job_id
            })
        self.register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job
Example #20
    def _run_app_batch_internal(self, app_id, params, tag, version, cell_id,
                                run_id, dry_run):
        batch_method = "kb_BatchApp.run_batch"
        batch_app_id = "kb_BatchApp/run_batch"
        batch_method_ver = "dev"
        batch_method_tag = "dev"
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        # A list of lists of UPAs, used for each subjob.
        batch_ws_upas = list()
        # The list of actual input values, post-mapping.
        batch_run_inputs = list()

        for param_set in params:
            spec_params_map = {p['id']: p for p in spec_params}
            batch_ws_upas.append(
                extract_ws_refs(app_id, tag, spec_params, param_set))
            batch_run_inputs.append(
                self._map_inputs(spec['behavior']['kb_service_input_mapping'],
                                 param_set, spec_params_map))

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        job_meta = {
            'tag': batch_method_tag,
            'batch_app': app_id,
            'batch_tag': tag,
            'batch_size': len(params),
        }
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # Now put these all together in a way that can be sent to the batch processing app.
        batch_params = [{
            "module_name": service_name,
            "method_name": service_method,
            "service_ver": service_ver,
            "wsid": ws_id,
            "meta": job_meta,
            "batch_params": [{
                "params": batch_run_inputs[i],
                "source_ws_objects": batch_ws_upas[i]
            } for i in range(len(batch_run_inputs))],
        }]

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(),
                                           token_name=token_name)

        job_meta['token_id'] = agent_token['id']
        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': batch_method,
            'service_ver': batch_method_ver,
            'params': batch_params,
            'app_id': batch_app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        # if len(ws_input_refs) > 0:
        #     job_runner_inputs['source_ws_objects'] = ws_input_refs

        # if we're doing a dry run, just return the inputs that we made.
        if dry_run:
            return job_runner_inputs

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': batch_method_tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_batch_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2",
                token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_batch_app_error", log_info)
            raise transform_job_exception(e)

        new_job = Job(job_id,
                      batch_app_id,
                      batch_params,
                      system_variable('user_id'),
                      tag=batch_method_tag,
                      app_version=batch_method_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'],
                      meta=job_meta)

        self._send_comm_message(
            'run_status', {
                'event': 'launched_job',
                'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
                'cell_id': cell_id,
                'run_id': run_id,
                'job_id': job_id
            })
        self.register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job
Example #21
    def _construct_job_status(self, job: Job, state: dict) -> dict:
        """
        Creates a Job status dictionary with structure:
        {
            owner: string (username, who started the job),
            spec: app spec (optional)
            widget_info: (if not finished, None, else...) job.get_viewer_params result
            state: {
                job_id: string,
                status: string,
                created: epoch ms,
                updated: epoch ms,
                queued: optional - epoch ms,
                finished: optional - epoch ms,
                terminated_code: optional - int,
                tag: string (release, beta, dev),
                parent_job_id: optional - string or null,
                run_id: string,
                cell_id: string,
                errormsg: optional - string,
                error (optional): {
                    code: int,
                    name: string,
                    message: string (should be for the user to read),
                    error: string, (likely a stacktrace)
                },
                error_code: optional - int
            }
        }
        :param job: a Job object
        :param state: dict, expected to be in the format that comes straight from the
            Execution Engine 2 service
        """
        widget_info = None
        app_spec = {}

        # If there's no job, but the state is valid, then that (likely) means the job was started
        # by either running AppManager.run_app directly without cell_id or run_id info, or that
        # it was started outside of the biokbase.narrative.jobs setup. This could be done through
        # direct calls to EE2.
        #
        # This could also be triggered by manually looking up job state for some job that doesn't
        # exist in the Narrative. Which is borderline, but still probably ok.
        if job is None and state is not None:
            state.update({
                "cell_id": None,
                "run_id": None,
            })
            return {
                'state': state,
                'app_spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        if state is None:
            kblogging.log_event(self._log, "lookup_job_status.error", {
                'err':
                'Unable to get job state for job {}'.format(job.job_id)
            })
            state = self._create_error_state(
                "Unable to find current job state. Please try again later, or contact KBase.",
                "Unable to return job state",
                -1,
                cell_id=job.cell_id,
                run_id=job.run_id,
                job_id=job.job_id)

        if state.get('finished'):
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error",
                                    {'err': str(e)})
                state.update({
                    "status": "error",
                    "errormsg": "Unable to build output viewer parameters!",
                    "error": {
                        "code": getattr(new_e, "code", -1),
                        "source": getattr(new_e, "source", "JobManager"),
                        "name": "App Error",
                        "message": "Unable to build output viewer parameters",
                        "error": "Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance."
                    }
                })

        state.update({
            "child_jobs": self._child_job_states(
                state.get("sub_jobs", []),
                job.meta.get("batch_app"),
                job.meta.get("batch_tag"),
            ),
            "run_id": job.run_id,
            "cell_id": job.cell_id,
        })
        if "batch_size" in job.meta:
            state.update({"batch_size": job.meta["batch_size"]})
        return {
            "state": state,
            "spec": app_spec,
            "widget_info": widget_info,
            "owner": job.owner,
            "listener_count": self._running_jobs[job.job_id]["refresh"]
        }
Example #22
    def run_app_batch(
        self,
        app_info: list,
        cell_id: str = None,
        run_id: str = None,
        dry_run: bool = False,
    ) -> Union[dict, None]:
        """
        Attempts to run a batch of apps in bulk using the Execution Engine's run_job_batch endpoint.
        If a cell_id is provided, this sends various job messages over the comm channel, and returns None.
        If dry_run is True, this returns the structure that would be sent to EE2.run_job_batch

        Parameters:
        -----------
        app_info: this is a list of app information dictionaries. It's broken down such that a single app
            can have multiple sets of parameters, which could create multiple runs of that app.
            Each dictionary is expected to have the following keys:
            app_id: the id of the app to run
            tag: the app tag to run, one of release, beta, or dev
            version: (optional) the version to run; if not provided, the most recent version
                for that tag is used
            shared_params: (optional) any params to be shared by all runs of the app
            params: a list of at least one dictionary. Each dict contains the set of parameters to run the
                app once.
        cell_id: if provided, this should be a unique id for the Narrative cell that's running the app.
        run_id: if provided, this should be a unique id representing a Narrative cell's knowledge of
            that job.
        dry_run: if True, this won't start the jobs, but will return the structure
            that would be sent to the KBase execution engine.

        Example:
        --------
        run_app_batch([{
            "app_id": "Some_module/reads_to_contigset",
            "tag": "release",
            "version": "1.0.0",
            "shared_params": {
                "filter_len": 500
            },
            "params": [
                {
                    "read_library_name" : "My_PE_Library",
                    "output_contigset_name" : "My_Contig_Assembly"
                }, {
                    "read_library_name": "Another_reads_library",
                    "output_contigset_name": "Another_contig_assembly"
                }
            ]
        }, {
            "app_id": "Some_module/contigset_to_genome",
            "tag": "release",
            "version": "1.1.0",
            "shared_params": {
                "filter_len": 1000,
                "taxon_id": 121212
            },
            "params": [
                {
                    "contigset": "My_contigset",
                    "genome_name": "My_genome"
                }
            ]
        }])
        """

        if not isinstance(app_info, list) or len(app_info) == 0:
            raise ValueError(
                "app_info must be a list with at least one set of app information"
            )
        batch_run_inputs = []
        ws_id = strict_system_variable("workspace_id")
        batch_params = {"wsid": ws_id}  # for EE2.run_job_batch
        log_app_info = []
        for info in app_info:
            self._validate_bulk_app_info(info)
            self._reconstitute_shared_params(info)
            app_id = info["app_id"]
            tag = info.get("tag", "release")
            version = info.get("version")
            spec = self._get_validated_app_spec(app_id, tag, True, version)
            for param_set in info["params"]:
                # will raise a ValueError if anything is wrong or missing
                # otherwise, will build a set of inputs for EE2.run_job
                batch_run_inputs.append(
                    self._build_run_job_params(
                        spec,
                        tag,
                        param_set,
                        version=version,
                        cell_id=cell_id,
                        run_id=run_id,
                    )
                )
            log_app_info.append(
                {
                    "app_id": app_id,
                    "tag": tag,
                    "version": version,
                    "num_jobs": len(batch_run_inputs),
                }
            )
        log_info = {
            "app_info": log_app_info,
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_app_batch", log_info)

        # if we're doing a dry run, stop here and return the setup
        if dry_run:
            return {"batch_run_params": batch_run_inputs, "batch_params": batch_params}

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(
            f"KBase_app_batch_{len(batch_run_inputs)}_apps"
        )

        # add the token id to the meta for all jobs
        for job_input in batch_run_inputs:
            job_input["meta"]["token_id"] = agent_token["id"]

        # run the job batch and get a batch_submission record
        try:
            batch_submission = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job_batch(batch_run_inputs, batch_params)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_job_bulk_error", log_info)
            raise transform_job_exception(e) from e

        batch_id = batch_submission["batch_id"]
        child_ids = batch_submission["child_job_ids"]

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job_batch",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "batch_id": batch_id,
                "child_job_ids": child_ids,
            },
        )

        child_jobs = Job.from_job_ids(child_ids, return_list=True)
        parent_job = Job.from_job_id(
            batch_id,
            children=child_jobs,
        )

        # TODO make a tighter design in the job manager for submitting a family of jobs
        for new_job in child_jobs:
            JobManager().register_new_job(new_job, refresh=False)
        JobManager().register_new_job(parent_job, refresh=False)

        if cell_id is None:
            return {"parent_job": parent_job, "child_jobs": child_jobs}
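A quick way to inspect what would be sent to EE2 is a dry run. A minimal usage sketch, assuming an AppManager-like instance named `am` (the instance name and app details are illustrative):

    # Hypothetical usage: preview the EE2.run_job_batch payload without
    # launching anything. `am` is an assumed AppManager instance.
    preview = am.run_app_batch(
        [{
            "app_id": "Some_module/reads_to_contigset",
            "tag": "release",
            "params": [{
                "read_library_name": "My_PE_Library",
                "output_contigset_name": "My_Contig_Assembly",
            }],
        }],
        dry_run=True,
    )
    print(len(preview["batch_run_params"]))  # one entry per parameter set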
Example No. 29
    def run_app(
        self,
        app_id,
        params,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
        dry_run=False,
    ):
        """
        Attempts to run the app and returns a Job with the running app info.
        If a cell_id is given, this returns None; otherwise it returns the
        generated Job object.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - the dictionary of parameters to be used with the app.
                 They can be found by using the app_usage function. If any
                 non-optional parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.

        Example:
        --------
        run_app('MegaHit/run_megahit',
                {
                    'read_library_name' : 'My_PE_Library',
                    'output_contigset_name' : 'My_Contig_Assembly'
                },
                version='>=1.0.0'
        )
        """
        if params is None:
            params = {}
        ws_id = strict_system_variable("workspace_id")
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        job_runner_inputs = self._build_run_job_params(
            spec, tag, params, version, cell_id, run_id, ws_id
        )

        if dry_run:
            return job_runner_inputs

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(app_id)
        job_runner_inputs["meta"]["token_id"] = agent_token["id"]

        # Log that we're trying to run a job...
        log_info = {
            "app_id": app_id,
            "tag": tag,
            "version": job_runner_inputs["service_ver"],
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job.from_job_id(job_id)

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "job_id": job_id,
            },
        )
        JobManager().register_new_job(new_job, refresh=False)
        if cell_id is None:
            return new_job
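The same dry-run pattern applies here. A minimal sketch, assuming an AppManager-like instance named `am` (the instance name and inputs are illustrative):

    # Hypothetical usage: a dry run returns the EE2.run_job input dict
    # instead of launching the app.
    inputs = am.run_app(
        "MegaHit/run_megahit",
        {"read_library_name": "My_PE_Library",
         "output_contigset_name": "My_Contig_Assembly"},
        tag="release",
        dry_run=True,
    )
    print(sorted(inputs))  # inspect the keys of the run_job input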
Example No. 30
    def run_legacy_batch_app(
        self,
        app_id,
        params,
        tag="release",
        version=None,
        cell_id=None,
        run_id=None,
        dry_run=False,
    ):
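        """
        Runs a single app over a list of parameter sets via the batch wrapper
        app defined by BATCH_APP, one child run per parameter set.
        Returns the EE2.run_job input when dry_run is True, and the new Job
        when no cell_id is given.
        """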
        if params is None:
            params = []
        ws_id = strict_system_variable("workspace_id")
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        # A list of lists of UPAs, used for each subjob.
        batch_ws_upas = []
        # The list of actual input values, post-mapping.
        batch_run_inputs = []

        # The spec params map is the same for every param set, so build it once.
        spec_params_map = {p["id"]: p for p in spec_params}
        for param_set in params:
            batch_ws_upas.append(extract_ws_refs(app_id, tag, spec_params, param_set))
            batch_run_inputs.append(
                self._map_inputs(
                    spec["behavior"]["kb_service_input_mapping"],
                    param_set,
                    spec_params_map,
                )
            )

        service_method = spec["behavior"]["kb_service_method"]
        service_name = spec["behavior"]["kb_service_name"]
        service_ver = spec["behavior"].get("kb_service_version", None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        job_meta = {
            "tag": BATCH_APP["TAG"],
            "batch_app": app_id,
            "batch_tag": tag,
            "batch_size": len(params),
        }
        if cell_id is not None:
            job_meta["cell_id"] = cell_id
        if run_id is not None:
            job_meta["run_id"] = run_id

        # Now put these all together in a way that can be sent to the batch processing app.
        batch_params = [
            {
                "module_name": service_name,
                "method_name": service_method,
                "service_ver": service_ver,
                "wsid": ws_id,
                "meta": job_meta,
                "batch_params": [
                    {
                        "params": batch_run_inputs[i],
                        "source_ws_objects": batch_ws_upas[i],
                    }
                    for i in range(len(batch_run_inputs))
                ],
            }
        ]

        # We're now almost ready to run the job. Last, we need an agent token.
        agent_token = self._get_agent_token(app_id)
        job_meta["token_id"] = agent_token["id"]

        # This is the input set for ee2.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            "app_id": BATCH_APP["APP_ID"],
            "meta": job_meta,
            "method": BATCH_APP["METHOD"],
            "params": batch_params,
            "service_ver": BATCH_APP["VERSION"],
            "wsid": ws_id,
        }

        # if we're doing a dry run, just return the inputs that we made.
        if dry_run:
            return job_runner_inputs

        # Log that we're trying to run a job...
        log_info = {
            "app_id": app_id,
            "tag": BATCH_APP["TAG"],
            "version": service_ver,
            "username": system_variable("user_id"),
            "wsid": ws_id,
        }
        kblogging.log_event(self._log, "run_batch_app", log_info)

        try:
            job_id = clients.get(
                "execution_engine2", token=agent_token["token"]
            ).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({"err": str(e)})
            kblogging.log_event(self._log, "run_batch_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job.from_job_id(
            job_id,
            extra_data={
                # this data is not preserved in the ee2 record
                "batch_app": app_id,
                "batch_tag": tag,
                "batch_size": len(params),
            },
        )

        self._send_comm_message(
            MESSAGE_TYPE["RUN_STATUS"],
            {
                "event": "launched_job",
                "event_at": timestamp(),
                "cell_id": cell_id,
                "run_id": run_id,
                "job_id": job_id,
            },
        )
        JobManager().register_new_job(new_job, refresh=False)
        if cell_id is None:
            return new_job
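A minimal dry-run sketch for this method as well, assuming an AppManager-like instance named `am` (names and params are illustrative):

    # Hypothetical usage: inspect the EE2.run_job input for the batch
    # wrapper app without launching it.
    inputs = am.run_legacy_batch_app(
        "Some_module/reads_to_contigset",
        [{"read_library_name": "lib_1"}, {"read_library_name": "lib_2"}],
        dry_run=True,
    )
    assert inputs["meta"]["batch_size"] == 2  # one subjob per param set
    print(inputs["app_id"])  # the BATCH_APP wrapper id, not the app itself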
Example No. 31
    def output_state(self, state=None) -> dict:
        """
        :param state: job state, if already queried in bulk from ee2 upstream;
            if omitted, it is fetched individually from ee2/cache via self.state()
        :return: dict, with structure

        {
            outputWidgetInfo: (if not finished, None, else...) job.get_viewer_params result
            jobState: {
                job_id: string,
                status: string,
                created: epoch ms,
                updated: epoch ms,
                queued: optional - epoch ms,
                finished: optional - epoch ms,
                terminated_code: optional - int,
                tag: string (release, beta, dev),
                parent_job_id: optional - string or null,
                run_id: string,
                cell_id: string,
                errormsg: optional - string,
                error: optional - {
                    code: int,
                    name: string,
                    message: string (should be for the user to read),
                    error: string, (likely a stacktrace)
                },
                error_code: optional - int
            }
        }
        """
        if not state:
            state = self.state()
        else:
            self._update_state(state)
            state = self._internal_state()

        if state is None:
            return self._create_error_state(
                "Unable to find current job state. Please try again later, or contact KBase.",
                "Unable to return job state",
                -1,
            )

        self._trim_ee2_state(state, OUTPUT_STATE_EXCLUDED_JOB_STATE_FIELDS)
        if "job_output" not in state:
            state["job_output"] = {}
        for arg in EXTRA_JOB_STATE_FIELDS:
            state[arg] = getattr(self, arg)

        widget_info = None
        if state.get("finished"):
            try:
                widget_info = self.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                widget_info = {
                    "status": "error",
                    "errormsg": "Unable to build output viewer parameters!",
                    "error": {
                        "code": getattr(new_e, "code", -1),
                        "source": getattr(new_e, "source", "JobManager"),
                        "name": "App Error",
                        "message": "Unable to build output viewer parameters",
                        "error": (
                            "Unable to generate App output viewer!\n"
                            "The App appears to have completed successfully,\n"
                            "but we cannot construct its output viewer.\n"
                            "Please contact https://kbase.us/support for assistance."
                        ),
                    },
                }
                # update timestamp if there was an error
                state.update({"updated": int(time.time())})

        return {
            "job_id": self.job_id,
            "jobState": state,
            "outputWidgetInfo": widget_info,
        }
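For reference, the returned dict can be consumed like this minimal sketch (values are illustrative placeholders):

    out = {
        "job_id": "job_1234",
        "jobState": {"status": "completed", "job_output": {}},
        "outputWidgetInfo": None,
    }
    if out["jobState"].get("status") == "error":
        print(out["jobState"].get("errormsg", "unknown error"))
    elif out["outputWidgetInfo"] is None:
        print("no viewer info: job unfinished or viewer params unavailable")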
Example No. 32
    def _construct_job_status(self, job_id):
        """
        Always creates a job status structure.
        Any errors encountered are embedded in the status rather than raised.
        """

        state = {}
        widget_info = None
        app_spec = {}

        job = self.get_job(job_id)
        if job is None:
            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                    'message': 'Job does not exist',
                    'name': 'Job Error',
                    'code': -1,
                    'exception': {
                        'error_message': 'job not found in JobManager',
                        'error_type': 'ValueError',
                        'error_stacktrace': ''
                    }
                },
                'cell_id': None,
                'run_id': None
            }
            return {
                'state': state,
                'app_spec': app_spec,
                'widget_info': widget_info,
                'owner': None
            }

        try:
            app_spec = job.app_spec()
        except Exception as e:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

        try:
            state = job.state()
        except Exception as e:
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

            new_e = transform_job_exception(e)
            e_type = type(e).__name__
            e_message = str(new_e).replace('<', '&lt;').replace('>', '&gt;')
            e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
            e_code = getattr(new_e, "code", -2)
            e_source = getattr(new_e, "source", "JobManager")

            state = {
                'job_state': 'error',
                'error': {
                    'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                    'message': 'Unable to return job state',
                    'name': 'Job Error',
                    'code': e_code,
                    'source': e_source,
                    'exception': {
                        'error_message': e_message,
                        'error_type': e_type,
                        'error_stacktrace': e_trace,
                    }
                },
                'creation_time': 0,
                'cell_id': job.cell_id,
                'run_id': job.run_id,
                'job_id': job_id
            }

        if state.get('finished', 0) == 1:
            try:
                widget_info = job.get_viewer_params(state)
            except Exception as e:
                # Can't get viewer params
                new_e = transform_job_exception(e)
                kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
                state['job_state'] = 'error'
                state['error'] = {
                    'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                    'message': 'Unable to build output viewer parameters!',
                    'name': 'App Error',
                    'code': getattr(new_e, "code", -1),
                    'source': getattr(new_e, "source", "JobManager")
                }

        if 'canceling' in self._running_jobs[job_id]:
            state['job_state'] = 'canceling'

        return {'state': state,
                'spec': app_spec,
                'widget_info': widget_info,
                'owner': job.owner}
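The `&lt;`/`&gt;` escaping above exists because the error text may be rendered as HTML in the front end. A minimal, self-contained illustration (the message is a placeholder):

    msg = "Server error: <unknown> tag in response"
    escaped = msg.replace('<', '&lt;').replace('>', '&gt;')
    print(escaped)  # Server error: &lt;unknown&gt; tag in response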
Example No. 33
    def _run_app_batch_internal(self, app_id, params, tag, version, cell_id, run_id, dry_run):
        batch_method = "kb_BatchApp.run_batch"
        batch_app_id = "kb_BatchApp/run_batch"
        batch_method_ver = "dev"
        batch_method_tag = "dev"
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        # A list of lists of UPAs, used for each subjob.
        batch_ws_upas = []
        # The list of actual input values, post-mapping.
        batch_run_inputs = []

        # The spec params map is the same for every param set, so build it once.
        spec_params_map = {p['id']: p for p in spec_params}
        for param_set in params:
            batch_ws_upas.append(extract_ws_refs(app_id, tag, spec_params, param_set))
            batch_run_inputs.append(self._map_inputs(
                spec['behavior']['kb_service_input_mapping'],
                param_set,
                spec_params_map))

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        job_meta = {
            'tag': batch_method_tag,
            'batch_app': app_id,
            'batch_tag': tag,
            'batch_size': len(params),
        }
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # Now put these all together in a way that can be sent to the batch processing app.
        batch_params = [{
            "module_name": service_name,
            "method_name": service_method,
            "service_ver": service_ver,
            "wsid": ws_id,
            "meta": job_meta,
            "batch_params": [{
                "params": batch_run_inputs[i],
                "source_ws_objects": batch_ws_upas[i]
            } for i in range(len(batch_run_inputs))],
        }]

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)

        job_meta['token_id'] = agent_token['id']
        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': batch_method,
            'service_ver': batch_method_ver,
            'params': batch_params,
            'app_id': batch_app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        # if len(ws_input_refs) > 0:
        #     job_runner_inputs['source_ws_objects'] = ws_input_refs

        # if we're doing a dry run, just return the inputs that we made.
        if dry_run:
            return job_runner_inputs

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': batch_method_tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_batch_app", log_info)

        try:
            job_id = clients.get("job_service", token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_batch_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job(job_id,
                      batch_app_id,
                      batch_params,
                      system_variable('user_id'),
                      tag=batch_method_tag,
                      app_version=batch_method_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'],
                      meta=job_meta)

        self._send_comm_message('run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
        JobManager().register_new_job(new_job)
        if cell_id is not None:
            return
        else:
            return new_job
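The nesting built for batch_params above can be hard to see at a glance; a minimal sketch of its shape (all values are illustrative placeholders):

    batch_params = [{
        "module_name": "SomeService",
        "method_name": "some_method",
        "service_ver": "1.0.0",
        "wsid": 12345,
        "meta": {"tag": "dev", "batch_size": 2},
        "batch_params": [
            {"params": [{"x": 1}], "source_ws_objects": ["1/2/3"]},
            {"params": [{"x": 2}], "source_ws_objects": []},
        ],
    }]
    print(len(batch_params[0]["batch_params"]))  # number of subjobs: 2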
Example No. 34
    def _run_app_internal(self, app_id, params, tag, version,
                          cell_id, run_id, dry_run):
        """
        Attempts to run the app and returns a Job with the running app info.
        Should *hopefully* also inject that app into the Narrative's metadata.
        Probably need some kind of JavaScript-foo to get that to work.

        Parameters:
        -----------
        app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
                    or 'MegaHit/run_megahit'.
        params - the dictionary of parameters to be used with the app.
                 They can be found by using the app_usage function. If any
                 non-optional parameters are missing, a ValueError will be raised.
        tag - optional, one of [release|beta|dev] (default=release)
        version - optional, a semantic version string. Only released modules
                  have versions, so if the tag is not 'release', and a version
                  is given, a ValueError will be raised.
        """
        ws_id = strict_system_variable('workspace_id')
        spec = self._get_validated_app_spec(app_id, tag, True, version=version)

        # Preflight check the params - all required ones are present, all
        # values are the right type, all numerical values are in given ranges
        spec_params = self.spec_manager.app_params(spec)

        spec_params_map = {p['id']: p for p in spec_params}
        ws_input_refs = extract_ws_refs(app_id, tag, spec_params, params)
        input_vals = self._map_inputs(
            spec['behavior']['kb_service_input_mapping'],
            params,
            spec_params_map)

        service_method = spec['behavior']['kb_service_method']
        service_name = spec['behavior']['kb_service_name']
        service_ver = spec['behavior'].get('kb_service_version', None)

        # Let the given version override the spec's version.
        if version is not None:
            service_ver = version

        # This is what calls the function in the back end - Module.method
        # This isn't the same as the app spec id.
        function_name = service_name + '.' + service_method
        job_meta = {'tag': tag}
        if cell_id is not None:
            job_meta['cell_id'] = cell_id
        if run_id is not None:
            job_meta['run_id'] = run_id

        # This is the input set for NJSW.run_job. Now we need the workspace id
        # and whatever fits in the metadata.
        job_runner_inputs = {
            'method': function_name,
            'service_ver': service_ver,
            'params': input_vals,
            'app_id': app_id,
            'wsid': ws_id,
            'meta': job_meta
        }
        if len(ws_input_refs) > 0:
            job_runner_inputs['source_ws_objects'] = ws_input_refs
        if dry_run:
            return job_runner_inputs

        # We're now almost ready to run the job. Last, we need an agent token.
        token_name = 'KBApp_{}'.format(app_id)[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
        job_runner_inputs['meta']['token_id'] = agent_token['id']

        # Log that we're trying to run a job...
        log_info = {
            'app_id': app_id,
            'tag': tag,
            'version': service_ver,
            'username': system_variable('user_id'),
            'wsid': ws_id
        }
        kblogging.log_event(self._log, "run_app", log_info)

        try:
            job_id = clients.get("job_service", token=agent_token['token']).run_job(job_runner_inputs)
        except Exception as e:
            log_info.update({'err': str(e)})
            kblogging.log_event(self._log, "run_app_error", log_info)
            raise transform_job_exception(e) from e

        new_job = Job(job_id,
                      app_id,
                      input_vals,
                      system_variable('user_id'),
                      tag=tag,
                      app_version=service_ver,
                      cell_id=cell_id,
                      run_id=run_id,
                      token_id=agent_token['id'])

        self._send_comm_message('run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
        JobManager().register_new_job(new_job)
        if cell_id is None:
            return new_job
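For reference, the NJSW.run_job input assembled above looks like this minimal sketch (all values are illustrative placeholders):

    job_runner_inputs = {
        'method': 'MegaHit.run_megahit',   # service_name.service_method
        'service_ver': '1.0.0',
        'params': [{'read_library': 'My_PE_Library'}],
        'app_id': 'MegaHit/run_megahit',
        'wsid': 12345,
        'meta': {'tag': 'release', 'token_id': 'abc123'},
        'source_ws_objects': ['12345/6/7'],  # only present if refs were found
    }
    print(sorted(job_runner_inputs))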
Example No. 35
    def initialize_jobs(self):
        """
        Initializes this JobManager.
        This is expected to be run by a running Narrative, which is naturally
        linked to a workspace. It performs the following steps:
        1. app_util.system_variable('workspace_id')
        2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
        3. initialize the Job objects by running NJS.get_job_params on each of those (also gets app_id)
        4. start the status lookup loop.
        """

        ws_id = system_variable('workspace_id')
        try:
            nar_jobs = clients.get('user_and_job_state').list_jobs2({
                'authstrat': 'kbaseworkspace',
                'authparams': [str(ws_id)]
            })
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get initial jobs list',
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'user_and_job_state'
            }
            self._send_comm_message('job_init_err', error)
            raise new_e

        for info in nar_jobs:
            job_id = info[0]
            user_info = info[1]
            job_meta = info[10]
            try:
                job_info = clients.get('job_service').get_job_params(job_id)[0]

                self._running_jobs[job_id] = {
                    'refresh': True,
                    'job': Job.from_state(job_id,
                                          job_info,
                                          user_info[0],
                                          app_id=job_info.get('app_id'),
                                          tag=job_meta.get('tag', 'release'),
                                          cell_id=job_meta.get('cell_id', None),
                                          run_id=job_meta.get('run_id', None))
                }
            except Exception as e:
                kblogging.log_event(self._log, 'init_error', {'err': str(e)})
                new_e = transform_job_exception(e)
                error = {
                    'error': 'Unable to get job info on initial lookup',
                    'job_id': job_id,
                    'message': getattr(new_e, 'message', 'Unknown reason'),
                    'code': getattr(new_e, 'code', -1),
                    'source': getattr(new_e, 'source', 'jobmanager'),
                    'name': getattr(new_e, 'name', type(e).__name__),
                    'service': 'job_service'
                }
                self._send_comm_message('job_init_lookup_err', error)
                raise new_e # should crash and burn on any of these.

        if not self._running_lookup_loop:
            # only keep one loop at a time in case this gets called again!
            if self._lookup_timer is not None:
                self._lookup_timer.cancel()
            self._running_lookup_loop = True
            self._lookup_job_status_loop()
        else:
            self._lookup_all_job_status()
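For reference, the UJS list_jobs2 rows unpacked above are positional; a minimal illustration (row contents are placeholders):

    info = ['job_1234', ['some_user'], None, None, None, None, None, None,
            None, None, {'tag': 'release', 'cell_id': 'cell_1'}]
    job_id, user_info, job_meta = info[0], info[1], info[10]
    print(job_id, user_info[0], job_meta.get('cell_id'))  # job_1234 some_user cell_1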