def test_transform_ujs_err(self):
    """A UJSServerError should be transformed into a narrative error with source 'ujs'."""
    err_code = 1000
    err_message = "some error message"
    err_name = "UJSError"
    transformed = transform_job_exception(
        UJSServerError(err_name, err_code, err_message)
    )
    for attr, expected in (
        ("code", err_code),
        ("message", err_message),
        ("name", err_name),
        ("source", "ujs"),
    ):
        self.assertEqual(getattr(transformed, attr), expected)
def test_transform_ee2_err(self):
    """An EEServerError should become a narrative error with source 'ee2' and no error field."""
    err_code = 1000
    err_name = "EEError"
    transformed = transform_job_exception(
        EEServerError(err_name, err_code, ERROR_MSG)
    )
    self.assertEqual(transformed.code, err_code)
    self.assertEqual(transformed.message, ERROR_MSG)
    self.assertEqual(transformed.name, err_name)
    self.assertEqual(transformed.source, "ee2")
    self.assertIsNone(transformed.error)
def test_transform_http_err_internal(self):
    """An HTTP 500 should be reported as an internal KBase service error."""
    status = 500
    expected_message = "An internal error occurred in the KBase service."
    response = requests.Response()
    response.status_code = status
    transformed = transform_job_exception(
        HTTPError("http error", response=response)
    )
    self.assertEqual(transformed.code, status)
    self.assertEqual(transformed.message, expected_message)
    self.assertEqual(transformed.name, "HTTPError")
    self.assertEqual(transformed.source, "network")
def test_transform_http_err_unknown(self):
    """An unrecognized HTTP status code should map to the generic 'untracked error' message."""
    status = 666
    expected_message = "An untracked error occurred."
    response = requests.Response()
    response.status_code = status
    transformed = transform_job_exception(
        HTTPError("http error", response=response)
    )
    self.assertEqual(transformed.code, status)
    self.assertEqual(transformed.message, expected_message)
    self.assertEqual(transformed.name, "HTTPError")
    self.assertEqual(transformed.source, "network")
def cancel_job(self, job_id, parent_job_id=None):
    """
    Cancels a running job, placing it in a canceled state.
    Does NOT delete the job.
    Raises an exception if the current user doesn't have permission to cancel the job.

    :param job_id: the id of the job to cancel; a ValueError is raised if None
    :param parent_job_id: if the job is a child job, the id of its parent (optional)
    """
    if job_id is None:
        raise ValueError('Job id required for cancellation!')
    if not parent_job_id and job_id not in self._running_jobs:
        # Unknown top-level job: tell the front end rather than raising.
        self._send_comm_message('job_does_not_exist', {
            'job_id': job_id, 'source': 'cancel_job'
        })
        return

    try:
        state = self._get_job_state(job_id, parent_job_id=parent_job_id)
        if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
            # It's already finished, don't try to cancel it again.
            return
    except Exception as e:
        # Chain the original exception so the root cause isn't lost.
        raise ValueError('Unable to get Job state') from e

    # Stop updating the job status while we try to cancel.
    # Also, set it to have a special state of 'canceling' while we're doing the cancel
    if not parent_job_id:
        is_refreshing = self._running_jobs[job_id].get('refresh', 0)
        self._running_jobs[job_id]['refresh'] = 0
        self._running_jobs[job_id]['canceling'] = True
    try:
        clients.get('job_service').cancel_job({'job_id': job_id})
    except Exception as e:
        new_e = transform_job_exception(e)
        error = {
            'error': 'Unable to cancel job',  # fixed wording (was "Unable to get cancel job")
            'message': getattr(new_e, 'message', 'Unknown reason'),
            'code': getattr(new_e, 'code', -1),
            'source': getattr(new_e, 'source', 'jobmanager'),
            'name': getattr(new_e, 'name', type(e).__name__),
            'request_type': 'cancel_job',
            'job_id': job_id
        }
        self._send_comm_message('job_comm_error', error)
        # Re-raise the original exception with its traceback intact (was `raise (e)`).
        raise
    finally:
        if not parent_job_id:
            self._running_jobs[job_id]['refresh'] = is_refreshing
            del self._running_jobs[job_id]['canceling']

    # Rather than a separate message, how about triggering a job-status message:
    self._lookup_job_status(job_id, parent_job_id=parent_job_id)
def retry_jobs(self, job_id_list: List[str]) -> dict:
    """
    Retry a list of jobs via ee2 and report the outcome per job.

    Returns a dict (after add_errors_to_results post-processing) whose values look like
    {
        "job_id": job_id,
        "job": {"state": {"job_id": job_id, "status": status, ...} ...},
        "retry_id": retry_id,                                            # if a retry was launched
        "retry": {"state": {"job_id": retry_id, "status": status, ...} ...},
        "error": "...",                                                  # if the retry failed
    }
    Jobs that could not be found are reported as
    {
        "job": {"state": {"job_id": job_id, "status": DOES_NOT_EXIST}},
        "error": f"Cannot find job with ID {job_id}",
    }
    where the innermost dictionaries are job states from ee2 and are within
    the job states from job.output_state().
    """
    job_ids, error_ids = self._check_job_list(job_id_list)
    try:
        retry_outcomes = clients.get("execution_engine2").retry_jobs(
            {"job_ids": job_ids}
        )
    except Exception as e:
        raise transform_job_exception(e, "Unable to retry job(s)")

    # Refresh the states of both the original jobs and any newly created retries.
    original_ids = [outcome["job_id"] for outcome in retry_outcomes]
    new_ids = [
        outcome["retry_id"] for outcome in retry_outcomes if "retry_id" in outcome
    ]
    combined_states = dict(self._construct_job_output_state_set(original_ids))
    combined_states.update(
        self._construct_job_output_state_set(
            new_ids, self._create_jobs(new_ids)  # add to self._running_jobs index
        )
    )

    # Fill in the job state details for each retry outcome.
    results_by_job_id = {}
    for outcome in retry_outcomes:
        job_id = outcome["job_id"]
        entry = {"job_id": job_id, "job": combined_states[job_id]}
        if "retry_id" in outcome:
            entry["retry_id"] = outcome["retry_id"]
            entry["retry"] = combined_states[outcome["retry_id"]]
        if "error" in outcome:
            entry["error"] = outcome["error"]
        results_by_job_id[job_id] = entry

    return self.add_errors_to_results(results_by_job_id, error_ids)
def test_transform_ee2_err__with_error(self):
    """When an error string is supplied, it should be carried through to the narrative error."""
    err_code = 1000
    err_name = "EEError"
    extra_error = "Unable to perform some request"
    transformed = transform_job_exception(
        EEServerError(err_name, err_code, ERROR_MSG), extra_error
    )
    self.assertEqual(transformed.code, err_code)
    self.assertEqual(transformed.message, ERROR_MSG)
    self.assertEqual(transformed.name, err_name)
    self.assertEqual(transformed.source, "ee2")
    self.assertEqual(transformed.error, extra_error)
def test_transform_http_err_unavailable(self):
    """HTTP 404/502/503 should map to the 'service unavailable' narrative error."""
    expected_message = "A KBase service is currently unavailable."
    for status in (404, 502, 503):
        response = requests.Response()
        response.status_code = status
        transformed = transform_job_exception(
            HTTPError("http error", response=response)
        )
        self.assertEqual(transformed.code, status)
        self.assertEqual(transformed.message, expected_message)
        self.assertEqual(transformed.name, "HTTPError")
        self.assertEqual(transformed.source, "network")
def test_transform_http_err_timeout(self):
    """HTTP 504/598/599 should map to the 'temporary network error' narrative error."""
    expected_message = "There was a temporary network connection error."
    for status in (504, 598, 599):
        response = requests.Response()
        response.status_code = status
        transformed = transform_job_exception(
            HTTPError("http error", response=response)
        )
        self.assertEqual(transformed.code, status)
        self.assertEqual(transformed.message, expected_message)
        self.assertEqual(transformed.name, "HTTPError")
        self.assertEqual(transformed.source, "network")
def cancel_job(self, job_id: str, parent_job_id: str = None) -> None:
    """
    Cancels a running job, placing it in a canceled state. Does NOT delete the job.

    If the job_id is None or not found in this Narrative, a ValueError is raised.
    This then checks the job to see if it is already canceled/finished, then
    attempts to cancel it. If either of those steps fail, the transformed
    exception from transform_job_exception is raised.

    :param job_id: the id of the job to cancel
    :param parent_job_id: if the job is a child job, the id of its parent (optional)
    :raises ValueError: if job_id is None, or is a top-level job unknown to this Narrative
    """
    if job_id is None:
        raise ValueError('Job id required for cancellation!')
    if not parent_job_id and job_id not in self._running_jobs:
        raise ValueError(f"No job present with id {job_id}")

    try:
        cancel_status = clients.get(
            "execution_engine2").check_job_canceled({"job_id": job_id})
        if cancel_status.get("finished", 0) == 1 or cancel_status.get(
                "canceled", 0) == 1:
            # It's already finished, don't try to cancel it again.
            return
    except Exception as e:
        # Chain the original exception so the root cause isn't lost.
        raise transform_job_exception(e) from e

    # Stop updating the job status while we try to cancel.
    # Also, set it to have a special state of 'canceling' while we're doing the cancel
    if not parent_job_id:
        is_refreshing = self._running_jobs[job_id].get('refresh', 0)
        self._running_jobs[job_id]['refresh'] = 0
        self._running_jobs[job_id]['canceling'] = True
    try:
        clients.get('execution_engine2').cancel_job({'job_id': job_id})
    except Exception as e:
        raise transform_job_exception(e) from e
    finally:
        # Restore the refresh state even if cancellation failed.
        if not parent_job_id:
            self._running_jobs[job_id]['refresh'] = is_refreshing
            del self._running_jobs[job_id]['canceling']
def cancel_job(self, job_id):
    """
    Cancels a running job, placing it in a canceled state.
    Does NOT delete the job.
    Raises an exception if the current user doesn't have permission to cancel the job.

    :param job_id: the id of the job to cancel; a ValueError is raised if None
    """
    if job_id is None:
        raise ValueError('Job id required for cancellation!')
    if job_id not in self._running_jobs:
        # Unknown job: tell the front end rather than raising.
        self._send_comm_message('job_does_not_exist', {'job_id': job_id, 'source': 'cancel_job'})
        return

    try:
        job = self.get_job(job_id)
        state = job.state()
        if state.get('canceled', 0) == 1 or state.get('finished', 0) == 1:
            # It's already finished, don't try to cancel it again.
            return
    except Exception as e:
        # Chain the original exception so the root cause isn't lost.
        raise ValueError('Unable to get Job state') from e

    # Stop updating the job status while we try to cancel.
    # Also, set it to have a special state of 'canceling' while we're doing the cancel
    is_refreshing = self._running_jobs[job_id].get('refresh', False)
    self._running_jobs[job_id]['refresh'] = False
    self._running_jobs[job_id]['canceling'] = True
    try:
        clients.get('job_service').cancel_job({'job_id': job_id})
    except Exception as e:
        new_e = transform_job_exception(e)
        error = {
            'error': 'Unable to cancel job',  # fixed wording (was "Unable to get cancel job")
            'message': getattr(new_e, 'message', 'Unknown reason'),
            'code': getattr(new_e, 'code', -1),
            'source': getattr(new_e, 'source', 'jobmanager'),
            'name': getattr(new_e, 'name', type(e).__name__),
            'request_type': 'cancel_job',
            'job_id': job_id
        }
        self._send_comm_message('job_comm_error', error)
        # Re-raise the original exception with its traceback intact (was `raise(e)`).
        raise
    finally:
        self._running_jobs[job_id]['refresh'] = is_refreshing
        del self._running_jobs[job_id]['canceling']

    # Rather than a separate message, how about triggering a job-status message:
    self._lookup_job_status(job_id)
def _cancel_job(self, job_id: str):
    """
    Attempt to cancel a job via ee2, pausing status refresh while doing so.

    The previous ``-> None`` return annotation was wrong: this method returns
    the transformed error when cancellation fails.

    :param job_id: the id of the job to cancel; must be in self._running_jobs
    :return: None on success, or the transformed exception from
        transform_job_exception if ee2 raised while canceling
    """
    # Stop updating the job status while we try to cancel.
    # Set the job to a special state of 'canceling' while we're doing the cancel
    is_refreshing = self._running_jobs[job_id].get("refresh", False)
    self._running_jobs[job_id]["refresh"] = False
    self._running_jobs[job_id]["canceling"] = True
    error = None
    try:
        clients.get("execution_engine2").cancel_job({"job_id": job_id})
    except Exception as e:
        error = transform_job_exception(e, "Unable to cancel job")
    finally:
        # Restore the refresh state even if something unexpected was raised.
        self._running_jobs[job_id]["refresh"] = is_refreshing
        del self._running_jobs[job_id]["canceling"]
    return error
def initialize_jobs(self):
    """
    Initializes this JobManager.
    This is expected to be run by a running Narrative, and naturally linked to a workspace.
    So it does the following steps.
    1. app_util.system_variable('workspace_id')
    2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
    3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
    4. start the status lookup loop.
    """
    ws_id = system_variable("workspace_id")
    kblogging.log_event(self._log, "JobManager.initialize_jobs", {"ws_id": ws_id})
    try:
        job_states = clients.get("execution_engine2").check_workspace_jobs(
            {"workspace_id": ws_id, "return_list": 0}
        )
        self._running_jobs = dict()
    except Exception as e:
        kblogging.log_event(self._log, "init_error", {"err": str(e)})
        raise transform_job_exception(e)

    terminal_statuses = ["completed", "errored", "terminated"]
    for job_id, job_state in job_states.items():
        job_input = job_state.get("job_input", {})
        cell_info = job_input.get("narrative_cell_info", {})
        job = Job.from_state(
            job_id,
            job_input,
            job_state.get("user"),
            app_id=job_input.get("app_id"),
            tag=cell_info.get("tag", "release"),
            cell_id=cell_info.get("cell_id", None),
            run_id=cell_info.get("run_id", None),
            token_id=cell_info.get("token_id", None),
            meta=cell_info,
        )
        # Only jobs that are still running keep refreshing their status.
        still_running = job_state.get("status") not in terminal_statuses
        self._running_jobs[job_id] = {
            "refresh": 1 if still_running else 0,
            "job": job,
        }
def get_job_logs(
    self,
    job_id: str,
    parent_job_id: str = None,
    first_line: int = 0,
    num_lines: int = None,
    latest_only: bool = False,
) -> tuple:
    """
    Fetches log lines for a job.

    :param job_id: str - the job id from the execution engine
    :param parent_job_id: if the job is a child job, this is its parent (optional);
        currently unused in this method, kept for API symmetry
    :param first_line: int - the first line to be requested by the log. 0-indexed.
        If < 0, this will be set to 0
    :param num_lines: int - the maximum number of lines to return.
        If < 0, will be reset to 0. If None, then will not be considered, and
        just return all the lines. (Was mis-documented as ``max_lines``.)
    :param latest_only: bool - if True, will only return the most recent num_lines
        of logs. This overrides the first_line parameter if set to True. So if the
        call made is get_job_logs(id, first_line=0, num_lines=5, latest_only=True),
        and there are 100 log lines available, then lines 96-100 will be returned.
    :returns: 3-tuple. elements in order:
        int - the first line returned
        int - the number of log lines currently available for that job
        list - the lines themselves, fresh from the server. These are all tiny
            dicts with key "is_error" (either 0 or 1) and "line" - the log line string
    :raises ValueError: if the job_id doesn't exist or is not present
    :raises: the transformed exception from transform_job_exception if the log
        fetch itself fails
    """
    job = self.get_job(job_id)

    # Clamp negative inputs to sane values.
    if first_line < 0:
        first_line = 0
    if num_lines is not None and num_lines < 0:
        num_lines = 0

    try:
        if latest_only:
            (max_lines, logs) = job.log()
            if num_lines is not None and max_lines > num_lines:
                # Keep only the tail of the log.
                first_line = max_lines - num_lines
                logs = logs[first_line:]
        else:
            (max_lines, logs) = job.log(first_line=first_line, num_lines=num_lines)

        return (first_line, max_lines, logs)
    except Exception as e:
        # Chain the original exception so the root cause isn't lost.
        raise transform_job_exception(e) from e
def initialize_jobs(self, cell_ids: List[str] = None) -> None:
    """
    Initializes this JobManager.
    This is expected to be run by a running Narrative, and naturally linked to a workspace.
    So it does the following steps.
    1. gets the current workspace ID from app_util.system_variable('workspace_id')
    2. get list of jobs with that ws id from ee2 (also gets tag, cell_id, run_id)
    3. initialize the Job objects and add them to the running jobs list
    4. start the status lookup loop.
    """
    ws_id = system_variable("workspace_id")
    kblogging.log_event(self._log, "JobManager.initialize_jobs", {"ws_id": ws_id})
    try:
        job_states = clients.get("execution_engine2").check_workspace_jobs(
            {
                "workspace_id": ws_id,
                "return_list": 0,  # do not remove
                "exclude_fields": JOB_INIT_EXCLUDED_JOB_STATE_FIELDS,
            }
        )
    except Exception as e:
        kblogging.log_event(self._log, "init_error", {"err": str(e)})
        raise transform_job_exception(e, "Unable to initialize jobs")

    self._running_jobs = dict()

    for job_state in self._reorder_parents_children(job_states).values():
        children = None
        if job_state.get("batch_job"):
            children = [
                self.get_job(child_id)
                for child_id in job_state.get("child_jobs", [])
            ]
        job = Job(job_state, children=children)

        # Refresh only jobs that are not in a terminal state, are not part
        # of a batch, and (when cell_ids is given) live in one of those cells.
        should_refresh = not job.was_terminal() and not job.batch_id
        if cell_ids is not None:
            should_refresh = should_refresh and job.in_cells(cell_ids)

        self.register_new_job(job, should_refresh)
def initialize_jobs(self):
    """
    Initializes this JobManager.
    This is expected to be run by a running Narrative, and naturally linked to a workspace.
    So it does the following steps.
    1. app_util.system_variable('workspace_id')
    2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
    3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
    4. start the status lookup loop.
    """
    ws_id = system_variable('workspace_id')
    kblogging.log_event(self._log, "JobManager.initialize_jobs", {'ws_id': ws_id})
    try:
        job_states = clients.get('execution_engine2').check_workspace_jobs({
            'workspace_id': ws_id,
            'return_list': 0
        })
        self._running_jobs = dict()
    except Exception as e:
        kblogging.log_event(self._log, 'init_error', {'err': str(e)})
        raise transform_job_exception(e)

    for job_id, state in job_states.items():
        job_input = state.get('job_input', {})
        cell_info = job_input.get('narrative_cell_info', {})
        # Jobs still in a non-terminal state get their status refreshed.
        refresh_flag = int(
            state.get('status') not in ['completed', 'errored', 'terminated']
        )
        self._running_jobs[job_id] = {
            'refresh': refresh_flag,
            'job': Job.from_state(
                job_id,
                job_input,
                state.get('user'),
                app_id=job_input.get('app_id'),
                tag=cell_info.get('tag', 'release'),
                cell_id=cell_info.get('cell_id', None),
                run_id=cell_info.get('run_id', None),
                token_id=cell_info.get('token_id', None),
                meta=cell_info
            )
        }
def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id, **kwargs):
    """
    Attempts to run the app, returns a Job with the running app info.
    Should *hopefully* also inject that app into the Narrative's metadata.
    Probably need some kind of JavaScript-foo to get that to work.

    Parameters:
    -----------
    app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
             or 'MegaHit/run_megahit'.
    params - the dictionary of parameters.
    tag - optional, one of [release|beta|dev] (default=release)
    version - optional, a semantic version string. Only released modules
              have versions, so if the tag is not 'release', and a version
              is given, a ValueError will be raised.
    **kwargs - these are the set of parameters to be used with the app.
               They can be found by using the app_usage function. If any
               non-optional apps are missing, a ValueError will be raised.

    Example:
    --------
    my_job = mm.run_app('MegaHit/run_megahit',
                        version=">=1.0.0",
                        read_library_name="My_PE_Library",
                        output_contigset_name="My_Contig_Assembly")
    """
    # TODO: this needs restructuring so that we can send back validation
    # failure messages. Perhaps a separate function and catch the errors,
    # or return an error structure.

    # Intro tests:
    self.spec_manager.check_app(app_id, tag, raise_exception=True)

    if version is not None and tag != "release":
        # Bug fix: re.match takes (pattern, string); the arguments were
        # swapped, so this semantic-version guard could never fire.
        if re.match(r'\d+\.\d+\.\d+', version) is not None:
            raise ValueError("Semantic versions only apply to released app modules. You can use a Git commit hash instead to specify a version.")

    # Get the spec & params
    spec = self.spec_manager.get_spec(app_id, tag)

    # There's some branching to do here.
    # Cases:
    # app has behavior.kb_service_input_mapping -- is a valid long-running app.
    # app only has behavior.output_mapping - not kb_service_input_mapping or
    #   script_module - it's a viewer and should return immediately.
    # app has other things besides kb_service_input_mapping -- not a valid app.
    if 'behavior' not in spec:
        raise Exception("This app appears invalid - it has no defined behavior")
    if 'kb_service_input_mapping' not in spec['behavior']:
        raise Exception("This app does not appear to be a long-running job! Please use 'run_local_app' to start this instead.")

    # Preflight check the params - all required ones are present, all values
    # are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)
    (params, ws_input_refs) = self._validate_parameters(app_id, tag, spec_params, params)

    ws_id = system_variable('workspace_id')
    if ws_id is None:
        # (fixed typo in message: "retrive" -> "retrieve")
        raise ValueError('Unable to retrieve current Narrative workspace information!')

    input_vals = self._map_inputs(spec['behavior']['kb_service_input_mapping'], params)

    service_method = spec['behavior']['kb_service_method']
    service_name = spec['behavior']['kb_service_name']
    service_ver = spec['behavior'].get('kb_service_version', None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    function_name = service_name + '.' + service_method
    job_meta = {'tag': tag}
    if cell_id is not None:
        job_meta['cell_id'] = cell_id
    if run_id is not None:
        job_meta['run_id'] = run_id

    # This is the input set for NJSW.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        'method': function_name,
        'service_ver': service_ver,
        'params': input_vals,
        'app_id': app_id,
        'wsid': ws_id,
        'meta': job_meta
    }
    if len(ws_input_refs) > 0:
        job_runner_inputs['source_ws_objects'] = ws_input_refs

    # Log that we're trying to run a job...
    log_info = {
        'app_id': app_id,
        'tag': tag,
        'version': service_ver,
        'username': system_variable('user_id'),
        'wsid': ws_id
    }
    kblogging.log_event(self._log, "run_app", log_info)

    try:
        job_id = self.njs.run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({'err': str(e)})
        kblogging.log_event(self._log, "run_app_error", log_info)
        raise transform_job_exception(e) from e

    new_job = Job(job_id,
                  app_id,
                  [params],
                  system_variable('user_id'),
                  tag=tag,
                  app_version=service_ver,
                  cell_id=cell_id,
                  run_id=run_id)

    self._send_comm_message('run_status', {
        'event': 'launched_job',
        'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
        'cell_id': cell_id,
        'run_id': run_id,
        'job_id': job_id
    })
    JobManager().register_new_job(new_job)
    # When launched from a cell, the job info travels via the comm message;
    # otherwise hand the Job object back to the caller.
    if cell_id is not None:
        return
    else:
        return new_job
def initialize_jobs(self, start_lookup_thread=True):
    """
    Initializes this JobManager.
    This is expected to be run by a running Narrative, and naturally linked to a workspace.
    So it does the following steps.
    1. app_util.system_variable('workspace_id')
    2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
    3. initialize the Job objects by running NJS.get_job_params (also gets app_id)
    4. start the status lookup loop.

    :param start_lookup_thread: if True (default), start the periodic status
        lookup loop; otherwise do a single status lookup pass
    :raises: the transformed exception from transform_job_exception if the
        initial UJS job list lookup or any per-job lookup fails (a comm error
        message is sent to the front end first)
    """
    # Tell the front end that initialization has started.
    the_time = int(round(time.time() * 1000))
    self._send_comm_message('start', {'time': the_time})
    ws_id = system_variable('workspace_id')
    try:
        # Fetch all jobs associated with this workspace from UJS.
        nar_jobs = clients.get('user_and_job_state').list_jobs2({
            'authstrat': 'kbaseworkspace',
            'authparams': [str(ws_id)]
        })
    except Exception as e:
        kblogging.log_event(self._log, 'init_error', {'err': str(e)})
        new_e = transform_job_exception(e)
        error = {
            'error': 'Unable to get initial jobs list',
            'message': getattr(new_e, 'message', 'Unknown reason'),
            'code': getattr(new_e, 'code', -1),
            'source': getattr(new_e, 'source', 'jobmanager'),
            'name': getattr(new_e, 'name', type(e).__name__),
            'service': 'user_and_job_state'
        }
        self._send_comm_message('job_init_err', error)
        raise new_e

    # Batch-fetch the job params for every job found above.
    job_ids = [j[0] for j in nar_jobs]
    job_states = clients.get('job_service').check_jobs({
        'job_ids': job_ids,
        'with_job_params': 1
    })
    job_param_info = job_states.get('job_params', {})
    job_check_error = job_states.get('check_error', {})
    error_jobs = dict()
    for info in nar_jobs:
        # UJS job tuple: index 0 = job id, 1 = user info, 10 = narrative metadata.
        job_id = info[0]
        user_info = info[1]
        job_meta = info[10]
        try:
            if job_id in job_param_info:
                job_info = job_param_info[job_id]
                job = Job.from_state(job_id,
                                     job_info,
                                     user_info[0],
                                     app_id=job_info.get('app_id'),
                                     tag=job_meta.get('tag', 'release'),
                                     cell_id=job_meta.get('cell_id', None),
                                     run_id=job_meta.get('run_id', None),
                                     token_id=job_meta.get('token_id', None),
                                     meta=job_meta)
                # Note that when jobs for this narrative are initially loaded,
                # they are set to not be refreshed. Rather, if a client requests
                # updates via the start_job_update message, the refresh flag will
                # be set to True.
                self._running_jobs[job_id] = {
                    'refresh': 0,
                    'job': job
                }
            elif job_id in job_check_error:
                # The execution engine couldn't look this job up; build a
                # synthetic error state to report to the front end.
                job_err_state = {
                    'job_state': 'error',
                    'error': {
                        'error': 'KBase execution engine returned an error while looking up this job.',
                        'message': job_check_error[job_id].get('message', 'No error message available'),
                        'name': 'Job Error',
                        'code': job_check_error[job_id].get('code', -999),
                        'exception': {
                            'error_message': 'Job lookup in execution engine failed',
                            'error_type': job_check_error[job_id].get('name', 'unknown'),
                            'error_stacktrace': job_check_error[job_id].get('error', '')
                        }
                    },
                    'cell_id': job_meta.get('cell_id', None),
                    'run_id': job_meta.get('run_id', None),
                }
                error_jobs[job_id] = job_err_state
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get job info on initial lookup',
                'job_id': job_id,
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'job_service'
            }
            self._send_comm_message('job_init_lookup_err', error)
            # should crash and burn on any of these.
            raise new_e

    # Report partial/total lookup failures to the front end (but keep going).
    if len(job_check_error):
        err_str = 'Unable to find info for some jobs on initial lookup'
        err_type = 'job_init_partial_err'
        if len(job_check_error) == len(nar_jobs):
            err_str = 'Unable to get info for any job on initial lookup'
            err_type = 'job_init_lookup_err'
        error = {
            'error': err_str,
            'job_errors': error_jobs,
            'message': 'Job information was unavailable from the server',
            'code': -2,
            'source': 'jobmanager',
            'name': 'jobmanager',
            'service': 'job_service',
        }
        self._send_comm_message(err_type, error)

    if not self._running_lookup_loop and start_lookup_thread:
        # only keep one loop at a time in case this gets called again!
        if self._lookup_timer is not None:
            self._lookup_timer.cancel()
        self._running_lookup_loop = True
        self._lookup_job_status_loop()
    else:
        self._lookup_all_job_status()
def _construct_job_status(self, job, state):
    """
    Creates a Job status dictionary with structure:
    {
        owner: string (username),
        spec: app_spec (from NMS, via biokbase.narrative.jobs.specmanager),
        widget_info: (if not finished, None, else...) job.get_viewer_params result,
        state: {
            job_state: string,
            error (if present): dict of error info,
            cell_id: string/None,
            run_id: string/None,
            awe_job_id: string/None,
            canceled: 0/1,
            creation_time: epoch second,
            exec_start_time: epoch/none,
            finish_time: epoch/none,
            finished: 0/1,
            job_id: string,
            status: (from UJS) [
                timestamp(last_update, string),
                stage (string),
                status (string),
                progress (string/None),
                est_complete (string/None),
                complete (0/1),
                error (0/1)
            ],
            ujs_url: string
        }
    }

    :param job: the Job object to report on, or None if it couldn't be found
    :param state: the job's last known state dict, None, or a dict containing
        a 'lookup_error' key if the state fetch failed
    """
    widget_info = None
    app_spec = {}

    if job is None:
        # No Job at all: synthesize an error state so callers still receive
        # a well-formed status structure.
        state = {
            'job_state': 'error',
            'error': {
                'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                'message': 'Job does not exist',
                'name': 'Job Error',
                'code': -1,
                'exception': {
                    'error_message': 'job not found in JobManager',
                    'error_type': 'ValueError',
                    'error_stacktrace': ''
                }
            },
            'cell_id': None,
            'run_id': None,
        }
        # NOTE(review): this branch returns the spec under 'app_spec' while the
        # final return below uses 'spec' - looks inconsistent, but consumers may
        # rely on either key; confirm before unifying.
        return {
            'state': state,
            'app_spec': app_spec,
            'widget_info': widget_info,
            'owner': None
        }

    if state is None:
        # The job exists but we couldn't get any state for it.
        kblogging.log_event(self._log, "lookup_job_status.error", {'err': 'Unable to get job state for job {}'.format(job.job_id)})
        state = {
            'job_state': 'error',
            'error': {
                'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                'message': 'Unable to return job state',
                'name': 'Job Error',
                'code': -1,
                'source': 'JobManager._construct_job_status',
                'exception': {
                    'error_message': 'No state provided during lookup',
                    'error_type': 'null-state',
                    'error_stacktrace': '',
                }
            },
            'creation_time': 0,
            'cell_id': job.cell_id,
            'run_id': job.run_id,
            'job_id': job.job_id
        }
    elif 'lookup_error' in state:
        # The state fetch itself raised; wrap the failure details.
        kblogging.log_event(self._log, "lookup_job_status.error", {
            'err': 'Problem while getting state for job {}'.format(job.job_id),
            'info': str(state['lookup_error'])
        })
        # Bug fix: creation_time/cell_id/run_id/job_id were previously nested
        # inside the 'error' dict (mismatched braces); they belong at the top
        # level of the state, consistent with the null-state branch above.
        state = {
            'job_state': 'error',
            'error': {
                'error': 'Unable to fetch current state. Please try again later, or contact KBase.',
                'message': 'Error while looking up job state',
                'name': 'Job Error',
                'code': -1,
                'source': 'JobManager._construct_job_status',
                'exception': {
                    'error_message': 'Error while fetching job state',
                    'error_type': 'failed-lookup',
                },
                'error_response': state['lookup_error'],
            },
            'creation_time': 0,
            'cell_id': job.cell_id,
            'run_id': job.run_id,
            'job_id': job.job_id
        }

    if state.get('finished', 0) == 1:
        # The job completed; try to build its output viewer parameters.
        try:
            widget_info = job.get_viewer_params(state)
        except Exception as e:
            # Can't get viewer params
            new_e = transform_job_exception(e)
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
            state['job_state'] = 'error'
            state['error'] = {
                'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                'message': 'Unable to build output viewer parameters!',
                'name': 'App Error',
                'code': getattr(new_e, "code", -1),
                'source': getattr(new_e, "source", "JobManager")
            }

    # Surface an in-progress cancellation as a distinct state.
    if 'canceling' in self._running_jobs[job.job_id]:
        state['job_state'] = 'canceling'

    state.update({
        'child_jobs': self._child_job_states(
            state.get('sub_jobs', []),
            job.meta.get('batch_app'),
            job.meta.get('batch_tag')
        )
    })
    if 'batch_size' in job.meta:
        state.update({'batch_size': job.meta['batch_size']})

    return {'state': state,
            'spec': app_spec,
            'widget_info': widget_info,
            'owner': job.owner,
            'listener_count': self._running_jobs[job.job_id]['refresh']}
def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id, dry_run):
    """
    Attempts to run the app, returns a Job with the running app info.
    Should *hopefully* also inject that app into the Narrative's metadata.
    Probably need some kind of JavaScript-foo to get that to work.

    Parameters:
    -----------
    app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
             or 'MegaHit/run_megahit'.
    params - a dictionary of parameters.
    tag - optional, one of [release|beta|dev] (default=release)
    version - optional, a semantic version string. Only released modules
              have versions, so if the tag is not 'release', and a version
              is given, a ValueError will be raised.
    dry_run - if True, return the assembled job-runner inputs instead of
              actually submitting the job.
    """
    ws_id = strict_system_variable('workspace_id')
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Preflight check the params - all required ones are present, all
    # values are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)
    spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                           for i in range(len(spec_params)))
    ws_input_refs = extract_ws_refs(app_id, tag, spec_params, params)
    input_vals = self._map_inputs(
        spec['behavior']['kb_service_input_mapping'],
        params,
        spec_params_map)

    service_method = spec['behavior']['kb_service_method']
    service_name = spec['behavior']['kb_service_name']
    service_ver = spec['behavior'].get('kb_service_version', None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    function_name = service_name + '.' + service_method
    job_meta = {'tag': tag}
    if cell_id is not None:
        job_meta['cell_id'] = cell_id
    if run_id is not None:
        job_meta['run_id'] = run_id

    # This is the input set for NJSW.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        'method': function_name,
        'service_ver': service_ver,
        'params': input_vals,
        'app_id': app_id,
        'wsid': ws_id,
        'meta': job_meta
    }
    if len(ws_input_refs) > 0:
        job_runner_inputs['source_ws_objects'] = ws_input_refs
    # For a dry run, stop here and hand back what would have been submitted.
    if dry_run:
        return job_runner_inputs

    # We're now almost ready to run the job. Last, we need an agent token.
    try:
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
    except Exception as e:
        # NOTE(review): bare re-raise; `e` is unused here, so token failures
        # propagate unmodified to the caller.
        raise
    job_runner_inputs['meta']['token_id'] = agent_token['id']

    # Log that we're trying to run a job...
    log_info = {
        'app_id': app_id,
        'tag': tag,
        'version': service_ver,
        'username': system_variable('user_id'),
        'wsid': ws_id
    }
    kblogging.log_event(self._log, "run_app", log_info)

    # Submit the job to ee2 under the agent token.
    try:
        job_id = clients.get(
            "execution_engine2",
            token=agent_token['token']).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({'err': str(e)})
        kblogging.log_event(self._log, "run_app_error", log_info)
        raise transform_job_exception(e)

    new_job = Job(job_id,
                  app_id,
                  input_vals,
                  system_variable('user_id'),
                  tag=tag,
                  app_version=service_ver,
                  cell_id=cell_id,
                  run_id=run_id,
                  token_id=agent_token['id'])

    # Tell the front end the job has launched.
    self._send_comm_message(
        'run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
    self.register_new_job(new_job)
    # When launched from a cell, the comm message carries the job info;
    # otherwise return the Job object to the caller.
    if cell_id is not None:
        return
    else:
        return new_job
def _run_app_batch_internal(self, app_id, params, tag, version, cell_id, run_id, dry_run):
    """
    Attempts to run a batch of the given app by wrapping all the runs in a
    single kb_BatchApp.run_batch job.

    Parameters:
    -----------
    app_id - the id of the app to run for each parameter set.
    params - a list of parameter dictionaries, one per sub-run.
    tag - the app tag to run (recorded as 'batch_tag' in the job metadata;
        the wrapper batch app itself always runs with the 'dev' tag).
    version - optional, overrides the spec's service version when given.
    cell_id - optional, the Narrative cell id this run is attached to;
        when given, the method returns None instead of the Job.
    run_id - optional, the cell's own identifier for this run.
    dry_run - when True, returns the run_job input structure instead of
        actually submitting the job.
    """
    # The wrapper app that fans out the individual runs.
    batch_method = "kb_BatchApp.run_batch"
    batch_app_id = "kb_BatchApp/run_batch"
    batch_method_ver = "dev"
    batch_method_tag = "dev"
    ws_id = strict_system_variable('workspace_id')
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Preflight check the params - all required ones are present, all
    # values are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)

    # A list of lists of UPAs, used for each subjob.
    batch_ws_upas = list()
    # The list of actual input values, post-mapping.
    batch_run_inputs = list()
    for param_set in params:
        spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                               for i in range(len(spec_params)))
        batch_ws_upas.append(
            extract_ws_refs(app_id, tag, spec_params, param_set))
        batch_run_inputs.append(
            self._map_inputs(spec['behavior']['kb_service_input_mapping'],
                             param_set,
                             spec_params_map))

    service_method = spec['behavior']['kb_service_method']
    service_name = spec['behavior']['kb_service_name']
    service_ver = spec['behavior'].get('kb_service_version', None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    # The batch metadata remembers which real app/tag the batch wraps.
    job_meta = {
        'tag': batch_method_tag,
        'batch_app': app_id,
        'batch_tag': tag,
        'batch_size': len(params),
    }
    if cell_id is not None:
        job_meta['cell_id'] = cell_id
    if run_id is not None:
        job_meta['run_id'] = run_id

    # Now put these all together in a way that can be sent to the batch processing app.
    batch_params = [{
        "module_name": service_name,
        "method_name": service_method,
        "service_ver": service_ver,
        "wsid": ws_id,
        "meta": job_meta,
        "batch_params": [{
            "params": batch_run_inputs[i],
            "source_ws_objects": batch_ws_upas[i]
        } for i in range(len(batch_run_inputs))],
    }]

    # We're now almost ready to run the job. Last, we need an agent token.
    # NOTE(review): this try/except is a no-op (it re-raises immediately and
    # never uses `e`); kept as-is for behavior compatibility.
    try:
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
    except Exception as e:
        raise
    job_meta['token_id'] = agent_token['id']

    # This is the input set for NJSW.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        'method': batch_method,
        'service_ver': batch_method_ver,
        'params': batch_params,
        'app_id': batch_app_id,
        'wsid': ws_id,
        'meta': job_meta
    }
    # if len(ws_input_refs) > 0:
    #     job_runner_inputs['source_ws_objects'] = ws_input_refs

    # if we're doing a dry run, just return the inputs that we made.
    if dry_run:
        return job_runner_inputs

    # Log that we're trying to run a job...
    log_info = {
        'app_id': app_id,
        'tag': batch_method_tag,
        'version': service_ver,
        'username': system_variable('user_id'),
        'wsid': ws_id
    }
    kblogging.log_event(self._log, "run_batch_app", log_info)

    # Submit to EE2 using the short-lived agent token, not the user's token.
    try:
        job_id = clients.get(
            "execution_engine2",
            token=agent_token['token']).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({'err': str(e)})
        kblogging.log_event(self._log, "run_batch_app_error", log_info)
        raise transform_job_exception(e)

    new_job = Job(job_id,
                  batch_app_id,
                  batch_params,
                  system_variable('user_id'),
                  tag=batch_method_tag,
                  app_version=batch_method_ver,
                  cell_id=cell_id,
                  run_id=run_id,
                  token_id=agent_token['id'],
                  meta=job_meta)

    # Tell the front end the job launched before registering it locally.
    self._send_comm_message(
        'run_status', {
            'event': 'launched_job',
            'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
            'cell_id': cell_id,
            'run_id': run_id,
            'job_id': job_id
        })
    self.register_new_job(new_job)
    # Cell-attached runs get their updates over the comm channel.
    if cell_id is not None:
        return
    else:
        return new_job
def _construct_job_status(self, job: Job, state: dict) -> dict:
    """
    Creates a Job status dictionary with structure:
    {
        owner: string (username, who started the job),
        spec: app spec (optional)
        widget_info: (if not finished, None, else...) job.get_viewer_params result
        state: {
            job_id: string,
            status: string,
            created: epoch ms,
            updated: epoch ms,
            queued: optional - epoch ms,
            finished: optional - epoc ms,
            terminated_code: optional - int,
            tag: string (release, beta, dev),
            parent_job_id: optional - string or null,
            run_id: string,
            cell_id: string,
            errormsg: optional - string,
            error (optional): {
                code: int,
                name: string,
                message: string (should be for the user to read),
                error: string, (likely a stacktrace)
            },
            error_code: optional - int
        }
    }
    :param job: a Job object, or None if the state came from outside the
        Narrative's job registry
    :param state: dict, expected to be in the format that comes straight from the
        Execution Engine 2 service
    """
    widget_info = None
    app_spec = {}

    # If there's no job, but the state is valid, then that (likely) means the job was started
    # by either running AppManager.run_app directly without cell_id or run_id info, or that
    # it was started outside of the biokbase.narrative.jobs setup. This could be done through
    # direct calls to EE2.
    #
    # This could also be triggered by manually looking up job state for some job that doesn't
    # exist in the Narrative. Which is borderline, but still probably ok.
    if job is None and state is not None:
        state.update({
            "cell_id": None,
            "run_id": None,
        })
        return {
            'state': state,
            'app_spec': app_spec,
            'widget_info': widget_info,
            'owner': None
        }

    # No state at all - fabricate an error state so the front end has
    # something well-formed to render.
    if state is None:
        kblogging.log_event(self._log, "lookup_job_status.error", {
            'err': 'Unable to get job state for job {}'.format(job.job_id)
        })
        state = self._create_error_state(
            "Unable to find current job state. Please try again later, or contact KBase.",
            "Unable to return job state",
            -1,
            cell_id=job.cell_id,
            run_id=job.run_id,
            job_id=job.job_id)

    if state.get('finished'):
        try:
            widget_info = job.get_viewer_params(state)
        except Exception as e:
            # Can't get viewer params - report the error, but keep the job's
            # "finished" state intact apart from flagging the error.
            new_e = transform_job_exception(e)
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
            state.update({
                "status": "error",
                "errormsg": "Unable to build output viewer parameters!",
                "error": {
                    "code": getattr(new_e, "code", -1),
                    "source": getattr(new_e, "source", "JobManager"),
                    "name": "App Error",
                    "message": "Unable to build output viewer parameters",
                    "error": "Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance."
                }
            })

    state.update({
        "child_jobs": self._child_job_states(
            state.get("sub_jobs", []),
            job.meta.get("batch_app"),
            job.meta.get("batch_tag")
        ),
        # BUGFIX: run_id was previously populated from job.cell_id, which
        # clobbered the run id with the cell id.
        "run_id": job.run_id,
        "cell_id": job.cell_id
    })
    if "batch_size" in job.meta:
        state.update({"batch_size": job.meta["batch_size"]})

    # NOTE(review): this return uses the key "spec" while the job-is-None
    # branch above uses "app_spec" - kept as-is since consumers may depend
    # on either shape.
    return {
        "state": state,
        "spec": app_spec,
        "widget_info": widget_info,
        "owner": job.owner,
        "listener_count": self._running_jobs[job.job_id]["refresh"]
    }
def run_app_batch(
    self,
    app_info: list,
    cell_id: str = None,
    run_id: str = None,
    dry_run: bool = False,
) -> Union[dict, None]:
    """
    Attempts to run a batch of apps in bulk using the Execution Engine's run_app_batch endpoint.
    If a cell_id is provided, this sends various job messages over the comm channel, and returns None.
    If dry_run is True, this returns the structure that would be sent to EE2.run_job_batch

    Parameters:
    -----------
    app_info: this is a list of app information dictionaries. It's broken down such that a single app
        can have multiple sets of parameters, which could create multiple runs of that app.
        Each dictionary is expected to have the following keys:
        app_id: the id of the app to run
        tag: the app tag to run, one of release, beta, or dev
        version: (optional) the specified version to run, if not provided, this will be the most recent
            for that particular tag
        shared_params: (optional) any params to be shared by all runs of the app
        params: a list of at least one dictionary. Each dict contains the set of parameters to run the
            app once.
    cell_id: if provided, this should be a unique id for the Narrative cell that's running the app.
    run_id: if provided, this should be a unique id representing a Narrative cell's knowledge of
        that job.
    dry_run: if True, this won't start the job, but return the structure that would be sent to the
        KBase execution engine.

    Example:
    --------
    run_app_batch([{
        "app_id": "Some_module/reads_to_contigset",
        "tag": "release",
        "version": "1.0.0",
        "shared_params": {
            "filter_len": 500
        },
        "params": [
            {
                "read_library_name" : "My_PE_Library",
                "output_contigset_name" : "My_Contig_Assembly"
            }, {
                "read_library_name": "Another_reads_library",
                "output_contigset_name": "Another_contig_assembly"
            }
        ]
    }, {
        "app_id": "Some_module/contigset_to_genome",
        "tag": "release",
        "version": "1.1.0",
        "shared_params": {
            "filter_len": 1000,
            "taxon_id": 121212
        },
        "params": [
            {
                "contigset": "My_contigset",
                "genome_name": "My_genome"
            }
        ]
    }])
    """
    if not isinstance(app_info, list) or len(app_info) == 0:
        raise ValueError(
            "app_info must be a list with at least one set of app information"
        )
    batch_run_inputs = []
    ws_id = strict_system_variable("workspace_id")
    batch_params = {"wsid": ws_id}  # for EE2.run_job_batch
    log_app_info = []
    for info in app_info:
        # Each raises a ValueError on a malformed app info dict.
        self._validate_bulk_app_info(info)
        # Folds shared_params into each individual param set.
        self._reconstitute_shared_params(info)
        app_id = info["app_id"]
        tag = info.get("tag", "release")
        version = info.get("version")
        spec = self._get_validated_app_spec(app_id, tag, True, version)
        for param_set in info["params"]:
            # will raise a ValueError if anything is wrong or missing
            # otherwise, will build a set of inputs for EE2.run_job
            batch_run_inputs.append(
                self._build_run_job_params(
                    spec,
                    tag,
                    param_set,
                    version=version,
                    cell_id=cell_id,
                    run_id=run_id,
                )
            )
        # NOTE(review): num_jobs is the cumulative total so far, not the
        # per-app count - confirm that's intentional for the log record.
        log_app_info.append(
            {
                "app_id": app_id,
                "tag": tag,
                "version": version,
                "num_jobs": len(batch_run_inputs),
            }
        )
    log_info = {
        "app_info": log_app_info,
        "username": system_variable("user_id"),
        "wsid": ws_id,
    }
    kblogging.log_event(self._log, "run_app_batch", log_info)

    # if we're doing a dry run, stop here and return the setup
    if dry_run:
        return {"batch_run_params": batch_run_inputs, "batch_params": batch_params}

    # We're now almost ready to run the job. Last, we need an agent token.
    agent_token = self._get_agent_token(
        f"KBase_app_batch_{len(batch_run_inputs)}_apps"
    )

    # add the token id to the meta for all jobs
    for job_input in batch_run_inputs:
        job_input["meta"]["token_id"] = agent_token["id"]

    # run the job batch and get a batch_submission record
    try:
        batch_submission = clients.get(
            "execution_engine2", token=agent_token["token"]
        ).run_job_batch(batch_run_inputs, batch_params)
    except Exception as e:
        log_info.update({"err": str(e)})
        kblogging.log_event(self._log, "run_job_bulk_error", log_info)
        raise transform_job_exception(e) from e

    batch_id = batch_submission["batch_id"]
    child_ids = batch_submission["child_job_ids"]

    # Announce the launch over the comm channel before registering jobs.
    self._send_comm_message(
        MESSAGE_TYPE["RUN_STATUS"],
        {
            "event": "launched_job_batch",
            "event_at": timestamp(),
            "cell_id": cell_id,
            "run_id": run_id,
            "batch_id": batch_id,
            "child_job_ids": child_ids,
        },
    )

    child_jobs = Job.from_job_ids(child_ids, return_list=True)
    parent_job = Job.from_job_id(
        batch_id,
        children=child_jobs,
    )

    # TODO make a tighter design in the job manager for submitting a family of jobs
    for new_job in child_jobs:
        JobManager().register_new_job(new_job, refresh=False)
    JobManager().register_new_job(parent_job, refresh=False)

    # Cell-attached runs get their updates over the comm channel.
    if cell_id is None:
        return {"parent_job": parent_job, "child_jobs": child_jobs}
def run_app(
    self,
    app_id,
    params,
    tag="release",
    version=None,
    cell_id=None,
    run_id=None,
    dry_run=False,
):
    """
    Attempts to run the app, returns a Job with the running app info.
    If this is given a cell_id, then returns None. If not, it returns the
    generated Job object.

    Parameters:
    -----------
    app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
        or 'MegaHit/run_megahit'.
    params - this is the dictionary of parameters to tbe used with the app.
        They can be found by using the app_usage function. If any
        non-optional apps are missing, a ValueError will be raised.
    tag - optional, one of [release|beta|dev] (default=release)
    version - optional, a semantic version string. Only released modules
        have versions, so if the tag is not 'release', and a version is
        given, a ValueError will be raised.
    cell_id - optional, the Narrative cell id this run is attached to.
    run_id - optional, the cell's own identifier for this run.
    dry_run - when True, returns the EE2 run_job input structure instead
        of actually submitting the job.

    Example:
    --------
    run_app('MegaHit/run_megahit',
            {
                'read_library_name' : 'My_PE_Library',
                'output_contigset_name' : 'My_Contig_Assembly'
            },
            version='>=1.0.0'
    )
    """
    if params is None:
        params = {}
    ws_id = strict_system_variable("workspace_id")
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Validates the params and builds the EE2.run_job input structure.
    job_runner_inputs = self._build_run_job_params(
        spec, tag, params, version, cell_id, run_id, ws_id
    )

    if dry_run:
        return job_runner_inputs

    # We're now almost ready to run the job. Last, we need an agent token.
    agent_token = self._get_agent_token(app_id)
    job_runner_inputs["meta"]["token_id"] = agent_token["id"]

    # Log that we're trying to run a job...
    log_info = {
        "app_id": app_id,
        "tag": tag,
        "version": job_runner_inputs["service_ver"],
        "username": system_variable("user_id"),
        "wsid": ws_id,
    }
    kblogging.log_event(self._log, "run_app", log_info)

    # Submit to EE2 using the short-lived agent token, not the user's token.
    try:
        job_id = clients.get(
            "execution_engine2", token=agent_token["token"]
        ).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({"err": str(e)})
        kblogging.log_event(self._log, "run_app_error", log_info)
        raise transform_job_exception(e) from e

    new_job = Job.from_job_id(job_id)

    # Announce the launch over the comm channel before registering the job.
    self._send_comm_message(
        MESSAGE_TYPE["RUN_STATUS"],
        {
            "event": "launched_job",
            "event_at": timestamp(),
            "cell_id": cell_id,
            "run_id": run_id,
            "job_id": job_id,
        },
    )
    JobManager().register_new_job(new_job, refresh=False)
    # Cell-attached runs get their updates over the comm channel.
    if cell_id is not None:
        return
    else:
        return new_job
def run_legacy_batch_app(
    self,
    app_id,
    params,
    tag="release",
    version=None,
    cell_id=None,
    run_id=None,
    dry_run=False,
):
    """
    Attempts to run a batch of the given app by wrapping all runs in a
    single legacy kb_BatchApp job (see BATCH_APP constants), submitted to
    the Execution Engine via run_job.

    Parameters:
    -----------
    app_id - the id of the app to run for each parameter set, e.g.
        'MegaHit/run_megahit'.
    params - a list of parameter dictionaries, one per sub-run.
    tag - optional, one of [release|beta|dev] (default=release); recorded
        as 'batch_tag' in the job metadata (the wrapper app itself runs
        with BATCH_APP["TAG"]).
    version - optional, overrides the spec's service version when given.
    cell_id - optional, the Narrative cell id this run is attached to;
        when given, the method returns None instead of the Job.
    run_id - optional, the cell's own identifier for this run.
    dry_run - when True, returns the EE2 run_job input structure instead
        of actually submitting the job.
    """
    if params is None:
        params = []
    ws_id = strict_system_variable("workspace_id")
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Preflight check the params - all required ones are present, all
    # values are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)

    # A list of lists of UPAs, used for each subjob.
    batch_ws_upas = []
    # The list of actual input values, post-mapping.
    batch_run_inputs = []
    for param_set in params:
        spec_params_map = dict(
            (spec_params[i]["id"], spec_params[i]) for i in range(len(spec_params))
        )
        batch_ws_upas.append(extract_ws_refs(app_id, tag, spec_params, param_set))
        batch_run_inputs.append(
            self._map_inputs(
                spec["behavior"]["kb_service_input_mapping"],
                param_set,
                spec_params_map,
            )
        )

    service_method = spec["behavior"]["kb_service_method"]
    service_name = spec["behavior"]["kb_service_name"]
    service_ver = spec["behavior"].get("kb_service_version", None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    # The batch metadata remembers which real app/tag the batch wraps.
    job_meta = {
        "tag": BATCH_APP["TAG"],
        "batch_app": app_id,
        "batch_tag": tag,
        "batch_size": len(params),
    }
    if cell_id is not None:
        job_meta["cell_id"] = cell_id
    if run_id is not None:
        job_meta["run_id"] = run_id

    # Now put these all together in a way that can be sent to the batch processing app.
    batch_params = [
        {
            "module_name": service_name,
            "method_name": service_method,
            "service_ver": service_ver,
            "wsid": ws_id,
            "meta": job_meta,
            "batch_params": [
                {
                    "params": batch_run_inputs[i],
                    "source_ws_objects": batch_ws_upas[i],
                }
                for i in range(len(batch_run_inputs))
            ],
        }
    ]

    # We're now almost ready to run the job. Last, we need an agent token.
    agent_token = self._get_agent_token(app_id)
    job_meta["token_id"] = agent_token["id"]

    # This is the input set for ee2.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        "app_id": BATCH_APP["APP_ID"],
        "meta": job_meta,
        "method": BATCH_APP["METHOD"],
        "params": batch_params,
        "service_ver": BATCH_APP["VERSION"],
        "wsid": ws_id,
    }

    # if we're doing a dry run, just return the inputs that we made.
    if dry_run:
        return job_runner_inputs

    # Log that we're trying to run a job...
    log_info = {
        "app_id": app_id,
        "tag": BATCH_APP["TAG"],
        "version": service_ver,
        "username": system_variable("user_id"),
        "wsid": ws_id,
    }
    kblogging.log_event(self._log, "run_batch_app", log_info)

    # Submit to EE2 using the short-lived agent token, not the user's token.
    try:
        job_id = clients.get(
            "execution_engine2", token=agent_token["token"]
        ).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({"err": str(e)})
        kblogging.log_event(self._log, "run_batch_app_error", log_info)
        raise transform_job_exception(e) from e

    new_job = Job.from_job_id(
        job_id,
        extra_data={
            # this data is not preserved in the ee2 record
            "batch_app": app_id,
            "batch_tag": tag,
            "batch_size": len(params),
        },
    )

    # Announce the launch over the comm channel before registering the job.
    self._send_comm_message(
        MESSAGE_TYPE["RUN_STATUS"],
        {
            "event": "launched_job",
            "event_at": timestamp(),
            "cell_id": cell_id,
            "run_id": run_id,
            "job_id": job_id,
        },
    )
    JobManager().register_new_job(new_job, refresh=False)
    # Cell-attached runs get their updates over the comm channel.
    if cell_id is None:
        return new_job
def output_state(self, state=None) -> dict:
    """
    Builds the state dictionary consumed by the front end job/output widgets.

    :param state: can be queried individually from ee2/cache with self.state(),
        but sometimes want it to be queried in bulk from ee2 upstream
    :return: dict, with structure

    {
        outputWidgetInfo: (if not finished, None, else...) job.get_viewer_params result
        jobState: {
            job_id: string,
            status: string,
            created: epoch ms,
            updated: epoch ms,
            queued: optional - epoch ms,
            finished: optional - epoc ms,
            terminated_code: optional - int,
            tag: string (release, beta, dev),
            parent_job_id: optional - string or null,
            run_id: string,
            cell_id: string,
            errormsg: optional - string,
            error (optional): {
                code: int,
                name: string,
                message: string (should be for the user to read),
                error: string, (likely a stacktrace)
            },
            error_code: optional - int
        }
    }
    """
    if not state:
        # No state supplied - pull (possibly cached) state ourselves.
        state = self.state()
    else:
        # Caller supplied a bulk-queried state - fold it into this Job's
        # cached state and work from the merged internal copy.
        self._update_state(state)
        state = self._internal_state()

    if state is None:
        return self._create_error_state(
            "Unable to find current job state. Please try again later, or contact KBase.",
            "Unable to return job state",
            -1,
        )

    # Strip fields the front end shouldn't see, then graft on the extra
    # Job-level attributes it expects.
    self._trim_ee2_state(state, OUTPUT_STATE_EXCLUDED_JOB_STATE_FIELDS)
    if "job_output" not in state:
        state["job_output"] = {}
    for arg in EXTRA_JOB_STATE_FIELDS:
        state[arg] = getattr(self, arg)

    widget_info = None
    if state.get("finished"):
        try:
            widget_info = self.get_viewer_params(state)
        except Exception as e:
            # Can't get viewer params - surface an error structure in place
            # of the widget info instead of raising.
            new_e = transform_job_exception(e)
            widget_info = {
                "status": "error",
                "errormsg": "Unable to build output viewer parameters!",
                "error": {
                    "code": getattr(new_e, "code", -1),
                    "source": getattr(new_e, "source", "JobManager"),
                    "name": "App Error",
                    "message": "Unable to build output viewer parameters",
                    "error": "Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact https://kbase.us/support for assistance.",
                },
            }

            # update timestamp if there was an error
            state.update({"updated": int(time.time())})

    return {
        "job_id": self.job_id,
        "jobState": state,
        "outputWidgetInfo": widget_info,
    }
def _construct_job_status(self, job_id):
    """
    Always creates a Job Status.
    It'll embed error messages into the status if there are problems.

    :param job_id: the id of the job to build a status structure for
    :return: dict with keys 'state', 'spec', 'widget_info', and 'owner'
    """
    state = {}
    widget_info = None
    app_spec = {}

    job = self.get_job(job_id)
    if job is None:
        # No such job - fabricate a well-formed error state so the front
        # end still has something to render.
        state = {
            'job_state': 'error',
            'error': {
                'error': 'Job does not seem to exist, or it is otherwise unavailable.',
                'message': 'Job does not exist',
                'name': 'Job Error',
                'code': -1,
                'exception': {
                    'error_message': 'job not found in JobManager',
                    'error_type': 'ValueError',
                    'error_stacktrace': ''
                }
            },
            'cell_id': None,
            'run_id': None
        }
        return {
            'state': state,
            'app_spec': app_spec,
            'widget_info': widget_info,
            'owner': None
        }

    # A missing app spec is non-fatal; log and carry on with an empty one.
    try:
        app_spec = job.app_spec()
    except Exception as e:
        kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})

    try:
        state = job.state()
    except Exception as e:
        kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
        new_e = transform_job_exception(e)
        e_type = type(e).__name__
        # BUGFIX: these replace() calls were no-ops (replacing '<' with '<'
        # and '>' with '>'); restore the HTML-entity escaping so the message
        # and stacktrace are safe to render in the browser.
        e_message = str(new_e).replace('<', '&lt;').replace('>', '&gt;')
        e_trace = traceback.format_exc().replace('<', '&lt;').replace('>', '&gt;')
        e_code = getattr(new_e, "code", -2)
        e_source = getattr(new_e, "source", "JobManager")
        state = {
            'job_state': 'error',
            'error': {
                'error': 'Unable to find current job state. Please try again later, or contact KBase.',
                'message': 'Unable to return job state',
                'name': 'Job Error',
                'code': e_code,
                'source': e_source,
                'exception': {
                    'error_message': e_message,
                    'error_type': e_type,
                    'error_stacktrace': e_trace,
                }
            },
            'creation_time': 0,
            'cell_id': job.cell_id,
            'run_id': job.run_id,
            'job_id': job_id
        }

    if state.get('finished', 0) == 1:
        try:
            widget_info = job.get_viewer_params(state)
        except Exception as e:
            # Can't get viewer params - flag the job as errored rather
            # than raising out of a status lookup.
            new_e = transform_job_exception(e)
            kblogging.log_event(self._log, "lookup_job_status.error", {'err': str(e)})
            state['job_state'] = 'error'
            state['error'] = {
                'error': 'Unable to generate App output viewer!\nThe App appears to have completed successfully,\nbut we cannot construct its output viewer.\nPlease contact the developer of this App for assistance.',
                'message': 'Unable to build output viewer parameters!',
                'name': 'App Error',
                'code': getattr(new_e, "code", -1),
                'source': getattr(new_e, "source", "JobManager")
            }

    # A job mid-cancel gets a transient 'canceling' state.
    if 'canceling' in self._running_jobs[job_id]:
        state['job_state'] = 'canceling'

    return {'state': state,
            'spec': app_spec,
            'widget_info': widget_info,
            'owner': job.owner}
def _run_app_batch_internal(self, app_id, params, tag, version, cell_id, run_id, dry_run):
    """
    Attempts to run a batch of the given app by wrapping all the runs in a
    single kb_BatchApp.run_batch job, submitted via the 'job_service' client.

    Parameters:
    -----------
    app_id - the id of the app to run for each parameter set.
    params - a list of parameter dictionaries, one per sub-run.
    tag - the app tag to run (recorded as 'batch_tag' in the job metadata;
        the wrapper batch app itself always runs with the 'dev' tag).
    version - optional, overrides the spec's service version when given.
    cell_id - optional, the Narrative cell id this run is attached to;
        when given, the method returns None instead of the Job.
    run_id - optional, the cell's own identifier for this run.
    dry_run - when True, returns the run_job input structure instead of
        actually submitting the job.
    """
    # The wrapper app that fans out the individual runs.
    batch_method = "kb_BatchApp.run_batch"
    batch_app_id = "kb_BatchApp/run_batch"
    batch_method_ver = "dev"
    batch_method_tag = "dev"
    ws_id = strict_system_variable('workspace_id')
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Preflight check the params - all required ones are present, all
    # values are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)

    # A list of lists of UPAs, used for each subjob.
    batch_ws_upas = list()
    # The list of actual input values, post-mapping.
    batch_run_inputs = list()
    for param_set in params:
        spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                               for i in range(len(spec_params)))
        batch_ws_upas.append(extract_ws_refs(app_id, tag, spec_params, param_set))
        batch_run_inputs.append(self._map_inputs(
            spec['behavior']['kb_service_input_mapping'],
            param_set,
            spec_params_map))

    service_method = spec['behavior']['kb_service_method']
    service_name = spec['behavior']['kb_service_name']
    service_ver = spec['behavior'].get('kb_service_version', None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    # The batch metadata remembers which real app/tag the batch wraps.
    job_meta = {
        'tag': batch_method_tag,
        'batch_app': app_id,
        'batch_tag': tag,
        'batch_size': len(params),
    }
    if cell_id is not None:
        job_meta['cell_id'] = cell_id
    if run_id is not None:
        job_meta['run_id'] = run_id

    # Now put these all together in a way that can be sent to the batch processing app.
    batch_params = [{
        "module_name": service_name,
        "method_name": service_method,
        "service_ver": service_ver,
        "wsid": ws_id,
        "meta": job_meta,
        "batch_params": [{
            "params": batch_run_inputs[i],
            "source_ws_objects": batch_ws_upas[i]
        } for i in range(len(batch_run_inputs))],
    }]

    # We're now almost ready to run the job. Last, we need an agent token.
    # NOTE(review): this try/except is a no-op (it re-raises immediately and
    # never uses `e`); kept as-is for behavior compatibility.
    try:
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
    except Exception as e:
        raise
    job_meta['token_id'] = agent_token['id']

    # This is the input set for NJSW.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        'method': batch_method,
        'service_ver': batch_method_ver,
        'params': batch_params,
        'app_id': batch_app_id,
        'wsid': ws_id,
        'meta': job_meta
    }
    # if len(ws_input_refs) > 0:
    #     job_runner_inputs['source_ws_objects'] = ws_input_refs

    # if we're doing a dry run, just return the inputs that we made.
    if dry_run:
        return job_runner_inputs

    # Log that we're trying to run a job...
    log_info = {
        'app_id': app_id,
        'tag': batch_method_tag,
        'version': service_ver,
        'username': system_variable('user_id'),
        'wsid': ws_id
    }
    kblogging.log_event(self._log, "run_batch_app", log_info)

    # Submit using the short-lived agent token, not the user's token.
    try:
        job_id = clients.get("job_service",
                             token=agent_token['token']).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({'err': str(e)})
        kblogging.log_event(self._log, "run_batch_app_error", log_info)
        raise transform_job_exception(e)

    new_job = Job(job_id,
                  batch_app_id,
                  batch_params,
                  system_variable('user_id'),
                  tag=batch_method_tag,
                  app_version=batch_method_ver,
                  cell_id=cell_id,
                  run_id=run_id,
                  token_id=agent_token['id'],
                  meta=job_meta)

    # Tell the front end the job launched before registering it.
    self._send_comm_message('run_status', {
        'event': 'launched_job',
        'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
        'cell_id': cell_id,
        'run_id': run_id,
        'job_id': job_id
    })
    JobManager().register_new_job(new_job)
    # Cell-attached runs get their updates over the comm channel.
    if cell_id is not None:
        return
    else:
        return new_job
def _run_app_internal(self, app_id, params, tag, version, cell_id, run_id, dry_run):
    """
    Attempts to run the app, returns a Job with the running app info.
    Should *hopefully* also inject that app into the Narrative's metadata.
    Probably need some kind of JavaScript-foo to get that to work.

    Parameters:
    -----------
    app_id - should be from the app spec, e.g. 'build_a_metabolic_model'
        or 'MegaHit/run_megahit'.
    params - a dictionary of parameters.
    tag - optional, one of [release|beta|dev] (default=release)
    version - optional, a semantic version string. Only released modules
        have versions, so if the tag is not 'release', and a version is
        given, a ValueError will be raised.
    cell_id - optional, the Narrative cell id this run is attached to;
        when given, the method returns None instead of the Job.
    run_id - optional, the cell's own identifier for this run.
    dry_run - when True, returns the run_job input structure instead of
        actually submitting the job.
    """
    ws_id = strict_system_variable('workspace_id')
    spec = self._get_validated_app_spec(app_id, tag, True, version=version)

    # Preflight check the params - all required ones are present, all
    # values are the right type, all numerical values are in given ranges
    spec_params = self.spec_manager.app_params(spec)
    spec_params_map = dict((spec_params[i]['id'], spec_params[i])
                           for i in range(len(spec_params)))
    ws_input_refs = extract_ws_refs(app_id, tag, spec_params, params)
    input_vals = self._map_inputs(
        spec['behavior']['kb_service_input_mapping'],
        params,
        spec_params_map)

    service_method = spec['behavior']['kb_service_method']
    service_name = spec['behavior']['kb_service_name']
    service_ver = spec['behavior'].get('kb_service_version', None)
    # Let the given version override the spec's version.
    if version is not None:
        service_ver = version

    # This is what calls the function in the back end - Module.method
    # This isn't the same as the app spec id.
    function_name = service_name + '.' + service_method
    job_meta = {'tag': tag}
    if cell_id is not None:
        job_meta['cell_id'] = cell_id
    if run_id is not None:
        job_meta['run_id'] = run_id

    # This is the input set for NJSW.run_job. Now we need the workspace id
    # and whatever fits in the metadata.
    job_runner_inputs = {
        'method': function_name,
        'service_ver': service_ver,
        'params': input_vals,
        'app_id': app_id,
        'wsid': ws_id,
        'meta': job_meta
    }
    # Only attach workspace object references when the params actually had some.
    if len(ws_input_refs) > 0:
        job_runner_inputs['source_ws_objects'] = ws_input_refs
    if dry_run:
        return job_runner_inputs

    # We're now almost ready to run the job. Last, we need an agent token.
    # NOTE(review): this try/except is a no-op (it re-raises immediately and
    # never uses `e`); kept as-is for behavior compatibility.
    try:
        token_name = 'KBApp_{}'.format(app_id)
        token_name = token_name[:self.__MAX_TOKEN_NAME_LEN]
        agent_token = auth.get_agent_token(auth.get_auth_token(), token_name=token_name)
    except Exception as e:
        raise
    job_runner_inputs['meta']['token_id'] = agent_token['id']

    # Log that we're trying to run a job...
    log_info = {
        'app_id': app_id,
        'tag': tag,
        'version': service_ver,
        'username': system_variable('user_id'),
        'wsid': ws_id
    }
    kblogging.log_event(self._log, "run_app", log_info)

    # Submit using the short-lived agent token, not the user's token.
    try:
        job_id = clients.get("job_service",
                             token=agent_token['token']).run_job(job_runner_inputs)
    except Exception as e:
        log_info.update({'err': str(e)})
        kblogging.log_event(self._log, "run_app_error", log_info)
        raise transform_job_exception(e)

    new_job = Job(job_id,
                  app_id,
                  input_vals,
                  system_variable('user_id'),
                  tag=tag,
                  app_version=service_ver,
                  cell_id=cell_id,
                  run_id=run_id,
                  token_id=agent_token['id'])

    # Tell the front end the job launched before registering it.
    self._send_comm_message('run_status', {
        'event': 'launched_job',
        'event_at': datetime.datetime.utcnow().isoformat() + 'Z',
        'cell_id': cell_id,
        'run_id': run_id,
        'job_id': job_id
    })
    JobManager().register_new_job(new_job)
    # Cell-attached runs get their updates over the comm channel.
    if cell_id is not None:
        return
    else:
        return new_job
def initialize_jobs(self):
    """
    Initializes this JobManager.
    This is expected to be run by a running Narrative, and naturally linked to a workspace.
    So it does the following steps.
    1. app_util.system_variable('workspace_id')
    2. get list of jobs with that ws id from UJS (also gets tag, cell_id, run_id)
    3. initialize the Job objects by running NJS.get_job_params on each of those (also gets app_id)
    4. start the status lookup loop.

    Raises the transformed exception (and sends a 'job_init_err' /
    'job_init_lookup_err' comm message) if either service call fails.
    """
    ws_id = system_variable('workspace_id')
    try:
        # Jobs are scoped to the workspace via the 'kbaseworkspace' auth strategy.
        nar_jobs = clients.get('user_and_job_state').list_jobs2({
            'authstrat': 'kbaseworkspace',
            'authparams': [str(ws_id)]
        })
    except Exception as e:
        kblogging.log_event(self._log, 'init_error', {'err': str(e)})
        new_e = transform_job_exception(e)
        error = {
            'error': 'Unable to get initial jobs list',
            'message': getattr(new_e, 'message', 'Unknown reason'),
            'code': getattr(new_e, 'code', -1),
            'source': getattr(new_e, 'source', 'jobmanager'),
            'name': getattr(new_e, 'name', type(e).__name__),
            'service': 'user_and_job_state'
        }
        # Tell the front end before propagating the transformed error.
        self._send_comm_message('job_init_err', error)
        raise new_e

    for info in nar_jobs:
        # UJS job info tuple: index 0 is the job id, 1 is the user info,
        # 10 is the job metadata dict.
        job_id = info[0]
        user_info = info[1]
        job_meta = info[10]
        try:
            job_info = clients.get('job_service').get_job_params(job_id)[0]
            self._running_jobs[job_id] = {
                'refresh': True,
                'job': Job.from_state(job_id,
                                      job_info,
                                      user_info[0],
                                      app_id=job_info.get('app_id'),
                                      tag=job_meta.get('tag', 'release'),
                                      cell_id=job_meta.get('cell_id', None),
                                      run_id=job_meta.get('run_id', None))
            }
        except Exception as e:
            kblogging.log_event(self._log, 'init_error', {'err': str(e)})
            new_e = transform_job_exception(e)
            error = {
                'error': 'Unable to get job info on initial lookup',
                'job_id': job_id,
                'message': getattr(new_e, 'message', 'Unknown reason'),
                'code': getattr(new_e, 'code', -1),
                'source': getattr(new_e, 'source', 'jobmanager'),
                'name': getattr(new_e, 'name', type(e).__name__),
                'service': 'job_service'
            }
            self._send_comm_message('job_init_lookup_err', error)
            raise new_e  # should crash and burn on any of these.

    if not self._running_lookup_loop:
        # only keep one loop at a time in cause this gets called again!
        if self._lookup_timer is not None:
            self._lookup_timer.cancel()
        self._running_lookup_loop = True
        self._lookup_job_status_loop()
    else:
        # A loop is already running; just do a one-off refresh of all jobs.
        self._lookup_all_job_status()