def test_update_task(tables, client, new_job, new_task):
    """PUT on a task answers 201 and stores the new hostname, one parameter and one env variable."""
    # Arrange: attach the task to the job and persist it before updating.
    new_job.add_task(new_task)
    new_job.save()

    payload = {
        'hostname': 'remotehost',
        'cmdsegments': {
            'envs': [{'name': 'ENV', 'value': 'path'}],
            'params': [{'name': '--rank', 'value': '3'}],
        },
    }

    # Act
    url = ENDPOINT + '/{}'.format(new_task.id)
    resp = client.put(url, headers=HEADERS, data=json.dumps(payload))
    body = json.loads(resp.data.decode('utf-8'))

    # Assert
    assert resp.status_code == HTTPStatus.CREATED
    returned_task = body['task']
    assert returned_task['hostname'] == 'remotehost'
    assert Task.get(int(returned_task['id'])).number_of_params == 1
    assert Task.get(int(returned_task['id'])).number_of_env_vars == 1
def remove_task(job_id: JobId, task_id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Removes Task from Job.

    Only the owner of the job (the identity returned by `get_jwt_identity()`) is allowed to do it.
    Returns a `(content, status)` pair; on success `content['job']` holds `job.as_dict()`.
    """
    job = None
    try:
        job = Job.get(job_id)
        task = Task.get(task_id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this ownership check.
        if job.user_id != get_jwt_identity():
            raise AssertionError('Not an owner')
        job.remove_task(task)
    except NoResultFound:
        # `job` is still None only when the job lookup itself has failed.
        if job is None:
            content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
        else:
            content, status = {'msg': TASK['not_found']}, HTTPStatus.NOT_FOUND.value
    except InvalidRequestException as e:
        content, status = {'msg': JOB['tasks']['remove']['failure']['not_found'].format(reason=e)}, \
            HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        content, status = {'msg': JOB['tasks']['remove']['failure']['assertions'].format(reason=e)}, \
            HTTPStatus.FORBIDDEN.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content, status = {'msg': JOB['tasks']['remove']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    # Plain return (not `finally: return`), so an unexpected error is never replaced
    # by an UnboundLocalError on `content`/`status`.
    return content, status
def business_get_log(id: TaskId, tail: bool) -> Tuple[Content, HttpStatusCode]:
    """Fetches log file created by spawned task (output redirection).

    It relies on `task_nursery.fetch_log`, which is called with the task's hostname
    and the username of the parent job's owner.
    If file does not exist there's no way to fetch it from database (currently).
    File names must be named in one fashion (standard defined in `task_nursery.fetch_log`,
    currently: `task_<id>.log`). Renaming them manually will lead to inconsistency or 'Not Found' errors.

    `tail` argument allows for returning only the last few lines (10 is default for `tail` program).
    For more details, see: `task_nursery.fetch_log`.

    Returns a `(content, status)` pair; on success content holds 'path' and 'output_lines'.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        # Explicit raises instead of `assert`, which is stripped under `python -O`.
        if not task.hostname:
            raise AssertionError('hostname is empty')
        if not parent_job.user:
            raise AssertionError('user does not exist')
        output_gen, log_path = task_nursery.fetch_log(task.hostname, parent_job.user.username, task.id, tail)
        # Consume the output inside the try block, so that an error raised while
        # iterating is mapped to a response by the handlers below as well.
        output_lines = list(output_gen)
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ExitCodeError as e:
        content, status = {'msg': TASK['get_log']['failure']['not_found'].format(location=e)}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['get_log']['failure']['assertions'].format(reason=e)}, 422
    except (ConnectionErrorException, AuthenticationException, UnknownHostException) as e:
        content, status = {'msg': SSH['failure']['connection'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['get_log']['success'], 'path': log_path, 'output_lines': output_lines}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_spawn(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Spawns command stored in Task db record (task.full_command).

    It won't allow for spawning task which is currently running (sync + status check).
    If spawn operation has succeeded then `running` status is set and the pid
    returned by `task_nursery.spawn` is stored in the record.

    Returns a `(content, status)` pair; on success content holds the new 'pid'.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        # Preconditions are raised explicitly: `assert` is stripped under `python -O`,
        # which would allow e.g. spawning a task that is already running.
        if task.status is TaskStatus.running:
            raise AssertionError('task is already running')
        if not task.full_command:
            raise AssertionError('command is empty')
        if not task.hostname:
            raise AssertionError('hostname is empty')
        if not parent_job.user:
            raise AssertionError('user does not exist')

        pid = task_nursery.spawn(task.full_command, task.hostname, parent_job.user.username,
                                 name_appendix=str(task.id))
        task.pid = pid
        task.status = TaskStatus.running
        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['spawn']['failure']['assertions'].format(reason=e)}, 422
    except SpawnError as e:
        log.warning(e)
        content, status = {'msg': TASK['spawn']['failure']['backend'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        log.info('Task {} is now: {}'.format(task.id, task.status.name))
        content, status = {'msg': TASK['spawn']['success'], 'pid': pid}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def synchronize(task_id: TaskId) -> None:
    """Updates the state of a Task object stored in database.

    It compares current db record with list of active screen sessions (their pids in general)
    reported by `task_nursery.running` for the host and user defined by that record.

    If task_nursery is unable to fetch active screen sessions then
    the new state is always set to unsynchronized.

    If task.pid is not alive (db record is outdated), then it
    makes transition from last known state to a new state:

    state before sync   => state applied after sync
    -----------------------------------------------
    running             => terminated
    unsynchronized      => not_running
    """
    log.debug('Syncing Task {}...'.format(task_id))
    # Bound up-front so the generic handler below can tell whether the lookup succeeded.
    task = None
    try:
        task = Task.get(task_id)
        # Explicit raises instead of `assert`, which is stripped under `python -O`.
        if not task.host:
            raise AssertionError('hostname is empty')
        if not task.user:
            raise AssertionError('user does not exist')
        active_sessions_pids = task_nursery.running(host=task.host, user=task.user.username)
    except NoResultFound:
        # This exception must be handled within try/except block when using Task.get()
        # In other words, methods decorated with @synchronize_task_record must handle this case by themselves!
        log.warning(
            'Task {} could not be found (also synchronized). Failing without taking any action...'
            .format(task_id))
    except Exception as e:
        # Covers failed preconditions above; task_nursery.running pssh exceptions are also caught here
        log.error('Unable to synchronize Task {}, reason: {}'.format(task_id, e))
        if task is None:
            # The lookup itself failed with an unexpected error, so there is no record to mark.
            return
        log.debug('Task {} status was: {}'.format(task_id, task.status.name))
        task.status = TaskStatus.unsynchronized
        task.save()
        log.debug('Task {} is now: {}'.format(task_id, task.status.name))
    else:
        log.debug('[BEFORE SYNC] Task {} status was: {}'.format(task_id, task.status.name))
        change_status_msg = '[AFTER SYNC] Task {id} is now: {curr_status}'
        if task.pid not in active_sessions_pids:
            # The stored pid is no longer alive: apply the transition from the table above.
            if task.status is TaskStatus.running:
                task.status = TaskStatus.terminated
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            if task.status is TaskStatus.unsynchronized:
                task.status = TaskStatus.not_running
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            task.pid = None
            task.save()
def destroy(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_destroy`.

    Answers 404 when the task does not exist and 403 when `task.user_id` differs from
    the identity returned by `get_jwt_identity()`.
    """
    try:
        task = Task.get(id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this ownership check.
        if task.user_id != get_jwt_identity():
            raise AssertionError('Not an owner')
    except NoResultFound:
        return {'msg': T['not_found']}, 404
    except AssertionError:
        return {'msg': G['unpriviliged']}, 403
    # No `finally: return` here: it would replace any unexpected error
    # with an UnboundLocalError.
    return business_destroy(id)
def get_log(id: TaskId, tail: bool) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_get_log`.

    Access is granted when `get_jwt_identity()` equals `task.user_id` or `is_admin()` is true.
    Answers 404 when the task does not exist and 403 when access is denied.
    """
    try:
        task = Task.get(id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this access check.
        if not (get_jwt_identity() == task.user_id or is_admin()):
            raise AssertionError()
    except NoResultFound:
        return {'msg': T['not_found']}, 404
    except AssertionError:
        return {'msg': G['unpriviliged']}, 403
    # No `finally: return` here: it would replace any unexpected error
    # with an UnboundLocalError.
    return business_get_log(id, tail)
def spawn(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_spawn`.

    Answers 404 when the task does not exist and 403 when `task.user_id` differs from
    the identity returned by `get_jwt_identity()`.
    """
    try:
        task = Task.get(id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this ownership check.
        if task.user_id != get_jwt_identity():
            raise AssertionError('Not an owner')
    except NoResultFound as e:
        log.error(e)
        return {'msg': TASK['not_found']}, 404
    except AssertionError:
        return {'msg': GENERAL['unprivileged']}, 403
    # No `finally: return` here: it would replace any unexpected error
    # with an UnboundLocalError.
    return business_spawn(id)
def update(id: TaskId, newValues: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_update`.

    Answers 404 when the task does not exist and 403 when `task.user_id` differs from
    the identity returned by `get_jwt_identity()`.
    """
    try:
        task = Task.get(id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this ownership check.
        if task.user_id != get_jwt_identity():
            raise AssertionError('Not an owner')
    except NoResultFound:
        return {'msg': TASK['not_found']}, 404
    except AssertionError:
        return {'msg': GENERAL['unprivileged']}, 403
    # No `finally: return` here: it would replace any unexpected error
    # with an UnboundLocalError.
    return business_update(id, newValues)
def business_get(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Fetches one Task db record.

    Returns a `(content, status)` pair: 200 with the record under 'task' (`task.as_dict()`),
    404 when there is no such record, 500 on any other error (which is logged).
    """
    try:
        task = Task.get(id)
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['get']['success'], 'task': task.as_dict()}, 200
    # Plain return instead of `finally: return`: the latter discards in-flight exceptions
    # and turns a failure in the `else` branch into an UnboundLocalError.
    return content, status
def terminate(id: TaskId, gracefully: Optional[bool] = True) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_terminate`.

    Access is granted when `get_jwt_identity()` equals `task.user_id` or `is_admin()` is true.
    Answers 404 when the task does not exist and 403 when access is denied.
    `gracefully` is passed through to `business_terminate` unchanged.
    """
    try:
        task = Task.get(id)
        # Raised explicitly instead of using `assert`: assert statements are stripped
        # under `python -O`, which would silently disable this access check.
        if not (get_jwt_identity() == task.user_id or is_admin()):
            raise AssertionError()
    except NoResultFound:
        return {'msg': T['not_found']}, 404
    except AssertionError:
        return {'msg': G['unpriviliged']}, 403
    # No `finally: return` here: it would replace any unexpected error
    # with an UnboundLocalError.
    return business_terminate(id, gracefully)
def get_log(id: TaskId, tail: bool) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_get_log`.

    Access is granted when `is_admin()` is true or the parent job's `user_id` equals
    the identity returned by `get_jwt_identity()`.
    Answers 404 when the task or its parent job does not exist and 403 when access is denied.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        if not (is_admin() or parent_job.user_id == get_jwt_identity()):
            raise ForbiddenException("not an owner")
    except NoResultFound:
        return {'msg': TASK['not_found']}, 404
    except ForbiddenException:
        return {'msg': GENERAL['unprivileged']}, 403
    # No `finally: return` here: it would discard any unexpected error and raise
    # UnboundLocalError instead, because `content`/`status` would never be bound.
    return business_get_log(id, tail)
def update(id: TaskId, newValues: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Authorizes the caller and delegates to `business_update`.

    Access is granted when `is_admin()` is true or the parent job's `user_id` equals
    the identity returned by `get_jwt_identity()`.
    Answers 404 when the task or its parent job does not exist and 403 when access is denied.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        if not (is_admin() or parent_job.user_id == get_jwt_identity()):
            raise ForbiddenException('not an owner')
    except NoResultFound:
        return {'msg': TASK['not_found']}, 404
    except ForbiddenException:
        return {'msg': GENERAL['unprivileged']}, 403
    # No `finally: return` here: it would discard any unexpected error and raise
    # UnboundLocalError instead, because `content`/`status` would never be bound.
    return business_update(id, newValues)
def business_update(id: TaskId, newValues: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Updates certain fields of a Task db record, including command field and segments.

    Keys of `newValues` that are handled: 'hostname', 'command' (also refreshes `gpu_id`
    via `parse_gpu_id_from_command`) and 'cmdsegments' - a dict with 'envs' and 'params'
    lists of `{'name': ..., 'value': ...}` items which replace all current segments.
    Any other key is ignored. A running task cannot be updated (422).

    Returns a `(content, status)` pair; on success (201) content holds `task.as_dict()`.
    """
    try:
        task = Task.get(id)
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if task.status is TaskStatus.running:
            raise AssertionError("Cannot update task which is already running")

        for key, value in newValues.items():
            if key == 'hostname':
                task.hostname = value
            elif key == 'command':
                task.gpu_id = parse_gpu_id_from_command(value)
                task.command = value
            elif key == 'cmdsegments':
                # Iterate over a snapshot. The previous code looped over `task.cmd_segments`
                # directly (twice, see its FIXME) and still skipped elements, which is the
                # usual symptom of removing items from the list being iterated.
                for segment in list(task.cmd_segments):
                    task.remove_cmd_segment(segment)

                # Payload key -> segment type; both groups are processed the same way.
                segment_groups = (
                    ('envs', SegmentType.env_variable),
                    ('params', SegmentType.parameter),
                )
                for group, segment_type in segment_groups:
                    for segment in value[group]:
                        # Reuse an already stored segment of that type and name, if any.
                        new_segment = CommandSegment.query.filter(
                            CommandSegment.segment_type == segment_type,
                            CommandSegment.name == segment['name']).first()
                        if new_segment is None:
                            new_segment = CommandSegment(name=segment['name'], _segment_type=segment_type)
                        task.add_cmd_segment(new_segment, segment['value'])
        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['update']['failure']['assertions'].format(reason=e)}, 422
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['update']['success'], 'task': task.as_dict()}, 201
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_terminate(id: TaskId, gracefully: Optional[bool] = True) -> Tuple[Content, HttpStatusCode]:
    """Sends SIGINT (default) or SIGKILL to process with pid that is stored in Task db record.

    In order to send SIGKILL, pass `gracefully=False` to `terminate` function.

    Note that:
    1) `exit_code` is related to executing kill operation, not killed process.
    2) termination signal should be respected by most processes, however this
        function does not guarantee stopping the process!

    Returns a `(content, status)` pair; 'exit_code' is included for 200 and 202 answers.
    """
    # Bound up-front so the ExitCodeError handler can always report it, even when
    # the error is raised before `task_nursery.terminate` has returned a value.
    exit_code = None
    try:
        task = Task.get(id)
        # Explicit raises instead of `assert`, which is stripped under `python -O`.
        if task.status is not TaskStatus.running:
            raise AssertionError('only running tasks can be terminated')
        if not task.pid:
            # It means there's inconsistency
            raise AssertionError('task has no pid assigned')
        parent_job = Job.get(task.job_id)

        # gracefully:
        # True -> interrupt (allows output to be flushed into log file)
        # None -> terminate (works almost every time, but losing output that could be produced before closing)
        # False -> kill (similar to above, but success is almost guaranteed)
        exit_code = task_nursery.terminate(task.pid, task.hostname, parent_job.user.username,
                                           gracefully=gracefully)
        if exit_code != 0:
            raise ExitCodeError('operation exit code is not 0')

        # Note: Code below is unsafe, because interrupt and terminate does not guarantee success.
        # It's better to let synchronization update this (via comparison with screen sessions)
        # (Unsafe section) Original comment: Allow to spawn that task again
        # task.pid = None
        # task.status = TaskStatus.terminated
        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ExitCodeError:
        content, status = {'msg': TASK['terminate']['failure']['exit_code'], 'exit_code': exit_code}, 202
    except AssertionError as e:
        content, status = {'msg': TASK['terminate']['failure']['state'].format(reason=e)}, 409
    except ConnectionErrorException as e:
        content, status = {'msg': TASK['failure']['connection'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['terminate']['success'], 'exit_code': exit_code}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_destroy(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Deletes a Task db record. Requires terminating task manually in advance.

    Returns a `(content, status)` pair: 200 on success, 404 when there is no such record,
    422 when the task is still running, 500 on any other error.
    """
    try:
        task = Task.get(id)
        # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
        # which would allow deleting the record of a running task.
        if task.status is TaskStatus.running:
            raise AssertionError('must be terminated first')
        task.destroy()
    except NoResultFound:
        content, status = {'msg': T['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': T['delete']['failure']['assertions'].format(reason=e)}, 422
    except Exception:
        content, status = {'msg': G['internal_error']}, 500
    else:
        content, status = {'msg': T['delete']['success']}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_spawn(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Spawns command stored in Task db record (task.command).

    It won't allow for spawning task which is currently running (sync + status check).
    If spawn operation has succeeded then `running` status is set and the pid
    returned by `task_nursery.spawn` is stored in the record.

    Returns a `(content, status)` pair; on success content holds the new 'pid'.
    """
    try:
        task = Task.get(id)
        # Preconditions are raised explicitly: `assert` is stripped under `python -O`,
        # which would allow e.g. spawning a task that is already running.
        if task.status is TaskStatus.running:
            raise AssertionError('task is already running')
        if not task.command:
            raise AssertionError('command is empty')
        if not task.host:
            raise AssertionError('hostname is empty')
        if not task.user:
            raise AssertionError('user does not exist')

        pid = task_nursery.spawn(task.command, task.host, task.user.username, name_appendix=str(task.id))
        task.pid = pid
        task.status = TaskStatus.running

        # If task was scheduled to terminate and user just
        # spawned that task manually, scheduler should still
        # continue to watch and terminate the task automatically.
        task.save()
    except NoResultFound:
        content, status = {'msg': T['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': T['spawn']['failure']['assertions'].format(reason=e)}, 422
    except SpawnError as e:
        log.warning(e)
        content, status = {'msg': T['spawn']['failure']['backend'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': G['internal_error']}, 500
    else:
        log.info('Task {} is now: {}'.format(task.id, task.status.name))
        content, status = {'msg': T['spawn']['success'], 'pid': pid}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_update(id: TaskId, new_values: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Updates certain fields of a Task db record, see `allowed_fields`.

    'hostname' is stored in the `host` attribute; 'spawnAt'/'terminateAt' are parsed with
    `try_parse_input_datetime` and stored in `spawn_at`/`terminate_at`.

    Returns a `(content, status)` pair: 201 on success, 404 when there is no such record,
    422 for a disallowed field or an invalid datetime, 500 on any other error.
    """
    allowed_fields = {'command', 'hostname', 'spawnAt', 'terminateAt'}
    try:
        # Explicit raises instead of `assert`: asserts are stripped under `python -O`,
        # which would let arbitrary attributes of the record be overwritten.
        if not set(new_values.keys()).issubset(allowed_fields):
            raise AssertionError('invalid field is present')
        task = Task.get(id)

        for field_name, new_value in new_values.items():
            if field_name == 'hostname':
                # API client is allowed to use more verbose name here (hostname <=> host)
                field_name = 'host'
            if field_name in {'spawnAt', 'terminateAt'}:
                field_name = field_name.replace('At', '_at')
                new_value = try_parse_input_datetime(new_value)
            elif not hasattr(task, field_name):
                # Check that every other field matches
                raise AssertionError('task has no {} column'.format(field_name))
            setattr(task, field_name, new_value)
        task.save()
    except NoResultFound:
        content, status = {'msg': T['not_found']}, 404
    except ValueError:
        # Invalid string format for datetime
        content, status = {'msg': G['bad_request']}, 422
    except AssertionError as e:
        content, status = {'msg': T['update']['failure']['assertions'].format(reason=e)}, 422
    except Exception as e:
        log.critical(e)
        content, status = {'msg': G['internal_error']}, 500
    else:
        # NOTE(review): `as_dict` is accessed without a call here, as in the original -
        # presumably a property in this model version; confirm against the Task model.
        content, status = {'msg': T['update']['success'], 'task': task.as_dict}, 201
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def test_create_task(tables, client, new_job, new_user):
    """POST to a job's tasks endpoint answers 201 and the created task keeps both parameters."""
    # Arrange
    new_user.save()
    new_job.save()

    env_segments = [
        {'name': 'ENV', 'value': 'path'},
        {'name': 'LIBPATH', 'value': 'some/path/2'},
    ]
    param_segments = [
        {'name': '--batch_size', 'value': '32'},
        {'name': '--rank', 'value': '2'},
    ]
    payload = {
        'command': 'python command.py',
        'hostname': 'localhost',
        'cmdsegments': {'envs': env_segments, 'params': param_segments},
    }

    # Act
    url = BASE_URI + '/jobs/{}/tasks'.format(new_job.id)
    resp = client.post(url, headers=HEADERS, data=json.dumps(payload))
    body = json.loads(resp.data.decode('utf-8'))

    # Assert
    assert resp.status_code == HTTPStatus.CREATED
    created = body['task']
    assert created['command'] == 'python command.py'
    assert created['jobId'] == new_job.id
    assert len(new_job.tasks) == 1
    assert Task.get(int(created['id'])).number_of_params == 2
def business_destroy(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Deletes a Task db record. Requires terminating task manually in advance.

    All of the m-n relationship links (task-cmd_segment) are deleted too.
    Unwanted command segments (no task attached) have to be deleted manually,
    which is done here right after the task is destroyed.

    Returns a `(content, status)` pair: 200 on success, 404 when there is no such record,
    422 when the task is still running, 500 on any other error.
    """
    try:
        task = Task.get(id)
        # Keep a reference to the segments before the task is gone.
        cmd_segments = task.cmd_segments
        # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
        # which would allow deleting the record of a running task.
        if task.status is TaskStatus.running:
            raise AssertionError('must be terminated first')
        task.destroy()
        for segment in cmd_segments:
            # Segments which are not used by any other task are orphans now.
            if len(segment.tasks) == 0:
                segment.destroy()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['delete']['failure']['assertions'].format(reason=e)}, 422
    except Exception:
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['delete']['success']}, 200
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status
def business_update(id: TaskId, new_values: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Updates certain fields of a Task db record, see `allowed_fields`.

    'spawnAt'/'terminateAt' values are parsed with `DateUtils.try_parse_string`;
    field names are converted with `snakecase` before being set on the record.

    Returns a `(content, status)` pair: 201 on success, 404 when there is no such record,
    422 for a disallowed field or an invalid datetime, 500 on any other error.
    """
    allowed_fields = {'command', 'hostname', 'spawnAt', 'terminateAt'}
    try:
        # Explicit raises instead of `assert`: asserts are stripped under `python -O`,
        # which would let arbitrary attributes of the record be overwritten.
        if not set(new_values.keys()).issubset(allowed_fields):
            raise AssertionError('invalid field is present')
        task = Task.get(id)

        for field_name, new_value in new_values.items():
            if field_name in {'spawnAt', 'terminateAt'}:
                new_value = DateUtils.try_parse_string(new_value)
            field_name = snakecase(field_name)
            # Check that every field matches
            if field_name is None or not hasattr(task, field_name):
                raise AssertionError('task has no {} field'.format(field_name))
            setattr(task, field_name, new_value)
        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ValueError:
        # Invalid string format for datetime
        content, status = {'msg': GENERAL['bad_request']}, 422
    except AssertionError as e:
        content, status = {'msg': TASK['update']['failure']['assertions'].format(reason=e)}, 422
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['update']['success'], 'task': task.as_dict()}, 201
    # Plain return instead of `finally: return`, which would discard in-flight exceptions.
    return content, status