def screen_sessions(username: str, hostname: str) -> Tuple[Content, HttpStatusCode]:
    """Returns pids of running `screen` sessions.

    This endpoint is for purely development purposes, currently there's no need to use it.

    Parameters:
        username: user passed to `task_nursery.running`; must not be empty.
        hostname: host passed to `task_nursery.running`; must not be empty.

    Returns a (content, status) pair:
        200 - success, content carries the 'pids' returned by task_nursery
        422 - a parameter is empty (or any other AssertionError was raised)
        500 - ssh connection/authentication/unknown host failure,
              or any other unexpected exception (logged as critical)
    """
    try:
        # Explicit check instead of `assert`: assert statements are stripped when Python
        # runs with -O, which would silently disable this validation.
        # AssertionError is kept so the 422 branch below behaves exactly as before.
        if not (username and hostname):
            raise AssertionError('parameters must not be empty')
        pids = task_nursery.running(host=hostname, user=username)
    except AssertionError as e:
        content, status = {'msg': S['failure']['assertions'].format(reason=e)}, 422
    except (ConnectionErrorException, AuthenticationException, UnknownHostException) as e:
        content, status = {'msg': API.RESPONSES['ssh']['failure']['connection'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': G['internal_error']}, 500
    else:
        # FIXME
        content, status = {'msg': S['success'], 'pids': pids}, 200
    # Returned after the try statement rather than from `finally`: a `return` inside
    # `finally` discards any in-flight exception (e.g. KeyboardInterrupt) and would then
    # fail on the unbound `content`/`status` names.
    return content, status
def synchronize(task_id: TaskId) -> None:
    """Updates the state of a Task object stored in database.

    It compares current db record with list of active screen session (their pids in general)
    on node defined by that record (the record's user on the record's host).

    If task_nursery is unable to fetch active screen sessions then
    the new state is always set to unsynchronized.

    If task.pid is not alive (db record is outdated), then it
    makes transition from last known state to a new state:

    state before sync   => state applied after sync
    -----------------------------------------------
    running             => terminated
    unsynchronized      => not_running

    In that case task.pid is also cleared and the record is saved.

    If the task does not exist (NoResultFound) only a warning is logged.
    If loading the record fails for any other reason, the error is logged
    and the original exception is re-raised, because there is no record to update.
    """
    log.debug('Syncing Task {}...'.format(task_id))
    # Pre-bind so the generic handler below can tell whether the record was ever loaded.
    task = None
    try:
        task = Task.get(task_id)
        # Explicit raises instead of `assert`: assert statements are stripped under `python -O`.
        if not task.host:
            raise AssertionError('hostname is empty')
        if not task.user:
            raise AssertionError('user does not exist')
        active_sessions_pids = task_nursery.running(host=task.host, user=task.user.username)
    except NoResultFound:
        # This exception must be handled within try/except block when using Task.get()
        # In other words, methods decorated with @synchronize_task_record must handle this case by themselves!
        log.warning(
            'Task {} could not be found (also synchronized). Failing without taking any action...'
            .format(task_id))
    except Exception as e:
        # AssertionError is a subclass of Exception, so the checks above land here too.
        # task_nursery.running pssh exceptions are also caught here
        log.error('Unable to synchronize Task {}, reason: {}'.format(task_id, e))
        if task is None:
            # Task.get() itself failed: there is no record to mark as unsynchronized.
            # Re-raise the real error instead of failing on the missing `task` object.
            raise
        log.debug('Task {} status was: {}'.format(task_id, task.status.name))
        task.status = TaskStatus.unsynchronized
        task.save()
        log.debug('Task {} is now: {}'.format(task_id, task.status.name))
    else:
        log.debug('[BEFORE SYNC] Task {} status was: {}'.format(task_id, task.status.name))
        change_status_msg = '[AFTER SYNC] Task {id} is now: {curr_status}'
        if task.pid not in active_sessions_pids:
            # The recorded pid is not among the active sessions, so the record is outdated.
            if task.status is TaskStatus.running:
                task.status = TaskStatus.terminated
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            if task.status is TaskStatus.unsynchronized:
                task.status = TaskStatus.not_running
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            task.pid = None
            task.save()
def sync_running_from_queue(self, available_hosts_with_gpu_occupation: Dict[str, Dict[str, List]]):
    """Stops jobs running from the queue that can no longer keep their GPUs.

    For every job returned by `Job.get_jobs_running_from_queue()` each task is inspected:

    - a task with no assigned GPU uid, or whose pid is not reported by
      `task_nursery.running` for its host and the job owner, is marked `not_running`
      and skipped;
    - otherwise the job is flagged for stopping when another process occupies
      the task's GPU or when `interferes_with_reservations` reports a conflict within
      the next `CONFIG.SCHEDULE_QUEUED_JOBS_WHEN_FREE_MINS` minutes.

    Flagged jobs are logged and stopped with `stop_with_grace`.

    Parameters:
        available_hosts_with_gpu_occupation: indexed as [hostname][gpu_uid]; each entry is
            either None or an iterable of process dicts that carry a 'pid' key.
    """
    # Loop-invariant, so it is computed once instead of once per task.
    considered_future_period = timedelta(minutes=CONFIG.SCHEDULE_QUEUED_JOBS_WHEN_FREE_MINS)

    for job in Job.get_jobs_running_from_queue():
        job_should_be_stopped = False

        for task in job.tasks:
            gpu_uid = Scheduler.get_assigned_gpu_uid(task, available_hosts_with_gpu_occupation)
            if not gpu_uid or task.pid not in task_nursery.running(task.hostname, job.user.username):
                # NOTE(review): the new status is only assigned here, no save() call
                # is visible in this method - confirm it is persisted elsewhere.
                task.status = TaskStatus.not_running
                continue

            current_processes_on_gpu = available_hosts_with_gpu_occupation[task.hostname][gpu_uid]
            if current_processes_on_gpu is None:
                other_process_pids = []
            else:
                # Pids are compared by value (`!=`). The previous identity test (`is not`)
                # treated equal pids stored in distinct int objects as different, so the
                # task's own process could be counted as a foreign one.
                other_process_pids = [
                    process['pid'] for process in current_processes_on_gpu
                    if process['pid'] != task.pid
                ]

            interferes = self.interferes_with_reservations(
                job,
                available_hosts_with_gpu_occupation,
                considered_future_period=considered_future_period,
                # Queued jobs should run only between reservations
                allow_own=False)

            if other_process_pids or interferes:
                job_should_be_stopped = True

        if job_should_be_stopped:
            log.info(self._log_msg(now=datetime.utcnow(), action='Stopping queued job', id=job.id))
            self.stop_with_grace(job.id)