Example #1
def create(job: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """ Creates new Job db record."""
    try:
        assert job['userId'] == get_jwt_identity(), 'Not an owner'
        new_job = Job(
            name=job['name'],
            description=job['description'],
            user_id=job['userId']
        )
        if 'startAt' in job and job['startAt'] is not None:
            setattr(new_job, 'start_at', job['startAt'])
        if 'stopAt' in job and job['stopAt'] is not None:
            setattr(new_job, 'stop_at', job['stopAt'])
        new_job.save()
    except AssertionError as e:
        if e.args[0] == 'Not an owner':
            content, status = {'msg': GENERAL['unprivileged']}, HTTPStatus.FORBIDDEN.value
        else:
            content = {'msg': JOB['create']['failure']['invalid'].format(reason=e)}
            status = HTTPStatus.UNPROCESSABLE_ENTITY.value
    except ValueError:
        # Invalid string format for datetime
        content = {'msg': JOB['create']['failure']['invalid']}
        status = HTTPStatus.UNPROCESSABLE_ENTITY.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content = {
            'msg': JOB['create']['success'],
            'job': new_job.as_dict()
        }
        status = HTTPStatus.CREATED.value
    finally:
        return content, status
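
Every handler in these examples returns a (Content, HttpStatusCode) tuple rather than a framework response object. Below is a minimal sketch of how such a pair might be wired into a route; the blueprint, URL and use of flask.jsonify are illustrative assumptions, not taken from the project, and JWT setup is omitted:

from flask import Blueprint, jsonify, request

jobs_bp = Blueprint('jobs', __name__)  # hypothetical blueprint, for illustration only

@jobs_bp.route('/jobs', methods=['POST'])
def create_job_endpoint():
    # `create` is the handler above; it always returns (dict, int), even on errors.
    content, status = create(request.get_json())
    return jsonify(content), status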
Example #2
def new_running_job(new_user):
    new_user.save()
    job = Job(name='running_job',
              description='A running job',
              user_id=new_user.id,
              _status=JobStatus.running)
    job.save()
    return job
Example #3
def new_job(new_user):
    new_user.save()
    job = Job(name='job_name',
              description='testDescription',
              user_id=new_user.id,
              _status=JobStatus.not_running)
    job.save()
    return job
Example #4
def test_create_job(tables, client, new_user):
    new_user.save()
    job_name = 'TestJob'
    data = {
        'name': job_name,
        'description': 'testDescription',
        'userId': 1,
        'startAt': DateUtils.stringify_datetime_to_api_format(
            datetime.datetime.utcnow() + timedelta(hours=5)),
        'stopAt': DateUtils.stringify_datetime_to_api_format(
            datetime.datetime.utcnow() + timedelta(hours=10))
    }

    resp = client.post(ENDPOINT, headers=HEADERS, data=json.dumps(data))
    resp_json = json.loads(resp.data.decode('utf-8'))

    assert resp.status_code == HTTPStatus.CREATED
    assert resp_json['job']['id'] is not None
    assert resp_json['job']['name'] == job_name
    assert Job.get(int(resp_json['job']['id'])) is not None
Example #5
def remove_task(job_id: JobId, task_id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Removes Task from Job."""
    job = None
    try:
        job = Job.get(job_id)
        task = Task.get(task_id)
        assert job.user_id == get_jwt_identity(), 'Not an owner'
        job.remove_task(task)
    except NoResultFound:
        if job is None:
            content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
        else:
            content, status = {'msg': TASK['not_found']}, HTTPStatus.NOT_FOUND.value
    except InvalidRequestException as e:
        content, status = {'msg': JOB['tasks']['remove']['failure']['not_found'].format(reason=e)}, \
            HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        content, status = {'msg': JOB['tasks']['remove']['failure']['assertions'].format(reason=e)}, \
            HTTPStatus.FORBIDDEN.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content, status = {'msg': JOB['tasks']['remove']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
Example #6
def delete(id: JobId) -> Tuple[Content, HttpStatusCode]:
    """Deletes a Job db record.

    If running, requires stopping job manually in advance."""
    try:
        job = Job.get(id)

        if not (is_admin() or job.user_id == get_jwt_identity()):
            raise ForbiddenException('not an owner')

        assert job.status is not JobStatus.running, 'must be stopped first'
        job.destroy()
    except ForbiddenException as fe:
        content, status = {'msg': JOB['update']['failure']['forbidden'].format(reason=fe)}, 403
    except AssertionError as e:
        content, status = {'msg': JOB['delete']['failure']['assertions'].format(reason=e)}, \
            HTTPStatus.UNPROCESSABLE_ENTITY.value
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content, status = {'msg': JOB['delete']['success']}, HTTPStatus.OK.value
    finally:
        return content, status
Example #7
def update(id: JobId, newValues: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    """Updates certain fields of a Job db record, see `allowed_fields`."""
    new_values = newValues
    allowed_fields = {'name', 'description', 'startAt', 'stopAt'}
    try:
        job = Job.get(id)

        if not (is_admin() or job.user_id == get_jwt_identity()):
            raise ForbiddenException('not an owner')

        assert set(new_values.keys()).issubset(allowed_fields), 'invalid field is present'

        assert job.status is not JobStatus.running, 'must be stopped first'

        for field_name, new_value in new_values.items():
            field_name = snakecase(field_name)
            if new_value is not None:
                assert hasattr(job, field_name), 'job has no {} field'.format(field_name)
                setattr(job, field_name, new_value)
        job.save()
    except ForbiddenException as fe:
        content, status = {'msg': JOB['update']['failure']['forbidden'].format(reason=fe)}, 403
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        content, status = {'msg': JOB['update']['failure']['assertions'].format(reason=e)}, \
            HTTPStatus.UNPROCESSABLE_ENTITY.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content, status = {'msg': JOB['update']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
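
update() accepts camelCase field names from the API ('startAt', 'stopAt') and maps them onto the model's snake_case attributes via snakecase. Here is a minimal sketch of that mapping, assuming a simple regex-based stand-in rather than the helper the project actually imports:

import re

def snakecase_sketch(name: str) -> str:
    # Hypothetical stand-in for the imported `snakecase` helper:
    # 'startAt' -> 'start_at', 'stopAt' -> 'stop_at', 'name' -> 'name'
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()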
Example #8
def business_get_log(id: TaskId, tail: bool) -> Tuple[Content, HttpStatusCode]:
    """Fetches log file created by spawned task (output redirection).

    It relies on reading files located on filesystem, via connection with `[email protected]`
    If file does not exist there's no way to fetch it from database (currently).
    File names must be named in one fashion (standard defined in `task_nursery.fetch_log`,
    currently: `task_<id>.log`). Renaming them manually will lead to inconsistency or 'Not Found' errors.

    `tail` argument allows for returning only the last few lines (10 is default for `tail` program).
    For more details, see,: `task_nursery.fetch_log`.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        assert task.hostname, 'hostname is empty'
        assert parent_job.user, 'user does not exist'
        output_gen, log_path = task_nursery.fetch_log(task.hostname, parent_job.user.username, task.id, tail)
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ExitCodeError as e:
        content, status = {'msg': TASK['get_log']['failure']['not_found'].format(location=e)}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['get_log']['failure']['assertions'].format(reason=e)}, 422
    except (ConnectionErrorException, AuthenticationException, UnknownHostException) as e:
        content, status = {'msg': SSH['failure']['connection'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['get_log']['success'], 'path': log_path, 'output_lines': list(output_gen)}, 200
    finally:
        return content, status
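
The docstring above pins down the log naming convention (task_<id>.log) and the transport (SSH as the job owner). Below is a rough sketch of what a helper along the lines of task_nursery.fetch_log could look like; it is an assumption for illustration only and shells out to the system ssh client instead of the SSH library the project uses:

import subprocess
from typing import Iterator, Tuple

def fetch_log_sketch(host: str, user: str, task_id: int, tail: bool) -> Tuple[Iterator[str], str]:
    log_path = 'task_{}.log'.format(task_id)            # naming convention from the docstring
    reader = 'tail {}'.format(log_path) if tail else 'cat {}'.format(log_path)
    result = subprocess.run(['ssh', '{}@{}'.format(user, host), reader],
                            capture_output=True, text=True, check=True)
    # Mirrors the (output_gen, log_path) pair consumed by `business_get_log`.
    return (line for line in result.stdout.splitlines()), log_path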
Example #9
def business_spawn(id: TaskId) -> Tuple[Content, HttpStatusCode]:
    """Spawns command stored in Task db record (task.full_command).
    It won't allow for spawning task which is currently running (sync + status check).
    If spawn operation has succeeded then `running` status is set.
    """
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        assert task.status is not TaskStatus.running, 'task is already running'
        assert task.full_command, 'command is empty'
        assert task.hostname, 'hostname is empty'
        assert parent_job.user, 'user does not exist'

        pid = task_nursery.spawn(task.full_command,
                                 task.hostname,
                                 parent_job.user.username,
                                 name_appendix=str(task.id))
        task.pid = pid
        task.status = TaskStatus.running
        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except AssertionError as e:
        content, status = {'msg': TASK['spawn']['failure']['assertions'].format(reason=e)}, 422
    except SpawnError as e:
        log.warning(e)
        content, status = {'msg': TASK['spawn']['failure']['backend'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        log.info('Task {} is now: {}'.format(task.id, task.status.name))
        content, status = {'msg': TASK['spawn']['success'], 'pid': pid}, 200
    finally:
        return content, status
Example #10
def get_all(userId: Optional[int]) -> Tuple[Content, HttpStatusCode]:
    """Fetches all Job records"""
    user_id = userId
    sync_all = True
    try:
        if user_id:
            # Owner or admin can fetch
            if not (is_admin() or get_jwt_identity() == user_id):
                raise ForbiddenException("not an owner")
            jobs = Job.query.filter(Job.user_id == user_id).all()
        else:
            # Only admin can fetch all
            if not is_admin():
                raise ForbiddenException("unauthorized")
            jobs = Job.all()
        if sync_all:
            for job in jobs:
                for task in job.tasks:
                    synchronize(task.id)
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except ForbiddenException as fe:
        content, status = {'msg': JOB['all']['forbidden'].format(reason=fe)}, HTTPStatus.FORBIDDEN.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        results = []
        for job in jobs:
            results.append(job.as_dict())
        content, status = {'msg': JOB['all']['success'], 'jobs': results}, HTTPStatus.OK.value
    finally:
        return content, status
Example #11
def test_delete_task(tables, client, new_job, new_task):
    new_job.add_task(new_task)
    new_job.save()

    resp = client.delete(ENDPOINT + '/{}'.format(new_task.id), headers=HEADERS)

    assert resp.status_code == HTTPStatus.OK
    assert len(Task.all()) == 0
    assert len(Job.all()) == 1
    assert len(CommandSegment.all()) == 0  # checks if segments from deleted task are deleted by cascade
Example #12
def test_create_job_without_dates(tables, client, new_user):
    new_user.save()
    job_name = 'TestJob'
    data = {'name': job_name, 'description': 'testDescription', 'userId': 1}

    resp = client.post(ENDPOINT, headers=HEADERS, data=json.dumps(data))
    resp_json = json.loads(resp.data.decode('utf-8'))

    assert resp.status_code == HTTPStatus.CREATED
    assert resp_json['job']['id'] is not None
    assert resp_json['job']['name'] == job_name
    assert Job.get(int(resp_json['job']['id'])) is not None
Example #13
def execute(id: JobId) -> Tuple[Content, HttpStatusCode]:
    try:
        job = Job.get(id)
        assert job.user_id == get_jwt_identity(), 'Not an owner'
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        content, status = {'msg': GENERAL['unprivileged'].format(reason=e)}, HTTPStatus.FORBIDDEN.value
    else:
        content, status = business_execute(id)
    finally:
        return content, status
Example #14
def synchronize(task_id: TaskId) -> None:
    """Updates the state of a Task object stored in database.

    It compares the current db record with the list of active screen sessions (their pids, in general)
    on the node defined by that record (`<username>@<hostname>`).

    If task_nursery is unable to fetch active screen sessions then
    the new state is always set to unsynchronized.

    If task.pid is not alive (db record is outdated), then it
    makes transition from last known state to a new state:

    state before sync   => state applied after sync
    -----------------------------------------------
    running             => terminated
    unsynchronized      => not_running

    On every state transition the job status is synchronized too.
    """
    log.debug('Syncing Task {}...'.format(task_id))
    try:
        task = Task.get(task_id)
        parent_job = Job.get(task.job_id)
        assert task.hostname, 'hostname is empty'
        assert parent_job.user, 'user does not exist'
        active_sessions_pids = task_nursery.running(host=task.hostname, user=parent_job.user.username)
    except NoResultFound:
        # This exception must be handled within try/except block when using Task.get()
        # In other words, methods decorated with @synchronize_task_record must handle this case by themselves!
        log.warning(
            'Task {} could not be found (also synchronized). Failing without taking any action...'.format(task_id))
        pass
    except Exception as e:
        # AssertionError and task_nursery.running pssh exceptions are also caught here
        log.error('Unable to synchronize Task {}, reason: {}'.format(task_id, e))
        log.debug('Task {} status was: {}'.format(task_id, task.status.name))
        task.status = TaskStatus.unsynchronized
        task.save()
        log.debug('Task {} is now: {}'.format(task_id, task.status.name))
    else:
        log.debug('[BEFORE SYNC] Task {} status was: {}'.format(task_id, task.status.name))
        change_status_msg = '[AFTER SYNC] Task {id} is now: {curr_status}'
        if task.pid not in active_sessions_pids:
            if task.status is TaskStatus.running:
                task.status = TaskStatus.terminated
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            if task.status is TaskStatus.unsynchronized:
                task.status = TaskStatus.not_running
                log.debug(change_status_msg.format(id=task_id, curr_status=task.status.name))
            task.pid = None
            task.save()
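
The transition table in the docstring can be read as a small mapping applied whenever task.pid is no longer among the active screen sessions. A minimal sketch, assuming the same TaskStatus enum used throughout these examples:

# Statuses not listed here are left unchanged by the sync.
DEAD_PID_TRANSITIONS = {
    TaskStatus.running: TaskStatus.terminated,
    TaskStatus.unsynchronized: TaskStatus.not_running,
}

def next_status_after_sync(current):
    return DEAD_PID_TRANSITIONS.get(current, current)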
Example #15
def get_log(id: TaskId, tail: bool) -> Tuple[Content, HttpStatusCode]:
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        if not is_admin() and not parent_job.user_id == get_jwt_identity():
            raise ForbiddenException("not an owner")
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ForbiddenException:
        content, status = {'msg': GENERAL['unprivileged']}, 403
    else:
        content, status = business_get_log(id, tail)
    finally:
        return content, status
Example #16
def update(id: TaskId, newValues: Dict[str, Any]) -> Tuple[Content, HttpStatusCode]:
    try:
        task = Task.get(id)
        parent_job = Job.get(task.job_id)
        if not is_admin() and not parent_job.user_id == get_jwt_identity():
            raise ForbiddenException('not an owner')
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ForbiddenException:
        content, status = {'msg': GENERAL['unprivileged']}, 403
    else:
        content, status = business_update(id, newValues)
    finally:
        return content, status
Example #17
def new_admin_job(new_user, new_admin, new_task):
    new_user.save()
    new_admin.save()
    job = Job(name='admin_job',
              description='Admin is the owner of this job',
              user_id=new_admin.id,
              _status=JobStatus.not_running)
    job.save()
    job.add_task(new_task)
    return job
Example #18
def dequeue(id: JobId) -> Tuple[Content, HttpStatusCode]:
    try:
        job = Job.get(id)
        if not (is_admin() or job.user_id == get_jwt_identity()):
            raise ForbiddenException("not an owner")
        job.dequeue()
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except ForbiddenException as fe:
        content, status = {'msg': GENERAL['unprivileged'].format(reason=fe)}, HTTPStatus.FORBIDDEN.value
    except AssertionError as ae:
        content, status = {'msg': JOB['dequeue']['failure'].format(reason=ae)}, HTTPStatus.CONFLICT.value
    else:
        content, status = {'msg': JOB['dequeue']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
Example #19
def get_by_id(id: JobId) -> Tuple[Content, HttpStatusCode]:
    """Fetches one Job db record"""
    try:
        job = Job.get(id)
        assert get_jwt_identity() == job.user_id or is_admin()
    except NoResultFound as e:
        log.warning(e)
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError:
        content, status = {'msg': GENERAL['unprivileged']}, HTTPStatus.FORBIDDEN.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        content, status = {'msg': JOB['get']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
Example #20
def stop(id: JobId, gracefully: Optional[bool] = True) -> Tuple[Content, HttpStatusCode]:
    try:
        job = Job.get(id)
        assert get_jwt_identity() == job.user_id or is_admin()
        assert job.status is JobStatus.running, 'Only running jobs can be stopped'
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        if e.args and 'Only running jobs can be stopped' in e.args[0]:
            content, status = {'msg': JOB['stop']['failure']['state'].format(reason=e)}, \
                HTTPStatus.CONFLICT.value
        else:
            content, status = {'msg': GENERAL['unprivileged']}, HTTPStatus.FORBIDDEN.value
    else:
        content, status = business_stop(id, gracefully)
    finally:
        return content, status
Example #21
def business_terminate(id: TaskId, gracefully: Optional[bool] = True) -> Tuple[Content, HttpStatusCode]:
    """Sends SIGINT (default) or SIGKILL to process with pid that is stored in Task db record.

    In order to send SIGKILL, pass `gracefully=False` to `terminate` function.
    Note that:
    1) `exit_code` is related to executing kill operation, not killed process.
    2) termination signal should be respected by most processes, however this
        function does not guarantee stoping the process!
    """
    try:
        task = Task.get(id)
        assert task.status is TaskStatus.running, 'only running tasks can be terminated'
        assert task.pid, 'task has no pid assigned'  # It means there's inconsistency
        parent_job = Job.get(task.job_id)

        # gracefully:
        # True -> interrupt (allows output to be flushed into the log file)
        # None -> terminate (works almost every time, but output produced just before closing may be lost)
        # False -> kill (similar to the above, but success is almost guaranteed)
        exit_code = task_nursery.terminate(task.pid, task.hostname, parent_job.user.username, gracefully=gracefully)
        if exit_code != 0:
            raise ExitCodeError('operation exit code is not 0')

        # Note: The code below is unsafe, because interrupt and terminate do not guarantee success.
        # It's better to let synchronization update this (via comparison with screen sessions).
        # (Unsafe section) Original comment: Allow that task to be spawned again
        # task.pid = None
        # task.status = TaskStatus.terminated

        task.save()
    except NoResultFound:
        content, status = {'msg': TASK['not_found']}, 404
    except ExitCodeError:
        content, status = {'msg': TASK['terminate']['failure']['exit_code'], 'exit_code': exit_code}, 202
    except AssertionError as e:
        content, status = {'msg': TASK['terminate']['failure']['state'].format(reason=e)}, 409
    except ConnectionErrorException as e:
        content, status = {'msg': TASK['failure']['connection'].format(reason=e)}, 500
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, 500
    else:
        content, status = {'msg': TASK['terminate']['success'], 'exit_code': exit_code}, 200
    finally:
        return content, status
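
The inline comment above maps the gracefully flag to three escalation levels. Below is a hedged sketch of how such a flag could translate into POSIX signals; treating 'terminate' as SIGTERM is an assumption, and the real task_nursery.terminate transport (SSH) is not reproduced here:

import signal
from typing import Optional

def signal_for(gracefully: Optional[bool]) -> int:
    if gracefully is True:
        return signal.SIGINT    # interrupt: lets the process flush its output
    if gracefully is None:
        return signal.SIGTERM   # terminate: assumed mapping; usually works, output may be lost
    return signal.SIGKILL       # kill: success is almost guaranteed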
Example #22
def business_execute(id: JobId) -> Tuple[Content, HttpStatusCode]:
    """Tries to spawn all commands stored in Tasks belonging to Job (db records - (task.command))

    It won't allow for executing job which is currently running.
    If execute operation has succeeded then `running` status is set

    If one or more tasks did not spawn correctly, job is marked as running anyway and
    tasks which spawned correctly are running too. In that case user can stop the job and
    try to run it again, or just continue.
    """
    try:
        not_spawned_tasks = []
        job = Job.get(id)
        assert job.status is not JobStatus.running, 'Job is already running'
        for task in job.tasks:
            content, status = business_spawn(task.id)
            if status != HTTPStatus.OK.value:
                not_spawned_tasks.append(task.id)

        job.synchronize_status()
        job.save()

        assert not_spawned_tasks == [], 'Could not spawn some tasks'

        # If the job was scheduled to stop and the user just
        # executed it manually, the scheduler should still
        # continue to watch and stop the job automatically.
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        if 'Job is already running' in e.args[0]:
            content, status = {'msg': JOB['execute']['failure']['state'].format(reason=e)}, \
                HTTPStatus.CONFLICT.value
        else:
            content, status = {'msg': JOB['execute']['failure']['tasks'].format(reason=e),
                               'not_spawned_list': not_spawned_tasks}, HTTPStatus.UNPROCESSABLE_ENTITY.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        log.info('Job {} is now: {}'.format(job.id, job.status.name))
        content, status = {'msg': JOB['execute']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
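
As described in the docstring, a partial spawn failure still leaves the job running and is reported as 422 together with a 'not_spawned_list'. A hedged usage sketch of how a caller might react to that contract; the retry policy and the job_id variable are assumptions, not project behaviour:

from http import HTTPStatus

content, status = business_execute(job_id)  # job_id assumed to reference an existing Job
if status == HTTPStatus.UNPROCESSABLE_ENTITY.value and content.get('not_spawned_list'):
    # Some tasks failed to spawn while the rest keep running;
    # one option is to stop the whole job and try again later.
    business_stop(job_id, gracefully=True)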
Example #23
    def sync_running_from_queue(
            self, available_hosts_with_gpu_occupation: Dict[str, Dict[str,
                                                                      List]]):
        jobs_running_from_queue = Job.get_jobs_running_from_queue()

        for job in jobs_running_from_queue:
            job_should_be_stopped = False
            for task in job.tasks:
                gpu_uid = Scheduler.get_assigned_gpu_uid(
                    task, available_hosts_with_gpu_occupation)

                if not gpu_uid or task.pid not in task_nursery.running(
                        task.hostname, job.user.username):
                    task.status = TaskStatus.not_running
                    continue

                current_processes_on_gpu = available_hosts_with_gpu_occupation[
                    task.hostname][gpu_uid]
                if current_processes_on_gpu is None:
                    other_process_pids = []
                else:
                    other_process_pids = [
                        process['pid'] for process in current_processes_on_gpu
                        if process['pid'] != task.pid
                    ]

                considered_future_period = timedelta(
                    minutes=CONFIG.SCHEDULE_QUEUED_JOBS_WHEN_FREE_MINS)
                interferes = self.interferes_with_reservations(
                    job,
                    available_hosts_with_gpu_occupation,
                    considered_future_period=considered_future_period,
                    # Queued jobs should run only between reservations
                    allow_own=False)

                if len(other_process_pids) or interferes:
                    job_should_be_stopped = True

            if job_should_be_stopped:
                log.info(
                    self._log_msg(now=datetime.utcnow(),
                                  action='Stopping queued job',
                                  id=job.id))
                self.stop_with_grace(job.id)
Example #24
def business_stop(id: JobId, gracefully: Optional[bool] = True) -> Tuple[Content, HttpStatusCode]:
    """Tries to terminate all Tasks belonging to Job.

    If some tasks did not terminate correctly, job is not terminated, but all of
    the other (correct) tasks are terminated.

    'gracefully' parameter is passed to all of Tasks 'terminate' methods to either send
    SIGINT (default) or SIGKILL to processes with pid that are stored in Tasks db records.

    In order to send SIGKILL, pass `gracefully=False` to `stop` function.

    Note that termination signal should be respected by most processes, however this
    function does not guarantee stoping them!
    """
    try:
        job = Job.get(id)
        not_terminated_tasks = 0
        for task in job.tasks:
            content, status = business_terminate(task.id, gracefully)
            if status != HTTPStatus.OK.value:
                not_terminated_tasks += 1

        assert not_terminated_tasks == 0, 'Not all tasks could be terminated'
        # If the job was scheduled to start automatically
        # but the user decided to stop it manually,
        # the scheduler should not execute the job by itself
        if job.start_at:
            # So this job should not be started automatically anymore
            job.start_at = None
        job.synchronize_status()
        job.save()
    except NoResultFound:
        content, status = {'msg': JOB['not_found']}, HTTPStatus.NOT_FOUND.value
    except AssertionError as e:
        content, status = {'msg': JOB['stop']['failure']['tasks'].format(reason=e)}, \
            HTTPStatus.UNPROCESSABLE_ENTITY.value
    except Exception as e:
        log.critical(e)
        content, status = {'msg': GENERAL['internal_error']}, HTTPStatus.INTERNAL_SERVER_ERROR.value
    else:
        log.info('Job {} is now: {}'.format(job.id, job.status.name))
        content, status = {'msg': JOB['stop']['success'], 'job': job.as_dict()}, HTTPStatus.OK.value
    finally:
        return content, status
Example #25
    def execute_queued(self,
                       available_hosts_with_gpu_occupation: Dict[str,
                                                                 Dict[str,
                                                                      bool]]):
        queued_jobs = Job.get_job_queue()

        queued_jobs_to_eligible_gpus = self.get_hosts_with_gpus_eligible_for_jobs(
            queued_jobs)

        available_slots = self.check_current_gpu_slots(
            available_hosts_with_gpu_occupation)

        scheduled_jobs = self._scheduler.schedule_jobs(
            queued_jobs_to_eligible_gpus, available_slots)

        for scheduled_job in scheduled_jobs:
            log.info(
                self._log_msg(now=datetime.utcnow(),
                              action='Executing queued',
                              id=scheduled_job.id))
            self.try_execute(scheduled_job)
Example #26
def test_job_creation(tables):
    new_job = Job(name='job_name', description='testDescription').save()
    assert new_job.id is not None