Ejemplo n.º 1
0
    def refresh(self):
        """
        Refresh the available jobs and put any pending ones since last refresh
        onto the ThreadPoolExecutor.

        Selects every job that is both pending and not yet acknowledged by a
        worker (ordered by creation time), submits each of them to the pool and
        then flags them as acknowledged in the DB so that a later refresh does
        not submit them a second time. Does nothing when no such job exists.
        """
        # lock to make sure the queries are safe across threads and jobs are unique
        # should be done with a DB transaction, but current setup with
        # peewee and the connection pooling does not support transactions
        with self._refresh_lock:
            _LOGGER.info("Refreshing JobWorkerManager")
            # peewee expressions have to be combined with the bitwise `&`
            # operator (each side parenthesized); the Python `and` keyword
            # evaluates to its right-hand operand only, which silently drops
            # the pending-status condition from the query
            query = (
                Job.select()
                .where(
                    (Job.status == JobStatus.pending)
                    & (Job.worker_ack == False)  # noqa: E712
                )
                .order_by(Job.created)
            )
            job_ids = [job.job_id for job in query]
            _LOGGER.info(f"Found {len(job_ids)} pending jobs, adding to threadpool")

            if not job_ids:
                # nothing to submit and nothing to acknowledge
                return

            for job_id in job_ids:
                _LOGGER.debug(f"Adding job {job_id} to threadpool")
                self._pool.submit(self._execute_job, str(job_id), self._canceled)

            _LOGGER.debug("Updating jobs in db to ack that worker received them")
            Job.update(worker_ack=True).where(Job.job_id.in_(job_ids)).execute()
Ejemplo n.º 2
0
 def _update_job_error(self, job: Job, err: Any):
     """
     Record in the DB that the given job failed.

     :param job: the job that errored
     :param err: the error (exception or message) that caused the failure;
         it is stored on the job in its string form
     """
     _LOGGER.debug(f"Job {job.job_id} errored, saving to DB")
     # a failed job is marked as errored rather than canceled so it can be
     # told apart from jobs that were stopped on request
     job.status = JobStatus.error
     # store the message text, not the exception object itself
     job.error = str(err)
     job.progress = None
     job.save()
     _LOGGER.warning(f"Job {job.job_id} errored out {err}")
Ejemplo n.º 3
0
 def _report_error(self, job: Job, error: str):
     """
     Flag this object as errored and persist the failure on the job.

     :param job: the job to mark with the error status
     :param error: the error message to keep locally and save on the job
     """
     # local bookkeeping first, then the job row
     self._errored, self._error = True, error
     job.status, job.error, job.progress = JobStatus.error, error, None
     job.save()
Ejemplo n.º 4
0
    def app_startup(self):
        """
        Handle app startup to clear uncompleted state for jobs and begin running.

        Every job still marked as started is set to canceled, then
        ``refresh`` is called.
        """

        # cancel any jobs that were left in an uncompleted state;
        # the update is only sent to the DB once execute() is called on it
        with database.connection_context():
            Job.update(status=JobStatus.canceled).where(
                Job.status == JobStatus.started
            ).execute()

        self.refresh()
Ejemplo n.º 5
0
    def _update_job_progress(self, job: Job, state: _JobExecutionState, progress: Dict):
        """
        Save the latest progress on the job, throttled by time.

        :param job: the job whose progress is saved
        :param state: execution state holding the time of the last progress save
        :param progress: the progress payload to store on the job
        """
        last_saved = state.db_progress_saved_time

        # write at most 5 times a second so the DB is not hammered
        if last_saved and time.time() - last_saved < 0.2:
            return

        _LOGGER.debug(f"Job {job.job_id} saving progress to DB")
        job.progress = progress
        job.save()
        state.db_progress_saved_time = time.time()
Ejemplo n.º 6
0
    def _load_next_pending_helper() -> Union[None, JobWorkerWrapper]:
        """
        Build a worker wrapper for the first pending job, ordered by creation.

        :return: a JobWorkerWrapper around a newly created worker for that job,
            or None when no job is pending
        :raise Exception: any error hit while creating the worker is re-raised
            after the error text and the error status are saved on the job
        """
        with database.connection_context():
            pending = (
                Job.select()
                .where(Job.status == JobStatus.pending)
                .order_by(Job.created)
                .limit(1)
            )
            # at most one row comes back; take it or fall back to None
            next_job = next(iter(pending), None)  # type: Union[None, Job]

            if next_job is None:
                return None

            try:
                registry = JobWorkerRegistryHolder.REGISTRY
                job_type = next_job.type_

                if job_type not in registry:
                    raise ValueError("Cannot find job of type {}".format(job_type))

                worker = registry[job_type](
                    next_job.job_id, next_job.project_id, **next_job.worker_args
                )

                return JobWorkerWrapper(worker)
            except Exception as err:
                # persist the failure on the job before propagating it
                next_job.error = str(err)
                next_job.status = JobStatus.error
                next_job.save()

                raise err
Ejemplo n.º 7
0
    def cancel_job(self, job_id: str):
        """
        Cancel a job with the given job_id so it won't be run.

        Any error other than the two documented below is logged as a warning
        and not propagated.

        :param job_id: the job_id to cancel
        :raise JobNotFoundError: if the job could not be found in the database
        :raise JobCancelationFailureError: if the job has already errored,
            completed, or been canceled
        """
        _LOGGER.info("Canceling job {}".format(job_id))

        try:
            _LOGGER.debug(f"Getting job {job_id} from DB")
            job = Job.get_or_none(Job.job_id == job_id)

            if job is None:
                _LOGGER.error(f"Could not find job {job_id} to cancel")
                raise JobNotFoundError()

            # jobs in one of these states have finished and cannot be canceled
            finished_states = (
                JobStatus.error,
                JobStatus.completed,
                JobStatus.canceled,
            )

            if any(job.status == finished for finished in finished_states):
                _LOGGER.error(f"Could not cancel job {job_id} with status {job.status}")

                raise JobCancelationFailureError(
                    "Job with status {} cannot be canceled".format(job.status)
                )

            self._update_job_canceled(job)
        except (JobNotFoundError, JobCancelationFailureError):
            # documented errors go straight back to the caller
            raise
        except Exception as err:
            _LOGGER.warning(f"Error while canceling job {job_id} in db: {err}")
Ejemplo n.º 8
0
def load_model_from_repo(project_id: str) -> Tuple[Response, int]:
    """
    Route for loading a model for a project from the Neural Magic model repo.

    Creates a project model and a job for it, links the two, prepares and
    validates the model's filesystem, and then asks the JobWorkerManager to
    refresh. If any of the creation steps fails, the records created so far
    are deleted and the original error is re-raised.

    :param project_id: the id of the project to load the model for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "loading model from repo for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = _add_model_check(project_id)
    payload = SetProjectModelFromSchema().load(request.get_json(force=True))
    project_model = None
    job = None

    try:
        project_model = ProjectModel.create(
            project=project, source="downloaded_repo", job=None
        )
        job_type = ModelFromRepoJobWorker.get_type()
        job_args = ModelFromRepoJobWorker.format_args(
            model_id=project_model.model_id,
            uri=payload["uri"],
        )
        job = Job.create(
            project_id=project.project_id,
            type_=job_type,
            worker_args=job_args,
        )
        project_model.job = job
        project_model.save()
        project_model.setup_filesystem()
        project_model.validate_filesystem()
    except Exception as err:
        _LOGGER.error(
            "error while creating new project model, rolling back: {}".format(err)
        )
        # best-effort rollback: remove whatever was created, model first
        for created in (project_model, job):
            if not created:
                continue
            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new model: {}".format(rollback_err)
                )
        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_model = data_dump_and_validation(
        ResponseProjectModelSchema(), {"model": project_model}
    )
    _LOGGER.info("created project model from repo {}".format(resp_model))

    return jsonify(resp_model), HTTPStatus.OK.value
Ejemplo n.º 9
0
    def _cancel_pending_jobs(self):
        """
        Mark every job that is still started or pending as canceled.

        The affected jobs also get an error message explaining that they were
        canceled on server startup. The number of updated rows is logged when
        it is greater than zero.
        """
        _LOGGER.debug("Canceling any pending jobs")
        # peewee expressions have to be combined with the bitwise `|` operator
        # (each side parenthesized); the Python `or` keyword evaluates to its
        # left-hand operand only, which left pending jobs untouched
        query = Job.update(
            status=JobStatus.canceled,
            error=(
                "Job was left in a stranded state and did not complete on last run, "
                "canceled on server startup"
            ),
        ).where((Job.status == JobStatus.started) | (Job.status == JobStatus.pending))
        row_count = query.execute()

        if row_count > 0:
            _LOGGER.info(f"Canceled {row_count} stranded jobs")
Ejemplo n.º 10
0
    def _execute_job(self, job_id: str, canceled: _LockedVar[bool]):
        """
        Run a single job from start to finish and record its outcome in the DB.

        The job is loaded, marked as started, and its worker is iterated; each
        yielded progress value is saved (subject to throttling) and a cancel
        check runs before every save. The job ends up completed, canceled, or
        errored depending on how the run ends.

        :param job_id: the id of the job to run
        :param canceled: variable passed to the cancel check
        """
        _LOGGER.info(f"Starting job {job_id} in JobWorkerManager")
        run_state = _JobExecutionState()
        job = None

        try:
            _LOGGER.debug(f"Getting job {job_id} from DB")
            job = Job.get(Job.job_id == job_id)

            cancel_before_start = self._check_cancel_job(job, canceled, run_state)
            if cancel_before_start:
                _LOGGER.debug(
                    f"Job {job_id} cancel requested before starting, canceling"
                )
                raise JobCancelError()

            self._update_job_started(job)

            _LOGGER.debug(f"Creating worker for job {job_id}")
            worker = JobWorkerRegistry.create_worker(job)  # type: JobWorker

            _LOGGER.debug(f"Starting worker run for job {job_id}")
            for step_progress in worker.run():
                cancel_requested = self._check_cancel_job(job, canceled, run_state)
                if cancel_requested:
                    _LOGGER.debug(f"Job {job_id} cancel requested, canceling")
                    raise JobCancelError()

                self._update_job_progress(job, run_state, step_progress)

            self._update_job_completed(job)
        except JobCancelError:
            if not job:
                raise RuntimeError("job is None after JobCancelError")

            self._update_job_canceled(job)
            _LOGGER.info(f"Job {job_id} canceled in JobWorkerManager")
        except Exception as run_err:
            # try to update the job in the DB in case the job doesn't exist
            # or the job was deleted from the DB
            try:
                self._update_job_error(job, run_err)
            except Exception as save_err:
                _LOGGER.warning(
                    f"Could not update job state in db to errored "
                    f"for job {job_id}: {save_err}: for error {run_err}"
                )
Ejemplo n.º 11
0
    def _check_cancel_job(
        self, job: Job, canceled: _LockedVar[bool], state: _JobExecutionState
    ) -> bool:
        """
        Decide whether the given job should stop running.

        :param job: the job to check
        :param canceled: variable whose value, when truthy, requests cancelation
        :param state: execution state holding the time of the last DB check
        :return: True if the job should be canceled, False otherwise
        """
        # cancel if overall system is being shutdown
        shutting_down = canceled.get() or not threading.main_thread().is_alive()
        if shutting_down:
            return True

        # refresh job state at maximum one second intervals to see if job was canceled
        last_check = state.db_canceled_check_time
        if last_check and time.time() - last_check < 1.0:
            return False

        # presumably refresh() re-reads the job from the DB; confirm on the model
        latest = job.refresh()
        state.db_canceled_check_time = time.time()

        return latest.status == JobStatus.canceled
Ejemplo n.º 12
0
def get_job(job_id: str):
    """
    Route for getting a job matching the given job_id.

    :param job_id: the id of the job to get
    :return: a tuple containing (json response, http status code)
    :raise HTTPNotFoundError: if the job is not found in the database
    """
    _LOGGER.info("getting job {}".format(job_id))
    found = Job.get_or_none(Job.job_id == job_id)

    if found is None:
        raise HTTPNotFoundError("could not find job with job_id {}".format(job_id))

    # serialize the job, then run the schema's validation over the result
    schema = ResponseJobSchema()
    body = schema.dump({"job": found})
    schema.validate(body)
    _LOGGER.info("retrieved job {}".format(body))

    return jsonify(body), HTTPStatus.OK.value
Ejemplo n.º 13
0
    def cancel_job(self, job_id: str):
        """
        Cancel a job with the given job_id so it won't be run.

        When the job is the one held in ``self._current`` it is canceled
        through that object. Otherwise the job is looked up in the database
        and its status is set to canceled and saved. Everything runs while
        holding ``self._lock``.

        :param job_id: the job_id to cancel
        :raise JobNotFoundError: if the job could not be found in the database
        :raise JobCancelationFailureError: if the job has already errored,
            completed, or been canceled
        """
        _LOGGER.info("Canceling job with id {}".format(job_id))

        with self._lock:
            current = self._current

            if current is not None and current.job_id == job_id:
                current.cancel()

                return

            with database.connection_context():
                job = Job.get_or_none(Job.job_id == job_id)

                if job is None:
                    not_found_msg = "Could not find job with id {}".format(job_id)
                    _LOGGER.error(not_found_msg)

                    raise JobNotFoundError(not_found_msg)

                # jobs in one of these states have finished and cannot be canceled
                finished_states = (
                    JobStatus.error,
                    JobStatus.completed,
                    JobStatus.canceled,
                )

                if any(job.status == finished for finished in finished_states):
                    _LOGGER.error(
                        "Could not cancel job with status {}".format(job.status)
                    )

                    raise JobCancelationFailureError(
                        "Job with status {} cannot be canceled".format(job.status)
                    )

                job.status = JobStatus.canceled
                job.save()
Ejemplo n.º 14
0
def get_jobs():
    """
    Route for getting a list of jobs filtered by the flask request args.

    The request args are loaded through SearchJobsSchema; the loaded values
    select an optional project filter, the ordering column and direction, and
    the page to return.

    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info("getting jobs for request args {}".format(request.args))
    args = SearchJobsSchema().load(dict(request.args.items()))

    query = Job.select()
    if args.get("project_id"):
        query = query.where(Job.project == args["project_id"])
    order_by = getattr(Job, args["order_by"])
    query = query.order_by(
        order_by if not args["order_desc"] else order_by.desc()
    ).paginate(args["page"], args["page_length"])

    jobs = list(query)
    resp_schema = ResponseJobsSchema()
    resp_jobs = resp_schema.dump({"jobs": jobs})
    resp_schema.validate(resp_jobs)
    # count the jobs themselves; the dumped response is a dict keyed by "jobs",
    # so its length is not the number of jobs
    _LOGGER.info("retrieved {} jobs".format(len(jobs)))

    return jsonify(resp_jobs), HTTPStatus.OK.value
Ejemplo n.º 15
0
 def _update_job_completed(self, job: Job):
     """
     Save the completed status on the job and clear its progress.

     :param job: the job that finished
     """
     _LOGGER.debug(f"Job {job.job_id} completed, saving DB state")
     job.status, job.progress = JobStatus.completed, None
     job.save()
     _LOGGER.info(f"Job {job.job_id} completed in JobWorkerManager")
Ejemplo n.º 16
0
 def _update_job_canceled(self, job: Job):
     """
     Save the canceled status on the job and clear its progress.

     :param job: the job to mark as canceled
     """
     _LOGGER.debug(f"Job {job.job_id} cancel requested, saving in DB")
     job.status, job.progress = JobStatus.canceled, None
     job.save()
     _LOGGER.info(f"Job {job.job_id} canceled in DB")
Ejemplo n.º 17
0
 def _update_job_started(self, job: Job):
     """
     Save the started status on the job.

     :param job: the job that is about to run
     """
     _LOGGER.debug(f"Updating job {job.job_id} to started status")
     setattr(job, "status", JobStatus.started)
     job.save()
Ejemplo n.º 18
0
def create_benchmark(project_id: str):
    """
    Route for creating a new benchmark for a given project.

    Creates the benchmark record and a job for it, links the two, and then
    asks the JobWorkerManager to refresh. If creation fails, the records
    created so far are deleted and the original error is re-raised.

    :param project_id: the id of the project to create a benchmark for
    :return: a tuple containing (json response, http status code)
    :raise ValidationError: if the project has no model set
    """
    _LOGGER.info(
        "creating benchmark for project {} for request json {}".format(
            project_id, request.get_json()
        )
    )

    project = get_project_by_id(project_id)
    benchmark_params = CreateProjectBenchmarkSchema().load(
        request.get_json(force=True)
    )

    model = project.model
    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a benchmark."
            ).format(project_id)
        )

    sys_info = get_ml_sys_info()
    benchmark = None
    job = None

    try:
        # fall back to an empty list when the system info has no instructions
        if "available_instructions" in sys_info:
            instruction_sets = sys_info["available_instructions"]
        else:
            instruction_sets = []
        benchmark_params["instruction_sets"] = instruction_sets

        benchmark = ProjectBenchmark.create(
            project=project, source="generated", **benchmark_params
        )

        job = Job.create(
            project_id=project.project_id,
            type_=CreateBenchmarkJobWorker.get_type(),
            worker_args=CreateBenchmarkJobWorker.format_args(
                model_id=model.model_id,
                benchmark_id=benchmark.benchmark_id,
                core_counts=benchmark.core_counts,
                batch_sizes=benchmark.batch_sizes,
                instruction_sets=benchmark.instruction_sets,
                inference_models=benchmark.inference_models,
                warmup_iterations_per_check=benchmark.warmup_iterations_per_check,
                iterations_per_check=benchmark.iterations_per_check,
            ),
        )
        benchmark.job = job
        benchmark.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new benchmark, rolling back: {}".format(err)
        )
        # best-effort rollback: remove whatever was created, benchmark first
        for created in (benchmark, job):
            if not created:
                continue
            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new benchmark: {}".format(rollback_err)
                )
        raise err

    JobWorkerManager().refresh()

    resp_benchmark = data_dump_and_validation(
        ResponseProjectBenchmarkSchema(), {"benchmark": benchmark}
    )
    _LOGGER.info("created benchmark and job: {}".format(resp_benchmark))

    return jsonify(resp_benchmark), HTTPStatus.OK.value
Ejemplo n.º 19
0
 def _report_started(self, job: Job):
     """
     Flag this object as started and save the started status on the job.

     :param job: the job to mark as started
     """
     self._started = True
     setattr(job, "status", JobStatus.started)
     job.save()
Ejemplo n.º 20
0
    def _worker_thread(self):
        """
        Run the wrapped worker to completion and report the outcome on its job.

        Loads the job for the worker, reports it as started, iterates the
        worker's run while reporting progress and checking for cancelation,
        then reports the job as canceled, errored, or completed. The done
        callback is invoked after the database connection context exits.
        """
        _LOGGER.debug(
            "job worker for job_id {} and project_id {} thead init".format(
                self._worker.job_id, self._worker.project_id
            )
        )

        with database.connection_context():
            # load the job and flag it as started while holding the lock
            with self._lock:
                job = Job.get(Job.job_id == self._worker.job_id)
                self._report_started(job)

            # outcome of the run, filled in by the except clauses below
            canceled = False
            error = None

            try:
                # initial check to see if job was canceled before it started
                if self._should_cancel():
                    raise JobCancelError()

                for progress in self._worker.run():
                    # progress reporting and the cancel check share one lock hold
                    with self._lock:
                        if self._should_report_progress():
                            self._report_progress(job, progress)

                        if self._should_cancel():
                            raise JobCancelError()
            except JobCancelError:
                canceled = True

                _LOGGER.debug(
                    "cancel job worker for job_id {} and project_id {} received".format(
                        self._worker.job_id, self._worker.project_id
                    )
                )
            except Exception as err:
                # any other failure is recorded and reported below, not re-raised
                _LOGGER.info(
                    (
                        "job worker for job_id {} and project_id {} "
                        "encountered error: {}"
                    ).format(self._worker.job_id, self._worker.project_id, err)
                )
                error = err

            # report the final state of the job while holding the lock
            with self._lock:
                self._start_completed = True

                if canceled:
                    self._report_canceled(job)
                    _LOGGER.info(
                        "canceled job worker for job_id {} and project_id {}".format(
                            self._worker.job_id, self._worker.project_id
                        )
                    )
                elif error is not None:
                    self._report_error(job, str(error))
                    _LOGGER.info(
                        "errored job worker for job_id {} and project_id {}".format(
                            self._worker.job_id, self._worker.project_id
                        )
                    )
                else:
                    self._report_completed(job)
                    _LOGGER.info(
                        "completed job worker for job_id {} and project_id {}".format(
                            self._worker.job_id, self._worker.project_id
                        )
                    )

        # runs outside the connection context and outside the lock
        self._done_callback()
Ejemplo n.º 21
0
 def _report_progress(self, job: Job, progress: Dict[str, Any]):
     """
     Keep the latest progress locally with a timestamp and save it on the job.

     :param job: the job to save the progress on
     :param progress: the progress payload to record
     """
     self._progress, self._progress_time = progress, time.time()
     setattr(job, "progress", progress)
     job.save()
Ejemplo n.º 22
0
 def _report_canceled(self, job: Job):
     """
     Flag this object as canceled and save the canceled status on the job.

     :param job: the job to mark as canceled; its progress is cleared
     """
     self._canceled = True
     job.status, job.progress = JobStatus.canceled, None
     job.save()
Ejemplo n.º 23
0
def load_data_from_repo(project_id: str):
    """
    Route for loading data file(s) for a project from the Neural Magic model repo.

    Creates a project data record and a job for it, links the two, prepares
    and validates the data's filesystem, and then asks the JobWorkerManager to
    refresh. If creation fails, the data file and the records created so far
    are removed and the original error is re-raised.

    :param project_id: the id of the project to load the data for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "loading data from repo for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)
    # return value unused; presumably called to check the project has a model
    get_project_model_by_project_id(project_id)
    payload = SetProjectDataFromSchema().load(request.get_json(force=True))
    project_data = None
    job = None

    try:
        project_data = ProjectData.create(
            project=project, source="downloaded_path", job=None
        )
        job_type = DataFromRepoJobWorker.get_type()
        job_args = DataFromRepoJobWorker.format_args(
            data_id=project_data.data_id, uri=payload["uri"]
        )
        job = Job.create(
            project_id=project.project_id,
            type_=job_type,
            worker_args=job_args,
        )
        project_data.job = job
        project_data.save()
        project_data.setup_filesystem()
        project_data.validate_filesystem()
    except Exception as err:
        if project_data:
            # the file may not exist, so a failed removal is ignored
            try:
                os.remove(project_data.file_path)
            except OSError:
                pass

            try:
                project_data.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new data: {}".format(rollback_err)
                )

        if job:
            try:
                job.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new data: {}".format(rollback_err)
                )

        _LOGGER.error(
            "error while creating new project data, rolling back: {}".format(err)
        )
        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_data = data_dump_and_validation(
        ResponseProjectDataSingleSchema(), {"data": project_data}
    )
    _LOGGER.info("created project data from path {}".format(resp_data))

    return jsonify(resp_data), HTTPStatus.OK.value
Ejemplo n.º 24
0
def create_perf_profile(project_id: str):
    """
    Route for creating a new perf profile for a given project.

    Fills in the core count and instruction sets from the system info,
    creates the perf profile record and a job for it, links the two, and then
    asks the JobWorkerManager to refresh. If creation fails, the records
    created so far are deleted and the original error is re-raised.

    :param project_id: the id of the project to create a perf profile for
    :return: a tuple containing (json response, http status code)
    :raise ValidationError: if the project has no model set
    """
    _LOGGER.info(
        "creating perf profile for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)

    perf_profile_params = CreateProjectPerfProfileSchema().load(
        request.get_json(force=True)
    )
    sys_info = get_ml_sys_info()

    core_count = perf_profile_params["core_count"]
    if not core_count or core_count < 1:
        # no usable core count requested: take it from the system info,
        # tolerating a system info that has no such entry
        core_count = (
            sys_info["cores_per_socket"] if "cores_per_socket" in sys_info else None
        )

    if not core_count:
        # extra check in case the system couldn't get cores_per_socket
        core_count = -1

    perf_profile_params["core_count"] = core_count
    # a missing entry falls back to an empty list instead of raising KeyError
    perf_profile_params["instruction_sets"] = (
        sys_info["available_instructions"]
        if "available_instructions" in sys_info
        else []
    )

    model = project.model
    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a perf profile."
            ).format(project_id)
        )
    perf_profile = None
    job = None

    try:
        perf_profile = ProjectPerfProfile.create(
            project=project, source="generated", **perf_profile_params
        )
        job = Job.create(
            project_id=project_id,
            type_=CreatePerfProfileJobWorker.get_type(),
            worker_args=CreatePerfProfileJobWorker.format_args(
                model_id=model.model_id,
                profile_id=perf_profile.profile_id,
                batch_size=perf_profile_params["batch_size"],
                core_count=perf_profile_params["core_count"],
                pruning_estimations=perf_profile_params["pruning_estimations"],
                quantized_estimations=perf_profile_params["quantized_estimations"],
                iterations_per_check=perf_profile_params["iterations_per_check"],
                warmup_iterations_per_check=perf_profile_params[
                    "warmup_iterations_per_check"
                ],
            ),
        )
        perf_profile.job = job
        perf_profile.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new perf profile, rolling back: {}".format(err)
        )
        # best-effort rollback: remove whatever was created, profile first
        for created in (perf_profile, job):
            if not created:
                continue
            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new perf profile: {}".format(
                        rollback_err
                    )
                )
        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_profile = data_dump_and_validation(
        ResponseProjectPerfProfileSchema(), {"profile": perf_profile}
    )
    _LOGGER.info("created perf profile and job: {}".format(resp_profile))

    return jsonify(resp_profile), HTTPStatus.OK.value
Ejemplo n.º 25
0
def create_loss_profile(project_id: str):
    """
    Route for creating a new loss profile for a given project.

    Creates the loss profile record and a job for it, links the two, and then
    asks the JobWorkerManager to refresh. If creation fails, the records
    created so far are deleted and the original error is re-raised.

    :param project_id: the id of the project to create a loss profile for
    :return: a tuple containing (json response, http status code)
    :raise ValidationError: if the project has no model set
    """
    _LOGGER.info(
        "creating loss profile for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)
    params = CreateProjectLossProfileSchema().load(request.get_json(force=True))

    model = project.model
    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a loss profile."
            ).format(project_id)
        )
    loss_profile = None
    job = None

    try:
        loss_profile = ProjectLossProfile.create(
            project=project, source="generated", **params
        )
        job = Job.create(
            project_id=project_id,
            type_=CreateLossProfileJobWorker.get_type(),
            worker_args=CreateLossProfileJobWorker.format_args(
                model_id=model.model_id,
                profile_id=loss_profile.profile_id,
                pruning_estimations=params["pruning_estimations"],
                pruning_estimation_type=params["pruning_estimation_type"],
                pruning_structure=params["pruning_structure"],
                quantized_estimations=params["quantized_estimations"],
            ),
        )
        loss_profile.job = job
        loss_profile.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new loss profile, rolling back: {}".format(err)
        )
        # best-effort rollback: remove whatever was created, profile first
        for created in (loss_profile, job):
            if not created:
                continue
            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new loss profile: {}".format(
                        rollback_err
                    )
                )
        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_profile = data_dump_and_validation(
        ResponseProjectLossProfileSchema(), {"profile": loss_profile}
    )
    _LOGGER.info("created loss profile and job: {}".format(resp_profile))

    return jsonify(resp_profile), HTTPStatus.OK.value