def refresh(self):
    """
    Refresh the available jobs and put any pending ones since last refresh
    onto the ThreadPoolExecutor.

    Only jobs that are pending AND not yet acknowledged by the worker are
    submitted; once submitted they are marked with ``worker_ack=True`` so a
    later refresh does not submit them a second time.
    """
    # lock to make sure the queries are safe across threads and jobs are unique
    # should be done with a DB transaction, but current setup with
    # peewee and the connection pooling does not support transactions
    with self._refresh_lock:
        _LOGGER.info("Refreshing JobWorkerManager")
        # peewee expressions must be combined with `&`; Python's `and` would
        # evaluate to a single operand and silently drop the other filter
        query = (
            Job.select()
            .where(
                (Job.status == JobStatus.pending)
                & (Job.worker_ack == False)  # noqa: E712
            )
            .order_by(Job.created)
        )
        # materialize once so the submitted jobs and the acked ids are
        # guaranteed to be the same set
        pending_jobs = list(query)
        job_ids = [job.job_id for job in pending_jobs]
        _LOGGER.info(f"Found {len(job_ids)} pending jobs, adding to threadpool")

        for job in pending_jobs:
            _LOGGER.debug(f"Adding job {job.job_id} to threadpool")
            self._pool.submit(self._execute_job, str(job.job_id), self._canceled)

        _LOGGER.debug("Updating jobs in db to ack that worker received them")
        if job_ids:
            # nothing to ack when no jobs were found, skip the needless query
            Job.update(worker_ack=True).where(Job.job_id.in_(job_ids)).execute()
def _update_job_error(self, job: Job, err: Any):
    """
    Mark the given job as errored in the DB and record the error on it.

    :param job: the job that failed
    :param err: the error that caused the failure, stored on the job
    """
    _LOGGER.debug(f"Job {job.job_id} errored, saving to DB")
    # an errored job must be recorded as error, not canceled, so it can be
    # told apart from jobs that were canceled on request
    job.status = JobStatus.error
    job.error = err
    job.progress = None
    job.save()
    _LOGGER.warning(f"Job {job.job_id} errored out {err}")
def _report_error(self, job: Job, error: str):
    """
    Record an error both on this instance and on the job's DB row.

    :param job: the job to mark as errored
    :param error: the error message to store
    """
    # instance state is flagged before the DB write
    self._errored, self._error = True, error

    job.status = JobStatus.error
    job.error = error
    job.progress = None
    job.save()
def app_startup(self):
    """
    Handle app startup to clear uncompleted state for jobs and begin running
    """
    # cancel any jobs that were left in an uncompleted state
    with database.connection_context():
        # the update query is lazy: without execute() nothing is written
        (
            Job.update(status=JobStatus.canceled)
            .where(Job.status == JobStatus.started)
            .execute()
        )

    self.refresh()
def _update_job_progress(self, job: Job, state: _JobExecutionState, progress: Dict):
    """
    Save the latest progress for a job to the DB, rate limited.

    :param job: the job the progress belongs to
    :param state: execution state holding the time of the last progress save
    :param progress: the progress payload to store on the job
    """
    # update the progress max 5 times a second to not hammer the DB
    last_saved = state.db_progress_saved_time
    if last_saved and time.time() - last_saved < 0.2:
        return

    _LOGGER.debug(f"Job {job.job_id} saving progress to DB")
    job.progress = progress
    job.save()
    state.db_progress_saved_time = time.time()
def _load_next_pending_helper() -> Union[None, JobWorkerWrapper]:
    """
    Load the oldest pending job from the DB and wrap a worker for it.

    :return: a JobWorkerWrapper for the next pending job,
        or None if there are no pending jobs
    :raise ValueError: if the job's type is not in the worker registry;
        any error while creating the worker is saved on the job
        (status set to error) and then re-raised
    """
    with database.connection_context():
        query = (
            Job.select()
            .where(Job.status == JobStatus.pending)
            .order_by(Job.created)
            .limit(1)
        )
        next_job = next(iter(query), None)  # type: Union[None, Job]

        if next_job is None:
            return None

        try:
            if next_job.type_ not in JobWorkerRegistryHolder.REGISTRY:
                raise ValueError("Cannot find job of type {}".format(next_job.type_))

            worker_cls = JobWorkerRegistryHolder.REGISTRY[next_job.type_]
            # build the worker from next_job explicitly instead of relying on
            # a loop variable that happened to still be in scope
            worker = worker_cls(
                next_job.job_id, next_job.project_id, **next_job.worker_args
            )

            return JobWorkerWrapper(worker)
        except Exception as err:
            next_job.error = str(err)
            next_job.status = JobStatus.error
            next_job.save()
            # bare raise keeps the original traceback intact
            raise
def cancel_job(self, job_id: str):
    """
    Cancel a job with the given job_id so it won't be run.

    Errors other than the two documented below are logged as a warning
    and are not re-raised.

    :param job_id: the job_id to cancel
    :raise JobNotFoundError: if the job could not be found in the database
    :raise JobCancelationFailureError: if the job could not be canceled
    """
    _LOGGER.info("Canceling job {}".format(job_id))

    try:
        _LOGGER.debug(f"Getting job {job_id} from DB")
        job = Job.get_or_none(Job.job_id == job_id)

        if job is None:
            _LOGGER.error(f"Could not find job {job_id} to cancel")
            raise JobNotFoundError()

        # jobs that already reached one of these statuses cannot be canceled
        finished_statuses = (
            JobStatus.error,
            JobStatus.completed,
            JobStatus.canceled,
        )
        if any(job.status == finished for finished in finished_statuses):
            _LOGGER.error(f"Could not cancel job {job_id} with status {job.status}")
            raise JobCancelationFailureError(
                "Job with status {} cannot be canceled".format(job.status)
            )

        self._update_job_canceled(job)
    except (JobNotFoundError, JobCancelationFailureError):
        # documented errors go straight through to the caller
        raise
    except Exception as err:
        _LOGGER.warning(f"Error while canceling job {job_id} in db: {err}")
def load_model_from_repo(project_id: str) -> Tuple[Response, int]:
    """
    Route for loading a model for a project from the Neural Magic model repo.
    Starts a background job in the JobWorker setup to run.
    The state of the job can be checked after.
    Raises an HTTPNotFoundError if the project is not found in the database.

    :param project_id: the id of the project to load the model for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "loading model from repo for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = _add_model_check(project_id)
    data = SetProjectModelFromSchema().load(request.get_json(force=True))
    project_model = None
    job = None

    try:
        project_model = ProjectModel.create(
            project=project, source="downloaded_repo", job=None
        )
        job = Job.create(
            project_id=project.project_id,
            type_=ModelFromRepoJobWorker.get_type(),
            worker_args=ModelFromRepoJobWorker.format_args(
                model_id=project_model.model_id,
                uri=data["uri"],
            ),
        )
        project_model.job = job
        project_model.save()
        project_model.setup_filesystem()
        project_model.validate_filesystem()
    except Exception as err:
        _LOGGER.error(
            "error while creating new project model, rolling back: {}".format(err)
        )

        # best-effort rollback: remove whatever was created, model first
        for created in (project_model, job):
            if not created:
                continue

            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new model: {}".format(rollback_err)
                )

        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_model = data_dump_and_validation(
        ResponseProjectModelSchema(), {"model": project_model}
    )
    _LOGGER.info("created project model from repo {}".format(resp_model))

    return jsonify(resp_model), HTTPStatus.OK.value
def _cancel_pending_jobs(self):
    """
    Cancel every job left in a started or pending status, recording on each
    that it was stranded and canceled on server startup.
    """
    _LOGGER.debug("Canceling any pending jobs")
    # peewee expressions must be combined with `|`; Python's `or` would
    # evaluate to the first operand only and leave pending jobs untouched
    query = Job.update(
        status=JobStatus.canceled,
        error=(
            "Job was left in a stranded state and did not complete on last run, "
            "canceled on server startup"
        ),
    ).where((Job.status == JobStatus.started) | (Job.status == JobStatus.pending))
    row_count = query.execute()

    if row_count > 0:
        _LOGGER.info(f"Canceled {row_count} stranded jobs")
def _execute_job(self, job_id: str, canceled: _LockedVar[bool]):
    """
    Run a single job to completion, updating its DB state along the way.

    The job is loaded, marked started, and its worker is iterated; each
    yielded progress is saved (rate limited) and cancelation is checked
    before every progress save. The final state saved is one of
    completed, canceled, or error.

    :param job_id: the id of the job to execute
    :param canceled: locked flag that, when set, requests cancelation
    """
    _LOGGER.info(f"Starting job {job_id} in JobWorkerManager")
    exec_state = _JobExecutionState()
    job = None

    try:
        _LOGGER.debug(f"Getting job {job_id} from DB")
        job = Job.get(Job.job_id == job_id)

        if self._check_cancel_job(job, canceled, exec_state):
            _LOGGER.debug(
                f"Job {job_id} cancel requested before starting, canceling"
            )
            raise JobCancelError()

        self._update_job_started(job)

        _LOGGER.debug(f"Creating worker for job {job_id}")
        job_worker = JobWorkerRegistry.create_worker(job)  # type: JobWorker

        _LOGGER.debug(f"Starting worker run for job {job_id}")
        for latest_progress in job_worker.run():
            if self._check_cancel_job(job, canceled, exec_state):
                _LOGGER.debug(f"Job {job_id} cancel requested, canceling")
                raise JobCancelError()

            self._update_job_progress(job, exec_state, latest_progress)

        self._update_job_completed(job)
    except JobCancelError:
        if not job:
            raise RuntimeError("job is None after JobCancelError")

        self._update_job_canceled(job)
        _LOGGER.info(f"Job {job_id} canceled in JobWorkerManager")
    except Exception as err:
        # try to update the job in the DB in case the job doesn't exist
        # or the job was deleted from the DB
        try:
            self._update_job_error(job, err)
        except Exception as save_err:
            _LOGGER.warning(
                f"Could not update job state in db to errored "
                f"for job {job_id}: {save_err}: for error {err}"
            )
def _check_cancel_job(
    self, job: Job, canceled: _LockedVar[bool], state: _JobExecutionState
) -> bool:
    """
    Decide whether the given job should stop running.

    :param job: the job to check
    :param canceled: locked flag that, when set, requests cancelation
    :param state: execution state holding the time of the last DB check
    :return: True if the job should be canceled, False otherwise
    """
    # cancel if overall system is being shutdown
    if canceled.get() or not threading.main_thread().is_alive():
        return True

    # refresh job state at maximum one second intervals to see if job was canceled
    last_checked = state.db_canceled_check_time
    if last_checked and time.time() - last_checked < 1.0:
        return False

    refreshed_job = job.refresh()
    state.db_canceled_check_time = time.time()

    return refreshed_job.status == JobStatus.canceled
def get_job(job_id: str):
    """
    Route for getting a job matching the given job_id.
    Raises an HTTPNotFoundError if the job is not found in the database.

    :param job_id: the id of the job to get
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info("getting job {}".format(job_id))
    found_job = Job.get_or_none(Job.job_id == job_id)

    if found_job is None:
        raise HTTPNotFoundError("could not find job with job_id {}".format(job_id))

    # dump through the response schema, then validate the dumped payload
    schema = ResponseJobSchema()
    resp_job = schema.dump({"job": found_job})
    schema.validate(resp_job)
    _LOGGER.info("retrieved job {}".format(resp_job))

    return jsonify(resp_job), HTTPStatus.OK.value
def cancel_job(self, job_id: str):
    """
    Cancel a job with the given job_id so it won't be run.
    Blocks until the job can be canceled.

    :param job_id: the job_id to cancel
    :raise JobNotFoundError: if the job could not be found in the database
    :raise JobCancelationFailureError: if the job could not be canceled
    """
    _LOGGER.info("Canceling job with id {}".format(job_id))

    with self._lock:
        # the currently tracked job is canceled through its own cancel()
        running = self._current
        if running is not None and running.job_id == job_id:
            running.cancel()
            return

        # otherwise the job is canceled directly in the DB
        with database.connection_context():
            job = Job.get_or_none(Job.job_id == job_id)

            if job is None:
                _LOGGER.error("Could not find job with id {}".format(job_id))
                raise JobNotFoundError(
                    "Could not find job with id {}".format(job_id)
                )

            finished_statuses = (
                JobStatus.error,
                JobStatus.completed,
                JobStatus.canceled,
            )
            if any(job.status == finished for finished in finished_statuses):
                _LOGGER.error(
                    "Could not cancel job with status {}".format(job.status)
                )
                raise JobCancelationFailureError(
                    "Job with status {} cannot be canceled".format(job.status)
                )

            job.status = JobStatus.canceled
            job.save()
def get_jobs():
    """
    Route for getting a list of jobs filtered by the flask request args

    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info("getting jobs for request args {}".format(request.args))
    args = SearchJobsSchema().load(dict(request.args.items()))

    query = Job.select()
    if "project_id" in args and args["project_id"]:
        query = query.where(Job.project == args["project_id"])

    # NOTE(review): the order_by field name comes from the request args;
    # presumably SearchJobsSchema restricts it to valid Job fields - confirm
    order_by = getattr(Job, args["order_by"])
    if args["order_desc"]:
        order_by = order_by.desc()
    query = query.order_by(order_by).paginate(args["page"], args["page_length"])
    jobs = list(query)

    resp_schema = ResponseJobsSchema()
    resp_jobs = resp_schema.dump({"jobs": jobs})
    resp_schema.validate(resp_jobs)
    # log the number of jobs found; len() of the dumped response would only
    # count the keys of the response dict
    _LOGGER.info("retrieved {} jobs".format(len(jobs)))

    return jsonify(resp_jobs), HTTPStatus.OK.value
def _update_job_completed(self, job: Job):
    """
    Mark the given job as completed in the DB and clear its progress.

    :param job: the job that finished running
    """
    _LOGGER.debug(f"Job {job.job_id} completed, saving DB state")
    job.status, job.progress = JobStatus.completed, None
    job.save()
    _LOGGER.info(f"Job {job.job_id} completed in JobWorkerManager")
def _update_job_canceled(self, job: Job):
    """
    Mark the given job as canceled in the DB and clear its progress.

    :param job: the job to cancel
    """
    _LOGGER.debug(f"Job {job.job_id} cancel requested, saving in DB")
    job.status, job.progress = JobStatus.canceled, None
    job.save()
    _LOGGER.info(f"Job {job.job_id} canceled in DB")
def _update_job_started(self, job: Job):
    """
    Mark the given job as started in the DB.

    :param job: the job that is about to run
    """
    _LOGGER.debug(f"Updating job {job.job_id} to started status")
    setattr(job, "status", JobStatus.started)
    job.save()
def create_benchmark(project_id: str):
    """
    Route for creating a new benchmark for a given project.
    Raises an HTTPNotFoundError if the project is not found in the database.

    :param project_id: the id of the project to create a benchmark for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "creating benchmark for project {} for request json {}".format(
            project_id, request.get_json()
        )
    )
    project = get_project_by_id(project_id)
    benchmark_params = CreateProjectBenchmarkSchema().load(
        request.get_json(force=True)
    )
    model = project.model

    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a benchmark."
            ).format(project_id)
        )

    sys_info = get_ml_sys_info()
    benchmark = None
    job = None

    try:
        # fall back to no instruction sets when the system info lacks them
        if "available_instructions" in sys_info:
            benchmark_params["instruction_sets"] = sys_info["available_instructions"]
        else:
            benchmark_params["instruction_sets"] = []

        benchmark = ProjectBenchmark.create(
            project=project, source="generated", **benchmark_params
        )
        job = Job.create(
            project_id=project.project_id,
            type_=CreateBenchmarkJobWorker.get_type(),
            worker_args=CreateBenchmarkJobWorker.format_args(
                model_id=model.model_id,
                benchmark_id=benchmark.benchmark_id,
                core_counts=benchmark.core_counts,
                batch_sizes=benchmark.batch_sizes,
                instruction_sets=benchmark.instruction_sets,
                inference_models=benchmark.inference_models,
                warmup_iterations_per_check=benchmark.warmup_iterations_per_check,
                iterations_per_check=benchmark.iterations_per_check,
            ),
        )
        benchmark.job = job
        benchmark.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new benchmark, rolling back: {}".format(err)
        )

        # best-effort rollback: remove whatever was created, benchmark first
        for created in (benchmark, job):
            if not created:
                continue

            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new benchmark: {}".format(rollback_err)
                )

        raise err

    JobWorkerManager().refresh()

    resp_benchmark = data_dump_and_validation(
        ResponseProjectBenchmarkSchema(), {"benchmark": benchmark}
    )
    _LOGGER.info("created benchmark and job: {}".format(resp_benchmark))

    return jsonify(resp_benchmark), HTTPStatus.OK.value
def _report_started(self, job: Job):
    """
    Flag this instance as started and save the started status on the job.

    :param job: the job to mark as started
    """
    self._started = True
    setattr(job, "status", JobStatus.started)
    job.save()
def _worker_thread(self):
    """
    Run the wrapped worker and record its lifecycle on the job in the DB.

    Marks the job as started, iterates over the worker's run() to collect
    progress, and finally reports the job as canceled, errored, or completed
    before invoking the done callback.

    NOTE(review): presumably the target function of the wrapper's
    background thread - confirm against the code that starts it.
    """
    _LOGGER.debug(
        "job worker for job_id {} and project_id {} thead init".format(
            self._worker.job_id, self._worker.project_id
        )
    )

    with database.connection_context():
        with self._lock:
            job = Job.get(Job.job_id == self._worker.job_id)
            self._report_started(job)

        # outcome flags, resolved after the worker stops running
        canceled = False
        error = None

        try:
            # initial check to see if job was canceled before it started
            if self._should_cancel():
                raise JobCancelError()

            for progress in self._worker.run():
                # progress reporting and the cancel check run under the lock
                with self._lock:
                    if self._should_report_progress():
                        self._report_progress(job, progress)

                    if self._should_cancel():
                        raise JobCancelError()
        except JobCancelError:
            canceled = True
            _LOGGER.debug(
                "cancel job worker for job_id {} and project_id {} received".format(
                    self._worker.job_id, self._worker.project_id
                )
            )
        except Exception as err:
            # any other failure is kept and reported on the job below
            _LOGGER.info(
                (
                    "job worker for job_id {} and project_id {} "
                    "encountered error: {}"
                ).format(self._worker.job_id, self._worker.project_id, err)
            )
            error = err

        # save exactly one final state for the job: canceled, error, or completed
        with self._lock:
            self._start_completed = True

            if canceled:
                self._report_canceled(job)
                _LOGGER.info(
                    "canceled job worker for job_id {} and project_id {}".format(
                        self._worker.job_id, self._worker.project_id
                    )
                )
            elif error is not None:
                self._report_error(job, str(error))
                _LOGGER.info(
                    "errored job worker for job_id {} and project_id {}".format(
                        self._worker.job_id, self._worker.project_id
                    )
                )
            else:
                self._report_completed(job)
                _LOGGER.info(
                    "completed job worker for job_id {} and project_id {}".format(
                        self._worker.job_id, self._worker.project_id
                    )
                )

    # called last, after the final job state has been reported
    self._done_callback()
def _report_progress(self, job: Job, progress: Dict[str, Any]):
    """
    Store the latest progress on this instance and save it on the job.

    :param job: the job the progress belongs to
    :param progress: the progress payload to store
    """
    # keep the in-memory copy and its timestamp alongside the DB write
    self._progress, self._progress_time = progress, time.time()
    job.progress = progress
    job.save()
def _report_canceled(self, job: Job):
    """
    Flag this instance as canceled and save the canceled status on the job,
    clearing any stored progress.

    :param job: the job to mark as canceled
    """
    self._canceled = True
    job.status, job.progress = JobStatus.canceled, None
    job.save()
def load_data_from_repo(project_id: str):
    """
    Route for loading data file(s) for a project from the Neural Magic model repo.
    Starts a background job in the JobWorker setup to run.
    The state of the job can be checked after.
    Raises an HTTPNotFoundError if the project is not found in the database.

    :param project_id: the id of the project to load the data for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "loading data from repo for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)
    # return value is unused; called only for the lookup itself
    get_project_model_by_project_id(project_id)
    data = SetProjectDataFromSchema().load(request.get_json(force=True))
    project_data = None
    job = None

    try:
        project_data = ProjectData.create(
            project=project, source="downloaded_path", job=None
        )
        job = Job.create(
            project_id=project.project_id,
            type_=DataFromRepoJobWorker.get_type(),
            worker_args=DataFromRepoJobWorker.format_args(
                data_id=project_data.data_id, uri=data["uri"]
            ),
        )
        project_data.job = job
        project_data.save()
        project_data.setup_filesystem()
        project_data.validate_filesystem()
    except Exception as err:
        # best-effort rollback: remove the data file, then the created records
        if project_data:
            try:
                os.remove(project_data.file_path)
            except OSError:
                pass

        for created in (project_data, job):
            if not created:
                continue

            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new data: {}".format(rollback_err)
                )

        _LOGGER.error(
            "error while creating new project data, rolling back: {}".format(err)
        )
        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_data = data_dump_and_validation(
        ResponseProjectDataSingleSchema(), {"data": project_data}
    )
    _LOGGER.info("created project data from path {}".format(resp_data))

    return jsonify(resp_data), HTTPStatus.OK.value
def create_perf_profile(project_id: str):
    """
    Route for creating a new perf profile for a given project.
    Raises an HTTPNotFoundError if the project is not found in the database.

    :param project_id: the id of the project to create a perf profile for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "creating perf profile for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)
    perf_profile_params = CreateProjectPerfProfileSchema().load(
        request.get_json(force=True)
    )
    sys_info = get_ml_sys_info()

    if not perf_profile_params["core_count"] or perf_profile_params["core_count"] < 1:
        # guard the lookup so missing system info falls through to the -1
        # default below instead of raising a KeyError
        perf_profile_params["core_count"] = (
            sys_info["cores_per_socket"] if "cores_per_socket" in sys_info else None
        )

        if not perf_profile_params["core_count"]:
            # extra check in case the system couldn't get cores_per_socket
            perf_profile_params["core_count"] = -1

    # same guarded lookup as create_benchmark: no instruction sets when the
    # system info does not report any
    perf_profile_params["instruction_sets"] = (
        sys_info["available_instructions"]
        if "available_instructions" in sys_info
        else []
    )

    model = project.model
    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a perf profile."
            ).format(project_id)
        )

    perf_profile = None
    job = None

    try:
        perf_profile = ProjectPerfProfile.create(
            project=project, source="generated", **perf_profile_params
        )
        job = Job.create(
            project_id=project_id,
            type_=CreatePerfProfileJobWorker.get_type(),
            worker_args=CreatePerfProfileJobWorker.format_args(
                model_id=model.model_id,
                profile_id=perf_profile.profile_id,
                batch_size=perf_profile_params["batch_size"],
                core_count=perf_profile_params["core_count"],
                pruning_estimations=perf_profile_params["pruning_estimations"],
                quantized_estimations=perf_profile_params["quantized_estimations"],
                iterations_per_check=perf_profile_params["iterations_per_check"],
                warmup_iterations_per_check=perf_profile_params[
                    "warmup_iterations_per_check"
                ],
            ),
        )
        perf_profile.job = job
        perf_profile.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new perf profile, rolling back: {}".format(err)
        )

        # best-effort rollback: remove whatever was created, profile first
        for created in (perf_profile, job):
            if not created:
                continue

            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new perf profile: {}".format(
                        rollback_err
                    )
                )

        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_profile = data_dump_and_validation(
        ResponseProjectPerfProfileSchema(), {"profile": perf_profile}
    )
    _LOGGER.info("created perf profile and job: {}".format(resp_profile))

    return jsonify(resp_profile), HTTPStatus.OK.value
def create_loss_profile(project_id: str):
    """
    Route for creating a new loss profile for a given project.
    Raises an HTTPNotFoundError if the project is not found in the database.

    :param project_id: the id of the project to create a loss profile for
    :return: a tuple containing (json response, http status code)
    """
    _LOGGER.info(
        "creating loss profile for project {} for request json {}".format(
            project_id, request.json
        )
    )
    project = get_project_by_id(project_id)
    loss_profile_params = CreateProjectLossProfileSchema().load(
        request.get_json(force=True)
    )
    model = project.model

    if model is None:
        raise ValidationError(
            (
                "A model has not been set for the project with id {}, "
                "project must set a model before running a loss profile."
            ).format(project_id)
        )

    loss_profile = None
    job = None

    try:
        loss_profile = ProjectLossProfile.create(
            project=project, source="generated", **loss_profile_params
        )
        job = Job.create(
            project_id=project_id,
            type_=CreateLossProfileJobWorker.get_type(),
            worker_args=CreateLossProfileJobWorker.format_args(
                model_id=model.model_id,
                profile_id=loss_profile.profile_id,
                pruning_estimations=loss_profile_params["pruning_estimations"],
                pruning_estimation_type=loss_profile_params[
                    "pruning_estimation_type"
                ],
                pruning_structure=loss_profile_params["pruning_structure"],
                quantized_estimations=loss_profile_params["quantized_estimations"],
            ),
        )
        loss_profile.job = job
        loss_profile.save()
    except Exception as err:
        _LOGGER.error(
            "error while creating new loss profile, rolling back: {}".format(err)
        )

        # best-effort rollback: remove whatever was created, profile first
        for created in (loss_profile, job):
            if not created:
                continue

            try:
                created.delete_instance()
            except Exception as rollback_err:
                _LOGGER.error(
                    "error while rolling back new loss profile: {}".format(
                        rollback_err
                    )
                )

        raise err

    # call into JobWorkerManager to kick off job if it's not already running
    JobWorkerManager().refresh()

    resp_profile = data_dump_and_validation(
        ResponseProjectLossProfileSchema(), {"profile": loss_profile}
    )
    _LOGGER.info("created loss profile and job: {}".format(resp_profile))

    return jsonify(resp_profile), HTTPStatus.OK.value