def __check_jobs_at_startup( self ):
    """
    Checks all jobs that are in the 'new', 'queued' or 'running' state in
    the database and requeues or cleans up as necessary.  Only run as the
    job handler starts.

    Only jobs whose handler is this server (``self.app.config.server_name``)
    are considered.  In case the activation is enforced
    (``self.app.config.user_activation_on``) it will filter out the jobs of
    inactive users; jobs without a user are still included.

    Each job is handled by exactly one of the following:

    - its tool is no longer in the toolbox: the job is failed;
    - a runner is assigned but no external ID was recorded: the runner is
      cleared and the job is requeued;
    - runner and external ID are set but there is no destination ID: the
      runner URL is converted to a destination and the job is recovered;
    - no runner is assigned: the job is requeued;
    - otherwise: the job is recovered with its persisted destination.

    "Requeued" means the state is reset to NEW when jobs are tracked in the
    database, and the job is put on the in-memory queue otherwise.
    """
    # Unfinished jobs that belong to this handler.
    in_flight = ( ( model.Job.state == model.Job.states.NEW )
                  | ( model.Job.state == model.Job.states.RUNNING )
                  | ( model.Job.state == model.Job.states.QUEUED ) )
    owned_and_unfinished = in_flight & ( model.Job.handler == self.app.config.server_name )
    query = self.sa_session.query( model.Job ).enable_eagerloads( False )
    if self.app.config.user_activation_on:
        # `== None` and `== True` build SQL expressions here, so they must
        # not be rewritten as `is None` / truthiness tests.
        query = query.outerjoin( model.User ).filter(
            owned_and_unfinished
            & or_( ( model.Job.user_id == None ), ( model.User.active == True ) ) )
    else:
        query = query.filter( owned_and_unfinished )
    jobs_at_startup = query.all()
    for job in jobs_at_startup:
        if job.tool_id not in self.app.toolbox.tools_by_id:
            log.warning( "(%s) Tool '%s' removed from tool config, unable to recover job" % ( job.id, job.tool_id ) )
            JobWrapper( job, self ).fail( 'This tool was disabled before the job completed. Please contact your Galaxy administrator.' )
        # `elif`, not `if`: a job that was just failed above must not also
        # be requeued or recovered by one of the branches below.
        elif job.job_runner_name is not None and job.job_runner_external_id is None:
            # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
            log.debug( "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id )
            job.job_runner_name = None
            if self.track_jobs_in_database:
                job.state = model.Job.states.NEW
            else:
                self.queue.put( ( job.id, job.tool_id ) )
        elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
            # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
            # TODO: test me extensively
            job_wrapper = JobWrapper( job, self )
            job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
            if job_destination.id is None:
                job_destination.id = 'legacy_url'
            job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
            self.dispatcher.recover( job, job_wrapper )
            log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
        elif job.job_runner_name is None:
            # Never (fully) dispatched
            log.debug( "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue" % ( job.id, job.state ) )
            if self.track_jobs_in_database:
                job.state = model.Job.states.NEW
            else:
                self.queue.put( ( job.id, job.tool_id ) )
        else:
            # Already dispatched and running
            job_wrapper = JobWrapper( job, self )
            job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
            self.dispatcher.recover( job, job_wrapper )
    # Persist any state / runner changes made above in one go.
    if self.sa_session.dirty:
        self.sa_session.flush()
def __check_if_ready_to_run(self, job):
    """
    Check if a job is ready to run by verifying that each of its input
    datasets is ready (specifically in the OK state) and that a job
    destination can be determined for it.

    When jobs are not tracked in the database, the job and its inputs are
    inspected here:

    - job in the DELETED state: return JOB_DELETED;
    - job in the ERROR state: return JOB_ADMIN_DELETED;
    - an input dataset is deleted: fail the job, return JOB_INPUT_DELETED;
    - an input dataset is in the ERROR or FAILED_METADATA state: fail the
      job, return JOB_INPUT_ERROR;
    - an input dataset is not OK: return JOB_WAIT, unless the input is in
      SETTING_METADATA and this job is the external set-metadata tool.

    Afterwards a wrapper for the job is cached in ``self.job_wrappers`` and
    its destination is resolved.  If that raises, the job is failed with the
    exception's ``failure_message`` attribute (falling back to
    DEFAULT_JOB_PUT_FAILURE_MESSAGE) and JOB_ERROR is returned.  Otherwise
    JOB_READY is returned, indicating that the job can be dispatched.
    """
    def fail_job(message):
        # Use the cached wrapper when there is one (removing it from the
        # cache), otherwise a freshly built one.
        self.job_wrappers.pop(job.id, JobWrapper(job, self)).fail(message)

    # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
    if not self.track_jobs_in_database:
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                fail_job("input data %s (file: %s) was deleted before the job started" % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                fail_job("input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                fail_job("input data %s failed to properly set metadata" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (
                    idata.state == idata.states.SETTING_METADATA
                    and job.tool_id is not None
                    and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT
    # Create the job wrapper so that the destination can be set
    if job.id not in self.job_wrappers:
        self.job_wrappers[job.id] = JobWrapper(job, self)
    # Cause the job_destination to be set and cached by the mapper
    try:
        self.job_wrappers[job.id].job_destination
    except Exception as e:
        failure_message = getattr(e, 'failure_message', DEFAULT_JOB_PUT_FAILURE_MESSAGE)
        if failure_message == DEFAULT_JOB_PUT_FAILURE_MESSAGE:
            # No custom message on the exception: this is an unexpected error
            log.exception('Failed to generate job destination')
        else:
            log.debug("Intentionally failing job with message (%s)" % failure_message)
        self.job_wrappers[job.id].fail(failure_message)
        return JOB_ERROR
    # Previously the success path fell off the end and returned None, which
    # no caller-side JOB_* comparison matches; return the documented value.
    return JOB_READY
def __check_if_ready_to_run(self, job):
    """
    Check if a job is ready to run by verifying that each of its input
    datasets is ready (specifically in the OK state), then applying the
    per-user job check and, if enabled, the disk quota check.

    When jobs are not tracked in the database, the job and its inputs are
    inspected here:

    - job in the DELETED state: return JOB_DELETED;
    - job in the ERROR state: return JOB_ADMIN_DELETED;
    - an input dataset is deleted: fail the job, return JOB_INPUT_DELETED;
    - an input dataset is in the ERROR or FAILED_METADATA state: fail the
      job, return JOB_INPUT_ERROR;
    - an input dataset is not OK: return JOB_WAIT, unless the input is in
      SETTING_METADATA and this job is the external set-metadata tool.

    Otherwise the result of ``self.__check_user_jobs(job)`` is returned,
    except that JOB_USER_OVER_QUOTA is returned instead when that result is
    JOB_READY, quotas are enabled, and the user's usage exceeds the quota.
    """
    # If tracking in the database, job.state is guaranteed to be NEW and the inputs are guaranteed to be OK
    if not self.track_jobs_in_database:
        if job.state == model.Job.states.DELETED:
            return JOB_DELETED
        elif job.state == model.Job.states.ERROR:
            return JOB_ADMIN_DELETED
        for dataset_assoc in job.input_datasets + job.input_library_datasets:
            idata = dataset_assoc.dataset
            if not idata:
                continue
            # don't run jobs for which the input dataset was deleted
            if idata.deleted:
                JobWrapper(job, self).fail(
                    "input data %s (file: %s) was deleted before the job started" % (idata.hid, idata.file_name))
                return JOB_INPUT_DELETED
            # an error in the input data causes us to bail immediately
            elif idata.state == idata.states.ERROR:
                JobWrapper(job, self).fail("input data %s is in error state" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state == idata.states.FAILED_METADATA:
                JobWrapper(job, self).fail("input data %s failed to properly set metadata" % (idata.hid))
                return JOB_INPUT_ERROR
            elif idata.state != idata.states.OK and not (
                    idata.state == idata.states.SETTING_METADATA
                    and job.tool_id is not None
                    and job.tool_id == self.app.datatypes_registry.set_external_metadata_tool.id):
                # need to requeue
                return JOB_WAIT
    state = self.__check_user_jobs(job)
    if state == JOB_READY and self.app.config.enable_quotas:
        quota = self.app.quota_agent.get_quota(job.user)
        if quota is not None:
            try:
                usage = self.app.quota_agent.get_usage(user=job.user, history=job.history)
                if usage > quota:
                    return JOB_USER_OVER_QUOTA
            except AssertionError:
                # Deliberately ignored: no history, should not happen with an anon user
                pass
    # Previously `state` was computed but never returned, so every job that
    # was not over quota yielded None instead of its readiness state.
    return state
def __check_jobs_at_startup(self):
    """
    Startup pass over this server's unfinished jobs.

    Looks at every job in the 'new', 'queued' or 'running' state whose
    handler is this server and either fails it (its tool is gone from the
    toolbox), puts it back in line (no runner, or no external ID recorded),
    or hands it to the dispatcher for recovery.  Only run as the job
    manager starts.
    """
    unfinished_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
        .filter( ( ( model.Job.state == model.Job.states.NEW )
                   | ( model.Job.state == model.Job.states.RUNNING )
                   | ( model.Job.state == model.Job.states.QUEUED ) )
                 & ( model.Job.handler == self.app.config.server_name ) )
    for job in unfinished_jobs:
        # Tool no longer available: nothing to recover, fail the job.
        if job.tool_id not in self.app.toolbox.tools_by_id:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
            JobWrapper(job, self).fail(
                'This tool was disabled before the job completed. Please contact your Galaxy administrator.'
            )
            continue

        runner_name = job.job_runner_name
        # Runner and external ID both known: let the dispatcher recover it.
        if runner_name is not None and job.job_runner_external_id is not None:
            recovered_wrapper = JobWrapper(job, self)
            self.dispatcher.recover(job, recovered_wrapper)
            continue

        # Otherwise the job has to go through the handler queue again.
        if runner_name is None:
            log.debug(
                "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                % (job.id, job.state))
        else:
            log.debug(
                "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue"
                % job.id)
        if not self.track_jobs_in_database:
            self.queue.put((job.id, job.tool_id))
        else:
            job.state = model.Job.states.NEW

    # Write out any state changes made above.
    if self.sa_session.dirty:
        self.sa_session.flush()
def __check_jobs_at_startup( self ):
    """
    Startup pass over unfinished jobs that have no handler.

    Every job in the 'new', 'queued' or 'running' state with no handler set
    is either failed (its tool is no longer in the toolbox) or assigned a
    handler via ``__get_handler``.  Only run as the job manager starts.
    """
    # `== None` is a SQL expression here, not a Python identity test.
    handlerless_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
        .filter( ( ( model.Job.state == model.Job.states.NEW )
                   | ( model.Job.state == model.Job.states.RUNNING )
                   | ( model.Job.state == model.Job.states.QUEUED ) )
                 & ( model.Job.handler == None ) )
    for job in handlerless_jobs:
        if job.tool_id in self.app.toolbox.tools_by_id:
            job.handler = self.__get_handler( job )
            # handler's recovery method will take it from here
            log.info(
                "(%d) Job in '%s' state had no handler at job manager startup, assigned '%s' handler"
                % ( job.id, job.state, job.handler ) )
        else:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to recover job"
                % ( job.id, job.tool_id ) )
            JobWrapper( job, self ).fail(
                'This tool was disabled before the job completed. Please contact your Galaxy administrator.' )
    # Write out the handler assignments made above.
    if self.sa_session.dirty:
        self.sa_session.flush()
def job_wrapper(self, job, use_persisted_destination=False):
    """
    Build a JobWrapper for ``job`` with this object as its second argument.

    ``use_persisted_destination`` is passed through unchanged to the
    JobWrapper constructor.
    """
    wrapper = JobWrapper(
        job,
        self,
        use_persisted_destination=use_persisted_destination,
    )
    return wrapper
def _wrapper(self):
    """Return a new JobWrapper built from ``self.job`` and ``self.queue``."""
    wrapper = JobWrapper(self.job, self.queue)
    return wrapper
def __check_jobs_at_startup(self):
    """
    Checks all jobs that are in the 'new', 'queued' or 'running' state in
    the database and requeues or cleans up as necessary.  Only run as the
    job handler starts.

    Only jobs whose handler is this server (``self.app.config.server_name``)
    are considered.  Each job is handled by exactly one of the following:

    - its tool is no longer in the toolbox: the job is failed;
    - a runner is assigned but no external ID was recorded: the runner is
      cleared and the job is requeued;
    - runner and external ID are set but there is no destination ID: the
      runner URL is converted to a destination and the job is recovered;
    - no runner is assigned: the job is requeued;
    - otherwise: the job is recovered with its persisted destination.

    "Requeued" means the state is reset to NEW when jobs are tracked in the
    database, and the job is put on the in-memory queue otherwise.
    """
    unfinished_jobs = self.sa_session.query( model.Job ).enable_eagerloads( False ) \
        .filter( ( ( model.Job.state == model.Job.states.NEW )
                   | ( model.Job.state == model.Job.states.RUNNING )
                   | ( model.Job.state == model.Job.states.QUEUED ) )
                 & ( model.Job.handler == self.app.config.server_name ) )
    for job in unfinished_jobs:
        if job.tool_id not in self.app.toolbox.tools_by_id:
            log.warning(
                "(%s) Tool '%s' removed from tool config, unable to recover job" % (job.id, job.tool_id))
            JobWrapper(job, self).fail(
                'This tool was disabled before the job completed. Please contact your Galaxy administrator.'
            )
        # `elif`, not `if`: a job that was just failed above must not also
        # be requeued or recovered by one of the branches below.
        elif job.job_runner_name is not None and job.job_runner_external_id is None:
            # This could happen during certain revisions of Galaxy where a runner URL was persisted before the job was dispatched to a runner.
            log.debug(
                "(%s) Job runner assigned but no external ID recorded, adding to the job handler queue" % job.id)
            job.job_runner_name = None
            if self.track_jobs_in_database:
                job.state = model.Job.states.NEW
            else:
                self.queue.put((job.id, job.tool_id))
        elif job.job_runner_name is not None and job.job_runner_external_id is not None and job.destination_id is None:
            # This is the first start after upgrading from URLs to destinations, convert the URL to a destination and persist
            # TODO: test me extensively
            job_wrapper = JobWrapper(job, self)
            job_destination = self.dispatcher.url_to_destination(job.job_runner_name)
            if job_destination.id is None:
                job_destination.id = 'legacy_url'
            job_wrapper.set_job_destination(job_destination, job.job_runner_external_id)
            self.dispatcher.recover(job, job_wrapper)
            log.info('(%s) Converted job from a URL to a destination and recovered' % (job.id))
        elif job.job_runner_name is None:
            # Never (fully) dispatched
            log.debug(
                "(%s) No job runner assigned and job still in '%s' state, adding to the job handler queue"
                % (job.id, job.state))
            if self.track_jobs_in_database:
                job.state = model.Job.states.NEW
            else:
                self.queue.put((job.id, job.tool_id))
        else:
            # Already dispatched and running
            job_wrapper = JobWrapper(job, self)
            job_wrapper.job_runner_mapper.cached_job_destination = JobDestination(
                id=job.destination_id, runner=job.job_runner_name, params=job.destination_params)
            self.dispatcher.recover(job, job_wrapper)
    # Persist any state / runner changes made above in one go.
    if self.sa_session.dirty:
        self.sa_session.flush()
def __monitor_step(self):
    """
    Run one pass of the monitor loop (called repeatedly by `monitor`).

    Collects the jobs to examine -- from the database when
    `track_jobs_in_database` is set, otherwise the previously waiting jobs
    plus everything currently on the in-memory queue -- and asks
    `__check_if_ready_to_run` about each one:

    - ready jobs are handed to the dispatcher;
    - jobs that must wait (or come back in an unknown state) are kept on
      the waiting list, in-memory mode only;
    - jobs whose user is over quota are paused along with their outputs;
    - input error / input deleted / deleted outcomes are only logged.

    The session is flushed and removed at the end.  If the stop signal is
    read from the queue the method returns immediately instead.
    """
    # Pull all new jobs from the queue at once
    candidates = []
    if self.track_jobs_in_database:
        # Clear the session so we get fresh states for job and all datasets
        self.sa_session.expunge_all()
        # NEW jobs with at least one history input that is not usable yet
        hda_problem = or_(
            (model.HistoryDatasetAssociation._state == model.HistoryDatasetAssociation.states.FAILED_METADATA),
            (model.HistoryDatasetAssociation.deleted == True),
            (model.Dataset.state != model.Dataset.states.OK),
            (model.Dataset.deleted == True))
        blocked_by_hda = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
            .join(model.JobToInputDatasetAssociation) \
            .join(model.HistoryDatasetAssociation) \
            .join(model.Dataset) \
            .filter(and_((model.Job.state == model.Job.states.NEW), hda_problem)) \
            .subquery()
        # NEW jobs with at least one library input that is not usable yet
        ldda_problem = or_(
            (model.LibraryDatasetDatasetAssociation._state != None),
            (model.LibraryDatasetDatasetAssociation.deleted == True),
            (model.Dataset.state != model.Dataset.states.OK),
            (model.Dataset.deleted == True))
        blocked_by_ldda = self.sa_session.query(model.Job.id).enable_eagerloads(False) \
            .join(model.JobToInputLibraryDatasetAssociation) \
            .join(model.LibraryDatasetDatasetAssociation) \
            .join(model.Dataset) \
            .filter(and_((model.Job.state == model.Job.states.NEW), ldda_problem)) \
            .subquery()
        # Fetch this handler's NEW jobs that are in neither set, oldest first
        runnable = and_(
            (model.Job.state == model.Job.states.NEW),
            (model.Job.handler == self.app.config.server_name),
            ~model.Job.table.c.id.in_(blocked_by_hda),
            ~model.Job.table.c.id.in_(blocked_by_ldda))
        candidates = self.sa_session.query(model.Job).enable_eagerloads(False) \
            .filter(runnable) \
            .order_by(model.Job.id) \
            .all()
        # Ensure that we get new job counts on each iteration
        self.__clear_user_job_count()
    else:
        # Start with the jobs that were still waiting after the last pass
        for waiting_id in self.waiting_jobs:
            candidates.append(self.sa_session.query(model.Job).get(waiting_id))
        # Then drain the in-memory queue until it reports Empty
        try:
            while True:
                message = self.queue.get_nowait()
                if message is self.STOP_SIGNAL:
                    return
                # Messages are (job id, tool id) pairs; only the id is needed
                queued_id, _tool_id = message
                candidates.append(self.sa_session.query(model.Job).get(queued_id))
        except Empty:
            pass
    # Iterate over new and waiting jobs and look for any that are
    # ready to run
    still_waiting = []
    for job in candidates:
        try:
            # Check the job's dependencies, requeue if they're not done.
            # Some of these states will only happen when using the in-memory job queue
            readiness = self.__check_if_ready_to_run(job)
            if readiness == JOB_WAIT:
                if not self.track_jobs_in_database:
                    still_waiting.append(job.id)
            elif readiness == JOB_INPUT_ERROR:
                log.info("(%d) Job unable to run: one or more inputs in error state" % job.id)
            elif readiness == JOB_INPUT_DELETED:
                log.info("(%d) Job unable to run: one or more inputs deleted" % job.id)
            elif readiness == JOB_READY:
                self.dispatcher.put(JobWrapper(job, self))
                log.info("(%d) Job dispatched" % job.id)
            elif readiness == JOB_DELETED:
                log.info("(%d) Job deleted by user while still queued" % job.id)
            elif readiness == JOB_ADMIN_DELETED:
                log.info("(%d) Job deleted by admin while still queued" % job.id)
            elif readiness == JOB_USER_OVER_QUOTA:
                log.info("(%d) User (%s) is over quota: job paused" % (job.id, job.user_id))
                job.state = model.Job.states.PAUSED
                # Pause every output of the job as well
                for output_assoc in job.output_datasets + job.output_library_datasets:
                    output_instance = output_assoc.dataset
                    output_instance.dataset.state = model.Dataset.states.PAUSED
                    output_instance.info = "Execution of this dataset's job is paused because you were over your disk quota at the time it was ready to run"
                    self.sa_session.add(output_instance.dataset)
                self.sa_session.add(job)
            else:
                log.error("(%d) Job in unknown state '%s'" % (job.id, readiness))
                if not self.track_jobs_in_database:
                    still_waiting.append(job.id)
        except Exception:
            # One bad job must not stop the rest of the pass
            log.exception("failure running job %d" % job.id)
    # Update the waiting list
    self.waiting_jobs = still_waiting
    # Flush, if we updated the state
    self.sa_session.flush()
    # Done with the session
    self.sa_session.remove()
def _wrapper(self):
    """Return a new JobWrapper built from ``self.job`` and ``self.queue``."""
    wrapper = JobWrapper(self.job, self.queue)  # type: ignore[arg-type]
    return wrapper