def flag_set(section, setting):
    """True if the given boolean setting is enabled in openquake.cfg

    :param string section: name of the configuration file section
    :param string setting: name of the configuration file setting
    :returns: True if the setting is enabled in openquake.cfg, False
        otherwise
    """
    setting = get(section, setting)
    if setting is None:
        return False
    return general.str2bool(setting)
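
# A minimal usage sketch for flag_set (assuming an openquake.cfg with a
# [celery] section; the setting name is taken from elsewhere in this module):
#
#     if flag_set('celery', 'terminate_workers_on_revoke'):
#         # the setting is present and set to a truthy value
#         ...
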
def check_nodes(self):
    """
    Check that the expected celery nodes are all up. The loop continues
    for as long as the job keeps running.
    """
    while self.job_is_running(sleep=self.interval):
        live_nodes = self.ping(timeout=self.interval)
        if live_nodes < self.live_nodes:
            dead_nodes = list(self.live_nodes - live_nodes)
            logs.LOG.critical(
                'Cluster nodes not accessible: %s', dead_nodes)
            terminate = general.str2bool(
                config.get('celery', 'terminate_job_when_celery_is_down'))
            if terminate:
                os.kill(os.getpid(), signal.SIGABRT)  # commit suicide
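
# Sketch of the set arithmetic used in check_nodes (node names here are
# illustrative): `<` on sets is a proper-subset test, and set subtraction
# yields the nodes that stopped answering pings.
#
#     >>> expected = set(['worker1', 'worker2', 'worker3'])
#     >>> alive = set(['worker1', 'worker3'])
#     >>> alive < expected
#     True
#     >>> sorted(expected - alive)
#     ['worker2']
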
def no_distribute():
    """
    Check the `OQ_NO_DISTRIBUTE` environment variable to determine if
    calculations should be distributed or not.

    :returns:
        `True` if the envvar value is "true", "yes", "t", or "1",
        regardless of case. Otherwise, return `False`.

        If the variable is undefined, it defaults to `False`.
    """
    nd = os.environ.get(NO_DISTRIBUTE_VAR)
    if nd is None:
        return False
    else:
        return general_utils.str2bool(nd)
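
# A minimal usage sketch (assuming NO_DISTRIBUTE_VAR == 'OQ_NO_DISTRIBUTE',
# as the docstring indicates):
#
#     >>> os.environ['OQ_NO_DISTRIBUTE'] = 'yes'
#     >>> no_distribute()
#     True
#     >>> del os.environ['OQ_NO_DISTRIBUTE']
#     >>> no_distribute()
#     False
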
from openquake import hazardlib
from openquake import risklib
from openquake import nrmllib

INPUT_TYPES = dict(models.INPUT_TYPE_CHOICES)

UNABLE_TO_DEL_HC_FMT = 'Unable to delete hazard calculation: %s'
UNABLE_TO_DEL_RC_FMT = 'Unable to delete risk calculation: %s'

LOG_FORMAT = ('[%(asctime)s %(calc_domain)s #%(calc_id)s %(hostname)s '
              '%(levelname)s %(processName)s/%(process)s %(name)s] '
              '%(message)s')

TERMINATE = general.str2bool(
    config.get('celery', 'terminate_workers_on_revoke'))


def cleanup_after_job(job, terminate):
    """
    Release the resources used by an openquake job. In particular, revoke
    the running tasks (if any).

    :param job: the job object
    :param bool terminate: the celery revoke command terminate flag
    """
    # Using the celery API, revoke and terminate any running tasks
    # associated with the current job.
    task_ids = Performance.objects.filter(
        oq_job=job, operation='storing task id', task_id__isnull=False)\
        .values_list('task_id', flat=True)
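    # The query above only collects the task ids; a plausible continuation
    # (a sketch assuming the pre-4.x celery control API, not verbatim from
    # the source) revokes each task, honoring the `terminate` flag:
    #
    #     from celery.task.control import revoke
    #     for tid in task_ids:
    #         revoke(tid, terminate=terminate)

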
class SupervisorLogMessageConsumer(logs.AMQPLogSource):
    """
    Supervise an OpenQuake job by:

    - handling its "critical" and "error" messages
    - periodically checking that the job process is still running
    """
    # Failure counter check delay, translates to 60 seconds with the
    # current settings.
    FCC_DELAY = 60

    terminate = general.str2bool(
        config.get('celery', 'terminate_workers_on_revoke'))

    def __init__(self, job_id, job_pid, timeout=1):
        self.job_id = job_id
        job = OqJob.objects.get(id=job_id)
        self.calc_id = job.calculation.id
        if job.hazard_calculation is not None:
            self.calc_domain = 'hazard'
        else:
            self.calc_domain = 'risk'

        self.selflogger = logging.getLogger(
            'oq.%s.%s.supervisor' % (self.calc_domain, self.calc_id))
        self.selflogger.debug('Entering supervisor for %s calc %s'
                              % (self.calc_domain, self.calc_id))
        logger_name = 'oq.%s.%s' % (self.calc_domain, self.calc_id)
        key = '%s.#' % logger_name
        super(SupervisorLogMessageConsumer, self).__init__(
            timeout=timeout, routing_key=key)
        self.job_pid = job_pid
        self.joblogger = logging.getLogger(logger_name)
        self.jobhandler = logging.Handler(logging.ERROR)
        self.jobhandler.emit = self.log_callback
        self.joblogger.addHandler(self.jobhandler)
        # Failure counter check delay value
        self.fcc_delay_value = 0

    def run(self):
        """
        Wrap the superclass method just to add cleanup.
        """
        started = datetime.utcnow()
        super(SupervisorLogMessageConsumer, self).run()
        stopped = datetime.utcnow()
        self.selflogger.info(
            '%s calc %s finished in %s'
            % (self.calc_domain, self.calc_id, stopped - started))
        self.joblogger.removeHandler(self.jobhandler)
        self.selflogger.debug('Exiting supervisor for %s calc %s'
                              % (self.calc_domain, self.calc_id))

    def log_callback(self, record):
        """
        Handle messages of severe level from the supervised job.
        """
        if record.name == self.selflogger.name:
            # ignore error log messages sent by selflogger.
            # this way we don't try to kill the job if its
            # process has crashed (or has been stopped).
            # we emit selflogger's error messages from
            # timeout_callback().
            return

        terminate_job(self.job_pid)
        update_job_status(self.job_id)
        record_job_stop_time(self.job_id)
        cleanup_after_job(self.job_id, self.terminate)
        self.stop()

    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures. Terminate the job
        process in the latter case.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                # Don't check for failed nodes if distribution is disabled.
                # In this case, we don't expect any nodes to be present,
                # and thus, there are none that can fail.
                if not openquake.engine.no_distribute():
                    failed_nodes = abort_due_to_failed_nodes(self.job_id)
                    if failed_nodes:
                        message = ("job terminated due to %s failed nodes"
                                   % failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif not job_status == 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even
                # though there were failures. We update the job status
                # here.
                self.selflogger.error(message)
                update_job_status(self.job_id)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id, self.terminate)
            raise StopIteration()
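
# A minimal driver sketch (hypothetical, not part of the source): a
# supervisor process would instantiate the consumer with the job's database
# id and OS pid, then block in run() until the job finishes or crashes.
#
#     supervisor = SupervisorLogMessageConsumer(job_id=42, job_pid=12345)
#     supervisor.run()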