def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    For each (key1, point, name, submit_num) id_key covered by the
    completed retrieval command, check that the expected job log files
    now exist locally and record the attempt via log_task_job_activity.

    Args:
        proc_ctx: completed process context; proc_ctx.cmd_kwargs["id_keys"]
            lists the event timer keys covered by this retrieval.
        schd_ctx: scheduler context providing the suite name.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # NOTE(review): this is substring membership, not
                # equality -- any key1[1] that happens to be a substring
                # of 'succeeded' would skip job.err; presumably key1[1]
                # is always a full status/event string so this acts like
                # "!= 'succeeded'" -- confirm.
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                # key1[1] not a string: no job.err expected.
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SuiteProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                # Retrieval complete: retire the event timer.
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Release the timer so retrieval can be retried.
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError:
            # Timer already removed (e.g. by an earlier callback).
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes."""
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # Every completed job should have produced a "job.out".
            expected = [JOB_LOG_OUT]
            try:
                # Non-succeeded jobs should also have a "job.err".
                if key1[1] not in 'succeeded':
                    expected.append(JOB_LOG_ERR)
            except TypeError:
                pass
            found = {
                log_name: os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, log_name))
                for log_name in expected}
            # A good attempt requires every expected path to exist.
            log_ctx = SuiteProcContext((key1, submit_num), None)
            if all(found.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                missing = [
                    log_name for log_name, ok in sorted(found.items())
                    if not ok]
                log_ctx.err = "File(s) not retrieved:" + "".join(
                    " %s" % log_name for log_name in missing)
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError:
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _execute_stmt(self, stmt, stmt_args_list):
    """Helper for "self.execute_queued_items". Execute a statement.

    If this is the private database, raise immediately on failure.
    If this is the public database, log a warning (with the statement
    and every row of arguments) before re-raising.

    NOTE(review): the original docstring claimed True/False returns
    for the public database, but this version always re-raises on
    failure and implicitly returns None on success -- confirm against
    callers.

    Args:
        stmt: SQL statement string with placeholders.
        stmt_args_list: list of argument rows, one per execution.
    """
    try:
        self.connect()
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        # Private database errors are fatal.
        if not self.is_public:
            raise
        if cylc.flags.debug:
            traceback.print_exc()
        err_log = ("cannot execute database statement:\n" +
                   "file=%(file)s:\nstmt=%(stmt)s") % {
            "file": self.db_file_name, "stmt": stmt}
        for i, stmt_args in enumerate(stmt_args_list):
            err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                "i": i, "stmt_args": stmt_args})
        ERR.warning(err_log)
        raise
def send_retry(self, event_message, event_id, max_n_tries,
               retry_intvl_secs):
    """CLI external trigger interface.

    Attempt self.put() up to max_n_tries times, sleeping
    retry_intvl_secs between failed attempts; exit the process if
    every attempt fails.
    """
    cls = self.__class__
    max_n_tries = int(max_n_tries or cls.MAX_N_TRIES)
    retry_intvl_secs = float(retry_intvl_secs or cls.RETRY_INTVL_SECS)
    sent = False
    for attempt in range(1, max_n_tries + 1):
        try:
            self.put(event_message, event_id)
        except Exception as exc:
            ERR.error(exc)
            OUT.info(cls.MSG_SEND_FAILED % (attempt, max_n_tries))
            if attempt >= max_n_tries:
                break
            OUT.info(cls.MSG_SEND_RETRY % (retry_intvl_secs,
                                           self.timeout))
            sleep(retry_intvl_secs)
        else:
            if attempt > 1:
                OUT.info(cls.MSG_SEND_SUCCEEDED % (attempt, max_n_tries))
            sent = True
            break
    if not sent:
        sys.exit('ERROR: send failed')
    return sent
def load_config(self):
    """Load the suite config.

    Return True on success. On a parsing error: show a GTK error
    dialog and return False when running interactively, otherwise
    exit the process with status 1.
    """
    if self.suiterc:
        # A config already exists, so this is a reload; preserve the
        # user's collapsed-family state across it.
        is_reload = True
        collapsed = self.suiterc.closed_families
    else:
        is_reload = False
        collapsed = []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            cli_initial_point_string=self.start_point_string,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        msg = "Failed - parsing error?\n\n" + str(exc)
        ERR.error(msg)
        if self.interactive:
            dia = gtk.MessageDialog(
                type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK,
                message_format=msg)
            dia.run()
            dia.destroy()
            return False
        # Non-interactive: a bad suite definition is fatal.
        sys.exit(1)
    self.inherit = self.suiterc.get_parent_lists()
    return True
def load_config(self):
    """Load the suite config."""
    # An existing config means this is a reload; carry the collapsed
    # family state over from the previous config.
    is_reload = bool(self.suiterc)
    collapsed = self.suiterc.closed_families if is_reload else []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            cli_initial_point_string=self.start_point_string,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        msg = "Failed - parsing error?\n\n" + str(exc)
        ERR.error(msg)
        dia = gtk.MessageDialog(
            type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK,
            message_format=msg)
        dia.run()
        dia.destroy()
        return False
    self.inherit = self.suiterc.get_parent_lists()
    return True
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.

    Args:
        suite: suite name.
        itask: task proxy whose job file is to be written.
        dry_run: if True, (re)write the job file even when one already
            exists, and suppress the submit-failed event on error.
    """
    # Job file already written for a real submission: nothing to do.
    if itask.local_job_file_path and not dry_run:
        return itask
    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except Exception, exc:
        # Could be a bad command template.
        # NOTE(review): the traceback is logged to both ERR and LOG --
        # presumably intentional (stderr plus suite log); confirm.
        ERR.error(traceback.format_exc())
        LOG.error(traceback.format_exc())
        self.task_events_mgr.log_task_job_activity(
            SuiteProcContext(
                self.JOBS_SUBMIT, '(prepare job file)', err=exc,
                ret_code=1),
            suite, itask.point, itask.tdef.name)
        if not dry_run:
            self.task_events_mgr.process_message(
                itask, CRITICAL,
                self.task_events_mgr.EVENT_SUBMIT_FAILED)
        # NOTE(review): returns None on failure; the success path here
        # also implicitly returns None, despite the docstring -- the
        # source may be truncated; confirm callers.
        return
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.

    Args:
        reg: suite registration name, used to locate each remote
            suite run directory.
    """
    # Issue all SSH commands in parallel
    procs = {}
    for user_at_host, should_unlink in self.initialised_hosts.items():
        if not should_unlink:
            continue
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = None, user_at_host
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            "remote shell template", host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, "suite run directory", host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT,
        )
        cmd = shlex.split(ssh_tmpl) + [
            "-n", user_at_host, "rm", "-f", r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a snapshot: entries are deleted as they finish.
        for user_at_host, (cmd, proc) in list(procs.items()):
            # BUG FIX: poll() returns None while running but 0 on
            # success, so "if not proc.poll()" also skipped commands
            # that had finished successfully -- they were never reaped,
            # forcing the full 10s wait and a spurious warning below.
            if proc.poll() is None:
                continue
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                ERR.warning(
                    RemoteJobHostInitError(
                        RemoteJobHostInitError.MSG_TIDY,
                        user_at_host,
                        " ".join([quote(item) for item in cmd]),
                        proc.returncode,
                        out,
                        err,
                    )
                )
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        # BUG FIX: warn only when the command actually failed, instead
        # of unconditionally logging every remaining command (matches
        # sibling implementations of this method).
        if proc.wait():
            ERR.warning(
                RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host,
                    " ".join([quote(item) for item in cmd]),
                    proc.returncode,
                    out,
                    err,
                )
            )
def _run_event_handlers_callback(self, proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler.

    Log success at INFO level; on failure log the context and an error
    message, and raise SuiteEventError if abort_on_error is set.
    """
    if not proc_ctx.ret_code:
        LOG.info(str(proc_ctx))
        return
    failure_msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
    LOG.error(str(proc_ctx))
    ERR.error(failure_msg)
    if abort_on_error:
        raise SuiteEventError(failure_msg)
def _conditional_is_satisfied(self): try: res = eval(self.conditional_expression) except Exception, exc: err_msg = str(exc) if str(exc).find("unexpected EOF") != -1: err_msg += ("\n(?could be unmatched parentheses in the graph " "string?)") ERR.error(err_msg) raise TriggerExpressionError('"' + self.raw_conditional_expression + '"')
def _conditional_is_satisfied(self): try: res = eval(self.conditional_expression) except Exception, exc: err_msg = str(exc) if str(exc).find("unexpected EOF") != -1: err_msg += ("\n(?could be unmatched parentheses in the graph " "string?)") ERR.error(err_msg) raise TriggerExpressionError( '"' + self.raw_conditional_expression + '"')
def restart_upgrade(self): """Vacuum/upgrade runtime DB on restart.""" # Backward compat, upgrade database with state file if necessary suite_run_d = os.path.dirname(os.path.dirname(self.pub_path)) old_pri_db_path = os.path.join( suite_run_d, 'state', CylcSuiteDAO.OLD_DB_FILE_BASE_NAME) old_pri_db_path_611 = os.path.join( suite_run_d, CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[0]) old_state_file_path = os.path.join(suite_run_d, "state", "state") if (os.path.exists(old_pri_db_path) and os.path.exists(old_state_file_path) and not os.path.exists(self.pri_path)): # Upgrade pre-6.11.X runtime database + state file copy(old_pri_db_path, self.pri_path) pri_dao = self.get_pri_dao() pri_dao.upgrade_with_state_file(old_state_file_path) target = os.path.join(suite_run_d, "state.tar.gz") cmd = ["tar", "-C", suite_run_d, "-czf", target, "state"] if call(cmd, stdin=open(os.devnull)) == 0: rmtree(os.path.join(suite_run_d, "state"), ignore_errors=True) else: try: os.unlink(os.path.join(suite_run_d, "state.tar.gz")) except OSError: pass ERR.error("cannot tar-gzip + remove old state/ directory") # Remove old files as well try: os.unlink(os.path.join(suite_run_d, "cylc-suite-env")) except OSError: pass elif (os.path.exists(old_pri_db_path_611) and not os.path.exists(self.pri_path)): # Upgrade 6.11.X runtime database os.rename(old_pri_db_path_611, self.pri_path) pri_dao = self.get_pri_dao() pri_dao.upgrade_from_611() # Remove old files as well for name in [ CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[1], "cylc-suite-env"]: try: os.unlink(os.path.join(suite_run_d, name)) except OSError: pass else: pri_dao = self.get_pri_dao() pri_dao.upgrade_pickle_to_json() # Vacuum the primary/private database file pri_dao.vacuum() pri_dao.close()
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.
    """
    # Launch one "rm -f" over SSH per initialised host, in parallel.
    pending = {}
    for (host, owner), should_unlink in self.initialised.items():
        if not should_unlink:
            continue
        user_at_host = owner + '@' + host if owner else host
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            'remote shell template', host, owner)
        contact_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, 'suite run directory', host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT)
        rm_cmd = shlex.split(ssh_tmpl) + [
            '-n', user_at_host, 'rm', '-f', contact_path]
        pending[user_at_host] = (
            rm_cmd, Popen(rm_cmd, stdout=PIPE, stderr=PIPE))
    # Reap commands as they finish, for at most 10 seconds in total.
    deadline = time() + 10.0
    while pending and time() < deadline:
        for user_at_host, (rm_cmd, proc) in pending.copy().items():
            if proc.poll() is None:
                continue  # still running
            del pending[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                ERR.warning(RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY, user_at_host,
                    ' '.join(quote(item) for item in rm_cmd),
                    proc.returncode, out, err))
    # Kill whatever is still running; log any that failed.
    for user_at_host, (rm_cmd, proc) in pending.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            ERR.warning(RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_TIDY, user_at_host,
                ' '.join(quote(item) for item in rm_cmd),
                proc.returncode, out, err))
def __init__(self, suite):
    """Set up digest auth, SSL material and the port list, then start.

    Args:
        suite: suite name; used for the auth realm and to locate the
            suite service files (passphrase, SSL cert/key).
    """
    # Suite only needed for back-compat with old clients (see below):
    self.suite = suite
    self.engine = None
    self.port = None
    # Figure out the ports we are allowed to use.
    base_port = glbl_cfg().get(['communication', 'base port'])
    max_ports = glbl_cfg().get(
        ['communication', 'maximum number of ports'])
    # NOTE(review): shuffling requires a list; fine on Python 2 where
    # range() returns a list, would fail on Python 3 -- confirm target.
    self.ok_ports = range(int(base_port), int(base_port) + int(max_ports))
    random.shuffle(self.ok_ports)
    comms_options = glbl_cfg().get(['communication', 'options'])
    # HTTP Digest Auth uses MD5 - pretty secure in this use case.
    # Extending it with extra algorithms is allowed, but won't be
    # supported by most browsers. requests and urllib2 are OK though.
    self.hash_algorithm = "MD5"
    if "SHA1" in comms_options:
        # Note 'SHA' rather than 'SHA1'.
        self.hash_algorithm = "SHA"
    self.srv_files_mgr = SuiteSrvFilesManager()
    self.comms_method = glbl_cfg().get(['communication', 'method'])
    # Plain-text HA1 lookup for HTTP Digest auth: the suite passphrase
    # for 'cylc' plus a well-known value for anonymous access.
    self.get_ha1 = cherrypy.lib.auth_digest.get_ha1_dict_plain(
        {
            'cylc': self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_PASSPHRASE,
                suite, content=True),
            'anon': NO_PASSPHRASE
        },
        algorithm=self.hash_algorithm)
    if self.comms_method == 'http':
        self.cert = None
        self.pkey = None
    else:  # if self.comms_method in [None, 'https']:
        try:
            self.cert = self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_SSL_CERT, suite)
            self.pkey = self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_SSL_PEM, suite)
        except SuiteServiceFileError:
            # HTTPS requested/defaulted but no cert/key available.
            ERR.error("no HTTPS/OpenSSL support. Aborting...")
            raise CylcError(
                "No HTTPS support. "
                "Configure user's global.rc to use HTTP.")
    self.start()
def unlink_hosts_contacts(self, reg): """Remove suite contact files from initialised hosts. This is called on shutdown, so we don't want anything to hang. Terminate any incomplete SSH commands after 10 seconds. """ # Issue all SSH commands in parallel procs = {} for (host, owner), should_unlink in self.init_host_map.items(): if not should_unlink: continue user_at_host = host if owner: user_at_host = owner + '@' + host ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner) r_suite_contact_file = os.path.join( GLOBAL_CFG.get_derived_host_item( reg, 'suite run directory', host, owner), self.suite_srv_files_mgr.DIR_BASE_SRV, self.suite_srv_files_mgr.FILE_BASE_CONTACT) cmd = shlex.split(ssh_tmpl) + [ '-n', user_at_host, 'rm', '-f', r_suite_contact_file] procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE)) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for user_at_host, (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[user_at_host] out, err = proc.communicate() if proc.wait(): ERR.warning(RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err)) # Terminate any remaining commands for user_at_host, (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = proc.communicate() if proc.wait(): ERR.warning(RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err))
def _event_email_callback(self, proc_ctx, schd_ctx): """Call back when email notification command exits.""" for id_key in proc_ctx.cmd_kwargs["id_keys"]: key1, point, name, submit_num = id_key try: if proc_ctx.ret_code == 0: del self.event_timers[id_key] log_ctx = SuiteProcContext((key1, submit_num), None) log_ctx.ret_code = 0 log_task_job_activity(log_ctx, schd_ctx.suite, point, name, submit_num) else: self.event_timers[id_key].unset_waiting() except KeyError: if cylc.flags.debug: ERR.debug(traceback.format_exc())
def load_config(self):
    """Load or reload the suite config.

    Return True on success; log the error and return False if the
    suite definition fails to parse.
    """
    if self.suiterc:
        # Reloading: preserve which families are collapsed.
        is_reload = True
        collapsed = self.suiterc.closed_families
    else:
        is_reload = False
        collapsed = []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        # BUG FIX: replaced the Python-2-only "except Exception, x"
        # syntax with the portable "as" form.
        ERR.error("Failed - parsing error?\n" + str(exc))
        return False
    # BUG FIX: the success path previously fell off the end and
    # returned None (falsy), indistinguishable from the False failure
    # return; sibling versions of this method return True on success.
    return True
def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits.

    On success (ret_code == 0): drop the event timer for each id_key
    and record a successful activity entry for the task job.
    On failure: release each timer so the notification can be retried.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if proc_ctx.ret_code == 0:
                del self.event_timers[id_key]
                log_ctx = SuiteProcContext((key1, submit_num), None)
                log_ctx.ret_code = 0
                log_task_job_activity(
                    log_ctx, schd_ctx.suite, point, name, submit_num)
            else:
                self.event_timers[id_key].unset_waiting()
        except KeyError:
            # Timer already removed by an earlier callback.
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _conditional_is_satisfied(self): """Evaluate the prerequisite's condition expression. Does not cache the result. """ try: res = eval(self.conditional_expression) except Exception, exc: err_msg = str(exc) if str(exc).find("unexpected EOF") != -1: err_msg += ("\n(?could be unmatched parentheses in the graph " "string?)") ERR.error(err_msg) raise TriggerExpressionError( '"%s"' % self.get_raw_conditional_expression())
def start(self): """Start quick web service.""" # cherrypy.config["tools.encode.on"] = True # cherrypy.config["tools.encode.encoding"] = "utf-8" cherrypy.config["server.socket_host"] = '0.0.0.0' cherrypy.config["engine.autoreload.on"] = False try: from OpenSSL import SSL, crypto cherrypy.config['server.ssl_module'] = 'pyopenSSL' cherrypy.config['server.ssl_certificate'] = self.cert cherrypy.config['server.ssl_private_key'] = self.pkey except ImportError: ERR.warning("no HTTPS/OpenSSL support") cherrypy.config['log.screen'] = None key = binascii.hexlify(os.urandom(16)) cherrypy.config.update({ 'tools.auth_digest.on': True, 'tools.auth_digest.realm': self.suite, 'tools.auth_digest.get_ha1': self.get_ha1, 'tools.auth_digest.key': key, 'tools.auth_digest.algorithm': self.hash_algorithm }) cherrypy.tools.connect_log = cherrypy.Tool( 'on_end_resource', self.report_connection_if_denied) cherrypy.config['tools.connect_log.on'] = True self.engine = cherrypy.engine for port in self.ok_ports: cherrypy.config["server.socket_port"] = port try: cherrypy.engine.start() cherrypy.engine.wait(cherrypy.engine.states.STARTED) except Exception: if cylc.flags.debug: traceback.print_exc() # We need to reinitialise the httpserver for each port attempt. cherrypy.server.httpserver = None else: if cherrypy.engine.state == cherrypy.engine.states.STARTED: self.port = port return raise Exception("No available ports")
def _conditional_is_satisfied(self): """Evaluate the prerequisite's condition expression. Does not cache the result. """ try: res = eval(self.conditional_expression) except (SyntaxError, ValueError) as exc: err_msg = str(exc) if str(exc).find("unexpected EOF") != -1: err_msg += ("\n(?could be unmatched parentheses in the graph " "string?)") ERR.error(err_msg) raise TriggerExpressionError( '"%s"' % self.get_raw_conditional_expression()) return res
def load_config(self):
    """Load or reload the suite config.

    Return True on success; log the error and return False if the
    suite definition fails to parse.
    """
    if self.suiterc:
        # Reloading: preserve which families are collapsed.
        is_reload = True
        collapsed = self.suiterc.closed_families
    else:
        is_reload = False
        collapsed = []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            cli_initial_point_string=self.start_point_string,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        # BUG FIX: replaced the Python-2-only "except Exception, x"
        # syntax with the portable "as" form.
        ERR.error("Failed - parsing error?\n" + str(exc))
        return False
    # BUG FIX: the success path previously fell off the end and
    # returned None (falsy), indistinguishable from the False failure
    # return; sibling versions of this method return True on success.
    return True
def send_retry(self, event_message, event_id, max_n_tries,
               retry_intvl_secs):
    """CLI external trigger interface.

    Call self.put() up to max_n_tries times, sleeping between failed
    attempts; exit the process if every attempt fails.
    """
    max_n_tries = int(max_n_tries or self.__class__.MAX_N_TRIES)
    retry_intvl_secs = float(
        retry_intvl_secs or self.__class__.RETRY_INTVL_SECS)
    i_try = 0
    sent = False
    while i_try < max_n_tries:
        i_try += 1
        try:
            self.put(event_message, event_id)
        except Exception as exc:
            ERR.error(exc)
            OUT.info(
                self.__class__.MSG_SEND_FAILED % (i_try, max_n_tries))
            if i_try >= max_n_tries:
                break
            OUT.info(self.__class__.MSG_SEND_RETRY % (
                retry_intvl_secs, self.timeout))
            sleep(retry_intvl_secs)
            continue
        if i_try > 1:
            OUT.info(
                self.__class__.MSG_SEND_SUCCEEDED % (i_try, max_n_tries))
        sent = True
        break
    if not sent:
        sys.exit('ERROR: send failed')
    return sent
def _execute_stmt(self, stmt, stmt_args_list): """Helper for "self.execute_queued_items". Execute a statement. If this is the public database, return True on success and False on failure. If this is the private database, return True on success, and raise on failure. """ try: self.connect() self.conn.executemany(stmt, stmt_args_list) except sqlite3.Error: if not self.is_public: raise if cylc.flags.debug: traceback.print_exc() err_log = ("cannot execute database statement:\n" + "file=%(file)s:\nstmt=%(stmt)s") % { "file": self.db_file_name, "stmt": stmt, } for i, stmt_args in enumerate(stmt_args_list): err_log += "\nstmt_args[%(i)d]=%(stmt_args)s" % {"i": i, "stmt_args": stmt_args} ERR.warning(err_log) raise