def __init__(self, suite, suite_dir):
    """Configure the comms server for a suite, then start it.

    Reads the allowed port range and communication options from
    GLOBAL_CFG, loads the certificate, private key and passphrase for the
    suite from the registration database, builds the HTTP Digest HA1
    lookup, and finally calls ``self.start()``.

    Args:
        suite: suite name (kept for back-compat with old clients).
        suite_dir: not used in this method body.
    """
    # Suite only needed for back-compat with old clients.
    self.suite = suite

    # Figure out the ports we are allowed to use.
    base_port = GLOBAL_CFG.get(["communication", "base port"])
    max_ports = GLOBAL_CFG.get(["communication", "maximum number of ports"])
    first_port = int(base_port)
    # random.shuffle() shuffles in place, so it needs a real list; a bare
    # range object is only a list on Python 2.
    self.ok_ports = list(range(first_port, first_port + int(max_ports)))
    random.shuffle(self.ok_ports)

    comms_options = GLOBAL_CFG.get(["communication", "options"])
    # HTTP Digest Auth uses MD5 - pretty secure in this use case.
    # Extending it with extra algorithms is allowed, but won't be
    # supported by most browsers. requests and urllib2 are OK though.
    self.hash_algorithm = "MD5"
    if "SHA1" in comms_options:
        # Note 'SHA' rather than 'SHA1'.
        self.hash_algorithm = "SHA"

    self.reg_db = RegistrationDB()
    try:
        self.cert = self.reg_db.load_item(
            suite, USER, None, "certificate", create_ok=True)
        self.pkey = self.reg_db.load_item(
            suite, USER, None, "private_key", create_ok=True)
    except PassphraseError:
        # No OpenSSL installed.
        self.cert = None
        self.pkey = None

    passphrase = self.reg_db.load_passphrase(suite, USER, None)
    userpassdict = {"cylc": passphrase, "anon": NO_PASSPHRASE}
    self.get_ha1 = cherrypy.lib.auth_digest.get_ha1_dict_plain(
        userpassdict, algorithm=self.hash_algorithm)
    # Drop the local references to the plain-text passphrase.
    del passphrase
    del userpassdict

    self.client_reporter = CommsClientReporter.get_inst()
    self.start()
def main(name, start):
    """Command line entry point: build, configure and run a suite server.

    Args:
        name: not used in this function body.
        start: zero-argument callable that parses the command line and
            returns the server object.

    Exits via sys.exit() if the run directory tree or Pyro setup fails,
    unless debug mode is on, in which case the exception propagates.
    """
    # Parse the command line:
    server = start()

    # Print copyright and license information
    print_blurb()

    # Create run directory tree and get port.
    try:
        GLOBAL_CFG.create_cylc_run_tree(server.suite)
        server.configure_pyro()
    except Exception as exc:
        if not flags.debug:
            sys.exit(exc)
        raise

    # Daemonize the suite
    if not (server.options.no_detach or flags.debug):
        daemonize(server)

    try:
        server.configure()
        server.run()
        # For profiling (see Python docs for how to display the stats).
        # import cProfile
        # cProfile.runctx('server.run()', globals(), locals(), 'stats')
    except SchedulerStop as stop:
        # deliberate stop
        print(str(stop))
        server.shutdown()
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.

    Args:
        reg: suite name, used to derive the remote suite run directory.

    A warning is logged through ERR for every command that exits with a
    non-zero status; nothing is raised for remote failures.
    """
    def _warn_if_failed(user_at_host, cmd, proc):
        """Collect output of a finished command; warn on non-zero exit."""
        out, err = proc.communicate()
        if proc.wait():
            ERR.warning(
                RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host,
                    " ".join([quote(item) for item in cmd]),
                    proc.returncode,
                    out,
                    err,
                )
            )

    # Issue all SSH commands in parallel
    procs = {}
    for user_at_host, should_unlink in self.initialised_hosts.items():
        if not should_unlink:
            continue
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = None, user_at_host
        # Drop any ' %s' placeholder, as the other users of this template do.
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            "remote shell template", host, owner).replace(" %s", "")
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, "suite run directory", host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT,
        )
        cmd = shlex.split(ssh_tmpl) + [
            "-n", user_at_host, "rm", "-f", r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))

    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a snapshot: entries are deleted inside the loop.
        for user_at_host, (cmd, proc) in list(procs.items()):
            # poll() returns None while running and 0 on success, so test
            # for None explicitly rather than for truthiness.
            if proc.poll() is None:
                continue
            del procs[user_at_host]
            _warn_if_failed(user_at_host, cmd, proc)

    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            # Process already gone.
            pass
        _warn_if_failed(user_at_host, cmd, proc)
def _load_port_file(self):
    """Load port, host, etc from port file.

    The port file is read via SSH ``cat`` when the suite host or owner is
    remote, otherwise from the local file system. ``self.port`` and
    ``self.host`` are only filled in when they are currently None.

    Raises:
        PortFileError: if the port is still unknown and the port file
            cannot be read, or its first line is not an integer.
    """
    # GLOBAL_CFG is expensive to import, so only load on demand
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['communication', 'ports directory']), self.suite)
    out = ""
    if is_remote_host(self.host) or is_remote_user(self.owner):
        # Only load these modules on demand, as they may be expensive
        import shlex
        from subprocess import Popen, PIPE
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            'remote shell template', self.host, self.owner))
        ssh_tmpl = ssh_tmpl.replace(' %s', '')
        user_at_host = ''
        if self.owner:
            user_at_host = self.owner + '@'
        if self.host:
            user_at_host += self.host
        else:
            user_at_host += 'localhost'
        # Pass a literal $HOME in place of the local home directory path.
        r_port_file_path = port_file_path.replace(
            os.environ['HOME'], '$HOME')
        command = shlex.split(ssh_tmpl) + [
            user_at_host, 'cat', r_port_file_path]
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        ret_code = proc.wait()
        if ret_code:
            if cylc.flags.debug:
                sys.stderr.write(str({
                    "code": ret_code,
                    "command": command,
                    "stdout": out,
                    "stderr": err}) + "\n")
            if self.port is None:
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
    else:
        try:
            # Context manager so the handle is closed promptly.
            with open(port_file_path) as handle:
                out = handle.read()
        except IOError:
            # A missing file only matters if we still need the port.
            if self.port is None:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
    lines = out.splitlines()
    if self.port is None:
        try:
            self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
    if self.host is None:
        if len(lines) >= 2:
            self.host = lines[1].strip()
        else:
            self.host = get_hostname()
def __init__(self, suite):
    """Record the suite log paths and the log rolling settings.

    Args:
        suite: suite name used to derive the suite log directory.
    """
    log_dir = GLOBAL_CFG.get_derived_host_item(suite, 'suite log directory')
    self.ldir = log_dir
    self.path = os.path.join(log_dir, 'log')
    self.err_path = os.path.join(log_dir, 'err')

    # All rolling settings live under the [suite logging] section.
    section = 'suite logging'
    self.roll_at_startup = GLOBAL_CFG.get(
        [section, 'roll over at start-up'])
    self.n_keep = GLOBAL_CFG.get([section, 'rolling archive length'])
    self.max_bytes = GLOBAL_CFG.get([section, 'maximum size in bytes'])
def __init__(self, suite):
    """Record the suite stdout/stderr file paths and rolling settings.

    Args:
        suite: suite name used to derive the suite log directory.
    """
    out_err_dir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory')
    self.opath, self.epath = [
        os.path.join(out_err_dir, base) for base in ('out', 'err')]

    # use same archive length as logging (TODO: document this)
    logging_section = 'suite logging'
    self.roll_at_startup = GLOBAL_CFG.get(
        [logging_section, 'roll over at start-up'])
    self.arclen = GLOBAL_CFG.get(
        [logging_section, 'rolling archive length'])
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Args:
        item: base name of the file under the suite service directory.
        reg: suite name.
        owner: remote user name, or None.
        host: remote host name, or None.

    Returns:
        The text printed after the marker line on success. None if the
        target is not remote, the SSH command cannot be started, it exits
        non-zero, or no content follows the marker.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('ssh command', host, owner))
    # Either owner or host may be None (e.g. remote host, same user), so
    # build the target piecewise instead of concatenating blindly.
    user_at_host = ''
    if owner:
        user_at_host = owner + '@'
    if host:
        user_at_host += host
    else:
        user_at_host += 'localhost'
    command += ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content_lines = []
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content_lines.append(line)
        elif line.strip() == prefix:
            can_read = True
    content = "".join(content_lines)
    if not content or ret_code:
        if cylc.flags.debug:
            # STDOUT may contain passphrase, so not safe to print
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                'err': err,
                'ret_code': ret_code,
            } + "\n")
        return
    return content
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Does nothing for the local host, for a user@host that has already
    been initialised, or in single task mode.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # initialised_hosts stores user@host strings (see the append at the
    # end), so membership must be tested with the same key; testing the
    # bare host never matches "owner@host" entries.
    if ((owner, host) in [(None, 'localhost'), (USER, 'localhost')] or
            user_at_host in self.initialised_hosts or
            self.single_task_mode):
        return

    # Local files to install on the remote host.
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    sources = [os.path.join(suite_run_dir, CylcSuiteEnv.BASE_NAME)]
    if 'CYLC_SUITE_DEF_PATH' in os.environ:
        sources.append(
            os.path.join(os.getenv('CYLC_SUITE_DEF_PATH'), 'passphrase'))
    suite_run_py = os.path.join(suite_run_dir, 'python')
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite job log directory', host, owner)
    getLogger('main').log(INFO, 'Initialising %s:%s' % (
        user_at_host, r_suite_run_dir))

    ssh_tmpl = GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner).replace(' %s', '')
    scp_tmpl = GLOBAL_CFG.get_host_item(
        'remote copy template', host, owner)

    # First create the remote directories, then copy the files across.
    cmd1 = shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir]
    cmd2 = shlex.split(scp_tmpl) + ['-pr'] + sources + [
        user_at_host + ":" + r_suite_run_dir + '/']
    for cmd in [cmd1, cmd2]:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host, " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def __init__(self, suite, run_mode='live', ict=None, stop_point=None):
    """Set up state dump file locations and the rolling archive length.

    Args:
        suite: suite name used to derive the suite state directory.
        run_mode: stored as ``self.run_mode``.
        ict: passed to ``self.set_cts()``.
        stop_point: passed to ``self.set_cts()``.
    """
    self.run_mode = run_mode
    self.cts_str = None
    self.set_cts(ict, stop_point)

    state_dir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite state directory')
    self.dir_name = state_dir
    self.file_name = os.path.join(state_dir, self.BASE_NAME)

    # An unset, zero or one-or-less archive length is normalised to 1.
    archive_length = GLOBAL_CFG.get(['state dump rolling archive length'])
    self.arch_len = archive_length
    if not archive_length or int(archive_length) <= 1:
        self.arch_len = 1
    self.arch_files = []

    self.pool = None
    self.log = logging.getLogger('main')
def get_port(suite, owner=user, host=get_hostname(), pphrase=None,
             pyro_timeout=None):
    """Scan the configured Pyro port range for a particular suite.

    Each port is probed with ``proxy.id()``; the first port whose reported
    (name, owner) equals (suite, owner) is returned. In the code visible
    here, falling off the end of the range returns None implicitly.

    Raises:
        SuiteNotFoundError: if a proxy cannot be created for the URI.
    """
    # Scan ports until a particular suite is found.
    first_port = GLOBAL_CFG.get(['pyro', 'base port'])
    n_ports = GLOBAL_CFG.get(['pyro', 'maximum number of ports'])

    # Probe failures that simply mean "not our suite on this port":
    # wrong suite or passphrase, no suite, or a non-cylc Pyro server.
    skip_errors = (
        Pyro.errors.ConnectionDeniedError,
        Pyro.errors.ProtocolError,
        Pyro.errors.NamingError,
    )

    for port in range(first_port, first_port + n_ports):
        try:
            proxy = Pyro.core.getProxyForURI(cylcid_uri(host, port))
        except Pyro.errors.URIError as exc:
            # No such host?
            raise SuiteNotFoundError(exc)

        if pyro_timeout:
            # convert from string
            pyro_timeout = float(pyro_timeout)
        proxy._setTimeout(pyro_timeout)
        proxy._setIdentification(pphrase)

        before = datetime.datetime.now()
        try:
            name, xowner = proxy.id()
        except Pyro.errors.TimeoutError:
            warn_timeout(host, port, pyro_timeout)
            continue
        except skip_errors:
            continue

        if flags.verbose:
            elapsed = datetime.datetime.now() - before
            print("Pyro connection on port " + str(port) + " took: " +
                  str(elapsed))
        if name != suite or xowner != owner:
            # ID'd some other suite.
            continue
        if flags.verbose:
            print("%s %s %s %s" % (suite, owner, host, port))
        # RESULT
        return port
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Does nothing for the local host, for a user@host that has already
    been initialised, or in single task mode.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # initialised_hosts stores user@host strings (see the append at the
    # end), so membership must be tested with the same key; testing the
    # bare host never matches "owner@host" entries.
    if ((owner, host) in [(None, 'localhost'), (user, 'localhost')] or
            user_at_host in self.initialised_hosts or
            self.single_task_mode):
        return

    # Local files to install on the remote host.
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    try:
        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite job log directory', host, owner)
        getLogger('main').log(INFO, 'Initialising %s:%s' % (
            user_at_host, r_suite_run_dir))

        ssh_tmpl = GLOBAL_CFG.get_host_item(
            'remote shell template', host, owner).replace(" %s", "")
        scp_tmpl = GLOBAL_CFG.get_host_item(
            'remote copy template', host, owner)

        # First create the remote directories, then copy the files across.
        cmd1 = shlex.split(ssh_tmpl) + [
            user_at_host,
            'mkdir -p "%s" "%s"' % (r_suite_run_dir, r_log_job_dir)]
        cmd2 = shlex.split(scp_tmpl) + ["-r"] + sources + [
            user_at_host + ":" + r_suite_run_dir + "/"]
        for cmd in [cmd1, cmd2]:
            check_call(cmd)
    except Exception:
        # Any failure in the remote set-up is reported uniformly.
        raise RemoteJobHostInitError(user_at_host)
    self.initialised_hosts.append(user_at_host)
def unlink_hosts_contacts(self, reg):
    """Delete the suite contact file on every initialised remote host.

    Runs at shutdown, so it must not hang: all SSH commands are launched
    in parallel, given at most 10 seconds to finish, and any that are
    still running after that are terminated. Commands that exit non-zero
    are reported as warnings through ERR.
    """
    def warn_on_failure(target, command, process):
        # Gather output, then warn only if the exit status is non-zero.
        out, err = process.communicate()
        if process.wait():
            ERR.warning(RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_TIDY,
                target, ' '.join([quote(item) for item in command]),
                process.returncode, out, err))

    # Issue all SSH commands in parallel
    running = {}
    for (host, owner), should_unlink in self.init_host_map.items():
        if not should_unlink:
            continue
        if owner:
            target = owner + '@' + host
        else:
            target = host
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        remote_run_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner)
        r_suite_contact_file = os.path.join(
            remote_run_dir,
            self.suite_srv_files_mgr.DIR_BASE_SRV,
            self.suite_srv_files_mgr.FILE_BASE_CONTACT)
        command = shlex.split(ssh_tmpl)
        command += ['-n', target, 'rm', '-f', r_suite_contact_file]
        running[target] = (
            command, Popen(command, stdout=PIPE, stderr=PIPE))

    # Wait for commands to complete for a max of 10 seconds
    deadline = time() + 10.0
    while running and time() < deadline:
        for target, (command, process) in running.copy().items():
            if process.poll() is not None:
                del running[target]
                warn_on_failure(target, command, process)

    # Terminate any remaining commands
    for target, (command, process) in running.items():
        try:
            process.terminate()
        except OSError:
            pass
        warn_on_failure(target, command, process)
def handle_proxies():
    """Remove proxy variables from the environment unless proxies are on.

    When the ``[communication] proxies on`` setting is false, ``http_proxy``
    and ``https_proxy`` are dropped from ``os.environ`` if present.
    """
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    if GLOBAL_CFG.get(['communication', 'proxies on']):
        return
    import os
    for variable in ("http_proxy", "https_proxy"):
        os.environ.pop(variable, None)
def __init__(self, suite, host, owner):
    """Store the suite identity and work out the local port file path.

    Args:
        suite: suite name; also the port file's base name.
        host: stored as ``self.host``.
        owner: stored as ``self.owner``.
    """
    self.suite, self.host, self.owner = suite, host, owner
    self.locn = None
    ports_dir = GLOBAL_CFG.get(['pyro', 'ports directory'])
    self.local_path = os.path.join(ports_dir, suite)
def _write_prelude(cls, handle, job_conf):
    """Job script prelude.

    Writes to ``handle``: optional shell tracing (debug mode only), the
    start-up echo, and a ``prelude`` shell function that exports selected
    environment variables and sources the first job-init-env.sh found.

    Args:
        handle: writable file-like object for the job script.
        job_conf: job configuration; the keys 'job script shell', 'host'
            and 'owner' are read here.
    """
    if cylc.flags.debug:
        if 'bash' in job_conf['job script shell']:
            # Backslashes are doubled so the shell receives \D, \u and \h
            # literally; the undoubled forms are invalid Python escapes.
            handle.write(
                "\n\nPS4='+[\\D{%Y%m%dT%H%M%S%z}]\\u@\\h '")
        handle.write('\n\nset -x')
    handle.write('\n\necho "JOB SCRIPT STARTING"')
    # set cylc version and source profile scripts before turning on
    # error trapping so that profile errors do not abort the job
    handle.write('\n\nprelude() {')
    keys = GLOBAL_CFG.get_host_item(
        'copyable environment variables',
        job_conf['host'], job_conf['owner'])
    for key in keys + ['CYLC_DIR', 'CYLC_VERSION']:
        if key in os.environ:
            # The value sits inside single quotes in the script, so an
            # embedded single quote must be written as '\'' to keep the
            # export line well-formed.
            value = os.environ[key].replace("'", "'\\''")
            handle.write("\n export %s='%s'" % (key, value))
    handle.write(
        r'''
    for FILE_NAME in \
        "${HOME}/.cylc/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env-default.sh"
    do
        if [[ -f "${FILE_NAME}" ]]; then
            . "${FILE_NAME}" 1>/dev/null 2>&1
            break
        fi
    done
}
prelude''')
def __init__(self, hosts=None, owner=None, poll_interval=None,
             is_compact=False):
    """Build the panel applet widgets and create its updater.

    Args:
        hosts: hosts to scan; when empty, the configured scan hosts are
            used, falling back to ["localhost"] on KeyError.
        owner: suite owner; defaults to the current ``user``.
        poll_interval: passed through to the updater.
        is_compact: stored and passed through to the updater.
    """
    # We can't use gobject.threads_init() for panel applets.
    warnings.filterwarnings('ignore', 'use the new', Warning)
    setup_icons()

    if not hosts:
        try:
            hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
        except KeyError:
            hosts = ["localhost"]
    self.is_compact = is_compact
    self.hosts = hosts
    self.owner = owner = user if owner is None else owner

    # Event box holding the row of status dots.
    dots_box = gtk.HBox()
    dots_box.show()
    dots_event_box = gtk.EventBox()
    dots_event_box.show()
    dots_event_box.add(dots_box)

    # Clickable applet icon.
    icon = gtk.image_new_from_stock("gcylc", gtk.ICON_SIZE_MENU)
    icon.show()
    icon_event_box = gtk.EventBox()
    icon_event_box.show()
    icon_event_box.connect(
        "button-press-event", self._on_button_press_event)
    icon_event_box.add(icon)

    top_box = gtk.HBox()
    top_box.pack_start(icon_event_box, expand=False, fill=False)
    top_box.pack_start(dots_event_box, expand=False, fill=False, padding=2)
    top_box.show()
    self.top_hbox = top_box

    self.updater = ScanPanelAppletUpdater(
        hosts, dots_box, icon, self.is_compact,
        owner=owner, poll_interval=poll_interval)
    self.top_hbox.connect("destroy", self.stop)
def unregister(self, exp):
    """Un-register a suite.

    Every registered suite whose whole name matches the regular
    expression ``exp`` is un-registered, unless its port file exists,
    in which case it is skipped with a message on stderr.

    Args:
        exp: regular expression that must match the complete suite name.

    Returns:
        (unregistered_set, skipped_set), each a set of (name, path) tuples.
    """
    unregistered_set = set()
    skipped_set = set()
    ports_d = GLOBAL_CFG.get(['pyro', 'ports directory'])
    # Group the expression before anchoring it: with a bare "exp\Z" only
    # the last alternative of "a|b" is anchored, so "a" would match any
    # name that merely starts with it. Compile once, outside the loop.
    rec_exp = re.compile(r'(?:%s)\Z' % exp)
    for name in sorted(self.list_all_suites()):
        if not rec_exp.match(name):
            continue
        try:
            data = self.get_suite_data(name)
        except RegistrationError:
            continue
        if os.path.exists(os.path.join(ports_d, name)):
            skipped_set.add((name, data['path']))
            sys.stderr.write(
                'SKIP UNREGISTER %s: port file exists' % (name) + '\n')
            continue
        # Best-effort removal of files kept in the suite directory.
        for base_name in ['passphrase', 'suite.rc.processed']:
            try:
                os.unlink(os.path.join(data['path'], base_name))
            except OSError:
                pass
        unregistered_set.add((name, data['path']))
        print('UNREGISTER %s:%s' % (name, data['path']))
        os.unlink(os.path.join(self.dbpath, name))
    return unregistered_set, skipped_set
def get_create_job_log_path(cls, suite, task_name, task_point, submit_num):
    """Create the local job log directory and return its path in two parts.

    Returns a tuple (part1, part2):

    * part1: the top level job log directory on the suite host.
    * part2: POINT/NAME/SUBMIT_NUM/job, which is also used on remote
      task hosts.

    The local directory part1/POINT/NAME/SUBMIT_NUM is created if
    necessary, and a sibling symlink "NN" is (re)pointed at it.

    Raises:
        OSError: if the symlink cannot be created; ``filename`` is set
            to the link path when the exception does not carry one.
    """
    top_dir = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory")
    relative_dir = os.path.join(
        str(task_point), task_name, "%02d" % int(submit_num))
    local_log_dir = os.path.join(top_dir, relative_dir)
    mkdir_p(local_log_dir)

    # Replace any existing NN link; a missing one is not an error.
    link_path = os.path.join(os.path.dirname(local_log_dir), "NN")
    try:
        os.unlink(link_path)
    except OSError:
        pass
    try:
        os.symlink(os.path.basename(local_log_dir), link_path)
    except OSError as exc:
        if not exc.filename:
            exc.filename = link_path
        raise exc

    return top_dir, os.path.join(relative_dir, "job")
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter: optional regular expression; only registrations it
            matches somewhere (``search``) are returned.

    Returns:
        A list of [reg, source_dir, title] lists.

    Raises:
        ValueError: if ``regfilter`` is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))

    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_host_item('run directory')

    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a:
        # * .service/
        # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
        if dirpath != run_d:
            if self.DIR_BASE_SRV in dnames or "cylc-suite.db" in fnames:
                dnames[:] = []

        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        srv_dir = os.path.join(dirpath, self.DIR_BASE_SRV)
        if not self._locate_item(self.FILE_BASE_SOURCE, srv_dir):
            continue
        if rec_regfilter and not rec_regfilter.search(reg):
            continue

        try:
            entry = [
                reg,
                self.get_suite_source_dir(reg),
                self._get_suite_title(reg)]
        except (IOError, SuiteServiceFileError) as exc:
            sys.stderr.write(str(exc) + "\n")
        else:
            results.append(entry)
    return results
def prompt(question, force=False, gui=False, no_force=False, no_abort=False):
    """Interactive Yes/No prompt for cylc CLI scripts.

    For convenience, on No we just exit rather than return.
    If force is True don't prompt, just return immediately.

    Args:
        question: text shown to the user.
        force: skip the prompt and answer yes (unless no_force is set).
        gui: ask with a GTK dialog instead of on the terminal.
        no_force: always prompt, ignoring force and the global
            'disable interactive command prompts' setting.
        no_abort: on No, return False instead of exiting.

    Returns:
        True on yes; False on no when no_abort is set.
    """
    if (force or GLOBAL_CFG.get(['disable interactive command prompts'])) and (
            not no_force):
        return True
    if gui:
        import gtk
        dialog = gtk.MessageDialog(
            None, gtk.DIALOG_DESTROY_WITH_PARENT,
            gtk.MESSAGE_QUESTION, gtk.BUTTONS_YES_NO,
            question
        )
        gui_response = dialog.run()
        # run() does not dismiss the dialog itself; destroy it so it does
        # not linger on screen after the user has answered.
        dialog.destroy()
        response_no = (gui_response != gtk.RESPONSE_YES)
    else:
        cli_response = raw_input('%s (y/n)? ' % question)
        response_no = (cli_response not in ['y', 'Y'])
    if response_no:
        if no_abort:
            return False
        else:
            sys.exit(0)
    else:
        return True
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Args:
        regfilter: optional regular expression; only registrations it
            matches somewhere (``search``) are returned.

    Returns:
        A list of [reg, source_dir, title] lists.

    Raises:
        ValueError: if ``regfilter`` is not a valid regular expression.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))

    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_host_item('run directory')

    # Entries whose presence marks a directory as a suite directory.
    skip_names = [
        "log", "share", "work",
        self.DIR_BASE_SRV, self.FILE_BASE_SUITE_RC]

    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Don't descend further if it looks like a suite directory
        looks_like_suite = any(
            marker in dnames or marker in fnames for marker in skip_names)
        if looks_like_suite:
            dnames[:] = []

        # Choose only suites with info file and matching filter
        reg = os.path.relpath(dirpath, run_d)
        srv_dir = os.path.join(dirpath, self.DIR_BASE_SRV)
        if not self._locate_item(self.FILE_BASE_SOURCE, srv_dir):
            continue
        if rec_regfilter and not rec_regfilter.search(reg):
            continue

        try:
            entry = [
                reg,
                self.get_suite_source_dir(reg),
                self._get_suite_title(reg)]
        except (IOError, SuiteServiceFileError) as exc:
            sys.stderr.write(str(exc) + "\n")
        else:
            results.append(entry)
    return results
def _write_prelude(cls, handle, job_conf):
    """Job script prelude.

    Writes to ``handle`` the start-up echo and a ``prelude`` shell
    function that exports selected environment variables and sources the
    first job-init-env.sh found.

    Args:
        handle: writable file-like object for the job script.
        job_conf: job configuration; the keys 'host' and 'owner' are
            read here.
    """
    handle.write('\n\necho "JOB SCRIPT STARTING"')
    # set cylc version and source profile scripts before turning on
    # error trapping so that profile errors do not abort the job
    handle.write("\n\nprelude() {")
    keys = GLOBAL_CFG.get_host_item(
        "copyable environment variables",
        job_conf["host"], job_conf["owner"])
    for key in keys + ["CYLC_DIR", "CYLC_VERSION"]:
        if key in os.environ:
            # The value sits inside single quotes in the script, so an
            # embedded single quote must be written as '\'' to keep the
            # export line well-formed.
            value = os.environ[key].replace("'", "'\\''")
            handle.write("\n export %s='%s'" % (key, value))
    handle.write(
        r"""
    for FILE_NAME in \
        "${HOME}/.cylc/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env.sh" \
        "${CYLC_DIR}/conf/job-init-env-default.sh"
    do
        if [[ -f "${FILE_NAME}" ]]; then
            . "${FILE_NAME}" 1>/dev/null 2>&1
            break
        fi
    done
}
prelude"""
    )
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    In simulation mode the work is handed to
    ``_simulation_submit_task_jobs`` and its result returned. Otherwise
    the prepared tasks are grouped by (host, owner) and one
    ``cylc jobs-submit`` style command per group is queued on the
    process pool.

    Args:
        suite: suite name.
        itasks: task proxies to submit.
        is_simulation: submit in simulation mode.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return

    # Group the prepared tasks by (host, owner).
    auth_itasks = {}
    for itask in prepared_tasks:
        # The job file is now (about to be) used: reset the file write flag
        # so that subsequent manual retrigger will generate a new job file.
        itask.local_job_file_path = None
        itask.state.reset_state(TASK_STATUS_READY)
        auth_itasks.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)

    # Submit task jobs, one command per (host, owner)
    for (host, owner), auth_tasks in sorted(auth_itasks.items()):
        cmd = ["cylc", self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append("--debug")

        # Remote options are only added for a remote host and/or user.
        kwargs = {}
        remote_checks = (
            ('host', host, is_remote_host),
            ('user', owner, is_remote_user),
        )
        for key, value, test_func in remote_checks:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                kwargs[key] = value
        remote_mode = bool(kwargs)
        if remote_mode:
            cmd.append('--remote-mode')

        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, 'suite job log directory', host, owner))

        # In remote mode the job files are passed via the stdin file list.
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(auth_tasks, key=lambda itask: itask.identity):
            if remote_mode:
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs

        context = SuiteProcContext(
            self.JOBS_SUBMIT,
            cmd,
            stdin_file_paths=stdin_file_paths,
            job_log_dirs=job_log_dirs,
            **kwargs
        )
        self.proc_pool.put_command(
            context, self._submit_task_jobs_callback, [suite, auth_tasks])
def prompt(reason, force=False):
    """Ask for y/n confirmation on the terminal; exit on anything but 'y'.

    No prompt is shown when ``force`` is true or interactive command
    prompts are disabled in the global configuration.
    """
    if not force and not GLOBAL_CFG.get(
            ['disable interactive command prompts']):
        if raw_input(reason + ' (y/n)? ') != 'y':
            sys.exit(0)
def prompt(reason, force=False):
    """Ask for y/n confirmation on the terminal; exit on anything but 'y'.

    No prompt is shown when ``force`` is true or interactive command
    prompts are disabled in the global configuration.
    """
    prompts_disabled = force or GLOBAL_CFG.get(
        ["disable interactive command prompts"])
    if prompts_disabled:
        return
    answer = raw_input(reason + " (y/n)? ")
    if answer != "y":
        sys.exit(0)
    return
def __init__(self, suite):
    """Apply the Pyro configuration and initialise the Pyro server.

    Args:
        suite: suite name, kept only for back-compat with old clients.
    """
    pyro_config = Pyro.config
    pyro_config.PYRO_MULTITHREADED = 1
    # Use dns names instead of fixed ip addresses from /etc/hosts
    # (see the Userguide "Networking Issues" section).
    pyro_config.PYRO_DNS_URI = True

    # Base Pyro socket number.
    pyro_config.PYRO_PORT = GLOBAL_CFG.get(['pyro', 'base port'])
    # Max number of sockets starting at base.
    pyro_config.PYRO_PORT_RANGE = GLOBAL_CFG.get(
        ['pyro', 'maximum number of ports'])

    Pyro.core.initServer()
    self.daemon = None
    # Suite only needed for back-compat with old clients.
    self.suite = suite
def get_host_ip_address():
    """Return the suite host's external IP address, computing it once.

    The address is cached in the module-level ``host_ip_address``; on the
    first call it is obtained from ``get_local_ip_address()`` using the
    configured self-identification target.
    """
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    global host_ip_address
    if host_ip_address is not None:
        return host_ip_address
    target = GLOBAL_CFG.get(['suite host self-identification', 'target'])
    # external IP address of the suite host:
    host_ip_address = get_local_ip_address(target)
    return host_ip_address
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Does nothing for the local host, for a user@host that has already
    been initialised, or in single task mode.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # initialised_hosts stores user@host strings (see the append at the
    # end), so membership must be tested with the same key; testing the
    # bare host never matches "owner@host" entries.
    if (
        (owner, host) in [(None, "localhost"), (user, "localhost")]
        or user_at_host in self.initialised_hosts
        or self.single_task_mode
    ):
        return

    # Local files to install on the remote host.
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory")
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    if "CYLC_SUITE_DEF_PATH" in os.environ:
        sources.append(
            os.path.join(os.getenv("CYLC_SUITE_DEF_PATH"), "passphrase"))
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite job log directory", host, owner)
    getLogger("main").log(
        INFO, "Initialising %s:%s" % (user_at_host, r_suite_run_dir))

    ssh_tmpl = GLOBAL_CFG.get_host_item(
        "remote shell template", host, owner).replace(" %s", "")
    scp_tmpl = GLOBAL_CFG.get_host_item("remote copy template", host, owner)

    # First create the remote directories, then copy the files across.
    cmd1 = shlex.split(ssh_tmpl) + [
        "-n", user_at_host, "mkdir", "-p", r_suite_run_dir, r_log_job_dir]
    cmd2 = shlex.split(scp_tmpl) + ["-pr"] + sources + [
        user_at_host + ":" + r_suite_run_dir + "/"]
    for cmd in [cmd1, cmd2]:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host, " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err
            )
    self.initialised_hosts.append(user_at_host)
def __init__(self, suite, test_params=None):
    """Set up the singleton suite log object.

    Args:
        suite: suite name used to derive the log directory (ignored when
            test_params is given).
        test_params: optional dict with the keys 'max_bytes',
            'roll_at_startup' and 'ldir', used in place of the global
            configuration.

    Raises:
        Exception: if an instance has already been created.
    """
    if SuiteLog.__INSTANCE:
        # Single literal: the original adjacent literals were joined
        # without a space ("singletoninstance").
        raise Exception(
            "Attempting to initiate a second singleton instance.")

    self._group = None
    if not test_params:
        self.is_test = False
        self.max_bytes = GLOBAL_CFG.get(
            ['suite logging', 'maximum size in bytes'])
        self.roll_at_startup = GLOBAL_CFG.get(
            ['suite logging', 'roll over at start-up'])
        self.archive_length = GLOBAL_CFG.get(
            ['suite logging', 'rolling archive length'])
        # Log directory.
        self.ldir = GLOBAL_CFG.get_derived_host_item(
            suite, 'suite log directory')
    else:
        self.is_test = True
        self.max_bytes = test_params['max_bytes']
        self.roll_at_startup = test_params['roll_at_startup']
        self.archive_length = 4
        # Log directory.
        self.ldir = test_params['ldir']

    # Log paths.
    self.log_paths = {}
    self.log_paths[self.LOG] = os.path.join(self.ldir, self.LOG)
    self.log_paths[self.OUT] = os.path.join(self.ldir, self.OUT)
    self.log_paths[self.ERR] = os.path.join(self.ldir, self.ERR)

    # The loggers.
    self.loggers = {}
    self.loggers[self.LOG] = None
    self.loggers[self.OUT] = None
    self.loggers[self.ERR] = None

    # Filename stamp functions.
    if self.is_test:
        self.stamp = lambda: get_current_time_string(
            True, True, True).replace('.', '-')
    else:
        self.stamp = lambda: get_current_time_string(False, True, True)

    SuiteLog.__INSTANCE = self
def get_suite_host():
    """Return how the suite host identifies itself, computing it once.

    The result is cached in the module-level ``suite_host``. The
    configured method selects the value: 'name' uses the module-level
    ``hostname``, 'address' uses ``get_host_ip_address()``, 'hardwired'
    uses the configured host. Exits via sys.exit() for an unknown method
    or a missing hardwired host.
    """
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    global suite_host
    if suite_host is not None:
        return suite_host

    section = 'suite host self-identification'
    hardwired = GLOBAL_CFG.get([section, 'host'])
    method = GLOBAL_CFG.get([section, 'method'])

    # the following is for suite host self-identification in task job
    # scripts:
    if method == 'name':
        suite_host = hostname
    elif method == 'address':
        suite_host = get_host_ip_address()
    elif method == 'hardwired':
        if not hardwired:
            sys.exit('ERROR, no hardwired hostname is configured')
        suite_host = hardwired
    else:
        sys.exit('ERROR, unknown host method: ' + method)
    return suite_host
def __init__(self, hosts=None, owner=None, poll_interval=None):
    """Build the "cylc gscan" window, its suite tree view and updater.

    Args:
        hosts: hosts to scan; when empty, the configured scan hosts are
            used.
        owner: suite owner; defaults to the current ``user``.
        poll_interval: passed through to the updater.
    """
    gobject.threads_init()
    set_exception_hook_dialog("cylc gscan")
    setup_icons()

    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    self.hosts = hosts
    self.owner = user if owner is None else owner

    self.window = gtk.Window()
    self.window.set_title("cylc gscan")
    self.window.set_icon(get_icon())
    self.vbox = gtk.VBox()
    self.vbox.show()

    self.theme_name = gcfg.get(['use theme'])
    self.theme = gcfg.get(['themes', self.theme_name])
    self.dots = DotMaker(self.theme)

    suite_treemodel = gtk.TreeStore(str, str, bool, str, int, str, str)
    self._prev_tooltip_location_id = None
    self.suite_treeview = gtk.TreeView(suite_treemodel)

    # Plain text columns: (title, cell data function, sort column id,
    # initially visible). Only the suite name column starts visible.
    text_column_specs = (
        ("Host", self._set_cell_text_host, 0, False),
        ("Suite", self._set_cell_text_name, 1, True),
        ("Title", self._set_cell_text_title, 3, False),
        ("Updated", self._set_cell_text_time, 4, False),
    )
    for title, cell_data_func, sort_id, starts_visible in text_column_specs:
        column = gtk.TreeViewColumn(title)
        renderer = gtk.CellRendererText()
        column.pack_start(renderer, expand=False)
        column.set_cell_data_func(renderer, cell_data_func)
        column.set_sort_column_id(sort_id)
        if not starts_visible:
            column.set_visible(False)
        column.set_resizable(True)
        self.suite_treeview.append_column(column)

    # Construct the status column: one text cell followed by one pixbuf
    # cell per legal task state.
    cycle_column_info = 5
    status_column_info = 6
    status_column = gtk.TreeViewColumn("Status")
    status_column.set_sort_column_id(5)
    status_column.set_resizable(True)
    cycle_renderer = gtk.CellRendererText()
    status_column.pack_start(cycle_renderer, expand=False)
    status_column.set_cell_data_func(
        cycle_renderer, self._set_cell_text_cycle, cycle_column_info)
    self.suite_treeview.append_column(status_column)
    for state_index in range(len(task_state.legal)):
        state_renderer = gtk.CellRendererPixbuf()
        status_column.pack_start(state_renderer, expand=False)
        status_column.set_cell_data_func(
            state_renderer, self._set_cell_pixbuf_state,
            (status_column_info, state_index))

    self.suite_treeview.show()
    if hasattr(self.suite_treeview, "set_has_tooltip"):
        self.suite_treeview.set_has_tooltip(True)
        try:
            self.suite_treeview.connect(
                'query-tooltip', self._on_query_tooltip)
        except TypeError:
            # Lower PyGTK version.
            pass
    self.suite_treeview.connect(
        "button-press-event", self._on_button_press_event)

    scroller = gtk.ScrolledWindow()
    scroller.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
    scroller.add(self.suite_treeview)
    scroller.show()
    self.vbox.pack_start(scroller, expand=True, fill=True)

    self.updater = ScanAppUpdater(
        self.hosts, suite_treemodel, self.suite_treeview,
        owner=self.owner, poll_interval=poll_interval)
    self.updater.start()

    self.window.add(self.vbox)
    self.window.connect("destroy", self._on_destroy_event)
    self.window.set_default_size(300, 150)
    self.suite_treeview.grab_focus()
    self.window.show()
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified.

    The port file is read with "cat" via the remote shell if the suite host
    or owner is remote, otherwise from the local file system. The first line
    of the file gives the port; the optional second line gives the host
    ("localhost" is used if there is no second line).

    Raise:
        PortFileError: if the port file cannot be read, or if its first
            line is missing or not an integer.

    """
    uri_data = {
        "host": self.host,
        "port": self.port,
        "suite": self.suite,
        "owner": self.owner,
        "target": self.target_server_object}
    if self.host is None or self.port is None:
        # The port file location is only needed (and only looked up) when
        # the host or the port is not yet known.
        port_file_path = os.path.join(
            GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Let the remote shell expand $HOME for the remote account.
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out = proc.communicate()[0]
            if proc.wait():
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Context manager ensures the file handle is closed.
                with open(port_file_path) as handle:
                    out = handle.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if uri_data["port"] is None:
                uri_data["port"] = int(lines[0])
                self.port = uri_data["port"]
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if uri_data["host"] is None:
            if len(lines) >= 2:
                uri_data["host"] = lines[1].strip()
            else:
                uri_data["host"] = "localhost"
            self.host = uri_data["host"]
    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' %
        uri_data)
def scan(host=get_hostname(), db=None, pyro_timeout=None, owner=user): """Scan ports, return a list of suites found: [(port, suite.identify())]. Note that we could easily scan for a given suite+owner and return its port instead of reading port files, but this may not always be fast enough. """ base_port = GLOBAL_CFG.get(['pyro', 'base port']) last_port = base_port + GLOBAL_CFG.get(['pyro', 'maximum number of ports']) if pyro_timeout: pyro_timeout = float(pyro_timeout) else: pyro_timeout = None results = [] for port in range(base_port, last_port): try: proxy = get_proxy(host, port, pyro_timeout) conn_val = ConnValidator() conn_val.set_default_hash(SCAN_HASH) proxy._setNewConnectionValidator(conn_val) proxy._setIdentification((user, NO_PASSPHRASE)) result = (port, proxy.identify()) except Pyro.errors.ConnectionDeniedError as exc: if cylc.flags.debug: print '%s:%s (connection denied)' % (host, port) # Back-compat <= 6.4.1 msg = ' Old daemon at %s:%s?' % (host, port) for pphrase in load_passphrases(db): try: proxy = get_proxy(host, port, pyro_timeout) proxy._setIdentification(pphrase) info = proxy.id() result = (port, {'name': info[0], 'owner': info[1]}) except Pyro.errors.ConnectionDeniedError: connected = False else: connected = True break if not connected: if cylc.flags.verbose: print >> sys.stderr, msg, "- connection denied (%s)" % exc continue else: if cylc.flags.verbose: print >> sys.stderr, msg, "- connected with passphrase" except (Pyro.errors.ProtocolError, Pyro.errors.NamingError) as exc: # No suite at this port. if cylc.flags.debug: print str(exc) print '%s:%s (no suite)' % (host, port) continue except Pyro.errors.TimeoutError as exc: # E.g. Ctrl-Z suspended suite - holds up port scanning! if cylc.flags.debug: print '%s:%s (connection timed out)' % (host, port) print >> sys.stderr, ( 'suite? 
owner?@%s:%s - connection timed out (%s)' % (host, port, exc)) except Exception as exc: if cylc.flags.debug: print str(exc) break else: print >> sys.stderr, str(exc) else: name = result[1].get('name') owner = result[1].get('owner') states = result[1].get('states', None) if cylc.flags.debug: print ' suite:', name, owner if states is None: # This suite keeps its state info private. # Try again with the passphrase if I have it. try: pphrase = get_passphrase(name, owner, host, localdb(db)) except PassphraseError: if cylc.flags.debug: print ' (no passphrase)' else: try: proxy = get_proxy(host, port, pyro_timeout) conn_val = ConnValidator() conn_val.set_default_hash(SCAN_HASH) proxy._setNewConnectionValidator(conn_val) proxy._setIdentification((user, pphrase)) result = (port, proxy.identify()) except Exception: # Nope (private suite, wrong passphrase). if cylc.flags.debug: print ' (wrong passphrase)' else: if cylc.flags.debug: print ' (got states with passphrase)' results.append(result) return results
def _get_host_item(job_conf, key):
    """Look up global configuration item "key" for the host of a job.

    The host and owner are taken from the "host" and "owner" entries of
    "job_conf".

    """
    job_host = job_conf["host"]
    job_owner = job_conf["owner"]
    return GLOBAL_CFG.get_host_item(key, job_host, job_owner)
def update_suites_info(updater, full_mode=False):
    """Return mapping of suite info by host, owner and suite name.

    Args:
        updater (object): gscan or gpanel updater:
            Compulsory attributes from updater:
                hosts: hosts to scan
                owner_pattern: re to filter results by owners
                suite_info_map: previous results returned by this function
                quit: if set, this function returns None at the next check
            Optional attributes from updater:
                comms_timeout: communication timeout
                name_pattern: re to filter results by suite names
        full_mode (boolean): update in full mode?

    Return:
        dict: {(host, owner, name): suite_info, ...}
        where each "suite_info" is a dict with keys:
            KEY_GROUP: group name of suite
            KEY_OWNER: suite owner name
            KEY_PORT: suite port, for running suites only
            KEY_STATES: suite state
            KEY_TASKS_BY_STATE: tasks by state
            KEY_TITLE: suite title
            KEY_UPDATE_TIME: last update time of suite
        None is returned instead if "updater.quit" is found to be set.
    """
    # Compulsory attributes from updater
    # hosts - hosts to scan, or the default set in the site/user global.rc
    # owner_pattern - return only suites with owners matching this compiled re
    # suite_info_map - previous results returned by this function
    # Optional attributes from updater
    # comms_timeout - communication timeout
    owner_pattern = updater.owner_pattern
    timeout = getattr(updater, "comms_timeout", None)
    # name_pattern - return only suites with names matching this compiled re
    name_pattern = getattr(updater, "name_pattern", None)
    # Determine items to scan
    results = {}
    items = []
    if full_mode and not updater.hosts:
        # Scan users suites. Walk "~/cylc-run/" to get (host, port) from
        # ".service/contact" for active suites
        suite_srv_files_mgr = SuiteSrvFilesManager()
        if owner_pattern is None:
            # Run directory of current user only
            run_dirs = [GLOBAL_CFG.get_host_item('run directory')]
        else:
            # Run directory of all users matching "owner_pattern".
            # But skip those with /nologin or /false shells
            run_dirs = []
            skips = ('/false', '/nologin')
            for pwent in getpwall():
                if any(pwent.pw_shell.endswith(s) for s in (skips)):
                    continue
                if owner_pattern.match(pwent.pw_name):
                    run_dirs.append(
                        GLOBAL_CFG.get_host_item(
                            'run directory',
                            owner=pwent.pw_name,
                            owner_home=pwent.pw_dir))
        if cylc.flags.debug:
            sys.stderr.write(
                'Listing suites:%s%s\n' % (
                    _UPDATE_DEBUG_DELIM, _UPDATE_DEBUG_DELIM.join(run_dirs)))
        for run_d in run_dirs:
            for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
                if updater.quit:
                    return
                # Always descend for top directory, but
                # don't descend further if it has a:
                # * .service/
                # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
                if dirpath != run_d and (
                        suite_srv_files_mgr.DIR_BASE_SRV in dnames or
                        'cylc-suite.db' in fnames):
                    # Emptying "dnames" in place stops os.walk descending.
                    dnames[:] = []
                # Choose only suites with .service and matching filter
                reg = os.path.relpath(dirpath, run_d)
                try:
                    contact_data = suite_srv_files_mgr.load_contact_file(reg)
                except (SuiteServiceFileError, IOError, TypeError, ValueError):
                    # No usable contact file here: not an active suite.
                    continue
                else:
                    items.append((
                        contact_data[suite_srv_files_mgr.KEY_HOST],
                        contact_data[suite_srv_files_mgr.KEY_PORT]))
    elif full_mode:
        # Scan full port range on all hosts
        items.extend(updater.hosts)
    else:
        # Scan suites in previous results only
        for (host, owner, name), prev_result in updater.suite_info_map.items():
            port = prev_result.get(KEY_PORT)
            if port:
                items.append((host, port))
            else:
                # No port: carry the previous result over unchanged.
                results[(host, owner, name)] = prev_result
    if not items:
        return results
    if cylc.flags.debug:
        sys.stderr.write(
            'Scan items:%s%s\n' % (
                _UPDATE_DEBUG_DELIM,
                _UPDATE_DEBUG_DELIM.join(str(item) for item in items)))
    # Scan
    for host, port, result in scan_many(
            items, timeout=timeout, updater=updater):
        if updater.quit:
            return
        if (name_pattern and not name_pattern.match(result[KEY_NAME]) or
                owner_pattern and not owner_pattern.match(result[KEY_OWNER])):
            continue
        try:
            result[KEY_PORT] = port
            results[(host, result[KEY_OWNER], result[KEY_NAME])] = result
            # Normalise the update time to an integer number of seconds.
            # NOTE: the result is already stored above, so a failure here
            # leaves it in "results" with the time unconverted.
            result[KEY_UPDATE_TIME] = int(float(result[KEY_UPDATE_TIME]))
        except (KeyError, TypeError, ValueError):
            pass
    # Previous results older than this are dropped from the output.
    expire_threshold = time() - DURATION_EXPIRE_STOPPED
    for (host, owner, name), prev_result in updater.suite_info_map.items():
        if updater.quit:
            return
        if ((host, owner, name) in results or
                owner_pattern and not owner_pattern.match(owner) or
                name_pattern and not name_pattern.match(name)):
            # OK if suite already in current results set.
            # Don't bother if:
            # * previous owner does not match current owner pattern
            # * previous suite name does not match current name pattern
            continue
        if prev_result.get(KEY_PORT):
            # A previously running suite is no longer running.
            # Get suite info with "cat-state", if possible, and include in the
            # results set.
            try:
                prev_result = _update_stopped_suite_info((host, owner, name))
            except (IndexError, TypeError, ValueError):
                continue
        if prev_result.get(KEY_UPDATE_TIME, 0) > expire_threshold:
            results[(host, owner, name)] = prev_result
    return results
# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. """Package for network interfaces to cylc suite server objects.""" from cylc.cfgspec.globalcfg import GLOBAL_CFG METHOD = GLOBAL_CFG.get(['communication', 'method'])
def scan_many(items, timeout=None, updater=None):
    """Call "identify" method of suites on many host:port.

    Work is farmed out to child processes running "_scan_worker", each of
    which is sent one (host, port) at a time over a pipe.

    Args:
        items (list): list of 'host' string or ('host', port) tuple to scan.
        timeout (float): connection timeout, default is CONNECT_TIMEOUT.
        updater (object): quit scan cleanly if updater.quit is set.

    Return:
        list: [(host, port, identify_result), ...]
        An empty list is returned if "items" is empty, or if the scan is
        interrupted (KeyboardInterrupt, also raised internally when
        updater.quit is set).
    """
    if not items:
        return []
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        # Covers timeout=None as well as unparsable values.
        timeout = CONNECT_TIMEOUT
    my_uuid = uuid4()
    # Ensure that it does "localhost" only once
    items = set(items)
    for item in list(items):
        if not isinstance(item, tuple) and not is_remote_host(item):
            items.remove(item)
            items.add("localhost")
    # To do and wait (submitted, waiting for results) sets
    todo_set = set()
    wait_set = set()
    # Determine ports to scan
    base_port = None
    max_ports = None
    for item in items:
        if isinstance(item, tuple):
            # Assume item is ("host", port)
            todo_set.add(item)
        else:
            # Full port range for a host
            if base_port is None or max_ports is None:
                # Looked up lazily, and only once.
                base_port = GLOBAL_CFG.get(['communication', 'base port'])
                max_ports = GLOBAL_CFG.get(
                    ['communication', 'maximum number of ports'])
            for port in range(base_port, base_port + max_ports):
                todo_set.add((item, port))
    # Each entry is (process, parent end of pipe, time to give up on it).
    proc_items = []
    results = []
    # Number of child processes
    max_procs = GLOBAL_CFG.get(["process pool size"])
    if max_procs is None:
        max_procs = cpu_count()
    try:
        while todo_set or proc_items:
            # Set to False below whenever something happens in this pass.
            no_action = True
            # Get results back from child processes where possible
            busy_proc_items = []
            while proc_items:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                proc, my_conn, terminate_time = proc_items.pop()
                if my_conn.poll():
                    host, port, result = my_conn.recv()
                    if result is None:
                        # Can't connect, ignore
                        wait_set.remove((host, port))
                    elif result == MSG_TIMEOUT:
                        # Connection timeout, leave in "wait_set"
                        pass
                    else:
                        # Connection success
                        results.append((host, port, result))
                        wait_set.remove((host, port))
                    if todo_set:
                        # Immediately give the child process something to do
                        host, port = todo_set.pop()
                        wait_set.add((host, port))
                        my_conn.send((host, port))
                        busy_proc_items.append(
                            (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    else:
                        # Or quit if there is nothing left to do
                        my_conn.send(MSG_QUIT)
                        my_conn.close()
                        proc.join()
                    no_action = False
                elif time() > terminate_time:
                    # Terminate child process if it is taking too long
                    # (its host:port stays in "wait_set" and is reported).
                    proc.terminate()
                    proc.join()
                    no_action = False
                else:
                    # Still working: check it again on the next pass.
                    busy_proc_items.append((proc, my_conn, terminate_time))
            proc_items += busy_proc_items
            # Create some child processes where necessary
            while len(proc_items) < max_procs and todo_set:
                if updater and updater.quit:
                    raise KeyboardInterrupt()
                my_conn, conn = Pipe()
                try:
                    proc = Process(
                        target=_scan_worker, args=(conn, timeout, my_uuid))
                except OSError:
                    # Die if unable to start any worker process.
                    # OK to wait and see if any worker process already running.
                    # NOTE(review): proc.start() in the "else" branch is not
                    # covered by this handler - confirm that is intended.
                    if not proc_items:
                        raise
                    if cylc.flags.debug:
                        traceback.print_exc()
                else:
                    proc.start()
                    host, port = todo_set.pop()
                    wait_set.add((host, port))
                    my_conn.send((host, port))
                    proc_items.append(
                        (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                    no_action = False
            if no_action:
                # Nothing happened in this pass: avoid a busy loop.
                sleep(SLEEP_INTERVAL)
    except KeyboardInterrupt:
        return []
    # Report host:port with no results
    if wait_set:
        sys.stderr.write(
            'WARNING, scan timed out, no result for the following:\n')
        for key in sorted(wait_set):
            sys.stderr.write(' %s:%s\n' % key)
    return results
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job.

    Increment the submit number of "itask", record the new job in the suite
    database, determine the task host and owner, set up polling timers,
    initialise the job host and return the job configuration.

    Args:
        suite: suite name.
        itask: task proxy to prepare for job submission.

    Return:
        dict: the job configuration, with keys such as 'host', 'owner',
        'job_file_path', 'script', 'submit_num' and 'try_num'.

    """
    # Apply any broadcast settings on top of a copy of the runtime config,
    # so that the task definition itself is not modified.
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides, prepend=True)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.

    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))
    itask.task_owner = rtconfig['remote']['owner']

    # "owner@host" if an owner is set, otherwise just "host".
    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host

    # Fall back to an empty configuration if there is none for this
    # batch system on this host.
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        # E.g. no execution time limit set (None): keep the current value.
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])))
    # Reset existing submission/execution poll timers, or create them if
    # polling intervals are configured for the host.
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    if itask.state.outputs.has_custom_triggers():
        self.suite_db_mgr.put_update_task_outputs(itask)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False

    # Used below as: scripts[0] pre-script, scripts[1] script,
    # scripts[2] post-script.
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_env_tmpl': rtconfig['parameter environment templates'],
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def __init__(self, suite, task_name, task_point):
    """Set the job log base path of a task and get the "main" logger.

    The base path is the suite job log directory, followed by the task
    cycle point (as a string) and the task name.

    """
    job_log_root = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory")
    path_parts = (job_log_root, str(task_point), task_name)
    self.base_path = os.path.join(*path_parts)
    self.suite_logger = logging.getLogger("main")
from Pyro.protocol import DefaultConnValidator
import Pyro.constants
import Pyro.errors
import hmac

from cylc.cfgspec.globalcfg import GLOBAL_CFG
from cylc.network import NO_PASSPHRASE, PRIVILEGE_LEVELS
from cylc.config import SuiteConfig
from cylc.suite_host import is_remote_host
from cylc.owner import user, host

# Access for users without the suite passphrase: encrypting the "no passphrase"
# passphrase is unnecessary, but doing so allows common passphrase handling.

# Hash names read from the "authentication" section of the global config.
OK_HASHES = GLOBAL_CFG.get()['authentication']['hashes']
SCAN_HASH = GLOBAL_CFG.get()['authentication']['scan hash']
# The scan hash is always added to the list of OK hashes.
# NOTE(review): this appends to the list object returned by GLOBAL_CFG.get()
# - confirm that modifying it in place is intended.
if SCAN_HASH not in OK_HASHES:
    OK_HASHES.append(SCAN_HASH)

# Log message templates for denied and allowed client connections.
CONNECT_DENIED_TMPL = "[client-connect] DENIED %s@%s:%s %s"
CONNECT_ALLOWED_TMPL = "[client-connect] %s@%s:%s privilege='%s' %s"


class ConnValidator(DefaultConnValidator):
    """Custom Pyro connection validator for user authentication."""

    # Class-level dicts, so shared by all instances.
    # NOTE(review): presumably caches keyed by hash name - confirm against
    # the methods that fill them.
    HASHES = {}
    LENGTH_HASH_DIGESTS = {}
    NO_PASSPHRASE_HASHES = {}
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified.

    If the host or port is unknown and CYLC_SUITE_RUN_DIR is set in the
    environment, host, port and owner are first taken from the
    "cylc-suite-env" file under the suite running directory. If the host or
    port is still unknown, the port file is read: with "cat" via the remote
    shell for a remote host or user, otherwise from the local file system.
    The first line of the port file gives the port; the optional second
    line gives the host (get_hostname() is used if there is none).

    Raise:
        PortFileError: if the port file cannot be read, or if its first
            line is missing or not an integer.

    """
    if ((self.host is None or self.port is None) and
            'CYLC_SUITE_RUN_DIR' in os.environ):
        # Looks like we are in a running task job, so we should be able to
        # use "cylc-suite-env" file under the suite running directory
        try:
            suite_env = CylcSuiteEnv.load(
                self.suite, os.environ['CYLC_SUITE_RUN_DIR'])
        except CylcSuiteEnvLoadError:
            # Best effort only: fall back to the port file below.
            if cylc.flags.debug:
                traceback.print_exc()
        else:
            self.host = suite_env.suite_host
            self.port = suite_env.suite_port
            self.owner = suite_env.suite_owner

    if self.host is None or self.port is None:
        port_file_path = os.path.join(
            GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Let the remote shell expand $HOME for the remote account.
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            ret_code = proc.wait()
            if ret_code:
                if cylc.flags.debug:
                    print >> sys.stderr, {
                        "code": ret_code,
                        "command": command,
                        "stdout": out,
                        "stderr": err}
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Context manager ensures the file handle is closed.
                with open(port_file_path) as handle:
                    out = handle.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if self.port is None:
                self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if self.host is None:
            if len(lines) >= 2:
                self.host = lines[1].strip()
            else:
                self.host = get_hostname()

    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' % {
            "host": self.host,
            "port": self.port,
            "suite": self.suite,
            "owner": self.owner,
            "target": self.target_server_object})
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Args:
        item: base name of the service item (e.g. 'contact').
        reg: suite name.
        owner: suite owner; None means the current user.
        host: suite host; None means 'localhost'.

    Return:
        The content of the item as a string, or None if [owner@]host is not
        remote, if the command cannot be run or fails, or if no content is
        returned.

    """
    if not is_remote(host, owner):
        return
    if host is None:
        host = 'localhost'
    if owner is None:
        owner = get_user()
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    if item == 'contact' and not is_remote_host(host):
        # Attempt to read suite contact file via the local filesystem.
        path = r'%(run_d)s/%(srv_base)s' % {
            'run_d': GLOBAL_CFG.get_derived_host_item(
                reg, 'suite run directory', 'localhost', owner,
                replace_home=False),
            'srv_base': self.DIR_BASE_SRV,
        }
        content = self._load_local_item(item, path)
        if content is not None:
            return content
        # Else drop through and attempt via ssh to the suite account.
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat the item file under suite service directory
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('ssh command', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    # Context manager ensures the null device handle is closed.
    with open(os.devnull) as devnull:
        try:
            proc = Popen(command, stdin=devnull, stdout=PIPE, stderr=PIPE)
        except OSError:
            if cylc.flags.debug:
                import traceback
                traceback.print_exc()
            return
        out, err = proc.communicate()
        ret_code = proc.wait()
    # Extract content from STDOUT
    # It is everything after the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return content
def detect_old_contact_file(self, reg):
    """Detect old suite contact file.

    Raise SuiteServiceFileError if old contact file exists, and there is
    evidence that the old suite is still running.

    If "ps" runs but reports no matching process, the old contact file is
    removed and the method returns normally.

    Args:
        reg: suite name.

    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["ps", "-opid,args", str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            "remote shell template", old_host))
        cmd = shlex.split(ssh_tmpl) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    proc.wait()
    for line in reversed(proc.communicate()[0].splitlines()):
        fields = line.split(None, 1)
        if not fields:
            # Blank line (e.g. from a login banner): nothing to check.
            continue
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    sys.stderr.write(
        (
            r"""ERROR, suite contact file exists: %(fname)s

If %(suite)s is not running, delete the suite contact file and try again.
If it is running but unresponsive, kill any left over suite processes too.

To see if %(suite)s is running on '%(host)s:%(port)s':
 * cylc scan -n '\b%(suite)s\b' '%(host)s'
 * cylc ping -v --host='%(host)s' '%(suite)s'
 * ssh -n '%(host)s' 'ps -o pid,args %(pid)s'

"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
    raise SuiteServiceFileError(
        "ERROR, suite contact file exists: %s" % fname)
def __init__(self, cfg, updater, theme, info_bar, xdot):
    """Initialise the graph updater state.

    Store the collaborating objects given as arguments, set all display
    options and cached summaries to their initial values, create an empty
    graph for the suite, and look up the suite share directory.

    """
    super(GraphUpdater, self).__init__()

    # Collaborating objects.
    self.cfg = cfg
    self.updater = updater
    self.theme = theme
    self.info_bar = info_bar
    self.xdot = xdot

    # Control flags.
    self.quit = False
    self.cleared = False
    self.ignore_suicide = True
    self.first_update = False
    self.graph_disconnect = False
    self.action_required = True
    self.write_dot_frames = False

    # Cycle point range.
    self.focus_start_point_string = None
    self.focus_stop_point_string = None
    self.oldest_point_string = None
    self.newest_point_string = None

    # Display options.
    self.orientation = "TB"  # Top to Bottom ordering of nodes
    self.best_fit = True  # zoom to page size
    self.normal_fit = False  # zoom to 1.0 scale
    self.crop = False
    self.subgraphs_on = False  # organise by cycle point.

    # Suite structure and state data.
    self.descendants = {}
    self.all_families = []
    self.state_summary = {}
    self.fam_state_summary = {}
    self.global_summary = {}
    self.last_update_time = None
    self.god = None
    self.mode = "waiting..."
    self.update_time_str = "waiting..."
    self.prev_graph_id = ()

    # empty graphw object:
    self.graphw = CGraphPlain(cfg.suite)

    # lists of nodes to newly group or ungroup (not of all currently
    # grouped and ungrouped nodes - still held server side)
    self.group = []
    self.ungroup = []
    self.have_leaves_and_feet = False
    self.leaves = []
    self.feet = []

    # Ungroup everything if the graph view is configured as ungrouped,
    # otherwise group everything.
    self.ungroup_recursive = False
    self.ungroup_all = "graph" in cfg.ungrouped_views
    self.group_all = not self.ungroup_all

    self.graph_frame_count = 0
    self.suite_share_dir = GLOBAL_CFG.get_derived_host_item(
        cfg.suite, 'suite share directory')
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    items = self._remote_init_items(host, owner)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create "stdin_file_paths" file, with "items" in it.
    # The handle is passed on to the callback below, so it is not closed
    # here.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    if not os.path.exists(uuid_fname):
        # Close the file explicitly, so that the content is on disk before
        # the remote-init command is queued.
        with open(uuid_fname, 'wb') as uuid_handle:
            uuid_handle.write(str(self.uuid))
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    cmd.append(str(self.uuid))
    cmd.append(GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Nothing is done in single task mode, if [owner@]host is already
    initialised or is not remote, or if the remote host is found to share
    the file system of the suite host.

    Args:
        reg: suite name.
        host: job host; None means 'localhost'.
        owner: job owner account, if any.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if host is None:
        host = 'localhost'
    if (self.single_task_mode or
            (host, owner) in self.init_host_map or
            not is_remote(host, owner)):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host

    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        # Drain the pipes while waiting: wait() alone can block forever if
        # the command fills a pipe buffer.
        proc.communicate()
        if proc.returncode == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # The copy command is needed for the python library below, whatever
    # the task communication method is.
    scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    comm_meth = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner)
    should_unlink = comm_meth != 'poll'
    if should_unlink:
        items = [self.suite_srv_files_mgr.get_contact_file(reg)]
        if comm_meth.startswith('http'):
            items.append(self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg))
            # Handle not having SSL certs installed.
            try:
                items.append(self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg))
            except (SuiteServiceFileError, ValueError):
                pass
        cmds.append(
            shlex.split(scp_tmpl) + ['-p'] + items +
            [user_at_host + ':' + r_suite_srv_dir + '/'])
    # Command to copy python library to remote host.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
        'python')
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            '-pr',
            suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err)
    # Record whether files were installed that need tidying up later.
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".

    Failed or terminated commands are reported as warnings via LOG;
    they do not raise.
    """
    from time import sleep

    def _warn_if_failed(host_owner, cmd, proc):
        """Collect output of a finished command, log a warning on failure."""
        out, err = proc.communicate()
        if proc.returncode:
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY,
                host_owner, ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))

    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    # Share a single /dev/null handle between all commands, and close it
    # once they have all been launched.
    with open(os.devnull) as devnull:
        for (host, owner), init_with_contact in (
                self.remote_init_map.items()):
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flags.debug:
                cmd.append('--debug')
            cmd.append(GLOBAL_CFG.get_derived_host_item(
                self.suite, 'suite run directory', host, owner))
            procs[(host, owner)] = (
                cmd,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=devnull))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy, as finished entries are removed on the fly.
        for host_owner, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[host_owner]
            _warn_if_failed(host_owner, cmd, proc)
        if procs:
            # Do not spin at full CPU while waiting for the stragglers.
            sleep(0.1)
    # Terminate any remaining commands
    for host_owner, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        _warn_if_failed(host_owner, cmd, proc)
coercers['interval'] = coerce_interval coercers['interval_list'] = coerce_interval_list coercers['parameter_list'] = _coerce_parameter_list SPEC = { 'meta': { 'description': vdr(vtype='string', default=""), 'group': vdr(vtype='string', default=""), 'title': vdr(vtype='string', default=""), 'URL': vdr(vtype='string', default=""), '__MANY__': vdr(vtype='string', default=""), }, 'cylc': { 'UTC mode': vdr( vtype='boolean', default=GLOBAL_CFG.get(['cylc', 'UTC mode'])), 'cycle point format': vdr( vtype='cycletime_format', default=None), 'cycle point num expanded year digits': vdr( vtype='integer', default=0), 'cycle point time zone': vdr( vtype='cycletime_time_zone', default=None), 'required run mode': vdr( vtype='string', options=['live', 'dummy', 'dummy-local', 'simulation', '']), 'force run mode': vdr( vtype='string', options=['live', 'dummy', 'dummy-local', 'simulation', '']), 'abort if any task fails': vdr(vtype='boolean', default=False), 'health check interval': vdr(vtype='interval', default=None), 'task event mail interval': vdr(vtype='interval', default=None),
def _get_identification_cfg(key):
    """Look up one item of the global configuration.

    Return the value stored under
    ['suite host self-identification', key] in GLOBAL_CFG.
    """
    # Imported here rather than at module level, as in the original.
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    section = 'suite host self-identification'
    return GLOBAL_CFG.get([section, key])
def __init__(
        self, hosts=None, patterns_name=None, patterns_owner=None,
        comms_timeout=None, poll_interval=None):
    """Build the "cylc gscan" window and start its updater.

    Args:
        hosts (list): hosts to scan; if empty or None, fall back to the
            ["suite host scanning", "hosts"] global configuration.
        patterns_name (list): regular expressions; joined into a single
            anchored pattern passed to the updater as name_pattern.
        patterns_owner (list): as patterns_name, passed as owner_pattern.
        comms_timeout: passed through to ScanAppUpdater.
        poll_interval: passed through to ScanAppUpdater.

    Raise:
        ValueError: if the name or owner patterns do not compile.
    """
    gobject.threads_init()
    set_exception_hook_dialog("cylc gscan")
    setup_icons()
    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    self.hosts = hosts

    self.window = gtk.Window()
    # Show the non-default filters in the window title.
    title = "cylc gscan"
    for opt, items, skip in [
            ("-n", patterns_name, None), ("-o", patterns_owner, USER)]:
        if items:
            for pattern in items:
                if pattern != skip:
                    title += " %s %s" % (opt, pattern)
    self.window.set_title(title)
    self.window.set_icon(get_icon())
    self.vbox = gtk.VBox()
    self.vbox.show()

    self.warnings = {}

    self.theme_name = gcfg.get(['use theme'])
    self.theme = gcfg.get(['themes', self.theme_name])

    self.dots = DotMaker(self.theme)
    suite_treemodel = gtk.TreeStore(
        str,  # group
        str,  # host
        str,  # owner
        str,  # suite
        bool,  # is_stopped
        str,  # title
        int,  # update_time
        str,  # states
        str,  # states_text
        str)  # warning_text
    self._prev_tooltip_location_id = None
    self.suite_treeview = gtk.TreeView(suite_treemodel)

    # Visibility of columns.
    # Entries are compared against lower-cased column titles below, so
    # every check and every addition here uses the lower-cased title too.
    vis_cols = gsfg.get(["columns"])
    forced_cols = [gsfg.COL_SUITE]  # Makes no sense without suite name
    if hosts:
        # In multiple host environment, add host column by default
        forced_cols.append(gsfg.COL_HOST)
    if patterns_owner != [USER]:
        # In multiple owner environment, add owner column by default
        forced_cols.append(gsfg.COL_OWNER)
    for forced_col in forced_cols:
        if forced_col.lower() not in vis_cols:
            vis_cols.append(forced_col.lower())

    # Construct the group, host, owner, suite, title, update time column.
    for col_title, col_id, col_cell_text_setter in [
            (gsfg.COL_GROUP, self.GROUP_COLUMN, self._set_cell_text_group),
            (gsfg.COL_HOST, self.HOST_COLUMN, self._set_cell_text_host),
            (gsfg.COL_OWNER, self.OWNER_COLUMN, self._set_cell_text_owner),
            (gsfg.COL_SUITE, self.SUITE_COLUMN, self._set_cell_text_name),
            (gsfg.COL_TITLE, self.TITLE_COLUMN, self._set_cell_text_title),
            (gsfg.COL_UPDATED, self.UPDATE_TIME_COLUMN,
             self._set_cell_text_time),
    ]:
        column = gtk.TreeViewColumn(col_title)
        cell_text = gtk.CellRendererText()
        column.pack_start(cell_text, expand=False)
        column.set_cell_data_func(cell_text, col_cell_text_setter)
        column.set_sort_column_id(col_id)
        column.set_visible(col_title.lower() in vis_cols)
        column.set_resizable(True)
        self.suite_treeview.append_column(column)

    # Construct the status column.
    status_column = gtk.TreeViewColumn(gsfg.COL_STATUS)
    status_column.set_sort_column_id(self.STATUS_COLUMN)
    status_column.set_visible(gsfg.COL_STATUS.lower() in vis_cols)
    status_column.set_resizable(True)
    cell_text_cycle = gtk.CellRendererText()
    status_column.pack_start(cell_text_cycle, expand=False)
    status_column.set_cell_data_func(
        cell_text_cycle, self._set_cell_text_cycle, self.CYCLE_COLUMN)
    self.suite_treeview.append_column(status_column)

    # Warning icon.
    warn_icon = gtk.CellRendererPixbuf()
    image = gtk.Image()
    pixbuf = image.render_icon(
        gtk.STOCK_DIALOG_WARNING, gtk.ICON_SIZE_LARGE_TOOLBAR)
    self.warn_icon_colour = pixbuf.scale_simple(  # colour warn icon pixbuf
        self.ICON_SIZE, self.ICON_SIZE, gtk.gdk.INTERP_HYPER)
    self.warn_icon_grey = pixbuf.scale_simple(
        self.ICON_SIZE, self.ICON_SIZE, gtk.gdk.INTERP_HYPER)
    self.warn_icon_colour.saturate_and_pixelate(
        self.warn_icon_grey, 0, False)  # b&w warn icon pixbuf
    status_column.pack_start(warn_icon, expand=False)
    status_column.set_cell_data_func(warn_icon, self._set_error_icon_state)
    # NOTE(review): this stores the return value of fill(), not the
    # pixbuf itself. If Pixbuf.fill() returns None (as it appears to in
    # PyGTK), warn_icon_blank is None rather than a transparent pixbuf.
    # Left unchanged pending confirmation of what the renderer expects.
    self.warn_icon_blank = gtk.gdk.Pixbuf(  # Transparent pixbuff.
        gtk.gdk.COLORSPACE_RGB, True, 8, self.ICON_SIZE, self.ICON_SIZE
    ).fill(0x00000000)
    # Task status icons.
    for i in range(len(TASK_STATUSES_ORDERED)):
        cell_pixbuf_state = gtk.CellRendererPixbuf()
        status_column.pack_start(cell_pixbuf_state, expand=False)
        status_column.set_cell_data_func(
            cell_pixbuf_state, self._set_cell_pixbuf_state, i)

    self.suite_treeview.show()
    if hasattr(self.suite_treeview, "set_has_tooltip"):
        self.suite_treeview.set_has_tooltip(True)
        try:
            self.suite_treeview.connect('query-tooltip',
                                        self._on_query_tooltip)
        except TypeError:
            # Lower PyGTK version.
            pass
    self.suite_treeview.connect("button-press-event",
                                self._on_button_press_event)
    scrolled_window = gtk.ScrolledWindow()
    scrolled_window.set_policy(gtk.POLICY_AUTOMATIC,
                               gtk.POLICY_AUTOMATIC)
    scrolled_window.add(self.suite_treeview)
    scrolled_window.show()
    self.vbox.pack_start(scrolled_window, expand=True, fill=True)

    patterns = {"name": None, "owner": None}
    for label, items in [
            ("owner", patterns_owner), ("name", patterns_name)]:
        if items:
            # Wrap the whole alternation in one group so that \A and \Z
            # anchor every alternative. "|" binds loosest, so without the
            # outer group only the first item is anchored at the start
            # and only the last item at the end.
            patterns[label] = (
                r"\A(?:(?:" + r")|(?:".join(items) + r"))\Z")
            try:
                patterns[label] = re.compile(patterns[label])
            except re.error:
                raise ValueError("Invalid %s pattern: %s" % (label, items))

    self.updater = ScanAppUpdater(
        self.window, self.hosts, suite_treemodel, self.suite_treeview,
        comms_timeout=comms_timeout, poll_interval=poll_interval,
        group_column_id=self.GROUP_COLUMN,
        name_pattern=patterns["name"], owner_pattern=patterns["owner"])
    self.updater.start()
    self.window.add(self.vbox)
    self.window.connect("destroy", self._on_destroy_event)
    self.window.set_default_size(300, 150)
    self.suite_treeview.grab_focus()
    self.window.show()

    self.warning_icon_shown = []
def _get_derived_host_item(job_conf, key):
    """Look up a derived host item in GLOBAL_CFG for a job.

    The suite name, host and owner are taken from the 'suite_name',
    'host' and 'owner' entries of job_conf.
    """
    suite_name = job_conf['suite_name']
    host = job_conf["host"]
    owner = job_conf["owner"]
    return GLOBAL_CFG.get_derived_host_item(suite_name, key, host, owner)
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If old contact file does not exist, do nothing. If old contact file
    exists, but suite process is definitely not alive, remove old contact
    file. If old contact file exists and suite process still alive, raise
    SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process still alive.
            Also raised if "ps" gave no usable answer, or if the stale
            contact file could not be removed.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    # Close the /dev/null handle once the child has inherited it.
    with open(os.devnull) as devnull:
        proc = Popen(cmd, stdin=devnull, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    ret_code = proc.wait()
    out, err = proc.communicate()
    if cylc.flags.debug and ret_code:
        sys.stderr.write(
            "%s  # return %d\n%s\n" % (' '.join(cmd), ret_code, err))
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        # Guard against blank lines, which have no first field.
        fields = line.split(None, 1)
        if fields and fields[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    raise SuiteServiceFileError(
        (
            r"""ERROR, suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s # wait for active tasks/event handlers
* cylc stop --kill %(suite)s # kill active tasks and wait
* cylc stop --now %(suite)s # don't wait for active tasks
* cylc stop --now --now %(suite)s # don't wait
* ssh -n "%(host)s" kill %(pid)s # final brute force!
"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
def run(self):
    """Invoke the tailer.

    Build a tail command for self.filename, run it, and pass complete
    lines of its output to self.update_gui via gobject.idle_add until
    self.quit is set or the command exits.

    A self.filename of the form "[user@]host:path" is tailed via the
    configured ssh command and remote tail command template; anything
    else uses the local tail command template. A non-empty
    self.cmd_tmpl overrides either template.
    """
    command = []
    if ":" in self.filename:  # remote
        # Split on the first colon only: the path itself may contain
        # colons, which would otherwise break the 2-item unpacking.
        user_at_host, filename = self.filename.split(':', 1)
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = (None, user_at_host)
        ssh = str(GLOBAL_CFG.get_host_item("ssh command", host, owner))
        command = shlex.split(ssh) + ["-n", user_at_host]
        cmd_tmpl = str(GLOBAL_CFG.get_host_item(
            "remote tail command template", host, owner))
    else:
        filename = self.filename
        cmd_tmpl = str(
            GLOBAL_CFG.get_host_item("local tail command template"))

    if self.cmd_tmpl:
        cmd_tmpl = self.cmd_tmpl
    command += shlex.split(cmd_tmpl % {"filename": filename})
    try:
        # The command is started with os.setpgrp as its preexec_fn.
        self.proc = Popen(
            command, stdout=PIPE, stderr=STDOUT, preexec_fn=os.setpgrp)
    except OSError as exc:
        # E.g. ssh command not found
        dialog = warning_dialog("%s: %s" % (
            exc, " ".join(quote(item) for item in command)))
        gobject.idle_add(dialog.warn)
        return
    poller = select.poll()
    poller.register(self.proc.stdout.fileno())

    buf = ""  # Holds a trailing partial line until its newline arrives
    while not self.quit and self.proc.poll() is None:
        try:
            self.pollable.poll()
        except (TypeError, AttributeError):
            pass
        if self.freeze or not poller.poll(100):  # 100 ms timeout
            sleep(1)
            continue
        # Both self.proc.stdout.read(SIZE) and self.proc.stdout.readline()
        # can block. However os.read(FILENO, SIZE) should be fine after a
        # poller.poll().
        try:
            data = os.read(self.proc.stdout.fileno(), self.READ_SIZE)
        except (IOError, OSError) as exc:
            dialog = warning_dialog("%s: %s" % (
                exc, " ".join(quote(item) for item in command)))
            gobject.idle_add(dialog.warn)
            break
        if data:
            # Manage buffer, only add full lines to display to ensure
            # filtering and tagging work
            for line in data.splitlines(True):
                if not line.endswith("\n"):
                    buf += line
                    continue
                elif buf:
                    line = buf + line
                    buf = ""
                if (not self.filters or
                        all(f.search(line) for f in self.filters)):
                    gobject.idle_add(self.update_gui, line)
        sleep(0.01)
    self.stop()
def execute(self, force_required=False, env=None, path=None,
            dry_run=False):
    """Execute command on remote host.

    Args:
        force_required: not used by this method.
        env (dict): each item is passed to the remote command as a
            "--env=VAR=VALUE" option.
        path (list): path components of the directory containing "cylc"
            on the remote host; if empty or None, the configured
            "cylc executable" is used instead.
        dry_run (bool): if True, build the command but do not run it.

    Returns False if remote re-invocation is not needed, the command
    (as a list) if dry_run is set, True if remote re-invocation is
    needed and executes successfully, otherwise aborts via sys.exit.

    """
    if not self.is_remote:
        return False

    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    from cylc.version import CYLC_VERSION

    name = os.path.basename(self.argv[0])[5:]  # /path/to/cylc-foo => foo

    user_at_host = ''
    if self.owner:
        user_at_host = self.owner + '@'
    if self.host:
        user_at_host += self.host
    else:
        user_at_host += 'localhost'

    # Build the remote command

    # ssh command and options (X forwarding)
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        "remote shell template", self.host, self.owner)).replace(" %s", "")
    command = shlex.split(ssh_tmpl) + ["-Y", user_at_host]

    # Use bash -l?
    ssh_login_shell = self.ssh_login_shell
    if ssh_login_shell is None:
        ssh_login_shell = GLOBAL_CFG.get_host_item(
            "use login shell", self.host, self.owner)

    # Pass cylc version through.
    command += ["env", "CYLC_VERSION=%s" % CYLC_VERSION]

    if ssh_login_shell:
        # A login shell will always source /etc/profile and the user's bash
        # profile file. To avoid having to quote the entire remote command
        # it is passed as arguments to the bash script.
        command += ["bash", "--login", "-c", "'exec $0 \"$@\"'"]

    # "cylc" on the remote host
    if path:
        command.append(os.sep.join(path + ["cylc"]))
    else:
        command.append(GLOBAL_CFG.get_host_item(
            "cylc executable", self.host, self.owner))

    command.append(name)

    if env is None:
        env = {}
    for var, val in env.items():
        command.append("--env=%s=%s" % (var, val))

    for arg in self.args:
        # Args quoted to avoid interpretation by the shell,
        # e.g. for match patterns such as '.*' on the command line.
        # NOTE(review): an argument containing a single quote is not
        # escaped by this scheme.
        command.append("'" + arg + "'")

    if cylc.flags.verbose:
        # Wordwrap the command, quoting arguments so they can be run
        # properly from the command line
        command_str = ' '.join([quote(arg) for arg in command])
        print('\n'.join(
            TextWrapper(subsequent_indent='\t').wrap(command_str)))

    if dry_run:
        return command

    try:
        popen = subprocess.Popen(command)
    except OSError as exc:
        sys.exit("ERROR: remote command invocation failed %s" % str(exc))

    # Popen.wait() returns the exit code, or -N if the child was
    # terminated by signal N. It is not a raw os.wait() status, so the
    # os.WIF* helpers must not be applied to it.
    res = popen.wait()
    if res < 0:
        sys.exit("ERROR: remote command terminated by signal %d" % -res)
    elif res:
        sys.exit("ERROR: remote command failed %d" % res)
    else:
        return True
def get_dir_for_suite(suite):
    """Return the 'suite log directory' item derived for a suite.

    This is a plain configuration lookup: it does not set up suite
    logging.
    """
    item = 'suite log directory'
    log_dir = GLOBAL_CFG.get_derived_host_item(suite, item)
    return log_dir
def _get_default_hash_name(self): if hasattr(self, "_default_hash_name"): return self._default_hash_name return GLOBAL_CFG.get()['authentication']['hashes'][0]
def _write_environment_1(self, handle, job_conf):
    """Suite and task environment.

    Write "export VAR=VALUE" lines to handle, in this order:
    * the static suite variables in self.suite_env (plus TZ=UTC if
      CYLC_UTC is 'True'),
    * the suite directory variables derived for the job host,
    * the CYLC_TASK_* variables and CYLC_JOB_PID.

    Args:
        handle: object with a write(str) method, e.g. an open job file.
        job_conf (dict): job configuration; the keys read here are
            'suite name', 'host', 'owner', 'remote suite path',
            'work sub-directory', 'task id', 'is cold-start',
            'job file path', 'namespace hierarchy',
            'absolute submit number' and 'try number'.
    """
    handle.write("\n\n# CYLC SUITE ENVIRONMENT:")

    # write the static suite variables
    for var, val in sorted(self.suite_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    if str(self.suite_env.get('CYLC_UTC')) == 'True':
        handle.write("\nexport TZ=UTC")

    handle.write("\n")
    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite work directory',
        job_conf['host'], job_conf['owner'])
    st_env = {}
    st_env['CYLC_SUITE_RUN_DIR'] = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite run directory',
        job_conf['host'], job_conf['owner'])
    st_env['CYLC_SUITE_WORK_DIR'] = suite_work_dir
    st_env['CYLC_SUITE_SHARE_DIR'] = GLOBAL_CFG.get_derived_host_item(
        job_conf['suite name'], 'suite share directory',
        job_conf['host'], job_conf['owner'])
    # DEPRECATED
    st_env['CYLC_SUITE_SHARE_PATH'] = '$CYLC_SUITE_SHARE_DIR'
    rsp = job_conf['remote suite path']
    if rsp:
        st_env['CYLC_SUITE_DEF_PATH'] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host
        # The home directory is literal text, not a pattern: escape it so
        # that characters such as "." or "+" are not read as regex syntax.
        st_env['CYLC_SUITE_DEF_PATH'] = re.sub(
            re.escape(os.environ['HOME']), '$HOME',
            self.suite_env['CYLC_SUITE_DEF_PATH_ON_SUITE_HOST'])
    for var, val in sorted(st_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    task_work_dir = os.path.join(
        suite_work_dir, job_conf['work sub-directory'])

    use_login_shell = GLOBAL_CFG.get_host_item(
        'use login shell', job_conf['host'], job_conf['owner'])
    comms = GLOBAL_CFG.get_host_item(
        'task communication method', job_conf['host'], job_conf['owner'])

    task_name, point_string = TaskID.split(job_conf['task id'])
    handle.write("\n\n# CYLC TASK ENVIRONMENT:")
    handle.write("\nexport CYLC_TASK_COMMS_METHOD=" + comms)
    handle.write("\nexport CYLC_TASK_CYCLE_POINT=" + point_string)
    handle.write("\nexport CYLC_TASK_CYCLE_TIME=" + point_string)
    handle.write("\nexport CYLC_TASK_ID=" + job_conf['task id'])
    handle.write(
        "\nexport CYLC_TASK_IS_COLDSTART=" +
        str(job_conf['is cold-start']))
    handle.write(
        "\nexport CYLC_TASK_LOG_ROOT=" + job_conf['job file path'])
    handle.write(
        "\nexport CYLC_TASK_MSG_MAX_TRIES=" +
        str(GLOBAL_CFG.get(['task messaging', 'maximum number of tries'])))
    handle.write(
        "\nexport CYLC_TASK_MSG_RETRY_INTVL=" +
        str(GLOBAL_CFG.get(['task messaging', 'retry interval'])))
    handle.write(
        "\nexport CYLC_TASK_MSG_TIMEOUT=" +
        str(GLOBAL_CFG.get(['task messaging', 'connection timeout'])))
    handle.write("\nexport CYLC_TASK_NAME=" + task_name)
    handle.write(
        '\nexport CYLC_TASK_NAMESPACE_HIERARCHY="' +
        ' '.join(job_conf['namespace hierarchy']) + '"')
    handle.write(
        "\nexport CYLC_TASK_SSH_LOGIN_SHELL=" + str(use_login_shell))
    handle.write(
        "\nexport CYLC_TASK_SUBMIT_NUMBER=" +
        str(job_conf['absolute submit number']))
    handle.write(
        "\nexport CYLC_TASK_TRY_NUMBER=" +
        str(job_conf['try number']))
    handle.write("\nexport CYLC_TASK_WORK_DIR=" + task_work_dir)
    # DEPRECATED
    handle.write("\nexport CYLC_TASK_WORK_PATH=$CYLC_TASK_WORK_DIR")
    handle.write("\nexport CYLC_JOB_PID=$$")