def construct_newtree(self):
    """Build self.newtree, a nested dict keyed by suite name components.

    self.newtree[one][two]...[(nnn, suite, auth)] = [auth, descr, dir]

    Suites in self.running_choices are stored with their "auth" string.
    Suites in self.regd_choices that were not consumed by a local running
    suite are stored with "-" in place of the auth, and with a leading
    $HOME in their directory abbreviated to "~".
    """
    delim = SuiteSrvFilesManager.DELIM

    def add_leaf(suite, auth, descr, suite_dir):
        """Store one leaf, creating intermediate dicts on demand."""
        names = suite.split(delim)
        node = self.newtree
        for name in names[:-1]:
            node = node.setdefault(name, {})
        node[(names[-1], suite, auth)] = [auth, descr, suite_dir]

    registered = {}
    for suite, suite_dir, descr in sorted(self.regd_choices):
        registered[suite] = (suite, suite_dir, descr)

    self.newtree = {}

    for suite, auth in self.running_choices:
        # Reset for every suite: a running suite that is not registered
        # must not pick up the description and directory of the previous
        # suite (or hit an unbound local on the first iteration).
        descr, suite_dir = None, None
        if (suite in registered and
                not is_remote_host(auth.split(':', 1)[0])):
            # Local suite: use its registration, and do not list it again
            # below as a non-running suite.
            _, suite_dir, descr = registered.pop(suite)
        add_leaf(suite, auth, descr, suite_dir)

    for suite, suite_dir, descr in registered.values():
        # Escape $HOME so regex metacharacters in it are matched literally.
        suite_dir = re.sub(
            '^' + re.escape(os.environ['HOME']), '~', suite_dir)
        add_leaf(suite, '-', descr, suite_dir)
def __init__(self, argv=None):
    """Parse the command line, extracting the remote access options.

    The options --user=USER, --host=HOST, --login and --no-login are
    consumed here and recorded in self.owner, self.host and
    self.ssh_login_shell. All other arguments (excluding argv[0]) are
    kept, in order, in self.args.

    Args:
        argv: full argument list including the program name; defaults to
            sys.argv when None or empty.
    """
    self.owner = None
    self.host = None
    self.ssh_login_shell = None
    self.argv = argv or sys.argv

    cylc.flags.verbose = '-v' in self.argv or '--verbose' in self.argv

    # detect and replace host and owner options
    user_opt = "--user="
    host_opt = "--host="
    self.args = []
    for arg in self.argv[1:]:
        if arg.startswith(user_opt):
            # Strip only the leading option name from the value.
            self.owner = arg[len(user_opt):]
        elif arg.startswith(host_opt):
            self.host = arg[len(host_opt):]
        elif arg == "--login":
            self.ssh_login_shell = True
        elif arg == "--no-login":
            self.ssh_login_shell = False
        else:
            self.args.append(arg)

    if self.owner is None and self.host is None:
        self.is_remote = False
    else:
        # Only imported when a user or host option was actually given.
        from cylc.suite_host import is_remote_host
        from cylc.owner import is_remote_user
        self.is_remote = (
            is_remote_user(self.owner) or is_remote_host(self.host))
def construct_newtree(self):
    """Build self.newtree, a nested dict keyed by suite name components.

    self.newtree[one][two]...[(nnn, suite, auth)] = [auth, descr, dir]

    Entries from self.running_choices carry their "auth" string. Entries
    from self.regd_choices that were not matched by a local running suite
    carry "-" instead, with a leading $HOME in the directory shown as "~".
    """
    def store(tree, suite, auth, descr, suite_dir):
        """Insert a leaf for suite under its delimiter-separated path."""
        parts = suite.split(RegPath.delimiter)
        branch = tree
        for part in parts[:-1]:
            if part not in branch:
                branch[part] = {}
            branch = branch[part]
        branch[(parts[-1], suite, auth)] = [auth, descr, suite_dir]

    regd = {}
    for suite, suite_dir, descr in sorted(self.regd_choices):
        regd[suite] = (suite, suite_dir, descr)

    self.newtree = {}

    for suite, auth in self.running_choices:
        # Start from "unknown" for each suite so that an unregistered
        # running suite does not reuse values left over from the previous
        # iteration (or fail on an unbound local on the first one).
        descr = None
        suite_dir = None
        if suite in regd:
            if not is_remote_host(auth.split(':', 1)[0]):
                # Local suite: take the registered details and remove it
                # from the list of non-running registrations.
                _, suite_dir, descr = regd[suite]
                del regd[suite]
        store(self.newtree, suite, auth, descr, suite_dir)

    for suite, suite_dir, descr in regd.values():
        # $HOME is matched literally, not as a regular expression.
        home_pattern = '^' + re.escape(os.environ['HOME'])
        suite_dir = re.sub(home_pattern, '~', suite_dir)
        store(self.newtree, suite, '-', descr, suite_dir)
def _load_port_file(self):
    """Load port, host, etc from port file.

    The port file lives under the configured "ports directory" and is
    named after the suite. For a remote host or user it is read by running
    "cat" through the configured remote shell; otherwise it is read from
    the local file system.

    Sets self.port (if not already set) from the first line of the file,
    and self.host (if not already set) from the second line, falling back
    to get_hostname() when there is no second line.

    Raises:
        PortFileError: if self.port is not set and the port file cannot
            be read or its first line is not an integer.
    """
    # GLOBAL_CFG is expensive to import, so only load on demand
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['communication', 'ports directory']), self.suite)
    out = ""
    if is_remote_host(self.host) or is_remote_user(self.owner):
        # Only load these modules on demand, as they may be expensive
        import shlex
        from subprocess import Popen, PIPE
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            'remote shell template', self.host, self.owner))
        ssh_tmpl = ssh_tmpl.replace(' %s', '')
        user_at_host = ''
        if self.owner:
            user_at_host = self.owner + '@'
        if self.host:
            user_at_host += self.host
        else:
            user_at_host += 'localhost'
        # Leave $HOME to be expanded on the remote side.
        r_port_file_path = port_file_path.replace(
            os.environ['HOME'], '$HOME')
        command = shlex.split(ssh_tmpl) + [
            user_at_host, 'cat', r_port_file_path]
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        # communicate() has already waited for the process to exit.
        ret_code = proc.returncode
        if ret_code:
            if cylc.flags.debug:
                print >> sys.stderr, {
                    "code": ret_code,
                    "command": command,
                    "stdout": out,
                    "stderr": err}
            if self.port is None:
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
    else:
        try:
            # Close the file deterministically instead of relying on
            # garbage collection.
            with open(port_file_path) as handle:
                out = handle.read()
        except IOError:
            # A missing file is only fatal if the port is still unknown.
            if self.port is None:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
    lines = out.splitlines()
    if self.port is None:
        try:
            self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
    if self.host is None:
        if len(lines) >= 2:
            self.host = lines[1].strip()
        else:
            self.host = get_hostname()
def get_passphrase(suite, owner, host, db):
    """Find a suite passphrase.

    For a suite that is neither on a remote host nor owned by a remote
    user, the directory containing the suite.rc registered in "db" is
    passed on as the suite definition directory; otherwise None is passed.
    """
    suitedir = None
    if not (is_remote_host(host) or is_remote_user(owner)):
        # Local suite, retrieve suite definition directory location.
        suiterc = db.get_suiterc(suite)
        suitedir = os.path.dirname(suiterc)
    finder = passphrase(suite, owner, host)
    return finder.get(None, suitedir)
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Runs a small script through the configured remote shell that echoes a
    marker line and then cats the item from the suite service directory
    under the remote suite run directory.

    Returns:
        The text following the marker line in the command's STDOUT, or
        None if the target is not remote, the remote shell cannot be
        started, the command fails, or no content follows the marker.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    # Only one of owner and host needs to be remote to get here, so either
    # may be None: build [owner@]host without concatenating None.
    user_at_host = ''
    if owner:
        user_at_host = owner + '@'
    if host:
        user_at_host += host
    else:
        user_at_host += 'localhost'
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('remote shell template', host, owner))
    command += ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    # communicate() has already waited for the process to exit.
    ret_code = proc.returncode
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    parts = []
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            parts.append(line)
        elif line.strip() == prefix:
            can_read = True
    content = "".join(parts)
    if not content or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return content
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    The configured "ssh command" is used to run a script that echoes a
    marker line, then cats the item from the suite service directory
    under the remote suite run directory.

    Returns:
        Everything printed after the marker line, or None if the target
        is not remote, the SSH command cannot be started, the command
        returns non-zero, or nothing follows the marker.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    run_d = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': run_d,
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    # The guard above lets through a None owner (remote host only) or a
    # None host (remote user only), so never concatenate them blindly.
    if owner:
        user_at_host = owner + '@' + (host or 'localhost')
    else:
        user_at_host = host or 'localhost'
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('ssh command', host, owner))
    command += ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    # The exit status is available once communicate() returns.
    ret_code = proc.returncode
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    kept_lines = []
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            kept_lines.append(line)
        elif line.strip() == prefix:
            can_read = True
    content = "".join(kept_lines)
    if not content or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return content
def _scan1_impl(conn, timeout, my_uuid):
    """Connect to host:port to get suite identify.

    Worker loop: wait on "conn" for (host, port) items until MSG_QUIT is
    received. For each item send back (host, port, result), where result
    is MSG_TIMEOUT on a connection timeout, None on another connection
    error, or else the value returned by the client's identify() call.
    """
    files_mgr = SuiteSrvFilesManager()
    while True:
        if not conn.poll(SLEEP_INTERVAL):
            continue
        request = conn.recv()
        if request == MSG_QUIT:
            break
        host, port = request
        if is_remote_host(host):
            # IP reduces DNS traffic
            address = get_host_ip_by_name(host)
        else:
            address = host
        anon_client = SuiteIdClientAnon(
            None, host=address, port=port, my_uuid=my_uuid,
            timeout=timeout)
        try:
            result = anon_client.identify()
        except ConnectionTimeout:
            conn.send((host, port, MSG_TIMEOUT))
            continue
        except ConnectionError:
            conn.send((host, port, None))
            continue

        owner = result.get('owner')
        name = result.get('name')
        states = result.get('states', None)
        if cylc.flags.debug:
            print >> sys.stderr, ' suite:', name, owner
        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            try:
                pphrase = files_mgr.get_auth_item(
                    files_mgr.FILE_BASE_PASSPHRASE,
                    name, owner, host, content=True)
            except SuiteServiceFileError:
                pphrase = None
            if pphrase:
                auth_client = SuiteIdClient(
                    name, owner=owner, host=host, port=port,
                    my_uuid=my_uuid, timeout=timeout)
                try:
                    result = auth_client.identify()
                except ConnectionError:
                    # Nope (private suite, wrong passphrase).
                    if cylc.flags.debug:
                        print >> sys.stderr, ' (wrong passphrase)'
                else:
                    if cylc.flags.debug:
                        print >> sys.stderr, (
                            ' (got states with passphrase)')
        conn.send((host, port, result))
    conn.close()
def get(self):
    """Return the port number read from the port file, as an int.

    The raw content comes from self.get_remote() when the host or owner
    is remote, else from self.get_local().

    Raises:
        PortFileError: if the content cannot be converted to an integer.
    """
    if is_remote_host(self.host) or is_remote_user(self.owner):
        str_port = self.get_remote()
    else:
        str_port = self.get_local()
    try:
        port = int(str_port)
    except ValueError as exc:
        # This also catches an empty port file (touch).
        print >> sys.stderr, exc
        print >> sys.stderr, "ERROR: bad port file", self.locn
        raise PortFileError(
            "ERROR, illegal port file content: %s" % str_port)
    # Hand the validated port back to the caller; the visible code
    # computed it but then dropped it.
    return port
def __init__( self, suite, options ):
    """Record the suite and options; locate the suite.rc of a local suite.

    For a suite that is neither on a remote host nor owned by a remote
    user, open the registration database named by options.db and set
    self.suiterc and self.suitedir from it. A lookup failure exits via
    SystemExit, or is re-raised as is when cylc.flags.debug is set.
    """
    self.options = options
    self.suite = suite
    self.suiterc = None
    self.suitedir = None

    if is_remote_host( options.host ) or is_remote_user( options.owner ):
        # Nothing to look up locally for a remote suite.
        return

    self.db = localdb(file=options.db )
    try:
        self.suiterc = self.db.get_suiterc( suite )
        self.suitedir = os.path.dirname( self.suiterc )
    except Exception as exc:
        if not cylc.flags.debug:
            raise SystemExit(exc)
        raise
def __init__(self, suite, options):
    """Record the suite and options; locate the suite.rc of a local suite.

    When neither options.host nor options.owner is remote, open the
    registration database (options.db, options.verbose) and set
    self.suiterc and self.suitedir from it. A lookup failure exits via
    SystemExit, or is re-raised unchanged when options.debug is set.
    """
    self.options = options
    self.suite = suite
    self.suiterc = None
    self.suitedir = None

    suite_is_remote = (
        is_remote_host(options.host) or is_remote_user(options.owner))
    if suite_is_remote:
        return

    self.db = localdb(file=options.db, verbose=options.verbose)
    try:
        self.suiterc = self.db.get_suiterc(suite)
        self.suitedir = os.path.dirname(self.suiterc)
    except Exception as exc:
        if options.debug:
            raise
        else:
            raise SystemExit(exc)
def _scan1_impl(conn, reg_db_path, timeout, my_uuid):
    """Connect to host:port to get suite identify.

    Worker loop: read (host, port) items from "conn" until MSG_QUIT, and
    answer each with (host, port, result). The result is MSG_TIMEOUT on a
    connection timeout, None on another connection error, or else what
    the client's identify() call returned. If that reply has no "states",
    a second attempt is made with a passphrase loaded from the
    registration database at reg_db_path, when one is available.
    """
    while True:
        if not conn.poll(SLEEP_INTERVAL):
            continue
        request = conn.recv()
        if request == MSG_QUIT:
            break
        host, port = request
        # IP reduces DNS traffic
        address = get_host_ip_by_name(host) if is_remote_host(host) else host
        anon_client = SuiteIdClientAnon(
            None, host=address, port=port, my_uuid=my_uuid,
            timeout=timeout)
        try:
            result = anon_client.identify()
        except ConnectionTimeout:
            conn.send((host, port, MSG_TIMEOUT))
            continue
        except ConnectionError:
            conn.send((host, port, None))
            continue

        owner = result.get('owner')
        name = result.get('name')
        states = result.get('states', None)
        if cylc.flags.debug:
            print >> sys.stderr, ' suite:', name, owner
        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            reg_db = RegistrationDB(reg_db_path)
            pphrase = reg_db.load_passphrase(name, owner, host)
            if pphrase:
                auth_client = SuiteIdClient(
                    name, owner=owner, host=host, port=port,
                    my_uuid=my_uuid, timeout=timeout)
                try:
                    result = auth_client.identify()
                except Exception:
                    # Nope (private suite, wrong passphrase).
                    if cylc.flags.debug:
                        print >> sys.stderr, ' (wrong passphrase)'
                else:
                    reg_db.cache_passphrase(name, owner, host, pphrase)
                    if cylc.flags.debug:
                        print >> sys.stderr, (
                            ' (got states with passphrase)')
        conn.send((host, port, result))
    conn.close()
def _scan_item(timeout, my_uuid, srv_files_mgr, item):
    """Connect to item host:port (item) to get suite identify.

    Returns a (host, port, result) tuple. The result is MSG_TIMEOUT on a
    connection timeout, None on another connection error, or else what
    the client's identify() call returned. If that reply has no "states",
    a second attempt is made with the suite passphrase obtained from
    srv_files_mgr, when one is available.
    """
    host, port = item
    if is_remote_host(host):
        # IP reduces DNS traffic
        address = get_host_ip_by_name(host)
    else:
        address = host
    anon_client = SuiteIdClientAnon(
        None, host=address, port=port, my_uuid=my_uuid, timeout=timeout)
    try:
        result = anon_client.identify()
    except ConnectionTimeout:
        return (host, port, MSG_TIMEOUT)
    except ConnectionError:
        return (host, port, None)

    owner = result.get('owner')
    name = result.get('name')
    states = result.get('states', None)
    if cylc.flags.debug:
        print >> sys.stderr, ' suite:', name, owner
    if states is not None:
        return (host, port, result)

    # This suite keeps its state info private.
    # Try again with the passphrase if I have it.
    try:
        pphrase = srv_files_mgr.get_auth_item(
            srv_files_mgr.FILE_BASE_PASSPHRASE,
            name, owner, host, content=True)
    except SuiteServiceFileError:
        pphrase = None
    if pphrase:
        auth_client = SuiteIdClient(
            name, owner=owner, host=host, port=port,
            my_uuid=my_uuid, timeout=timeout)
        try:
            result = auth_client.identify()
        except ConnectionError:
            # Nope (private suite, wrong passphrase).
            if cylc.flags.debug:
                print >> sys.stderr, (
                    ' (wrong passphrase)')
        else:
            if cylc.flags.debug:
                print >> sys.stderr, (
                    ' (got states with passphrase)')
    return (host, port, result)
def __init__( self, suite, options ):
    """Record the suite and options; resolve a local suite's registration.

    When neither options.host nor options.owner is remote, load the
    registration database and set self.suite (de-aliased name),
    self.suiterc and self.suitedir from it. A lookup failure exits via
    SystemExit, or is re-raised unchanged when options.debug is set.
    """
    self.options = options
    self.suite = suite
    self.suiterc = None
    self.suitedir = None

    if is_remote_host( options.host ) or is_remote_user( options.owner ):
        return

    # dealias the suite name (an aliased name may be given for local suites)
    self.db = localdb(file=options.db, verbose=options.verbose)
    self.db.load_from_file()
    try:
        self.suite = self.db.unalias( suite )
        self.suiterc = self.db.getrc( suite )
        self.suitedir = os.path.dirname( self.suiterc )
    except Exception as exc:
        if not options.debug:
            raise SystemExit(exc)
        raise
def _load_passphrase_via_ssh(self, suite, owner, host):
    """Load passphrase from remote [owner@]host via SSH.

    Runs a script through the configured remote shell that prints a
    marker prefix followed by the content of the "passphrase" file in the
    directory named by the "path=" line of the remote .cylc/REGDB/SUITE.

    Returns:
        The passphrase string, or None if the target is not remote, the
        remote shell cannot be started, the command fails, or no
        passphrase follows the marker.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-PASSPHRASE] %(suite)s ' % {'suite': suite}
    # Extract suite definition directory from remote ~/.cylc/REGDB/SUITE
    # Attempt to cat passphrase file under suite definition directory
    script = (
        r'''echo -n '%(prefix)s'; '''
        r'''sed -n 's/^path=//p' '.cylc/REGDB/%(suite)s' | '''
        r'''xargs -I '{}' cat '{}/passphrase'; '''
        r'''echo'''
    ) % {'prefix': prefix, 'suite': suite}
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner))
    ssh_tmpl = ssh_tmpl.replace(' %s', '')  # back compat
    # Only one of owner and host needs to be remote to get here, so either
    # may be None: build [owner@]host without concatenating None.
    user_at_host = ''
    if owner:
        user_at_host = owner + '@'
    if host:
        user_at_host += host
    else:
        user_at_host += 'localhost'
    command = shlex.split(ssh_tmpl) + ['-n', user_at_host, script]
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            traceback.print_exc()
        return
    out, err = proc.communicate()
    # communicate() has already waited for the process to exit.
    ret_code = proc.returncode
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    passphrase = None
    for line in out.splitlines():
        if line.startswith(prefix):
            # Drop only the leading marker; the rest is the passphrase.
            passphrase = line[len(prefix):].strip()
    if not passphrase or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return passphrase
def scan_all(hosts=None, reg_db_path=None, timeout=None):
    """Scan all hosts.

    Args:
        hosts: host names to scan; when empty, the configured
            "suite host scanning" hosts are used.
        reg_db_path: passed through to scan() for each host.
        timeout: passed through to scan() for each host.

    Returns:
        The result of zip(hosts, results): one (host, scan result) pair
        per item returned by scan() for that host. Hosts whose scan raised
        an exception contribute nothing.
    """
    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    # Ensure that it does "localhost" only once
    hosts = set(hosts or ())
    for host in list(hosts):
        if not is_remote_host(host):
            hosts.remove(host)
            hosts.add("localhost")

    scan_results = []
    scan_results_hosts = []
    # With nothing to scan the pool size would be capped to zero, which
    # is not a valid pool size; skip the pool and return no results.
    if hosts:
        proc_pool_size = GLOBAL_CFG.get(["process pool size"])
        if proc_pool_size is None:
            proc_pool_size = cpu_count()
        if proc_pool_size > len(hosts):
            proc_pool_size = len(hosts)
        proc_pool = Pool(proc_pool_size)
        async_results = {}
        for host in hosts:
            async_results[host] = proc_pool.apply_async(
                scan, [host, reg_db_path, timeout])
        proc_pool.close()
        while async_results:
            sleep(0.05)
            # Iterate over a snapshot, as finished entries are removed.
            for host, async_result in list(async_results.items()):
                if async_result.ready():
                    async_results.pop(host)
                    try:
                        res = async_result.get()
                    except Exception:
                        if cylc.flags.debug:
                            traceback.print_exc()
                    else:
                        scan_results.extend(res)
                        scan_results_hosts.extend([host] * len(res))
        proc_pool.join()
    return zip(scan_results_hosts, scan_results)
def scan_all(hosts=None, reg_db_path=None, pyro_timeout=None):
    """Scan all hosts.

    Args:
        hosts: host names to scan; when empty, the configured
            "suite host scanning" hosts are used.
        reg_db_path: passed through to scan() for each host.
        pyro_timeout: passed through to scan() for each host.

    Returns:
        The result of zip(hosts, results): one (host, scan result) pair
        per item returned by scan() for that host. Hosts whose scan raised
        an exception contribute nothing.
    """
    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    # Ensure that it does "localhost" only once
    hosts = set(hosts or ())
    for host in list(hosts):
        if not is_remote_host(host):
            hosts.remove(host)
            hosts.add("localhost")

    scan_results = []
    scan_results_hosts = []
    if not hosts:
        # Nothing to scan: a pool capped to zero processes is invalid, so
        # do not create one at all.
        return zip(scan_results_hosts, scan_results)

    proc_pool_size = GLOBAL_CFG.get(["process pool size"])
    if proc_pool_size is None:
        proc_pool_size = cpu_count()
    if proc_pool_size > len(hosts):
        proc_pool_size = len(hosts)
    proc_pool = Pool(proc_pool_size)
    async_results = {}
    for host in hosts:
        async_results[host] = proc_pool.apply_async(
            scan, [host, reg_db_path, pyro_timeout])
    proc_pool.close()
    while async_results:
        sleep(0.05)
        # Iterate over a snapshot, as finished entries are removed.
        for host, async_result in list(async_results.items()):
            if not async_result.ready():
                continue
            async_results.pop(host)
            try:
                res = async_result.get()
            except Exception:
                if cylc.flags.debug:
                    traceback.print_exc()
            else:
                scan_results.extend(res)
                scan_results_hosts.extend([host] * len(res))
    proc_pool.join()
    return zip(scan_results_hosts, scan_results)
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.
    """
    if not itasks:
        return

    # Group the tasks by (host, owner).
    groups = {}
    for itask in itasks:
        groups.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)

    for (host, owner), group in sorted(groups.items()):
        cmd = ["cylc", cmd_key]
        if cylc.flags.debug:
            cmd.append("--debug")
        try:
            host_is_remote = is_remote_host(host)
        except IOError:
            # Bad host, run the command any way, command will fail and
            # callback will deal with it
            host_is_remote = True
        if host_is_remote:
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory", host, owner))
        # One job ID per task, ordered by task identity.
        cmd.extend([
            self.task_events_mgr.get_task_job_id(
                member.point, member.tdef.name, member.submit_num)
            for member in sorted(group, key=lambda member: member.identity)
        ])
        self.proc_pool.put_command(
            SuiteProcContext(cmd_key, cmd), callback, [suite, group])
def _is_local_auth_ok(self, reg, owner, host):
    """Return True if it is OK to use local passphrase, ssl.* files.

    Use values in ~/cylc-run/REG/.service/contact to make a judgement.
    Cache results in self.can_use_load_auths.

    For a suite that is neither on a remote host nor owned by a remote
    user the answer is always True. Otherwise it is True only if the
    local contact file can be parsed and its name, owner and host values
    match the arguments (the host may match with or without its domain).
    """
    cache_key = (reg, owner, host)
    if cache_key in self.can_use_load_auths:
        return self.can_use_load_auths[cache_key]

    if not (is_remote_user(owner) or is_remote_host(host)):
        is_ok = True
    else:
        fname = os.path.join(
            self.get_suite_srv_dir(reg), self.FILE_BASE_CONTACT)
        data = {}
        try:
            with open(fname) as handle:
                for line in handle:
                    key, value = (
                        [item.strip() for item in line.split("=", 1)])
                    data[key] = value
        except (IOError, ValueError):
            # No contact file, or a line without "=". The tuple matters:
            # "except IOError, ValueError" would only catch IOError and
            # bind it to the name ValueError.
            is_ok = False
        else:
            # Contact file exists, check values match
            if owner is None:
                owner = USER
            if host is None:
                host = get_suite_host()
            host_value = data.get(self.KEY_HOST, "")
            is_ok = (
                reg == data.get(self.KEY_NAME) and
                owner == data.get(self.KEY_OWNER) and
                (
                    host == host_value or
                    host == host_value.split(".", 1)[0]  # no domain
                )
            )
            # Also cache under the normalised owner/host, so an explicit
            # lookup with those values does not re-read the file.
            self.can_use_load_auths[(reg, owner, host)] = is_ok

    # Cache under the key that was looked up, so the next call with the
    # same arguments (including None owner/host) is a cache hit.
    self.can_use_load_auths[cache_key] = is_ok
    return is_ok
def scan_all(hosts=None, timeout=None):
    """Scan all hosts.

    Every port in the configured range is probed on every host by a set
    of child processes running _scan1_impl, each fed (host, port) items
    through a pipe.

    Args:
        hosts: host names to scan; when empty, the configured
            "suite host scanning" hosts are used.
        timeout: connection timeout, converted with float(); anything
            that cannot be converted selects CONNECT_TIMEOUT.

    Returns:
        A list of (host, port, result) for each successful connection.
        Items still unanswered at the end are reported on STDERR.
    """
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        # None (TypeError) or an unparsable value (ValueError).
        timeout = CONNECT_TIMEOUT
    my_uuid = uuid4()
    # Determine hosts to scan
    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    # Ensure that it does "localhost" only once
    hosts = set(hosts)
    for host in list(hosts):
        if not is_remote_host(host):
            hosts.remove(host)
            hosts.add("localhost")
    # Determine ports to scan
    base_port = GLOBAL_CFG.get(['communication', 'base port'])
    max_ports = GLOBAL_CFG.get(['communication', 'maximum number of ports'])
    # Number of child processes
    max_procs = GLOBAL_CFG.get(["process pool size"])
    if max_procs is None:
        max_procs = cpu_count()
    # To do and wait (submitted, waiting for results) sets
    todo_set = set()
    wait_set = set()
    for host in hosts:
        for port in range(base_port, base_port + max_ports):
            todo_set.add((host, port))
    # Each item is (process, parent end of pipe, time to give up on it).
    proc_items = []
    results = []
    while todo_set or proc_items:
        no_action = True
        # Get results back from child processes where possible
        busy_proc_items = []
        while proc_items:
            proc, my_conn, terminate_time = proc_items.pop()
            if my_conn.poll():
                host, port, result = my_conn.recv()
                if result is None:
                    # Can't connect, ignore
                    wait_set.remove((host, port))
                elif result == MSG_TIMEOUT:
                    # Connection timeout, leave in "wait_set"
                    pass
                else:
                    # Connection success
                    results.append((host, port, result))
                    wait_set.remove((host, port))
                if todo_set:
                    # Immediately give the child process something to do
                    host, port = todo_set.pop()
                    wait_set.add((host, port))
                    my_conn.send((host, port))
                    busy_proc_items.append(
                        (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                else:
                    # Or quit if there is nothing left to do
                    my_conn.send(MSG_QUIT)
                    my_conn.close()
                    proc.join()
                no_action = False
            elif time() > terminate_time:
                # Terminate child process if it is taking too long
                proc.terminate()
                proc.join()
                no_action = False
            else:
                busy_proc_items.append((proc, my_conn, terminate_time))
        proc_items += busy_proc_items
        # Create some child processes where necessary
        while len(proc_items) < max_procs and todo_set:
            my_conn, conn = Pipe()
            try:
                proc = Process(
                    target=_scan1_impl, args=(conn, timeout, my_uuid))
                # Starting the process is the step that can fail with
                # OSError, so it must be inside the try block.
                proc.start()
            except OSError:
                # Die if unable to start any worker process.
                # OK to wait and see if any worker process already running.
                if not proc_items:
                    raise
                if cylc.flags.debug:
                    traceback.print_exc()
                my_conn.close()
                conn.close()
                # Stop creating workers for now; go back to collecting
                # results from the ones already running.
                break
            host, port = todo_set.pop()
            wait_set.add((host, port))
            my_conn.send((host, port))
            proc_items.append((proc, my_conn, time() + INACTIVITY_TIMEOUT))
            no_action = False
        if no_action:
            sleep(SLEEP_INTERVAL)
    # Report host:port with no results
    if wait_set:
        print >> sys.stderr, (
            'WARNING, scan timed out, no result for the following:')
        for key in sorted(wait_set):
            print >> sys.stderr, ' %s:%s' % key
    return results
def load_item(self, suite, owner, host, item="certificate",
              create_ok=False, cache_ok=False):
    """Load or create a passphrase, SSL certificate or a private key.

    SSL files are searched from these locations in order:

    1/ For running task jobs:
       a/ $CYLC_SUITE_RUN_DIR then $CYLC_SUITE_DEF_PATH for remote jobs.
       b/ $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST for local jobs or remote jobs
          with SSH messaging.

    2/ (Passphrases only) From memory cache, for remote suite passphrases.
       Don't use if cache_ok=False.

    3/ For suite on local user@host. The suite definition directory, as
       registered. (Note: Previously, this needs to be the 1st location,
       else sub-suites load their parent suite's passphrases, etc, on
       start-up because the "cylc run" command runs in a parent suite task
       execution environment. This problem no longer exists because on
       suite start up, the "load_item_from_dir" method is called directly
       instead of through this method.)

    4/ Location under $HOME/.cylc/ for remote suite control from accounts
       that do not actually need the suite definition directory to be
       installed:
       $HOME/.cylc/passphrases/SUITE_OWNER@SUITE_HOST/SUITE_NAME/

    5/ (SSL files only) If create_ok is specified, create the SSL file and
       then return it.

    6/ For remote suites, try locating the file from the suite definition
       directory on remote owner@host via SSH.

    The first location that yields the item ends the search. If step 6
    raises an exception, the traceback is printed and PassphraseError is
    raised.
    """
    item_is_passphrase = False
    # Map the symbolic item name on to the file base name to look for.
    if item == "certificate":
        item = self.SSL_CERTIFICATE_FILE_BASE
    elif item == "private_key":
        item = self.SSL_PRIVATE_KEY_FILE_BASE
    elif item == "passphrase":
        item_is_passphrase = True
        # Reset here; only set to True if step 6 fetches a passphrase.
        self.can_disk_cache_passphrases[(suite, owner, host)] = False

    suite_host = os.getenv('CYLC_SUITE_HOST')
    suite_owner = os.getenv('CYLC_SUITE_OWNER')
    if suite == os.getenv('CYLC_SUITE_NAME'):
        env_keys = []
        if is_remote_host(suite_host) or is_remote_user(suite_owner):
            # 1(a)/ Task messaging call on a remote account.
            # First look in the remote suite run directory then suite
            # definition directory ($CYLC_SUITE_DEF_PATH is modified
            # for remote tasks):
            env_keys = ['CYLC_SUITE_RUN_DIR', 'CYLC_SUITE_DEF_PATH']
        elif suite_host or suite_owner:
            # 1(b)/ Task messaging call on the suite host account.
            # Could be a local task or a remote task with 'ssh
            # messaging = True'. In either case use
            # $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST which never
            # changes, not $CYLC_SUITE_DEF_PATH which gets
            # modified for remote tasks as described above.
            env_keys = ['CYLC_SUITE_DEF_PATH_ON_SUITE_HOST']
        for key in env_keys:
            try:
                return self.load_item_from_dir(os.environ[key], item)
            except (KeyError, IOError, PassphraseError):
                # Variable not set or item not there: try the next one.
                pass

    # 2/ From memory cache
    if cache_ok and item_is_passphrase:
        # The cache key uses USER and get_hostname() in place of None.
        pass_owner = owner
        pass_host = host
        if pass_owner is None:
            pass_owner = USER
        if pass_host is None:
            pass_host = get_hostname()
        try:
            return self.cached_passphrases[(suite, pass_owner, pass_host)]
        except KeyError:
            pass

    # 3/ Cylc commands with suite definition directory from local reg.
    # NOTE: "and" binds tighter than "or", so this is
    # cache_ok or (not remote user and not remote host).
    if cache_ok or not is_remote_user(owner) and not is_remote_host(host):
        try:
            return self.load_item_from_dir(self.get_suitedir(suite), item)
        except (IOError, PassphraseError, RegistrationError):
            pass

    # 4/ Other allowed locations, as documented above.
    prefix = os.path.expanduser(os.path.join('~', '.cylc'))
    if host is None:
        host = suite_host
    if (owner is not None and host is not None and
            (not item_is_passphrase or cache_ok)):
        prefix = os.path.expanduser(os.path.join('~', '.cylc'))
        paths = []
        path_types = [(prefix, self.PASSPHRASES_DIR_BASE,
                       owner + "@" + host, suite)]
        # Also try the host name without its domain, if it has one.
        short_host = host.split('.', 1)[0]
        if short_host != host:
            path_types.append((prefix, self.PASSPHRASES_DIR_BASE,
                               owner + "@" + short_host, suite))
        for names in path_types:
            try:
                return self.load_item_from_dir(os.path.join(*names), item)
            except (IOError, PassphraseError):
                pass

    if create_ok and not item_is_passphrase:
        # 5/ Create the SSL file if it doesn't exist.
        return self._dump_certificate_and_key_to_dir(
            self.get_suitedir(suite), suite)

    # SSL files fetched via SSH are written under this directory;
    # passphrases are passed dest_dir=None instead.
    # NOTE(review): owner and host are concatenated unguarded here, so this
    # presumably fails with TypeError if either is still None - confirm.
    load_dest_root = None
    if not item_is_passphrase:
        load_dest_root = os.path.join(
            prefix, self.PASSPHRASES_DIR_BASE, owner + "@" + host, suite)
    try:
        # 6/ Try ssh-ing to grab the files directly.
        content = self._load_item_via_ssh(
            item, suite, owner, host, dest_dir=load_dest_root)
        if content and item_is_passphrase:
            self.can_disk_cache_passphrases[(suite, owner, host)] = True
        return content
    except Exception as exc:
        import traceback
        traceback.print_exc()
    raise PassphraseError("Couldn't get %s" % item)
# 1) host selection command: $(command) or `command` match = REC_COMMAND.match(host) if match: # extract the command and execute it hs_command = match.groups()[1] timeout = GLOBAL_CFG.get(["task host select command timeout"]) is_ok, outlines = run_get_stdout(hs_command, timeout) if is_ok: # host selection command succeeded host = outlines[0] else: # host selection command failed raise HostSelectError(host, "\n".join(outlines)) # 2) environment variable: ${VAR} or $VAR # (any quotes are stripped by file parsing) match = REC_ENVIRON.match(host) if match: name = match.groups()[0] try: host = os.environ[name] except KeyError, exc: raise HostSelectError(host, "Variable not defined: " + str(exc)) try: if is_remote_host(host): return host else: return "localhost" except: return host
def scan_all(hosts=None, reg_db_path=None, timeout=None):
    """Scan all hosts.

    Every port in the configured range is probed on every host by a set
    of child processes running _scan1_impl, each fed (host, port) items
    through a pipe.

    Args:
        hosts: host names to scan; when empty, the configured
            "suite host scanning" hosts are used.
        reg_db_path: passed through to each _scan1_impl worker.
        timeout: connection timeout, converted with float(); anything
            that cannot be converted selects CONNECT_TIMEOUT.

    Returns:
        A list of (host, port, result) for each successful connection.
        Items still unanswered at the end are reported on STDERR.
    """
    try:
        timeout = float(timeout)
    except (TypeError, ValueError):
        # None (TypeError) or an unparsable value (ValueError). A bare
        # "except" here would also swallow KeyboardInterrupt.
        timeout = CONNECT_TIMEOUT
    my_uuid = uuid4()
    # Determine hosts to scan
    if not hosts:
        hosts = GLOBAL_CFG.get(["suite host scanning", "hosts"])
    # Ensure that it does "localhost" only once
    hosts = set(hosts)
    for host in list(hosts):
        if not is_remote_host(host):
            hosts.remove(host)
            hosts.add("localhost")
    # Determine ports to scan
    base_port = GLOBAL_CFG.get(['communication', 'base port'])
    max_ports = GLOBAL_CFG.get(['communication', 'maximum number of ports'])
    # Number of child processes
    max_procs = GLOBAL_CFG.get(["process pool size"])
    if max_procs is None:
        max_procs = cpu_count()
    # To do and wait (submitted, waiting for results) sets
    todo_set = set()
    wait_set = set()
    for host in hosts:
        for port in range(base_port, base_port + max_ports):
            todo_set.add((host, port))
    # Each item is (process, parent end of pipe, time to give up on it).
    proc_items = []
    results = []
    while todo_set or proc_items:
        no_action = True
        # Get results back from child processes where possible
        busy_proc_items = []
        while proc_items:
            proc, my_conn, terminate_time = proc_items.pop()
            if my_conn.poll():
                host, port, result = my_conn.recv()
                if result is None:
                    # Can't connect, ignore
                    wait_set.remove((host, port))
                elif result == MSG_TIMEOUT:
                    # Connection timeout, leave in "wait_set"
                    pass
                else:
                    # Connection success
                    results.append((host, port, result))
                    wait_set.remove((host, port))
                if todo_set:
                    # Immediately give the child process something to do
                    host, port = todo_set.pop()
                    wait_set.add((host, port))
                    my_conn.send((host, port))
                    busy_proc_items.append(
                        (proc, my_conn, time() + INACTIVITY_TIMEOUT))
                else:
                    # Or quit if there is nothing left to do
                    my_conn.send(MSG_QUIT)
                    my_conn.close()
                    proc.join()
                no_action = False
            elif time() > terminate_time:
                # Terminate child process if it is taking too long
                proc.terminate()
                proc.join()
                no_action = False
            else:
                busy_proc_items.append((proc, my_conn, terminate_time))
        proc_items += busy_proc_items
        # Create some child processes where necessary
        while len(proc_items) < max_procs and todo_set:
            my_conn, conn = Pipe()
            proc = Process(target=_scan1_impl, args=(
                conn, reg_db_path, timeout, my_uuid))
            proc.start()
            host, port = todo_set.pop()
            wait_set.add((host, port))
            my_conn.send((host, port))
            proc_items.append((proc, my_conn, time() + INACTIVITY_TIMEOUT))
            no_action = False
        if no_action:
            sleep(SLEEP_INTERVAL)
    # Report host:port with no results
    if wait_set:
        print >> sys.stderr, (
            'WARNING, scan timed out, no result for the following:')
        for key in sorted(wait_set):
            print >> sys.stderr, ' %s:%s' % key
    return results
def _load_item_via_ssh(self, item, suite, owner, host, dest_dir=None):
    """Load item (e.g. passphrase) from remote [owner@]host via SSH.

    A small shell script is run on the remote account. It echoes a
    recognisable prefix, reads the "path=" entry from
    ".cylc/REGDB/SUITE" and cats ITEM from that directory.

    Args:
        item: base name of the file to fetch from the remote suite
            definition directory.
        suite: suite name.
        owner: remote user name, or None to let SSH choose the user.
        host: remote host name, or None for "localhost".
        dest_dir: if not None, the content is written to dest_dir/item
            (directory mode 0700, file mode 0600).

    Returns:
        None if neither host nor owner is remote, if the SSH command
        cannot be started, if it exits non-zero, or if it yields no
        content. Otherwise the path of the written file if dest_dir is
        given, else the content itself.
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-PASSPHRASE] %(suite)s ' % {'suite': suite}
    # Extract suite definition directory from remote ~/.cylc/REGDB/SUITE
    # Attempt to cat passphrase file under suite definition directory
    # NOTE(review): suite and item are interpolated into the remote shell
    # script without escaping, so a single quote in either breaks quoting.
    script = (
        r'''echo -n '%(prefix)s'; '''
        r'''sed -n 's/^path=//p' '.cylc/REGDB/%(suite)s' | '''
        r'''xargs -I '{}' cat '{}/%(item)s'; '''
        r'''echo'''
    ) % {'prefix': prefix, 'suite': suite, 'item': item}
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    ssh_tmpl = str(GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner))
    ssh_tmpl = ssh_tmpl.replace(' %s', '')  # back compat
    # Only one of owner and host needs to be set to get here, so neither
    # can be concatenated blindly.
    user_at_host = host or 'localhost'
    if owner:
        user_at_host = owner + '@' + user_at_host
    import shlex
    command = shlex.split(ssh_tmpl) + ['-n', user_at_host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.returncode

    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    if item == self.PASSPHRASE_FILE_BASE:
        # Single-line item: the last line carrying the prefix wins.
        content = None
        for line in out.splitlines():
            if line.startswith(prefix):
                content = line[len(prefix):].strip()
    else:
        # Multi-line item: everything from the first prefixed line on.
        content_lines = []
        content_has_started = False
        for line in out.splitlines():
            if line.startswith(prefix):
                line = line[len(prefix):]
                content_has_started = True
            if content_has_started:
                content_lines.append(line)
        content = "\n".join(content_lines)

    if not content or ret_code:
        if cylc.flags.debug:
            sys.stderr.write((
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            } + '\n')
        return

    if dest_dir is not None:
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        os.chmod(dest_dir, 0o700)
        dest_item = os.path.join(dest_dir, item)
        # "with" closes the handle even if the write fails
        with open(dest_item, "w") as file_handle:
            file_handle.write(content)
        os.chmod(dest_item, 0o600)
        return dest_item
    return content
def __init__(self, task_id, suite, jobconfig, submit_num):
    """Work out job script and log file locations for a task job.

    Args:
        task_id: task identifier; combined with submit_num to name the
            job script.
        suite: suite name, used to look up the job log directory.
        jobconfig: job configuration. 'log files', 'command template',
            'task host' and 'task owner' are read from it, and the
            'directive ...' defaults are written to it.
        submit_num: submit number, appended to task_id with
            TaskID.DELIM (so it must concatenate with a string).

    The job is treated as remote if is_remote_host(task host) or
    is_remote_user(task owner) is true. Remote stdout/stderr logs are
    recorded as "[owner@]host:path" locations, local ones as plain
    paths.
    """
    self.jobconfig = jobconfig
    self.task_id = task_id
    self.suite = suite
    self.logfiles = jobconfig.get('log files')
    self.job_submit_command_template = jobconfig.get('command template')

    # Local job script path: append submit number.
    # (used by both local and remote tasks)
    tag = task_id + TaskID.DELIM + submit_num
    gcfg = get_global_cfg()
    self.local_jobfile_path = os.path.join(
        gcfg.get_derived_host_item(self.suite, 'suite job log directory'),
        tag)
    # The directory is created in config.py
    self.logfiles.add_path(self.local_jobfile_path)

    task_host = jobconfig.get('task host')
    task_owner = jobconfig.get('task owner')
    self.remote_shell_template = gcfg.get_host_item(
        'remote shell template', task_host, task_owner)

    if is_remote_host(task_host) or is_remote_user(task_owner):
        # REMOTE TASK OR USER ACCOUNT SPECIFIED FOR TASK - submit using ssh
        self.local = False
        self.task_owner = task_owner or None
        self.task_host = task_host or socket.gethostname()

        self.remote_jobfile_path = os.path.join(
            gcfg.get_derived_host_item(
                self.suite, 'suite job log directory',
                self.task_host, self.task_owner),
            tag)

        # Remote log files
        self.stdout_file = self.remote_jobfile_path + ".out"
        self.stderr_file = self.remote_jobfile_path + ".err"

        # Used in command construction:
        self.jobfile_path = self.remote_jobfile_path

        # Record paths of remote log files for access by gui, by ssh URL.
        # (Plain local filesystem access is not used here: it would need
        # special namespace config to say the hosts share a filesystem,
        # and log directories given with e.g. '$HOME' would not work
        # because the gui reads logs under the suite owner account.)
        url_prefix = self.task_host
        if self.task_owner:
            url_prefix = self.task_owner + "@" + url_prefix
        self.logfiles.add_path(url_prefix + ':' + self.stdout_file)
        self.logfiles.add_path(url_prefix + ':' + self.stderr_file)
    else:
        # LOCAL TASKS
        self.local = True
        self.task_owner = None
        # Used in command construction:
        self.jobfile_path = self.local_jobfile_path

        # Local stdout and stderr log file paths:
        self.stdout_file = self.local_jobfile_path + ".out"
        self.stderr_file = self.local_jobfile_path + ".err"

        # interpolate environment variables in extra logs
        for idx, path in enumerate(self.logfiles.paths):
            self.logfiles.paths[idx] = expandvars(path)

        # Record paths of local log files for access by gui
        self.logfiles.add_path(self.stdout_file)
        self.logfiles.add_path(self.stderr_file)

    # set some defaults that can be overridden by derived classes
    self.jobconfig['directive prefix'] = None
    self.jobconfig['directive final'] = "# FINAL DIRECTIVE"
    self.jobconfig['directive connector'] = " "

    # overrideable methods
    self.set_directives()
    self.set_scripting()
    self.set_environment()
def __init__(self, task_id, jobconfig, xconfig, submit_num):
    """Work out job script and log file locations for a task job.

    Args:
        task_id: task identifier; combined with submit_num to name the
            job script.
        jobconfig: job configuration, stored as self.jobconfig.
        xconfig: mapping from which "extra log files",
            "job submission command template", "remote shell template",
            "log path", "remote log path", "host" and "owner" are read.
        submit_num: submit number, appended to task_id with
            TaskID.DELIM (so it must concatenate with a string).

    The job is treated as remote if is_remote_host(host) or
    is_remote_user(owner) is true. Remote stdout/stderr logs are
    recorded as "owner@host:path" locations, local ones as plain paths.
    """
    self.jobconfig = jobconfig
    self.task_id = task_id
    self.logfiles = xconfig["extra log files"]
    self.job_submit_command_template = (
        xconfig["job submission command template"])
    self.remote_shell_template = xconfig["remote shell template"]

    # Local job script path: append submit number.
    # (used by both local and remote tasks)
    tag = task_id + TaskID.DELIM + submit_num
    self.local_jobfile_path = os.path.join(xconfig["log path"], tag)
    # The directory is created in config.py
    self.logfiles.add_path(self.local_jobfile_path)

    self.suite_owner = user
    remote_host = xconfig["host"]
    task_owner = xconfig["owner"]

    if is_remote_host(remote_host) or is_remote_user(task_owner):
        # REMOTE TASK OR USER ACCOUNT SPECIFIED FOR TASK - submit using ssh
        self.local = False
        # Fall back to the suite owner / this host for whichever of the
        # two was not given.
        self.task_owner = task_owner or self.suite_owner
        self.remote_host = remote_host or socket.gethostname()

        self.remote_jobfile_path = os.path.join(
            xconfig["remote log path"], tag)

        # Remote log files
        self.stdout_file = self.remote_jobfile_path + ".out"
        self.stderr_file = self.remote_jobfile_path + ".err"

        # Used in command construction:
        self.jobfile_path = self.remote_jobfile_path

        # Record paths of remote log files for access by gui, by ssh URL.
        # (Plain local filesystem access is not used here: it would need
        # special namespace config to say the hosts share a filesystem,
        # and log directories given with e.g. '$HOME' would not work
        # because the gui reads logs under the suite owner account.)
        url_prefix = self.task_owner + "@" + self.remote_host
        self.logfiles.add_path(url_prefix + ":" + self.stdout_file)
        self.logfiles.add_path(url_prefix + ":" + self.stderr_file)
    else:
        # LOCAL TASKS
        self.local = True
        self.task_owner = self.suite_owner
        # Used in command construction:
        self.jobfile_path = self.local_jobfile_path

        # Local stdout and stderr log file paths:
        self.stdout_file = self.local_jobfile_path + ".out"
        self.stderr_file = self.local_jobfile_path + ".err"

        # Record paths of local log files for access by gui
        self.logfiles.add_path(self.stdout_file)
        self.logfiles.add_path(self.stderr_file)

    # Overrideable methods
    self.set_directives()
    self.set_scripting()
    self.set_environment()
def scan(host=None, db=None, timeout=None):
    """Probe the configured port range on one host for running suites.

    Args:
        host: host to scan; defaults to get_hostname().
        db: passed to RegistrationDB(), which is used to look up and
            cache passphrases.
        timeout: connection timeout; converted with float() if truthy,
            otherwise no timeout is passed on.

    Returns:
        A list of (port, identity) tuples, where identity is whatever
        the client's identify() call returned for that port.

    Ports that raise ConnectionError are skipped; any other exception
    from the anonymous identify() call is re-raised. If a suite does
    not reveal its states anonymously, the call is retried with the
    passphrase when one can be loaded.
    """
    if host is None:
        host = get_hostname()
    first_port = GLOBAL_CFG.get(
        ['communication', 'base port'])
    end_port = first_port + GLOBAL_CFG.get(
        ['communication', 'maximum number of ports'])
    timeout = float(timeout) if timeout else None

    reg_db = RegistrationDB(db)
    found = []
    my_uuid = uuid4()
    # IP reduces DNS traffic.
    anon_host = get_host_ip_by_name(host) if is_remote_host(host) else host

    for port in range(first_port, end_port):
        anon_client = SuiteIdClientAnon(
            None, host=anon_host, port=port, my_uuid=my_uuid,
            timeout=timeout)
        try:
            entry = (port, anon_client.identify())
        except ConnectionError:
            # Nothing (usable) listening on this port.
            if cylc.flags.debug:
                traceback.print_exc()
            continue
        except Exception:
            if cylc.flags.debug:
                traceback.print_exc()
            raise

        identity = entry[1]
        owner = identity.get('owner')
        name = identity.get('name')
        states = identity.get('states', None)
        if cylc.flags.debug:
            print('%s %s %s' % (' suite:', name, owner))

        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            pphrase = reg_db.load_passphrase(name, owner, host)
            if pphrase:
                auth_client = SuiteIdClient(
                    name, owner=owner, host=host, port=port,
                    my_uuid=my_uuid, timeout=timeout)
                try:
                    entry = (port, auth_client.identify())
                except Exception:
                    # Nope (private suite, wrong passphrase).
                    if cylc.flags.debug:
                        print(' (wrong passphrase)')
                else:
                    reg_db.cache_passphrase(name, owner, host, pphrase)
                    if cylc.flags.debug:
                        print(' (got states with passphrase)')
        found.append(entry)
    return found
def detect_old_contact_file(self, reg):
    """Detect old suite contact file.

    Load the contact file for suite "reg" and run "ps" (via the remote
    shell template if the recorded host is remote) to look for the
    recorded process.

    Returns None if the contact file cannot be loaded, or if "ps" ran
    but listed no matching process and the stale contact file could be
    removed.

    Raise SuiteServiceFileError if old contact file exists, and there is
    evidence that the old suite is still running (or no evidence that it
    is not, e.g. the "ps" command produced no usable output).
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["ps", "-opid,args", str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_tmpl = str(GLOBAL_CFG.get_host_item(
            "remote shell template", old_host))
        cmd = shlex.split(ssh_tmpl) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    # The process has exited by now; communicate() just collects output.
    for line in reversed(proc.communicate()[0].splitlines()):
        fields = line.split(None, 1)
        if not fields:
            # Blank line: cannot match anything, and has no first field.
            continue
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    sys.stderr.write(
        (
            r"""ERROR, suite contact file exists: %(fname)s

If %(suite)s is not running, delete the suite contact file and try again.
If it is running but unresponsive, kill any left over suite processes too.

To see if %(suite)s is running on '%(host)s:%(port)s':
 * cylc scan -n '\b%(suite)s\b' '%(host)s'
 * cylc ping -v --host='%(host)s' '%(suite)s'
 * ssh -n '%(host)s' 'ps -o pid,args %(pid)s'

"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
    raise SuiteServiceFileError(
        "ERROR, suite contact file exists: %s" % fname)
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified. The port file is read locally, or with "cat" over the
    remote shell if the suite host or owner is remote. Its first line
    is the port; an optional second line is the host.

    Sets self.uri, and fills in self.port and self.host where they were
    None.

    Raises PortFileError if the port file cannot be read or its first
    line is not an integer.
    """
    uri_data = {
        "host": self.host,
        "port": self.port,
        "suite": self.suite,
        "owner": self.owner,
        "target": self.target_server_object}
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
    if self.host is None or self.port is None:
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Let the remote shell expand $HOME for the remote account
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out = proc.communicate()[0]
            if proc.returncode:
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # "with" closes the handle instead of leaving it to the GC
                with open(port_file_path) as handle:
                    out = handle.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if uri_data["port"] is None:
                uri_data["port"] = int(lines[0])
                self.port = uri_data["port"]
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if uri_data["host"] is None:
            if len(lines) >= 2:
                uri_data["host"] = lines[1].strip()
            else:
                uri_data["host"] = "localhost"
            self.host = uri_data["host"]
    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' %
        uri_data)
def get_passphrase_file(self, pfile=None, suitedir=None):
    """Locate the suite passphrase file and return its path.

    Passphrase location, order of preference:

    1/ The pfile argument (a file, or a directory containing a file
    called "passphrase") - used for passphrase creation by
    "cylc register". If given, the file must exist.

    2/ The suite definition directory, because suites may be
    automatically installed (e.g. by Rose) to remote task hosts, and
    remote tasks know this location from their execution environment.
    Local user command invocations can use the suite registration
    database to find the suite definition directory.  HOWEVER, remote
    user command invocations cannot do this even if the local and
    remote hosts share a common filesystem, because we cannot be sure
    if finding the expected suite registration implies a common
    filesystem or a different remote suite that happens to be
    registered under the same name. User accounts used for remote
    control must therefore install the passphrase in the secondary
    standard locations (below) or use the command line option to
    explicitly reveal the location.

    3/ For running tasks (CYLC_SUITE_HOST and CYLC_SUITE_OWNER are both
    in the environment): $CYLC_SUITE_DEF_PATH on a remote account, or
    $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST on the suite host account (local
    tasks, or remote tasks with 'ssh messaging = True').

    4/ Secondary locations, each also tried with the host name cut at
    its first ".":
        (i) $HOME/.cylc/SUITE_HOST/SUITE_OWNER/SUITE_NAME/passphrase
       (ii) $HOME/.cylc/SUITE_HOST/SUITE_NAME/passphrase
      (iii) $HOME/.cylc/SUITE_NAME/passphrase
    These are more sensible locations for remote suite control from
    accounts that do not actually need the suite definition directory
    to be installed.

    Raises PassphraseError if pfile is given but does not exist, or if
    no passphrase file is found at all.
    """
    # 1/ Explicit location given on the command line.
    if pfile:
        if os.path.isdir(pfile):
            pfile = os.path.join(pfile, 'passphrase')
        if os.path.isfile(pfile):
            self.set_location(pfile)
        else:
            # If an explicit location is given, the file must exist.
            raise PassphraseError(
                'ERROR, file not found on %s@%s: %s' % (
                    user, get_hostname(), pfile))

    # 2/ Cylc commands with suite definition directory from local reg.
    if not self.location and suitedir:
        pfile = os.path.join(suitedir, 'passphrase')
        if os.path.isfile(pfile):
            self.set_location(pfile)

    # (2 before 3 else sub-suites load their parent suite's
    # passphrase on start-up because the "cylc run" command runs in
    # a parent suite task execution environment).

    # 3/ Running tasks: suite def dir from the task execution environment.
    if not self.location:
        suite_host = os.environ.get('CYLC_SUITE_HOST')
        suite_owner = os.environ.get('CYLC_SUITE_OWNER')
        # Both variables present means we were called by a task.
        if suite_host is not None and suite_owner is not None:
            if is_remote_host(suite_host) or is_remote_user(suite_owner):
                # 3(i)/ Task messaging call on a remote account.
                # Look in the remote suite definition directory
                # ($CYLC_SUITE_DEF_PATH is modified for remote tasks).
                env_key = 'CYLC_SUITE_DEF_PATH'
            else:
                # 3(ii)/ Task messaging call on the suite host account.
                # Could be a local task or a remote task with 'ssh
                # messaging = True'. In either case use
                # $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST which never
                # changes, not $CYLC_SUITE_DEF_PATH which gets
                # modified for remote tasks as described above.
                env_key = 'CYLC_SUITE_DEF_PATH_ON_SUITE_HOST'
            def_path = os.environ.get(env_key)
            if def_path is not None:
                pfile = os.path.join(def_path, 'passphrase')
                if os.path.isfile(pfile):
                    self.set_location(pfile)

    # 4/ Other allowed locations, as documented above.
    if not self.location:
        # For remote control commands, self.host here will be fully
        # qualified or not depending on what's given on the command line.
        host_names = [self.host]
        short_host = self.host.split('.', 1)[0]
        if short_host != self.host:
            host_names.append(short_host)
        prefix = os.path.join(os.environ['HOME'], '.cylc')
        locations = [
            os.path.join(
                prefix, host_name, self.owner, self.suite, 'passphrase')
            for host_name in host_names]
        locations += [
            os.path.join(prefix, host_name, self.suite, 'passphrase')
            for host_name in host_names]
        locations.append(os.path.join(prefix, self.suite, 'passphrase'))
        for pfile in locations:
            if os.path.isfile(pfile):
                self.set_location(pfile)
                break

    if not self.location:
        raise PassphraseError(
            'ERROR: passphrase for suite %s not found on %s@%s' % (
                self.suite, user, get_hostname()))
    return self.location
def get_auth_item(self, item, reg, owner=None, host=None, content=False):
    """Locate/load passphrase, SSL private key, SSL certificate, etc.

    Return file name, or content of file if content=True is set.
    Files are searched from these locations in order:

    1/ For running task jobs, service directory under:
       a/ $CYLC_SUITE_RUN_DIR for remote jobs.
       b/ $CYLC_SUITE_RUN_DIR_ON_SUITE_HOST for local jobs or remote jobs
          with SSH messaging.
       (Skipped if the variable is not set.)

    2/ (Passphrases only) From memory cache, for remote suite passphrases.
       Don't use if content=False.

    3/ For suite on local user@host. The suite service directory.

    4/ Location under $HOME/.cylc/ for remote suite control from accounts
       that do not actually need the suite definition directory to be
       installed:
       $HOME/.cylc/auth/SUITE_OWNER@SUITE_HOST/SUITE_NAME/

    5/ For remote suites, try locating the file from the suite service
       directory on remote owner@host via SSH. If content=False, the value
       of the located file will be dumped under:
       $HOME/.cylc/auth/SUITE_OWNER@SUITE_HOST/SUITE_NAME/

    Raises:
        ValueError: if item is not one of the recognised file names.
        SuiteServiceFileError: if the item cannot be found anywhere.
    """
    if item not in [
            self.FILE_BASE_SSL_CERT, self.FILE_BASE_SSL_PEM,
            self.FILE_BASE_PASSPHRASE, self.FILE_BASE_CONTACT]:
        raise ValueError("%s: item not recognised" % item)
    if item == self.FILE_BASE_PASSPHRASE:
        self.can_disk_cache_passphrases[(reg, owner, host)] = False

    def fetch(path):
        """Return content of item under path, or its file name."""
        if content:
            return self._load_local_item(item, path)
        return self._locate_item(item, path)

    # 1/ Running task jobs
    suite_host = os.getenv('CYLC_SUITE_HOST')
    suite_owner = os.getenv('CYLC_SUITE_OWNER')
    if reg == os.getenv('CYLC_SUITE_NAME'):
        env_keys = []
        if is_remote_host(suite_host) or is_remote_user(suite_owner):
            # 1(a)/ Task messaging call on a remote account.
            # Look in the remote suite run directory:
            env_keys = ['CYLC_SUITE_RUN_DIR']
        elif suite_host or suite_owner:
            # 1(b)/ Task messaging call on the suite host account.
            # Could be a local task or a remote task with 'ssh
            # messaging = True'. In either case use
            # $CYLC_SUITE_RUN_DIR_ON_SUITE_HOST which never changes.
            env_keys = ['CYLC_SUITE_RUN_DIR_ON_SUITE_HOST']
        for key in env_keys:
            run_dir = os.getenv(key)
            if not run_dir:
                # Variable not set: try the other locations instead
                continue
            value = fetch(os.path.join(run_dir, self.DIR_BASE_SRV))
            if value:
                return value

    # 2/ From memory cache. The cache holds content, not file names, so
    # it can only answer content=True requests.
    if content and item in self.cache:
        my_owner = owner
        my_host = host
        if my_owner is None:
            my_owner = USER
        if my_host is None:
            my_host = get_hostname()
        try:
            return self.cache[item][(reg, my_owner, my_host)]
        except KeyError:
            pass

    # 3/ Local suite service directory
    if self._is_local_auth_ok(reg, owner, host):
        value = fetch(self.get_suite_srv_dir(reg))
        if value:
            return value

    # 4/ Disk cache for remote suites
    if host is None:
        host = suite_host
    if owner is not None and host is not None:
        paths = [self._get_cache_dir(reg, owner, host)]
        short_host = host.split('.', 1)[0]
        if short_host != host:
            paths.append(self._get_cache_dir(reg, owner, short_host))
        for path in paths:
            value = fetch(path)
            if value:
                return value

    # 5/ Use SSH to load content from remote owner@host
    value = self._load_remote_item(item, reg, owner, host)
    if value:
        if item == self.FILE_BASE_PASSPHRASE:
            self.can_disk_cache_passphrases[(reg, owner, host)] = True
        if not content:
            path = self._get_cache_dir(reg, owner, host)
            self._dump_item(path, item, value)
            value = os.path.join(path, item)
        return value

    raise SuiteServiceFileError("Couldn't get %s" % item)
def _scan1_impl(conn, timeout, my_uuid):
    """Serve host:port identify requests arriving on a pipe.

    Loop until MSG_QUIT is received on conn. Every other message is a
    (host, port) pair; the reply sent back on conn is one of:
        (host, port, MSG_TIMEOUT) - the connection timed out;
        (host, port, None) - cannot connect, or suite still initialising;
        (host, port, result) - what the client's identify() returned.

    If the anonymous reply has no 'states', the request is retried with
    the suite passphrase when one is available; the anonymous result is
    kept if the retry fails.
    """
    srv_files_mgr = SuiteSrvFilesManager()
    while True:
        if not conn.poll(SLEEP_INTERVAL):
            continue
        request = conn.recv()
        if request == MSG_QUIT:
            break
        host, port = request

        # IP reduces DNS traffic
        anon_host = get_host_ip_by_name(host) if is_remote_host(host) else host
        anon_client = SuiteIdClientAnon(
            None, host=anon_host, port=port, my_uuid=my_uuid,
            timeout=timeout)
        try:
            result = anon_client.identify()
        except ConnectionTimeout:
            conn.send((host, port, MSG_TIMEOUT))
            continue
        except (ConnectionError, SuiteStillInitialisingError):
            conn.send((host, port, None))
            continue

        owner = result.get('owner')
        name = result.get('name')
        states = result.get('states', None)
        if cylc.flags.debug:
            sys.stderr.write('%s %s %s\n' % (' suite:', name, owner))

        if states is None:
            # This suite keeps its state info private.
            # Try again with the passphrase if I have it.
            try:
                pphrase = srv_files_mgr.get_auth_item(
                    srv_files_mgr.FILE_BASE_PASSPHRASE,
                    name, owner, host, content=True)
            except SuiteServiceFileError:
                pphrase = None
            if pphrase:
                auth_client = SuiteIdClient(
                    name, owner=owner, host=host, port=port,
                    my_uuid=my_uuid, timeout=timeout)
                try:
                    result = auth_client.identify()
                except SuiteStillInitialisingError:
                    if cylc.flags.debug:
                        sys.stderr.write(
                            ' (connected with passphrase,' +
                            ' suite initialising)' + '\n')
                except ConnectionError:
                    # Nope (private suite, wrong passphrase).
                    if cylc.flags.debug:
                        sys.stderr.write(' (wrong passphrase)' + '\n')
                else:
                    if cylc.flags.debug:
                        sys.stderr.write(
                            ' (got states with passphrase)' + '\n')
        conn.send((host, port, result))
    conn.close()
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified. The port file is read locally, or with "cat" over the
    remote shell if the suite host or owner is remote. Its first line
    is the port; an optional second line is the host.

    Sets self.uri, and fills in self.port and self.host where they were
    None.

    Raises PortFileError if the port file cannot be read or its first
    line is not an integer.
    """
    uri_data = {
        "host": self.host,
        "port": self.port,
        "suite": self.suite,
        "owner": self.owner,
        "target": self.target_server_object
    }
    port_file_path = os.path.join(
        GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
    if self.host is None or self.port is None:
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(
                GLOBAL_CFG.get_host_item('remote shell template',
                                         self.host, self.owner))
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Let the remote shell expand $HOME for the remote account
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path
            ]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out = proc.communicate()[0]
            if proc.returncode:
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # "with" closes the handle instead of leaving it to the GC
                with open(port_file_path) as handle:
                    out = handle.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if uri_data["port"] is None:
                uri_data["port"] = int(lines[0])
                self.port = uri_data["port"]
        except (IndexError, ValueError):
            raise PortFileError("ERROR, bad content in port file: %s" %
                                port_file_path)
        if uri_data["host"] is None:
            if len(lines) >= 2:
                uri_data["host"] = lines[1].strip()
            else:
                uri_data["host"] = "localhost"
            self.host = uri_data["host"]
    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' %
        uri_data)
def __init__(self, task_id, suite, jobconfig, submit_num):
    """Work out job script and log file locations for a task job.

    Args:
        task_id: task identifier, stored as self.task_id.
        suite: suite name, used to look up the remote job log directory.
        jobconfig: job configuration. 'log files', 'command template',
            'common job log path', 'local job file path', 'task host'
            and 'task owner' are read from it; the 'directive ...' and
            'job vacation signal' defaults are written to it.
        submit_num: not used by this method.

    The job is treated as remote if is_remote_host(task host) or
    is_remote_user(task owner) is true. Remote stdout/stderr logs are
    recorded as "[owner@]host:path" locations, local ones as plain
    paths.
    """
    self.jobconfig = jobconfig
    self.task_id = task_id
    self.suite = suite
    self.logfiles = jobconfig.get('log files')

    self.command = None
    self.job_submit_command_template = jobconfig.get('command template')

    common_job_log_path = jobconfig.get('common job log path')
    self.local_jobfile_path = jobconfig.get('local job file path')
    self.logfiles.add_path(self.local_jobfile_path)

    task_host = jobconfig.get('task host')
    task_owner = jobconfig.get('task owner')

    self.remote_shell_template = GLOBAL_CFG.get_host_item(
        'remote shell template', task_host, task_owner)

    if is_remote_host(task_host) or is_remote_user(task_owner):
        self.local = False
        self.task_owner = task_owner or None
        self.task_host = task_host or socket.gethostname()

        remote_job_log_dir = GLOBAL_CFG.get_derived_host_item(
            self.suite, 'suite job log directory',
            self.task_host, self.task_owner)
        remote_jobfile_path = os.path.join(
            remote_job_log_dir, common_job_log_path)

        # Remote log files
        self.stdout_file = remote_jobfile_path + ".out"
        self.stderr_file = remote_jobfile_path + ".err"

        # Used in command construction:
        self.jobfile_path = remote_jobfile_path

        # Record paths of remote log files for access by gui, by ssh URL.
        # (Plain local filesystem access is not used here: it would need
        # special namespace config to say the hosts share a filesystem,
        # and log directories given with e.g. '$HOME' would not work
        # because the gui reads logs under the suite owner account.)
        url_prefix = self.task_host
        if self.task_owner:
            url_prefix = self.task_owner + "@" + url_prefix
        self.logfiles.add_path(url_prefix + ':' + self.stdout_file)
        self.logfiles.add_path(url_prefix + ':' + self.stderr_file)
    else:
        # LOCAL TASKS
        self.local = True
        self.task_owner = None
        # Used in command construction:
        self.jobfile_path = self.local_jobfile_path

        # Local stdout and stderr log file paths:
        self.stdout_file = self.local_jobfile_path + ".out"
        self.stderr_file = self.local_jobfile_path + ".err"

        # interpolate environment variables in extra logs
        for idx, path in enumerate(self.logfiles.paths):
            self.logfiles.paths[idx] = expandvars(path)

        # Record paths of local log files for access by gui
        self.logfiles.add_path(self.stdout_file)
        self.logfiles.add_path(self.stderr_file)

    # set some defaults that can be overridden by derived classes
    self.jobconfig['directive prefix'] = None
    self.jobconfig['directive final'] = "# FINAL DIRECTIVE"
    self.jobconfig['directive connector'] = " "
    self.jobconfig['job vacation signal'] = None

    # overrideable methods
    self.set_directives()
    self.set_job_vacation_signal()
    self.set_scripting()
    self.set_environment()
def detect_old_contact_file(self, reg):
    """Detect old suite contact file.

    Load the contact file for suite "reg" and run "ps" (via the
    configured ssh command if the recorded host is remote) to look for
    the recorded process.

    Returns None if the contact file cannot be loaded, or if "ps" ran
    but listed no matching process and the stale contact file could be
    removed.

    Raise SuiteServiceFileError if old contact file exists, and there is
    evidence that the old suite is still running (or no evidence that it
    is not, e.g. the "ps" command produced no usable output).
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["ps", "-opid,args", str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    # The process has exited by now; communicate() just collects output.
    for line in reversed(proc.communicate()[0].splitlines()):
        fields = line.split(None, 1)
        if not fields:
            # Blank line: cannot match anything, and has no first field.
            continue
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif fields[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    sys.stderr.write(
        (
            r"""ERROR, suite contact file exists: %(fname)s

If %(suite)s is not running, delete the suite contact file and try again.
If it is running but unresponsive, kill any left over suite processes too.

To see if %(suite)s is running on '%(host)s:%(port)s':
 * cylc scan -n '\b%(suite)s\b' '%(host)s'
 * cylc ping -v --host='%(host)s' '%(suite)s'
 * ssh -n '%(host)s' 'ps -o pid,args %(pid)s'

"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
    raise SuiteServiceFileError(
        "ERROR, suite contact file exists: %s" % fname)
def load_passphrase(self, suite, owner, host, cache_ok=True):
    """Search for passphrase file for suite, load and return content.

    The "passphrase" file is searched from these locations in order:

    1/ For running task jobs (i.e. $CYLC_SUITE_NAME matches "suite"):
       a/ $CYLC_SUITE_RUN_DIR then $CYLC_SUITE_DEF_PATH for remote jobs.
       b/ $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST for local jobs or remote jobs
          with SSH messaging.

    2/ From memory cache, for passphrases of remote suites.
       Don't use if cache_ok=False.

    3/ For suite on local user@host. The suite definition directory, as
       registered. (Note: Previously, this needed to be the 1st location,
       else sub-suites load their parent suite's passphrase on start-up
       because the "cylc run" command runs in a parent suite task
       execution environment. This problem no longer exists because on
       suite start up, the "load_passphrase_from_dir" method is called
       directly instead of through this method.)

    4/ Locations under $HOME/.cylc/ for remote suite control from accounts
       that do not actually need the suite definition directory to be
       installed (a/ is now preferred. b/ c/ d/ are for back compat):
       a/ $HOME/.cylc/passphrases/SUITE_OWNER@SUITE_HOST/SUITE_NAME/
       b/ $HOME/.cylc/SUITE_HOST/SUITE_OWNER/SUITE_NAME/
       c/ $HOME/.cylc/SUITE_HOST/SUITE_NAME/
       d/ $HOME/.cylc/SUITE_NAME/
       Don't use if cache_ok=False.

    5/ For remote suites, try locating the passphrase file from suite
       definition directory on remote owner@host via SSH.

    If owner or host is None, the current USER or the local host name is
    used from step 2 onwards. If no passphrase is found the method falls
    off the end (returning None), after printing an error to STDERR when
    the SSH result is None and debug mode is on.
    """
    # NOTE(review): this flag is stored under the owner/host exactly as
    # given (possibly None), whereas the True flag in step 5 is stored
    # under the defaulted owner/host - confirm callers expect that.
    self.can_disk_cache_passphrases[(suite, owner, host)] = False

    # 1/ Running tasks: suite run/def dir from the task job environment.
    # (This goes before the memory cache, else sub-suites load their parent
    # suite's passphrase on start-up because the "cylc run" command runs in
    # a parent suite task execution environment.)
    # Only applies in the task execution environment of the requested suite.
    if suite == os.getenv('CYLC_SUITE_NAME'):
        job_suite_host = os.getenv('CYLC_SUITE_HOST')
        job_suite_owner = os.getenv('CYLC_SUITE_OWNER')
        if (is_remote_host(job_suite_host) or
                is_remote_user(job_suite_owner)):
            # 1a/ Task messaging call on a remote account.
            # First look in the remote suite run directory, then the suite
            # definition directory ($CYLC_SUITE_DEF_PATH is modified for
            # remote tasks).
            dir_env_names = ('CYLC_SUITE_RUN_DIR', 'CYLC_SUITE_DEF_PATH')
        elif job_suite_host or job_suite_owner:
            # 1b/ Task messaging call on the suite host account.
            # Could be a local task or a remote task with 'ssh messaging =
            # True'. In either case use $CYLC_SUITE_DEF_PATH_ON_SUITE_HOST
            # which never changes, not $CYLC_SUITE_DEF_PATH which gets
            # modified for remote tasks as described above.
            dir_env_names = ('CYLC_SUITE_DEF_PATH_ON_SUITE_HOST',)
        else:
            dir_env_names = ()
        for dir_env_name in dir_env_names:
            try:
                return self.load_passphrase_from_dir(
                    os.environ[dir_env_name])
            except (KeyError, IOError, PassphraseError):
                continue

    # Fill in the defaults used by all of the remaining steps.
    if owner is None:
        owner = USER
    if host is None:
        host = get_hostname()

    # 2/ From memory cache.
    if cache_ok:
        try:
            return self.cached_passphrases[(suite, owner, host)]
        except KeyError:
            pass

    # 3/ Cylc commands with suite definition directory from local reg.
    if cache_ok or not (is_remote_user(owner) or is_remote_host(host)):
        try:
            return self.load_passphrase_from_dir(self.get_suitedir(suite))
        except (IOError, PassphraseError, RegistrationError):
            pass

    # 4/ Other allowed locations, as documented above.
    # For remote control commands, host here will be fully qualified or
    # not depending on what's given on the command line, so try both the
    # given name and its first dot-separated component.
    if cache_ok:
        short_host = host.split('.', 1)[0]
        dot_cylc = os.path.expanduser(os.path.join('~', '.cylc'))
        candidates = [
            (dot_cylc, self.PASSPHRASES_DIR_BASE, owner + "@" + host, suite),
            (dot_cylc, self.PASSPHRASES_DIR_BASE, owner + "@" + short_host,
             suite),
            (dot_cylc, host, owner, suite),
            (dot_cylc, short_host, owner, suite),
            (dot_cylc, host, suite),
            (dot_cylc, short_host, suite),
            (dot_cylc, suite)]
        tried = set()
        for parts in candidates:
            location = os.path.join(*parts)
            if location in tried:
                # Same directory as an earlier candidate (e.g. host is
                # already the short name): no point trying it again.
                continue
            try:
                return self.load_passphrase_from_dir(location)
            except (IOError, PassphraseError):
                tried.add(location)

    # 5/ Try SSH to remote host.
    passphrase = self._load_passphrase_via_ssh(suite, owner, host)
    if passphrase:
        self.can_disk_cache_passphrases[(suite, owner, host)] = True
        return passphrase
    if passphrase is None and cylc.flags.debug:
        print >> sys.stderr, (
            'ERROR: passphrase for suite %s not found for %s@%s' % (
                suite, owner, host))
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified.

    Sources are consulted in this order, each only while host or port is
    still unknown:
    1/ The "cylc-suite-env" file under $CYLC_SUITE_RUN_DIR, if that
       variable is set. This also sets self.owner.
    2/ The suite's port file, read with "cat" through the configured
       remote shell if host or owner is remote, else read locally. Its
       first line is the port, and its second line (if any) is the host.
       Without a second line the local host name is used.

    Sets self.uri (and possibly self.host, self.port and self.owner).

    Raise PortFileError if the port file cannot be read or its first line
    is not an integer.
    """
    if ((self.host is None or self.port is None) and
            'CYLC_SUITE_RUN_DIR' in os.environ):
        # Looks like we are in a running task job, so we should be able to
        # use "cylc-suite-env" file under the suite running directory
        try:
            suite_env = CylcSuiteEnv.load(
                self.suite, os.environ['CYLC_SUITE_RUN_DIR'])
        except CylcSuiteEnvLoadError:
            # Best effort only: fall back to the port file below.
            if cylc.flags.debug:
                traceback.print_exc()
        else:
            self.host = suite_env.suite_host
            self.port = suite_env.suite_port
            self.owner = suite_env.suite_owner

    if self.host is None or self.port is None:
        port_file_path = os.path.join(
            GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            # Drop the ' %s' placeholder: the target is appended as a
            # separate argument below.
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            user_at_host += self.host or 'localhost'
            # Refer to the home directory symbolically, for expansion on
            # the remote side.
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            ret_code = proc.wait()
            if ret_code:
                if cylc.flags.debug:
                    print >> sys.stderr, {
                        "code": ret_code,
                        "command": command,
                        "stdout": out,
                        "stderr": err}
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Use a context manager so that the file handle is always
                # closed, rather than left to the garbage collector.
                with open(port_file_path) as port_file:
                    out = port_file.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if self.port is None:
                self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if self.host is None:
            if len(lines) >= 2:
                self.host = lines[1].strip()
            else:
                self.host = get_hostname()

    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' % {
            "host": self.host,
            "port": self.port,
            "suite": self.suite,
            "owner": self.owner,
            "target": self.target_server_object})
def _set_uri(self):
    """Set Pyro URI.

    Determine host and port using content in port file, unless already
    specified.

    While host or port is unknown, the following are tried in turn:
    * the "cylc-suite-env" file under $CYLC_SUITE_RUN_DIR (when that
      variable is set), which supplies host, port and owner;
    * the suite's port file under the configured ports directory, fetched
      with "cat" via the remote shell template when host or owner is
      remote, otherwise opened locally. Line 1 gives the port, line 2 (if
      present) the host, and the local host name is the fallback host.

    The result is stored in self.uri.

    Raise PortFileError if the port file cannot be read, or if its first
    line is missing or is not an integer.
    """
    need_lookup = self.host is None or self.port is None
    if need_lookup and 'CYLC_SUITE_RUN_DIR' in os.environ:
        # Looks like we are in a running task job, so we should be able to
        # use "cylc-suite-env" file under the suite running directory
        try:
            suite_env = CylcSuiteEnv.load(
                self.suite, os.environ['CYLC_SUITE_RUN_DIR'])
        except CylcSuiteEnvLoadError:
            # Not fatal: the port file is consulted next.
            if cylc.flags.debug:
                traceback.print_exc()
        else:
            self.host = suite_env.suite_host
            self.port = suite_env.suite_port
            self.owner = suite_env.suite_owner

    if self.host is None or self.port is None:
        port_file_path = os.path.join(
            GLOBAL_CFG.get(['pyro', 'ports directory']), self.suite)
        if is_remote_host(self.host) or is_remote_user(self.owner):
            ssh_tmpl = str(GLOBAL_CFG.get_host_item(
                'remote shell template', self.host, self.owner))
            # The ' %s' placeholder is removed because the target is
            # passed as its own argument in the command list.
            ssh_tmpl = ssh_tmpl.replace(' %s', '')
            user_at_host = ''
            if self.owner:
                user_at_host = self.owner + '@'
            if self.host:
                user_at_host += self.host
            else:
                user_at_host += 'localhost'
            # Use a literal '$HOME' in place of the local home directory
            # in the path given to the remote "cat".
            r_port_file_path = port_file_path.replace(
                os.environ['HOME'], '$HOME')
            command = shlex.split(ssh_tmpl) + [
                user_at_host, 'cat', r_port_file_path]
            proc = Popen(command, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            ret_code = proc.wait()
            if ret_code:
                if cylc.flags.debug:
                    print >> sys.stderr, {
                        "code": ret_code,
                        "command": command,
                        "stdout": out,
                        "stderr": err}
                raise PortFileError(
                    "Port file '%s:%s' not found - suite not running?." %
                    (user_at_host, r_port_file_path))
        else:
            try:
                # Close the file handle deterministically instead of
                # relying on garbage collection of the file object.
                with open(port_file_path) as handle:
                    out = handle.read()
            except IOError:
                raise PortFileError(
                    "Port file '%s' not found - suite not running?." %
                    (port_file_path))
        lines = out.splitlines()
        try:
            if self.port is None:
                self.port = int(lines[0])
        except (IndexError, ValueError):
            raise PortFileError(
                "ERROR, bad content in port file: %s" % port_file_path)
        if self.host is None:
            if len(lines) >= 2:
                self.host = lines[1].strip()
            else:
                self.host = get_hostname()

    # Qualify the obj name with user and suite name (unnecessary but
    # can't change it until we break back-compat with older daemons).
    self.uri = (
        'PYROLOC://%(host)s:%(port)s/%(owner)s.%(suite)s.%(target)s' % {
            "host": self.host,
            "port": self.port,
            "suite": self.suite,
            "owner": self.owner,
            "target": self.target_server_object})