def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host.

    Builds an rsync command (transport = configured ssh) that pulls the
    job log directories identified by "id_keys" from the remote suite
    job log directory into the local one, then queues it on the process
    pool with self._job_logs_retrieval_callback.
    """
    # Split optional "user@host" into its owner and host parts.
    if ctx.user_at_host and "@" in ctx.user_at_host:
        s_user, s_host = ctx.user_at_host.split("@", 1)
    else:
        s_user, s_host = (None, ctx.user_at_host)
    ssh_str = str(GLOBAL_CFG.get_host_item("ssh command", s_host, s_user))
    rsync_str = str(GLOBAL_CFG.get_host_item(
        "retrieve job logs command", s_host, s_user))

    # Base rsync command; use the configured ssh command as transport.
    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if cylc.flags.debug:
        cmd.append("-v")
    if ctx.max_size:
        # Skip files larger than the configured maximum.
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Includes and excludes.
    # rsync include rules must name every directory level on the path,
    # hence the four entries per id_key.
    includes = set()
    for _, point, name, submit_num in id_keys:
        # Include relevant directories, all levels needed
        includes.add("/%s" % (point))
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    cmd += ["--include=%s" % (include) for include in sorted(includes)]
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append(ctx.user_at_host + ":" + GLOBAL_CFG.get_derived_host_item(
        schd_ctx.suite, "suite job log directory", s_host, s_user) + "/")
    # Local target
    cmd.append(GLOBAL_CFG.get_derived_host_item(
        schd_ctx.suite, "suite job log directory") + "/")
    self.proc_pool.put_command(
        SuiteProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    owner, host = (
        user_at_host.split('@', 1) if '@' in user_at_host
        else (None, user_at_host))
    # Skip the suite host itself, already-initialised hosts, and
    # single task mode.
    if (owner, host) in [(None, 'localhost'), (USER, 'localhost')]:
        return
    if host in self.initialised_hosts or self.single_task_mode:
        return
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    # Items to copy to the remote host.
    sources = [os.path.join(suite_run_dir, CylcSuiteEnv.BASE_NAME)]
    if 'CYLC_SUITE_DEF_PATH' in os.environ:
        sources.append(
            os.path.join(os.getenv('CYLC_SUITE_DEF_PATH'), 'passphrase'))
    suite_run_py = os.path.join(suite_run_dir, 'python')
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite job log directory', host, owner)
    getLogger('main').log(INFO, 'Initialising %s:%s' % (
        user_at_host, r_suite_run_dir))
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner).replace(' %s', '')
    scp_tmpl = GLOBAL_CFG.get_host_item('remote copy template', host, owner)
    # First create the remote directories, then copy the sources over.
    mkdir_cmd = shlex.split(ssh_tmpl)
    mkdir_cmd += [
        "-n", user_at_host, 'mkdir', '-p', r_suite_run_dir, r_log_job_dir]
    copy_cmd = shlex.split(scp_tmpl) + ['-pr'] + sources
    copy_cmd.append(user_at_host + ":" + r_suite_run_dir + '/')
    for cmd in (mkdir_cmd, copy_cmd):
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host,
                " ".join(quote(item) for item in cmd),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    is_local = (owner, host) in [(None, 'localhost'), (USER, 'localhost')]
    if is_local or host in self.initialised_hosts or self.single_task_mode:
        return
    run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    # Files and directories to install on the remote host.
    sources = [os.path.join(run_dir, CylcSuiteEnv.BASE_NAME)]
    if 'CYLC_SUITE_DEF_PATH' in os.environ:
        sources.append(
            os.path.join(os.getenv('CYLC_SUITE_DEF_PATH'), 'passphrase'))
    run_py = os.path.join(run_dir, 'python')
    if os.path.isdir(run_py):
        sources.append(run_py)
    remote_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory', host, owner)
    remote_log_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite job log directory', host, owner)
    getLogger('main').log(
        INFO, 'Initialising %s:%s' % (user_at_host, remote_run_dir))
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        'remote shell template', host, owner).replace(' %s', '')
    scp_tmpl = GLOBAL_CFG.get_host_item('remote copy template', host, owner)
    mkdir_cmd = shlex.split(ssh_tmpl)
    mkdir_cmd.extend([
        "-n", user_at_host,
        'mkdir', '-p', remote_run_dir, remote_log_dir])
    copy_cmd = shlex.split(scp_tmpl) + ['-pr'] + sources
    copy_cmd.append(user_at_host + ":" + remote_run_dir + '/')
    # Run both commands; abort on the first failure.
    for cmd in (mkdir_cmd, copy_cmd):
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host, " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def write_environment_1(self, BUFFER=None):
    """Write suite and task environment export lines to BUFFER.

    Defaults to self.FILE.  Emits the static suite variables, then the
    task-host-specific overrides, then the per-task variables, each as
    shell "export VAR=value" lines.
    """
    if not BUFFER:
        BUFFER = self.FILE
    BUFFER.write("\n\n# CYLC SUITE ENVIRONMENT:")
    # write the static suite variables
    for var, val in sorted(self.__class__.suite_env.items()):
        BUFFER.write("\nexport " + var + "=" + str(val))

    if str(self.__class__.suite_env.get('CYLC_UTC')) == 'True':
        BUFFER.write("\nexport TZ=UTC")

    BUFFER.write("\n")
    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite work directory', self.host, self.owner)
    st_env = deepcopy(self.__class__.suite_task_env)
    st_env['CYLC_SUITE_RUN_DIR'] = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite run directory', self.host, self.owner)
    st_env['CYLC_SUITE_WORK_DIR'] = suite_work_dir
    st_env['CYLC_SUITE_SHARE_DIR'] = GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite share directory', self.host, self.owner)
    st_env['CYLC_SUITE_SHARE_PATH'] = '$CYLC_SUITE_SHARE_DIR'  # DEPRECATED
    rsp = self.jobconfig['remote suite path']
    if rsp:
        st_env['CYLC_SUITE_DEF_PATH'] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host
        # NOTE(review): re.sub treats $HOME as a regex pattern here, not
        # a literal string -- confirm this is safe for all home paths.
        st_env['CYLC_SUITE_DEF_PATH'] = re.sub(
            os.environ['HOME'], '$HOME', st_env['CYLC_SUITE_DEF_PATH'])
    for var, val in sorted(st_env.items()):
        BUFFER.write("\nexport " + var + "=" + str(val))

    task_work_dir = os.path.join(
        suite_work_dir, self.jobconfig['work sub-directory'])

    use_login_shell = GLOBAL_CFG.get_host_item(
        'use login shell', self.host, self.owner)
    comms = GLOBAL_CFG.get_host_item(
        'task communication method', self.host, self.owner)

    # Per-task variables, in alphabetical order.
    BUFFER.write("\n\n# CYLC TASK ENVIRONMENT:")
    BUFFER.write("\nexport CYLC_TASK_COMMS_METHOD=" + comms)
    BUFFER.write("\nexport CYLC_TASK_CYCLE_POINT=" + self.point_string)
    BUFFER.write("\nexport CYLC_TASK_CYCLE_TIME=" + self.point_string)
    BUFFER.write("\nexport CYLC_TASK_ID=" + self.task_id)
    BUFFER.write("\nexport CYLC_TASK_IS_COLDSTART=" + str(
        self.jobconfig['is cold-start']))
    BUFFER.write("\nexport CYLC_TASK_LOG_ROOT=" + self.log_root)
    BUFFER.write("\nexport CYLC_TASK_MSG_MAX_TRIES=" + str(
        GLOBAL_CFG.get(['task messaging', 'maximum number of tries'])))
    BUFFER.write("\nexport CYLC_TASK_MSG_RETRY_INTVL=" + str(
        GLOBAL_CFG.get(['task messaging', 'retry interval in seconds'])))
    BUFFER.write("\nexport CYLC_TASK_MSG_TIMEOUT=" + str(
        GLOBAL_CFG.get(['task messaging', 'connection timeout in seconds'])))
    BUFFER.write("\nexport CYLC_TASK_NAME=" + self.task_name)
    BUFFER.write(
        '\nexport CYLC_TASK_NAMESPACE_HIERARCHY="' +
        ' '.join(self.jobconfig['namespace hierarchy']) + '"')
    BUFFER.write(
        "\nexport CYLC_TASK_SSH_LOGIN_SHELL=" + str(use_login_shell))
    BUFFER.write("\nexport CYLC_TASK_SUBMIT_NUMBER=" + str(
        self.jobconfig['absolute submit number']))
    BUFFER.write("\nexport CYLC_TASK_TRY_NUMBER=" + str(
        self.jobconfig['try number']))
    BUFFER.write("\nexport CYLC_TASK_WORK_DIR=" + task_work_dir)
    BUFFER.write(
        "\nexport CYLC_TASK_WORK_PATH=$CYLC_TASK_WORK_DIR")  # DEPRECATED
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if '@' in user_at_host:
        owner, host = user_at_host.split('@', 1)
    else:
        owner, host = None, user_at_host
    # NOTE(review): "user" here is presumably the current username from
    # module scope (sibling variants of this function use a USER
    # constant) -- confirm it is defined where this lives.
    if ((owner, host) in [(None, 'localhost'), (user, 'localhost')]
            or host in self.initialised_hosts
            or self.single_task_mode):
        return
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite run directory')
    # Items to install on the remote host.
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)
    try:
        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite job log directory', host, owner)
        getLogger('main').log(INFO, 'Initialising %s:%s' % (
            user_at_host, r_suite_run_dir))
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            'remote shell template', host, owner).replace(" %s", "")
        scp_tmpl = GLOBAL_CFG.get_host_item(
            'remote copy template', host, owner)
        # Create both remote directories in one SSH invocation; the
        # mkdir arguments are passed as a single quoted remote command.
        cmd1 = shlex.split(ssh_tmpl) + [
            user_at_host,
            'mkdir -p "%s" "%s"' % (r_suite_run_dir, r_log_job_dir)]
        cmd2 = shlex.split(scp_tmpl) + ["-r"] + sources + [
            user_at_host + ":" + r_suite_run_dir + "/"]
        for cmd in [cmd1, cmd2]:
            check_call(cmd)
    except Exception:
        # Any failure (config lookup or command) aborts initialisation.
        raise RemoteJobHostInitError(user_at_host)
    self.initialised_hosts.append(user_at_host)
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    # Group the tasks by their (host, owner) authentication pair.
    auth_itasks = {}
    for itask in itasks:
        auth_itasks.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if cylc.flags.debug:
            cmd.append("--debug")
        # Only pass --host/--user for genuinely remote targets.
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = [
            self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num)
            for itask in sorted(itasks, key=lambda itask: itask.identity)]
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(cmd_key, cmd), callback, [suite, itasks])
def get_create_job_log_path(cls, suite, task_name, task_point, submit_num):
    """Return a new job log path on the suite host, in two parts.

    /part1/part2

    * part1: the top level job log directory on the suite host.
    * part2: the rest, which is also used on remote task hosts.

    The full local job log directory is created if necessary, and its
    parent symlinked to NN (submit number).

    """
    log_root = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory")
    sub_dir = os.path.join(
        str(task_point), task_name, "%02d" % int(submit_num))
    local_dir = os.path.join(log_root, sub_dir)
    mkdir_p(local_dir)
    # Re-point the NN symlink at the latest submit-number directory.
    nn_link = os.path.join(os.path.dirname(local_dir), "NN")
    try:
        os.unlink(nn_link)
    except OSError:
        pass
    try:
        os.symlink(os.path.basename(local_dir), nn_link)
    except OSError as exc:
        if not exc.filename:
            exc.filename = nn_link
        raise exc
    return log_root, os.path.join(sub_dir, "job")
def get_create_job_log_path(cls, suite, task_name, task_point, submit_num):
    """Return a new job log path on the suite host, in two parts.

    /part1/part2

    * part1: the top level job log directory on the suite host.
    * part2: the rest, which is also used on remote task hosts.

    The full local job log directory is created if necessary, and its
    parent symlinked to NN (submit number).

    """
    top_dir = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory")
    rest_dir = os.path.join(
        str(task_point), task_name, "%02d" % int(submit_num))
    job_dir = os.path.join(top_dir, rest_dir)
    mkdir_p(job_dir)
    # Remove any stale NN symlink, then link NN -> latest submit dir.
    nn_path = os.path.join(os.path.dirname(job_dir), "NN")
    try:
        os.unlink(nn_path)
    except OSError:
        pass
    try:
        os.symlink(os.path.basename(job_dir), nn_path)
    except OSError as exc:
        if not exc.filename:
            exc.filename = nn_path
        raise exc
    return top_dir, os.path.join(rest_dir, "job")
def unlink_suite_contact_files(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.
    """
    # Issue all SSH commands in parallel
    procs = {}
    for user_at_host, should_unlink in self.initialised_hosts.items():
        if not should_unlink:
            continue
        if "@" in user_at_host:
            owner, host = user_at_host.split("@", 1)
        else:
            owner, host = None, user_at_host
        ssh_tmpl = GLOBAL_CFG.get_host_item(
            "remote shell template", host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, "suite run directory", host, owner),
            SuiteSrvFilesManager.DIR_BASE_SRV,
            SuiteSrvFilesManager.FILE_BASE_CONTACT)
        cmd = shlex.split(ssh_tmpl) + [
            "-n", user_at_host, "rm", "-f", r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        # Iterate over a copy so entries can be deleted as they finish.
        for user_at_host, (cmd, proc) in procs.copy().items():
            # poll() is None while running; any integer (including 0)
            # means the command has exited.  (The previous test,
            # "not proc.poll()", wrongly treated exit code 0 as still
            # running, so successful commands were never reaped here.)
            if proc.poll() is None:
                continue
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                ERR.warning(
                    RemoteJobHostInitError(
                        RemoteJobHostInitError.MSG_TIDY,
                        user_at_host,
                        " ".join([quote(item) for item in cmd]),
                        proc.returncode, out, err))
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        # Only warn on a non-zero exit, consistent with the loop above.
        if proc.wait():
            ERR.warning(
                RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host,
                    " ".join([quote(item) for item in cmd]),
                    proc.returncode, out, err))
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Groups prepared tasks by (host, owner) and queues one
    "cylc jobs-submit" command per group on the process pool.
    In simulation mode, delegates to the simulation submitter instead.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return

    # Submit task jobs
    auth_itasks = {}
    for itask in prepared_tasks:
        # The job file is now (about to be) used: reset the file write flag
        # so that subsequent manual retrigger will generate a new job file.
        itask.local_job_file_path = None
        itask.state.reset_state(TASK_STATUS_READY)
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for auth, itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append("--debug")
        host, owner = auth
        remote_mode = False
        kwargs = {}
        # Add --host/--user only when the target is genuinely remote.
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # Remote submission reads the job file on STDIN.
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(
                self.JOBS_SUBMIT,
                cmd,
                stdin_file_paths=stdin_file_paths,
                job_log_dirs=job_log_dirs,
                **kwargs
            ),
            self._submit_task_jobs_callback, [suite, itasks])
def init_suite_run_dir(self, suite_name, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.

    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # Skip the suite host itself, already-done hosts, and single task
    # mode.
    if (owner, host) in [(None, "localhost"), (user, "localhost")]:
        return
    if host in self.initialised_hosts or self.single_task_mode:
        return
    suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory")
    sources = [os.path.join(suite_run_dir, "cylc-suite-env")]
    if "CYLC_SUITE_DEF_PATH" in os.environ:
        sources.append(
            os.path.join(os.getenv("CYLC_SUITE_DEF_PATH"), "passphrase"))
    suite_run_py = os.path.join(suite_run_dir, "python")
    if os.path.isdir(suite_run_py):
        sources.append(suite_run_py)
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, "suite job log directory", host, owner)
    getLogger("main").log(
        INFO, "Initialising %s:%s" % (user_at_host, r_suite_run_dir))
    ssh_tmpl = GLOBAL_CFG.get_host_item(
        "remote shell template", host, owner).replace(" %s", "")
    scp_tmpl = GLOBAL_CFG.get_host_item("remote copy template", host, owner)
    # Two steps: make the remote dirs, then copy the sources over.
    cmds = [
        shlex.split(ssh_tmpl) + [
            "-n", user_at_host,
            "mkdir", "-p", r_suite_run_dir, r_log_job_dir],
        shlex.split(scp_tmpl) + ["-pr"] + sources + [
            user_at_host + ":" + r_suite_run_dir + "/"],
    ]
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                user_at_host, " ".join(quote(item) for item in cmd),
                proc.returncode, out, err)
    self.initialised_hosts.append(user_at_host)
def get_task_job_log(
        self, suite, point, name, submit_num=None, tail=None):
    """Return the job log path."""
    path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(suite, "suite job log directory"),
        self.get_task_job_id(point, name, submit_num))
    if tail:
        # Append the final path component (e.g. a file name).
        path = os.path.join(path, tail)
    return path
def __init__(self, suite):
    """Set up main suite log paths and rolling-archive settings."""
    self.ldir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory')
    self.path = os.path.join(self.ldir, 'log')
    self.err_path = os.path.join(self.ldir, 'err')
    # All rollover settings live under the [suite logging] section.
    section = ['suite logging']
    self.roll_at_startup = GLOBAL_CFG.get(
        section + ['roll over at start-up'])
    self.n_keep = GLOBAL_CFG.get(section + ['rolling archive length'])
    self.max_bytes = GLOBAL_CFG.get(section + ['maximum size in bytes'])
def __init__(self, suite):
    """Set up suite stdout/stderr log paths and rollover settings."""
    log_dir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory')
    self.opath = os.path.join(log_dir, 'out')
    self.epath = os.path.join(log_dir, 'err')
    # use same archive length as logging (TODO: document this)
    self.roll_at_startup = GLOBAL_CFG.get(
        ['suite logging', 'roll over at start-up'])
    self.arclen = GLOBAL_CFG.get(
        ['suite logging', 'rolling archive length'])
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Returns the item content as a string, or None on any failure
    (local suite, SSH launch failure, non-zero exit, missing prefix).
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        # Local suite: nothing to fetch over SSH.
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('remote shell template', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        # SSH command could not be launched at all.
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return content
def get_suite_srv_dir(self, reg, suite_owner=None):
    """Return service directory of a suite."""
    if not suite_owner:
        suite_owner = get_user()
    run_d = os.getenv("CYLC_SUITE_RUN_DIR")
    # Use the environment's run dir only when it belongs to this suite
    # and owner; otherwise derive it from the global configuration.
    env_matches = (
        run_d and
        os.getenv("CYLC_SUITE_NAME") == reg and
        os.getenv("CYLC_SUITE_OWNER") == suite_owner)
    if not env_matches:
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        run_d = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory')
    return os.path.join(run_d, self.DIR_BASE_SRV)
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Returns the item content as a string, or None on any failure
    (local suite, SSH launch failure, non-zero exit, missing prefix).
    """
    if not is_remote_host(host) and not is_remote_user(owner):
        # Local suite: nothing to fetch over SSH.
        return
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    from cylc.cfgspec.globalcfg import GLOBAL_CFG
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        GLOBAL_CFG.get_host_item('ssh command', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(command, stdout=PIPE, stderr=PIPE)
    except OSError:
        # SSH command could not be launched at all.
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = proc.communicate()
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        if cylc.flags.debug:
            print >> sys.stderr, (
                'ERROR: %(command)s # code=%(ret_code)s\n%(err)s\n'
            ) % {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            }
        return
    return content
def get_suite_srv_dir(self, reg, suite_owner=None):
    """Return service directory of a suite."""
    if not suite_owner:
        suite_owner = USER
    run_d = os.getenv("CYLC_SUITE_RUN_DIR")
    # Trust the environment's run dir only if it names this very suite
    # and owner; otherwise fall back to the global configuration.
    in_own_env = (
        run_d and
        os.getenv("CYLC_SUITE_NAME") == reg and
        os.getenv("CYLC_SUITE_OWNER") == suite_owner)
    if not in_own_env:
        from cylc.cfgspec.globalcfg import GLOBAL_CFG
        run_d = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory')
    return os.path.join(run_d, self.DIR_BASE_SRV)
def __init__(self, suite):
    """Locate suite stdout/stderr logs and load rollover settings."""
    sodir = GLOBAL_CFG.get_derived_host_item(suite, 'suite log directory')
    for attr, base_name in (('opath', 'out'), ('epath', 'err')):
        setattr(self, attr, os.path.join(sodir, base_name))
    # use same archive length as logging (TODO: document this)
    self.roll_at_startup = GLOBAL_CFG.get(
        ['suite logging', 'roll over at start-up'])
    self.arclen = GLOBAL_CFG.get(
        ['suite logging', 'rolling archive length'])
def unlink_suite_contact_files(self, reg): """Remove suite contact files from initialised hosts. This is called on shutdown, so we don't want anything to hang. Terminate any incomplete SSH commands after 10 seconds. """ # Issue all SSH commands in parallel procs = {} for (host, owner), should_unlink in self.initialised.items(): if not should_unlink: continue user_at_host = host if owner: user_at_host = owner + '@' + host ssh_tmpl = GLOBAL_CFG.get_host_item('remote shell template', host, owner) r_suite_contact_file = os.path.join( GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory', host, owner), SuiteSrvFilesManager.DIR_BASE_SRV, SuiteSrvFilesManager.FILE_BASE_CONTACT) cmd = shlex.split(ssh_tmpl) + [ '-n', user_at_host, 'rm', '-f', r_suite_contact_file ] procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE)) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for user_at_host, (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[user_at_host] out, err = proc.communicate() if proc.wait(): ERR.warning( RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err)) # Terminate any remaining commands for user_at_host, (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = proc.communicate() if proc.wait(): ERR.warning( RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err))
def __init__(self, suite, run_mode='live', ict=None, stop_point=None):
    """Set up the state dump location and rolling archive settings."""
    self.run_mode = run_mode
    self.cts_str = None
    self.set_cts(ict, stop_point)
    self.dir_name = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite state directory')
    self.file_name = os.path.join(self.dir_name, self.BASE_NAME)
    # Keep at least one archived state dump.
    arch_len = GLOBAL_CFG.get(['state dump rolling archive length'])
    if not arch_len or int(arch_len) <= 1:
        arch_len = 1
    self.arch_len = arch_len
    self.arch_files = []
    self.pool = None
    self.log = logging.getLogger('main')
def __init__(self, suite, run_mode='live', ict=None, stop_point=None):
    """Set up state dump paths, archive length and runtime handles."""
    self.run_mode = run_mode
    self.set_cts(ict, stop_point)
    self.dir_name = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite state directory')
    self.file_name = os.path.join(self.dir_name, self.BASE_NAME)
    # Clamp the archive length to a minimum of one dump.
    length = GLOBAL_CFG.get(['state dump rolling archive length'])
    if not length or int(length) <= 1:
        length = 1
    self.arch_len = length
    self.arch_files = []
    self.pool = None
    self.wireless = None
    self.log = logging.getLogger('main')
def __init__(self, suite):
    """Set up suite log file paths and rollover configuration."""
    def _log_cfg(item):
        # Shorthand for a [suite logging] setting lookup.
        return GLOBAL_CFG.get(['suite logging', item])

    self.ldir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory')
    self.path = os.path.join(self.ldir, 'log')
    self.err_path = os.path.join(self.ldir, 'err')
    self.roll_at_startup = _log_cfg('roll over at start-up')
    self.n_keep = _log_cfg('rolling archive length')
    self.max_bytes = _log_cfg('maximum size in bytes')
def unlink_hosts_contacts(self, reg): """Remove suite contact files from initialised hosts. This is called on shutdown, so we don't want anything to hang. Terminate any incomplete SSH commands after 10 seconds. """ # Issue all SSH commands in parallel procs = {} for (host, owner), should_unlink in self.init_host_map.items(): if not should_unlink: continue user_at_host = host if owner: user_at_host = owner + '@' + host ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner) r_suite_contact_file = os.path.join( GLOBAL_CFG.get_derived_host_item( reg, 'suite run directory', host, owner), self.suite_srv_files_mgr.DIR_BASE_SRV, self.suite_srv_files_mgr.FILE_BASE_CONTACT) cmd = shlex.split(ssh_tmpl) + [ '-n', user_at_host, 'rm', '-f', r_suite_contact_file] procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE)) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for user_at_host, (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[user_at_host] out, err = proc.communicate() if proc.wait(): ERR.warning(RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err)) # Terminate any remaining commands for user_at_host, (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = proc.communicate() if proc.wait(): ERR.warning(RemoteJobHostInitError( RemoteJobHostInitError.MSG_TIDY, user_at_host, ' '.join([quote(item) for item in cmd]), proc.returncode, out, err))
def __init__(self, suite, test_params=None):
    """Create the suite log singleton.

    Args:
        suite: suite name, used to derive the log directory.
        test_params: optional dict of overrides for unit tests
            ('ldir', 'max_bytes', 'roll_at_startup').

    Raises:
        Exception: if an instance already exists (singleton guard).
    """
    if SuiteLog.__INSTANCE:
        # NOTE(review): message renders as "...singletoninstance."
        # (missing space between the two string fragments).
        raise Exception("Attempting to initiate a second singleton"
                        "instance.")
    self._group = None
    if not test_params:
        self.is_test = False
        self.max_bytes = GLOBAL_CFG.get(
            ['suite logging', 'maximum size in bytes'])
        self.roll_at_startup = GLOBAL_CFG.get(
            ['suite logging', 'roll over at start-up'])
        self.archive_length = GLOBAL_CFG.get(
            ['suite logging', 'rolling archive length'])
    else:
        self.is_test = True
        self.max_bytes = test_params['max_bytes']
        self.roll_at_startup = test_params['roll_at_startup']
        self.archive_length = 4

    # Log paths.
    if test_params:
        self.ldir = test_params['ldir']
    else:
        self.ldir = GLOBAL_CFG.get_derived_host_item(
            suite, 'suite log directory')
    self.log_paths = {}
    self.log_paths[self.LOG] = os.path.join(self.ldir, self.LOG)
    self.log_paths[self.OUT] = os.path.join(self.ldir, self.OUT)
    self.log_paths[self.ERR] = os.path.join(self.ldir, self.ERR)

    # The loggers.
    self.loggers = {}
    self.loggers[self.LOG] = None
    self.loggers[self.OUT] = None
    self.loggers[self.ERR] = None

    # Filename stamp functions.
    if self.is_test:
        # Tests use sub-second resolution; '.' is replaced as it is
        # not wanted in the file names.
        self.stamp = lambda: get_current_time_string(True, True, True
                                                     ).replace('.', '-')
    else:
        self.stamp = lambda: get_current_time_string(False, True, True)

    SuiteLog.__INSTANCE = self
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    # Group tasks by their (host, owner) authentication pair.
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if cylc.flags.debug:
            cmd.append("--debug")
        try:
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
        except IOError:
            # Bad host, run the command any way, command will fail and
            # callback will deal with it
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory", host, owner))
        # One job log dir argument per task, in identity order.
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(cmd_key, cmd), callback, [suite, itasks])
def __init__(self, cfg, updater, theme, info_bar, xdot):
    """Set up the graph-view updater state.

    Args:
        cfg: GUI configuration object (provides .suite and
            .ungrouped_views).
        updater: the shared suite state updater.
        theme: the active GUI theme.
        info_bar: the GUI info bar widget.
        xdot: the xdot graph display widget.
    """
    super(GraphUpdater, self).__init__()
    # Control flags for the update loop.
    self.quit = False
    self.cleared = False
    self.ignore_suicide = True
    self.focus_start_point_string = None
    self.focus_stop_point_string = None
    self.xdot = xdot
    self.first_update = False
    self.graph_disconnect = False
    self.action_required = True
    self.oldest_point_string = None
    self.newest_point_string = None
    # Display options.
    self.orientation = "TB"  # Top to Bottom ordering of nodes
    self.best_fit = True  # zoom to page size
    self.normal_fit = False  # zoom to 1.0 scale
    self.crop = False
    self.subgraphs_on = False  # organise by cycle point.

    self.descendants = {}
    self.all_families = []
    self.triggering_families = []
    self.write_dot_frames = False

    self.prev_graph_id = ()

    self.cfg = cfg
    self.updater = updater
    self.theme = theme
    self.info_bar = info_bar
    # Cached suite state from the updater.
    self.state_summary = {}
    self.fam_state_summary = {}
    self.global_summary = {}
    self.last_update_time = None
    self.god = None
    self.mode = "waiting..."
    self.dt = "waiting..."

    # NOTE(review): duplicate assignment -- prev_graph_id was already
    # set to () above.
    self.prev_graph_id = ()

    # empty graphw object:
    self.graphw = graphing.CGraphPlain(self.cfg.suite)

    # TODO - handle failure to get a remote proxy in reconnect()

    self.graph_warned = {}

    # lists of nodes to newly group or ungroup (not of all currently
    # grouped and ungrouped nodes - still held server side)
    self.group = []
    self.ungroup = []
    self.have_leaves_and_feet = False
    self.leaves = []
    self.feet = []
    self.ungroup_recursive = False
    # Initial grouping mode depends on the view's ungrouped setting.
    if "graph" in self.cfg.ungrouped_views:
        self.ungroup_all = True
        self.group_all = False
    else:
        self.ungroup_all = False
        self.group_all = True

    self.graph_frame_count = 0

    self.suite_share_dir = GLOBAL_CFG.get_derived_host_item(
        self.cfg.suite, 'suite share directory')
def __init__(self, suite, task_name, task_point):
    """Record the job-log base path and the suite logger for a task."""
    job_log_root = GLOBAL_CFG.get_derived_host_item(
        suite, "suite job log directory")
    self.base_path = os.path.join(
        job_log_root, str(task_point), task_name)
    self.suite_logger = logging.getLogger("main")
def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
    """Helper for self._prep_submit_task_job.

    Bump the submit number, resolve owner@host and polling timers for
    *itask*, then return the job config dict used to write the job file.
    """
    # Submit number
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num

    # Owner comes from the (possibly overridden) runtime config; the
    # host is assumed to have been set on itask already by the caller.
    itask.task_owner = rtconfig['remote']['owner']
    if itask.task_owner:
        owner_at_host = itask.task_owner + "@" + itask.task_host
    else:
        owner_at_host = itask.task_host
    itask.summary['host'] = owner_at_host
    itask.summary['job_hosts'][itask.submit_num] = owner_at_host

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Batch system settings for this task's host, if configured.
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        # No execution time limit configured.
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (TaskActionTimer(
            delays=batch_sys_conf.get(
                'execution time limit polling intervals',
                [60, 120, 420])))
    # Reuse existing submission/execution poll timers, else create them
    # from host configuration.
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)
    ]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    scripts = self._get_job_scripts(itask, rtconfig)
    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)
    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_env_tmpl': rtconfig['parameter environment templates'],
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def init_suite_run_dir(self, reg, user_at_host):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if "@" in user_at_host:
        owner, host = user_at_host.split("@", 1)
    else:
        owner, host = None, user_at_host
    # BUG FIX: self.initialised_hosts is keyed by user_at_host (see the
    # assignments below), so membership must be tested with
    # user_at_host, not host - otherwise "owner@host" entries never
    # match and the remote is re-initialised on every call.
    if (
        (owner, host) in [(None, "localhost"), (USER, "localhost")]
        or user_at_host in self.initialised_hosts
        or self.single_task_mode
    ):
        return
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite run directory", host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, "suite job log directory", host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item("remote shell template", host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, "wb").close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                "-n", user_at_host,
                "test", "-e", os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.initialised_hosts[user_at_host] = False
            return
    finally:
        # Always remove the local probe file.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    # BUG FIX: hoisted out of the "should_unlink" branch below, because
    # the python-library copy also needs it; previously it would be an
    # unbound local when task communication method is "poll".
    scp_tmpl = GLOBAL_CFG.get_host_item("remote copy template", host, owner)

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        "-n", user_at_host,
        "mkdir", "-p", r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        "task communication method", host, owner) != "poll"
    if should_unlink:
        cmds.append(shlex.split(scp_tmpl) + [
            "-p",
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
            user_at_host + ":" + r_suite_srv_dir + "/"])
    # Command to copy python library to remote host.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, "suite run directory"),
        "python")
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            "-pr", suite_run_py,
            user_at_host + ":" + r_suite_run_dir + "/"])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, " ".join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    # Remember whether remote service files need tidying at shutdown.
    self.initialised_hosts[user_at_host] = should_unlink
    LOG.info("Initialised %s:%s" % (user_at_host, r_suite_run_dir))
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return bad_tasks

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.summary['latest_message'] = self.REMOTE_INIT_MSG
            continue
        # Persist
        if owner:
            owner_at_host = owner + '@' + host
        else:
            owner_at_host = host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                'submit-num=%d, owner@host=%s' % (
                    itask.submit_num, owner_at_host),
                itask=itask)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                self.task_events_mgr.log_task_job_activity(
                    SuiteProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED, ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    self.poll_task_jobs)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append('--debug')
        remote_mode = False
        kwargs = {}
        # Add --host/--user options only for genuinely remote values.
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # Remote job files are fed to the command via stdin.
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
            # The job file is now (about to be) used: reset the file write
            # flag so that subsequent manual retrigger will generate a new
            # job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if itask.state.outputs.has_custom_triggers():
                self.suite_db_mgr.put_update_task_outputs(itask)
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(
                self.JOBS_SUBMIT,
                cmd,
                stdin_file_paths=stdin_file_paths,
                job_log_dirs=job_log_dirs,
                **kwargs),
            self._submit_task_jobs_callback, [suite, itasks])
    return done_tasks
def __init__(self, suite, task_name, task_point):
    """Set up the job-log base path and suite logger for one task."""
    self.base_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(suite, "suite job log directory"),
        str(task_point),
        task_name)
    self.suite_logger = logging.getLogger("main")
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    # Nothing to do for the suite host itself, an already-initialised
    # (host, owner), or in single task mode.
    if ((host, owner) in [('localhost', None), ('localhost', USER)]
            or (host, owner) in self.init_host_map
            or self.single_task_mode):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        # Always remove the local probe file.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
    # Command to copy python library to remote host.
    # NOTE(review): scp_tmpl is only bound inside the branch above, so
    # this copy would raise UnboundLocalError when the communication
    # method is "poll" and a python library exists - confirm intent.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
        'python')
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            '-pr', suite_run_py,
            user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join([quote(item) for item in cmd]),
                proc.returncode, out, err)
    # Record whether remote service files will need tidying later.
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def init_host(self, reg, host, owner):
    """Initialise suite run dir on a user@host.

    Create SUITE_RUN_DIR/log/job/ if necessary.
    Install suite contact environment file.
    Install suite python modules.

    Raise RemoteJobHostInitError if initialisation cannot complete.
    """
    if host is None:
        host = 'localhost'
    # Skip in single task mode, if already initialised, or if the
    # (host, owner) is not actually remote.
    if (self.single_task_mode or
            (host, owner) in self.init_host_map or
            not is_remote(host, owner)):
        return
    user_at_host = host
    if owner:
        user_at_host = owner + '@' + host
    r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite run directory', host, owner)
    r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
        reg, 'suite job log directory', host, owner)
    r_suite_srv_dir = os.path.join(
        r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

    # Create a UUID file in the service directory.
    # If remote host has the file in its service directory, we can assume
    # that the remote host has a shared file system with the suite host.
    ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
    uuid_str = str(uuid4())
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
    try:
        open(uuid_fname, 'wb').close()
        proc = Popen(
            shlex.split(ssh_tmpl) + [
                '-n', user_at_host,
                'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
            stdout=PIPE, stderr=PIPE)
        if proc.wait() == 0:
            # Initialised, but no need to tidy up
            self.init_host_map[(host, owner)] = False
            return
    finally:
        # Always remove the local probe file.
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass

    cmds = []
    # Command to create suite directory structure on remote host.
    cmds.append(shlex.split(ssh_tmpl) + [
        '-n', user_at_host,
        'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
    # Command to copy contact and authentication files to remote host.
    # Note: no need to do this if task communication method is "poll".
    should_unlink = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner) != "poll"
    if should_unlink:
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        # Handle not having SSL certs installed.
        try:
            ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
        except (SuiteServiceFileError, ValueError):
            ssl_cert = None
        cmds.append(shlex.split(scp_tmpl) + [
            '-p',
            self.suite_srv_files_mgr.get_contact_file(reg),
            self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
            user_at_host + ':' + r_suite_srv_dir + '/'])
        if ssl_cert is not None:
            # Insert the cert before the destination (last element).
            cmds[-1].insert(-1, ssl_cert)
    # Command to copy python library to remote host.
    # NOTE(review): scp_tmpl is only bound inside the branch above, so
    # this copy would raise UnboundLocalError when the communication
    # method is "poll" and a python library exists - confirm intent.
    suite_run_py = os.path.join(
        GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
        'python')
    if os.path.isdir(suite_run_py):
        cmds.append(shlex.split(scp_tmpl) + [
            '-pr', suite_run_py,
            user_at_host + ':' + r_suite_run_dir + '/'])
    # Run commands in sequence.
    for cmd in cmds:
        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        if proc.wait():
            raise RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_INIT,
                user_at_host, ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err)
    # Record whether remote service files will need tidying later.
    self.init_host_map[(host, owner)] = should_unlink
    LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
def get_dir_for_suite(suite):
    """Return the suite's log directory.

    Does not set up suite logging.
    """
    suite_log_dir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory')
    return suite_log_dir
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to
    identify shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete
    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        # A failed status is consumed on read, so the next call retries.
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status
    # Determine what items to install
    items = self._remote_init_items(host, owner)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]
    # Create "stdin_file_paths" file, with "items" in it.
    # The tar archive is streamed to "cylc remote-init" via stdin;
    # tmphandle is also passed to the callback - presumably to keep the
    # temp file alive until the command completes (confirm).
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    if not os.path.exists(uuid_fname):
        # NOTE(review): writes a str to a 'wb' handle - Python 2
        # semantics; revisit before porting to Python 3.
        open(uuid_fname, 'wb').write(str(self.uuid))
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    cmd.append(str(self.uuid))
    cmd.append(GLOBAL_CFG.get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job.

    Apply broadcast overrides, bump the submit number, resolve the task
    host/owner and polling configuration, then return the job config
    dict used to write the job file.
    """
    # Apply broadcast overrides on a deep copy of the task's runtime
    # config so the shared tdef config is not mutated.
    overrides = BroadcastServer.get_inst().get(itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))
    itask.task_owner = rtconfig['remote']['owner']

    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host

    # Batch system settings for this task's host, if configured.
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        # No execution time limit configured.
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals',
                [60, 120, 420])))
    # Reuse existing poll timers, else create them from host config.
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def _get_derived_host_item(job_conf, key):
    """Return derived host item from GLOBAL_CFG."""
    suite_name = job_conf['suite_name']
    host = job_conf["host"]
    owner = job_conf["owner"]
    return GLOBAL_CFG.get_derived_host_item(suite_name, key, host, owner)
def get_latest_job_log(cls, suite, task_name, task_point):
    """Return the latest job log path on the suite host."""
    return os.path.join(
        GLOBAL_CFG.get_derived_host_item(suite, "suite job log directory"),
        str(task_point), task_name, "NN", "job")
def __init__(self, cfg, updater, theme, info_bar, xdot):
    """Initialise the graph-view updater state.

    Fix: ``self.prev_graph_id = ()`` was assigned twice in the original
    (dead duplicate); the redundant second assignment is removed.
    """
    super(GraphUpdater, self).__init__()
    self.quit = False
    self.cleared = False
    self.ignore_suicide = True
    self.focus_start_point_string = None
    self.focus_stop_point_string = None
    self.xdot = xdot
    self.first_update = False
    self.graph_disconnect = False
    self.action_required = True
    self.oldest_point_string = None
    self.newest_point_string = None
    self.orientation = "TB"  # Top to Bottom ordering of nodes
    self.best_fit = True  # zoom to page size
    self.normal_fit = False  # zoom to 1.0 scale
    self.crop = False
    self.subgraphs_on = False  # organise by cycle point.
    self.descendants = {}
    self.all_families = []
    self.write_dot_frames = False
    self.prev_graph_id = ()
    self.cfg = cfg
    self.updater = updater
    self.theme = theme
    self.info_bar = info_bar
    self.state_summary = {}
    self.fam_state_summary = {}
    self.global_summary = {}
    self.last_update_time = None
    self.god = None
    self.mode = "waiting..."
    self.update_time_str = "waiting..."
    # empty graphw object:
    self.graphw = CGraphPlain(self.cfg.suite)
    # TODO - handle failure to get a remote proxy in reconnect()
    self.graph_warned = {}
    # lists of nodes to newly group or ungroup (not of all currently
    # grouped and ungrouped nodes - still held server side)
    self.group = []
    self.ungroup = []
    self.have_leaves_and_feet = False
    self.leaves = []
    self.feet = []
    self.ungroup_recursive = False
    if "graph" in self.cfg.ungrouped_views:
        self.ungroup_all = True
        self.group_all = False
    else:
        self.ungroup_all = False
        self.group_all = True
    self.graph_frame_count = 0
    self.suite_share_dir = GLOBAL_CFG.get_derived_host_item(
        self.cfg.suite, 'suite share directory')
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job.

    Apply broadcast overrides, bump the submit number, resolve the task
    host/owner and polling configuration, then return the job config
    dict used to write the job file.
    """
    # Apply broadcast overrides on a deep copy of the task's runtime
    # config so the shared tdef config is not mutated.
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))
    itask.task_owner = rtconfig['remote']['owner']

    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host

    # Batch system settings for this task's host, if configured.
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        # No execution time limit configured.
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals',
                [60, 120, 420])))
    # Reuse existing poll timers, else create them from host config.
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    if itask.state.outputs.has_custom_triggers():
        self.suite_db_mgr.put_update_task_outputs(itask)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False
    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def _write_environment_1(self, handle, job_conf):
    """Write suite and task environment export lines to the job script.

    Fix: the home-directory substitution used ``os.environ["HOME"]`` as
    a raw regular expression pattern; a $HOME containing regex
    metacharacters (e.g. "+") would mis-match. Now escaped with
    ``re.escape``.
    """
    handle.write("\n\n# CYLC SUITE ENVIRONMENT:")
    # write the static suite variables
    for var, val in sorted(self.suite_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    if str(self.suite_env.get("CYLC_UTC")) == "True":
        handle.write("\nexport TZ=UTC")

    handle.write("\n")
    # override and write task-host-specific suite variables
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite work directory",
        job_conf["host"], job_conf["owner"])
    st_env = {}
    st_env["CYLC_SUITE_RUN_DIR"] = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite run directory",
        job_conf["host"], job_conf["owner"])
    st_env["CYLC_SUITE_WORK_DIR"] = suite_work_dir
    st_env["CYLC_SUITE_SHARE_DIR"] = GLOBAL_CFG.get_derived_host_item(
        job_conf["suite name"], "suite share directory",
        job_conf["host"], job_conf["owner"])
    # DEPRECATED
    st_env["CYLC_SUITE_SHARE_PATH"] = "$CYLC_SUITE_SHARE_DIR"
    rsp = job_conf["remote suite path"]
    if rsp:
        st_env["CYLC_SUITE_DEF_PATH"] = rsp
    else:
        # replace home dir with '$HOME' for evaluation on the task host
        # (re.escape guards against regex metacharacters in $HOME)
        st_env["CYLC_SUITE_DEF_PATH"] = re.sub(
            re.escape(os.environ["HOME"]), "$HOME",
            self.suite_env["CYLC_SUITE_DEF_PATH_ON_SUITE_HOST"])
    for var, val in sorted(st_env.items()):
        handle.write("\nexport " + var + "=" + str(val))

    task_work_dir = os.path.join(
        suite_work_dir, job_conf["work sub-directory"])
    use_login_shell = GLOBAL_CFG.get_host_item(
        "use login shell", job_conf["host"], job_conf["owner"])
    comms = GLOBAL_CFG.get_host_item(
        "task communication method", job_conf["host"], job_conf["owner"])

    task_name, point_string = TaskID.split(job_conf["task id"])
    handle.write("\n\n# CYLC TASK ENVIRONMENT:")
    handle.write("\nexport CYLC_TASK_COMMS_METHOD=" + comms)
    handle.write("\nexport CYLC_TASK_CYCLE_POINT=" + point_string)
    handle.write("\nexport CYLC_TASK_CYCLE_TIME=" + point_string)
    handle.write("\nexport CYLC_TASK_ID=" + job_conf["task id"])
    handle.write(
        "\nexport CYLC_TASK_IS_COLDSTART=" + str(job_conf["is cold-start"]))
    handle.write(
        "\nexport CYLC_TASK_LOG_ROOT=" + job_conf["job file path"])
    handle.write(
        "\nexport CYLC_TASK_MSG_MAX_TRIES=" + str(GLOBAL_CFG.get(
            ["task messaging", "maximum number of tries"])))
    handle.write(
        "\nexport CYLC_TASK_MSG_RETRY_INTVL=" + str(GLOBAL_CFG.get(
            ["task messaging", "retry interval"])))
    handle.write(
        "\nexport CYLC_TASK_MSG_TIMEOUT=" + str(GLOBAL_CFG.get(
            ["task messaging", "connection timeout"])))
    handle.write("\nexport CYLC_TASK_NAME=" + task_name)
    handle.write(
        '\nexport CYLC_TASK_NAMESPACE_HIERARCHY="' +
        " ".join(job_conf["namespace hierarchy"]) + '"')
    handle.write(
        "\nexport CYLC_TASK_SSH_LOGIN_SHELL=" + str(use_login_shell))
    handle.write(
        "\nexport CYLC_TASK_SUBMIT_NUMBER=" + str(
            job_conf["absolute submit number"]))
    handle.write(
        "\nexport CYLC_TASK_TRY_NUMBER=" + str(job_conf["try number"]))
    handle.write("\nexport CYLC_TASK_WORK_DIR=" + task_work_dir)
    # DEPRECATED
    handle.write("\nexport CYLC_TASK_WORK_PATH=$CYLC_TASK_WORK_DIR")
    handle.write("\nexport CYLC_JOB_PID=$$")
def __init__(self, task_id, suite, jobconfig, submit_num):
    """Initialise a job submission context for one task job.

    Sets up local or remote job log file paths, registers them with
    the task's log-file tracker, seeds overridable directive defaults
    in the job configuration, then invokes the overridable set-up
    methods.

    Args:
        task_id: task identifier string.
        suite: suite name.
        jobconfig: dict of job configuration items ("log files",
            "task host", "task owner", job file paths, etc.).
        submit_num: job submit number.
    """
    self.jobconfig = jobconfig
    self.task_id = task_id
    self.suite = suite
    self.logfiles = jobconfig.get('log files')
    self.command = None
    self.job_submit_command_template = jobconfig.get('command template')
    common_job_log_path = jobconfig.get('common job log path')
    self.local_jobfile_path = jobconfig.get('local job file path')
    self.logfiles.add_path(self.local_jobfile_path)
    task_host = jobconfig.get('task host')
    task_owner = jobconfig.get('task owner')
    self.remote_shell_template = GLOBAL_CFG.get_host_item(
        'remote shell template', task_host, task_owner)
    if is_remote_host(task_host) or is_remote_user(task_owner):
        # REMOTE TASK
        self.local = False
        self.task_owner = task_owner or None
        self.task_host = task_host or socket.gethostname()
        remote_job_log_dir = GLOBAL_CFG.get_derived_host_item(
            self.suite, 'suite job log directory',
            self.task_host, self.task_owner)
        remote_jobfile_path = os.path.join(
            remote_job_log_dir, common_job_log_path)
        # Remote log files.
        self.stdout_file = remote_jobfile_path + ".out"
        self.stderr_file = remote_jobfile_path + ".err"
        # Used in command construction.
        self.jobfile_path = remote_jobfile_path
        # Record paths of remote log files for access by gui, as ssh
        # URLs ([owner@]host:path).
        # NOTE: if the remote and suite hosts shared a filesystem (or
        # the remote task were really a local task under a different
        # owner) plain local paths could be used instead; but that
        # would need (a) namespace config to declare the common
        # filesystem and (b) a log directory form usable under the
        # suite owner account ('$HOME' as used for remote task
        # execution would not work) - so only the ssh URL form is
        # supported here.
        url_prefix = self.task_host
        if self.task_owner:
            url_prefix = self.task_owner + "@" + url_prefix
        self.logfiles.add_path(url_prefix + ':' + self.stdout_file)
        self.logfiles.add_path(url_prefix + ':' + self.stderr_file)
    else:
        # LOCAL TASK
        self.local = True
        self.task_owner = None
        # Used in command construction.
        self.jobfile_path = self.local_jobfile_path
        # Local stdout and stderr log file paths.
        self.stdout_file = self.local_jobfile_path + ".out"
        self.stderr_file = self.local_jobfile_path + ".err"
        # Interpolate environment variables in extra logs (in place,
        # as the list object may be shared).
        self.logfiles.paths[:] = [
            expandvars(path) for path in self.logfiles.paths]
        # Record paths of local log files for access by gui.
        self.logfiles.add_path(self.stdout_file)
        self.logfiles.add_path(self.stderr_file)
    # Defaults that can be overridden by derived classes.
    self.jobconfig['directive prefix'] = None
    self.jobconfig['directive final'] = "# FINAL DIRECTIVE"
    self.jobconfig['directive connector'] = " "
    self.jobconfig['job vacation signal'] = None
    # Overridable methods.
    self.set_directives()
    self.set_job_vacation_signal()
    self.set_scripting()
    self.set_environment()
def _write_environment_1(self, handle, job_conf):
    """Write the suite and task environment sections of a job script.

    Exports, in order: the static suite-wide variables, then
    task-host-specific overrides of suite variables, then the
    task-specific variables.

    Args:
        handle: writable file object for the job script.
        job_conf: dict of job configuration items ("suite name",
            "host", "owner", "task id", etc.).
    """
    suite_name = job_conf['suite name']
    host = job_conf['host']
    owner = job_conf['owner']

    # Static suite-wide variables.
    handle.write("\n\n# CYLC SUITE ENVIRONMENT:")
    for key in sorted(self.suite_env):
        handle.write("\nexport %s=%s" % (key, self.suite_env[key]))
    if str(self.suite_env.get('CYLC_UTC')) == 'True':
        handle.write("\nexport TZ=UTC")
    handle.write("\n")

    # Task-host-specific overrides of suite variables.
    suite_work_dir = GLOBAL_CFG.get_derived_host_item(
        suite_name, 'suite work directory', host, owner)
    overrides = {
        'CYLC_SUITE_RUN_DIR': GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite run directory', host, owner),
        'CYLC_SUITE_WORK_DIR': suite_work_dir,
        'CYLC_SUITE_SHARE_DIR': GLOBAL_CFG.get_derived_host_item(
            suite_name, 'suite share directory', host, owner),
        # DEPRECATED variable, retained for backward compatibility.
        'CYLC_SUITE_SHARE_PATH': '$CYLC_SUITE_SHARE_DIR',
    }
    remote_suite_path = job_conf['remote suite path']
    if remote_suite_path:
        overrides['CYLC_SUITE_DEF_PATH'] = remote_suite_path
    else:
        # Replace the suite host home directory with '$HOME' for
        # evaluation on the task host.
        # BUG FIX: the home directory must be escaped before use as a
        # regular expression pattern, otherwise any regex
        # metacharacter in the path corrupts the substitution.
        overrides['CYLC_SUITE_DEF_PATH'] = re.sub(
            re.escape(os.environ['HOME']), '$HOME',
            self.suite_env['CYLC_SUITE_DEF_PATH_ON_SUITE_HOST'])
    for key in sorted(overrides):
        handle.write("\nexport %s=%s" % (key, overrides[key]))

    # Task-specific variables, written in this fixed order.
    task_work_dir = os.path.join(
        suite_work_dir, job_conf['work sub-directory'])
    use_login_shell = GLOBAL_CFG.get_host_item(
        'use login shell', host, owner)
    comms = GLOBAL_CFG.get_host_item(
        'task communication method', host, owner)
    task_name, point_string = TaskID.split(job_conf['task id'])
    task_env = [
        ('CYLC_TASK_COMMS_METHOD', comms),
        ('CYLC_TASK_CYCLE_POINT', point_string),
        ('CYLC_TASK_CYCLE_TIME', point_string),
        ('CYLC_TASK_ID', job_conf['task id']),
        ('CYLC_TASK_IS_COLDSTART', job_conf['is cold-start']),
        ('CYLC_TASK_LOG_ROOT', job_conf['job file path']),
        ('CYLC_TASK_MSG_MAX_TRIES',
         GLOBAL_CFG.get(['task messaging', 'maximum number of tries'])),
        # The intervals are rendered with '%f' as in the original.
        ('CYLC_TASK_MSG_RETRY_INTVL',
         '%f' % GLOBAL_CFG.get(['task messaging', 'retry interval'])),
        ('CYLC_TASK_MSG_TIMEOUT',
         '%f' % GLOBAL_CFG.get(['task messaging', 'connection timeout'])),
        ('CYLC_TASK_NAME', task_name),
        ('CYLC_TASK_NAMESPACE_HIERARCHY',
         '"' + ' '.join(job_conf['namespace hierarchy']) + '"'),
        ('CYLC_TASK_SSH_LOGIN_SHELL', use_login_shell),
        ('CYLC_TASK_SUBMIT_NUMBER', job_conf['submit num']),
        ('CYLC_TASK_TRY_NUMBER', job_conf['try number']),
        ('CYLC_TASK_WORK_DIR', task_work_dir),
        # DEPRECATED variable, retained for backward compatibility.
        ('CYLC_TASK_WORK_PATH', '$CYLC_TASK_WORK_DIR'),
        (TaskMessage.CYLC_JOB_PID, '$$'),
    ]
    handle.write("\n\n# CYLC TASK ENVIRONMENT:")
    for key, value in task_env:
        handle.write("\nexport %s=%s" % (key, value))
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy" on each remote that completed
    initialisation. This method is called on suite shutdown, so we
    want nothing to hang: each remote command is run under
    "timeout 10", all commands are issued in parallel, and any still
    incomplete after 10 seconds are terminated.

    Also remove the UUID file on the suite host (".service/uuid").
    """
    from time import sleep

    def _warn_failure(key, cmd, proc, out, err):
        # Log a remote-tidy failure for host/owner pair `key`.
        LOG.warning(TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_TIDY, key,
            ' '.join(quote(item) for item in cmd),
            # BUG FIX: Popen exposes "returncode", not "ret_code";
            # the old attribute raised AttributeError on failure.
            proc.returncode, out, err))

    # Remove UUID file (best effort).
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel.
    # Share one devnull handle for all stdin redirections and close it
    # after spawning (each child dups the fd), fixing a per-process
    # file descriptor leak.
    procs = {}
    with open(os.devnull) as devnull:
        for (host, owner), init_with_contact in (
                self.remote_init_map.items()):
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flags.debug:
                cmd.append('--debug')
            cmd.append(os.path.join(GLOBAL_CFG.get_derived_host_item(
                self.suite, 'suite run directory', host, owner)))
            procs[(host, owner)] = (
                cmd,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=devnull))
    # Wait for commands to complete for a max of 10 seconds.
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = proc.communicate()
            if proc.wait():
                _warn_failure((host, owner), cmd, proc, out, err)
        # Avoid a busy-wait while processes are still running.
        if procs:
            sleep(0.1)
    # Terminate any remaining commands.
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            _warn_failure((host, owner), cmd, proc, out, err)