Ejemplo n.º 1
0
    def _setup_job_logs_retrieval(self, itask, event):
        """Set up remote job logs retrieval.

        For a task with a job completion event, i.e. succeeded, failed,
        (execution) retry.
        """
        id_key = ((self.HANDLER_JOB_LOGS_RETRIEVE, event), str(itask.point),
                  itask.tdef.name, itask.submit_num)
        events = (self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)
        host = get_host_from_platform(itask.platform)
        if (event not in events or not is_remote_host(host)
                or not self.get_host_conf(itask, "retrieve job logs")
                or id_key in self.event_timers):
            return
        retry_delays = self.get_host_conf(itask,
                                          "retrieve job logs retry delays")
        if not retry_delays:
            retry_delays = [0]
        self.event_timers[id_key] = TaskActionTimer(
            TaskJobLogsRetrieveContext(
                self.HANDLER_JOB_LOGS_RETRIEVE,  # key
                self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
                itask.platform['name'],
                self.get_host_conf(itask, "retrieve job logs max size"),
            ),
            retry_delays)
Ejemplo n.º 2
0
async def scan_one(reg, host, port, timeout=None, methods=None):
    if not methods:
        methods = ['identify']

    if is_remote_host(host):
        try:
            host = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if cylc.flow.flags.debug:
                raise
            sys.stderr.write("ERROR: %s: %s\n" % (exc, host))
            return (reg, host, port, None)

    # NOTE: Connect to the suite by host:port, this was the
    #       SuiteRuntimeClient will not attempt to check the contact file
    #       which would be unnecessary as we have already done so.
    # NOTE: This part of the scan *is* IO blocking.
    client = SuiteRuntimeClient(reg, host=host, port=port, timeout=timeout)

    result = {}
    for method in methods:
        # work our way up the chain of identity methods, extract as much
        # information as we can before the suite rejects us
        try:
            msg = await client.async_request(method)
        except ClientTimeout as exc:
            LOG.exception(f"Timeout: name:{reg}, host:{host}, port:{port}")
            return (reg, host, port, MSG_TIMEOUT)
        except ClientError as exc:
            LOG.exception("ClientError")
            return (reg, host, port, result or None)
        else:
            result.update(msg)
    return (reg, host, port, result)
Ejemplo n.º 3
0
    def rsync_255_fail(ctx, platform=None) -> bool:
        """Tests context for rsync failing to communicate with a host.

        If there has been a failure caused by rsync being unable to connect
        try a test of ssh connectivity. Necessary becuase loss of connectivity
        may cause different rsync failures depending on version, and some of
        the failures may be caused by other problems.
        """
        rsync_255_fail = False
        platform_rsync_cmd = (platform['rsync command']
                              if platform is not None else 'rsync')
        rsync_cmd = shlex.split(platform_rsync_cmd)
        if (ctx.cmd[0] == rsync_cmd[0] and ctx.ret_code not in [0, 255]
                and is_remote_host(ctx.host)):
            ssh_cmd = (platform['ssh command']
                       if platform is not None else 'ssh')
            ssh_test_cmd = shlex.split(f'{ssh_cmd} {ctx.host} true')
            LOG.info(
                f'testing connectivity for {ctx.host} using {ssh_test_cmd}')
            ssh_test = run(shlex.split(f'{ssh_cmd} {ctx.host} true'),
                           capture_output=True)
            if ssh_test.returncode == 255:
                rsync_255_fail = True
        elif (ctx.cmd[0] == rsync_cmd[0] and ctx.ret_code == 255):
            rsync_255_fail = True
        return rsync_255_fail
Ejemplo n.º 4
0
Archivo: scan.py Proyecto: cylc/cylc
async def scan_one(reg, host, port, timeout=None, methods=None):
    if not methods:
        methods = ['identify']

    if is_remote_host(host):
        try:
            host = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if cylc.flow.flags.debug:
                raise
            sys.stderr.write("ERROR: %s: %s\n" % (exc, host))
            return (reg, host, port, None)

    # NOTE: Connect to the suite by host:port, this was the
    #       SuiteRuntimeClient will not attempt to check the contact file
    #       which would be unnecessary as we have already done so.
    # NOTE: This part of the scan *is* IO blocking.
    client = SuiteRuntimeClient(reg, host=host, port=port, timeout=timeout)

    result = {}
    for method in methods:
        # work our way up the chain of identity methods, extract as much
        # information as we can before the suite rejects us
        try:
            msg = await client.async_request(method)
        except ClientTimeout as exc:
            return (reg, host, port, MSG_TIMEOUT)
        except ClientError as exc:
            return (reg, host, port, result or None)
        else:
            result.update(msg)
    return (reg, host, port, result)
Ejemplo n.º 5
0
    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SubProcContext(cmd_key, cmd), callback, [suite, itasks])
Ejemplo n.º 6
0
    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(
                    get_task_job_id(itask.point, itask.tdef.name,
                                    itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(SubProcContext(cmd_key, cmd), callback,
                                       [suite, itasks])
Ejemplo n.º 7
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        # Remove UUID file
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), init_with_contact in self.remote_init_map.items():
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
                self.suite, 'suite run directory', host, owner)))
            procs[(host, owner)] = (
                cmd,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    LOG.warning(TaskRemoteMgmtError(
                        TaskRemoteMgmtError.MSG_TIDY,
                        (host, owner), ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    (host, owner), ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
Ejemplo n.º 8
0
def _get_metrics(hosts, metrics, data=None):
    """Retrieve host metrics using SSH if necessary.

    Note hosts will not appear in the returned results if:
    * They are not contactable.
    * There is an error in the command which returns the results.

    Args:
        hosts (list):
            List of host fqdns.
        metrics (list):
            List in the form [(function, arg1, arg2, ...), ...]
        data (dict):
            Used for logging success/fail outcomes of the form {host: {}}

    Examples:
        Command failure:
        >>> _get_metrics(['localhost'], [['elephant']])
        ({}, {'localhost': {'get_metrics': 'Command failed (exit: 1)'}})

    Returns:
        dict - {host: {(function, arg1, arg2, ...): result}}

    """
    host_stats = {}
    proc_map = {}
    if not data:
        data = {host: dict() for host in hosts}

    # Start up commands on hosts
    cmd = ['psutil']
    kwargs = {'stdin_str': json.dumps(metrics), 'capture_process': True}
    for host in hosts:
        if is_remote_host(host):
            proc_map[host] = remote_cylc_cmd(cmd, host=host, **kwargs)
        else:
            proc_map[host] = run_cmd(['cylc'] + cmd, **kwargs)

    # Collect results from commands
    while proc_map:
        for host, proc in list(proc_map.copy().items()):
            if proc.poll() is None:
                continue
            del proc_map[host]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning('Could not evaluate "%s" (return code %d)\n%s',
                            host, proc.returncode, err)
                data[host]['get_metrics'] = (
                    f'Command failed (exit: {proc.returncode})')
            else:
                host_stats[host] = dict(
                    zip(
                        metrics,
                        # convert JSON dicts -> namedtuples
                        _deserialise(metrics, parse_dirty_json(out))))
        sleep(0.01)
    return host_stats, data
Ejemplo n.º 9
0
def _distribute(host):
    """Re-invoke this command on a different host if requested."""
    # Check whether a run host is explicitly specified, else select one.
    if not host:
        host = select_workflow_host()[0]
    if is_remote_host(host):
        # Prevent recursive host selection
        cmd = sys.argv[1:]
        cmd.append("--host=localhost")
        _remote_cylc_cmd(cmd, host=host)
        sys.exit(0)
Ejemplo n.º 10
0
    def remote_host_select(self, host_str):
        """Evaluate a task host string.

        Arguments:
            host_str (str):
                An explicit host name, a command in back-tick or $(command)
                format, or an environment variable holding a hostname.

        Return (str):
            None if evaluate of host_str is still taking place.
            'localhost' if host_str is not defined or if the evaluated host
            name is equivalent to 'localhost'.
            Otherwise, return the evaluated host name on success.

        Raise TaskRemoteMgmtError on error.

        """
        if not host_str:
            return 'localhost'

        # Host selection command: $(command) or `command`
        match = REC_COMMAND.match(host_str)
        if match:
            cmd_str = match.groups()[1]
            if cmd_str in self.remote_host_str_map:
                # Command recently launched
                value = self.remote_host_str_map[cmd_str]
                if isinstance(value, TaskRemoteMgmtError):
                    raise value  # command failed
                elif value is None:
                    return  # command not yet ready
                else:
                    host_str = value  # command succeeded
            else:
                # Command not launched (or already reset)
                timeout = glbl_cfg().get(['task host select command timeout'])
                if timeout:
                    cmd = ['timeout', str(int(timeout)), 'bash', '-c', cmd_str]
                else:
                    cmd = ['bash', '-c', cmd_str]
                self.proc_pool.put_command(
                    SubProcContext(
                        'remote-host-select', cmd, env=dict(os.environ)),
                    self._remote_host_select_callback, [cmd_str])
                self.remote_host_str_map[cmd_str] = None
                return self.remote_host_str_map[cmd_str]

        # Environment variable substitution
        host_str = os.path.expandvars(host_str)
        # Remote?
        if is_remote_host(host_str):
            return host_str
        else:
            return 'localhost'
Ejemplo n.º 11
0
def scheduler_cli(parser, options, args, is_restart=False):
    """CLI main."""
    reg = args[0]
    # Check suite is not already running before start of host selection.
    try:
        suite_files.detect_old_contact_file(reg)
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    suite_run_dir = get_suite_run_dir(reg)

    if not os.path.exists(suite_run_dir):
        sys.stderr.write(f'suite service directory not found '
                         f'at: {suite_run_dir}\n')
        sys.exit(1)

    # Create auth files if needed.
    suite_files.create_auth_files(reg)

    # Extract job.sh from library, for use in job scripts.
    extract_resources(suite_files.get_suite_srv_dir(reg), ['etc/job.sh'])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flow.flags.debug:
                raise
            else:
                sys.exit(str(exc))
        if is_remote_host(host):
            if is_restart:
                base_cmd = ["restart"] + sys.argv[1:]
            else:
                base_cmd = ["run"] + sys.argv[1:]
            # Prevent recursive host selection
            base_cmd.append("--host=localhost")
            return remote_cylc_cmd(base_cmd, host=host)
    if remrun(set_rel_local=True):  # State localhost as above.
        sys.exit()

    try:
        suite_files.get_suite_source_dir(args[0], options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        suite_files.register(args[0], get_suite_run_dir(args[0]))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
Ejemplo n.º 12
0
async def scan_one(reg, host, port, pub_port, api, timeout=None, methods=None):
    """Connect to and identify workflow server if possible.

    Args:
        reg (str): Registered name of workflow.
        host (str): Workflow host.
        port (int): Workflow server port.
        pub_port (int): Workflow publisher port.
        api (str): Workflow API version.
        timeout (float, optional): Client socket receiver timeout.
        methods (list): List of methods/endpoints to request.

    Returns:
        tuple: (reg, host, port, pub_port, result)

    """
    if not methods:
        methods = ['identify']

    if is_remote_host(host):
        try:
            host = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if cylc.flow.flags.debug:
                raise
            sys.stderr.write("ERROR: %s: %s\n" % (exc, host))
            return (reg, host, port, pub_port, api, None)

    # NOTE: Connect to the suite by host:port, this was the
    #       SuiteRuntimeClient will not attempt to check the contact file
    #       which would be unnecessary as we have already done so.
    # NOTE: This part of the scan *is* IO blocking.
    client = SuiteRuntimeClient(reg, host=host, port=port, timeout=timeout)

    result = {}
    for method in methods:
        # work our way up the chain of identity methods, extract as much
        # information as we can before the suite rejects us
        try:
            msg = await client.async_request(method)
        except ClientTimeout as exc:
            LOG.exception(
                "Timeout: name:%s, host:%s, port:%s", reg, host, port)
            return (reg, host, port, pub_port, api, MSG_TIMEOUT)
        except ClientError as exc:
            LOG.exception("ClientError")
            return (reg, host, port, pub_port, api, result or None)
        else:
            result.update(msg)
    return (reg, host, port, pub_port, api, result)
Ejemplo n.º 13
0
def _distribute(host, is_restart):
    """Re-invoke this command on a different host if requested."""
    # Check whether a run host is explicitly specified, else select one.
    if not host:
        host = select_suite_host()[0]
    if is_remote_host(host):
        if is_restart:
            base_cmd = ["restart"] + sys.argv[1:]
        else:
            base_cmd = ["run"] + sys.argv[1:]
        # Prevent recursive host selection
        base_cmd.append("--host=localhost")
        remote_cylc_cmd(base_cmd, host=host)
        sys.exit(0)
Ejemplo n.º 14
0
    def remote_host_select(self, host_str):
        """Evaluate a task host string.

        Arguments:
            host_str (str):
                An explicit host name, a command in back-tick or $(command)
                format, or an environment variable holding a hostname.

        Return (str):
            None if evaluate of host_str is still taking place.
            'localhost' if host_str is not defined or if the evaluated host
            name is equivalent to 'localhost'.
            Otherwise, return the evaluated host name on success.

        Raise TaskRemoteMgmtError on error.

        """
        if not host_str:
            return 'localhost'

        # Host selection command: $(command) or `command`
        match = REC_COMMAND.match(host_str)
        if match:
            cmd_str = match.groups()[1]
            if cmd_str in self.remote_host_str_map:
                # Command recently launched
                value = self.remote_host_str_map[cmd_str]
                if isinstance(value, TaskRemoteMgmtError):
                    raise value  # command failed
                elif value is None:
                    return  # command not yet ready
                else:
                    host_str = value  # command succeeded
            else:
                # Command not launched (or already reset)
                self.proc_pool.put_command(
                    SubProcContext('remote-host-select',
                                   ['bash', '-c', cmd_str],
                                   env=dict(os.environ)),
                    self._remote_host_select_callback, [cmd_str])
                self.remote_host_str_map[cmd_str] = None
                return self.remote_host_str_map[cmd_str]

        # Environment variable substitution
        host_str = os.path.expandvars(host_str)
        # Remote?
        if is_remote_host(host_str):
            return host_str
        else:
            return 'localhost'
Ejemplo n.º 15
0
    def _get_host_metrics(self):
        """Run "cylc get-host-metrics" commands on hosts.

        Return (dict): {host: host-metrics-dict, ...}
        """
        host_stats = {}
        # Run "cylc get-host-metrics" commands on hosts
        host_proc_map = {}
        cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
        # Start up commands on hosts
        for host in self.hosts:
            if is_remote_host(host):
                host_proc_map[host] = remote_cylc_cmd(cmd,
                                                      stdin=None,
                                                      host=host,
                                                      capture_process=True)
            elif 'localhost' in host_proc_map:
                continue  # Don't duplicate localhost
            else:
                # 1st instance of localhost
                host_proc_map['localhost'] = run_cmd(['cylc'] + cmd,
                                                     capture_process=True)
        # Collect results from commands
        while host_proc_map:
            for host, proc in list(host_proc_map.copy().items()):
                if proc.poll() is None:
                    continue
                del host_proc_map[host]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    # Command failed in verbose/debug mode
                    LOG.warning(
                        "can't get host metric from '%s'" +
                        "%s  # returncode=%d, err=%s\n", host, ' '.join(
                            (quote(item) for item in cmd)), proc.returncode,
                        err)
                else:
                    # Command OK
                    # Users may have profile scripts that write to STDOUT.
                    # Drop all output lines until the the first character of a
                    # line is '{'. Hopefully this is enough to find us the
                    # first line that denotes the beginning of the expected
                    # JSON data structure.
                    out = ''.join(
                        dropwhile(lambda s: not s.startswith('{'),
                                  out.splitlines(True)))
                    host_stats[host] = json.loads(out)
            sleep(0.01)
        return host_stats
Ejemplo n.º 16
0
    def _get_host_metrics(self):
        """Run "cylc get-host-metrics" commands on hosts.

        Return (dict): {host: host-metrics-dict, ...}
        """
        host_stats = {}
        # Run "cylc get-host-metrics" commands on hosts
        host_proc_map = {}
        cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
        # Start up commands on hosts
        for host in self.hosts:
            if is_remote_host(host):
                host_proc_map[host] = remote_cylc_cmd(
                    cmd, stdin=None, host=host, capture_process=True)
            elif 'localhost' in host_proc_map:
                continue  # Don't duplicate localhost
            else:
                # 1st instance of localhost
                host_proc_map['localhost'] = run_cmd(
                    ['cylc'] + cmd, capture_process=True)
        # Collect results from commands
        while host_proc_map:
            for host, proc in list(host_proc_map.copy().items()):
                if proc.poll() is None:
                    continue
                del host_proc_map[host]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    # Command failed in verbose/debug mode
                    LOG.warning(
                        "can't get host metric from '%s'" +
                        "%s  # returncode=%d, err=%s\n",
                        host, ' '.join((quote(item) for item in cmd)),
                        proc.returncode, err)
                else:
                    # Command OK
                    # Users may have profile scripts that write to STDOUT.
                    # Drop all output lines until the the first character of a
                    # line is '{'. Hopefully this is enough to find us the
                    # first line that denotes the beginning of the expected
                    # JSON data structure.
                    out = ''.join(dropwhile(
                        lambda s: not s.startswith('{'), out.splitlines(True)))
                    host_stats[host] = json.loads(out)
            sleep(0.01)
        return host_stats
Ejemplo n.º 17
0
async def est_workflow(reg, host, port, pub_port, context=None, timeout=None):
    """Establish communication with workflow, instantiating REQ client."""
    if is_remote_host(host):
        try:
            host = get_host_ip_by_name(host)  # IP reduces DNS traffic
        except socket.error as exc:
            if flags.debug:
                raise
            logger.error("ERROR: %s: %s\n", exc, host)
            return (reg, host, port, pub_port, None)

    # NOTE: Connect to the suite by host:port. This way the
    #       SuiteRuntimeClient will not attempt to check the contact file
    #       which would be unnecessary as we have already done so.
    # NOTE: This part of the scan *is* IO blocking.
    client = SuiteRuntimeClient(reg, context=context, timeout=timeout)
    _, result = await workflow_request(client, 'identify')
    return (reg, host, port, pub_port, client, result)
Ejemplo n.º 18
0
def scheduler_cli(parser, options, args, is_restart=False):
    """CLI main."""
    # Check suite is not already running before start of host selection.
    try:
        SuiteSrvFilesManager().detect_old_contact_file(args[0])
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    # Create auth files if needed.
    SuiteSrvFilesManager().create_auth_files(args[0])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flow.flags.debug:
                raise
            else:
                sys.exit(str(exc))
        if is_remote_host(host):
            if is_restart:
                base_cmd = ["restart"] + sys.argv[1:]
            else:
                base_cmd = ["run"] + sys.argv[1:]
            # Prevent recursive host selection
            base_cmd.append("--host=localhost")
            return remote_cylc_cmd(base_cmd, host=host)
    if remrun(set_rel_local=True):  # State localhost as above.
        sys.exit()

    try:
        SuiteSrvFilesManager().get_suite_source_dir(args[0], options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        SuiteSrvFilesManager().register(args[0], get_suite_run_dir(args[0]))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
Ejemplo n.º 19
0
    def remote_init(self, platform, curve_auth,
                    client_pub_key_dir):
        """Initialise a remote [owner@]host if necessary.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            "python/": if source exists

        Args:
            curve_auth (ThreadAuthenticator):
                The ZMQ authenticator.
            client_pub_key_dir (str):
                Client public key directory, used by the ZMQ authenticator.
            platform (dict):
                A dictionary containing settings relating to platform used in
                this remote installation.

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        self.install_target = platform['install target']

        # If task is running locally or the install target is localhost
        # we can skip the rest of this function
        if (self.install_target == 'localhost' or
                self.single_task_mode or
                not is_remote_host(get_host_from_platform(platform))):
            LOG.debug(f"REMOTE INIT NOT REQUIRED for {self.install_target}")
            return REMOTE_INIT_NOT_REQUIRED

        # See if a previous failed attempt to initialize this platform has
        # occurred.
        try:
            status = self.remote_init_map[platform['install target']]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[platform['install target']]
            return status

        # Determine what items to install
        comm_meth = platform['communication method']

        # Get a list of files and folders to install;
        # if nothing needs install say so to remote_init_map and return.
        items = self._remote_init_items(comm_meth)

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # Build the remote-init command to be run over ssh
        cmd = ['remote-init']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.install_target))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        # Create the ssh command
        cmd = construct_platform_ssh_cmd(cmd, platform)

        self.proc_pool.put_command(
            SubProcContext(
                'remote-init',
                cmd,
                stdin_files=[tmphandle]),
            self._remote_init_callback,
            [platform, tmphandle,
             curve_auth, client_pub_key_dir])
        # None status: Waiting for command to finish
        self.remote_init_map[platform['install target']] = None
        return self.remote_init_map[platform['install target']]
Ejemplo n.º 20
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. Bad host select command, error writing to a job file or
        bad remote initialisation will cause a bad task - leading to submission
        failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%02d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
Ejemplo n.º 21
0
 def _load_remote_item(self, item, reg, owner, host):
     """Load content of service item from remote [owner@]host via SSH."""
     if not is_remote(host, owner):
         return
     if host is None:
         host = 'localhost'
     if owner is None:
         owner = get_user()
     if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
         # Attempt to read suite contact file via the local filesystem.
         path = r'%(run_d)s/%(srv_base)s' % {
             'run_d': glbl_cfg().get_derived_host_item(
                 reg, 'suite run directory', 'localhost', owner,
                 replace_home=False),
             'srv_base': self.DIR_BASE_SRV,
         }
         content = self._load_local_item(item, path)
         if content is not None:
             return content
         # Else drop through and attempt via ssh to the suite account.
     # Prefix STDOUT to ensure returned content is relevant
     prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
     # Attempt to cat passphrase file under suite service directory
     script = (
         r"""echo '%(prefix)s'; """
         r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
     ) % {
         'prefix': prefix,
         'run_d': glbl_cfg().get_derived_host_item(
             reg, 'suite run directory', host, owner),
         'srv_base': self.DIR_BASE_SRV,
         'item': item
     }
     import shlex
     command = shlex.split(
         glbl_cfg().get_host_item('ssh command', host, owner))
     command += ['-n', owner + '@' + host, script]
     from subprocess import Popen, PIPE
     try:
         proc = Popen(
             command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
     except OSError:
         if cylc.flow.flags.debug:
             import traceback
             traceback.print_exc()
         return
     out, err = (f.decode() for f in proc.communicate())
     ret_code = proc.wait()
     # Extract passphrase from STDOUT
     # It should live in the line with the correct prefix
     content = ""
     can_read = False
     for line in out.splitlines(True):
         if can_read:
             content += line
         elif line.strip() == prefix:
             can_read = True
     if not content or ret_code:
         LOG.debug(
             '$ %(command)s  # code=%(ret_code)s\n%(err)s',
             {
                 'command': command,
                 # STDOUT may contain passphrase, so not safe to print
                 # 'out': out,
                 'err': err,
                 'ret_code': ret_code,
             })
         return
     return content
Ejemplo n.º 22
0
    def detect_old_contact_file(self, reg, check_host_port=None):
        """Detect old suite contact file.

        If an old contact file does not exist, do nothing. If one does exist
        but the suite process is definitely not alive, remove it. If one exists
        and the suite process is still alive, raise SuiteServiceFileError.

        If check_host_port is specified and does not match the (host, port)
        value in the old contact file, raise AssertionError.

        Args:
            reg (str): suite name
            check_host_port (tuple): (host, port) to check against

        Raise:
            AssertionError:
                If old contact file exists but does not have matching
                (host, port) with value of check_host_port.
            SuiteServiceFileError:
                If old contact file exists and the suite process still alive.
        """
        # An old suite of the same name may be running if a contact file exists
        # and can be loaded.
        try:
            data = self.load_contact_file(reg)
            old_host = data[self.KEY_HOST]
            old_port = data[self.KEY_PORT]
            old_proc_str = data[self.KEY_PROCESS]
        except (IOError, ValueError, SuiteServiceFileError):
            # Contact file does not exist or corrupted, should be OK to proceed
            return
        if check_host_port and check_host_port != (old_host, int(old_port)):
            raise AssertionError("%s != (%s, %s)" %
                                 (check_host_port, old_host, old_port))
        # Run the "ps" command to see if the process is still running or not.
        # If the old suite process is still running, it should show up with the
        # same command line as before.
        # Terminate command after 10 seconds to prevent hanging, etc.
        old_pid_str = old_proc_str.split(None, 1)[0].strip()
        cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
        if is_remote_host(old_host):
            import shlex
            ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
            cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
        from subprocess import Popen, PIPE, DEVNULL  # nosec
        from time import sleep, time
        proc = Popen(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)  # nosec
        # Terminate command after 10 seconds to prevent hanging SSH, etc.
        timeout = time() + 10.0
        while proc.poll() is None:
            if time() > timeout:
                proc.terminate()
            sleep(0.1)
        fname = self.get_contact_file(reg)
        ret_code = proc.wait()
        out, err = (f.decode() for f in proc.communicate())
        if ret_code:
            LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
        for line in reversed(out.splitlines()):
            if line.strip() == old_proc_str:
                # Suite definitely still running
                break
            elif line.split(None, 1)[0].strip() == "PID":
                # Only "ps" header - "ps" has run, but no matching results.
                # Suite not running. Attempt to remove suite contact file.
                try:
                    os.unlink(fname)
                    return
                except OSError:
                    break

        raise SuiteServiceFileError(
            (r"""suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
""") % {
                "host": old_host,
                "port": old_port,
                "pid": old_pid_str,
                "fname": fname,
                "suite": reg,
            })
Ejemplo n.º 23
0
 def _load_remote_item(self, item, reg, owner, host):
     """Load content of service item from remote [owner@]host via SSH."""
     if not is_remote(host, owner):
         return
     if host is None:
         host = 'localhost'
     if owner is None:
         owner = get_user()
     if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
         # Attempt to read suite contact file via the local filesystem.
         path = r'%(run_d)s/%(srv_base)s' % {
             'run_d': get_remote_suite_run_dir('localhost', owner, reg),
             'srv_base': self.DIR_BASE_SRV,
         }
         content = self._load_local_item(item, path)
         if content is not None:
             return content
         # Else drop through and attempt via ssh to the suite account.
     # Prefix STDOUT to ensure returned content is relevant
     prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
     # Attempt to cat passphrase file under suite service directory
     script = (r"""echo '%(prefix)s'; """
               r'''cat "%(run_d)s/%(srv_base)s/%(item)s"''') % {
                   'prefix': prefix,
                   'run_d': get_remote_suite_run_dir(host, owner, reg),
                   'srv_base': self.DIR_BASE_SRV,
                   'item': item
               }
     import shlex
     command = shlex.split(glbl_cfg().get_host_item('ssh command', host,
                                                    owner))
     command += ['-n', owner + '@' + host, script]
     from subprocess import Popen, PIPE, DEVNULL  # nosec
     try:
         proc = Popen(command, stdin=DEVNULL, stdout=PIPE,
                      stderr=PIPE)  # nosec
     except OSError:
         if cylc.flow.flags.debug:
             import traceback
             traceback.print_exc()
         return
     out, err = (f.decode() for f in proc.communicate())
     ret_code = proc.wait()
     # Extract passphrase from STDOUT
     # It should live in the line with the correct prefix
     content = ""
     can_read = False
     for line in out.splitlines(True):
         if can_read:
             content += line
         elif line.strip() == prefix:
             can_read = True
     if not content or ret_code:
         LOG.debug(
             '$ %(command)s  # code=%(ret_code)s\n%(err)s',
             {
                 'command': command,
                 # STDOUT may contain passphrase, so not safe to print
                 # 'out': out,
                 'err': err,
                 'ret_code': ret_code,
             })
         return
     return content
Ejemplo n.º 24
0
    def detect_old_contact_file(self, reg, check_host_port=None):
        """Detect old suite contact file.

        If an old contact file does not exist, do nothing. If one does exist
        but the suite process is definitely not alive, remove it. If one exists
        and the suite process is still alive, raise SuiteServiceFileError.

        If check_host_port is specified and does not match the (host, port)
        value in the old contact file, raise AssertionError.

        Args:
            reg (str): suite name
            check_host_port (tuple): (host, port) to check against

        Raise:
            AssertionError:
                If old contact file exists but does not have matching
                (host, port) with value of check_host_port.
            SuiteServiceFileError:
                If old contact file exists and the suite process still alive.
        """
        # An old suite of the same name may be running if a contact file exists
        # and can be loaded.
        try:
            data = self.load_contact_file(reg)
            old_host = data[self.KEY_HOST]
            old_port = data[self.KEY_PORT]
            old_proc_str = data[self.KEY_PROCESS]
        except (IOError, ValueError, SuiteServiceFileError):
            # Contact file does not exist or corrupted, should be OK to proceed
            return
        if check_host_port and check_host_port != (old_host, int(old_port)):
            raise AssertionError("%s != (%s, %s)" % (
                check_host_port, old_host, old_port))
        # Run the "ps" command to see if the process is still running or not.
        # If the old suite process is still running, it should show up with the
        # same command line as before.
        # Terminate command after 10 seconds to prevent hanging, etc.
        old_pid_str = old_proc_str.split(None, 1)[0].strip()
        cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
        if is_remote_host(old_host):
            import shlex
            ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
            cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
        from subprocess import Popen, PIPE
        from time import sleep, time
        proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
        # Terminate command after 10 seconds to prevent hanging SSH, etc.
        timeout = time() + 10.0
        while proc.poll() is None:
            if time() > timeout:
                proc.terminate()
            sleep(0.1)
        fname = self.get_contact_file(reg)
        ret_code = proc.wait()
        out, err = (f.decode() for f in proc.communicate())
        if ret_code:
            LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
        for line in reversed(out.splitlines()):
            if line.strip() == old_proc_str:
                # Suite definitely still running
                break
            elif line.split(None, 1)[0].strip() == "PID":
                # Only "ps" header - "ps" has run, but no matching results.
                # Suite not running. Attempt to remove suite contact file.
                try:
                    os.unlink(fname)
                    return
                except OSError:
                    break

        raise SuiteServiceFileError(
            (
                r"""suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
"""
            ) % {
                "host": old_host,
                "port": old_port,
                "pid": old_pid_str,
                "fname": fname,
                "suite": reg,
            }
        )
Ejemplo n.º 25
0
def test_is_remote_host_on_localhost():
    """is_remote_host with localhost."""
    assert not is_remote_host(None)
    assert not is_remote_host('localhost')
    assert not is_remote_host(os.getenv('HOSTNAME'))
    assert not is_remote_host(get_host())
Ejemplo n.º 26
0
 def test_is_remote_host_on_localhost(self):
     """is_remote_host with localhost."""
     self.assertFalse(is_remote_host(None))
     self.assertFalse(is_remote_host('localhost'))
     self.assertFalse(is_remote_host(os.getenv('HOSTNAME')))
     self.assertFalse(is_remote_host(get_host()))
Ejemplo n.º 27
0
 def test_is_remote_host_on_localhost(self):
     """is_remote_host with localhost."""
     self.assertFalse(is_remote_host(None))
     self.assertFalse(is_remote_host('localhost'))
     self.assertFalse(is_remote_host(os.getenv('HOSTNAME')))
     self.assertFalse(is_remote_host(get_host()))
Ejemplo n.º 28
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. Bad host select command, error writing to a job file or
        bad remote initialisation will cause a bad task - leading to submission
        failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.job_pool.add_job_msg(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                    and not is_remote_host(host)):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, owner@host=%s', itask,
                         itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'user_at_host': owner_at_host,
                        'batch_sys_name': itask.summary['batch_sys_name'],
                    })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % owner_at_host,
                                       err=REMOTE_INIT_FAILED,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [('host', host, is_remote_host),
                                          ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(suite, itask.point,
                                                 itask.tdef.name,
                                                 itask.submit_num))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(self.JOBS_SUBMIT,
                                   cmd + job_log_dirs,
                                   stdin_files=stdin_files,
                                   job_log_dirs=job_log_dirs,
                                   **kwargs), self._submit_task_jobs_callback,
                    [suite, itasks_batch])
        return done_tasks
Ejemplo n.º 29
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        # Remove UUID file
        uuid_fname = os.path.join(get_suite_srv_dir(self.suite),
                                  FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), init_with_contact in self.remote_init_map.items():
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
            procs[(host, owner)] = (cmd,
                                    Popen(cmd,
                                          stdout=PIPE,
                                          stderr=PIPE,
                                          stdin=DEVNULL))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    LOG.warning(
                        TaskRemoteMgmtError(
                            TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                            ' '.join(quote(item) for item in cmd),
                            proc.returncode, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(
                    TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_TIDY,
                                        (host, owner),
                                        ' '.join(quote(item) for item in cmd),
                                        proc.returncode, out, err))
Ejemplo n.º 30
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            ".service/passphrase": For TCP task communication
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(get_suite_srv_dir(self.suite),
                                  FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.uuid_str))
        cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
        self.proc_pool.put_command(
            SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
            self._remote_init_callback, [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
Ejemplo n.º 31
0
    def subshell_eval(self, command, command_pattern, host_check=True):
        """Evaluate a task platform from a subshell string.

        At Cylc 7, from a host string.

        Arguments:
            command (str):
                An explicit host name, a command in back-tick or $(command)
                format, or an environment variable holding a hostname.
            command_pattern (re.Pattern):
                A compiled regex pattern designed to match subshell strings.
            host_check (bool):
                A flag to enable remote testing. If True, and if the command
                is running locally, then it will return 'localhost'.

        Return (str):
            - None if evaluation of command is still taking place.
            - If command is not defined or the evaluated name is equivalent
              to 'localhost', _and_ host_check is set to True then
              'localhost'
            - Otherwise, return the evaluated host name on success.

        Raise TaskRemoteMgmtError on error.

        """
        # BACK COMPAT: references to "host"
        # remove at:
        #     Cylc9
        if not command:
            return 'localhost'

        # Host selection command: $(command) or `command`
        match = command_pattern.match(command)
        if match:
            cmd_str = match.groups()[1]
            if cmd_str in self.remote_command_map:
                # Command recently launched
                value = self.remote_command_map[cmd_str]
                if isinstance(value, TaskRemoteMgmtError):
                    raise value  # command failed
                elif value is None:
                    return  # command not yet ready
                else:
                    command = value  # command succeeded
            else:
                # Command not launched (or already reset)
                self.proc_pool.put_command(
                    SubProcContext('remote-host-select',
                                   ['bash', '-c', cmd_str],
                                   env=dict(os.environ)),
                    self._subshell_eval_callback, [cmd_str])
                self.remote_command_map[cmd_str] = None
                return self.remote_command_map[cmd_str]

        # Environment variable substitution
        command = os.path.expandvars(command)
        # Remote?
        # TODO - Remove at Cylc 9 as this only makes sense with host logic
        if host_check is True:
            if is_remote_host(command):
                return command
            else:
                return 'localhost'
        else:
            return command
Ejemplo n.º 32
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            ".service/passphrase": For TCP task communication
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item(
            'task communication method', host, owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.uuid_str))
        cmd.append(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner))
        self.proc_pool.put_command(
            SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
            self._remote_init_callback,
            [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
Ejemplo n.º 33
0
def platform_from_job_info(platforms: Dict[str, Any], job: Dict[str, Any],
                           remote: Dict[str, Any]) -> str:
    """
    Find out which job platform to use given a list of possible platforms
    and the task dictionary with cylc 7 definitions in it.

    (Note: "batch system" (Cylc 7) and "job runner" (Cylc 8)
    mean the same thing)

          +------------+ Yes    +-----------------------+
    +-----> Tried all  +------->+ RAISE                 |
    |     | platforms? |        | PlatformNotFoundError |
    |     +------------+        +-----------------------+
    |              No|
    |     +----------v---+
    |     | Examine next |
    |     | platform     |
    |     +--------------+
    |                |
    |     +----------v----------------+
    |     | Do all items other than   |
    +<----+ "host" and "batch system" |
    |   No| match for this plaform    |
    |     +---------------------------+
    |                           |Yes
    |                +----------v----------------+ No
    |                | Task host is 'localhost'? +--+
    |                +---------------------------+  |
    |                           |Yes                |
    |              No+----------v----------------+  |
    |            +---+ Task batch system is      |  |
    |            |   | 'background'?             |  |
    |            |   +---------------------------+  |
    |            |              |Yes                |
    |            |   +----------v----------------+  |
    |            |   | RETURN 'localhost'        |  |
    |            |   +---------------------------+  |
    |            |                                  |
    |    +-------v-------------+     +--------------v-------+
    |  No| batch systems match |  Yes| batch system and     |
    +<---+ and 'localhost' in  |  +--+ host both match      |
    |    | platform hosts?     |  |  +----------------------+
         +---------------------+  |                 |No
    |            |Yes             |  +--------------v-------+
    |    +-------v--------------+ |  | batch system match   |
    |    | RETURN this platform <-+--+ and regex of platform|
    |    +----------------------+ Yes| name matches host    |
    |                                +----------------------+
    |                                  |No
    +<---------------------------------+

    Args:
        job: Workflow config [runtime][TASK][job] section.
        remote: Workflow config [runtime][TASK][remote] section.
        platforms: Dictionary containing platform definitions.

    Returns:
        platform: string representing a platform from the global config.

    Raises:
        PlatformLookupError:
            If no matching platform can be a found an error is raised.

    Example:
        >>> platforms = {
        ...         'desktop[0-9][0-9]|laptop[0-9][0-9]': {},
        ...         'sugar': {
        ...             'hosts': 'localhost',
        ...             'job runner': 'slurm'
        ...         }
        ... }
        >>> job = {'batch system': 'slurm'}
        >>> remote = {'host': 'localhost'}
        >>> platform_from_job_info(platforms, job, remote)
        'sugar'
        >>> remote = {}
        >>> platform_from_job_info(platforms, job, remote)
        'sugar'
        >>> remote ={'host': 'desktop92'}
        >>> job = {}
        >>> platform_from_job_info(platforms, job, remote)
        'desktop92'
    """

    # These settings are removed from the incoming dictionaries for special
    # handling later - we want more than a simple match:
    #   - In the case of "host" we also want a regex match to the platform name
    #   - In the case of "batch system" we want to match the name of the
    #     system/job runner to a platform when host is localhost.
    if 'host' in remote and remote['host']:
        task_host = remote['host']
    else:
        task_host = 'localhost'
    if 'batch system' in job and job['batch system']:
        task_job_runner = job['batch system']
    else:
        # Necessary? Perhaps not if batch system default is 'background'
        task_job_runner = 'background'
    # Riffle through the platforms looking for a match to our task settings.
    # reverse dict order so that user config platforms added last are examined
    # before site config platforms.
    for platform_name, platform_spec in reversed(list(platforms.items())):
        # Handle all the items requiring an exact match.
        # All items other than batch system and host must be an exact match
        if not generic_items_match(platform_spec, job, remote):
            continue
        # We have some special logic to identify whether task host and task
        # batch system match the platform in question.
        if (not is_remote_host(task_host) and task_job_runner == 'background'):
            return 'localhost'

        elif ('hosts' in platform_spec and task_host in platform_spec['hosts']
              and task_job_runner == platform_spec['job runner']):
            # If we have localhost with a non-background batch system we
            # use the batch system to give a sensible guess at the platform
            return platform_name

        elif (re.fullmatch(platform_name, task_host)
              and ((task_job_runner == 'background'
                    and 'job runner' not in platform_spec)
                   or task_job_runner == platform_spec['job runner'])):
            return task_host

    raise PlatformLookupError('No platform found matching your task')
Ejemplo n.º 34
0
def detect_old_contact_file(reg, check_host_port=None):
    """Detect old suite contact file.

    If an old contact file does not exist, do nothing. If one does exist
    but the suite process is definitely not alive, remove it. If one exists
    and the suite process is still alive, raise SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process still alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = load_contact_file(reg)
        old_host = data[ContactFileFields.HOST]
        old_port = data[ContactFileFields.PORT]
        old_proc_str = data[ContactFileFields.PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        ssh_str = get_platform()["ssh command"]
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from time import sleep, time
    proc = Popen(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = get_contact_file(reg)
    ret_code = proc.wait()
    out, err = (f.decode() for f in proc.communicate())
    if ret_code:
        LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif line.split(None, 1)[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break

    raise SuiteServiceFileError(
        CONTACT_FILE_EXISTS_MSG % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )