Beispiel #1
0
    def _remote_init_items(self, comm_meth):
        """Return list of items to install based on communication method.

        Return (list):
            Each item is (source_path, dest_path) where:
            - source_path is the path to the source file to install.
            - dest_path is relative path under suite run directory
              at target remote.
        """
        items = []
        if comm_meth in ['ssh', 'zmq']:
            # Contact file
            items.append((get_contact_file(self.suite),
                          os.path.join(SuiteFiles.Service.DIRNAME,
                                       SuiteFiles.Service.CONTACT)))

        if comm_meth in ['zmq']:

            suite_srv_dir = get_suite_srv_dir(self.suite)
            server_pub_keyinfo = KeyInfo(KeyType.PUBLIC,
                                         KeyOwner.SERVER,
                                         suite_srv_dir=suite_srv_dir)
            client_pri_keyinfo = KeyInfo(KeyType.PRIVATE,
                                         KeyOwner.CLIENT,
                                         suite_srv_dir=suite_srv_dir)
            dest_path_srvr_public_key = os.path.join(
                SuiteFiles.Service.DIRNAME, server_pub_keyinfo.file_name)
            items.append(
                (server_pub_keyinfo.full_key_path, dest_path_srvr_public_key))
            dest_path_cli_pri_key = os.path.join(SuiteFiles.Service.DIRNAME,
                                                 client_pri_keyinfo.file_name)
            items.append(
                (client_pri_keyinfo.full_key_path, dest_path_cli_pri_key))
        return items
Beispiel #2
0
    def _socket_bind(self, min_port, max_port, srv_prv_key_loc=None):
        """Bind socket.

        Will use a port range provided to select random ports.

        """
        if srv_prv_key_loc is None:
            # Create new KeyInfo object for the server private key
            suite_srv_dir = get_suite_srv_dir(self.suite)
            srv_prv_key_info = KeyInfo(KeyType.PRIVATE,
                                       KeyOwner.SERVER,
                                       suite_srv_dir=suite_srv_dir)
        else:
            srv_prv_key_info = KeyInfo(KeyType.PRIVATE,
                                       KeyOwner.SERVER,
                                       full_key_path=srv_prv_key_loc)

        # create socket
        self.socket = self.context.socket(self.pattern)
        self._socket_options()

        try:
            server_public_key, server_private_key = zmq.auth.load_certificate(
                srv_prv_key_info.full_key_path)
        except (ValueError):
            raise SuiteServiceFileError(f"Failed to find server's public "
                                        f"key in "
                                        f"{srv_prv_key_info.full_key_path}.")
        except (OSError):
            raise SuiteServiceFileError(f"IO error opening server's private "
                                        f"key from "
                                        f"{srv_prv_key_info.full_key_path}.")

        if server_private_key is None:  # this can't be caught by exception
            raise SuiteServiceFileError(f"Failed to find server's private "
                                        f"key in "
                                        f"{srv_prv_key_info.full_key_path}.")

        self.socket.curve_publickey = server_public_key
        self.socket.curve_secretkey = server_private_key
        self.socket.curve_server = True

        try:
            if min_port == max_port:
                self.port = min_port
                self.socket.bind(f'tcp://*:{min_port}')
            else:
                self.port = self.socket.bind_to_random_port(
                    'tcp://*', min_port, max_port)
        except (zmq.error.ZMQError, zmq.error.ZMQBindError) as exc:
            raise CylcError(f'could not start Cylc ZMQ server: {exc}')

        if self.barrier is not None:
            self.barrier.wait()
Beispiel #3
0
def scheduler_cli(parser, options, args, is_restart=False):
    """CLI main."""
    reg = args[0]
    # Check suite is not already running before start of host selection.
    try:
        suite_files.detect_old_contact_file(reg)
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    suite_run_dir = get_suite_run_dir(reg)

    if not os.path.exists(suite_run_dir):
        sys.stderr.write(f'suite service directory not found '
                         f'at: {suite_run_dir}\n')
        sys.exit(1)

    # Create auth files if needed.
    suite_files.create_auth_files(reg)

    # Extract job.sh from library, for use in job scripts.
    extract_resources(suite_files.get_suite_srv_dir(reg), ['etc/job.sh'])

    # Check whether a run host is explicitly specified, else select one.
    if not options.host:
        try:
            host = HostAppointer().appoint_host()
        except EmptyHostList as exc:
            if cylc.flow.flags.debug:
                raise
            else:
                sys.exit(str(exc))
        if is_remote_host(host):
            if is_restart:
                base_cmd = ["restart"] + sys.argv[1:]
            else:
                base_cmd = ["run"] + sys.argv[1:]
            # Prevent recursive host selection
            base_cmd.append("--host=localhost")
            return remote_cylc_cmd(base_cmd, host=host)
    if remrun(set_rel_local=True):  # State localhost as above.
        sys.exit()

    try:
        suite_files.get_suite_source_dir(args[0], options.owner)
    except SuiteServiceFileError:
        # Source path is assumed to be the run directory
        suite_files.register(args[0], get_suite_run_dir(args[0]))

    try:
        scheduler = Scheduler(is_restart, options, args)
    except SuiteServiceFileError as exc:
        sys.exit(exc)
    scheduler.start()
Beispiel #4
0
    def _socket_connect(self, host, port, srv_public_key_loc=None):
        """Connect socket to stub."""
        suite_srv_dir = get_suite_srv_dir(self.suite)
        if srv_public_key_loc is None:
            # Create new KeyInfo object for the server public key
            srv_pub_key_info = KeyInfo(KeyType.PUBLIC,
                                       KeyOwner.SERVER,
                                       suite_srv_dir=suite_srv_dir)

        else:
            srv_pub_key_info = KeyInfo(KeyType.PUBLIC,
                                       KeyOwner.SERVER,
                                       full_key_path=srv_public_key_loc)

        self.host = host
        self.port = port
        self.socket = self.context.socket(self.pattern)
        self._socket_options()

        client_priv_key_info = KeyInfo(KeyType.PRIVATE,
                                       KeyOwner.CLIENT,
                                       suite_srv_dir=suite_srv_dir)
        error_msg = "Failed to find user's private key, so cannot connect."
        try:
            client_public_key, client_priv_key = zmq.auth.load_certificate(
                client_priv_key_info.full_key_path)
        except (OSError, ValueError):
            raise ClientError(error_msg)
        if client_priv_key is None:  # this can't be caught by exception
            raise ClientError(error_msg)
        self.socket.curve_publickey = client_public_key
        self.socket.curve_secretkey = client_priv_key

        # A client can only connect to the server if it knows its public key,
        # so we grab this from the location it was created on the filesystem:
        try:
            # 'load_certificate' will try to load both public & private keys
            # from a provided file but will return None, not throw an error,
            # for the latter item if not there (as for all public key files)
            # so it is OK to use; there is no method to load only the
            # public key.
            server_public_key = zmq.auth.load_certificate(
                srv_pub_key_info.full_key_path)[0]
            self.socket.curve_serverkey = server_public_key
        except (OSError, ValueError):  # ValueError raised w/ no public key
            raise ClientError(
                "Failed to load the suite's public key, so cannot connect.")

        self.socket.connect(f'tcp://{host}:{port}')
Beispiel #5
0
    def _remote_init_callback(self, proc_ctx, platform, tmphandle, curve_auth,
                              client_pub_key_dir):
        """Callback when "cylc remote-init" exits.

        Write public key for install target into client public key
        directory.
        Set remote_init__map status to REMOTE_INIT_DONE on success which
        in turn will trigger file installation to start.
        Set remote_init_map status to REMOTE_INIT_FAILED on error.

        """
        try:
            tmphandle.close()
        except OSError:  # E.g. ignore bad unlink, etc
            pass
        install_target = platform['install target']
        if proc_ctx.ret_code == 0:
            if "KEYSTART" in proc_ctx.out:
                regex_result = re.search('KEYSTART((.|\n|\r)*)KEYEND',
                                         proc_ctx.out)
                key = regex_result.group(1)
                suite_srv_dir = get_suite_srv_dir(self.suite)
                public_key = KeyInfo(KeyType.PUBLIC,
                                     KeyOwner.CLIENT,
                                     suite_srv_dir=suite_srv_dir,
                                     install_target=install_target)
                old_umask = os.umask(0o177)
                with open(public_key.full_key_path, 'w',
                          encoding='utf8') as text_file:
                    text_file.write(key)
                os.umask(old_umask)
                # configure_curve must be called every time certificates are
                # added or removed, in order to update the Authenticator's
                # state.
                curve_auth.configure_curve(domain='*',
                                           location=(client_pub_key_dir))
                self.remote_init_map[install_target] = REMOTE_INIT_DONE
                self.ready = True
                return
        # Bad status
        LOG.error(
            TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_INIT, install_target,
                                ' '.join(quote(item) for item in proc_ctx.cmd),
                                proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))

        self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED
        self.ready = True
Beispiel #6
0
def key_housekeeping(reg, platform=None, create=True):
    """Clean any existing authentication keys and create new ones.
        If create is set to false, keys will only be cleaned from
        server."""
    suite_srv_dir = get_suite_srv_dir(reg)
    keys = {
        "client_public_key":
        KeyInfo(KeyType.PUBLIC,
                KeyOwner.CLIENT,
                suite_srv_dir=suite_srv_dir,
                install_target=platform),
        "client_private_key":
        KeyInfo(KeyType.PRIVATE, KeyOwner.CLIENT, suite_srv_dir=suite_srv_dir),
        "server_public_key":
        KeyInfo(KeyType.PUBLIC, KeyOwner.SERVER, suite_srv_dir=suite_srv_dir),
        "server_private_key":
        KeyInfo(KeyType.PRIVATE, KeyOwner.SERVER, suite_srv_dir=suite_srv_dir)
    }
    remove_keys_on_server(keys)
    if create:
        create_server_keys(keys, suite_srv_dir)
Beispiel #7
0
def test_client_requires_valid_server_public_key_in_private_key_file():
    """Client should not be able to connect to host/port without
    server public key."""
    suite_name = f"test_suite-{time()}"
    port = random.choice(PORT_RANGE)
    client = ZMQSocketBase(zmq.REP, suite=suite_name)

    test_suite_srv_dir = get_suite_srv_dir(reg=suite_name)
    key_info = KeyInfo(
        KeyType.PRIVATE,
        KeyOwner.CLIENT,
        suite_srv_dir=test_suite_srv_dir)
    directory = os.path.expanduser("~/cylc-run")
    tmpdir = os.path.join(directory, suite_name)
    os.makedirs(key_info.key_path, exist_ok=True)

    _pub, _priv = zmq.auth.create_certificates(key_info.key_path, "client")

    with pytest.raises(ClientError, match=r"Failed to load the suite's public "
                                          r"key, so cannot connect."):
        client.start(HOST, port, srv_public_key_loc="fake_location")

    client.stop()
    rmtree(tmpdir, ignore_errors=True)
Beispiel #8
0
def setup_keys(suite_name):
    suite_srv_dir = get_suite_srv_dir(suite_name)
    server_keys = {
        "client_public_key": KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.CLIENT,
            suite_srv_dir=suite_srv_dir),
        "client_private_key": KeyInfo(
            KeyType.PRIVATE,
            KeyOwner.CLIENT,
            suite_srv_dir=suite_srv_dir),
        "server_public_key": KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.SERVER,
            suite_srv_dir=suite_srv_dir),
        "server_private_key": KeyInfo(
            KeyType.PRIVATE,
            KeyOwner.SERVER,
            suite_srv_dir=suite_srv_dir)
    }
    remove_keys_on_server(server_keys)
    remove_keys_on_client(suite_srv_dir, None, full_clean=True)
    create_server_keys(server_keys, suite_srv_dir)
    create_client_keys(suite_srv_dir, None)
Beispiel #9
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        # Remove UUID file
        uuid_fname = os.path.join(get_suite_srv_dir(self.suite),
                                  FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), init_with_contact in self.remote_init_map.items():
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
            procs[(host, owner)] = (cmd,
                                    Popen(cmd,
                                          stdout=PIPE,
                                          stderr=PIPE,
                                          stdin=DEVNULL))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    LOG.warning(
                        TaskRemoteMgmtError(
                            TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                            ' '.join(quote(item) for item in cmd),
                            proc.returncode, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(
                    TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_TIDY,
                                        (host, owner),
                                        ' '.join(quote(item) for item in cmd),
                                        proc.returncode, out, err))
Beispiel #10
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            ".service/passphrase": For TCP task communication
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(get_suite_srv_dir(self.suite),
                                  FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.uuid_str))
        cmd.append(get_remote_suite_run_dir(host, owner, self.suite))
        self.proc_pool.put_command(
            SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
            self._remote_init_callback, [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
Beispiel #11
0
def main(parser, options, suite, *task_ids):
    """cylc submit CLI.

    No TASK EVENT HOOKS are set for the submit command because there is
    no scheduler instance watching for task failure etc.

    Note: a suite contact env file is not written by this command (it
    would overwrite the real one if the suite is running).
    """
    if not options.verbose and not options.debug:
        LOG.setLevel(WARNING)
    for task_id in task_ids:
        if not TaskID.is_valid_id(task_id):
            raise UserInputError("Invalid task ID %s" % task_id)
    suiterc = get_suite_rc(suite)
    suite_dir = os.path.dirname(suiterc)
    # For user-defined batch system handlers
    sys.path.append(os.path.join(suite_dir, 'python'))

    # Load suite config and tasks
    config = SuiteConfig(
        suite, suiterc, options,
        load_template_vars(options.templatevars, options.templatevars_file))
    itasks = []
    for task_id in task_ids:
        name_str, point_str = TaskID.split(task_id)
        taskdefs = config.find_taskdefs(name_str)
        if not taskdefs:
            raise UserInputError("No task found for %s" % task_id)
        for taskdef in taskdefs:
            itasks.append(
                TaskProxy(taskdef,
                          get_point(point_str).standardise(),
                          is_startup=True))

    # Initialise job submit environment
    make_suite_run_tree(suite)
    # Extract job.sh from library, for use in job scripts.
    extract_resources(get_suite_srv_dir(suite), ['etc/job.sh'])
    pool = SubProcPool()
    owner = get_user()
    job_pool = JobPool(suite, owner)
    db_mgr = SuiteDatabaseManager()
    task_job_mgr = TaskJobManager(
        suite, pool, db_mgr,
        TaskEventsManager(suite, pool, db_mgr, BroadcastMgr(db_mgr), job_pool),
        job_pool)
    task_job_mgr.task_remote_mgr.single_task_mode = True
    task_job_mgr.job_file_writer.set_suite_env({
        'CYLC_UTC':
        str(config.cfg['cylc']['UTC mode']),
        'CYLC_DEBUG':
        str(cylc.flow.flags.debug).lower(),
        'CYLC_VERBOSE':
        str(cylc.flow.flags.verbose).lower(),
        'CYLC_SUITE_NAME':
        suite,
        'CYLC_CYCLING_MODE':
        str(config.cfg['scheduling']['cycling mode']),
        'CYLC_SUITE_INITIAL_CYCLE_POINT':
        str(config.cfg['scheduling']['initial cycle point']),
        'CYLC_SUITE_FINAL_CYCLE_POINT':
        str(config.cfg['scheduling']['final cycle point']),
    })

    ret_code = 0
    waiting_tasks = list(itasks)
    if options.dry_run:
        while waiting_tasks:
            prep_tasks, bad_tasks = task_job_mgr.prep_submit_task_jobs(
                suite, waiting_tasks, dry_run=True)
            for itask in prep_tasks + bad_tasks:
                waiting_tasks.remove(itask)
            if waiting_tasks:
                task_job_mgr.proc_pool.process()
                sleep(1.0)

        for itask in itasks:
            if itask.local_job_file_path:
                print(('JOB SCRIPT=%s' % itask.local_job_file_path))
            else:
                print(('Unable to prepare job file for %s' % itask.identity),
                      file=sys.stderr)
                ret_code = 1
    else:
        while waiting_tasks:
            for itask in task_job_mgr.submit_task_jobs(suite, waiting_tasks):
                waiting_tasks.remove(itask)
            if waiting_tasks:
                task_job_mgr.proc_pool.process()
                sleep(1.0)
        while task_job_mgr.proc_pool.is_not_done():
            task_job_mgr.proc_pool.process()
        for itask in itasks:
            if itask.summary.get('submit_method_id') is not None:
                print(('[%s] Job ID: %s' %
                       (itask.identity, itask.summary['submit_method_id'])))
            if itask.state(TASK_STATUS_SUBMIT_FAILED):
                ret_code = 1
    sys.exit(ret_code)
Beispiel #12
0
    def _remote_init_callback(
            self, proc_ctx, platform, tmphandle,
            curve_auth, client_pub_key_dir):
        """Callback when "cylc remote-init" exits"""
        self.ready = True
        try:
            tmphandle.close()
        except OSError:  # E.g. ignore bad unlink, etc
            pass
        self.install_target = platform['install target']
        if proc_ctx.ret_code == 0:
            if REMOTE_INIT_DONE in proc_ctx.out:
                src_path = get_suite_run_dir(self.suite)
                dst_path = get_remote_suite_run_dir(platform, self.suite)
                try:
                    process = procopen(construct_rsync_over_ssh_cmd(
                        src_path,
                        dst_path,
                        platform,
                        self.rsync_includes),
                        stdoutpipe=True,
                        stderrpipe=True,
                        universal_newlines=True)

                    out, err = process.communicate(timeout=600)
                    install_target = platform['install target']
                    if out:
                        RSYNC_LOG.info(
                            'File installation information for '
                            f'{install_target}:\n {out}')
                    if err:
                        LOG.error(
                            'File installation error on '
                            f'{install_target}:\n {err}')
                except Exception as ex:
                    LOG.error(f"Problem during rsync: {ex}")
                    self.remote_init_map[self.install_target] = (
                        REMOTE_INIT_FAILED)
                    return
            if "KEYSTART" in proc_ctx.out:
                regex_result = re.search(
                    'KEYSTART((.|\n|\r)*)KEYEND', proc_ctx.out)
                key = regex_result.group(1)
                suite_srv_dir = get_suite_srv_dir(self.suite)
                public_key = KeyInfo(
                    KeyType.PUBLIC,
                    KeyOwner.CLIENT,
                    suite_srv_dir=suite_srv_dir,
                    install_target=self.install_target
                )
                old_umask = os.umask(0o177)
                with open(
                        public_key.full_key_path,
                        'w', encoding='utf8') as text_file:
                    text_file.write(key)
                os.umask(old_umask)
                # configure_curve must be called every time certificates are
                # added or removed, in order to update the Authenticator's
                # state.
                curve_auth.configure_curve(
                    domain='*', location=(client_pub_key_dir))
            for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
                if status in proc_ctx.out:
                    # Good status
                    LOG.debug(proc_ctx)
                    self.remote_init_map[self.install_target] = status
                    return
        # Bad status
        LOG.error(TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_INIT,
            platform['install target'], ' '.join(
                quote(item) for item in proc_ctx.cmd),
            proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
        LOG.error(proc_ctx)
        self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED