Ejemplo n.º 1
0
    def remote_init(self, platform: Dict[str, Any],
                    curve_auth: 'ThreadAuthenticator',
                    client_pub_key_dir: str) -> None:
        """Initialise a remote host if necessary.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            "python/": if source exists

        When the platform's install target is the localhost install target
        nothing is run: the target is recorded as REMOTE_FILE_INSTALL_DONE
        and the method returns. Otherwise the target is marked as
        REMOTE_INIT_IN_PROGRESS, the items to install are packed into a TAR
        archive and a "remote-init" command is queued on the process pool
        with that archive as its stdin file.

        Args:
            platform: A dict containing settings relating to platform used in
                this remote installation.
            curve_auth: The ZMQ authenticator.
            client_pub_key_dir: Client public key directory, used by the
                ZMQ authenticator.

        """
        install_target = platform['install target']
        if install_target == get_localhost_install_target():
            self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
            return
        # Set status of install target to in progress while waiting for remote
        # initialisation to finish
        self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

        # Determine what items to install
        comm_meth = platform['communication method']
        items = self._remote_init_items(comm_meth)

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        # The context manager guarantees the TarFile is closed even if adding
        # one of the items raises; the caller-supplied file object stays open.
        with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
            for path, arcname in items:
                tarhandle.add(path, arcname=arcname)
        # Rewind so the archive is read from its beginning.
        tmphandle.seek(0)
        # Build the remote-init command to be run over ssh
        cmd = ['remote-init']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(str(install_target))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        dirs_to_symlink = get_dirs_to_symlink(install_target, self.suite)
        for key, value in dirs_to_symlink.items():
            if value is not None:
                # NOTE(review): each "key=value" argument carries a trailing
                # space - confirm the remote command depends on it.
                cmd.append(f"{key}={quote(value)} ")
        # Create the ssh command
        cmd = construct_ssh_cmd(cmd, platform)
        # The callback receives the temporary file handle together with the
        # authenticator details.
        self.proc_pool.put_command(
            SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
            self._remote_init_callback,
            [platform, tmphandle, curve_auth, client_pub_key_dir])
Ejemplo n.º 2
0
def _remote_clean_cmd(reg, platform, timeout):
    """Launch the removal of a stopped workflow on a remote host.

    Builds a "clean --local-only" command for the workflow, wraps it for
    ssh with construct_ssh_cmd and starts it as a subprocess without
    waiting for it to finish.

    Args:
        reg (str): Workflow name.
        platform (dict): Config for the platform on which to remove the
            workflow.
        timeout (str): Number of seconds to wait before cancelling the command.

    Returns:
        The Popen object of the started command; stdout and stderr are
        captured through pipes and stdin is connected to DEVNULL.
    """
    target = platform["install target"]
    platform_name = platform["name"]
    LOG.debug(
        f'Cleaning on install target: {target} '
        f'(using platform: {platform_name})')

    # Only pass --debug on when this process itself runs in debug mode.
    debug_opts = ['--debug'] if cylc.flow.flags.debug else []
    clean_args = ['clean', '--local-only', reg, *debug_opts]

    ssh_cmd = construct_ssh_cmd(clean_args, platform, timeout=timeout)
    LOG.debug(" ".join(ssh_cmd))
    return Popen(ssh_cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
Ejemplo n.º 3
0
    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        The task proxies are grouped by the name of their platform and one
        command per group is put to the process pool. For a remote platform
        the command is wrapped with construct_ssh_cmd; otherwise it is
        prefixed with "cylc". The callback is queued with the arguments
        [suite, <tasks of the group>].

        """
        if not itasks:
            return

        # Bucket the tasks by the platform they were run on.
        tasks_by_platform = {}
        for itask in itasks:
            tasks_by_platform.setdefault(
                itask.platform['name'], []).append(itask)

        # One command per platform, handled in platform name order.
        for platform_n in sorted(tasks_by_platform):
            group = tasks_by_platform[platform_n]
            platform = get_platform(platform_n)
            run_remotely = is_remote_platform(platform)

            cmd = [cmd_key] if run_remotely else ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            cmd += ["--", get_remote_suite_run_job_dir(platform, suite)]
            if run_remotely:
                cmd = construct_ssh_cmd(cmd, platform)

            # The job log directories go after the (possibly ssh-wrapped)
            # command, ordered by task identity.
            cmd += [
                get_task_job_id(
                    task.point, task.tdef.name, task.submit_num)
                for task in sorted(group, key=lambda task: task.identity)
            ]
            self.proc_pool.put_command(
                SubProcContext(cmd_key, cmd), callback, [suite, group])
Ejemplo n.º 4
0
    def submit_task_jobs(self,
                         suite,
                         itasks,
                         curve_auth,
                         client_pub_key_dir,
                         is_simulation=False):
        """Prepare for job submission and submit task jobs.

        Preparation (host selection, remote host init, and remote install)
        is done asynchronously. Newly released tasks may be sent here several
        times until these init subprocesses have returned. Failure during
        preparation is considered to be job submission failure.

        Once preparation has completed or failed, reset .waiting_on_job_prep in
        task instances so the scheduler knows to stop sending them back here.

        This method uses prep_submit_task_job() as helper.

        Args:
            suite: The suite, as passed on to job preparation, the run
                directory helpers, logging and the submit callback.
            itasks: Task proxies to submit.
            curve_auth: Forwarded to remote_init() when a remote install
                target has to be initialised.
            client_pub_key_dir: Forwarded to remote_init() together with
                curve_auth.
            is_simulation: If true, delegate to
                _simulation_submit_task_jobs() and return its result.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.subshell_eval_reset()

        if not prepared_tasks:
            return bad_tasks
        auth_itasks = {}  # {platform: [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault(itask.platform['name'], []).append(itask)
        # Submit task jobs for each platform
        # NB: done_tasks is the same list object as bad_tasks; it is extended
        # in place below.
        done_tasks = bad_tasks

        for platform_name, itasks in sorted(auth_itasks.items()):
            platform = itasks[0].platform
            install_target = get_install_target_from_platform(platform)
            ri_map = self.task_remote_mgr.remote_init_map

            if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
                if install_target == get_localhost_install_target():
                    # Skip init and file install for localhost.
                    LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                    ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

                elif install_target not in ri_map:
                    # Remote init not in progress for target, so start it.
                    self.task_remote_mgr.remote_init(platform, curve_auth,
                                                     client_pub_key_dir)
                    for itask in itasks:
                        itask.set_summary_message(self.REMOTE_INIT_MSG)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num),
                            self.REMOTE_INIT_MSG)
                    continue

                elif (ri_map[install_target] == REMOTE_INIT_DONE):
                    # Already done remote init so move on to file install
                    self.task_remote_mgr.file_install(platform)
                    continue

                elif ri_map[install_target] in self.IN_PROGRESS:
                    # Remote init or file install in progress.
                    for itask in itasks:
                        msg = self.IN_PROGRESS[ri_map[install_target]]
                        itask.set_summary_message(msg)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num), msg)
                    continue

            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            host = get_host_from_platform(platform)
            # Use a task from THIS platform group; a loop variable left over
            # from an earlier loop may belong to a different platform.
            # Assumes all tasks of a platform share one job runner - TODO
            # confirm.
            if (self.job_runner_mgr.is_job_local_to_host(
                    itasks[0].summary['job_runner_name'])
                    and not is_remote_platform(platform)):
                host = get_host()

            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, host=%s', itask,
                         itask.submit_num, host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'platform_name': itask.platform['name'],
                        'job_runner_name': itask.summary['job_runner_name'],
                    })
                itask.is_manual_submit = False

            if (ri_map[install_target]
                    in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
                # Remote init or install failed. Set submit-failed for all
                # affected tasks and remove target from remote init map
                # - this enables new tasks to re-initialise that target
                init_error = (ri_map[install_target])
                del ri_map[install_target]
                for itask in itasks:
                    itask.waiting_on_job_prep = False
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % host,
                                       err=init_error,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self._prep_submit_task_job_error(suite, itask,
                                                     '(remote init)', '')

                continue
            # Build the "cylc jobs-submit" command
            cmd = [self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            # Platform settings are read from the group's platform rather
            # than from whichever task a previous loop happened to end on.
            remote_mode = bool(is_remote_platform(platform))
            if remote_mode:
                cmd.append('--remote-mode')
            if platform['clean job submission environment']:
                cmd.append('--clean-env')
            for var in platform['job submission environment pass-through']:
                cmd.append(f"--env={var}")
            for path in platform['job submission executable paths'] + SYSPATH:
                cmd.append(f"--path={path}")
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(platform, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = (len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1) + 1)
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])

            if remote_mode:
                cmd = construct_ssh_cmd(cmd, platform)
            else:
                cmd = ['cylc'] + cmd

            for itasks_batch in itasks_batches:
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        # In remote mode the job file path is added to the
                        # stdin files of the submit command.
                        stdin_files.append(
                            os.path.expandvars(
                                get_task_job_job_log(suite, itask.point,
                                                     itask.tdef.name,
                                                     itask.submit_num)))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)

                    itask.waiting_on_job_prep = False
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                    ), self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
Ejemplo n.º 5
0
    def remote_init(self, platform: Dict[str, Any],
                    curve_auth: 'ThreadAuthenticator',
                    client_pub_key_dir: str) -> None:
        """Initialise a remote host if necessary.

        Call "cylc remote-init" to install workflow items to remote:
            ".service/contact": For TCP task communication
            "python/": if source exists

        When the platform's install target is the localhost install target
        nothing is run: the target is recorded as REMOTE_FILE_INSTALL_DONE
        and the method returns. Otherwise the target is marked as
        REMOTE_INIT_IN_PROGRESS, the items to install are packed into a TAR
        archive, a host is selected for the platform and a "remote-init"
        command is queued on the process pool with the archive as its stdin
        file. If no host can be selected, the error is logged and the target
        is marked as REMOTE_INIT_FAILED instead.

        Args:
            platform: A dict containing settings relating to platform used in
                this remote installation.
            curve_auth: The ZMQ authenticator.
            client_pub_key_dir: Client public key directory, used by the
                ZMQ authenticator.

        """
        install_target = platform['install target']
        if install_target == get_localhost_install_target():
            self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
            return

        # Set status of install target to in progress while waiting for remote
        # initialisation to finish
        self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

        # Determine what items to install
        comms_meth: CommsMeth = CommsMeth(platform['communication method'])
        items = self._remote_init_items(comms_meth)

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        # The context manager guarantees the TarFile is closed even if adding
        # one of the items raises; the caller-supplied file object stays open.
        with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
            for path, arcname in items:
                tarhandle.add(path, arcname=arcname)
        # Rewind so the archive is read from its beginning.
        tmphandle.seek(0)
        # Build the remote-init command to be run over ssh
        cmd = ['remote-init']
        cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
        cmd.append(str(install_target))
        cmd.append(get_remote_workflow_run_dir(self.workflow))
        dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
        for key, value in dirs_to_symlink.items():
            if value is not None:
                # NOTE(review): each "key=value" argument carries a trailing
                # space - confirm the remote command depends on it.
                cmd.append(f"{key}={quote(value)} ")
        # Create the ssh command
        try:
            host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
        except NoHostsError as exc:
            LOG.error(
                PlatformError(
                    f'{PlatformError.MSG_INIT}\n{exc}',
                    platform['name'],
                ))
            self.remote_init_map[install_target] = REMOTE_INIT_FAILED
            # Remove this platform's hosts from the set of bad hosts.
            self.bad_hosts -= set(platform['hosts'])
            # NOTE(review): tmphandle is not handed to any callback on this
            # path - confirm who is responsible for closing it.
            self.ready = True
        else:
            log_platform_event('remote init', platform, host)
            cmd = construct_ssh_cmd(cmd, platform, host)
            # The callback receives the temporary file handle together with
            # the authenticator details; a separate callback is registered
            # through the callback_255 arguments.
            self.proc_pool.put_command(
                SubProcContext('remote-init',
                               cmd,
                               stdin_files=[tmphandle],
                               host=host),
                bad_hosts=self.bad_hosts,
                callback=self._remote_init_callback,
                callback_args=[
                    platform, tmphandle, curve_auth, client_pub_key_dir
                ],
                callback_255=self._remote_init_callback_255,
                callback_255_args=[platform])