Example #1
 def __init__(self, suite, proc_pool, suite_db_mgr, task_events_mgr,
              job_pool):
     self.suite = suite
     self.proc_pool = proc_pool
     self.suite_db_mgr = suite_db_mgr
     self.task_events_mgr = task_events_mgr
     self.job_pool = job_pool
     self.job_file_writer = JobFileWriter()
     self.batch_sys_mgr = self.job_file_writer.batch_sys_mgr
     self.task_remote_mgr = TaskRemoteMgr(suite, proc_pool)
Example #2
 def __init__(self, suite, proc_pool, suite_db_mgr, task_events_mgr,
              data_store_mgr):
     self.suite = suite
     self.proc_pool = proc_pool
     self.suite_db_mgr = suite_db_mgr
     self.task_events_mgr = task_events_mgr
     self.data_store_mgr = data_store_mgr
     self.job_file_writer = JobFileWriter()
     self.job_runner_mgr = self.job_file_writer.job_runner_mgr
     self.task_remote_mgr = TaskRemoteMgr(suite, proc_pool)
Example #3
 def __init__(self, workflow, proc_pool, workflow_db_mgr, task_events_mgr,
              data_store_mgr):
     self.workflow = workflow
     self.proc_pool = proc_pool
     self.workflow_db_mgr = workflow_db_mgr
     self.task_events_mgr = task_events_mgr
     self.data_store_mgr = data_store_mgr
     self.job_file_writer = JobFileWriter()
     self.job_runner_mgr = self.job_file_writer.job_runner_mgr
     self.task_remote_mgr = TaskRemoteMgr(workflow, proc_pool)
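Examples #1-#3 show the same constructor across successive renames (suite → workflow, batch_sys_mgr → job_runner_mgr, job_pool → data_store_mgr). A minimal construction sketch for the newest signature, assuming the class lives in cylc.flow.task_job_mgr and substituting mocks for the scheduler-owned collaborators:

from unittest.mock import MagicMock

from cylc.flow.task_job_mgr import TaskJobManager  # module path assumed

# Stand-ins for the live manager objects a running scheduler would pass in.
task_job_mgr = TaskJobManager(
    'my_workflow',   # workflow name
    MagicMock(),     # proc_pool (subprocess pool)
    MagicMock(),     # workflow_db_mgr
    MagicMock(),     # task_events_mgr
    MagicMock(),     # data_store_mgr
)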
Example #4
def test_write_script():
    """Test script is correctly written in jobscript"""

    expected = (
        "\n\ncylc__job__inst__init_script() {\n# INIT-SCRIPT:\n"
        "This is the init script\n}\n\ncylc__job__inst__env_script()"
        " {\n# ENV-SCRIPT:\nThis is the env script\n}\n\n"
        "cylc__job__inst__err_script() {\n# ERR-SCRIPT:\nThis is the err "
        "script\n}\n\ncylc__job__inst__pre_script() {\n# PRE-SCRIPT:\n"
        "This is the pre script\n}\n\ncylc__job__inst__script() {\n"
        "# SCRIPT:\nThis is the script\n}\n\ncylc__job__inst__post_script"
        "() {\n# POST-SCRIPT:\nThis is the post script\n}\n\n"
        "cylc__job__inst__exit_script() {\n# EXIT-SCRIPT:\n"
        "This is the exit script\n}")

    job_conf = {
        "init-script": "This is the init script",
        "env-script": "This is the env script",
        "err-script": "This is the err script",
        "pre-script": "This is the pre script",
        "script": "This is the script",
        "post-script": "This is the post script",
        "exit-script": "This is the exit script",
        "workflow_name": "test_write_script",
    }

    with io.StringIO() as fake_file:
        JobFileWriter()._write_script(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
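Example #4 and the later test excerpts omit their module-level imports. A minimal header sketch for such a test module, assuming the class under test is cylc.flow.job_file.JobFileWriter (the exact list varies per example):

import io
import os
from pathlib import Path
from tempfile import NamedTemporaryFile, TemporaryFile
from textwrap import dedent

import pytest

import cylc.flow.flags
import cylc.flow.job_file
from cylc.flow import __version__
from cylc.flow.job_file import JobFileWriter
from cylc.flow.platforms import platform_from_name  # import location assumed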
Example #5
def test_write_task_environment():
    """Test task environment is correctly written in jobscript"""
    # set some task environment conditions
    expected = ('\n\n    # CYLC TASK ENVIRONMENT:\n    '
                'export CYLC_TASK_COMMS_METHOD=ssh\n    '
                'export CYLC_TASK_JOB="1/moo/01"\n    export '
                'CYLC_TASK_NAMESPACE_HIERARCHY="baa moo"\n    export '
                'CYLC_TASK_DEPENDENCIES="moo neigh quack"\n    export '
                'CYLC_TASK_TRY_NUMBER=1\n    export '
                'CYLC_TASK_FLOWS=1\n    export '
                'CYLC_TASK_PARAM_duck="quack"\n    export '
                'CYLC_TASK_PARAM_mouse="squeak"\n    '
                'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}')
    job_conf = {
        "platform": {
            'communication method': 'ssh'
        },
        "job_d": "1/moo/01",
        "namespace_hierarchy": ["baa", "moo"],
        "dependencies": ['moo', 'neigh', 'quack'],
        "try_num": 1,
        "flow_nums": {1},
        "param_var": {
            "duck": "quack",
            "mouse": "squeak"
        },
        "work_d": "farm_noises/work_d",
        "workflow_name": "test_write_task_environment",
    }
    with io.StringIO() as fake_file:
        JobFileWriter()._write_task_environment(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #6
def test_write_prelude(monkeypatch, fixture_get_platform):
    """Test the prelude section of job script file is correctly
    written.
    """
    monkeypatch.setattr('cylc.flow.flags.verbosity', 2)
    expected = ('\nCYLC_FAIL_SIGNALS=\'EXIT ERR TERM XCPU\'\n'
                'CYLC_VACATION_SIGNALS=\'USR1\'\nexport PATH=moo/baa:$PATH'
                '\nexport CYLC_VERBOSE=true'
                '\nexport CYLC_DEBUG=true'
                '\nexport CYLC_VERSION=\'%s\'\nexport ' % __version__
                + 'CYLC_WORKFLOW_INITIAL_CYCLE_POINT=\'20200101T0000Z\'')
    job_conf = {
        "platform": fixture_get_platform({
            "job runner": "loadleveler",
            "job runner command template": "test_workflow",
            "host": "localhost",
            "copyable environment variables": [
                "CYLC_WORKFLOW_INITIAL_CYCLE_POINT"
            ],
            "cylc path": "moo/baa"
        }),
        "directives": {"restart": "yes"},
    }
    monkeypatch.setenv("CYLC_WORKFLOW_INITIAL_CYCLE_POINT", "20200101T0000Z")

    with io.StringIO() as fake_file:
        # copyable environment variables
        JobFileWriter()._write_prelude(fake_file, job_conf)
        assert(fake_file.getvalue() == expected)
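Several of these tests take a fixture_get_platform fixture that is not shown in the excerpts. A plausible conftest-style sketch (the default keys are assumptions; the real fixture presumably builds on the global platform configuration), returning a factory that merges per-test overrides into a localhost platform dict:

import pytest


@pytest.fixture
def fixture_get_platform():
    """Return a factory for platform dicts with per-test overrides applied."""
    def _get_platform(overrides=None):
        platform = {
            # Illustrative defaults only.
            'name': 'localhost',
            'hosts': ['localhost'],
            'host': 'localhost',
            'job runner': 'background',
            'communication method': 'zmq',
            'global init-script': '',
            'copyable environment variables': [],
            'cylc path': '',
        }
        platform.update(overrides or {})
        return platform
    return _get_platform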
Example #7
def test_write_epilogue():
    """Test epilogue is correctly written in jobscript"""

    expected = ('\n\n. \"cylc-run/farm_noises/.service/etc/job.sh\"\n'
                'cylc__job__main\n\n#EOF: 1/moo/01\n')
    job_conf = {'job_d': "1/moo/01"}
    run_d = "cylc-run/farm_noises"
    with io.StringIO() as fake_file:
        JobFileWriter()._write_epilogue(fake_file, job_conf, run_d)
        assert (fake_file.getvalue() == expected)
Example #8
def test_homeless_platform(fixture_get_platform):
    """Ensure there are no uses of $HOME before the global init-script.

    This is to allow users to configure a $HOME on machines with no $HOME
    directory.
    """
    job_conf = {
        "platform": fixture_get_platform({'global init-script':
                                          'some-script'}),
        "task_id": "a",
        "workflow_name": "b",
        "work_d": "c/d",
        "uuid_str": "e",
        'environment': {},
        'cow': '~/moo',
        "job_d": "1/a/01",
        "try_num": 1,
        "flow_nums": {1},
        # "job_runner_name": "background",
        "param_var": {},
        "execution_time_limit": None,
        "namespace_hierarchy": [],
        "dependencies": [],
        "init-script": "",
        "env-script": "",
        "err-script": "",
        "pre-script": "",
        "script": "",
        "post-script": "",
        "exit-script": "",
    }

    with NamedTemporaryFile() as local_job_file_path:
        local_job_file_path = local_job_file_path.name
        JobFileWriter().write(local_job_file_path, job_conf)
        with open(local_job_file_path, 'r') as local_job_file:
            job_script = local_job_file.read()

    # ensure that $HOME is not used before the global init-script
    for line in job_script.splitlines():
        if line.startswith(' '):
            # ignore env/script functions which aren't run until later
            continue
        if line == '# GLOBAL INIT-SCRIPT:':
            # quit once we've hit the global init-script
            break
        if 'HOME' in line:
            # bail if $HOME is used
            raise Exception(f'$HOME found in {line}\n{job_script}')

    # also ensure there is no use of $HOME in the job.sh script
    with open(Path(cylc_flow_file).parent / 'etc/job.sh', 'r') as job_sh:
        job_sh_txt = job_sh.read()
        if 'HOME' in job_sh_txt:
            raise Exception(f'$HOME found in job.sh\n{job_sh_txt}')
Example #9
 def __init__(self, suite, proc_pool, suite_db_mgr, suite_srv_files_mgr,
              task_events_mgr):
     self.suite = suite
     self.proc_pool = proc_pool
     self.suite_db_mgr = suite_db_mgr
     self.task_events_mgr = task_events_mgr
     self.job_file_writer = JobFileWriter()
     self.batch_sys_mgr = self.job_file_writer.batch_sys_mgr
     self.suite_srv_files_mgr = suite_srv_files_mgr
     self.task_remote_mgr = TaskRemoteMgr(
         suite, proc_pool, suite_srv_files_mgr)
Example #10
def test_traps_for_each_job_runner(job_runner: str):
    """Test traps for each job runner"""
    platform = platform_from_name()
    platform.update({
        "job runner": f"{job_runner}",
    })
    job_conf = {"platform": platform, "directives": {}}

    with io.StringIO() as fake_file:
        JobFileWriter()._write_prelude(fake_file, job_conf)
        output = fake_file.getvalue()
        assert ("CYLC_FAIL_SIGNALS='EXIT ERR TERM XCPU" in output)
Example #11
def test_write_suite_environment_no_remote_suite_d(fixture_get_platform,
                                                   monkeypatch):
    """Test suite environment is correctly written in jobscript"""

    monkeypatch.setattr(cylc.flow.job_file, "get_remote_suite_work_dir",
                        lambda a, b: "work/dir")
    cylc.flow.flags.debug = True
    cylc.flow.flags.verbose = True
    suite_env = {'CYLC_UTC': 'True', 'CYLC_CYCLING_MODE': 'integer'}
    job_file_writer = JobFileWriter()
    job_file_writer.set_suite_env(suite_env)
    expected = ('\n\ncylc__job__inst__cylc_env() {\n    # CYLC SUITE '
                'ENVIRONMENT:\n    export CYLC_CYCLING_MODE="integer"\n    '
                'export CYLC_UTC="True"\n    export TZ="UTC"\n\n    export '
                'CYLC_SUITE_RUN_DIR="cylc-run/farm_noises"\n    '
                'export CYLC_SUITE_WORK_DIR_ROOT="work/dir"\n   '
                ' export CYLC_SUITE_UUID="neigh"')
    job_conf = {
        "platform": fixture_get_platform({
            "host": "localhost",
        }),
        "suite_name": "farm_noises",
        "uuid_str": "neigh",
        "remote_suite_d": ""
    }
    rund = "cylc-run/farm_noises"
    with io.StringIO() as fake_file:
        job_file_writer._write_suite_environment(fake_file, job_conf, rund)
        result = fake_file.getvalue()
        assert result == expected
Example #12
def test_write_suite_environment(fixture_get_platform, monkeypatch):
    """Test suite environment is correctly written in jobscript"""
    # set some suite environment conditions
    monkeypatch.setattr(cylc.flow.job_file, "get_remote_suite_work_dir",
                        lambda a, b: "work/dir")
    monkeypatch.setenv('CYLC_SUITE_DEF_PATH', 'cylc/suite/def/path')
    cylc.flow.flags.debug = True
    cylc.flow.flags.verbose = True
    suite_env = {'CYLC_UTC': 'True', 'CYLC_CYCLING_MODE': 'integer'}
    job_file_writer = JobFileWriter()
    job_file_writer.set_suite_env(suite_env)
    # suite env not correctly setting...check this
    expected = ('\n\ncylc__job__inst__cylc_env() {\n    # CYLC SUITE '
                'ENVIRONMENT:\n    export CYLC_CYCLING_MODE="integer"\n  '
                '  export CYLC_UTC="True"\n    export TZ="UTC"\n\n   '
                ' export CYLC_SUITE_RUN_DIR="cylc-run/farm_noises"\n   '
                ' export CYLC_SUITE_WORK_DIR_ROOT="work/dir"\n   '
                ' export CYLC_SUITE_DEF_PATH="remote/suite/dir"\n    expor'
                't CYLC_SUITE_DEF_PATH_ON_SUITE_HOST="cylc/suite/def/path"'
                '\n    export CYLC_SUITE_UUID="neigh"')
    job_conf = {
        "platform": fixture_get_platform({
            "host": "localhost",
            "owner": "me",
        }),
        "suite_name": "farm_noises",
        "remote_suite_d": "remote/suite/dir",
        "uuid_str": "neigh"
    }
    rund = "cylc-run/farm_noises"
    with io.StringIO() as fake_file:
        job_file_writer._write_suite_environment(fake_file, job_conf, rund)
        assert (fake_file.getvalue() == expected)
Example #13
def test_write_workflow_environment(fixture_get_platform, monkeypatch):
    """Test workflow environment is correctly written in jobscript"""
    # set some workflow environment conditions
    monkeypatch.setattr('cylc.flow.flags.verbosity', 2)
    workflow_env = {'CYLC_UTC': 'True',
                    'CYLC_CYCLING_MODE': 'integer'}
    job_file_writer = JobFileWriter()
    job_file_writer.set_workflow_env(workflow_env)
    # workflow env not correctly setting...check this
    expected = ('\n\ncylc__job__inst__cylc_env() {\n    # CYLC WORKFLOW '
                'ENVIRONMENT:\n    export CYLC_CYCLING_MODE="integer"\n  '
                '  export CYLC_UTC="True"\n    export TZ="UTC"\n\n   '
                ' export CYLC_WORKFLOW_RUN_DIR="cylc-run/farm_noises"\n   '
                ' export CYLC_WORKFLOW_UUID="neigh"')
    job_conf = {
        "platform": fixture_get_platform({
            "host": "localhost",
        }),
        "workflow_name": "farm_noises",
        "uuid_str": "neigh"
    }
    rund = "cylc-run/farm_noises"
    with io.StringIO() as fake_file:
        job_file_writer._write_workflow_environment(fake_file, job_conf, rund)
        result = fake_file.getvalue()
        assert result == expected
Example #14
def test_traps_for_each_batch_system(batch_sys: str):
    """Test traps for each batch system"""
    platform = platform_from_name()
    platform.update({"batch system": f"{batch_sys}", "owner": "me"})
    job_conf = {"platform": platform, "directives": {}}

    with io.StringIO() as fake_file:
        JobFileWriter()._write_prelude(fake_file, job_conf)
        output = fake_file.getvalue()
        if batch_sys == "slurm":
            assert ("CYLC_FAIL_SIGNALS='EXIT ERR XCPU" in output)
        else:
            assert ("CYLC_FAIL_SIGNALS='EXIT ERR TERM XCPU" in output)
Example #15
 def test_write_prelude_invalid_cylc_command(self, mocked_glbl_cfg):
     job_conf = {
         "batch_system_name": "background",
         "host": "localhost",
         "owner": "me"
     }
     mocked = mock.MagicMock()
     mocked_glbl_cfg.return_value = mocked
     mocked.get_host_item.return_value = 'cylc-testing'
     with self.assertRaises(ValueError) as ex:
         with TemporaryFile(mode="w+") as handle:
             JobFileWriter()._write_prelude(handle, job_conf)
     self.assertIn("bad cylc executable", str(ex.exception))
Example #16
def test_write_prelude_invalid_cylc_command():
    job_conf = {
        "platform": {
            "batch system": "background",
            "hosts": ["localhost"],
            "owner": "me",
            "cylc executable": "sl -a"
        }
    }
    with pytest.raises(ValueError) as ex:
        with TemporaryFile(mode="w+") as handle:
            JobFileWriter()._write_prelude(handle, job_conf)
    assert "bad cylc executable" in str(ex.value)
Example #17
def test_write_epilogue():
    """Test epilogue is correctly written in jobscript"""
    expected = '\n' + dedent('''
        CYLC_RUN_DIR="${CYLC_RUN_DIR:-$HOME/cylc-run}"
        . "${CYLC_RUN_DIR}/${CYLC_WORKFLOW_ID}/.service/etc/job.sh"
        cylc__job__main

        #EOF: 1/moo/01
    ''')
    job_conf = {'job_d': "1/moo/01"}
    with io.StringIO() as fake_file:
        JobFileWriter()._write_epilogue(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #18
def test_write_runtime_environment():
    """Test runtime environment is correctly written in jobscript"""

    expected = (
        '\n\ncylc__job__inst__user_env() {\n    # TASK RUNTIME '
        'ENVIRONMENT:\n    export cow sheep duck\n'
        '    cow=~/"moo"\n    sheep=~baa/"baa"\n    '
        'duck=~quack\n}')
    job_conf = {
        'environment': {'cow': '~/moo',
                        'sheep': '~baa/baa',
                        'duck': '~quack'}
    }
    with io.StringIO() as fake_file:
        JobFileWriter()._write_runtime_environment(fake_file, job_conf)
        assert(fake_file.getvalue() == expected)
Example #19
def test_write_global_init_scripts(fixture_get_platform):
    """Test global init script is correctly written in jobscript"""

    job_conf = {
        "platform":
        fixture_get_platform({
            "global init-script": ('global init-script = \n'
                                   'export COW=moo\n'
                                   'export PIG=oink\n'
                                   'export DONKEY=HEEHAW\n')
        })
    }
    expected = ('\n\ncylc__job__inst__global_init_script() {\n'
                '# GLOBAL-INIT-SCRIPT:\nglobal init-script = \nexport '
                'COW=moo\nexport PIG=oink\nexport DONKEY=HEEHAW\n\n}')
    with io.StringIO() as fake_file:
        JobFileWriter()._write_global_init_script(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #20
def test_write_global_init_scripts(fixture_get_platform):
    """Test global init script is correctly written in jobscript"""

    job_conf = {
        "platform":
        fixture_get_platform({
            "global init-script": ('export COW=moo\n'
                                   'export PIG=oink\n'
                                   'export DONKEY=HEEHAW\n')
        })
    }
    expected = '\n' + dedent('''
        # GLOBAL INIT-SCRIPT:
        export COW=moo
        export PIG=oink
        export DONKEY=HEEHAW
    ''')
    with io.StringIO() as fake_file:
        JobFileWriter()._write_global_init_script(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #21
def test_no_script_section_with_comment_only_script():
    """Test jobfilewriter does not generate script section when script is
        comment only"""
    expected = ("")

    job_conf = {
        "init-script": "",
        "env-script": "",
        "err-script": "",
        "pre-script": "#This is the pre script/n #moo /n#baa",
        "script": "",
        "post-script": "",
        "exit-script": ""
    }

    with io.StringIO() as fake_file:
        JobFileWriter()._write_script(fake_file, job_conf)
        assert fake_file.getvalue() == expected
Example #22
def test_write(fixture_get_platform):
    """Test write function outputs jobscript file correctly."""
    with NamedTemporaryFile() as local_job_file_path:
        local_job_file_path = local_job_file_path.name
        platform = fixture_get_platform({
            "job runner command template": "woof",
        })
        job_conf = {
            "platform": platform,
            "task_id": "baa",
            "workflow_name": "farm_noises",
            "work_d": "farm_noises/work_d",
            "uuid_str": "neigh",
            'environment': {
                'cow': '~/moo',
                'sheep': '~baa/baa',
                'duck': '~quack'
            },
            "job_d": "1/baa/01",
            "try_num": 1,
            "flow_nums": {1},
            # "job_runner_name": "background",
            "param_var": {
                "duck": "quack",
                "mouse": "squeak"
            },
            "execution_time_limit": "moo",
            "namespace_hierarchy": ["root", "baa", "moo"],
            "dependencies": ['moo', 'neigh', 'quack'],
            "init-script": "This is the init script",
            "env-script": "This is the env script",
            "err-script": "This is the err script",
            "pre-script": "This is the pre script",
            "script": "This is the script",
            "post-script": "This is the post script",
            "exit-script": "This is the exit script",
        }
        JobFileWriter().write(local_job_file_path, job_conf)

        assert (os.path.exists(local_job_file_path))
        size_of_file = os.stat(local_job_file_path).st_size
        # This test only needs to check that the file is created and is
        # non-empty as each section is covered by individual unit tests.
        assert (size_of_file > 10)
    """Test the header is correctly written"""

    expected = ('#!/bin/bash -l\n#\n# ++++ THIS IS A CYLC TASK JOB SCRIPT '
                '++++\n# Workflow: farm_noises\n# Task: baa\n# Job '
                'log directory: 1/baa/01\n# Job submit method: '
                'background\n# Job submit command template: woof\n#'
                ' Execution time limit: moo')

    platform = fixture_get_platform({"job runner command template": "woof"})
    job_conf = {
        "platform": platform,
        "job runner": "background",
        "execution_time_limit": "moo",
        "workflow_name": "farm_noises",
        "task_id": "baa",
        "job_d": "1/baa/01"
    }

    with io.StringIO() as fake_file:
        JobFileWriter()._write_header(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #23
 def test_get_variable_value_definition(self):
     """Test the value for single/tilde variables are correctly quoted"""
     for in_value, out_value in TILDE_IN_OUT:
         res = JobFileWriter._get_variable_value_definition(in_value)
         self.assertEqual(out_value, res)
Example #24
def test_get_variable_value_definition(in_value, out_value):
    """Test the value for single/tilde variables are correctly quoted, and
    parameter environment templates are handled"""
    param_dict = {'i': 3}
    res = JobFileWriter._get_variable_value_definition(in_value, param_dict)
    assert (out_value == res)
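As with Example #10, the (in_value, out_value) arguments imply a parametrize decorator that the excerpt omits. A minimal sketch, assuming it reuses the TILDE_IN_OUT table from the unittest versions (Examples #23 and #25) plus any parameter-template cases:

@pytest.mark.parametrize('in_value,out_value', TILDE_IN_OUT)
def test_get_variable_value_definition(in_value, out_value):
    ...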
Example #25
 def test_get_variable_value_definition(self):
     """Test the value for single/tilde variables are correctly quoted"""
     for in_value, out_value in TILDE_IN_OUT:
         res = JobFileWriter._get_variable_value_definition(in_value)
         self.assertEqual(out_value, res)
Example #26
class TaskJobManager(object):
    """Manage task job submit, poll and kill.

    This class provides logic to:
    * Submit task jobs.
    * Poll task jobs.
    * Kill task jobs.
    * Set up the directory structure on job hosts.
    * Install suite communicate client files on job hosts.
    * Remove suite contact files on job hosts.
    """

    JOBS_KILL = 'jobs-kill'
    JOBS_POLL = 'jobs-poll'
    JOBS_SUBMIT = SubProcPool.JOBS_SUBMIT
    POLL_FAIL = 'poll failed'
    REMOTE_SELECT_MSG = 'waiting for remote host selection'
    REMOTE_INIT_MSG = 'remote host initialising'
    KEY_EXECUTE_TIME_LIMIT = TaskEventsManager.KEY_EXECUTE_TIME_LIMIT

    def __init__(self, suite, proc_pool, suite_db_mgr, suite_srv_files_mgr,
                 task_events_mgr):
        self.suite = suite
        self.proc_pool = proc_pool
        self.suite_db_mgr = suite_db_mgr
        self.task_events_mgr = task_events_mgr
        self.job_file_writer = JobFileWriter()
        self.batch_sys_mgr = self.job_file_writer.batch_sys_mgr
        self.suite_srv_files_mgr = suite_srv_files_mgr
        self.task_remote_mgr = TaskRemoteMgr(
            suite, proc_pool, suite_srv_files_mgr)

    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info(
                        '[%s] -poll now, (next in %s)',
                        itask, itask.poll_timer.delay_timeout_as_str())
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)

    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        Tasks that are not in an active state are skipped with a warning.

        """
        to_kill_tasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                to_kill_tasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(
            self.JOBS_KILL, suite, to_kill_tasks,
            self._kill_task_jobs_callback)

    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. And don't poll
        succeeded tasks by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as help/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = set([
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, to_poll_tasks,
                self._poll_task_jobs_callback)

    def prep_submit_task_jobs(self, suite, itasks, dry_run=False,
                              check_syntax=True):
        """Prepare task jobs for submit.

        Prepare tasks where possible. Ignore tasks that are waiting for host
        select command to complete. Bad host select command or error writing to
        a job file will cause a bad task - leading to submission failure.

        Return [list, list]: list of good tasks, list of bad tasks
        """
        prepared_tasks = []
        bad_tasks = []
        for itask in itasks:
            prep_task = self._prep_submit_task_job(suite, itask, dry_run,
                                                   check_syntax=check_syntax)
            if prep_task:
                prepared_tasks.append(itask)
            elif prep_task is False:
                bad_tasks.append(itask)
        return [prepared_tasks, bad_tasks]

    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. Bad host select command, error writing to a job file or
        bad remote initialisation will cause a bad task - leading to submission
        failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%02d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
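            # Illustrative arithmetic (not from the original source): with 250
            # prepared tasks, (250 // 100) + 1 == 3, so chunk_size is
            # 250 // 3 + 1 == 84, giving batches of 84, 84 and 82 tasks --
            # each within the ~100-task target per "cylc jobs-submit" call.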
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks

    @staticmethod
    def _create_job_log_path(suite, itask):
        """Create job log directory for a task job, etc.

        Create local job directory, and NN symbolic link.
        If this is the first submission (NN will point to 01), remove any
        numbered directories with submit numbers greater than 01.
        Return a string in the form "POINT/NAME/SUBMIT_NUM".

        """
        job_file_dir = get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        task_log_dir = os.path.dirname(job_file_dir)
        if itask.submit_num == 1:
            try:
                names = os.listdir(task_log_dir)
            except OSError:
                pass
            else:
                for name in names:
                    if name not in ["01", NN]:
                        rmtree(
                            os.path.join(task_log_dir, name),
                            ignore_errors=True)
        else:
            rmtree(job_file_dir, ignore_errors=True)

        os.makedirs(job_file_dir, exist_ok=True)
        target = os.path.join(task_log_dir, NN)
        source = os.path.basename(job_file_dir)
        try:
            prev_source = os.readlink(target)
        except OSError:
            prev_source = None
        if prev_source == source:
            return
        try:
            if prev_source:
                os.unlink(target)
            os.symlink(source, target)
        except OSError as exc:
            if not exc.filename:
                exc.filename = target
            raise exc

    @staticmethod
    def _get_job_scripts(itask, rtconfig):
        """Return pre-script, script, post-script for a job."""
        script = rtconfig['script']
        pre_script = rtconfig['pre-script']
        post_script = rtconfig['post-script']
        if itask.tdef.suite_polling_cfg:
            # Automatic suite state polling script
            comstr = "cylc suite-state " + \
                     " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                     " --point=" + str(itask.point)
            if LOG.isEnabledFor(DEBUG):
                comstr += ' --debug'
            for key, fmt in [
                    ('user', ' --%s=%s'),
                    ('host', ' --%s=%s'),
                    ('interval', ' --%s=%d'),
                    ('max-polls', ' --%s=%s'),
                    ('run-dir', ' --%s=%s')]:
                if rtconfig['suite state polling'][key]:
                    comstr += fmt % (key, rtconfig['suite state polling'][key])
            if rtconfig['suite state polling']['message']:
                comstr += " --message='%s'" % (
                    rtconfig['suite state polling']['message'])
            else:
                comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
            comstr += " " + itask.tdef.suite_polling_cfg['suite']
            script = "echo " + comstr + "\n" + comstr
        return pre_script, script, post_script

    @staticmethod
    def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
        """Callback on job command STDOUT/STDERR."""
        if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
            owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("host"):
            owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("user"):
            owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
        else:
            owner_at_host = ""
        try:
            timestamp, _, content = line.split("|")
        except ValueError:
            pass
        else:
            line = "%s %s" % (timestamp, content)
        job_activity_log = get_task_job_activity_log(
            suite, itask.point, itask.tdef.name)
        try:
            with open(job_activity_log, "ab") as handle:
                if not line.endswith("\n"):
                    line += "\n"
                handle.write((owner_at_host + line).encode())
        except IOError as exc:
            LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
            LOG.warning("[%s] -%s%s", itask, owner_at_host, line)

    def _kill_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when kill tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._kill_task_job_callback,
            {self.batch_sys_mgr.OUT_PREFIX_COMMAND: self._job_cmd_out_callback}
        )

    def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _kill_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_KILL, None)
        ctx.out = line
        try:
            ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
        log_lvl = INFO
        log_msg = 'killed'
        if ctx.ret_code:  # non-zero exit status
            log_lvl = WARNING
            log_msg = 'kill failed'
            itask.state.kill_failed = True
        elif itask.state.status == TASK_STATUS_SUBMITTED:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
        elif itask.state.status == TASK_STATUS_RUNNING:
            self.task_events_mgr.process_message(
                itask, CRITICAL, TASK_OUTPUT_FAILED)
        else:
            log_lvl = DEBUG
            log_msg = (
                'ignoring job kill result, unexpected task state: %s' %
                itask.state.status)
        itask.set_summary_message(log_msg)
        LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
            itask.identity, itask.submit_num, log_msg))

    def _manip_task_jobs_callback(
            self, ctx, suite, itasks, summary_callback, more_callbacks=None):
        """Callback when submit/poll/kill tasks command exits."""
        if ctx.ret_code:
            LOG.error(ctx)
        else:
            LOG.debug(ctx)
        # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
        #
        # Note for "reload": A TaskProxy instance may be replaced on reload, so
        # the "itasks" list may not reference the TaskProxy objects that
        # replace the old ones. The .reload_successor attribute provides the
        # link(s) for us to get to the latest replacement.
        #
        # Note for "kill": It is possible for a job to trigger its trap and
        # report back to the suite before this logic is called. If so, the task
        # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
        # its output line will be ignored here.
        tasks = {}
        for itask in itasks:
            while itask.reload_successor is not None:
                itask = itask.reload_successor
            if itask.point is not None and itask.submit_num:
                submit_num = "%02d" % (itask.submit_num)
                tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
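                # e.g. tasks[("20200101T0000Z", "foo", "01")] -> <TaskProxy>
                # (cycle point and task name above are illustrative)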
        handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
        if more_callbacks:
            for prefix, callback in more_callbacks.items():
                handlers.append((prefix, callback))
        out = ctx.out
        if not out:
            out = ""
        bad_tasks = dict(tasks)
        for line in out.splitlines(True):
            for prefix, callback in handlers:
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    try:
                        path = line.split("|", 2)[1]  # timestamp, path, status
                        point, name, submit_num = path.split(os.sep, 2)
                        if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                            del bad_tasks[(point, name, submit_num)]
                        itask = tasks[(point, name, submit_num)]
                        callback(suite, itask, ctx, line)
                    except (LookupError, ValueError, KeyError) as exc:
                        LOG.warning(
                            'Unhandled %s output: %s', ctx.cmd_key, line)
                        LOG.exception(exc)
        # Task jobs that are in the original command but did not get a status
        # in the output. Handle as failures.
        for key, itask in sorted(bad_tasks.items()):
            line = (
                "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
            summary_callback(suite, itask, ctx, line)

    def _poll_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when poll tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._poll_task_job_callback,
            {self.batch_sys_mgr.OUT_PREFIX_MESSAGE:
             self._poll_task_job_message_callback})

    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.flow.batch_sys_manager.JobPollContext
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x]) for
                    x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
            except (ValueError, IndexError):
                itask.set_summary_message(self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.FLAG_POLLED
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
                flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
                flag)

    def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on message of one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        try:
            event_time, severity, message = line.split("|")[2:5]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = 0
            self.task_events_mgr.process_message(
                itask, severity, message, event_time,
                self.task_events_mgr.FLAG_POLLED)
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SubProcContext(cmd_key, cmd), callback, [suite, itasks])

    @staticmethod
    def _set_retry_timers(itask, rtconfig=None):
        """Set try number and retry delays."""
        if rtconfig is None:
            rtconfig = itask.tdef.rtconfig
        try:
            no_retry = (
                rtconfig[itask.tdef.run_mode + ' mode']['disable retries'])
        except KeyError:
            no_retry = False
        if not no_retry:
            for key, cfg_key in [
                    (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
                    (TASK_STATUS_RETRYING, 'execution retry delays')]:
                delays = rtconfig['job'][cfg_key]
                if delays is None:
                    delays = []
                try:
                    itask.try_timers[key].set_delays(delays)
                except KeyError:
                    itask.try_timers[key] = TaskActionTimer(delays=delays)

    def _simulation_submit_task_jobs(self, itasks):
        """Simulation mode task jobs submission."""
        for itask in itasks:
            self._set_retry_timers(itask)
            itask.task_host = 'SIMULATION'
            itask.task_owner = 'SIMULATION'
            itask.summary['batch_sys_name'] = 'SIMULATION'
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = (
                itask.tdef.rtconfig['job']['simulated run length'])
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED)
        return itasks

    def _submit_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when submit task jobs command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._submit_task_job_callback,
            {self.batch_sys_mgr.OUT_PREFIX_COMMAND: self._job_cmd_out_callback}
        )

    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        try:
            itask.summary['submit_method_id'] = items[3]
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)

    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.set_summary_message(self.REMOTE_SELECT_MSG)
                return
            itask.task_host = task_host
            # Submit number not yet incremented
            itask.submit_num += 1
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = get_task_job_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num)
            self.job_file_writer.write(local_job_file_path, job_conf,
                                       check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            itask.set_summary_message('job file written (edit/dry-run)')
            LOG.debug('[%s] -%s', itask, itask.summary['latest_message'])

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask

    def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
        """Helper for self._prep_submit_task_job. On error."""
        LOG.debug("submit_num %s" % itask.submit_num)
        LOG.debug(traceback.format_exc())
        LOG.error(exc)
        log_task_job_activity(
            SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
            suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
        if not dry_run:
            # Persist
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': get_current_time_string(),
                'batch_sys_name': itask.summary.get('batch_sys_name'),
            })
            itask.is_manual_submit = False
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

    def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
        """Helper for self._prep_submit_task_job."""
        itask.task_owner = rtconfig['remote']['owner']
        if itask.task_owner:
            owner_at_host = itask.task_owner + "@" + itask.task_host
        else:
            owner_at_host = itask.task_host
        itask.summary['host'] = owner_at_host
        itask.summary['job_hosts'][itask.submit_num] = owner_at_host

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(
                os.path.expanduser(os.path.expandvars(name)))
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[itask.summary['batch_sys_name']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            glbl_cfg().get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, JOB_LOG_JOB)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'dependencies': itask.state.get_resolved_dependencies(),
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'exit-script': rtconfig['exit-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_env_tmpl': rtconfig['parameter environment templates'],
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'uuid_str': self.task_remote_mgr.uuid_str,
            'work_d': rtconfig['work sub-directory'],
        }
Example #27
0
def test_write_directives(fixture_get_platform, job_conf: dict, expected: str):
    """"Test the directives section of job script file is correctly
        written"""
    with io.StringIO() as fake_file:
        JobFileWriter()._write_directives(fake_file, job_conf)
        assert (fake_file.getvalue() == expected)
Example #28
0
class TaskJobManager(object):
    """Manage task job submit, poll and kill.

    This class provides logic to:
    * Submit task jobs.
    * Poll task jobs.
    * Kill task jobs.
    * Set up the directory structure on job hosts.
    * Install suite communication client files on job hosts.
    * Remove suite contact files on job hosts.
    """

    JOBS_KILL = 'jobs-kill'
    JOBS_POLL = 'jobs-poll'
    JOBS_SUBMIT = SubProcPool.JOBS_SUBMIT
    POLL_FAIL = 'poll failed'
    REMOTE_SELECT_MSG = 'waiting for remote host selection'
    REMOTE_INIT_MSG = 'remote host initialising'
    DRY_RUN_MSG = 'job file written (edit/dry-run)'
    KEY_EXECUTE_TIME_LIMIT = TaskEventsManager.KEY_EXECUTE_TIME_LIMIT

    def __init__(self, suite, proc_pool, suite_db_mgr, suite_srv_files_mgr,
                 task_events_mgr, job_pool):
        self.suite = suite
        self.proc_pool = proc_pool
        self.suite_db_mgr = suite_db_mgr
        self.task_events_mgr = task_events_mgr
        self.job_pool = job_pool
        self.job_file_writer = JobFileWriter()
        self.batch_sys_mgr = self.job_file_writer.batch_sys_mgr
        self.suite_srv_files_mgr = suite_srv_files_mgr
        self.task_remote_mgr = TaskRemoteMgr(suite, proc_pool,
                                             suite_srv_files_mgr)

    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info('[%s] -poll now, (next in %s)', itask,
                             itask.poll_timer.delay_timeout_as_str())
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)

    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        Only tasks in an active (killable) state are killed; other tasks are
        skipped with a warning.

        """
        to_kill_tasks = []
        for itask in itasks:
            if itask.state(*TASK_STATUSES_ACTIVE):
                itask.state.reset(is_held=True)
                to_kill_tasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(self.JOBS_KILL, suite, to_kill_tasks,
                          self._kill_task_jobs_callback)

    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. Succeeded
        tasks are polled only if poll_succ is True.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() handles the result for one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = {
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED
        }
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state(*pollable_statuses):
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(self.JOBS_POLL, suite, to_poll_tasks,
                              self._poll_task_jobs_callback)

    def prep_submit_task_jobs(self,
                              suite,
                              itasks,
                              dry_run=False,
                              check_syntax=True):
        """Prepare task jobs for submit.

        Prepare tasks where possible. Ignore tasks that are waiting for host
        select command to complete. Bad host select command or error writing to
        a job file will cause a bad task - leading to submission failure.

        Return [list, list]: list of good tasks, list of bad tasks
        """
        prepared_tasks = []
        bad_tasks = []
        for itask in itasks:
            prep_task = self._prep_submit_task_job(suite,
                                                   itask,
                                                   dry_run,
                                                   check_syntax=check_syntax)
            if prep_task:
                prepared_tasks.append(itask)
            elif prep_task is False:
                bad_tasks.append(itask)
        return [prepared_tasks, bad_tasks]

    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. Bad host select command, error writing to a job file or
        bad remote initialisation will cause a bad task - leading to submission
        failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.job_pool.add_job_msg(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                    and not is_remote_host(host)):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, owner@host=%s', itask,
                         itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'user_at_host': owner_at_host,
                        'batch_sys_name': itask.summary['batch_sys_name'],
                    })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % owner_at_host,
                                       err=REMOTE_INIT_FAILED,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [('host', host, is_remote_host),
                                          ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
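            # Illustrative arithmetic: 250 prepared tasks with the hard-coded
            # target of ~100 per batch give chunk_size = 250 // 3 + 1 = 84,
            # i.e. batch sizes [84, 84, 82].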
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(suite, itask.point,
                                                 itask.tdef.name,
                                                 itask.submit_num))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(self.JOBS_SUBMIT,
                                   cmd + job_log_dirs,
                                   stdin_files=stdin_files,
                                   job_log_dirs=job_log_dirs,
                                   **kwargs), self._submit_task_jobs_callback,
                    [suite, itasks_batch])
        return done_tasks

    @staticmethod
    def _create_job_log_path(suite, itask):
        """Create job log directory for a task job, etc.

        Create local job directory, and NN symbolic link.
        If NN => 01, remove numbered directories with submit numbers greater
        than 01.
        Return a string in the form "POINT/NAME/SUBMIT_NUM".

        """
        job_file_dir = get_task_job_log(suite, itask.point, itask.tdef.name,
                                        itask.submit_num)
        task_log_dir = os.path.dirname(job_file_dir)
        if itask.submit_num == 1:
            try:
                names = os.listdir(task_log_dir)
            except OSError:
                pass
            else:
                for name in names:
                    if name not in ["01", NN]:
                        rmtree(os.path.join(task_log_dir, name),
                               ignore_errors=True)
        else:
            rmtree(job_file_dir, ignore_errors=True)

        os.makedirs(job_file_dir, exist_ok=True)
        target = os.path.join(task_log_dir, NN)
        source = os.path.basename(job_file_dir)
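        # "source" is the basename of the latest numbered submit directory
        # (e.g. "03"); "target" is the sibling "NN" symlink, refreshed below
        # so that it always points at the newest submit.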
        try:
            prev_source = os.readlink(target)
        except OSError:
            prev_source = None
        if prev_source == source:
            return
        try:
            if prev_source:
                os.unlink(target)
            os.symlink(source, target)
        except OSError as exc:
            if not exc.filename:
                exc.filename = target
            raise exc

    @staticmethod
    def _get_job_scripts(itask, rtconfig):
        """Return pre-script, script, post-script for a job."""
        script = rtconfig['script']
        pre_script = rtconfig['pre-script']
        post_script = rtconfig['post-script']
        if itask.tdef.suite_polling_cfg:
            # Automatic suite state polling script
            comstr = "cylc suite-state " + \
                     " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                     " --point=" + str(itask.point)
            if LOG.isEnabledFor(DEBUG):
                comstr += ' --debug'
            for key, fmt in [('user', ' --%s=%s'), ('host', ' --%s=%s'),
                             ('interval', ' --%s=%d'),
                             ('max-polls', ' --%s=%s'),
                             ('run-dir', ' --%s=%s')]:
                if rtconfig['suite state polling'][key]:
                    comstr += fmt % (key, rtconfig['suite state polling'][key])
            if rtconfig['suite state polling']['message']:
                comstr += " --message='%s'" % (
                    rtconfig['suite state polling']['message'])
            else:
                comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
            comstr += " " + itask.tdef.suite_polling_cfg['suite']
            script = "echo " + comstr + "\n" + comstr
        return pre_script, script, post_script

    @staticmethod
    def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
        """Callback on job command STDOUT/STDERR."""
        if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
            owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("host"):
            owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("user"):
            owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
        else:
            owner_at_host = ""
        try:
            timestamp, _, content = line.split("|")
        except ValueError:
            pass
        else:
            line = "%s %s" % (timestamp, content)
        job_activity_log = get_task_job_activity_log(suite, itask.point,
                                                     itask.tdef.name)
        try:
            with open(job_activity_log, "ab") as handle:
                if not line.endswith("\n"):
                    line += "\n"
                handle.write((owner_at_host + line).encode())
        except IOError as exc:
            LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
            LOG.warning("[%s] -%s%s", itask, owner_at_host, line)

    def _kill_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when kill tasks command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._kill_task_job_callback, {
                self.batch_sys_mgr.OUT_PREFIX_COMMAND:
                self._job_cmd_out_callback
            })

    def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _kill_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_KILL, None)
        ctx.out = line
        try:
            ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
        log_lvl = INFO
        log_msg = 'killed'
        if ctx.ret_code:  # non-zero exit status
            log_lvl = WARNING
            log_msg = 'kill failed'
            itask.state.kill_failed = True
        elif itask.state(TASK_STATUS_SUBMITTED):
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
        elif itask.state(TASK_STATUS_RUNNING):
            self.task_events_mgr.process_message(itask, CRITICAL,
                                                 TASK_OUTPUT_FAILED)
        else:
            log_lvl = DEBUG
            log_msg = ('ignoring job kill result, unexpected task state: %s' %
                       itask.state.status)
        itask.set_summary_message(log_msg)
        self.job_pool.add_job_msg(
            get_task_job_id(itask.point, itask.tdef.name, itask.submit_num),
            log_msg)
        LOG.log(
            log_lvl,
            "[%s] -job(%02d) %s" % (itask.identity, itask.submit_num, log_msg))

    def _manip_task_jobs_callback(self,
                                  ctx,
                                  suite,
                                  itasks,
                                  summary_callback,
                                  more_callbacks=None):
        """Callback when submit/poll/kill tasks command exits."""
        if ctx.ret_code:
            LOG.error(ctx)
        else:
            LOG.debug(ctx)
        # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
        #
        # Note for "reload": A TaskProxy instance may be replaced on reload, so
        # the "itasks" list may not reference the TaskProxy objects that
        # replace the old ones. The .reload_successor attribute provides the
        # link(s) for us to get to the latest replacement.
        #
        # Note for "kill": It is possible for a job to trigger its trap and
        # report back to the suite back this logic is called. If so, the task
        # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
        # its output line will be ignored here.
        tasks = {}
        for itask in itasks:
            while itask.reload_successor is not None:
                itask = itask.reload_successor
            if itask.point is not None and itask.submit_num:
                submit_num = "%02d" % (itask.submit_num)
                tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
        handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
        if more_callbacks:
            for prefix, callback in more_callbacks.items():
                handlers.append((prefix, callback))
        out = ctx.out
        if not out:
            out = ""
        bad_tasks = dict(tasks)
        for line in out.splitlines(True):
            for prefix, callback in handlers:
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    try:
                        path = line.split("|", 2)[1]  # timestamp, path, status
                        point, name, submit_num = path.split(os.sep, 2)
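                        # e.g. a summary line "2020-01-01T00:00:00Z|1/foo/01|0"
                        # gives path "1/foo/01" -> ("1", "foo", "01").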
                        if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                            del bad_tasks[(point, name, submit_num)]
                        itask = tasks[(point, name, submit_num)]
                        callback(suite, itask, ctx, line)
                    except (LookupError, ValueError, KeyError) as exc:
                        LOG.warning('Unhandled %s output: %s', ctx.cmd_key,
                                    line)
                        LOG.exception(exc)
        # Task jobs that are in the original command but did not get a status
        # in the output. Handle as failures.
        for key, itask in sorted(bad_tasks.items()):
            line = ("|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
            summary_callback(suite, itask, ctx, line)

    def _poll_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when poll tasks command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._poll_task_job_callback, {
                self.batch_sys_mgr.OUT_PREFIX_MESSAGE:
                self._poll_task_job_message_callback
            })

    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.flow.batch_sys_manager.JobPollContext
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x])
                    for x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
            except (ValueError, IndexError):
                itask.set_summary_message(self.POLL_FAIL)
                self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.FLAG_POLLED
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUCCEEDED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 get_current_time_string(),
                                                 flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_STATUS_SUBMITTED,
                                                 jp_ctx.time_submit_exit, flag)

    def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on message of one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        try:
            event_time, severity, message = line.split("|")[2:5]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = 0
            self.task_events_mgr.process_message(
                itask, severity, message, event_time,
                self.task_events_mgr.FLAG_POLLED)
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(
                    get_task_job_id(itask.point, itask.tdef.name,
                                    itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(SubProcContext(cmd_key, cmd), callback,
                                       [suite, itasks])

    @staticmethod
    def _set_retry_timers(itask, rtconfig=None):
        """Set try number and retry delays."""
        if rtconfig is None:
            rtconfig = itask.tdef.rtconfig
        try:
            no_retry = (rtconfig[itask.tdef.run_mode +
                                 ' mode']['disable retries'])
        except KeyError:
            no_retry = False
        if not no_retry:
            for key, cfg_key in [
                (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
                (TASK_STATUS_RETRYING, 'execution retry delays')
            ]:
                delays = rtconfig['job'][cfg_key]
                if delays is None:
                    delays = []
                try:
                    itask.try_timers[key].set_delays(delays)
                except KeyError:
                    itask.try_timers[key] = TaskActionTimer(delays=delays)

    def _simulation_submit_task_jobs(self, itasks):
        """Simulation mode task jobs submission."""
        for itask in itasks:
            self._set_retry_timers(itask)
            itask.task_host = 'SIMULATION'
            itask.task_owner = 'SIMULATION'
            itask.summary['batch_sys_name'] = 'SIMULATION'
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = (
                itask.tdef.rtconfig['job']['simulated run length'])
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUBMITTED)
        return itasks

    def _submit_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when submit task jobs command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._submit_task_job_callback, {
                self.batch_sys_mgr.OUT_PREFIX_COMMAND:
                self._job_cmd_out_callback
            })

    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        try:
            itask.summary['submit_method_id'] = items[3]
            self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', items[3])
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUBMITTED,
                                                 ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)

    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(suite, itask, dry_run,
                                             '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.set_summary_message(self.REMOTE_SELECT_MSG)
                return
            itask.task_host = task_host
            # Submit number not yet incremented
            itask.submit_num += 1
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = get_task_job_job_log(suite, itask.point,
                                                       itask.tdef.name,
                                                       itask.submit_num)
            self.job_file_writer.write(local_job_file_path,
                                       job_conf,
                                       check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(suite, itask, dry_run,
                                             '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        job_config = deepcopy(job_conf)
        job_config['logfiles'] = deepcopy(itask.summary['logfiles'])
        job_config['job_log_dir'] = get_task_job_log(suite, itask.point,
                                                     itask.tdef.name,
                                                     itask.submit_num)
        itask.jobs.append(job_config['job_d'])
        self.job_pool.insert_job(job_config)

        if dry_run:
            itask.set_summary_message(self.DRY_RUN_MSG)
            self.job_pool.add_job_msg(job_config['job_d'], self.DRY_RUN_MSG)
            LOG.debug(f'[{itask}] -{self.DRY_RUN_MSG}')

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask

    def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
        """Helper for self._prep_submit_task_job. On error."""
        LOG.debug("submit_num %s" % itask.submit_num)
        LOG.debug(traceback.format_exc())
        LOG.error(exc)
        log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                             action,
                                             err=exc,
                                             ret_code=1),
                              suite,
                              itask.point,
                              itask.tdef.name,
                              submit_num=itask.submit_num)
        if not dry_run:
            # Persist
            self.suite_db_mgr.put_insert_task_jobs(
                itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': get_current_time_string(),
                    'batch_sys_name': itask.summary.get('batch_sys_name'),
                })
            itask.is_manual_submit = False
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

    def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
        """Helper for self._prep_submit_task_job."""
        itask.task_owner = rtconfig['remote']['owner']
        if itask.task_owner:
            owner_at_host = itask.task_owner + "@" + itask.task_host
        else:
            owner_at_host = itask.task_host
        itask.summary['host'] = owner_at_host
        itask.summary['job_hosts'][itask.submit_num] = owner_at_host

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(
                os.path.expanduser(os.path.expandvars(name)))
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[itask.summary['batch_sys_name']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = get_remote_suite_run_job_dir(itask.task_host,
                                                     itask.task_owner, suite,
                                                     job_d, JOB_LOG_JOB)
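        # The resulting path sits under the remote run directory, e.g.
        # (illustrative) "$HOME/cylc-run/<suite>/log/job/1/foo/01/job"
        # as seen from the task host.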
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'dependencies': itask.state.get_resolved_dependencies(),
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'exit-script': rtconfig['exit-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_env_tmpl': rtconfig['parameter environment templates'],
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'uuid_str': self.task_remote_mgr.uuid_str,
            'work_d': rtconfig['work sub-directory'],
        }
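The submission loop in the example above groups prepared tasks by (host, owner) and then chops each group into batches before handing them to "cylc jobs-submit". Below is a minimal, self-contained sketch of that grouping-and-chunking pattern; the Task tuple and the task list are illustrative stand-ins, not real Cylc task proxies:
from collections import namedtuple

# Stand-in for a prepared task proxy; only the fields the grouping needs.
Task = namedtuple('Task', 'identity task_host task_owner')

tasks = [
    Task('foo.1', 'hpc1', 'alice'),
    Task('bar.1', 'hpc1', 'alice'),
    Task('baz.1', 'localhost', ''),
]

# Group by (host, owner), as submit_task_jobs does above.
auth_itasks = {}
for itask in tasks:
    auth_itasks.setdefault((itask.task_host, itask.task_owner), []).append(itask)

# Chop each group into batches of roughly 100 (the hard-coded target above)
# to avoid overloading the stdout/stderr pipes of "cylc jobs-submit".
for (host, owner), group in sorted(auth_itasks.items()):
    group = sorted(group, key=lambda t: t.identity)
    chunk_size = len(group) // ((len(group) // 100) + 1) + 1
    batches = [group[i:i + chunk_size] for i in range(0, len(group), chunk_size)]
    print(host, owner, [len(b) for b in batches])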
Example #29
0
class TaskJobManager:
    """Manage task job submit, poll and kill.

    This class provides logic to:
    * Submit task jobs.
    * Poll task jobs.
    * Kill task jobs.
    * Set up the directory structure on job hosts.
    * Install suite communication client files on job hosts.
    * Remove suite contact files on job hosts.
    """

    JOBS_KILL = 'jobs-kill'
    JOBS_POLL = 'jobs-poll'
    JOBS_SUBMIT = SubProcPool.JOBS_SUBMIT
    POLL_FAIL = 'poll failed'
    REMOTE_SELECT_MSG = 'waiting for remote host selection'
    REMOTE_INIT_MSG = 'remote host initialising'
    REMOTE_FILE_INSTALL_MSG = 'file installation in progress'
    KEY_EXECUTE_TIME_LIMIT = TaskEventsManager.KEY_EXECUTE_TIME_LIMIT

    IN_PROGRESS = {
        REMOTE_FILE_INSTALL_IN_PROGRESS: REMOTE_FILE_INSTALL_MSG,
        REMOTE_INIT_IN_PROGRESS: REMOTE_INIT_MSG
    }

    def __init__(self, suite, proc_pool, suite_db_mgr, task_events_mgr,
                 data_store_mgr):
        self.suite = suite
        self.proc_pool = proc_pool
        self.suite_db_mgr = suite_db_mgr
        self.task_events_mgr = task_events_mgr
        self.data_store_mgr = data_store_mgr
        self.job_file_writer = JobFileWriter()
        self.job_runner_mgr = self.job_file_writer.job_runner_mgr
        self.task_remote_mgr = TaskRemoteMgr(suite, proc_pool)

    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if self.task_events_mgr.check_job_time(itask, now):
                poll_tasks.add(itask)
                if itask.poll_timer.delay is not None:
                    LOG.info('[%s] -poll now, (next in %s)', itask,
                             itask.poll_timer.delay_timeout_as_str())
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)

    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        Only tasks in an active (killable) state are killed; other tasks are
        skipped with a warning.

        """
        to_kill_tasks = []
        for itask in itasks:
            if itask.state(*TASK_STATUSES_ACTIVE):
                itask.state.reset(is_held=True)
                self.data_store_mgr.delta_task_held(itask)
                to_kill_tasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(self.JOBS_KILL, suite, to_kill_tasks,
                          self._kill_task_jobs_callback)

    def poll_task_jobs(self, suite, itasks, msg=None):
        """Poll jobs of specified tasks.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() handles the result for one specific job.
        """
        if itasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(self.JOBS_POLL, suite, itasks,
                              self._poll_task_jobs_callback)

    def prep_submit_task_jobs(self, suite, itasks, check_syntax=True):
        """Prepare task jobs for submit.

        Prepare tasks where possible. Ignore tasks that are waiting for host
        select command to complete. Bad host select command or error writing to
        a job file will cause a bad task - leading to submission failure.

        Return [list, list]: list of good tasks, list of bad tasks
        """
        prepared_tasks = []
        bad_tasks = []
        for itask in itasks:
            prep_task = self._prep_submit_task_job(suite,
                                                   itask,
                                                   check_syntax=check_syntax)
            if prep_task:
                prepared_tasks.append(itask)
            elif prep_task is False:
                bad_tasks.append(itask)
        return [prepared_tasks, bad_tasks]

    def submit_task_jobs(self,
                         suite,
                         itasks,
                         curve_auth,
                         client_pub_key_dir,
                         is_simulation=False):
        """Prepare for job submission and submit task jobs.

        Preparation (host selection, remote host init, and remote install)
        is done asynchronously. Newly released tasks may be sent here several
        times until these init subprocesses have returned. Failure during
        preparation is considered to be job submission failure.

        Once preparation has completed or failed, reset .waiting_on_job_prep in
        task instances so the scheduler knows to stop sending them back here.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.subshell_eval_reset()

        if not prepared_tasks:
            return bad_tasks
        auth_itasks = {}  # {platform: [itask, ...], ...}
        for itask in prepared_tasks:
            platform_name = itask.platform['name']
            auth_itasks.setdefault(platform_name, [])
            auth_itasks[platform_name].append(itask)
        # Submit task jobs for each platform
        done_tasks = bad_tasks

        for platform_name, itasks in sorted(auth_itasks.items()):
            platform = itasks[0].platform
            install_target = get_install_target_from_platform(platform)
            ri_map = self.task_remote_mgr.remote_init_map

            if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
                if install_target == get_localhost_install_target():
                    # Skip init and file install for localhost.
                    LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                    ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

                elif install_target not in ri_map:
                    # Remote init not in progress for target, so start it.
                    self.task_remote_mgr.remote_init(platform, curve_auth,
                                                     client_pub_key_dir)
                    for itask in itasks:
                        itask.set_summary_message(self.REMOTE_INIT_MSG)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num),
                            self.REMOTE_INIT_MSG)
                    continue

                elif (ri_map[install_target] == REMOTE_INIT_DONE):
                    # Already done remote init so move on to file install
                    self.task_remote_mgr.file_install(platform)
                    continue

                elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                    # Remote init or file install in progress.
                    for itask in itasks:
                        msg = self.IN_PROGRESS[ri_map[install_target]]
                        itask.set_summary_message(msg)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num), msg)
                    continue

            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            host = get_host_from_platform(platform)
            if (self.job_runner_mgr.is_job_local_to_host(
                    itask.summary['job_runner_name'])
                    and not is_remote_platform(platform)):
                host = get_host()

            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, host=%s', itask,
                         itask.submit_num, host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'platform_name': itask.platform['name'],
                        'job_runner_name': itask.summary['job_runner_name'],
                    })
                itask.is_manual_submit = False

            if (ri_map[install_target]
                    in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
                # Remote init or install failed. Set submit-failed for all
                # affected tasks and remove target from remote init map
                # - this enables new tasks to re-initialise that target
                init_error = (ri_map[install_target])
                del ri_map[install_target]
                for itask in itasks:
                    itask.waiting_on_job_prep = False
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % host,
                                       err=init_error,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self._prep_submit_task_job_error(suite, itask,
                                                     '(remote init)', '')

                continue
            # Build the "cylc jobs-submit" command
            cmd = [self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            if is_remote_platform(itask.platform):
                remote_mode = True
                cmd.append('--remote-mode')
            else:
                remote_mode = False
            if itask.platform['clean job submission environment']:
                cmd.append('--clean-env')
            for var in itask.platform[
                    'job submission environment pass-through']:
                cmd.append(f"--env={var}")
            for path in itask.platform[
                    'job submission executable paths'] + SYSPATH:
                cmd.append(f"--path={path}")
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(platform, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = (len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1) + 1)
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
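            # Illustrative arithmetic (assuming 'max batch submit size' = 100):
            # 250 prepared tasks give chunk_size = 250 // 3 + 1 = 84, i.e.
            # batch sizes [84, 84, 82].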
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])

            if remote_mode:
                cmd = construct_ssh_cmd(cmd, platform)
            else:
                cmd = ['cylc'] + cmd

            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            os.path.expandvars(
                                get_task_job_job_log(suite, itask.point,
                                                     itask.tdef.name,
                                                     itask.submit_num)))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)

                    itask.waiting_on_job_prep = False
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                    ), self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks

    @staticmethod
    def _create_job_log_path(suite, itask):
        """Create job log directory for a task job, etc.

        Create local job directory, and NN symbolic link.
        If NN => 01, remove numbered directories with submit numbers greater
        than 01.
        Return a string in the form "POINT/NAME/SUBMIT_NUM".

        """
        job_file_dir = get_task_job_log(suite, itask.point, itask.tdef.name,
                                        itask.submit_num)
        job_file_dir = os.path.expandvars(job_file_dir)
        task_log_dir = os.path.dirname(job_file_dir)
        if itask.submit_num == 1:
            try:
                names = os.listdir(task_log_dir)
            except OSError:
                pass
            else:
                for name in names:
                    if name not in ["01", NN]:
                        rmtree(os.path.join(task_log_dir, name),
                               ignore_errors=True)
        else:
            rmtree(job_file_dir, ignore_errors=True)

        os.makedirs(job_file_dir, exist_ok=True)
        target = os.path.join(task_log_dir, NN)
        source = os.path.basename(job_file_dir)
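        # "source" is the basename of the latest numbered submit directory
        # (e.g. "03"); "target" is the sibling "NN" symlink, refreshed below
        # so that it always points at the newest submit.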
        try:
            prev_source = os.readlink(target)
        except OSError:
            prev_source = None
        if prev_source == source:
            return
        try:
            if prev_source:
                os.unlink(target)
            os.symlink(source, target)
        except OSError as exc:
            if not exc.filename:
                exc.filename = target
            raise exc

    @staticmethod
    def _get_job_scripts(itask, rtconfig):
        """Return pre-script, script, post-script for a job."""
        script = rtconfig['script']
        pre_script = rtconfig['pre-script']
        post_script = rtconfig['post-script']
        if itask.tdef.suite_polling_cfg:
            # Automatic suite state polling script
            comstr = "cylc suite-state " + \
                     " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                     " --point=" + str(itask.point)
            if LOG.isEnabledFor(DEBUG):
                comstr += ' --debug'
            for key, fmt in [('user', ' --%s=%s'), ('host', ' --%s=%s'),
                             ('interval', ' --%s=%d'),
                             ('max-polls', ' --%s=%s'),
                             ('run-dir', ' --%s=%s')]:
                if rtconfig['suite state polling'][key]:
                    comstr += fmt % (key, rtconfig['suite state polling'][key])
            if rtconfig['suite state polling']['message']:
                comstr += " --message='%s'" % (
                    rtconfig['suite state polling']['message'])
            else:
                comstr += " --status=" + itask.tdef.suite_polling_cfg['status']
            comstr += " " + itask.tdef.suite_polling_cfg['suite']
            script = "echo " + comstr + "\n" + comstr
        return pre_script, script, post_script

    @staticmethod
    def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
        """Callback on job command STDOUT/STDERR."""
        if cmd_ctx.cmd_kwargs.get("host"):
            host = "(%(host)s) " % cmd_ctx.cmd_kwargs
        else:
            host = ""
        try:
            timestamp, _, content = line.split("|")
        except ValueError:
            pass
        else:
            line = "%s %s" % (timestamp, content)
        job_activity_log = get_task_job_activity_log(suite, itask.point,
                                                     itask.tdef.name)
        try:
            with open(os.path.expandvars(job_activity_log), "ab") as handle:
                if not line.endswith("\n"):
                    line += "\n"
                handle.write((host + line).encode())
        except IOError as exc:
            LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
            LOG.warning("[%s] -%s%s", itask, host, line)

    def _kill_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when kill tasks command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._kill_task_job_callback, {
                self.job_runner_mgr.OUT_PREFIX_COMMAND:
                self._job_cmd_out_callback
            })

    def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _kill_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_KILL, None)
        ctx.out = line
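        # Kill summary lines are expected as "TIMESTAMP|JOB_LOG_DIR|RET_CODE"
        # (inferred from the split below).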
        try:
            ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
        log_lvl = INFO
        log_msg = 'killed'
        if ctx.ret_code:  # non-zero exit status
            log_lvl = WARNING
            log_msg = 'kill failed'
            itask.state.kill_failed = True
        elif itask.state(TASK_STATUS_SUBMITTED):
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
        elif itask.state(TASK_STATUS_RUNNING):
            self.task_events_mgr.process_message(itask, CRITICAL,
                                                 TASK_OUTPUT_FAILED)
        else:
            log_lvl = DEBUG
            log_msg = ('ignoring job kill result, unexpected task state: %s' %
                       itask.state.status)
        itask.set_summary_message(log_msg)
        self.data_store_mgr.delta_job_msg(
            get_task_job_id(itask.point, itask.tdef.name, itask.submit_num),
            log_msg)
        LOG.log(
            log_lvl,
            "[%s] -job(%02d) %s" % (itask.identity, itask.submit_num, log_msg))

    def _manip_task_jobs_callback(
            self, ctx, suite, itasks, summary_callback, more_callbacks=None):
        """Callback when submit/poll/kill tasks command exits."""
        if ctx.ret_code:
            LOG.error(ctx)
        else:
            LOG.debug(ctx)
        # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
        #
        # Note for "reload": A TaskProxy instance may be replaced on reload, so
        # the "itasks" list may not reference the TaskProxy objects that
        # replace the old ones. The .reload_successor attribute provides the
        # link(s) for us to get to the latest replacement.
        #
        # Note for "kill": It is possible for a job to trigger its trap and
        # report back to the suite back this logic is called. If so, the task
        # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
        # its output line will be ignored here.
        tasks = {}
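        # Keys are (CYCLE_POINT, NAME, SUBMIT_NUM) string tuples,
        # e.g. ("1", "foo", "01") (illustrative).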
        for itask in itasks:
            while itask.reload_successor is not None:
                itask = itask.reload_successor
            if itask.point is not None and itask.submit_num:
                submit_num = "%02d" % (itask.submit_num)
                tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
        handlers = [(self.job_runner_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
        if more_callbacks:
            for prefix, callback in more_callbacks.items():
                handlers.append((prefix, callback))
        out = ctx.out
        if not out:
            out = ""
        bad_tasks = dict(tasks)
        for line in out.splitlines(True):
            for prefix, callback in handlers:
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    try:
                        path = line.split("|", 2)[1]  # timestamp, path, status
                        point, name, submit_num = path.split(os.sep, 2)
                        if prefix == self.job_runner_mgr.OUT_PREFIX_SUMMARY:
                            del bad_tasks[(point, name, submit_num)]
                        itask = tasks[(point, name, submit_num)]
                        callback(suite, itask, ctx, line)
                    except (LookupError, ValueError) as exc:
                        LOG.warning('Unhandled %s output: %s', ctx.cmd_key,
                                    line)
                        LOG.exception(exc)
        # Task jobs that are in the original command but did not get a status
        # in the output. Handle as failures.
        for key, itask in sorted(bad_tasks.items()):
            line = ("|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
            summary_callback(suite, itask, ctx, line)

    def _poll_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when poll tasks command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._poll_task_job_callback, {
                self.job_runner_mgr.OUT_PREFIX_MESSAGE:
                self._poll_task_job_message_callback
            })

    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.flow.job_runner_mgr.JobPollContext
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
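        # Each poll line is split as "FIELD|JOB_LOG_DIR|JSON_CONTEXT" (see the
        # indexing below); the JSON fields populate a JobPollContext.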
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except (TypeError, ValueError):
            itask.set_summary_message(self.POLL_FAIL)
            self.data_store_mgr.delta_job_msg(job_d, self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

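        # Translate the polled job state into a task message, working from
        # "definitely finished" down to "only submitted so far".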
        flag = self.task_events_mgr.FLAG_POLLED
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.job_runner_exit_polled == 1:
            # Failed by a signal, and no longer in job runner
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by job runner.
            # Some job runners may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUCCEEDED,
                                                 jp_ctx.time_run_exit, flag)
        elif jp_ctx.time_run and jp_ctx.job_runner_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_FAILED,
                                                 get_current_time_string(),
                                                 flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by job runner
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_STARTED,
                                                 jp_ctx.time_run, flag)
        elif jp_ctx.job_runner_exit_polled == 1:
            # The job never ran, and no longer in job runner
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in job runner
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_STATUS_SUBMITTED,
                                                 jp_ctx.time_submit_exit, flag)

    def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on message of one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
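        # Message lines carry (event time, severity, message) in fields 2-4 of
        # the "|"-delimited output (see the slice below).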
        try:
            event_time, severity, message = line.split("|")[2:5]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = 0
            self.task_events_mgr.process_message(
                itask, severity, message, event_time,
                self.task_events_mgr.FLAG_POLLED)
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks by platform name.
        Put a job command for each group to the multiprocess pool.

        """
        if not itasks:
            return
        # Sort itasks into lists based upon where they were run.
        auth_itasks = {}
        for itask in itasks:
            auth_itasks.setdefault(itask.platform['name'], []).append(itask)

        # Go through each list of itasks and carry out commands as required.
        for platform_n, itasks in sorted(auth_itasks.items()):
            platform = get_platform(platform_n)
            if is_remote_platform(platform):
                remote_mode = True
                cmd = [cmd_key]
            else:
                cmd = ["cylc", cmd_key]
                remote_mode = False
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            cmd.append("--")
            cmd.append(get_remote_suite_run_job_dir(platform, suite))
            job_log_dirs = []
            if remote_mode:
                cmd = construct_ssh_cmd(cmd, platform)
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(
                    get_task_job_id(itask.point, itask.tdef.name,
                                    itask.submit_num))
            cmd += job_log_dirs
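            # The assembled command is roughly (illustrative):
            #   cylc <cmd_key> -- <suite-run-job-dir> 1/foo/01 1/bar/01 ...
            # or the equivalent wrapped in an ssh invocation for remote
            # platforms.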
            self.proc_pool.put_command(SubProcContext(cmd_key, cmd), callback,
                                       [suite, itasks])

    @staticmethod
    def _set_retry_timers(itask, rtconfig=None, retry=True):
        """Set try number and retry delays."""
        if rtconfig is None:
            rtconfig = itask.tdef.rtconfig
        if (itask.tdef.run_mode + ' mode' in rtconfig and 'disable retries'
                in rtconfig[itask.tdef.run_mode + ' mode']):
            retry = False

        if retry:
            if rtconfig['submission retry delays']:
                submit_delays = rtconfig['submission retry delays']
            else:
                submit_delays = itask.platform['submission retry delays']
            # TODO: same for execution delays?

            for key, delays in [(TimerFlags.SUBMISSION_RETRY, submit_delays),
                                (TimerFlags.EXECUTION_RETRY,
                                 rtconfig['execution retry delays'])]:
                if delays is None:
                    delays = []
                try:
                    itask.try_timers[key].set_delays(delays)
                except KeyError:
                    itask.try_timers[key] = TaskActionTimer(delays=delays)

    def _simulation_submit_task_jobs(self, itasks):
        """Simulation mode task jobs submission."""
        for itask in itasks:
            itask.waiting_on_job_prep = False
            self._set_retry_timers(itask)
            itask.platform = 'SIMULATION'
            itask.summary['job_runner_name'] = 'SIMULATION'
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = (
                itask.tdef.rtconfig['job']['simulated run length'])
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUBMITTED)
        return itasks

    def _submit_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when submit task jobs command exits."""
        self._manip_task_jobs_callback(
            ctx, suite, itasks, self._submit_task_job_callback, {
                self.job_runner_mgr.OUT_PREFIX_COMMAND:
                self._job_cmd_out_callback
            })

    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
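        # Submit summary lines are expected as
        # "TIMESTAMP|JOB_LOG_DIR|RET_CODE[|JOB_ID]" (inferred from the
        # indexing below).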
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        try:
            itask.summary['submit_method_id'] = items[3]
            self.data_store_mgr.delta_job_attr(job_d, 'job_id', items[3])
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(itask, INFO,
                                                 TASK_OUTPUT_SUBMITTED,
                                                 ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)

    def _prep_submit_task_job(self, suite, itask, check_syntax=True):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig
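        # rtconfig is now either the task's own runtime config or, if
        # broadcasts apply, a deep copy patched with the broadcast settings;
        # the original tdef.rtconfig is never modified here.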

        # BACK COMPAT: host logic
        # Determine task host or platform now, just before job submission,
        # because dynamic host/platform selection may be used.
        # Cases:
        # - Platform and host both set = throw error here:
        #    Although errors of this sort should ideally be caught on config
        #    load, this cannot be done because inheritance may create conflicts
        #    which only appear later. Although this error is also raised
        #    by the platforms module, it is worth raising it here too, to avoid
        #    running the remote_host/platform_select logic for tasks which will
        #    fail anyway later.
        # - Platform set, host not set = eval platform_n
        # - Host set = eval host_n
        # remove at:
        #     Cylc9
        if (rtconfig['platform'] is not None
                and rtconfig['remote']['host'] is not None):
            raise SuiteConfigError(
                "A mixture of Cylc 7 (host) and Cylc 8 (platform) "
                "logic should not be used. In this case, for the task "
                f"\"{itask.identity}\", the following are not compatible:\n"
                f"    host: {rtconfig['remote']['host']}\n"
                f"    platform: {rtconfig['platform']}"
            )

        host_n, platform_n = None, None
        try:
            if rtconfig['remote']['host'] is not None:
                host_n = self.task_remote_mgr.subshell_eval(
                    rtconfig['remote']['host'], HOST_REC_COMMAND)
            else:
                platform_n = self.task_remote_mgr.subshell_eval(
                    rtconfig['platform'], PLATFORM_REC_COMMAND)
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.waiting_on_job_prep = False
            itask.submit_num += 1
            itask.summary['platforms_used'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._create_job_log_path(suite, itask)
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(suite, itask,
                                             '(remote host select)', exc)
            return False
        else:
            # host/platform select not ready
            if host_n is None and platform_n is None:
                itask.set_summary_message(self.REMOTE_SELECT_MSG)
                return
            elif (host_n is None and rtconfig['platform']
                  and rtconfig['platform'] != platform_n):
                LOG.debug(f"for task {itask.identity}: platform = "
                          f"{rtconfig['platform']} evaluated as {platform_n}")
                rtconfig['platform'] = platform_n
            elif platform_n is None and rtconfig['remote']['host'] != host_n:
                LOG.debug(
                    f"for task {itask.identity}: host = "
                    f"{rtconfig['remote']['host']} evaluated as {host_n}")
                rtconfig['remote']['host'] = host_n

            try:
                platform = get_platform(rtconfig)
            except PlatformLookupError as exc:
                # Submit number not yet incremented
                itask.waiting_on_job_prep = False
                itask.submit_num += 1
                itask.summary['platforms_used'][itask.submit_num] = ''
                # Retry delays, needed for the try_num
                self._create_job_log_path(suite, itask)
                self._set_retry_timers(itask, rtconfig, False)
                self._prep_submit_task_job_error(suite, itask,
                                                 '(platform not defined)', exc)
                return False
            else:
                itask.platform = platform
                # Submit number not yet incremented
                itask.submit_num += 1
                # Retry delays, needed for the try_num
                self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)

            # Job pool insertion
            job_config = deepcopy(job_conf)
            job_config['logfiles'] = deepcopy(itask.summary['logfiles'])
            itask.jobs.append(job_config['job_d'])
            self.data_store_mgr.insert_job(itask.tdef.name, itask.point,
                                           job_config)

            local_job_file_path = get_task_job_job_log(suite, itask.point,
                                                       itask.tdef.name,
                                                       itask.submit_num)
            self.job_file_writer.write(local_job_file_path,
                                       job_conf,
                                       check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, IOError, etc
            itask.waiting_on_job_prep = False
            self._prep_submit_task_job_error(suite, itask,
                                             '(prepare job file)', exc)
            return False

        itask.local_job_file_path = local_job_file_path
        return itask

    def _prep_submit_task_job_error(self, suite, itask, action, exc):
        """Helper for self._prep_submit_task_job. On error."""
        LOG.debug("submit_num %s" % itask.submit_num)
        log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                             action,
                                             err=exc,
                                             ret_code=1),
                              suite,
                              itask.point,
                              itask.tdef.name,
                              submit_num=itask.submit_num)
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(
            itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': get_current_time_string(),
                'job_runner_name': itask.summary.get('job_runner_name'),
            })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

    def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
        """Helper for self._prep_submit_task_job."""

        itask.summary['platforms_used'][
            itask.submit_num] = itask.platform['name']

        itask.summary['job_runner_name'] = itask.platform['job runner']
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['execution time limit'])
        except TypeError:
            pass

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = get_remote_suite_run_job_dir(itask.platform, suite,
                                                     job_d, JOB_LOG_JOB)
        return {
            'job_runner_name': itask.platform['job runner'],
            'job_runner_command_template': (
                itask.platform['job runner command template']),
            'dependencies': itask.state.get_resolved_dependencies(),
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': (
                itask.summary[self.KEY_EXECUTE_TIME_LIMIT]),
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'exit-script': rtconfig['exit-script'],
            'platform': itask.platform,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': itask.platform['suite definition directory'],
            'script': scripts[1],
            'submit_num': itask.submit_num,
            'flow_label': itask.flow_label,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'uuid_str': self.task_remote_mgr.uuid_str,
            'work_d': rtconfig['work sub-directory'],
        }