Ejemplo n.º 1
0
 def insert_db_job(self, row_idx, row):
     """Rebuild a job element from a run-database row after a restart.

     Args:
         row_idx: Index of the row being loaded; a progress message is
             logged when it is 0.
         row: Ten-field sequence unpacked as (point_string, name, status,
             submit_num, time_submit, time_run, time_run_exit,
             batch_sys_name, batch_sys_job_id, platform_name).

     Rows whose status is not in JOB_STATUS_SET are ignored.  If the task
     definition lookup raises SuiteConfigError, or anything else fails
     while the element is being built, the error is logged and the job is
     not added.
     """
     if row_idx == 0:
         LOG.info("LOADING job data")
     (
         point_string,
         name,
         status,
         submit_num,
         time_submit,
         time_run,
         time_run_exit,
         batch_sys_name,
         batch_sys_job_id,
         platform_name,
     ) = row
     if status not in JOB_STATUS_SET:
         return
     t_id = f'{self.workflow_id}{ID_DELIM}{point_string}{ID_DELIM}{name}'
     j_id = f'{t_id}{ID_DELIM}{submit_num}'
     try:
         # Looked up first so that a missing task definition is reported
         # before any other work is done.
         task_def = self.schd.config.get_taskdef(name)
         owner = self.schd.owner
         host = (
             get_host_from_platform(get_platform(platform_name))
             if platform_name
             else self.schd.host
         )
         job_element = PbJob(
             stamp=f'{j_id}@{time()}',
             id=j_id,
             submit_num=submit_num,
             state=status,
             task_proxy=t_id,
             submitted_time=time_submit,
             started_time=time_run,
             finished_time=time_run_exit,
             batch_sys_name=batch_sys_name,
             batch_sys_job_id=batch_sys_job_id,
             host=host,
             owner=owner,
             name=name,
             cycle_point=point_string,
         )
         # Add in log files.
         job_element.job_log_dir = get_task_job_log(
             self.schd.suite, point_string, name, submit_num)
         broadcast_mgr = self.schd.task_events_mgr.broadcast_mgr
         overrides = broadcast_mgr.get_broadcast(
             TaskID.get(name, point_string))
         rtconfig = task_def.rtconfig
         if overrides:
             # Work on a copy so the task definition itself is untouched.
             rtconfig = pdeepcopy(task_def.rtconfig)
             poverride(rtconfig, overrides, prepend=True)
         expanded_logs = []
         for log_file in rtconfig['extra log files']:
             expanded_logs.append(
                 os.path.expanduser(os.path.expandvars(log_file)))
         job_element.extra_logs.extend(expanded_logs)
     except SuiteConfigError:
         missing_tdef_msg = (
             'ignoring job %s from the suite run database\n'
             '(its task definition has probably been deleted).')
         LOG.exception(missing_tdef_msg % j_id)
         return
     except Exception:
         LOG.exception('could not load job %s' % j_id)
         return
     # Only reached when the element was built without error.
     self.added[j_id] = job_element
     self.task_jobs.setdefault(t_id, set()).add(j_id)
     self.updates_pending = True
Ejemplo n.º 2
0
    def _create_job_log_path(suite, itask):
        """Create the job log directory for a task job and update NN.

        The job directory is the path given by get_task_job_log() for the
        task's point, name and submit number.

        * On the first submit (submit number 1), every entry in the task
          log directory other than "01" and the NN link is removed.
        * On later submits, any existing directory for this submit number
          is removed.

        The job directory is then (re)created and the NN symbolic link in
        the task log directory is pointed at it, unless it already is.

        Returns nothing.  Raises OSError if the NN link cannot be replaced;
        the link path is filled in as the error's filename when missing.

        """
        job_dir = get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        parent_dir = os.path.dirname(job_dir)
        if itask.submit_num != 1:
            rmtree(job_dir, ignore_errors=True)
        else:
            try:
                entries = os.listdir(parent_dir)
            except OSError:
                # Nothing to clean if the directory cannot be listed.
                entries = []
            for entry in entries:
                if entry in ("01", NN):
                    continue
                rmtree(os.path.join(parent_dir, entry), ignore_errors=True)

        os.makedirs(job_dir, exist_ok=True)
        link_path = os.path.join(parent_dir, NN)
        # The link is relative: it holds only the submit-number dir name.
        link_value = os.path.basename(job_dir)
        try:
            current_value = os.readlink(link_path)
        except OSError:
            current_value = None
        if current_value == link_value:
            return
        try:
            if current_value:
                os.unlink(link_path)
            os.symlink(link_value, link_path)
        except OSError as exc:
            if not exc.filename:
                exc.filename = link_path
            raise
Ejemplo n.º 3
0
    def _create_job_log_path(suite, itask):
        """Create the job log directory for a task job and update NN.

        The job directory is the path given by get_task_job_log() for the
        task's point, name and submit number, with environment variables
        expanded.

        * On the first submit (submit number 1), every entry in the task
          log directory other than "01" and the NN link is removed.
        * On later submits, any existing directory for this submit number
          is removed.

        The job directory is then (re)created and the NN symbolic link in
        the task log directory is pointed at it, unless it already is.

        Returns nothing.  Raises OSError if the NN link cannot be replaced;
        the link path is filled in as the error's filename when missing.

        """
        job_dir = os.path.expandvars(
            get_task_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num))
        parent_dir = os.path.dirname(job_dir)
        if itask.submit_num != 1:
            rmtree(job_dir, ignore_errors=True)
        else:
            try:
                entries = os.listdir(parent_dir)
            except OSError:
                # Nothing to clean if the directory cannot be listed.
                entries = []
            for entry in entries:
                if entry in ("01", NN):
                    continue
                rmtree(os.path.join(parent_dir, entry), ignore_errors=True)

        os.makedirs(job_dir, exist_ok=True)
        link_path = os.path.join(parent_dir, NN)
        # The link is relative: it holds only the submit-number dir name.
        link_value = os.path.basename(job_dir)
        try:
            current_value = os.readlink(link_path)
        except OSError:
            current_value = None
        if current_value == link_value:
            return
        try:
            if current_value:
                os.unlink(link_path)
            os.symlink(link_value, link_path)
        except OSError as exc:
            if not exc.filename:
                exc.filename = link_path
            raise
Ejemplo n.º 4
0
    def insert_job(self, job_conf):
        """Build a job element from a job configuration and add it.

        Args:
            job_conf: Mapping describing the job.  The keys read here are
                'owner', 'submit_num', 'task_id', 'batch_system_name',
                the '*-script' entries, 'execution_time_limit',
                'platform' (its 'name'), 'work_d', 'batch_system_conf',
                'directives', 'environment', 'param_var' and 'logfiles'.

        The new element starts in the first state of JOB_STATUSES_ALL and
        is recorded in self.added and self.task_jobs; updates_pending is
        set so the change is picked up.
        """
        owner = job_conf['owner']
        submit_num = job_conf['submit_num']
        name, point_string = TaskID.split(job_conf['task_id'])
        t_id = f'{self.workflow_id}{ID_DELIM}{point_string}{ID_DELIM}{name}'
        j_id = f'{t_id}{ID_DELIM}{submit_num}'
        job_element = PbJob(
            stamp=f'{j_id}@{time()}',
            id=j_id,
            submit_num=submit_num,
            state=JOB_STATUSES_ALL[0],
            task_proxy=t_id,
            batch_sys_name=job_conf['batch_system_name'],
            env_script=job_conf['env-script'],
            err_script=job_conf['err-script'],
            exit_script=job_conf['exit-script'],
            execution_time_limit=job_conf['execution_time_limit'],
            host=job_conf['platform']['name'],
            init_script=job_conf['init-script'],
            owner=owner,
            post_script=job_conf['post-script'],
            pre_script=job_conf['pre-script'],
            script=job_conf['script'],
            work_sub_dir=job_conf['work_d'],
            name=name,
            cycle_point=point_string,
            # Structured settings are stored as JSON text.
            batch_sys_conf=json.dumps(job_conf['batch_system_conf']),
            directives=json.dumps(job_conf['directives']),
            environment=json.dumps(job_conf['environment']),
            param_var=json.dumps(job_conf['param_var']),
        )

        # Add in log files.
        job_element.job_log_dir = get_task_job_log(
            self.schd.suite, point_string, name, submit_num)
        job_element.extra_logs.extend(job_conf['logfiles'])

        self.added[j_id] = job_element
        self.task_jobs.setdefault(t_id, set()).add(j_id)
        self.updates_pending = True
Ejemplo n.º 5
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when job log retrieval completes.

     Args:
         proc_ctx: Context of the finished retrieval command.  It is
             logged as an error when its ret_code is non-zero, else at
             debug level.  Its cmd_kwargs["id_keys"] lists the
             (key1, point, name, submit_num) entries that were retrieved.
         schd_ctx: Scheduler context; only its ``suite`` is used, to
             locate the job log files.

     For each entry, "job.out" must exist, and the job error log must
     also exist unless the event held in key1[1] is exactly 'succeeded'.
     If every expected file exists the entry's event timer is removed;
     otherwise the missing files are recorded and the timer's
     unset_waiting() is called.  Either way the outcome is written to the
     job activity log.  A KeyError (e.g. the timer no longer exists) is
     logged and the entry is skipped.
     """
     if proc_ctx.ret_code:
         LOG.error(proc_ctx)
     else:
         LOG.debug(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 event = key1[1]
             except TypeError:
                 # key1 is not subscriptable: no event to inspect.
                 event = None
             # Compare for equality: a substring test ("in 'succeeded'")
             # would wrongly treat '' or 'succ' as a success.  Non-string
             # events are skipped, as they were before.
             if isinstance(event, str) and event != 'succeeded':
                 fnames.append(JOB_LOG_ERR)
             fname_oks = {
                 fname: os.path.exists(
                     get_task_job_log(
                         schd_ctx.suite, point, name, submit_num, fname))
                 for fname in fnames
             }
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Ejemplo n.º 6
0
    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job for submission.

        Args:
            suite: Suite name, used to locate the job file and log dir.
            itask: The task proxy to prepare.
            dry_run: If true, prepare even when a job file already exists
                and flag the job with the dry-run message.
            check_syntax: Passed through to the job file writer.

        Returns:
            ``itask`` when the job is prepared (or was already prepared
            and this is not a dry run); ``False`` when host selection or
            job file preparation failed; ``None`` when remote host
            selection is not ready yet.

        """
        already_prepared = itask.local_job_file_path and not dry_run
        if already_prepared:
            return itask

        # Apply any broadcast overrides on a copy of the runtime config.
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        rtconfig = itask.tdef.rtconfig
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)

        # Host settings are resolved as late as possible because the host
        # may be selected dynamically.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # The submit number has not been incremented yet.
            itask.submit_num += 1
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays are needed for the try number.
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(remote host select)', exc)
            return False

        if task_host is None:
            # Host selection is still in progress.
            itask.set_summary_message(self.REMOTE_SELECT_MSG)
            return None

        itask.task_host = task_host
        # The submit number has not been incremented yet.
        itask.submit_num += 1
        # Retry delays are needed for the try number.
        self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            job_file_path = get_task_job_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num)
            self.job_file_writer.write(
                job_file_path, job_conf, check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(prepare job file)', exc)
            return False
        itask.local_job_file_path = job_file_path

        # The job pool gets its own copy, extended with log information.
        job_config = deepcopy(job_conf)
        job_config['logfiles'] = deepcopy(itask.summary['logfiles'])
        job_config['job_log_dir'] = get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        itask.jobs.append(job_config['job_d'])
        self.job_pool.insert_job(job_config)

        if dry_run:
            itask.set_summary_message(self.DRY_RUN_MSG)
            self.job_pool.add_job_msg(job_config['job_d'], self.DRY_RUN_MSG)
            LOG.debug(f'[{itask}] -{self.DRY_RUN_MSG}')

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask