예제 #1
0
 def _log_error_tail(self, pk, retcode):
     """Log a nonzero return code together with the tail of the output file.

     Looks up the output file registered under ``pk`` in ``self.outfiles``.
     If that path exists on disk, its tail is read with ``get_tail``;
     otherwise an empty string is used.

     Args:
         pk: Key into ``self.outfiles``; also passed to ``self.log_prefix``.
         retcode: The nonzero return code to report in the log message.

     Returns:
         The tail text that was logged ('' when the output file is missing).
     """
     outfile_path = self.outfiles[pk].name
     tail = get_tail(outfile_path) if os.path.exists(outfile_path) else ''
     logger.error(self.log_prefix(pk) + f'nonzero return {retcode}:\n {tail}')
     return tail
예제 #2
0
def preprocess(job):
    """Run the job's preprocessor, if one is set, and mark it PREPROCESSED.

    The preprocessor command line is taken from ``job.preprocess`` and split
    on whitespace. It runs in ``job.working_directory`` with the environment
    returned by ``job.get_envs()``; stdout and stderr are both written to
    ``preprocess.log`` in that directory.

    Args:
        job: Job object providing ``cute_id``, ``preprocess``,
            ``working_directory``, ``get_envs()`` and a writable ``state``.

    Raises:
        BalsamTransitionError: If the preprocessor executable does not exist,
            cannot be started, does not finish within
            ``PREPROCESS_TIMEOUT_SECONDS``, or exits with a nonzero code.
    """
    logger.debug(f'{job.cute_id} in preprocess')

    # Get preprocesser exe
    preproc_app = job.preprocess
    if not preproc_app:
        job.state = 'PREPROCESSED'
        return

    # A whitespace-only command splits to an empty list; treat it like a
    # missing executable instead of letting args[0] raise IndexError.
    args = preproc_app.split()
    if not args or not os.path.exists(args[0]):
        #TODO: look for preproc in the EXE directories
        message = f"Preprocessor {preproc_app} does not exist on filesystem"
        raise BalsamTransitionError(message)

    # Create preprocess-specific environment
    envs = job.get_envs()

    # Run preprocesser with special environment in job working directory
    out = os.path.join(job.working_directory, "preprocess.log")
    with open(out, 'w') as fp:
        # The trailing newline keeps the child's first output line from being
        # appended to the header line.
        fp.write(f"# Balsam Preprocessor: {preproc_app}\n")
        # Flush before handing the descriptor to the child so the header
        # lands ahead of the child's output.
        fp.flush()

        proc = None  # stays None if Popen itself fails
        try:
            logger.info(f"{job.cute_id} preprocess Popen {args}")
            proc = subprocess.Popen(
                args,
                stdout=fp,
                stderr=subprocess.STDOUT,
                env=envs,
                cwd=job.working_directory,
            )
            retcode = proc.wait(timeout=PREPROCESS_TIMEOUT_SECONDS)
        except Exception as e:
            message = f"Preprocess failed: {e}"
            if proc is not None:
                # Best-effort cleanup: kill the child and reap it so it does
                # not linger as a zombie. Cleanup failures must not mask the
                # original error, so they are only logged.
                try:
                    proc.kill()
                    proc.wait()
                except Exception as kill_err:
                    logger.warning(
                        f"{job.cute_id} could not kill preprocess: {kill_err}")
            raise BalsamTransitionError(message) from e

    if retcode != 0:
        tail = get_tail(out)
        message = f"{job.cute_id} preprocess returned {retcode}:\n{tail}"
        raise BalsamTransitionError(message)

    job.state = 'PREPROCESSED'
    logger.debug(f"{job.cute_id} preprocess done")
예제 #3
0
 def check_state(self, run):
     """Poll the run's process and update ``run.current_state`` to match.

     A process that has not exited yet leaves the run in 'RUNNING'. Exit
     code 0 marks the run 'RUN_DONE'. Any other exit code marks it
     'RUN_ERROR' and stores the tail of the run's output file in
     ``run.err_msg``. In both finished cases the output file is closed and
     ``run.free_workers()`` is called.

     Args:
         run: Object exposing ``process``, ``outfile``, ``job``,
             ``free_workers()`` and writable ``current_state`` / ``err_msg``.

     Returns:
         The updated value of ``run.current_state``.
     """
     exit_code = run.process.poll()

     # Still executing: nothing to clean up yet.
     if exit_code is None:
         run.current_state = 'RUNNING'
         return run.current_state

     if exit_code == 0:
         logger.info(f"MPIRun {run.job.cute_id} done")
         run.current_state = 'RUN_DONE'
         run.outfile.close()
     else:
         run.process.communicate()
         run.outfile.close()
         error_tail = get_tail(run.outfile.name)
         run.current_state = 'RUN_ERROR'
         run.err_msg = error_tail
         logger.info(
             f"MPIRun {run.job.cute_id} error code {exit_code}:\n{error_tail}"
         )

     # Both finished outcomes release the workers as their final step.
     run.free_workers()
     return run.current_state
예제 #4
0
def postprocess(job, *, error_handling=False, timeout_handling=False):
    """Run the job's postprocessor and advance the job state.

    The postprocessor command line is taken from ``job.postprocess`` and
    split on whitespace. It runs in ``job.working_directory`` with the
    environment from ``job.get_envs(timeout=..., error=...)``; stdout and
    stderr are both written to ``postprocess.log`` in that directory.

    When no postprocessor is set:
      * in error-handling mode a BalsamTransitionError is raised;
      * in timeout-handling mode the job is marked 'RESTART_READY';
      * otherwise the job is marked 'POSTPROCESSED'.

    Args:
        job: Job object providing ``cute_id``, ``postprocess``,
            ``working_directory``, ``get_envs()``, ``refresh_from_db()`` and
            a writable ``state``.
        error_handling: True when invoked to handle a RUN_ERROR.
        timeout_handling: True when invoked to handle a RUN_TIMEOUT.

    Raises:
        ValueError: If both ``error_handling`` and ``timeout_handling`` are
            set.
        BalsamTransitionError: If no postprocessor exists in error-handling
            mode, the executable is missing, it cannot be started, it does
            not finish within ``POSTPROCESS_TIMEOUT_SECONDS``, it exits with
            a nonzero code, or it leaves the job in RUN_ERROR / RUN_TIMEOUT
            while handling that condition.
    """
    logger.debug(f'{job.cute_id} in postprocess')
    if error_handling and timeout_handling:
        raise ValueError("Both error-handling and timeout-handling is invalid")
    if error_handling:
        logger.info(f'{job.cute_id} handling RUN_ERROR')
    if timeout_handling:
        logger.info(f'{job.cute_id} handling RUN_TIMEOUT')

    # Get postprocesser exe
    postproc_app = job.postprocess

    # If no postprocesssor; move on (unless in error_handling mode)
    if not postproc_app:
        if error_handling:
            message = f"{job.cute_id} handle error: no postprocessor found!"
            raise BalsamTransitionError(message)
        if timeout_handling:
            job.state = 'RESTART_READY'
            logger.warning(
                f'{job.cute_id} unhandled job timeout: marked RESTART_READY')
            return
        # Must be a plain string: a trailing comma here would assign the
        # tuple ('POSTPROCESSED',) instead of the state name.
        job.state = 'POSTPROCESSED'
        logger.debug(f'{job.cute_id} no postprocess: skipped')
        return

    # A whitespace-only command splits to an empty list; treat it like a
    # missing executable instead of letting args[0] raise IndexError.
    args = postproc_app.split()
    if not args or not os.path.exists(args[0]):
        #TODO: look for postproc in the EXE directories
        message = f"Postprocessor {postproc_app} does not exist on filesystem"
        raise BalsamTransitionError(message)

    # Create postprocess-specific environment
    envs = job.get_envs(timeout=timeout_handling, error=error_handling)

    # Run postprocesser with special environment in job working directory
    out = os.path.join(job.working_directory, "postprocess.log")
    with open(out, 'w') as fp:
        fp.write(f"# Balsam Postprocessor: {postproc_app}\n")
        if timeout_handling:
            fp.write("# Invoked to handle RUN_TIMEOUT\n")
        if error_handling:
            fp.write("# Invoked to handle RUN_ERROR\n")
        # Flush before handing the descriptor to the child so the header
        # lands ahead of the child's output.
        fp.flush()

        proc = None  # stays None if Popen itself fails
        try:
            logger.info(f"{job.cute_id} postprocess Popen {args}")
            proc = subprocess.Popen(
                args,
                stdout=fp,
                stderr=subprocess.STDOUT,
                env=envs,
                cwd=job.working_directory,
            )
            retcode = proc.wait(timeout=POSTPROCESS_TIMEOUT_SECONDS)
        except Exception as e:
            message = f"Postprocess failed: {e}"
            if proc is not None:
                # Best-effort cleanup: kill the child and reap it so it does
                # not linger as a zombie. Cleanup failures must not mask the
                # original error, so they are only logged.
                try:
                    proc.kill()
                    proc.wait()
                except Exception as kill_err:
                    logger.warning(
                        f"{job.cute_id} could not kill postprocess: {kill_err}")
            raise BalsamTransitionError(message) from e

    if retcode != 0:
        tail = get_tail(out, nlines=30)
        message = f"{job.cute_id} postprocess returned {retcode}:\n{tail}"
        raise BalsamTransitionError(message)

    job.refresh_from_db()
    # If postprocessor handled error or timeout, it should have changed job's
    # state. If it failed to do this, mark FAILED.  Otherwise, POSTPROCESSED.
    if error_handling and job.state == 'RUN_ERROR':
        message = f"{job.cute_id} Error handling didn't fix job state: marking FAILED"
        raise BalsamTransitionError(message)

    if timeout_handling and job.state == 'RUN_TIMEOUT':
        message = f"{job.cute_id} Timeout handling didn't change job state: marking FAILED"
        raise BalsamTransitionError(message)

    # Only move the state along to POSTPROCESSED if the job is still in RUN_DONE
    # and the post.py returned normally.  Otherwise, post.py might mark a job
    # FAILED, and you override it with POSTPROCESSED, breaking the workflow.
    if job.state == 'RUN_DONE':
        job.state = 'POSTPROCESSED'
    logger.debug(f"{job.cute_id} postprocess done")