def memory_usage(job):
    """
    Perform memory usage verification.

    :param job: job object
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # Get the maxPSS value from the memory monitor
    summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor)

    if not summary_dictionary:
        exit_code = errors.BADMEMORYMONITORJSON
        diagnostics = "Memory monitor output could not be read"
        return exit_code, diagnostics

    maxdict = summary_dictionary.get('Max', {})
    maxpss_int = maxdict.get('maxPSS', -1)

    # Only proceed if values are set
    if maxpss_int != -1:
        maxrss = job.infosys.queuedata.maxrss
        if maxrss:
            # correction for SCORE/4CORE/nCORE jobs on UCORE queues
            scale = get_ucore_scale_factor(job)
            try:
                maxrss_int = 2 * int(maxrss * scale) * 1024  # Convert to int and kB
            except Exception as e:
                logger.warning("unexpected value for maxRSS: %s" % e)
            else:
                # Compare the maxRSS with the maxPSS from memory monitor
                if maxrss_int > 0 and maxpss_int > 0:
                    if maxpss_int > maxrss_int:
                        diagnostics = "job has exceeded the memory limit %d kB > %d kB (2 * queuedata.maxrss)" % \
                                      (maxpss_int, maxrss_int)
                        logger.warning(diagnostics)

                        # Create a lockfile to let RunJob know that it should not restart the memory monitor after it has been killed
                        #pUtil.createLockFile(False, self.__env['jobDic'][k][1].workdir, lockfile="MEMORYEXCEEDED")

                        # Kill the job
                        set_pilot_state(job=job, state="failed")
                        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXCEEDMAXMEM)
                        kill_processes(job.pid)
                    else:
                        logger.info("max memory (maxPSS) used by the payload is within the allowed limit: "
                                    "%d kB (2 * maxRSS = %d kB)" % (maxpss_int, maxrss_int))
        else:
            if maxrss == 0 or maxrss == "0":
                logger.info("queuedata.maxrss set to 0 (no memory checks will be done)")
            else:
                logger.warning("queuedata.maxrss is not set")

    return exit_code, diagnostics
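# Illustrative sketch (not the pilot implementation): the check above reduces to
# scaling queuedata.maxrss, doubling it and converting it to kB before comparing
# it with the monitored maxPSS. Assuming, as the "* 1024" conversion suggests,
# that maxrss is given in MB, a stand-alone version of the comparison could be:
def _exceeds_memory_limit_sketch(maxpss_kb, maxrss_mb, scale=1.0):
    """Return True if maxPSS (kB) exceeds 2 * maxRSS (scaled and converted to kB)."""
    limit_kb = 2 * int(maxrss_mb * scale) * 1024  # MB -> kB, doubled as in memory_usage()
    return 0 < limit_kb < maxpss_kb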
def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string)
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)

        if os.path.exists(job.workdir):
            workdirsize = get_directory_size(directory=job.workdir)

            # is user dir within allowed size limit?
            if workdirsize > maxwdirsize:
                exit_code = errors.USERDIRTOOLARGE
                diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                              (job.workdir, workdirsize, maxwdirsize)
                log.fatal("%s" % diagnostics)

                cmd = 'ls -altrR %s' % job.workdir
                _ec, stdout, stderr = execute(cmd, mute=True)  # do not overwrite the USERDIRTOOLARGE exit code
                log.info("%s: %s" % (cmd + '\n', stdout))

                # kill the job
                # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED")
                kill_processes(job.pid)

                # remove any lingering input files from the work dir
                lfns, guids = job.get_lfns_and_guids()
                if lfns:
                    remove_files(job.workdir, lfns)

                    # remeasure the size of the workdir at this point since the value is stored below
                    workdirsize = get_directory_size(directory=job.workdir)
            else:
                log.info("size of work directory %s: %d B (within %d B limit)" % (job.workdir, workdirsize, maxwdirsize))

            # Store the measured disk space (the max value will later be sent with the job metrics)
            if workdirsize > 0:
                job.add_workdir_size(workdirsize)
        else:
            log.warning('job work dir does not exist: %s' % job.workdir)
    else:
        log.warning('skipping size check of workdir since it has not been created yet')

    return exit_code, diagnostics
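# Illustrative sketch (not the pilot implementation): get_directory_size() is a
# pilot helper defined elsewhere. A minimal stand-alone equivalent using only the
# standard library (summing the sizes of all regular files) might look like this:
import os


def _directory_size_sketch(directory):
    """Return the total size in bytes of all regular files below 'directory'."""
    total = 0
    for root, _dirs, files in os.walk(directory):
        for name in files:
            path = os.path.join(root, name)
            if os.path.isfile(path):  # skip broken symlinks and the like
                total += os.path.getsize(path)
    return total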
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % job.workdir
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
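# Illustrative sketch (not the pilot implementation): kill_processes() is a pilot
# helper defined elsewhere. A simplified stand-alone version that signals the whole
# process group of the payload (SIGTERM first, then SIGKILL) could look like this:
import os
import signal
import time


def _kill_process_group_sketch(pid, grace=3):
    """Send SIGTERM to the process group of 'pid', then SIGKILL after 'grace' seconds."""
    pgid = os.getpgid(pid)
    os.killpg(pgid, signal.SIGTERM)
    time.sleep(grace)
    try:
        os.killpg(pgid, signal.SIGKILL)
    except OSError:
        pass  # the process group already exited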
def interrupt(args, signum, frame):
    """
    Interrupt function on the receiving end of kill signals.
    This function is forwarded any incoming signals (SIGINT, SIGTERM, etc) and will set abort_job which instructs
    the threads to abort the job.

    :param args: pilot arguments.
    :param signum: signal.
    :param frame: stack/execution frame pointing to the frame that was interrupted by the signal.
    :return:
    """

    try:
        sig = [v for v, k in signal.__dict__.iteritems() if k == signum][0]  # Python 2
    except Exception:
        sig = [v for v, k in list(signal.__dict__.items()) if k == signum][0]  # Python 3

    args.signal_counter += 1

    # keep track of when first kill signal arrived, any stuck loops should abort at a defined cut off time
    if args.kill_time == 0:
        args.kill_time = int(time())

    max_kill_wait_time = MAX_KILL_WAIT_TIME + 60  # add another minute of grace to let threads finish
    current_time = int(time())
    if args.kill_time and current_time - args.kill_time > max_kill_wait_time:
        logger.warning('passed maximum waiting time after first kill signal - will commit suicide - farewell')
        try:
            rmtree(args.sourcedir)
        except Exception as e:
            logger.warning(e)
        logging.shutdown()
        kill_processes(getpid())

    add_to_pilot_timing('0', PILOT_KILL_SIGNAL, time(), args)
    add_to_pilot_timing('1', PILOT_KILL_SIGNAL, time(), args)

    logger.warning('caught signal: %s in FRAME=\n%s' % (sig, '\n'.join(traceback.format_stack(frame))))

    args.signal = sig
    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()

    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()

    logger.warning('setting graceful stop (in case it was not set already), pilot will abort')
    args.graceful_stop.set()
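# Illustrative sketch (hypothetical registration code): interrupt() takes the pilot
# 'args' object as its first argument, so it presumably has to be bound before it
# can act as a standard (signum, frame) signal handler, e.g. via functools.partial:
import signal
from functools import partial


def _install_signal_handlers_sketch(args):
    """Bind 'args' and install interrupt() for the common termination signals."""
    for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGQUIT):
        signal.signal(sig, partial(interrupt, args))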
def run(self):  # noqa: C901
    """
    Run all payload processes (including pre- and post-processes, and utilities).
    In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special exit code.

    :return:
    """

    # get the payload command from the user specific code
    self.pre_setup(self.__job)

    cmd = self.get_payload_command(self.__job)

    # extract the setup in case the preprocess command needs it
    self.__job.setup = self.extract_setup(cmd)

    self.post_setup(self.__job)

    # a loop is needed for HPO jobs
    # abort when nothing more to run, or when the preprocess returns a special exit code
    iteration = 0
    while True:
        logger.info('payload iteration loop #%d', iteration + 1)
        os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
        show_memory_usage()

        # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
        jobparams_pre = self.__job.jobparams
        exit_code = self.run_preprocess(self.__job)
        jobparams_post = self.__job.jobparams
        if exit_code:
            if 160 <= exit_code <= 162:
                exit_code = 0
                # wipe the output file list since there won't be any new files
                # any output files from previous iterations should have been transferred already
                logger.debug('reset outdata since further output should not be expected after preprocess exit')
                self.__job.outdata = []
            break
        if jobparams_pre != jobparams_post:
            logger.debug('jobparams were updated by utility_before_payload()')
            # must update cmd
            cmd = cmd.replace(jobparams_pre, jobparams_post)

        # now run the main payload, when it finishes, run the postprocess (if necessary)
        # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
        if os.environ.get('HARVESTER_HOROVOD', '') == '':
            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
            #logger.debug('[before payload start] stdout=%s', _stdout)
            #logger.debug('[before payload start] stderr=%s', _stderr)
            proc = self.run_payload(self.__job, cmd, self.__out, self.__err)
        else:
            proc = None

        proc_co = None
        if proc is None:
            # run the post-process command even if there was no main payload
            if os.environ.get('HARVESTER_HOROVOD', '') != '':
                logger.info('No need to execute any main payload')
                exit_code = self.run_utility_after_payload_finished(exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                self.post_payload(self.__job)
            else:
                break
        else:
            # the process is now running, update the server
            # test 'tobekilled' from here to try payload kill
            send_state(self.__job, self.__args, self.__job.state)

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if self.__job.state == 'failed':
                logger.warning('job state is \'failed\' - abort payload and run()')
                kill_processes(proc.pid)
                break

            # allow for a secondary command to be started after the payload (e.g. a coprocess)
            utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2)
            if utility_cmd:
                logger.debug('starting utility command: %s', utility_cmd)
                label = 'coprocess' if 'coprocess' in utility_cmd else None
                proc_co = self.run_command(utility_cmd, label=label)

            logger.info('will wait for graceful exit')
            exit_code = self.wait_graceful(self.__args, proc)

            # reset error if Raythena decided to kill payload (no error)
            if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                logger.debug('ignoring KILLPAYLOAD error')
                self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                    errors.KILLPAYLOAD,
                    pilot_error_codes=self.__job.piloterrorcodes,
                    pilot_error_diags=self.__job.piloterrordiags)
                exit_code = 0
                state = 'finished'
            else:
                state = 'finished' if exit_code == 0 else 'failed'
            set_pilot_state(job=self.__job, state=state)
            logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state)

            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
            #logger.debug('[after payload finish] stdout=%s', _stdout)
            #logger.debug('[after payload finish] stderr=%s', _stderr)

            # stop the utility command (e.g. a coprocess) if necessary
            if proc_co:
                logger.debug('stopping utility command: %s', utility_cmd)
                kill_processes(proc_co.pid)

            if exit_code is None:
                logger.warning('detected unset exit_code from wait_graceful - reset to -1')
                exit_code = -1

            for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]:
                exit_code = self.run_utility_after_payload_finished(exit_code, state, order)

            self.post_payload(self.__job)

            # stop any running utilities
            if self.__job.utilities != {}:
                self.stop_utilities()

        if self.__job.is_hpo and state != 'failed':
            # in case there are more hyper-parameter points, move away the previous log files
            #self.rename_log_files(iteration)
            iteration += 1
        else:
            break

    return exit_code
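# Illustrative sketch (not the pilot implementation): run_payload()/run_command()
# wrap the actual process creation elsewhere in the pilot. The core pattern above,
# start the payload, optionally start a coprocess, wait for the payload, then stop
# the coprocess, can be expressed with the standard library alone as follows:
import shlex
import subprocess


def _run_payload_with_coprocess_sketch(payload_cmd, coprocess_cmd=None):
    """Run 'payload_cmd', optionally alongside 'coprocess_cmd'; return the payload exit code."""
    payload = subprocess.Popen(shlex.split(payload_cmd))
    coprocess = subprocess.Popen(shlex.split(coprocess_cmd)) if coprocess_cmd else None
    exit_code = payload.wait()
    if coprocess and coprocess.poll() is None:
        coprocess.terminate()  # stop the secondary command once the payload has finished
    return exit_code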
def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get name of payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the file list
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s' % str(file_list))

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:
        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove the payload stdout file after the log extracts have been created

                    # remove any lingering input files from the work dir
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        ec = remove_files(job.workdir, lfns)  # do not overwrite the STDOUTTOOBIG exit code
                        if ec != 0:
                            logger.warning('failed to remove all files')
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B",
                                os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics
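# Illustrative sketch (not the pilot implementation, with a hypothetical 2 GB limit):
# the essence of the check above is "fail the job if any monitored text file has
# grown beyond a local size limit". A compact stand-alone version could be:
import os
from glob import glob


def _oversized_payload_logs_sketch(workdir, limit_bytes=2 * 1024 ** 3):
    """Return the monitored files under 'workdir' that exceed 'limit_bytes'."""
    candidates = glob(os.path.join(workdir, 'log.*'))
    candidates += glob(os.path.join(workdir, 'workDir/tmp.stdout.*'))
    return [path for path in candidates
            if os.path.exists(path) and os.path.getsize(path) > limit_bytes]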