# NOTE: module-level dependencies assumed by the functions below; the paths follow
# the PanDA pilot 2 layout and may need adjusting to the enclosing module.
# get_max_allowed_work_dir_size() and get_local_size_limit_stdout() are assumed
# to be defined elsewhere in this module.
import os
import logging
from glob import glob

from pilot.common.errorcodes import ErrorCodes
from pilot.util.auxiliary import get_logger, set_pilot_state, whoami
from pilot.util.config import config
from pilot.util.container import execute
from pilot.util.filehandling import get_directory_size, remove_files
from pilot.util.processes import kill_processes
from pilot.util.timing import time_stamp

logger = logging.getLogger(__name__)
errors = ErrorCodes()


def check_work_dir(job):
    """
    Check the size of the work directory.
    The function also updates the workdirsizes list in the job object.

    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    log = get_logger(job.jobid)

    if os.path.exists(job.workdir):
        # get the limit of the workdir
        maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata)
        workdirsize = get_directory_size(directory=job.workdir)

        # is the user work directory within the allowed size limit?
        if workdirsize > maxwdirsize:
            exit_code = errors.USERDIRTOOLARGE
            diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \
                          (job.workdir, workdirsize, maxwdirsize)
            log.fatal(diagnostics)

            # dump the directory listing for debugging (do not let the command's
            # exit code overwrite the error code set above)
            cmd = 'ls -altrR %s' % job.workdir
            _ec, stdout, _stderr = execute(cmd, mute=True)
            log.info('%s:\n%s', cmd, stdout)

            # kill the job
            kill_processes(job.pid)

            # remove any lingering input files from the work dir
            lfns, guids = job.get_lfns_and_guids()
            if lfns:
                remove_files(job.workdir, lfns)
                # remeasure the size of the workdir since the value is stored below
                workdirsize = get_directory_size(directory=job.workdir)
        else:
            log.info("size of work directory %s: %d B (within %d B limit)",
                     job.workdir, workdirsize, maxwdirsize)

        # store the measured disk space (the max value will later be sent with the job metrics)
        if workdirsize > 0:
            job.add_workdir_size(workdirsize)
    else:
        log.warning('skipping size check of work directory since it has not been created yet: %s', job.workdir)

    return exit_code, diagnostics
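
# Usage sketch (illustration only, not part of the pilot source): how a periodic
# job monitor might call check_work_dir() and turn a non-zero exit code into a
# failed job, mirroring the error handling used by the other checks in this module.
def monitor_work_dir_size(job):
    exit_code, diagnostics = check_work_dir(job)
    if exit_code != 0:
        # record the error code/diagnostics on the job and fail it
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
        set_pilot_state(job=job, state="failed")
    return exit_code, diagnostics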
def kill_looping_job(job):
    """
    Kill the looping process.

    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: None (the job object is updated in place).
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (job.jobid, time_stamp())
    logger.fatal(diagnostics)

    # dump process and work directory listings for debugging
    cmds = ['ps -fwu %s' % whoami(),
            'ls -ltr %s' % job.workdir,
            'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami(),
            'pstree -g -a']
    for cmd in cmds:
        _ec, stdout, _stderr = execute(cmd, mute=True)
        logger.info('%s:\n%s', cmd, stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
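
# Sketch of a caller (an assumption, not the pilot's actual looping detector):
# if no file below the work directory has been modified within looping_limit
# seconds, the payload is considered to be looping and kill_looping_job() is called.
import time  # used only by the sketch below


def check_looping(job, looping_limit):
    latest_mod_time = 0
    for root, _dirs, files in os.walk(job.workdir):
        for name in files:
            try:
                latest_mod_time = max(latest_mod_time, os.path.getmtime(os.path.join(root, name)))
            except OSError:
                pass  # file disappeared between listing and stat; ignore it
    if latest_mod_time and time.time() - latest_mod_time > looping_limit:
        kill_looping_job(job)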
def check_payload_stdout(job):
    """
    Check the size of the payload stdout.

    :param job: job object.
    :return: exit code (int), diagnostics (string).
    """

    exit_code = 0
    diagnostics = ""

    # get the list of log files
    file_list = glob(os.path.join(job.workdir, 'log.*'))

    # is this a multi-trf job?
    n_jobs = job.jobparams.count("\n") + 1
    for _i in range(n_jobs):
        # get the name of the payload stdout file created by the pilot
        _stdout = config.Payload.payloadstdout
        if n_jobs > 1:
            _stdout = _stdout.replace(".txt", "_%d.txt" % (_i + 1))

        # add the primary stdout file to the file list
        file_list.append(os.path.join(job.workdir, _stdout))

    tmp_list = glob(os.path.join(job.workdir, 'workDir/tmp.stdout.*'))
    if tmp_list:
        file_list += tmp_list
    logger.debug('file list=%s', str(file_list))

    # now loop over all files and check each individually (any large enough file will fail the job)
    for filename in file_list:
        logger.debug('check_payload_stdout: filename=%s', filename)
        if "job.log.tgz" in filename:
            logger.info("skipping file size check of file (%s) since it is a special log file", filename)
            continue

        if os.path.exists(filename):
            try:
                # get the file size in bytes
                fsize = os.path.getsize(filename)
            except Exception as error:
                logger.warning("could not read file size of %s: %s", filename, error)
            else:
                # is the file too big?
                localsizelimit_stdout = get_local_size_limit_stdout()
                if fsize > localsizelimit_stdout:
                    exit_code = errors.STDOUTTOOBIG
                    diagnostics = "Payload stdout file too big: %d B (larger than limit %d B)" % \
                                  (fsize, localsizelimit_stdout)
                    logger.warning(diagnostics)

                    # kill the job
                    set_pilot_state(job=job, state="failed")
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code)
                    kill_processes(job.pid)

                    # remove any lingering input files from the work dir (do not let
                    # the cleanup status overwrite the error code set above)
                    lfns, guids = job.get_lfns_and_guids()
                    if lfns:
                        ec = remove_files(job.workdir, lfns)
                        if ec != 0:
                            logger.warning('failed to remove all files')
                else:
                    logger.info("payload log (%s) within allowed size limit (%d B): %d B",
                                os.path.basename(filename), localsizelimit_stdout, fsize)
        else:
            logger.info("skipping file size check of payload stdout file (%s) since it has not been created yet", filename)

    return exit_code, diagnostics
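
# Sketch (illustration only): the two size checks above share the same
# (exit code, diagnostics) contract, so a monitoring pass might chain them and
# stop at the first failure.
def run_size_checks(job):
    for check in (check_work_dir, check_payload_stdout):
        exit_code, diagnostics = check(job)
        if exit_code != 0:
            return exit_code, diagnostics
    return 0, ""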