def copy_jobreport(job_working_dir, worker_communication_point, payload_report_file, workerattributesfile):
    src_file = os.path.join(job_working_dir, payload_report_file)
    dst_file = os.path.join(worker_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(
            payload_report_file, worker_communication_point))
        cp_start = time.time()

        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        with open(dst_file, 'w') as job_report_outfile:
            json.dump(job_report, job_report_outfile)
        cp_time = time.time() - cp_start
        logger.info("Copy of payload report file took: {0} sec.".format(cp_time))
    except Exception:
        logger.error("Job report copy failed, execution terminated:\n %s" % (sys.exc_info()[1]))
        work_report = dict()
        work_report["jobStatus"] = "failed"
        work_report["pilotErrorCode"] = 1103  # should be changed to Pilot 2 errors
        work_report["exitMsg"] = str(sys.exc_info()[1])
        main_exit(1103, work_report, workerattributesfile)
def publish_job_report(job, args, job_report_file="jobReport.json"):
    """
    Copy the job report file to make it accessible by Harvester.
    Shrink the job report file.

    :param job: job object.
    :param args: Pilot arguments object.
    :param job_report_file: name of job report (string).
    :raises FileHandlingFailure: in case of IOError.
    :return: True if the report was copied, False otherwise (Boolean).
    """

    src_file = os.path.join(job.workdir, job_report_file)
    dst_file = os.path.join(args.harvester_workdir, job_report_file)

    try:
        logger.info("copy of payload report [{0}] to access point: {1}".format(
            job_report_file, args.harvester_workdir))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        return write_json(dst_file, job_report)
    except IOError:
        logger.error("job report copy failed")
        return False
def process_jobreport(payload_report_file, job_scratch_path, job_communication_point):
    """
    Copy the job report file to make it accessible by Harvester.
    Shrink the job report file.

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    src_file = os.path.join(job_scratch_path, payload_report_file)
    dst_file = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(
            payload_report_file, job_communication_point))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        write_json(dst_file, job_report)
    except IOError:
        logger.error("Job report copy failed, execution terminated:\n %s" % (sys.exc_info()[1]))
        raise FileHandlingFailure("Job report copy from RAM failed")
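

# Illustrative sketch (hypothetical helper, made-up report content) of the
# jobReport "shrink" performed by copy_jobreport/publish_job_report/
# process_jobreport above: every per-executor 'logfileReport' subtree is
# emptied so that the copy placed at the Harvester access point stays small.
def _demo_shrink_jobreport():
    job_report = {
        'exitCode': 0,
        'executor': [
            {'name': 'athena', 'logfileReport': {'countSummary': {'WARNING': 12}}},
        ]
    }
    if 'executor' in job_report:
        for executor in job_report['executor']:
            if 'logfileReport' in executor:
                executor['logfileReport'] = {}
    # -> {'exitCode': 0, 'executor': [{'name': 'athena', 'logfileReport': {}}]}
    return job_report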
def get_schedconfig_queuedata(queue):
    """
    Return and store the schedconfig queuedata.

    :param queue: PanDA queue name (e.g. BNL_PROD_MCORE).
    :return: schedconfig queuedata json dictionary (False if the schedconfig URL is not set).
    """

    # read it locally if the queuedata file already exists
    filename = os.path.join(os.environ.get('PILOT_HOME'), config.Information.queuedata)
    if os.path.exists(filename):
        queuedata = read_json(filename)
        return queuedata

    url = config.Information.schedconfig
    if url == "":
        logger.fatal('URL for schedconfig not set')
        return False
    else:
        # add the queue name to the URL
        if not url.endswith('/'):
            url += '/'
        url += queue + '.all.json'

    queuedata = retrieve_json(url)

    # also write the queuedata to disk
    if not write_json(filename, queuedata):
        logger.warning("failed to write queuedata json to file")
    else:
        logger.info("wrote queuedata to local file %s" % filename)

    return queuedata
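

# Minimal sketch (hypothetical helper and base URL) of the per-queue URL
# construction done in get_schedconfig_queuedata() above; the real base URL
# comes from config.Information.schedconfig.
def _demo_schedconfig_url(queue, base_url='http://pandaserver.example.org/schedconfig'):
    if not base_url.endswith('/'):
        base_url += '/'
    return base_url + queue + '.all.json'

# _demo_schedconfig_url('BNL_PROD_MCORE') gives
# 'http://pandaserver.example.org/schedconfig/BNL_PROD_MCORE.all.json'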
def handle_updated_job_object(job, xdata, label='stage-in'):
    """
    Handle updated job object fields.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param label: 'stage-in/out' (string).
    :return:
    :raises: StageInFailure, StageOutFailure
    """

    dictionary_name = config.Container.stagein_status_dictionary if label == 'stage-in' else config.Container.stageout_status_dictionary

    # read the JSON file created by the stage-in/out script
    if path.exists(path.join(job.workdir, dictionary_name + '.log')):
        dictionary_name += '.log'
    file_dictionary = read_json(path.join(job.workdir, dictionary_name))

    # update the job object accordingly
    if file_dictionary:
        # get file info and set essential parameters
        for fspec in xdata:
            try:
                fspec.status = file_dictionary[fspec.lfn][0]
                fspec.status_code = file_dictionary[fspec.lfn][1]
                if label == 'stage-in':
                    fspec.turl = file_dictionary[fspec.lfn][2]
                    fspec.ddmendpoint = file_dictionary[fspec.lfn][3]
                else:
                    fspec.surl = file_dictionary[fspec.lfn][2]
                    fspec.turl = file_dictionary[fspec.lfn][3]
                    fspec.checksum['adler32'] = file_dictionary[fspec.lfn][4]
                    fspec.filesize = file_dictionary[fspec.lfn][5]
            except Exception as exc:
                msg = "exception caught while reading file dictionary: %s" % exc
                logger.warning(msg)
                if label == 'stage-in':
                    raise StageInFailure(msg)
                else:
                    raise StageOutFailure(msg)

        # get main error info ('error': [error_diag, error_code])
        error_diag = file_dictionary['error'][0]
        error_code = file_dictionary['error'][1]
        if error_code:
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error_code, msg=error_diag)
    else:
        msg = "%s file dictionary not found" % label
        logger.warning(msg)
        if label == 'stage-in':
            raise StageInFailure(msg)
        else:
            raise StageOutFailure(msg)
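

# Sketch of the status dictionary layout implied by the indexing in
# handle_updated_job_object() above (the LFN, TURL and endpoint are made up):
# stage-in entries carry [status, status_code, turl, ddmendpoint]; stage-out
# entries carry [status, status_code, surl, turl, adler32, filesize]; the
# mandatory 'error' entry carries [error_diag, error_code].
_example_stagein_status_dictionary = {
    'EVNT.01234._000001.pool.root.1':
        ['transferred', 0, 'root://dcache.example.org//atlas/EVNT.01234._000001.pool.root.1', 'EXAMPLE_DATADISK'],
    'error': ['', 0],
}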
def read_pilot_timing():
    """
    Read the pilot timing dictionary from file.

    :return: pilot timing dictionary (json dictionary).
    """

    pilot_timing_dictionary = {}

    path = os.path.join(os.environ.get('PILOT_HOME', ''), config.Pilot.timing_file)
    if os.path.exists(path):
        pilot_timing_dictionary = read_json(path)

    return pilot_timing_dictionary
def get_job(harvesterpath):
    """
    Return the job description object and the MPI rank (if applicable).

    :param harvesterpath: path to config.Harvester.jobs_list_file (string).
    :return: job object, rank (int).
    """

    rank = 0
    job = None

    logger.info("Going to read job definition from file")
    pandaids_list_filename = os.path.join(harvesterpath, config.Harvester.jobs_list_file)
    if not os.path.isfile(pandaids_list_filename):
        logger.info("File with PanDA IDs is missing. Nothing to execute.")
        return job, rank

    harvesterpath = os.path.abspath(harvesterpath)
    rank, max_ranks = get_ranks_info()

    pandaids = read_json(pandaids_list_filename)
    logger.info('Got {0} job ids'.format(len(pandaids)))
    pandaid = pandaids[rank]
    job_workdir = os.path.join(harvesterpath, str(pandaid))
    logger.info('Rank: {2} with job {0} will have work directory {1}'.format(pandaid, job_workdir, rank))

    job_def_filename = os.path.join(job_workdir, config.Harvester.pandajob_file)
    jobs_dict = read_json(job_def_filename)
    job_dict = jobs_dict[str(pandaid)]
    job = JobDescription()
    job.load(job_dict)

    return job, rank
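

# Sketch of the Harvester file layout consumed by get_job() above, with
# made-up PanDA ids: the jobs list file holds a JSON list of ids, and each
# job's work directory contains a pandajob file keyed by the id (as strings).
#   <harvesterpath>/<jobs_list_file>:            [4443344567, 4443344568]
#   <harvesterpath>/4443344567/<pandajob_file>:  {"4443344567": {"jobPars": "...", ...}}
# Rank N of an MPI pilot picks the N-th id from the list.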
def get_memory_values(workdir, name=""):
    """
    Find the values in the memory monitor output file.

    In case the summary JSON file has not yet been produced, create a summary dictionary with the same format
    using the output text file (produced by the memory monitor and which is updated once per minute).

    FORMAT:
       {"Max": {"maxVMEM": 40058624, "maxPSS": 10340177, "maxRSS": 16342012, "maxSwap": 16235568},
        "Avg": {"avgVMEM": 19384236, "avgPSS": 5023500, "avgRSS": 6501489, "avgSwap": 5964997},
        "Other": {"rchar": NN, "wchar": NN, "rbytes": NN, "wbytes": NN}}

    :param workdir: relevant work directory (string).
    :param name: name of memory monitor (string).
    :return: memory values dictionary.
    """

    summary_dictionary = {}

    # get the path to the proper memory info file (priority ordered)
    path = get_memory_monitor_info_path(workdir, allowtxtfile=True)
    if os.path.exists(path):
        logger.info("using path: %s (trf name=%s)" % (path, name))

        # Does a JSON summary file exist? If so, there's no need to calculate maximums and averages in the pilot
        if path.lower().endswith('json'):
            # read the dictionary from the JSON file
            summary_dictionary = read_json(path)
        else:
            # loop over the output file, line by line, and look for the maximum PSS value
            if name == "prmon":
                summary_dictionary = get_average_summary_dictionary_prmon(path)
            else:
                summary_dictionary = get_average_summary_dictionary(path)
        logger.debug('summary_dictionary=%s (trf name=%s)' % (str(summary_dictionary), name))
    else:
        if path == "":
            logger.warning("filename not set for memory monitor output")
        else:
            # normally this means that the memory output file has not been produced yet
            pass

    return summary_dictionary
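

# Minimal usage sketch (hypothetical helper, made-up work directory) for
# get_memory_values(); the 'Max'/'maxPSS' keys follow the FORMAT documented
# in the docstring above.
def _demo_max_pss(workdir):
    summary = get_memory_values(workdir, name="prmon")
    return summary.get('Max', {}).get('maxPSS', 0)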
def parse_job_definition_file(filename):
    """
    Parse the Harvester job definition file and re-package the job definition dictionaries.

    The format of the Harvester job definition dictionary is
        dict = { job_id: { key: value, .. }, .. }
    The function returns a list of these dictionaries, each re-packaged as
        dict = { key: value }
    (where the job_id is now one of the key-value pairs: 'jobid': job_id).

    :param filename: file name (string).
    :return: list of job definition dictionaries.
    """

    job_definitions_list = []

    # re-package the dictionaries
    job_definitions_dict = read_json(filename)
    if job_definitions_dict:
        for job_id in job_definitions_dict:
            res = {'jobid': job_id}
            res.update(job_definitions_dict[job_id])
            job_definitions_list.append(res)

    return job_definitions_list
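

# Worked example (made-up job id and field) of the re-packaging performed by
# parse_job_definition_file():
#   input file content:  {"4443344567": {"taskID": 456}}
#   returned list:       [{"jobid": "4443344567", "taskID": 456}]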
# get the args from the arg parser
args = get_args()
args.debug = True
args.nopilotlog = False

establish_logging(args, filename=config.Pilot.stageinlog)
logger = logging.getLogger(__name__)

#ret = verify_args()
#if ret:
#    exit(ret)

# get the file info
try:
    replica_dictionary = read_json(os.path.join(args.workdir, args.replicadictionary))
except Exception as e:
    message('exception caught reading json: %s' % e)
    exit(1)

# file_list_dictionary = get_file_lists(args.lfns, args.scopes, args.filesizes, args.checksums, args.allowlans,
#                                       args.allowwans, args.directaccesslans, args.directaccesswans, args.istars,
#                                       args.accessmodes, args.storagetokens, args.guids)
# lfns = file_list_dictionary.get('lfns')
# scopes = file_list_dictionary.get('scopes')
# filesizes = file_list_dictionary.get('filesizes')
# checksums = file_list_dictionary.get('checksums')
# allowlans = file_list_dictionary.get('allowlans')
# allowwans = file_list_dictionary.get('allowwans')
# directaccesslans = file_list_dictionary.get('directaccesslans')
# directaccesswans = file_list_dictionary.get('directaccesswans')
#ret = verify_args()
#if ret:
#    exit(ret)

# get the file info
lfns, scopes = get_file_lists(args.lfns, args.scopes)
if len(lfns) != len(scopes):
    message('file lists not same length: len(lfns)=%d, len(scopes)=%d' % (len(lfns), len(scopes)))

# get the initial trace report
path = os.path.join(args.workdir, args.tracereportname)
if not os.path.exists(path):
    message('file does not exist: %s' % path)
    exit(NO_TRACEREPORT)
trace_report = read_json(path)
if not trace_report:
    message('failed to read trace report')
    exit(NO_TRACEREPORT)

try:
    infoservice = InfoService()
    infoservice.init(args.queuename, infosys.confinfo, infosys.extinfo)
    infosys.init(args.queuename)  # is this correct? otherwise infosys.queuedata doesn't get set
except Exception as e:
    message(e)

# perform stage-in (single transfers)
err = ""
for lfn, scope in list(zip(lfns, scopes)):
    try:
def run(args):
    """
    Main execution function for the generic HPC workflow.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set the communication point; the worker report should be placed there, matched with the working directory of Harvester
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()

    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference
        resource = __import__('pilot.resource.%s' % args.hpc_resource, globals(), locals(), [args.hpc_resource], 0)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(), globals(), locals(), [args.pilot_user.lower()], 0)

        # get the job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)

        # cd to the job working directory
        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be published in: {0}".format(worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # get the HPC specific setup commands
        logger.info('setup for resource %s: %s' % (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # prepare the job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # basic execution; should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")

        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)

        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command, stdout=payloadstdout, stderr=payloadstderr, shell=True)
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        t = [x - y for x, y in zip(t1, t0)]
        t_tot = sum(t[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(exit_code, job.jobid)
        log_jobreport += "CPU consumption time: {0} JobID: {1} \n".format(t_tot, job.jobid)
        log_jobreport += "Start time: {0} JobID: {1} \n".format(start_time, job.jobid)
        log_jobreport += "End time: {0} JobID: {1} \n".format(end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec. JobID: {1} \n".format(exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # parse the job report file and update the work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir, work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        protectedfiles = list(job.output_files.keys())

        # the log file is not produced (yet), so it should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log file was not declared")

        logger.info("Cleanup of working directory")
        protectedfiles.extend([worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

        # copy the output to the shared FS for stage-out
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(), args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args)

    except Exception as e:
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logging.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
try:
    job_id = panda_ids[rank]
except IndexError:
    logger.critical("Pilot has no job for rank {0}".format(rank))
    logger.critical("Exiting pilot")
    main_exit(1)

logger.debug("Job [{0}] will be processed".format(job_id))
os.chdir(str(job_id))
worker_communication_point = os.getcwd()

work_report['workdir'] = worker_communication_point
workerAttributesFile = os.path.join(worker_communication_point, workerAttributesFile)
trans_job_workdir = os.path.join(scratch_path, str(job_id))

jobs_dict = read_json("HPCJobs.json")
job_dict = jobs_dict[str(job_id)]

job = JobDescription()
job.load(job_dict)
# add the path to the input files in RAM
for inp_file in job.input_files:
    job.input_files[inp_file]["scratch_path"] = os.path.join(trans_job_workdir, inp_file)

job.startTime = ""
job.endTime = ""
setup_str = "; ".join(get_setup(job))

job_working_dir = titan_prepare_wd(scratch_path, trans_job_workdir, worker_communication_point, job,
                                   workerAttributesFile)