def interrupt(args, signum, frame):
    """
    Signal handler that tells the pilot threads to abort the current job.

    The signal number is translated into the first matching name found in the signal module, the
    kill signal time stamp is stored in the pilot timing for job ids '0' and '1', and abort_job is
    set. The handler then waits for job_aborted before finally setting graceful_stop.

    :param args: pilot arguments (args.signal is set here; abort_job, job_aborted and graceful_stop are used).
    :param signum: signal number.
    :param frame: stack/execution frame that was interrupted by the signal (not used here).
    :return:
    """

    # translate the signal number into a name
    try:
        names = [name for name, value in signal.__dict__.iteritems() if value == signum]  # Python 2
        sig = names[0]
    except Exception:
        names = [name for name, value in list(signal.__dict__.items()) if value == signum]  # Python 3
        sig = names[0]

    # the kill signal time stamp is stored for both job id '0' and '1'
    for timing_id in ('0', '1'):
        add_to_pilot_timing(timing_id, PILOT_KILL_SIGNAL, time(), args)

    logger.warning('caught signal: %s' % sig)
    args.signal = sig

    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()

    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()

    logger.warning('setting graceful stop (in case it was not set already), pilot will abort')
    args.graceful_stop.set()
def post_payload(self, job):
    """
    Hook executed after the payload has run.

    Currently it only records the post-payload time stamp in the pilot timing.

    :param job: job object (its jobid is used as the timing key).
    """

    # take the time stamp first, then store it in the pilot timing
    timestamp = time.time()
    add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, timestamp, self.__args)
def post_setup(self, job):
    """
    Hook executed after the setup step.

    Currently it only records the post-setup time stamp in the pilot timing.

    :param job: job object (its jobid is used as the timing key).
    """

    # take the time stamp first, then store it in the pilot timing
    timestamp = time.time()
    add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, timestamp, self.__args)
def post_payload(self, job):
    """
    Run the post-payload actions, i.e. store the post-payload time stamp in the pilot timing.

    :param job: job object (its jobid is used as the timing key).
    """

    # take the time stamp first, then store it in the pilot timing
    finished_at = time.time()
    add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, finished_at, self.__args)
def interrupt(args, signum, frame):
    """
    Signal handler that tells the pilot threads to abort the current job.

    Every call increases args.signal_counter. The time of the first kill signal is remembered in
    args.kill_time; if a later signal arrives more than MAX_KILL_WAIT_TIME + 60 seconds after it,
    the source directory is removed (best effort), logging is shut down and kill_processes() is
    called for the current process id.

    Otherwise the kill signal time stamp is stored in the pilot timing for job ids '0' and '1',
    abort_job is set, the handler waits for job_aborted and finally sets graceful_stop.

    :param args: pilot arguments.
    :param signum: signal number.
    :param frame: stack/execution frame that was interrupted by the signal (included in the log message).
    :return:
    """

    # translate the signal number into a name
    try:
        names = [name for name, value in signal.__dict__.iteritems() if value == signum]
        sig = names[0]
    except Exception:
        names = [name for name, value in list(signal.__dict__.items()) if value == signum]
        sig = names[0]

    args.signal_counter += 1

    # remember when the first kill signal arrived, so that any stuck loops can be aborted at a defined cut-off time
    if args.kill_time == 0:
        args.kill_time = int(time())

    # one extra minute of grace on top of the maximum wait time lets the threads finish
    grace_limit = MAX_KILL_WAIT_TIME + 60
    now = int(time())
    if args.kill_time and now - args.kill_time > grace_limit:
        logger.warning('passed maximum waiting time after first kill signal - will commit suicide - farewell')
        try:
            rmtree(args.sourcedir)
        except Exception as exc:
            logger.warning(exc)
        logging.shutdown()
        kill_processes(getpid())

    # the kill signal time stamp is stored for both job id '0' and '1'
    for timing_id in ('0', '1'):
        add_to_pilot_timing(timing_id, PILOT_KILL_SIGNAL, time(), args)

    stack_dump = '\n'.join(traceback.format_stack(frame))
    logger.warning('caught signal: %s in FRAME=\n%s' % (sig, stack_dump))
    args.signal = sig

    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()

    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()

    logger.warning('setting graceful stop (in case it was not set already), pilot will abort')
    args.graceful_stop.set()
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    Depending on the middleware container type of the queue, the transfer is either delegated to a
    script (containerised middleware) or performed directly with a stage-in client. Errors are
    logged (and for pilot exceptions in the direct mode also added to the job error codes); the
    outcome is derived from the final status of the files in job.indata.

    :param args: pilot args object (queue and input_dir are used).
    :param job: job object.
    :return: True in case of success
    """

    # debugging aid (tested ok): os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    skip_special_files(job)

    # remove any files that are not to be transferred (DBRelease files) from the indata list
    update_indata(job)

    label = 'stage-in'

    # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)?
    middleware_type = job.infosys.queuedata.container_type.get("middleware")
    if pilot.util.middleware.use_middleware_script(middleware_type):
        logger.info('stage-in will be done by a script')
        try:
            eventtype, localsite, remotesite = get_trace_report_variables(job, label=label)
            pilot.util.middleware.containerise_middleware(
                job,
                job.indata,
                args.queue,
                eventtype,
                localsite,
                remotesite,
                job.infosys.queuedata.container_options,
                args.input_dir,
                label=label,
                container_type=job.infosys.queuedata.container_type.get("middleware"),
            )
        except PilotException as error:
            logger.warning('stage-in containerisation threw a pilot exception: %s', error)
        except Exception as error:
            import traceback
            logger.warning('stage-in containerisation threw an exception: %s', error)
            logger.error(traceback.format_exc())
    else:
        try:
            logger.info('stage-in will not be done in a container')

            # create the trace report
            trace_report = create_trace_report(job, label=label)

            # event service merge jobs use a dedicated client and activity
            if job.is_eventservicemerge:
                client_class, activity = StageInESClient, 'es_events_read'
            else:
                client_class, activity = StageInClient, 'pr'
            client = client_class(job.infosys, logger=logger, trace_report=trace_report)

            transfer_options = {
                'workdir': job.workdir,
                'cwd': job.workdir,
                'usecontainer': False,
                'use_pcache': job.infosys.queuedata.use_pcache,
                'use_bulk': False,
                'input_dir': args.input_dir,
                'use_vp': job.use_vp,
                'catchall': job.infosys.queuedata.catchall,
            }
            client.prepare_sources(job.indata)
            client.transfer(job.indata, activity=activity, **transfer_options)
        except PilotException as error:
            import traceback
            error_msg = traceback.format_exc()
            logger.error(error_msg)
            code = error.get_error_code()
            diagnostics = errors.format_diagnostics(code, error_msg)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=diagnostics)
        except Exception as error:
            logger.error('failed to stage-in: error=%s', error)

    logger.info('summary of transferred files:')
    for fspec in job.indata:
        shown_status = fspec.status or "(not transferred)"
        logger.info(" -- lfn=%s, status_code=%s, status=%s", fspec.lfn, fspec.status_code, shown_status)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file without one of the accepted final states means that the stage-in failed
    accepted = ('remote_io', 'transferred', 'no_transfer')
    remain_files = [fspec for fspec in job.indata if fspec.status not in accepted]
    if remain_files:
        logger.info("stage-in failed")
    else:
        logger.info("stage-in finished")

    return not remain_files
def _stage_out_new(job, args):
    """
    Stage out the output files and/or the log file of the given job.

    If the job has no output files, or is an event service job, job.stageout is forced to 'log' and
    only the log file is transferred. The details of all transferred files are stored in
    job.fileinfo. On failure the STAGEOUTFAILED error code is set and the job state becomes
    'failed'; on success the job state becomes 'finished' unless another state (other than
    'stageout') was already set.

    :param job: job object.
    :param args: pilot args object (queue, output_dir and cleanup are used).
    :return: True in case of success, False otherwise.
    """

    # debugging aid: os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    if not job.outdata or job.is_eventservice:
        logger.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    all_ok = True

    # stage-out of the output files
    if job.stageout != 'log':
        outputs_ok = _do_stageout(job, job.outdata, ['pw', 'w'], args.queue, title='output',
                                  output_dir=args.output_dir)
        if not outputs_ok:
            all_ok = False
            logger.warning('transfer of output file(s) failed')

    # stage-out of the log file
    if job.stageout in ('log', 'all') and job.logdata:
        if job.get_status('LOG_TRANSFER') != LOG_TRANSFER_NOT_DONE:
            logger.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS

        # only the first available log file is considered
        first_log = job.logdata[0]
        try:
            tarball_name = 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue)
            create_log(
                job.workdir,
                first_log.lfn,
                tarball_name,
                args.cleanup,
                input_files=[fspec.lfn for fspec in job.indata],
                output_files=[fspec.lfn for fspec in job.outdata],
                is_looping=errors.LOOPINGJOB in job.piloterrorcodes,
                debugmode=job.debug,
            )
        except LogFileCreationFailure as error:
            logger.warning('failed to create tar file: %s', error)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        log_ok = _do_stageout(job, [first_log], ['pl', 'pw', 'w'], args.queue, title='log',
                              output_dir=args.output_dir)
        if log_ok:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
        else:
            all_ok = False
            logger.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
    elif not job.logdata:
        logger.info('no log was defined - will not create log file')
        job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to Panda (transferred files only)
    job.fileinfo = {
        fspec.lfn: {
            'guid': fspec.guid,
            'fsize': fspec.filesize,
            'adler32': fspec.checksum.get('adler32'),
            'surl': fspec.turl,
        }
        for fspec in job.outdata + job.logdata
        if fspec.status in ['transferred']
    }

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not all_ok:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        logger.warning('stage-out failed')
        return False

    logger.info('stage-out finished correctly')

    # an already set job state is left untouched, unless it is the intermediate 'stageout' state
    if not job.state or job.state == 'stageout':
        logger.debug('changing job state from %s to finished', job.state)
        set_pilot_state(job=job, state="finished")

    # the final server update (send_state with state 'finished') is currently disabled here

    return all_ok
def _stage_out_new(job, args):
    """
    Stage out the output files and/or the log file of the given job.

    If the job has no output files, or is an event service job, job.stageout is forced to 'log' and
    only the log file is transferred. The details of all transferred files are stored in
    job.fileinfo. On failure the STAGEOUTFAILED error code is set and the job state becomes
    'failed'; on success the job state becomes 'finished' unless a state was already set.

    :param job: job object.
    :param args: pilot args object (only used for the pilot timing).
    :return: True in case of success, False otherwise.
    """

    log = get_logger(job.jobid)

    # debugging aid: os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    if not job.outdata or job.is_eventservice:
        log.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    all_ok = True

    # stage-out of the output files
    if job.stageout != 'log':
        outputs_ok = _do_stageout(job, job.outdata, ['pw', 'w'], title='output')
        if not outputs_ok:
            all_ok = False
            log.warning('transfer of output file(s) failed')

    # stage-out of the log file
    if job.stageout in ('log', 'all') and job.logdata:
        if job.get_status('LOG_TRANSFER') != LOG_TRANSFER_NOT_DONE:
            log.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS

        # only the first available log file is considered
        first_log = job.logdata[0]
        try:
            create_log(job, first_log, 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue))
        except LogFileCreationFailure as exc:
            log.warning('failed to create tar file: %s' % exc)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        if _do_stageout(job, [first_log], ['pl', 'pw', 'w'], title='log'):
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
        else:
            all_ok = False
            log.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be sent to Panda (transferred files only)
    job.fileinfo = {
        fspec.lfn: {
            'guid': fspec.guid,
            'fsize': fspec.filesize,
            'adler32': fspec.checksum.get('adler32'),
            'surl': fspec.turl,
        }
        for fspec in job.outdata + job.logdata
        if fspec.status in ['transferred']
    }
    log.info('prepared job.fileinfo=%s' % job.fileinfo)

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not all_ok:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        # a more detailed message (error code and diagnostics) and the 'failed' server update are currently disabled
        log.warning('stage-out failed')
        return False

    log.info('stage-out finished correctly')

    # an already set job state is left untouched
    if not job.state:
        set_pilot_state(job=job, state="finished")

    # the final server update (send_state with state 'finished') is currently disabled here

    return all_ok
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    DBRelease files are flagged as 'no_transfer' and removed from job.indata once the trace report
    has been created. The remaining files are transferred with a stage-in client; errors are logged
    (and for pilot exceptions also added to the job error codes). The outcome is derived from the
    final status of the files in job.indata.

    :param args: pilot args object (only used for the pilot timing).
    :param job: job object.
    :return: True in case of success
    """

    log = get_logger(job.jobid)

    # debugging aid (tested ok): os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    for fspec in job.indata:
        if 'DBRelease' in fspec.lfn:
            fspec.status = 'no_transfer'

    # analysis jobs get their own event type
    # (the '_logs' and '_logs_os' suffixes for log transfers are currently not used)
    event_type = "get_sm" + ("_a" if job.is_analysis() else "")

    # the same RSE is used for both the local and the remote site
    rse = get_rse(job.indata)
    trace_report = TraceReport(pq='', localSite=rse, remoteSite=rse, dataset="", eventType=event_type)
    trace_report.init(job)

    # now that the trace report has been created, remove any files that are not to be transferred
    # (DBRelease files) from the indata list
    skipped = [fspec for fspec in job.indata if fspec.status == 'no_transfer']
    for fspec in skipped:
        logger.info('removing fspec object (lfn=%s) from list of input files' % fspec.lfn)
        job.indata.remove(fspec)

    try:
        # event service merge jobs use a dedicated client and activity
        if job.is_eventservicemerge:
            client_class, activity = StageInESClient, 'es_events_read'
        else:
            client_class, activity = StageInClient, 'pr'
        client = client_class(job.infosys, logger=log, trace_report=trace_report)

        transfer_options = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job}
        client.transfer(job.indata, activity=activity, **transfer_options)
    except PilotException as error:
        log.error('PilotException caught: %s' % error)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code())
    except Exception as error:
        log.error('failed to stage-in: error=%s' % error)

    log.info('summary of transferred files:')
    for fspec in job.indata:
        shown_status = fspec.status or "(not transferred)"
        log.info(" -- lfn=%s, status_code=%s, status=%s" % (fspec.lfn, fspec.status_code, shown_status))

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file without one of the accepted final states means that the stage-in failed
    accepted = ('remote_io', 'transferred', 'no_transfer')
    remain_files = [fspec for fspec in job.indata if fspec.status not in accepted]
    log.info("stage-in failed" if remain_files else "stage-in finished")

    return not remain_files
""" # get the args from the arg parser args = get_args() # Define and set the main harvester control boolean args.harvester = is_harvester_mode(args) # initialize the pilot timing dictionary args.timing = {} # TODO: move to singleton? # initialize job status dictionary (e.g. used to keep track of log transfers) args.job_status = {} # TODO: move to singleton or to job object directly? # store T0 time stamp add_to_pilot_timing('0', PILOT_START_TIME, time.time(), args) add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) # if requested by the wrapper via a pilot option, create the main pilot workdir and cd into it args.sourcedir = getcwd() #get_pilot_source_dir() exit_code, mainworkdir = create_main_work_dir(args) if exit_code != 0: sys.exit(exit_code) # set environment variables (to be replaced with singleton implementation) set_environment_variables(args, mainworkdir) # setup and establish standard logging establish_logging(debug=args.debug, nopilotlog=args.nopilotlog)
def set_scratch_workdir(job, work_dir, args):
    """
    Copy input files and some db files to RAM disk.

    If the scratch directory exists, the 'tmp', sqlite and geomDB directories are prepared in it,
    the two DB files are copied there, and the job input files are copied from work_dir into a job
    specific scratch directory. The process then changes directory to the job scratch directory
    and creates the './poolcond' symbolic link. The pre/post stage-in time stamps are stored in the
    pilot timing around the copy step.

    All destination paths are built with os.path.join(), so the result does not depend on whether
    the configured scratch path ends with a path separator.

    :param job: job object.
    :param work_dir: job working directory (permanent FS) (string).
    :param args: args dictionary to collect timing metrics.
    :raises FileHandlingFailure: if an I/O error occurs while copying to the scratch directory.
    :return: job working directory in scratch (string), or work_dir if the scratch directory does not exist.
    """

    scratch_path = config.HPC.scratch
    # NOTE(review): disk_usage() is called before the existence check below - confirm that it copes
    # with a non-existing scratch path, otherwise the fallback to work_dir can never be reached
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space: {0} used: {1}".format(du.free, du.used))
    job_scratch_dir = os.path.join(scratch_path, str(job.jobid))
    for inp_file in job.input_files:
        job.input_files[inp_file]["scratch_path"] = job_scratch_dir

    logger.debug("Job scratch path: {0}".format(job_scratch_dir))

    # special data, that should be preplaced in RAM disk
    dst_db_path = 'sqlite200/'
    dst_db_filename = 'ALLP200.db'
    dst_db_path_2 = 'geomDB/'
    dst_db_filename_2 = 'geomDB_sqlite'
    tmp_path = 'tmp/'
    src_file = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/sqlite200/ALLP200.db'
    src_file_2 = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/geomDB/geomDB_sqlite'

    # destinations inside the scratch area (joined, not concatenated, so that a scratch path
    # without a trailing separator still yields paths below the scratch directory)
    tmp_dir = os.path.join(scratch_path, tmp_path)
    sqlite_dir = os.path.join(scratch_path, dst_db_path)
    geomdb_dir = os.path.join(scratch_path, dst_db_path_2)

    if os.path.exists(scratch_path):
        try:
            add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)
            logger.debug("Prepare \'tmp\' dir in scratch ")
            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)

            logger.debug("Prepare dst and copy sqlite db files")
            t0 = time.time()
            if not os.path.exists(sqlite_dir):
                os.makedirs(sqlite_dir)
            shutil.copyfile(src_file, os.path.join(sqlite_dir, dst_db_filename))
            logger.debug("")
            sql_cp_time = time.time() - t0
            logger.debug("Copy of sqlite files took: {0}".format(sql_cp_time))

            logger.debug("Prepare dst and copy geomDB files")
            t0 = time.time()
            if not os.path.exists(geomdb_dir):
                os.makedirs(geomdb_dir)
            shutil.copyfile(src_file_2, os.path.join(geomdb_dir, dst_db_filename_2))
            geomdb_cp_time = time.time() - t0
            logger.debug("Copy of geomDB files took: {0} s".format(geomdb_cp_time))

            logger.debug("Prepare job scratch dir")
            t0 = time.time()
            if not os.path.exists(job_scratch_dir):
                os.makedirs(job_scratch_dir)

            logger.debug("Copy input file")
            for inp_file in job.input_files:
                source = os.path.join(work_dir, inp_file)
                destination_dir = job.input_files[inp_file]["scratch_path"]
                logger.debug("Copy: {0} to {1}".format(source, destination_dir))
                shutil.copyfile(source, os.path.join(destination_dir, inp_file))
            input_cp_time = time.time() - t0
            logger.debug("Copy of input files took: {0} s".format(input_cp_time))
        except IOError as e:
            logger.error("I/O error({0}): {1}".format(e.errno, e.strerror))
            logger.error("Copy to scratch failed, execution terminated': \n %s " % (sys.exc_info()[1]))
            raise FileHandlingFailure("Copy to RAM disk failed")
        finally:
            # the post stage-in time stamp is stored also when the copy failed
            add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)
    else:
        logger.info('Scratch directory (%s) dos not exist' % scratch_path)
        return work_dir

    os.chdir(job_scratch_dir)
    logger.debug("Current directory: {0}".format(os.getcwd()))

    # the link is relative, i.e. it is created in the job scratch directory entered above
    true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files'
    pseudo_dir = "./poolcond"
    os.symlink(true_dir, pseudo_dir)
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space for job: {0} used: {1}".format(du.free, du.used))

    return job_scratch_dir
def run(args):
    """
    Main execution function for the generic HPC workflow.

    The function gets a job from the HPC resource module, prepares the work and scratch
    directories, executes the payload, collects the job report, packs the log, copies the output
    back to the shared file system and declares the stage-out. The work report is published at the
    state changes, and time stamps are stored in the pilot timing throughout.

    Any exception is caught: the work report is then published with the job status 'failed' and
    the traces state is set to FAILURE.

    The traces object is created before the guarded section, so it is always available to the
    exception handler and as return value. The payload stdout/stderr files are closed also when
    the execution raises, and the code runs on both Python 2 and Python 3.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set communication point. Worker report should be placed there, matched with working directory of Harvester
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    # must exist before anything below can fail, since the exception handler updates it
    logger.info('setting up tracing')
    traces = namedtuple('traces', ['pilot'])
    traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference (absolute import; a negative level is not accepted by Python 3)
        resource = __import__('pilot.resource.%s' % args.hpc_resource, globals(), locals(),
                              [args.hpc_resource], 0)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(), globals(), locals(),
                          [args.pilot_user.lower()], 0)

        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, _rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)

        # cd to job working directory
        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' % (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        # NOTE(review): no separator is added between the setup string and the command - presumably
        # the resource setup takes care of that; confirm
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))

        # the context manager closes both files also if the payload execution raises
        with open(payload_stdout_file, "w") as payloadstdout, open(payload_stderr_file, "w") as payloadstderr:
            add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
            set_pilot_state(job=job, state="running")
            work_report["jobStatus"] = job.state
            work_report["startTime"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            start_time = time.asctime(time.localtime(time.time()))
            job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            publish_work_report(work_report, worker_attributes_file)

            stime = time.time()
            t0 = os.times()
            exit_code, _stdout, _stderr = execute(my_command, stdout=payloadstdout, stderr=payloadstderr,
                                                  shell=True)
            logger.debug("Payload exit code: {0}".format(exit_code))
            t1 = os.times()
            exetime = time.time() - stime
            end_time = time.asctime(time.localtime(time.time()))

        # element-wise difference of the os.times() values taken before and after the payload;
        # only element 2 is summed up, exactly as before
        times_delta = [after - before for before, after in zip(t0, t1)]
        t_tot = sum(times_delta[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0}  JobID: {1} \n".format(t_tot, job.jobid)
        log_jobreport += "Start time: {0}  JobID: {1} \n".format(start_time, job.jobid)
        log_jobreport += "End time: {0}  JobID: {1} \n".format(end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec.  JobID: {1} \n".format(exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse job report file and update of work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir, work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        # (a real list is needed, since a Python 3 dictionary view can neither be modified nor extended)
        protectedfiles = list(job.output_files)

        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log files was not declared")

        logger.info("Cleanup of working directory")

        protectedfiles.extend([worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy of output to shared FS for stageout
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(), args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args)

    except Exception as e:
        # report the failure to Harvester via the work report before flagging it in the traces
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logging.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    DBRelease files are flagged as 'no_transfer' and removed from job.indata once the trace report
    has been created. The remaining files are transferred with a stage-in client; errors are logged
    (and for pilot exceptions also added to the job error codes, including the formatted
    traceback). The outcome is derived from the final status of the files in job.indata.

    :param args: pilot args object (only used for the pilot timing).
    :param job: job object.
    :return: True in case of success
    """

    log = get_logger(job.jobid)

    # debugging aid (tested ok): os.kill(os.getpid(), signal.SIGUSR1)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    for fspec in job.indata:
        if 'DBRelease' in fspec.lfn:
            fspec.status = 'no_transfer'

    # analysis jobs get their own event type
    event_type = "get_sm" + ("_a" if job.is_analysis() else "")

    # the same RSE is used for both the local and the remote site
    rse = get_rse(job.indata)
    trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=rse, remoteSite=rse,
                               dataset="", eventType=event_type)
    trace_report.init(job)

    # now that the trace report has been created, remove any files that are not to be transferred
    # (DBRelease files) from the indata list
    skipped = [fspec for fspec in job.indata if fspec.status == 'no_transfer']
    for fspec in skipped:
        logger.info('removing fspec object (lfn=%s) from list of input files' % fspec.lfn)
        job.indata.remove(fspec)

    # NOTE: a bulk transfer test used to live here (disabled): the initial trace report was dumped
    # to a json file in the job work dir and the stagein.py script was executed with the lfns and
    # scopes of the input files. It worked, but there was an issue with the traces (check whether
    # the stage-in script stores them correctly) before it can be enabled

    try:
        # event service merge jobs use a dedicated client and activity
        if job.is_eventservicemerge:
            client_class, activity = StageInESClient, 'es_events_read'
        else:
            client_class, activity = StageInClient, 'pr'
        client = client_class(job.infosys, logger=log, trace_report=trace_report)

        transfer_options = {
            'workdir': job.workdir,
            'cwd': job.workdir,
            'usecontainer': False,
            'job': job,
            'use_bulk': False,
        }
        client.prepare_sources(job.indata)
        client.transfer(job.indata, activity=activity, **transfer_options)
    except PilotException as error:
        import traceback
        error_msg = traceback.format_exc()
        log.error(error_msg)
        diagnostics = errors.format_diagnostics(error.get_error_code(), error_msg)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=diagnostics)
    except Exception as error:
        log.error('failed to stage-in: error=%s' % error)

    log.info('summary of transferred files:')
    for fspec in job.indata:
        shown_status = fspec.status or "(not transferred)"
        log.info(" -- lfn=%s, status_code=%s, status=%s" % (fspec.lfn, fspec.status_code, shown_status))

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file without one of the accepted final states means that the stage-in failed
    accepted = ('remote_io', 'transferred', 'no_transfer')
    remain_files = [fspec for fspec in job.indata if fspec.status not in accepted]
    log.info("stage-in failed" if remain_files else "stage-in finished")

    return not remain_files