def copy_out(files, copy_type="mv", **kwargs):
    """
    Stage out the given files locally with either `mv` or `cp`.

    The transfer itself is delegated to `move_all_files`. Unless an
    `output_dir` keyword argument is given, an output list is afterwards
    created in the parent directory of `workdir` (needed for ARC CE).

    :param files: list of `FileSpec` objects.
    :param copy_type: transfer method, "mv" or "cp" (string).
    :param kwargs: must contain `workdir`; may contain `output_dir` and `ddmconf`.
    :return: the `files` list that was passed in.
    :raises StageOutFailure: for an unsupported copy_type, a missing workdir
        or a non-zero exit code from the transfer.
    """

    workdir = kwargs.get('workdir')

    if copy_type not in ("cp", "mv"):
        raise StageOutFailure("incorrect method for copy out")
    if not workdir:
        raise StageOutFailure("Workdir is not specified")

    exit_code, stdout, stderr = move_all_files(files, copy_type, workdir)
    if exit_code != 0:
        # stdout of the transfer is used as the failure message
        raise StageOutFailure(stdout)

    # the output list (if any) goes to the directory above the work directory
    init_dir = os.path.dirname(workdir)
    logger.debug('init_dir for output.list=%s' % init_dir)

    # create output list for ARC CE only when no explicit output_dir was given
    if not kwargs.get('output_dir', ''):
        create_output_list(files, init_dir, kwargs.get('ddmconf', None))

    return files
def handle_updated_job_object(job, xdata, label='stage-in'):
    """
    Update the file specs and the job object from the stage-in/out status file.

    The stage-in/out script writes a JSON dictionary to the job work
    directory. It is keyed by lfn, each value being a list of per-file fields,
    plus an 'error' entry of the form [error_diag, error_code].

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param label: 'stage-in/out' (string).
    :return:
    :raises: StageInFailure, StageOutFailure
    """

    is_stagein = label == 'stage-in'
    failure = StageInFailure if is_stagein else StageOutFailure

    if is_stagein:
        dictionary_name = config.Container.stagein_status_dictionary
    else:
        dictionary_name = config.Container.stageout_status_dictionary

    # prefer the '.log' variant of the status file when it exists
    if path.exists(path.join(job.workdir, dictionary_name + '.log')):
        dictionary_name += '.log'

    # read the JSON file created by the stage-in/out script
    file_dictionary = read_json(path.join(job.workdir, dictionary_name))
    if not file_dictionary:
        msg = "%s file dictionary not found" % label
        logger.warning(msg)
        raise failure(msg)

    # transfer the per-file info to the file specs
    for fspec in xdata:
        try:
            entry = file_dictionary[fspec.lfn]
            fspec.status = entry[0]
            fspec.status_code = entry[1]
            if is_stagein:
                fspec.turl = entry[2]
                fspec.ddmendpoint = entry[3]
            else:
                fspec.surl = entry[2]
                fspec.turl = entry[3]
                fspec.checksum['adler32'] = entry[4]
                fspec.filesize = entry[5]
        except Exception as exc:
            msg = "exception caught while reading file dictionary: %s" % exc
            logger.warning(msg)
            raise failure(msg)

    # main error info ('error': [error_diag, error_code])
    error_entry = file_dictionary['error']
    error_diag = error_entry[0]
    error_code = error_entry[1]
    if error_code:
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            error_code, msg=error_diag)
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout):
    """
    Upload a single file with the rucio upload client and verify it afterwards.

    :param fspec: FileSpec object of the file to upload.
    :param summary_file_path: passed to the upload client as `summary_file_path`.
    :param trace_report: trace object handed to the client when rucio tracing is on.
    :param trace_report_out: passed to the upload client as `traces_copy_out`.
    :param transfer_timeout: optional transfer timeout; only set when truthy.
    :return: trace_report_out.
    :raises StageOutFailure: if the post-upload verification fails or raises.
    """

    # rucio is imported lazily, at call time
    from rucio.client.uploadclient import UploadClient
    client = UploadClient(logger=logger)

    # tracing follows the module-level tracing_rucio switch
    if hasattr(client, 'tracing'):
        client.tracing = tracing_rucio
    if tracing_rucio:
        client.trace = trace_report

    # upload item; key order is kept since the dictionary is logged below
    local_path = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(fspec.workdir, fspec.lfn)
    item = {
        'path': local_path,
        'rse': fspec.ddmendpoint,
        'did_scope': fspec.scope,
        'no_register': True,
    }
    if transfer_timeout:
        item['transfer_timeout'] = transfer_timeout

    # NOTE: an earlier storageId/turl based variant, which also set 'pfn' for
    # non-deterministic storages, is disabled; only the guid rule below is active
    if fspec.lfn and '.root' in fspec.lfn:
        item['guid'] = fspec.guid

    logger.info('_stage_out_api: %s' % item)

    # the upload client raises an exception if any file failed; only the known
    # UnboundLocalError from its summary handling is tolerated here
    upload_result = None
    try:
        upload_result = client.upload([item],
                                      summary_file_path=summary_file_path,
                                      traces_copy_out=trace_report_out)
    except UnboundLocalError:
        logger.warning('rucio still needs a bug fix of the summary in the uploadclient')
    logger.debug('Rucio upload client returned %s' % upload_result)

    # physical check; any problem (including a negative result) is reported
    # as a StageOutFailure carrying the verification message
    try:
        present = verify_stage_out(fspec)
        logger.info('File exists at the storage: %s' % str(present))
        if not present:
            raise StageOutFailure('stageOut: Physical check after upload failed.')
    except Exception as exc:
        msg = 'stageOut: File existence verification failed with: %s' % str(exc)
        logger.info(msg)
        raise StageOutFailure(msg)

    return trace_report_out
def copy_out_old(files):
    """
    Upload the given files with the LSM tools.

    :param files: files to upload.
    :raises StageOutFailure: if no LSM tools are found, or if the transfer
        returns a non-zero exit code (its stdout becomes the message).
    """

    lsm_available = check_for_lsm(dst_in=False)
    if not lsm_available:
        raise StageOutFailure("No LSM tools found")

    exit_code, stdout, stderr = move_all_files_out(files)
    if exit_code == 0:
        return

    # raise failure
    raise StageOutFailure(stdout)
def copy_out(files, **kwargs):
    """
    Upload the given files one by one with the gfal-copy command.

    Each file gets its own trace record, which is sent after the transfer
    attempt. The first failing transfer aborts the loop.

    :param files: list of FileSpec objects to upload.
    :param kwargs: must contain `trace_report`; may contain `workdir`. All
        keyword arguments are forwarded to `execute`.
    :return: the `files` list that was passed in.
    :raises StageOutFailure: if the gfal tools are not available.
    :raises PilotException: if a transfer command returns a non-zero code.
    """

    if not check_for_gfal():
        raise StageOutFailure("No GFAL2 tools found")

    trace_report = kwargs.get('trace_report')

    for fspec in files:
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset,
                            url=fspec.surl, filesize=fspec.filesize)
        trace_report.update(catStart=time(), filename=fspec.lfn,
                            guid=fspec.guid.replace('-', ''))

        # directory of the local file: file spec first, then kwargs, then cwd
        local_dir = fspec.workdir or kwargs.get('workdir') or '.'
        timeout = get_timeout(fspec.filesize)
        local_file = fspec.surl or os.path.join(local_dir, fspec.lfn)
        source = "file://%s" % os.path.abspath(local_file)

        # the pieces are joined with single spaces further down
        parts = ['gfal-copy --verbose -f', ' -t %s' % timeout]
        if fspec.checksum:
            # only the first checksum entry is passed to gfal-copy
            algorithm, value = next(iter(fspec.checksum.items()))
            parts.extend(['-K', '%s:%s' % (algorithm, value)])
        parts.extend([source, fspec.turl])

        rcode, stdout, stderr = execute(" ".join(parts), **kwargs)

        if not rcode:
            fspec.status_code = 0
            fspec.status = 'transferred'
            trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
            trace_report.send()
            continue

        # error occurred: timeouts get a dedicated error, the rest is resolved
        # from the command output
        if rcode in (errno.ETIMEDOUT, errno.ETIME):
            error = {
                'rcode': ErrorCodes.STAGEOUTTIMEOUT,
                'state': 'CP_TIMEOUT',
                'error': 'Copy command timed out: %s' % stderr,
            }
        else:
            error = resolve_common_transfer_errors(stdout + stderr, is_stagein=False)

        fspec.status = 'failed'
        fspec.status_code = error.get('rcode')
        trace_report.update(clientState=error.get('state', None) or 'STAGEOUT_ATTEMPT_FAILED',
                            stateReason=error.get('error', 'unknown error'),
                            timeEnd=time())
        trace_report.send()

        raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

    return files
def copy_out(files, copy_type="mv", **kwargs):
    """
    Stage out the given files locally with either `mv` or `cp`.

    The transfer is delegated to `move_all_files`. Afterwards an output list
    is created in the parent directory of `workdir` (needed for ARC CE).

    :param files: list of `FileSpec` objects.
    :param copy_type: transfer method, "mv" or "cp" (string).
    :param kwargs: must contain `workdir`.
    :return: the `files` list that was passed in.
    :raises StageOutFailure: for an unsupported copy_type, a missing workdir
        or a non-zero exit code from the transfer.
    """

    supported_methods = ("cp", "mv")
    if copy_type not in supported_methods:
        raise StageOutFailure("Incorrect method for copy out")

    workdir = kwargs.get('workdir')
    if not workdir:
        raise StageOutFailure("Workdir is not specified")

    exit_code, stdout, stderr = move_all_files(files, copy_type, workdir)
    if exit_code != 0:
        # stdout of the transfer is used as the failure message
        raise StageOutFailure(stdout)

    # create output list for ARC CE, one level above the work directory
    list_dir = os.path.dirname(workdir)
    create_output_list(files, list_dir)

    return files
def stageout_es_real(self, output_file):
    """
    Stage out an event service output file, with one failover attempt.

    The file is first transferred using the `es_events` activity. If that
    fails with anything but MISSINGOUTPUTFILE, and the stage-out client could
    be created, a second transfer to the storage resolved for the
    `es_failover` activity is tried. It is skipped if that storage is the same
    endpoint as the first one.

    :param output_file: output file name (its basename is used as the lfn).
    :return: tuple (ddmendpoint, storage id, filesize, checksum) taken from
        the file spec that was transferred.
    :raises StageOutFailure: if the transfer fails with an unexpected exception,
        or ends without an error while the file is not marked 'transferred'.
    :raises PilotException: if the transfer fails with a pilot error and the
        failover does not succeed.
    """

    job = self.get_job()
    logger.info('prepare to stage-out eventservice files')

    error = None
    file_data = {
        'scope': 'transient',
        'lfn': os.path.basename(output_file),
    }
    file_spec = FileSpec(filetype='output', **file_data)
    xdata = [file_spec]
    kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job)

    try_failover = False
    ## FIX ME LATER: replace `pw` with `write_lan` once AGIS is updated (acopytools)
    activity = ['es_events', 'pw']

    try:
        client = StageOutESClient(job.infosys, logger=logger)
        # failover is only possible once the client exists
        try_failover = True

        ## IF ES job should be allowed to write only at `es_events` astorages,
        ## then fix activity names here
        client.prepare_destinations(xdata, activity)
        client.transfer(xdata, activity=activity, **kwargs)
    except PilotException as exc:
        # Do not bind the exception directly to `error`: Python 3 deletes the
        # `as` target when the except clause ends, which would leave `error`
        # unbound for the checks below.
        error = exc
        logger.error(exc.get_detail())
    except Exception as exc:
        logger.error(traceback.format_exc())
        error = StageOutFailure("stageOut failed with error=%s" % exc)

    logger.info('Summary of transferred files:')
    logger.info(" -- lfn=%s, status_code=%s, status=%s" %
                (file_spec.lfn, file_spec.status_code, file_spec.status))

    if error:
        logger.error('Failed to stage-out eventservice file(%s): error=%s' %
                     (output_file, error.get_detail()))
    elif file_spec.status != 'transferred':
        msg = 'Failed to stage-out ES file(%s): logic corrupted: unknown internal error, fspec=%s' % (
            output_file, file_spec)
        logger.error(msg)
        raise StageOutFailure(msg)

    failover_storage_activity = ['es_failover', 'pw']

    if try_failover and error and error.get_error_code() not in [ErrorCodes.MISSINGOUTPUTFILE]:
        ## try to failover to other storage
        xdata2 = [FileSpec(filetype='output', **file_data)]

        try:
            client.prepare_destinations(xdata2, failover_storage_activity)
            if xdata2[0].ddmendpoint != xdata[0].ddmendpoint:  ## skip transfer to same output storage
                msg = 'Will try to failover ES transfer to astorage with activity=%s, rse=%s' % (
                    failover_storage_activity, xdata2[0].ddmendpoint)
                logger.info(msg)
                # NOTE(review): the primary `activity` list is passed here, not
                # `failover_storage_activity` - confirm that this is intended
                client.transfer(xdata2, activity=activity, **kwargs)

                logger.info('Summary of transferred files (failover transfer):')
                logger.info(" -- lfn=%s, status_code=%s, status=%s" %
                            (xdata2[0].lfn, xdata2[0].status_code, xdata2[0].status))
        except PilotException as exc:
            if exc.get_error_code() == ErrorCodes.NOSTORAGE:
                logger.info('Failover ES storage is not defined for activity=%s .. skipped' %
                            failover_storage_activity)
            else:
                logger.error('Transfer to failover storage=%s failed .. skipped, error=%s' %
                             (xdata2[0].ddmendpoint, exc.get_detail()))
        except Exception:
            logger.error('Failover ES stageout failed .. skipped')
            logger.error(traceback.format_exc())

        # a successful failover transfer clears the original error
        if xdata2[0].status == 'transferred':
            error = None
            file_spec = xdata2[0]

    if error:
        raise error

    storage_id = infosys.get_storage_id(file_spec.ddmendpoint)

    return file_spec.ddmendpoint, storage_id, file_spec.filesize, file_spec.checksum
def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options,
                            external_dir, label='stage-in', container_type='container'):
    """
    Containerise the middleware by performing stage-in/out steps in a script that in turn can be run in a container.

    Note: a container will only be used for option container_type='container'. If this is 'bash', then stage-in/out
    will still be done by a script, but not containerised.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param eventtype: passed through to `get_command`.
    :param localsite: passed through to `get_command`.
    :param remotesite: passed through to `get_command`.
    :param container_options: container options from queuedata (string).
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-in/out' (String).
    :param container_type: optional 'container/bash'
    :raises StageInFailure: for stage-in failures
    :raises StageOutFailure: for stage-out failures
    :return:
    """

    # get the name of the stage-in/out isolation script
    if label == 'stage-in':
        script = config.Container.middleware_container_stagein_script
    else:
        script = config.Container.middleware_container_stageout_script

    # a PilotException from any of the helpers below propagates unchanged
    cmd = get_command(job, xdata, queue, script, eventtype, localsite, remotesite, external_dir,
                      label=label, container_type=container_type)

    if container_type == 'container':
        # add bits and pieces needed to run the cmd in a container
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user,
                          globals(), locals(), [pilot_user], 0)  # Python 2/3
        cmd = user.create_middleware_container_command(job.workdir, cmd, container_options, label=label)
    else:
        logger.warning('%s will not be done in a container (but it will be done by a script)', label)

    try:
        logger.info('*** executing %s (logging will be redirected) ***', label)
        exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False)
    except Exception as exc:
        # best effort: the failure is picked up from the status file below
        logger.info('*** %s has failed ***', label)
        logger.warning('exception caught: %s', exc)
    else:
        if exit_code == 0:
            logger.info('*** %s has finished ***', label)
        else:
            logger.info('*** %s has failed ***', label)
            logger.warning('stderr:\n%s', stderr)
            logger.warning('stdout:\n%s', stdout)
        logger.debug('%s script returned exit_code=%d', label, exit_code)

        # write stdout+stderr to files; stdout/stderr are only bound when
        # execute() returned, so this is done on this path only
        try:
            _stdout_name, _stderr_name = get_logfile_names(label)
            write_file(path.join(job.workdir, _stdout_name), stdout, mute=False)
            write_file(path.join(job.workdir, _stderr_name), stderr, mute=False)
        except PilotException as exc:
            msg = 'exception caught: %s' % exc
            if label == 'stage-in':
                raise StageInFailure(msg)
            else:
                raise StageOutFailure(msg)

    # handle errors, file statuses, etc (the stage-in/out scripts write errors and file status to a json file)
    handle_updated_job_object(job, xdata, label=label)
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout):
    """
    Upload a single file with the rucio upload client and verify it afterwards.

    Upload errors are handled in three ways:
    - an UnboundLocalError from the client (a known issue in its summary
      handling) is logged and tolerated;
    - any other exception is re-raised, unless the first entry of
      trace_report_out carries a 'stateReason', in which case the function
      continues and reports exit code -1;
    - regardless of the above, the file is verified with `verify_stage_out`.

    :param fspec: FileSpec object of the file to upload.
    :param summary_file_path: passed to the upload client as `summary_file_path`.
    :param trace_report: trace object handed to the client when rucio tracing is on.
    :param trace_report_out: list passed to the upload client as `traces_copy_out`.
    :param transfer_timeout: optional transfer timeout; only set when truthy.
    :return: tuple (exit code, trace_report_out); exit code is 0 or -1.
    :raises StageOutFailure: if the post-upload verification fails or raises.
    :raises Exception: the upload client's exception, when no state reason
        is available in trace_report_out.
    """

    ec = 0

    # init. upload client (rucio is imported lazily, at call time)
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient(logger=logger)

    # tracing follows the module-level tracing_rucio switch
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    f = {}
    f['path'] = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(fspec.workdir, fspec.lfn)
    f['rse'] = fspec.ddmendpoint
    f['did_scope'] = fspec.scope
    f['no_register'] = True

    if transfer_timeout:
        f['transfer_timeout'] = transfer_timeout
    f['connection_timeout'] = 60 * 60

    # NOTE: an earlier storageId/turl based variant, which also set 'pfn' for
    # non-deterministic storages, is disabled; only the guid rule below is active
    if fspec.lfn and '.root' in fspec.lfn:
        f['guid'] = fspec.guid

    logger.info('rucio API stage-out dictionary: %s' % f)

    # upload client raises an exception if any file failed
    try:
        logger.info('*** rucio API uploading file (taking over logging) ***')
        logger.debug('summary_file_path=%s' % summary_file_path)
        logger.debug('trace_report_out=%s' % trace_report_out)
        result = upload_client.upload([f], summary_file_path=summary_file_path,
                                      traces_copy_out=trace_report_out)
    except UnboundLocalError:
        # must come before the generic handler: UnboundLocalError is an
        # Exception subclass and would otherwise never reach this clause
        logger.warning('*** rucio API upload client failed ***')
        logger.warning('rucio still needs a bug fix of the summary in the uploadclient')
    except Exception as e:
        logger.warning('*** rucio API upload client failed ***')
        logger.warning('caught exception: %s' % e)
        import traceback
        logger.error(traceback.format_exc())
        logger.debug('trace_report_out=%s' % trace_report_out)
        # without a state reason there is nothing to report: propagate the
        # original exception (bare raise keeps its traceback)
        if not trace_report_out or not trace_report_out[0].get('stateReason'):
            raise
        ec = -1
    else:
        logger.warning('*** rucio API upload client finished ***')
        logger.debug('client returned %s' % result)

    # physical check; any problem (including a negative result) is reported
    # as a StageOutFailure carrying the verification message
    try:
        file_exists = verify_stage_out(fspec)
        logger.info('file exists at the storage: %s' % str(file_exists))
        if not file_exists:
            raise StageOutFailure('physical check after upload failed')
    except Exception as e:
        msg = 'file existence verification failed with: %s' % e
        logger.info(msg)
        raise StageOutFailure(msg)

    return ec, trace_report_out