def updateRunCommandList(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO, hasInput, prodDBlockToken):
    """ Update the run command list if --directIn is no longer needed.

    Scans every command in runCommandList and strips/adds transfer-mode
    related options (--directIn, --useFileStager, --usePFCTurl, --lfcHost,
    --oldPrefix, --newPrefix) depending on the actual transfer modes
    recorded in the file state dictionary and on the mover used.

    :param runCommandList: list of command strings to be executed
    :param pworkdir: pilot work directory (location of the file state dictionary)
    :param jobId: job id used to locate the file state dictionary
    :param statusPFCTurl: True/False/None - status of TURL based PFC creation
    :param analysisJob: boolean, True for user analysis jobs
    :param usedFAXandDirectIO: True when FAX was the primary mover with direct I/O;
        may also carry the special string values 'newmover'/'newmover-directaccess'
    :param hasInput: boolean, False when the job has no input files
    :param prodDBlockToken: list of input dataset block tokens (may contain 'local');
        may be None/empty for some jobs
    :return: updated run command list
    """
    # the method is using the file state dictionary

    # remove later
    dumpFileStates(pworkdir, jobId, ftype="input")

    # remove any instruction regarding tag file creation for event service jobs
    _runCommandList = []
    for cmd in runCommandList:
        if "--createTAGFileForES" in cmd:
            cmd = cmd.replace("--createTAGFileForES","")
        _runCommandList.append(cmd)
    runCommandList = _runCommandList

    # no need to continue if no input files
    if not hasInput:
        return runCommandList

    # are there only copy_to_scratch transfer modes in the file state dictionary?
    # if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    # bug fix: guard against prodDBlockToken=None/empty, which would raise
    # TypeError on the 'in' test
    if only_copy_to_scratch or (prodDBlockToken and 'local' in prodDBlockToken):
        # if hasOnlyCopyToScratch(pworkdir, jobId): # python bug? does not work, have to use previous two lines?
        _runCommandList = []

        if only_copy_to_scratch:
            tolog("There are only copy_to_scratch transfer modes in file state dictionary")

        for cmd in runCommandList:
            # remove the --directIn string if present
            if "--directIn" in cmd:
                tolog("(Removing --directIn instruction from run command since it is not needed)")
                cmd = cmd.replace("--directIn", "")
            # remove the --useFileStager string if present
            if "--useFileStager" in cmd:
                tolog("(Removing --useFileStager instruction from run command since it is not needed)")
                cmd = cmd.replace("--useFileStager", "")
            # remove additional run options if creation of TURL based PFC failed
            if statusPFCTurl == False: # (note: can also be None, so do not use 'if not statusPFCTurl')
                if "--usePFCTurl" in cmd:
                    tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                    cmd = cmd.replace(" --usePFCTurl", "")
                if "--lfcHost" not in cmd and analysisJob:
                    tolog("Adding lfcHost to run command")
                    cmd += ' --lfcHost %s' % (readpar('lfchost'))

            tolog("Updated run command: %s" % (cmd))
            _runCommandList.append(cmd)
    else:
        tolog("Nothing to update in run command list related to copy-to-scratch")
        _runCommandList = runCommandList

    # was FAX used as primary site mover in combination with direct I/O?
    # (deliberate '== True': usedFAXandDirectIO also carries string values,
    # which must NOT trigger this branch)
    if usedFAXandDirectIO == True:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")
        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost
            if "--lfcHost" in cmd:
                _lfcHost = ' --lfcHost %s' % (readpar('lfchost'))
                cmd = cmd.replace(_lfcHost, '')
                tolog("(Removed the LFC host:%s)" % (_lfcHost))

            # remove the --oldPrefix
            if "--oldPrefix" in cmd:
                pattern = "(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            # remove the --newPrefix
            if "--newPrefix" in cmd:
                pattern = "(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            # add the --usePFCTurl if not there already
            if "--usePFCTurl" not in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            tolog("Updated run command: %s" % (cmd))
            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    ### new movers quick integration: reuse usedFAXandDirectIO variable with special meaning
    ### to avoid any LFC and prefixes lookups in transformation scripts
    ### since new movers already form proper pfn values
    ### proper workflow is required: to be reimplemented later
    if usedFAXandDirectIO == 'newmover' or usedFAXandDirectIO == 'newmover-directaccess':
        tolog("updateRunCommandList(): use new movers logic")
        tolog("updateRunCommandList(): remove to be deprecated options (--lfcHost, --oldPrefix, --newPrefix) from command list")
        tolog("updateRunCommandList(): force to set --usePFCTurl")
        tolog("updateRunCommandList(): check directaccess mode if need (--directIn)")
        tolog("current runCommandList=%s" % _runCommandList)

        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost, --oldPrefix, --newPrefix
            # add --usePFCTurl
            if "--lfcHost" in cmd:
                cmd = removePattern(cmd, "(\-\-lfcHost\ \S+)")
                tolog("(Removed the --lfcHost)")

            if "--oldPrefix" in cmd:
                pattern = "(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            if "--newPrefix" in cmd:
                pattern = "(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            if "--usePFCTurl" not in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            # add --directIn if need
            if usedFAXandDirectIO == 'newmover-directaccess':
                if "--directIn" not in cmd and analysisJob:
                    cmd += " --directIn"
                    tolog("(Added --directIn)")

            tolog("Updated run command: %s" % cmd)

            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return _runCommandList
def stageout(self, activity, files): """ Copy files to dest SE: main control function, it should care about alternative stageout and retry-policy for diffrent ddmendpoints :return: list of entries (is_success, success_transfers, failed_transfers, exception) for each ddmendpoint :return: (transferred_files, failed_transfers) :raise: PilotException in case of error """ if not files: raise PilotException( "Failed to put files: empty file list to be transferred") pandaqueue = self.si.getQueueName() # FIX ME LATER protocols = self.protocols.setdefault( activity, self.si.resolvePandaProtocols(pandaqueue, activity)[pandaqueue]) copytools = self.si.resolvePandaCopytools(pandaqueue, activity)[pandaqueue] self.log( "Mover.stageout() [new implementation] started for activity=%s, files=%s, protocols=%s, copytools=%s" % (activity, files, protocols, copytools)) # check if file exists before actual processing # populate filesize if need for fspec in files: pfn = os.path.join(self.job.workdir, fspec.lfn) if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK): error = "Erron: input pfn file is not exist: %s" % pfn self.log(error) raise PilotException(error, code=PilotErrors.ERR_MISSINGOUTPUTFILE, state="FILE_INFO_FAIL") fspec.filesize = os.path.getsize(pfn) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) transferred_files, failed_transfers = [], [] self.log( "Found N=%s files to be transferred, total_size=%.3f MB: %s" % (len(files), totalsize / 1024. 
/ 1024., [e.lfn for e in files])) # first resolve protocol settings from PQ specific aprotocols settings # then resolve settings from default ddm.protocols supported by copytools # group protocols, files by ddmendpoint ddmprotocols, ddmfiles = {}, {} for e in files: ddmfiles.setdefault(e.ddmendpoint, []).append(e) # load DDM conf/protocols self.ddmconf.update(self.si.resolveDDMConf(ddmfiles.keys())) for e in protocols: if e['ddm'] not in ddmfiles: # skip not affected protocols settings continue e['copytools'] = [{ 'copytool': e['copytool'], 'copysetup': e['copysetup'] }] ddmprotocols.setdefault(e['ddm'], []).append(e) # generate default protocols from copytools/schemes and ddmconf unknown_ddms = set(ddmfiles) - set(ddmprotocols) for ddmendpoint in unknown_ddms: dd = self.ddmconf.get(ddmendpoint, {}).get('aprotocols', {}) dat = dd.get(activity, []) or dd.get('w', []) dprotocols = [ dict(se=e[0], path=e[2], resolve_scheme=True) for e in sorted(dat, key=lambda x: x[1]) ] ddmprotocols.setdefault(ddmendpoint, dprotocols) unknown_ddms = set(ddmfiles) - set(ddmprotocols) if unknown_ddms: raise PilotException( "Failed to put files: no protocols defined for output ddmendpoints=%s .. 
check aprotocols schedconfig settings for activity=%s or default ddm.aprotocols entries" % (unknown_ddms, activity), code=PilotErrors.ERR_NOSTORAGE) self.log( "[stage-out] [%s] filtered protocols to be used to transfer files: protocols=%s" % (activity, ddmprotocols)) # get SURL endpoint for Panda callback registration # resolve from special protocol activity='SE' or fallback to activity='a', then to 'r' surl_protocols, no_surl_ddms = {}, set() for fspec in files: if not fspec.surl: # initialize only if not already set d = self.ddmconf.get(fspec.ddmendpoint, {}).get('aprotocols', {}) xprot = d.get('SE', []) if not xprot: xprot = [ e for e in d.get('a', d.get('r', [])) if e[0] and e[0].startswith('srm') ] surl_prot = [ dict(se=e[0], path=e[2]) for e in sorted(xprot, key=lambda x: x[1]) ] if surl_prot: surl_protocols.setdefault(fspec.ddmendpoint, surl_prot[0]) else: no_surl_ddms.add(fspec.ddmendpoint) if no_surl_ddms: # failed to resolve SURLs self.log( 'FAILED to resolve default SURL path for ddmendpoints=%s' % list(no_surl_ddms)) raise PilotException( "Failed to put files: no SE/SURL protocols defined for output ddmendpoints=%s .. check ddmendpoints aprotocols settings for activity=SE/a/r" % list(no_surl_ddms), code=PilotErrors.ERR_NOSTORAGE) sitemover_objects = {} # try to iterate over protocol of given ddmendpoint until successfull transfer for ddmendpoint, iprotocols in ddmprotocols.iteritems(): for dat in iprotocols: remain_files = [ e for e in ddmfiles.get(ddmendpoint) if e.status not in ['transferred'] ] if not remain_files: self.log( 'INFO: all files to be transfered to ddm=%s have been successfully processed for activity=%s ..' 
% (ddmendpoint, activity)) # stop checking other protocols of ddmendpoint break if not 'copytools' in dat: # use allowed copytools cdat = [] for cp, settings in copytools: cdat.append({ 'copytool': cp, 'copysetup': settings.get('setup') }) dat['copytools'] = cdat if not dat['copytools']: msg = 'FAILED to resolve final copytools settings for ddmendpoint=%s, please check schedconf.copytools settings: copytools=%s, iprotocols=' % list( ddmendpoint, copytools, iprotocols) self.log(msg) raise PilotException(msg, code=PilotErrors.ERR_NOSTORAGE) for cpsettings in dat.get('copytools', []): copytool, copysetup = cpsettings.get( 'copytool'), cpsettings.get('copysetup') try: sitemover = sitemover_objects.get(copytool) if not sitemover: sitemover = getSiteMover(copytool)( copysetup, workDir=self.job.workdir) sitemover_objects.setdefault(copytool, sitemover) sitemover.trace_report = self.trace_report sitemover.protocol = dat # ## sitemover.ddmconf = self.ddmconf # quick workaround ### sitemover.setup() if dat.get('resolve_scheme'): dat['scheme'] = sitemover.schemes except Exception, e: self.log( 'WARNING: Failed to get SiteMover: %s .. skipped .. try to check next available protocol, current protocol details=%s' % (e, dat)) continue if dat.get( 'scheme' ): # filter protocols by accepted scheme from copytool should_skip = True for scheme in dat.get('scheme'): if dat['se'].startswith(scheme): should_skip = False break if should_skip: self.log( "[stage-out] protocol=%s of ddmendpoint=%s is skipped since copytool=%s does not support it, accepted schemes=%s" % (dat['se'], ddmendpoint, copytool, dat['scheme'])) continue self.log("Copy command [stage-out]: %s, sitemover=%s" % (copytool, sitemover)) self.log("Copy setup [stage-out]: %s" % copysetup) self.trace_report.update(protocol=copytool, localSite=ddmendpoint, remoteSite=ddmendpoint) # validate se value? 
se, se_path = dat.get('se', ''), dat.get('path', '') for fdata in remain_files: if not fdata.surl: fdata.surl = sitemover.getSURL( surl_protocols[fdata.ddmendpoint].get('se'), surl_protocols[fdata.ddmendpoint].get('path'), fdata.scope, fdata.lfn, self.job ) # job is passing here for possible JOB specific processing updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="not_transferred", ftype="output") fdata.turl = sitemover.getSURL( se, se_path, fdata.scope, fdata.lfn, self.job ) # job is passing here for possible JOB specific processing self.log( "[stage-out] resolved SURL=%s to be used for lfn=%s, ddmendpoint=%s" % (fdata.surl, fdata.lfn, fdata.ddmendpoint)) self.log( "[stage-out] resolved TURL=%s to be used for lfn=%s, ddmendpoint=%s" % (fdata.turl, fdata.lfn, fdata.ddmendpoint)) self.log( "[stage-out] Prepare to put_data: ddmendpoint=%s, protocol=%s, fspec=%s" % (ddmendpoint, dat, fdata)) self.trace_report.update(catStart=time.time(), filename=fdata.lfn, guid=fdata.guid.replace( '-', '')) self.trace_report.update( scope=fdata.scope, dataset=fdata.destinationDblock, url=fdata.turl) self.log( "[stage-out] Preparing copy for lfn=%s using copytool=%s: mover=%s" % (fdata.lfn, copytool, sitemover)) #dumpFileStates(self.workDir, self.job.jobId, ftype="output") # loop over multple stage-out attempts for _attempt in xrange(1, self.stageoutretry + 1): if _attempt > 1: # if not first stage-out attempt, take a nap before next attempt self.log( " -- Waiting %s seconds before next stage-out attempt for file=%s --" % (self.stageout_sleeptime, fdata.lfn)) time.sleep(self.stageout_sleeptime) self.log("Put attempt %s/%s for filename=%s" % (_attempt, self.stageoutretry, fdata.lfn)) try: result = sitemover.put_data(fdata) fdata.status = 'transferred' # mark as successful if result.get('surl'): fdata.surl = result.get('surl') #if result.get('pfn'): # fdata.turl = result.get('pfn') #self.trace_report.update(url=fdata.surl) ### 
self.trace_report.update(url=fdata.turl) ### # finalize and send trace report self.trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time.time()) self.sendTrace(self.trace_report) updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="transferred", ftype="output") dumpFileStates(self.workDir, self.job.jobId, ftype="output") self.updateSURLDictionary( fdata.guid, fdata.surl, self.workDir, self. job.jobId) # FIXME LATER: isolate later fdat = result.copy() #fdat.update(lfn=lfn, pfn=pfn, guid=guid, surl=surl) transferred_files.append(fdat) break # transferred successfully except PilotException, e: result = e self.log(traceback.format_exc()) except Exception, e: result = PilotException( "stageOut failed with error=%s" % e, code=PilotErrors.ERR_STAGEOUTFAILED) self.log(traceback.format_exc()) self.log( 'WARNING: Error in copying file (attempt %s/%s): %s' % (_attempt, self.stageoutretry, result)) if isinstance(result, Exception): # failure transfer failed_transfers.append(result)
def do_put_files(self, ddmendpoint, protocols, files):  # old function : TO BE DEPRECATED ...
    """
    Copy files to dest SE.

    Deprecated stage-out path: iterates over the given protocols, builds a
    SiteMover per protocol and transfers each file (dict entries with
    scope/lfn/pfn/guid keys) with a retry loop around sitemover.stageOut().

    :ddmendpoint: DDMEndpoint name used to store files
    :param protocols: list of protocol dicts (copytool/copysetup/se/path keys)
    :param files: list of file description dicts
    :return: (list of transferred_files details, list of failed_transfers details)
    :raise: PilotException in case of error
    """
    self.log('[deprecated do_put_files()]Prepare to copy files=%s to ddmendpoint=%s using protocols data=%s' % (files, ddmendpoint, protocols))
    self.log("[deprecated do_put_files()]Number of stage-out tries: %s" % self.stageoutretry)

    # get SURL for Panda calback registration
    # resolve from special protocol activity=SE
    # fix me later to proper name of activitiy=SURL (panda SURL, at the moment only 2-letter name is allowed on AGIS side)
    # if SE is not found, try to fallback to a
    surl_prot = [dict(se=e[0], path=e[2]) for e in sorted(self.ddmconf.get(ddmendpoint, {}).get('aprotocols', {}).get('SE', self.ddmconf.get(ddmendpoint, {}).get('aprotocols', {}).get('a', [])), key=lambda x: x[1])]

    if not surl_prot:
        self.log('FAILED to resolve default SURL path for ddmendpoint=%s' % ddmendpoint)
        return [], []

    surl_prot = surl_prot[0]  # take first

    self.log("[do_put_files] SURL protocol to be used: %s" % surl_prot)

    self.trace_report.update(localSite=ddmendpoint, remoteSite=ddmendpoint)

    transferred_files, failed_transfers = [], []

    for dat in protocols:
        copytool, copysetup = dat.get('copytool'), dat.get('copysetup')

        try:
            sitemover = getSiteMover(copytool)(copysetup, workDir=self.job.workdir)
            sitemover.trace_report = self.trace_report
            sitemover.protocol = dat # ##
            sitemover.ddmconf = self.ddmconf # quick workaround ###
            sitemover.setup()
        except Exception, e:
            # a broken protocol entry is skipped, not fatal — try the next one
            self.log('[do_put_files] WARNING: Failed to get SiteMover: %s .. skipped .. try to check next available protocol, current protocol details=%s' % (e, dat))
            continue

        self.log("[do_put_files] Copy command: %s, sitemover=%s" % (copytool, sitemover))
        self.log("[do_put_files] Copy setup: %s" % copysetup)

        self.trace_report.update(protocol=copytool)

        se, se_path = dat.get('se', ''), dat.get('path', '')

        self.log("[do_put_files] Found N=%s files to be transferred: %s" % (len(files), [e.get('pfn') for e in files]))

        for fdata in files:
            scope, lfn, pfn = fdata.get('scope', ''), fdata.get('lfn'), fdata.get('pfn')
            guid = fdata.get('guid', '')

            surl = sitemover.getSURL(surl_prot.get('se'), surl_prot.get('path'), scope, lfn, self.job) # job is passing here for possible JOB specific processing
            turl = sitemover.getSURL(se, se_path, scope, lfn, self.job) # job is passing here for possible JOB specific processing

            self.trace_report.update(scope=scope, dataset=fdata.get('dsname_report'), url=surl)
            self.trace_report.update(catStart=time.time(), filename=lfn, guid=guid.replace('-', ''))

            self.log("[do_put_files] Preparing copy for pfn=%s to ddmendpoint=%s using copytool=%s: mover=%s" % (pfn, ddmendpoint, copytool, sitemover))
            self.log("[do_put_files] lfn=%s: SURL=%s" % (lfn, surl))
            self.log("[do_put_files] TURL=%s" % turl)

            # NOTE(review): message text says "input" but these are output files being staged out
            if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK):
                error = "Erron: input pfn file is not exist: %s" % pfn
                self.log(error)
                raise PilotException(error, code=PilotErrors.ERR_MISSINGOUTPUTFILE, state="FILE_INFO_FAIL")

            filename = os.path.basename(pfn)

            # update the current file state
            updateFileState(filename, self.workDir, self.job.jobId, mode="file_state", state="not_transferred")
            dumpFileStates(self.workDir, self.job.jobId)

            # loop over multple stage-out attempts
            for _attempt in xrange(1, self.stageoutretry + 1):
                if _attempt > 1:  # if not first stage-out attempt, take a nap before next attempt
                    self.log(" -- Waiting %d seconds before next stage-out attempt for file=%s --" % (self.stageout_sleeptime, filename))
                    time.sleep(self.stageout_sleeptime)

                self.log("[do_put_files] Put attempt %d/%d for filename=%s" % (_attempt, self.stageoutretry, filename))

                try:
                    # quick work around
                    from Job import FileSpec
                    stub_fspec = FileSpec(ddmendpoint=ddmendpoint, guid=guid, scope=scope, lfn=lfn)
                    result = sitemover.stageOut(pfn, turl, stub_fspec)
                    break # transferred successfully
                except PilotException, e:
                    result = e
                    self.log(traceback.format_exc())
                except Exception, e:
                    self.log(traceback.format_exc())
                    result = PilotException("stageOut failed with error=%s" % e, code=PilotErrors.ERR_STAGEOUTFAILED)

                self.log('WARNING [do_put_files]: Error in copying file (attempt %s): %s' % (_attempt, result))
# update the job state file tolog(runJob.getOutputDir()) job.jobState = "stageout" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # verify and prepare and the output files for transfer ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, job.workdir) if ec: # missing output file (only error code from prepareOutFiles) runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag) tolog("outsDict: %s" % str(outsDict)) # update the current file states updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created") dumpFileStates(runJob.getParentWorkDir(), job.jobId) # create xml string to pass to dispatcher for atlas jobs outputFileInfo = {} if outs or (job.logFile and job.logFile != ''): # get the datasets for the output files dsname, datasetDict = runJob.getDatasets(job) # re-create the metadata.xml file, putting guids of ALL output files into it. # output files that miss guids from the job itself will get guids in PFCxml function # first rename and copy the trf metadata file for non-build jobs if not pUtil.isBuildJob(outs): runJob.moveTrfMetadata(job.workdir, job.jobId) # create the metadata for the output + log files
# NOTE(review): fragment — tail of a stage-in transfer loop; 'result' and
# 'fdata' are bound in the enclosing loop, which is outside this chunk.
if not isinstance(result, Exception):  # transferred successfully

    # finalize and send trace report
    self.trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time.time())
    self.sendTrace(self.trace_report)

    updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="transferred", ftype="input")
    dumpFileStates(self.workDir, self.job.jobId, ftype="input")
    ## self.updateSURLDictionary(guid, surl, self.workDir, self.job.jobId) # FIX ME LATER

    fdat = result.copy()
    #fdat.update(lfn=lfn, pfn=pfn, guid=guid, surl=surl)
    transferred_files.append(fdat)
else:
    failed_transfers.append(result)

# persist final per-file states after the loop
dumpFileStates(self.workDir, self.job.jobId, ftype="input")

#self.log('transferred_files= %s' % transferred_files)
self.log('Summary of transferred files:')
for e in transferred_files:
    self.log(" -- %s" % e)
def do_put_files(self, ddmendpoint, protocols, files):  # old function : TO BE DEPRECATED ...
    """
    Copy files to dest SE.

    Older copy of the deprecated stage-out path: resolves the SURL protocol
    from activity=SE only (no fallback to 'a'), then transfers each file dict
    with a retry loop around sitemover.stageOut().

    :ddmendpoint: DDMEndpoint name used to store files
    :param protocols: list of protocol dicts (copytool/copysetup/se/path keys)
    :param files: list of file description dicts
    :return: (list of transferred_files details, list of failed_transfers details)
    :raise: PilotException in case of error
    """
    self.log('[deprecated do_put_files()]Prepare to copy files=%s to ddmendpoint=%s using protocols data=%s' % (files, ddmendpoint, protocols))
    self.log("[deprecated do_put_files()]Number of stage-out tries: %s" % self.stageoutretry)

    # get SURL for Panda calback registration
    # resolve from special protocol activity=SE
    # fix me later to proper name of activitiy=SURL (panda SURL, at the moment only 2-letter name is allowed on AGIS side)
    surl_prot = [dict(se=e[0], path=e[2]) for e in sorted(self.ddmconf.get(ddmendpoint, {}).get('aprotocols', {}).get('SE', []), key=lambda x: x[1])]

    if not surl_prot:
        self.log('FAILED to resolve default SURL path for ddmendpoint=%s' % ddmendpoint)
        return [], []

    surl_prot = surl_prot[0]  # take first

    self.log("[do_put_files] SURL protocol to be used: %s" % surl_prot)

    self.trace_report.update(localSite=ddmendpoint, remoteSite=ddmendpoint)

    transferred_files, failed_transfers = [], []

    for dat in protocols:
        copytool, copysetup = dat.get('copytool'), dat.get('copysetup')

        try:
            sitemover = getSiteMover(copytool)(copysetup, workDir=self.job.workdir)
            sitemover.trace_report = self.trace_report
            sitemover.protocol = dat # ##
            sitemover.ddmconf = self.ddmconf # quick workaround ###
            sitemover.setup()
        except Exception, e:
            # a broken protocol entry is skipped, not fatal — try the next one
            self.log('[do_put_files] WARNING: Failed to get SiteMover: %s .. skipped .. try to check next available protocol, current protocol details=%s' % (e, dat))
            continue

        self.log("[do_put_files] Copy command: %s, sitemover=%s" % (copytool, sitemover))
        self.log("[do_put_files] Copy setup: %s" % copysetup)

        self.trace_report.update(protocol=copytool)

        se, se_path = dat.get('se', ''), dat.get('path', '')

        self.log("[do_put_files] Found N=%s files to be transferred: %s" % (len(files), [e.get('pfn') for e in files]))

        for fdata in files:
            scope, lfn, pfn = fdata.get('scope', ''), fdata.get('lfn'), fdata.get('pfn')
            guid = fdata.get('guid', '')

            surl = sitemover.getSURL(surl_prot.get('se'), surl_prot.get('path'), scope, lfn, self.job) # job is passing here for possible JOB specific processing
            turl = sitemover.getSURL(se, se_path, scope, lfn, self.job) # job is passing here for possible JOB specific processing

            self.trace_report.update(scope=scope, dataset=fdata.get('dsname_report'), url=surl)
            self.trace_report.update(catStart=time.time(), filename=lfn, guid=guid.replace('-', ''))

            self.log("[do_put_files] Preparing copy for pfn=%s to ddmendpoint=%s using copytool=%s: mover=%s" % (pfn, ddmendpoint, copytool, sitemover))
            self.log("[do_put_files] lfn=%s: SURL=%s" % (lfn, surl))
            self.log("[do_put_files] TURL=%s" % turl)

            # NOTE(review): message text says "input" but these are output files being staged out
            if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK):
                error = "Erron: input pfn file is not exist: %s" % pfn
                self.log(error)
                raise PilotException(error, code=PilotErrors.ERR_MISSINGOUTPUTFILE, state="FILE_INFO_FAIL")

            filename = os.path.basename(pfn)

            # update the current file state
            updateFileState(filename, self.workDir, self.job.jobId, mode="file_state", state="not_transferred")
            dumpFileStates(self.workDir, self.job.jobId)

            # loop over multple stage-out attempts
            for _attempt in xrange(1, self.stageoutretry + 1):
                if _attempt > 1:  # if not first stage-out attempt, take a nap before next attempt
                    self.log(" -- Waiting %d seconds before next stage-out attempt for file=%s --" % (self.stageout_sleeptime, filename))
                    time.sleep(self.stageout_sleeptime)

                self.log("[do_put_files] Put attempt %d/%d for filename=%s" % (_attempt, self.stageoutretry, filename))

                try:
                    # quick work around
                    from Job import FileSpec
                    stub_fspec = FileSpec(ddmendpoint=ddmendpoint)
                    result = sitemover.stageOut(pfn, turl, stub_fspec)
                    break # transferred successfully
                except PilotException, e:
                    result = e
                    self.log(traceback.format_exc())
                except Exception, e:
                    self.log(traceback.format_exc())
                    result = PilotException("stageOut failed with error=%s" % e, code=PilotErrors.ERR_STAGEOUTFAILED)

                self.log('WARNING [do_put_files]: Error in copying file (attempt %s): %s' % (_attempt, result))
# NOTE(review): fragment — this span opens MID-STATEMENT; the call head
# (ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(...)
# is outside this chunk. Content mirrors the other stage-out mainline.
    job.outFiles, job.logFile, job.workdir)
if ec:
    # missing output file (only error code from prepareOutFiles)
    runJob.failJob(job.result[1], ec, job, pilotErrorDiag=pilotErrorDiag)
tolog("outsDict: %s" % str(outsDict))

# update the current file states
updateFileStates(outs, runJob.getParentWorkDir(), job.jobId, mode="file_state", state="created")
dumpFileStates(runJob.getParentWorkDir(), job.jobId)

# create xml string to pass to dispatcher for atlas jobs
outputFileInfo = {}

if outs or (job.logFile and job.logFile != ''):
    # get the datasets for the output files
    dsname, datasetDict = runJob.getDatasets(job)

    # re-create the metadata.xml file, putting guids of ALL output files into it.
    # output files that miss guids from the job itself will get guids in PFCxml function

    # first rename and copy the trf metadata file for non-build jobs
    if not pUtil.isBuildJob(outs):
        runJob.moveTrfMetadata(job.workdir, job.jobId)

    # create the metadata for the output + log files
# NOTE(review): fragment — opens inside an 'except PilotException, e:'
# handler of a stage-in retry loop; the try/for heads are outside this chunk.
    result = e
    self.log(traceback.format_exc())
except Exception, e:
    result = PilotException("stageIn failed with error=%s" % e, code=PilotErrors.ERR_STAGEINFAILED)
    self.log(traceback.format_exc())

self.log('WARNING: Error in copying file (attempt %s/%s): %s' % (_attempt, self.stageinretry, result))

if not isinstance(result, Exception):  # transferred successfully

    # finalize and send trace report
    self.trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time.time())
    self.sendTrace(self.trace_report)

    updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="transferred", ftype="input")
    dumpFileStates(self.workDir, self.job.jobId, ftype="input")
    ## self.updateSURLDictionary(guid, surl, self.workDir, self.job.jobId) # FIX ME LATER

    fdat = result.copy()
    #fdat.update(lfn=lfn, pfn=pfn, guid=guid, surl=surl)
    transferred_files.append(fdat)
else:
    failed_transfers.append(result)

# persist final per-file states after the loop
dumpFileStates(self.workDir, self.job.jobId, ftype="input")

#self.log('transferred_files= %s' % transferred_files)
self.log('Summary of transferred files:')
for e in transferred_files:
    self.log(" -- %s" % e)
def stageout(self, activity, files):
    """
    Copy files to dest SE: main control function, it should care about alternative stageout and retry-policy for different ddmendpoints.

    Older revision of Mover.stageout (superseded by the copytools-aware
    version): resolves protocols for the PanDA queue, groups files by
    ddmendpoint and retries each file transfer per protocol.

    :return: (transferred_files, failed_transfers)
    :raise: PilotException in case of error
    """
    if not files:
        raise PilotException("Failed to put files: empty file list to be transferred")

    pandaqueue = self.si.getQueueName()  # FIX ME LATER
    protocols = self.protocols.setdefault(activity, self.si.resolvePandaProtocols(pandaqueue, activity)[pandaqueue])

    self.log("Mover.stageout() [new implementation] started for activity=%s, files=%s, protocols=%s" % (activity, files, protocols))

    # check if file exists before actual processing
    # populate filesize if need
    for fspec in files:
        pfn = os.path.join(self.job.workdir, fspec.lfn)
        # NOTE(review): message text says "input" but these are output files being staged out
        if not os.path.isfile(pfn) or not os.access(pfn, os.R_OK):
            error = "Erron: input pfn file is not exist: %s" % pfn
            self.log(error)
            raise PilotException(error, code=PilotErrors.ERR_MISSINGOUTPUTFILE, state="FILE_INFO_FAIL")
        fspec.filesize = os.path.getsize(pfn)

    totalsize = reduce(lambda x, y: x + y.filesize, files, 0)
    transferred_files, failed_transfers = [],[]

    self.log("Found N=%s files to be transferred, total_size=%.3f MB: %s" % (len(files), totalsize/1024./1024., [e.lfn for e in files]))

    # group protocols, files by ddm
    ddmprotocols, ddmfiles = {}, {}
    for e in files:
        ddmfiles.setdefault(e.ddmendpoint, []).append(e)
    for e in protocols:
        if e['ddm'] not in ddmfiles:
            continue
        ddmprotocols.setdefault(e['ddm'], []).append(e)
    unknown_ddms = set(ddmfiles) - set(ddmprotocols)
    if unknown_ddms:
        raise PilotException("Failed to put files: no protocols defined for output ddmendpoints=%s .. check aprotocols schedconfig settings for activity=%s, " % (unknown_ddms, activity), code=PilotErrors.ERR_NOSTORAGE)

    self.log("[stage-out] [%s] filtered protocols to be used to transfer files: protocols=%s" % (activity, ddmprotocols))

    # get SURL endpoint for Panda callback registration
    # resolve from special protocol activity=SE
    # fix me later to proper name of activitiy=SURL (panda SURL, at the moment only 2-letter name is allowed on AGIS side)

    self.ddmconf.update(self.si.resolveDDMConf(set(ddmfiles)))
    surl_protocols, no_surl_ddms = {}, set()
    for fspec in files:
        if not fspec.surl:  # initilize only if not already set
            surl_prot = [dict(se=e[0], path=e[2]) for e in sorted(self.ddmconf.get(fspec.ddmendpoint, {}).get('aprotocols', {}).get('SE', []), key=lambda x: x[1])]
            if surl_prot:
                surl_protocols.setdefault(fspec.ddmendpoint, surl_prot[0])
            else:
                no_surl_ddms.add(fspec.ddmendpoint)

    if no_surl_ddms:  # failed to resolve SURLs
        self.log('FAILED to resolve default SURL path for ddmendpoints=%s' % list(no_surl_ddms))
        raise PilotException("Failed to put files: no SE/SURL protocols defined for output ddmendpoints=%s .. check ddmendpoints aprotocols settings for activity=SE, " % list(no_surl_ddms), code=PilotErrors.ERR_NOSTORAGE)

    # try to use each protocol of same ddmendpoint until successfull transfer
    for ddmendpoint, iprotocols in ddmprotocols.iteritems():
        for dat in iprotocols:
            copytool, copysetup = dat.get('copytool'), dat.get('copysetup')

            try:
                sitemover = getSiteMover(copytool)(copysetup, workDir=self.job.workdir)
                sitemover.trace_report = self.trace_report
                sitemover.protocol = dat # ##
                sitemover.ddmconf = self.ddmconf # quick workaround ###
                sitemover.setup()
            except Exception, e:
                # a broken protocol entry is skipped, not fatal — try the next one
                self.log('WARNING: Failed to get SiteMover: %s .. skipped .. try to check next available protocol, current protocol details=%s' % (e, dat))
                continue

            remain_files = [e for e in ddmfiles.get(ddmendpoint) if e.status not in ['transferred']]
            if not remain_files:
                self.log('INFO: all files to be transfered to ddm=%s have been successfully processed for activity=%s ..' % (ddmendpoint, activity))
                # stop checking other protocols of ddmendpoint
                break

            self.log("Copy command [stage-out]: %s, sitemover=%s" % (copytool, sitemover))
            self.log("Copy setup [stage-out]: %s" % copysetup)

            self.trace_report.update(protocol=copytool, localSite=ddmendpoint, remoteSite=ddmendpoint)

            # validate se value?
            se, se_path = dat.get('se', ''), dat.get('path', '')

            for fdata in remain_files:
                if not fdata.surl:
                    fdata.surl = sitemover.getSURL(surl_protocols[fdata.ddmendpoint].get('se'), surl_protocols[fdata.ddmendpoint].get('path'), fdata.scope, fdata.lfn, self.job) # job is passing here for possible JOB specific processing

                updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="not_transferred", ftype="output")

                fdata.turl = sitemover.getSURL(se, se_path, fdata.scope, fdata.lfn, self.job) # job is passing here for possible JOB specific processing

                self.log("[stage-out] resolved SURL=%s to be used for lfn=%s, ddmendpoint=%s" % (fdata.surl, fdata.lfn, fdata.ddmendpoint))
                self.log("[stage-out] resolved TURL=%s to be used for lfn=%s, ddmendpoint=%s" % (fdata.turl, fdata.lfn, fdata.ddmendpoint))

                self.log("[stage-out] Prepare to put_data: ddmendpoint=%s, protocol=%s, fspec=%s" % (ddmendpoint, dat, fdata))

                self.trace_report.update(catStart=time.time(), filename=fdata.lfn, guid=fdata.guid.replace('-', ''))
                self.trace_report.update(scope=fdata.scope, dataset=fdata.destinationDblock, url=fdata.turl)

                self.log("[stage-out] Preparing copy for lfn=%s using copytool=%s: mover=%s" % (fdata.lfn, copytool, sitemover))
                #dumpFileStates(self.workDir, self.job.jobId, ftype="output")

                # loop over multple stage-out attempts
                for _attempt in xrange(1, self.stageoutretry + 1):
                    if _attempt > 1:  # if not first stage-out attempt, take a nap before next attempt
                        self.log(" -- Waiting %s seconds before next stage-out attempt for file=%s --" % (self.stageout_sleeptime, fdata.lfn))
                        time.sleep(self.stageout_sleeptime)

                    self.log("Put attempt %s/%s for filename=%s" % (_attempt, self.stageoutretry, fdata.lfn))

                    try:
                        result = sitemover.put_data(fdata)
                        fdata.status = 'transferred'  # mark as successful
                        if result.get('surl'):
                            fdata.surl = result.get('surl')
                        #if result.get('pfn'):
                        #    fdata.turl = result.get('pfn')

                        #self.trace_report.update(url=fdata.surl) ###
                        self.trace_report.update(url=fdata.turl) ###

                        # finalize and send trace report
                        self.trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time.time())
                        self.sendTrace(self.trace_report)

                        updateFileState(fdata.lfn, self.workDir, self.job.jobId, mode="file_state", state="transferred", ftype="output")
                        dumpFileStates(self.workDir, self.job.jobId, ftype="output")

                        self.updateSURLDictionary(fdata.guid, fdata.surl, self.workDir, self.job.jobId) # FIXME LATER: isolate later

                        fdat = result.copy()
                        #fdat.update(lfn=lfn, pfn=pfn, guid=guid, surl=surl)
                        transferred_files.append(fdat)

                        break # transferred successfully
                    except PilotException, e:
                        result = e
                        self.log(traceback.format_exc())
                    except Exception, e:
                        result = PilotException("stageOut failed with error=%s" % e, code=PilotErrors.ERR_STAGEOUTFAILED)
                        self.log(traceback.format_exc())

                    self.log('WARNING: Error in copying file (attempt %s/%s): %s' % (_attempt, self.stageoutretry, result))

                if isinstance(result, Exception):  # failure transfer
                    failed_transfers.append(result)
# stage-out ........................................................................................ # update the job state file job.jobState = "stageout" _retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # verify and prepare and the output files for transfer ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(job.outFiles, job.logFile, job.workdir) if ec: # missing output file (only error code from prepareOutFiles) failJob(job.result[1], ec, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag) tolog("outsDict: %s" % str(outsDict)) # update the current file states updateFileStates(outs, pworkdir, job.jobId, mode="file_state", state="created") dumpFileStates(pworkdir, job.jobId) # create xml string to pass to dispatcher for atlas jobs outputFileInfo = {} if outs or (job.logFile and job.logFile != ''): # get the datasets for the output files dsname, datasetDict = getDatasets(job) # re-create the metadata.xml file, putting guids of ALL output files into it. # output files that miss guids from the job itself will get guids in PFCxml function # first rename and copy the trf metadata file for non-build jobs if not pUtil.isBuildJob(outs): moveTrfMetadata(job.workdir, job.jobId, pworkdir) # create the metadata for the output + log files
def updateRunCommandList(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO):
    """
    Update the run command list if --directIn is no longer needed.

    Uses the input file state dictionary: when every input file was staged
    with copy_to_scratch, any lingering direct access options (--directIn,
    --useFileStager) are stripped from the run commands. If FAX was used as
    primary site mover together with direct I/O, the LFC/prefix options are
    removed and --usePFCTurl is added instead.

    :param runCommandList: list of run command strings (one per trf).
    :param pworkdir: pilot work directory holding the file state dictionary.
    :param jobId: job id used to locate the file state dictionary.
    :param statusPFCTurl: status of TURL based PFC creation (True/False/None).
    :param analysisJob: True for user analysis jobs.
    :param usedFAXandDirectIO: True if FAX was used with direct I/O.
    :return: the updated run command list.
    """

    # the method is using the file state dictionary

    # remove later
    # NOTE(review): normalized keyword type= -> ftype= for consistency with the
    # other dumpFileStates() call sites in this file ('type' also shadows the
    # builtin) - confirm against the helper's current signature
    dumpFileStates(pworkdir, jobId, ftype="input")

    # are there only copy_to_scratch transfer modes in the file state dictionary?
    # if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    if only_copy_to_scratch:
        _runCommandList = []
        tolog("There are only copy_to_scratch transfer modes in file state dictionary")
        for cmd in runCommandList:
            # remove the --directIn string if present
            if "--directIn" in cmd:
                tolog("(Removing --directIn instruction from run command since it is not needed)")
                cmd = cmd.replace("--directIn", "")
            # remove the --useFileStager string if present
            if "--useFileStager" in cmd:
                tolog("(Removing --useFileStager instruction from run command since it is not needed)")
                cmd = cmd.replace("--useFileStager", "")
            # remove additional run options if creation of TURL based PFC failed
            if statusPFCTurl == False: # (note: can also be None, so do not use 'if not statusPFCTurl')
                if "--usePFCTurl" in cmd:
                    tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                    cmd = cmd.replace(" --usePFCTurl", "")
                if "--lfcHost" not in cmd and analysisJob:
                    tolog("Adding lfcHost to run command")
                    cmd += ' --lfcHost %s' % (readpar('lfchost'))

            tolog("Updated run command: %s" % (cmd))
            _runCommandList.append(cmd)
    else:
        tolog("Nothing to update in run command list related to copy-to-scratch")
        _runCommandList = runCommandList

    # was FAX used as primary site mover in combination with direct I/O?
    if usedFAXandDirectIO:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")
        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost
            if "--lfcHost" in cmd:
                _lfcHost = ' --lfcHost %s' % (readpar('lfchost'))
                cmd = cmd.replace(_lfcHost, '')
                tolog("(Removed the LFC host:%s)" % (_lfcHost))

            # remove the --oldPrefix
            if "--oldPrefix" in cmd:
                pattern = r"(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            # remove the --newPrefix
            if "--newPrefix" in cmd:
                pattern = r"(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            # add the --usePFCTurl if not there already
            if "--usePFCTurl" not in cmd:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            tolog("Updated run command: %s" % (cmd))
            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return _runCommandList
def updateRunCommandList(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO, hasInput, prodDBlockToken):
    """
    Update the run command list if --directIn is no longer needed.

    Uses the input file state dictionary: when all input files were staged
    with copy_to_scratch (or a 'local' prodDBlockToken forces that mode),
    direct access options (--directIn, --useFileStager) are stripped. Event
    service tag-file instructions are always removed. FAX/direct I/O and the
    new-mover special modes additionally strip the LFC/prefix options and add
    --usePFCTurl (and --directIn for direct access).

    :param runCommandList: list of run command strings (one per trf).
    :param pworkdir: pilot work directory holding the file state dictionary.
    :param jobId: job id used to locate the file state dictionary.
    :param statusPFCTurl: status of TURL based PFC creation (True/False/None).
    :param analysisJob: True for user analysis jobs.
    :param usedFAXandDirectIO: True when FAX was used with direct I/O; may
        also carry the strings 'newmover' or 'newmover-directaccess' (new
        site mover integration).
    :param hasInput: True when the job has input files.
    :param prodDBlockToken: input dblock tokens; containing 'local' forces
        the copy-to-scratch cleanup. (assumes a non-None container - confirm)
    :return: the updated run command list.
    """

    # the method is using the file state dictionary

    # remove later
    dumpFileStates(pworkdir, jobId, ftype="input")

    # remove any instruction regarding tag file creation for event service jobs
    _runCommandList = []
    for cmd in runCommandList:
        if "--createTAGFileForES" in cmd:
            cmd = cmd.replace("--createTAGFileForES","")
        _runCommandList.append(cmd)
    runCommandList = _runCommandList

    # no need to continue if no input files
    if not hasInput:
        return runCommandList

    # are there only copy_to_scratch transfer modes in the file state dictionary?
    # if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    if only_copy_to_scratch or 'local' in prodDBlockToken:
    # if hasOnlyCopyToScratch(pworkdir, jobId): # python bug? does not work, have to use previous two lines?
        _runCommandList = []

        if only_copy_to_scratch:
            tolog("There are only copy_to_scratch transfer modes in file state dictionary")

        for cmd in runCommandList:
            # remove the --directIn string if present
            if "--directIn" in cmd:
                tolog("(Removing --directIn instruction from run command since it is not needed)")
                cmd = cmd.replace("--directIn", "")
            # remove the --useFileStager string if present
            if "--useFileStager" in cmd:
                tolog("(Removing --useFileStager instruction from run command since it is not needed)")
                cmd = cmd.replace("--useFileStager", "")
            # remove additional run options if creation of TURL based PFC failed
            if statusPFCTurl == False: # (note: can also be None, so do not use 'if not statusPFCTurl')
                if "--usePFCTurl" in cmd:
                    tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                    cmd = cmd.replace(" --usePFCTurl", "")
                if "--lfcHost" not in cmd and analysisJob:
                    tolog("Adding lfcHost to run command")
                    cmd += ' --lfcHost %s' % (readpar('lfchost'))

            tolog("Updated run command: %s" % (cmd))
            _runCommandList.append(cmd)
    else:
        tolog("Nothing to update in run command list related to copy-to-scratch")
        _runCommandList = runCommandList

    # was FAX used as primary site mover in combination with direct I/O?
    # (== True is deliberate: usedFAXandDirectIO may also be a truthy
    # 'newmover'/'newmover-directaccess' string handled below)
    if usedFAXandDirectIO == True:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")
        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost
            if "--lfcHost" in cmd:
                _lfcHost = ' --lfcHost %s' % (readpar('lfchost'))
                cmd = cmd.replace(_lfcHost, '')
                tolog("(Removed the LFC host:%s)" % (_lfcHost))

            # remove the --oldPrefix
            if "--oldPrefix" in cmd:
                pattern = r"(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            # remove the --newPrefix
            if "--newPrefix" in cmd:
                pattern = r"(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            # add the --usePFCTurl if not there already
            if "--usePFCTurl" not in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            tolog("Updated run command: %s" % (cmd))
            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    ### new movers quick integration: reuse usedFAXandDirectIO variable with special meaning
    ### to avoid any LFC and prefixes lookups in transformation scripts
    ### since new movers already form proper pfn values
    ### proper workflow is required: to be reimplemented later
    if usedFAXandDirectIO in ('newmover', 'newmover-directaccess'):
        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost, --oldPrefix, --newPrefix
            # add --usePFCTurl
            if "--lfcHost" in cmd:
                cmd = removePattern(cmd, r"(\-\-lfcHost\ \S+)")
                tolog("(Removed the --lfcHost)")

            if "--oldPrefix" in cmd:
                pattern = r"(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            if "--newPrefix" in cmd:
                pattern = r"(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            if "--usePFCTurl" not in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            # add --directIn if need
            if usedFAXandDirectIO == 'newmover-directaccess':
                if "--directIn" not in cmd and analysisJob:
                    cmd += " --directIn"
                    tolog("(Added --directIn)")

            tolog("Updated run command: %s" % cmd)
            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return _runCommandList
break # transferred successfully except PilotException, e: result = e except Exception, e: result = PilotException("stageOut failed with error=%s" % e, code=PilotErrors.ERR_STAGEOUTFAILED) self.log('WARNING: Error in copying file (attempt %s): %s' % (_attempt, result)) if not isinstance(result, Exception): # transferred successfully # finalize and send trace report self.trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time.time()) self.sendTrace(self.trace_report) updateFileState(filename, self.workDir, self.job.jobId, mode="file_state", state="transferred") dumpFileStates(self.workDir, self.job.jobId) self.updateSURLDictionary(guid, surl, self.workDir, self.job.jobId) # FIX ME LATER fdat = result.copy() fdat.update(lfn=lfn, pfn=pfn, guid=guid, surl=surl) transferred_files.append(fdat) else: failed_transfers.append(result) dumpFileStates(self.workDir, self.job.jobId) self.log('transferred_files= %s' % transferred_files) if failed_transfers:
def updateRunCommandListMJ(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO, hasInput):
    """
    Update a single run command dict if --directIn is no longer needed.

    !!! Should be merged with updateRunCommandList !!!
    Multi-job variant: operates on runCommandList['parameters'] (one command
    string) instead of a list of command strings, but applies the same
    cleanup: strip event service tag-file options, strip direct access
    options when only copy_to_scratch transfer modes were used, and rewrite
    LFC/prefix options when FAX + direct I/O was the primary site mover.

    :param runCommandList: dict with a 'parameters' key holding the run command string.
    :param pworkdir: pilot work directory holding the file state dictionary.
    :param jobId: job id used to locate the file state dictionary.
    :param statusPFCTurl: status of TURL based PFC creation (True/False/None).
    :param analysisJob: True for user analysis jobs.
    :param usedFAXandDirectIO: True if FAX was used with direct I/O.
    :param hasInput: True when the job has input files.
    :return: the updated run command dict.
    """

    # the method is using the file state dictionary

    # remove later
    # NOTE(review): normalized keyword type= -> ftype= for consistency with the
    # other dumpFileStates() call sites in this file ('type' also shadows the
    # builtin) - confirm against the helper's current signature; same change
    # applied to the final dump below
    dumpFileStates(pworkdir, jobId, ftype="input")

    # remove any instruction regarding tag file creation for event service jobs
    cmd = runCommandList['parameters']
    if "--createTAGFileForES" in cmd:
        cmd = cmd.replace("--createTAGFileForES","")
    runCommandList['parameters'] = cmd

    # no need to continue if no input files
    if not hasInput:
        return runCommandList

    # are there only copy_to_scratch transfer modes in the file state dictionary?
    # if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    if only_copy_to_scratch:
    # if hasOnlyCopyToScratch(pworkdir, jobId): # python bug? does not work, have to use previous two lines?
        tolog("There are only copy_to_scratch transfer modes in file state dictionary")

        # remove the --directIn string if present
        cmd = runCommandList['parameters']
        if "--directIn" in cmd:
            tolog("(Removing --directIn instruction from run command since it is not needed)")
            cmd = cmd.replace("--directIn", "")
        # remove the --useFileStager string if present
        if "--useFileStager" in cmd:
            tolog("(Removing --useFileStager instruction from run command since it is not needed)")
            cmd = cmd.replace("--useFileStager", "")
        # remove additional run options if creation of TURL based PFC failed
        if statusPFCTurl == False: # (note: can also be None, so do not use 'if not statusPFCTurl')
            if "--usePFCTurl" in cmd:
                tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                cmd = cmd.replace(" --usePFCTurl", "")
            if "--lfcHost" not in cmd and analysisJob:
                tolog("Adding lfcHost to run command")
                cmd += ' --lfcHost %s' % (readpar('lfchost'))

        tolog("Updated run command: %s" % (cmd))
        runCommandList['parameters'] = cmd
    else:
        tolog("Nothing to update in run command list related to copy-to-scratch")

    # was FAX used as primary site mover in combination with direct I/O?
    if usedFAXandDirectIO:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")

        # remove the --lfcHost
        if "--lfcHost" in cmd:
            _lfcHost = ' --lfcHost %s' % (readpar('lfchost'))
            cmd = cmd.replace(_lfcHost, '')
            tolog("(Removed the LFC host:%s)" % (_lfcHost))

        # remove the --oldPrefix
        if "--oldPrefix" in cmd:
            pattern = r"(\-\-oldPrefix\ \S+)"
            cmd = removePattern(cmd, pattern)
            tolog("(Removed --oldPrefix pattern)")

        # remove the --newPrefix
        if "--newPrefix" in cmd:
            pattern = r"(\-\-newPrefix\ \S+)"
            cmd = removePattern(cmd, pattern)
            tolog("(Removed --newPrefix pattern)")

        # add the --usePFCTurl if not there already
        if "--usePFCTurl" not in cmd:
            cmd += " --usePFCTurl"
            tolog("(Added --usePFCTurl)")

        tolog("Updated run command: %s" % (cmd))
        runCommandList['parameters'] = cmd

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return runCommandList