def putRequest(self, requestID, taskResult=None):
    """put back :requestID: to RequestClient

    :param str requestID: request's id
    """
    if requestID in self.__requestCache:
        request = self.__requestCache.pop(requestID)
        if taskResult:
            if taskResult['OK']:
                request = taskResult['Value']
                # The RequestTask is putting back the Done tasks, no need to redo it
                if request.Status == 'Done':
                    return S_OK()
            # In case of timeout, we need to increment ourselves all the attempts
            elif cmpError(taskResult, errno.ETIME):
                waitingOp = request.getWaiting()
                for rmsFile in waitingOp.get('Value', []):
                    rmsFile.Attempt += 1

        reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
        if not reset["OK"]:
            return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
    else:
        return S_ERROR('Not in cache')
    return S_OK()
def getTransportURL(self, urls, protocols):
    """Get a transport URL for the given urls.

    If http/https is requested, the URLs will be valid for 24 hours.

    :param dict urls: s3 urls
    :param list protocols: a list of acceptable transport protocols in priority order.
                           In practice, besides 's3', it can only be:

                           * 'https' if secureConnection is True
                           * 'http' otherwise

    :returns: successful/failed dict of urls with the required protocol
    """
    res = super(S3Storage, self).getTransportURL(urls, protocols)

    # If the result is OK or the error is different from errno.EPROTONOSUPPORT,
    # we just return
    if not cmpError(res, errno.EPROTONOSUPPORT):
        return res

    # We support only http for an insecure connection and https for a secure connection
    if self.secureConnection and "https" not in protocols:
        return S_ERROR(errno.EPROTONOSUPPORT, "Only https protocol is supported")
    elif not self.secureConnection and "http" not in protocols:
        return S_ERROR(errno.EPROTONOSUPPORT, "Only http protocol is supported")

    # Make the presigned URLs valid for 24h
    if self.directAccess:
        return self.createPresignedUrl(urls, "get_object", expiration=60 * 60 * 24)

    return self.S3GatewayClient.createPresignedUrl(self.name, "get_object", urls, expiration=60 * 60 * 24)
def __extractCSData(self, section):
    """Extract limiting information from the CS in the form:
    { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    """
    stuffDict = self.csDictCache.get(section)
    if stuffDict:
        return S_OK(stuffDict)

    result = self.__opsHelper.getSections(section)
    if not result["OK"]:
        if cmpError(result, ESECTION):
            return S_OK({})
        return result
    attribs = result["Value"]
    stuffDict = {}
    for attName in attribs:
        result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
        if not result["OK"]:
            return result
        attLimits = result["Value"]
        try:
            attLimits = dict((k, int(attLimits[k])) for k in attLimits)
        except Exception as excp:
            errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
            self.log.error(errMsg)
            return S_ERROR(errMsg)
        stuffDict[attName] = attLimits

    self.csDictCache.add(section, 300, stuffDict)
    return S_OK(stuffDict)
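# These snippets all branch on DErrno.cmpError rather than on message strings.
# A minimal sketch of that pattern, assuming a standard DIRAC installation
# (S_OK/S_ERROR from DIRAC, cmpError from DErrno); `removeIfMissingOK` and the
# `client.removeFile` call are hypothetical stand-ins, not DIRAC API.
import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError


def removeIfMissingOK(client, path):
    """Treat 'no such file' as success, propagate any other error."""
    res = client.removeFile(path)
    if not res['OK']:
        if cmpError(res, errno.ENOENT):
            # Already gone: not an error for our purposes
            return S_OK()
        return res
    return S_OK()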
def test_submit_multiHopTransfer_failure_badLFN():
    """Do a multiHop transfer, but the LFN is bad (first loop failure)"""
    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/badLFN/f1"], multiHopSE="CERN-DST")
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
def test_submit_multiHopTransfer_failure_multipleFiles():
    """Multihop with more than one file (not allowed)"""
    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/lhcb/f1", "/lhcb/f2"], multiHopSE="CERN-DST")
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.E2BIG)
def test_submit_multiHopStaging_multipleFiles():
    """A multihop transfer cannot have more than one file at a time"""
    newJob = generateFTS3Job("CERN-RAW", "CNAF-DST", ["/lhcb/f1", "/lhcb/f2"])
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.E2BIG)
def test_submit_multiHopStaging_failureBadLFN():
    """We do a multihop stage that fails because of a bad LFN"""
    newJob = generateFTS3Job("CERN-RAW", "CNAF-DST", ["/badLFN/f1"])
    # We should get a complete failure
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
def test_submit_directJob_allFailed():
    """Simple transfer of two files, with all LFNs problematic"""
    newJob = generateFTS3Job("CERN-DST", "CNAF-DST", ["/badLFN/f1", "/badLFN/f2"])
    # We should get a complete failure
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    assert cmpError(res, errno.ENODATA)
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

    log = self._log.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res['OK']:
        return res
    filesGroupedByTarget = res['Value']

    for targetSE, ftsFiles in filesGroupedByTarget.iteritems():
        res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)
        if not res['OK']:
            # If the SE is currently banned, we just skip it
            if cmpError(res, errno.EACCES):
                log.info("Write access currently not permitted to %s, skipping." % targetSE)
            else:
                log.error(res)
                for ftsFile in ftsFiles:
                    ftsFile.attempt += 1
            continue

        sourceSEs = self.sourceSEs.split(',') if self.sourceSEs is not None else []
        # { sourceSE : [FTSFiles] }
        res = FTS3Utilities.selectUniqueRandomSource(ftsFiles, allowedSources=sourceSEs)
        if not res['OK']:
            return res

        uniqueTransfersBySource = res['Value']

        # We don't need to check the source, since it is already filtered by the DataManager
        for sourceSE, ftsFiles in uniqueTransfersBySource.iteritems():
            for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):
                newJob = self._createNewJob('Transfer', ftsFilesChunk, targetSE, sourceSE=sourceSE)
                newJobs.append(newJob)

    return S_OK(newJobs)
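# prepareNewJobs caps the job size with breakListIntoChunks. A small sketch of
# the chunking behaviour it relies on, assuming DIRAC's usual List utility
# import path; with a chunk size of 2, five files yield three FTS3 jobs.
from DIRAC.Core.Utilities.List import breakListIntoChunks

files = ['f1', 'f2', 'f3', 'f4', 'f5']
for chunk in breakListIntoChunks(files, 2):
    print(chunk)
# ['f1', 'f2'] / ['f3', 'f4'] / ['f5'] -> one FTS3 job per chunk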
def test_submit_multiHopTransfer_failure_protocolSecondHop():
    """Multihop with the second hop impossible (no compatible protocol between CNAF and RAL)"""
    newJob = generateFTS3Job("CERN-DST", "RAL-DST", ["/lhcb/f1"], multiHopSE="CNAF-DST")
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    # Check that the error is no common protocol
    assert cmpError(res, errno.ENOPROTOOPT)
def test_submit_direct_noProtocol():
    """Direct transfer with no common protocol. It is a failure"""
    newJob = generateFTS3Job("CNAF-DST", "RAL-DST", ["/lhcb/f1", "/lhcb/f2"])
    res = newJob._constructTransferJob(3600, [f.lfn for f in newJob.filesToSubmit], "")
    assert not res["OK"]
    # Check that the error is no common protocol
    assert cmpError(res, errno.ENOPROTOOPT)
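# The tests above assert on the numeric errno carried by the returned S_ERROR,
# never on the message text. A self-contained sketch of that idiom; `lookupLFN`
# is a hypothetical stand-in for the code under test, not DIRAC API.
import errno

from DIRAC import S_OK, S_ERROR
from DIRAC.Core.Utilities.DErrno import cmpError


def lookupLFN(lfn):
    if not lfn.startswith('/lhcb'):
        return S_ERROR(errno.ENODATA, 'No replica for %s' % lfn)
    return S_OK(lfn)


def test_lookup_badLFN():
    res = lookupLFN('/badLFN/f1')
    assert not res['OK']
    # Comparing on the code keeps the test stable if the wording changes
    assert cmpError(res, errno.ENODATA)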
def executeRPC(self, functionName, args):
    """Perform the RPC call, connect before and disconnect after.

    :param functionName: name of the function
    :param args: arguments to the function

    :return: in case of success, the return of the server call.
             In any case we add the connection stub to it.
    """
    retVal = self._connect()

    # Generate the stub which contains all the connection and call options
    # JSON: cast args to list for serialization purposes
    stub = [self._getBaseStub(), functionName, list(args)]
    if not retVal["OK"]:
        retVal["rpcStub"] = stub
        return retVal

    # Get the transport connection ID as well as the Transport object
    trid, transport = retVal["Value"]

    try:
        # Handshake to perform the RPC call for functionName
        retVal = self._proposeAction(transport, ("RPC", functionName))
        if not retVal["OK"]:
            if cmpError(retVal, ENOAUTH):
                # This query is unauthorized
                retVal["rpcStub"] = stub
                return retVal
            else:
                # We have a network problem or the service is not responding
                if self.__retry < 3:
                    self.__retry += 1
                    return self.executeRPC(functionName, args)
                else:
                    retVal["rpcStub"] = stub
                    return retVal

        # Send the arguments to the function
        # Note: we need to convert the arguments to list
        # We do not need to deserialize it because variadic functions
        # can work with list too
        retVal = transport.sendData(S_OK(list(args)))
        if not retVal["OK"]:
            return retVal

        # Get the result of the call and append the stub to it.
        # Note that the RPC timeout basically ticks here, since
        # the client waits for data for as long as the server side
        # processes the request.
        receivedData = transport.receiveData()
        if isinstance(receivedData, dict):
            receivedData["rpcStub"] = stub
        return receivedData
    finally:
        self._disconnect(trid)
def cleanTransformationLogFiles(self, directory):
    """Clean up transformation logs from the given directory.

    :param self: self reference
    :param str directory: folder name
    """
    self.log.verbose("Removing log files found in the directory %s" % directory)
    res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
    if not res['OK']:
        if cmpError(res, errno.ENOENT):  # No such file or directory
            self.log.warn("Transformation log directory does not exist", directory)
            return S_OK()
        self.log.error("Failed to remove log files", res['Message'])
        return res
    self.log.info("Successfully removed transformation log directory")
    return S_OK()
def executeRPC(self, functionName, args):
    """Perform the RPC call, connect before and disconnect after.

    :param functionName: name of the function
    :param args: arguments to the function

    :return: in case of success, the return of the server call.
             In any case we add the connection stub to it.
    """
    retVal = self._connect()

    # Generate the stub which contains all the connection and call options
    stub = (self._getBaseStub(), functionName, args)
    if not retVal['OK']:
        retVal['rpcStub'] = stub
        return retVal

    # Get the transport connection ID as well as the Transport object
    trid, transport = retVal['Value']

    try:
        # Handshake to perform the RPC call for functionName
        retVal = self._proposeAction(transport, ("RPC", functionName))
        if not retVal['OK']:
            if cmpError(retVal, ENOAUTH):
                # This query is unauthorized
                retVal['rpcStub'] = stub
                return retVal
            else:
                # We have a network problem or the service is not responding
                if self.__retry < 3:
                    self.__retry += 1
                    return self.executeRPC(functionName, args)
                else:
                    retVal['rpcStub'] = stub
                    return retVal

        # Send the arguments to the function
        retVal = transport.sendData(S_OK(args))
        if not retVal['OK']:
            return retVal

        # Get the result of the call and append the stub to it
        receivedData = transport.receiveData()
        if isinstance(receivedData, dict):
            receivedData['rpcStub'] = stub
        return receivedData
    finally:
        self._disconnect(trid)
def __fetchSpaceToken(seName):
    """Fetch the space token of a storage element.

    :param seName: name of the StorageElement
    :returns: space token. If there is no SpaceToken defined, returns None
    """
    seToken = None
    if seName:
        seObj = StorageElement(seName)
        res = seObj.getStorageParameters(protocol='srm')
        if not res['OK']:
            # If there is no SRM protocol, we do not specify the space token
            if cmpError(res, errno.ENOPROTOOPT):
                return S_OK(None)
            return res
        seToken = res["Value"].get("SpaceToken")

    return S_OK(seToken)
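# Hedged usage sketch for __fetchSpaceToken above: "no SRM protocol" is a
# legitimate S_OK(None), not an error, so callers must unwrap 'Value' before
# testing it. 'SOME-SE' is an illustrative SE name, not a real configuration.
from DIRAC import gLogger

res = __fetchSpaceToken('SOME-SE')
if not res['OK']:
    gLogger.error("Could not fetch space token", res['Message'])
else:
    spaceToken = res['Value']  # may legitimately be None for a non-SRM SE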
def getFilesToStage(lfnList, jobState=None, checkOnlyTapeSEs=None, jobLog=None):
    """Utility that returns out of a list of LFNs those files that are offline,
    and those for which at least one copy is online
    """
    if not lfnList:
        return S_OK({'onlineLFNs': [], 'offlineLFNs': {}, 'failedLFNs': [], 'absentLFNs': {}})

    dm = DataManager()
    if isinstance(lfnList, six.string_types):
        lfnList = [lfnList]

    lfnListReplicas = dm.getReplicasForJobs(lfnList, getUrl=False)
    if not lfnListReplicas['OK']:
        return lfnListReplicas

    offlineLFNsDict = {}
    onlineLFNs = {}
    offlineLFNs = {}
    absentLFNs = {}
    failedLFNs = set()
    if lfnListReplicas['Value']['Failed']:
        # Check if files are not existing
        for lfn, reason in lfnListReplicas['Value']['Failed'].items():
            # FIXME: awful check until FC returns a proper error
            if cmpError(reason, errno.ENOENT) or 'No such file' in reason:
                # The file doesn't exist, job must be Failed
                # FIXME: it is not possible to return here an S_ERROR(), return the message only
                absentLFNs[lfn] = S_ERROR(errno.ENOENT, 'File not in FC')['Message']
        if absentLFNs:
            return S_OK({'onlineLFNs': list(onlineLFNs),
                         'offlineLFNs': offlineLFNsDict,
                         'failedLFNs': list(failedLFNs),
                         'absentLFNs': absentLFNs})
        return S_ERROR("Failures in getting replicas")

    lfnListReplicas = lfnListReplicas['Value']['Successful']
    # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
    # We shall check all files anyway in order to make sure they exist
    seToLFNs = dict()
    for lfn, ses in lfnListReplicas.items():
        for se in ses:
            seToLFNs.setdefault(se, list()).append(lfn)

    if seToLFNs:
        if jobState:
            # Get user name and group from the job state
            userName = jobState.getAttribute('Owner')
            if not userName['OK']:
                return userName
            userName = userName['Value']
            userGroup = jobState.getAttribute('OwnerGroup')
            if not userGroup['OK']:
                return userGroup
            userGroup = userGroup['Value']
        else:
            userName = None
            userGroup = None
        # Check whether files are Online or Offline, or missing at SE
        result = _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,  # pylint: disable=unexpected-keyword-arg
                                    checkOnlyTapeSEs=checkOnlyTapeSEs, jobLog=jobLog,
                                    proxyUserName=userName,
                                    proxyUserGroup=userGroup,
                                    executionLock=True)

        if not result['OK']:
            return result
        failedLFNs = set(lfnList) - set(onlineLFNs) - set(offlineLFNs) - set(absentLFNs)

    # Get the online SEs
    dmsHelper = DMSHelpers()
    onlineSEs = set(se for ses in onlineLFNs.values() for se in ses)
    onlineSites = set(dmsHelper.getLocalSiteForSE(se).get('Value') for se in onlineSEs) - {None}
    for lfn in offlineLFNs:
        ses = offlineLFNs[lfn]
        if len(ses) == 1:
            # No choice, let's go
            offlineLFNsDict.setdefault(ses[0], list()).append(lfn)
            continue
        # Try and get an SE at a site already with online files
        found = False
        if onlineSites:
            # If there is at least one online site, select one
            for se in ses:
                site = dmsHelper.getLocalSiteForSE(se)
                if site['OK']:
                    if site['Value'] in onlineSites:
                        offlineLFNsDict.setdefault(se, list()).append(lfn)
                        found = True
                        break
        # No online site found in common, select randomly
        if not found:
            offlineLFNsDict.setdefault(random.choice(ses), list()).append(lfn)

    return S_OK({'onlineLFNs': list(onlineLFNs),
                 'offlineLFNs': offlineLFNsDict,
                 'failedLFNs': list(failedLFNs),
                 'absentLFNs': absentLFNs,
                 'onlineSites': onlineSites})
def _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,
                       checkOnlyTapeSEs=None, jobLog=None,
                       proxyUserName=None,
                       proxyUserGroup=None,
                       executionLock=None):
    """Checks on SEs whether the file is NEARLINE or ONLINE.

    onlineLFNs, offlineLFNs and absentLFNs are modified to contain the files found online
    If checkOnlyTapeSEs is True, disk replicas are not checked
    As soon as a replica is found Online for a file, no further check is made
    """
    # Only check on storage if it is a tape SE
    if jobLog is None:
        logger = gLogger
    else:
        logger = jobLog

    if checkOnlyTapeSEs is None:
        # Default value is True
        checkOnlyTapeSEs = True

    failed = {}
    for se, lfnsInSEList in seToLFNs.items():
        # If we have found already all files online at another SE, no need to check the others
        # but still we want to set the SE as Online if not a TapeSE
        vo = getVOForGroup(proxyUserGroup)
        seObj = StorageElement(se, vo=vo)
        status = seObj.getStatus()
        if not status['OK']:
            return status
        tapeSE = status['Value']['TapeSE']
        diskSE = status['Value']['DiskSE']

        # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
        filesToCheck = []
        for lfn in lfnsInSEList:
            # If the file had already been found accessible at an SE, only check that this one is on disk
            diskIsOK = checkOnlyTapeSEs or (lfn in onlineLFNs)
            if diskIsOK and diskSE:
                onlineLFNs.setdefault(lfn, []).append(se)
            elif not diskIsOK or (tapeSE and (lfn not in onlineLFNs)):
                filesToCheck.append(lfn)
        if not filesToCheck:
            continue

        # We have to use a new SE object because it caches the proxy!
        with UserProxy(proxyUserName=proxyUserName,
                       proxyUserGroup=proxyUserGroup,
                       executionLock=executionLock) as proxyResult:
            if proxyResult['OK']:
                fileMetadata = StorageElement(se, vo=vo).getFileMetadata(filesToCheck)
            else:
                fileMetadata = proxyResult

        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(filesToCheck, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].items():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.setdefault(lfn, []).append(se)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & set(onlineLFNs):
        offlineLFNs.pop(lfn)

    # If the file was found staged, ignore possible errors, but print out errors
    for se, failedLfns in list(failed.items()):
        logger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in list(failedLfns.items()):
            if lfn in onlineLFNs:
                logger.warn(reason, 'for %s, but there is an online replica' % lfn)
                failed[se].pop(lfn)
            else:
                logger.error(reason, 'for %s, no online replicas' % lfn)
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)

    # Find the files that do not exist at SE
    if failed:
        logger.error("Error getting metadata",
                     "for %d files" % len(set(lfn for lfnList in failed.values() for lfn in lfnList)))

    for lfn in absentLFNs:
        seList = absentLFNs[lfn]
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR(errno.ENOENT, "File not at %s" % ','.join(sorted(seList)))['Message']
    # Format the error for absent files
    return S_OK()
def execute(self):
    """Read requests from RequestClient and enqueue them into ProcessPool."""
    gMonitor.addMark("Iteration", 1)
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
        self.log.debug("execute: executed %d requests in this cycle so far" % taskCounter)
        requestsToExecute = []

        if not self.__bulkRequest:
            self.log.info("execute: ask for a single request")
            getRequest = self.requestClient().getRequest()
            if not getRequest["OK"]:
                self.log.error("execute: %s" % getRequest["Message"])
                break
            if not getRequest["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            requestsToExecute = [getRequest["Value"]]
        else:
            numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
            self.log.info("execute: ask for %s requests" % numberOfRequest)
            getRequests = self.requestClient().getBulkRequests(numberOfRequest)
            if not getRequests["OK"]:
                self.log.error("execute: %s" % getRequests["Message"])
                break
            if not getRequests["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            for rId in getRequests["Value"]["Failed"]:
                self.log.error("execute: %s" % getRequests["Value"]["Failed"][rId])
            requestsToExecute = getRequests["Value"]["Successful"].values()

        self.log.info("execute: will execute %s requests" % len(requestsToExecute))

        for request in requestsToExecute:
            # # set task id
            taskID = request.RequestID

            self.log.info("processPool tasks idle = %s working = %s" %
                          (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()))

            looping = 0
            while True:
                if not self.processPool().getFreeSlots():
                    if not looping:
                        self.log.info("No free slots available in processPool, will wait %d seconds to proceed" %
                                      self.__poolSleep)
                    time.sleep(self.__poolSleep)
                    looping += 1
                else:
                    if looping:
                        self.log.info("Free slot found after %d seconds" % (looping * self.__poolSleep))
                    looping = 0

                    # # save current request in cache
                    res = self.cacheRequest(request)
                    if not res['OK']:
                        if cmpError(res, errno.EALREADY):
                            # The request is already in the cache; break out of the while loop to get the next request
                            break
                        # There are too many requests in the cache, commit suicide
                        self.log.error(res['Message'],
                                       '(%d requests): put back all requests and exit cycle' % len(self.__requestCache))
                        self.putAllRequests()
                        return res

                    # # serialize to JSON
                    result = request.toJSON()
                    if not result['OK']:
                        continue
                    requestJSON = result['Value']
                    self.log.info("spawning task for request '%s/%s'" % (request.RequestID, request.RequestName))
                    timeOut = self.getTimeout(request)
                    enqueue = self.processPool().createAndQueueTask(RequestTask,
                                                                    kwargs={"requestJSON": requestJSON,
                                                                            "handlersDict": self.handlersDict,
                                                                            "csPath": self.__configPath,
                                                                            "agentName": self.agentName},
                                                                    taskID=taskID,
                                                                    blocking=True,
                                                                    usePoolCallbacks=True,
                                                                    timeOut=timeOut)
                    if not enqueue["OK"]:
                        self.log.error(enqueue["Message"])
                    else:
                        self.log.debug("successfully enqueued task '%s'" % taskID)
                        # # update monitor
                        gMonitor.addMark("Processed", 1)
                        # # update request counter
                        taskCounter += 1
                        # # task created, a little time kick to proceed
                        time.sleep(0.1)
                    break

    self.log.info('Flushing callbacks (%d requests still in cache)' % len(self.__requestCache))
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
        self.log.fatal("Results queue is screwed up")
        sys.exit(1)
    # # clean return
    return S_OK()
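# A sketch of the cacheRequest contract the loop above relies on: the cache
# answers S_ERROR(errno.EALREADY) for a request it already holds, and a generic
# S_ERROR once it is full. Names and sizes are illustrative, not DIRAC API.
import errno

from DIRAC import S_OK, S_ERROR


class RequestCache(object):
    def __init__(self, maxSize=100):
        self.__cache = {}
        self.__maxSize = maxSize

    def cacheRequest(self, request):
        """Cache one request; the caller distinguishes the two failure
        modes with cmpError, as in the agent loop above."""
        if request.RequestID in self.__cache:
            return S_ERROR(errno.EALREADY, 'Request already in cache')
        if len(self.__cache) >= self.__maxSize:
            return S_ERROR('Too many requests in cache')
        self.__cache[request.RequestID] = request
        return S_OK()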
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

    log = self._log.getSubLogger("_prepareNewJobs")

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res["OK"]:
        return res
    filesGroupedByTarget = res["Value"]

    for targetSE, ftsFiles in filesGroupedByTarget.items():
        res = self._checkSEAccess(targetSE, "WriteAccess", vo=self.vo)
        if not res["OK"]:
            # If the SE is currently banned, we just skip it
            if cmpError(res, errno.EACCES):
                log.info("Write access currently not permitted to %s, skipping." % targetSE)
            else:
                log.error(res)
                for ftsFile in ftsFiles:
                    ftsFile.attempt += 1
            continue

        sourceSEs = self.sourceSEs.split(",") if self.sourceSEs is not None else []
        # { sourceSE : [FTSFiles] }
        res = FTS3Utilities.selectUniqueSource(ftsFiles, self.fts3Plugin, allowedSources=sourceSEs)
        if not res["OK"]:
            return res

        uniqueTransfersBySource, failedFiles = res["Value"]

        # Treat the errors of the failed files
        for ftsFile, errMsg in failedFiles.items():
            log.error("Error when selecting random sources", "%s, %s" % (ftsFile.lfn, errMsg))
            # If the error is that the file does not exist in the catalog, fail it!
            if cmpError(errMsg, errno.ENOENT):
                log.error("The file does not exist, setting it Defunct", "%s" % ftsFile.lfn)
                ftsFile.status = "Defunct"

        # We don't need to check the source, since it is already filtered by the DataManager
        for sourceSE, ftsFiles in uniqueTransfersBySource.items():

            # Checking whether we will need a multiHop transfer
            multiHopSE = self.fts3Plugin.findMultiHopSEToCoverUpForWLCGFailure(sourceSE, targetSE)
            if multiHopSE:
                log.verbose("WLCG failure manifestation, use %s for multihop, max files per job is 1" % multiHopSE)

                # Check that we can write to and read from it
                try:
                    for accessType in ("Read", "Write"):
                        res = self._checkSEAccess(multiHopSE, "%sAccess" % accessType, vo=self.vo)
                        if not res["OK"]:
                            # If the SE is currently banned, we just skip it
                            if cmpError(res, errno.EACCES):
                                log.info("Access currently not permitted", "%s to %s" % (accessType, multiHopSE))
                            else:
                                log.error("CheckSEAccess error", res)
                                for ftsFile in ftsFiles:
                                    ftsFile.attempt += 1
                            # If we have a problem with the multiHop SE,
                            # we skip the whole loop for the pair
                            # (targetSE, sourceSE)
                            raise RuntimeError("MultiHopSE unavailable")
                except RuntimeError:
                    log.info("Problem with multiHop SE, skipping transfers from %s to %s." % (sourceSE, targetSE))
                    continue

                maxFilesPerJob = 1
            # Check if we need a multihop staging
            elif self.__needsMultiHopStaging(sourceSE, targetSE):
                log.verbose("Needs multihop staging, max files per job is 1")
                maxFilesPerJob = 1

            for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):
                newJob = self._createNewJob("Transfer", ftsFilesChunk, targetSE,
                                            sourceSE=sourceSE, multiHopSE=multiHopSE)
                newJobs.append(newJob)

    return S_OK(newJobs)
def _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,
                       checkOnlyTapeSEs=None, jobLog=None,
                       proxyUserName=None,
                       proxyUserGroup=None,
                       executionLock=None):
    """Checks on SEs whether the file is NEARLINE or ONLINE.

    onlineLFNs, offlineLFNs and absentLFNs are modified to contain the files found online
    If checkOnlyTapeSEs is True, disk replicas are not checked
    As soon as a replica is found Online for a file, no further check is made
    """
    # Only check on storage if it is a tape SE
    if jobLog is None:
        logger = gLogger
    else:
        logger = jobLog

    if checkOnlyTapeSEs is None:
        # Default value is True
        checkOnlyTapeSEs = True

    failed = {}
    for se, lfnsInSEList in seToLFNs.iteritems():
        # No need to check files that are already known to be Online
        lfnsInSEList = list(set(lfnsInSEList) - onlineLFNs)
        if not lfnsInSEList:
            continue

        seObj = StorageElement(se)
        status = seObj.getStatus()
        if not status['OK']:
            logger.error("Could not get SE status", "%s - %s" % (se, status['Message']))
            return status
        tapeSE = status['Value']['TapeSE']

        # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
        if checkOnlyTapeSEs and not tapeSE:
            onlineLFNs.update(lfnsInSEList)
            continue

        # Wrap the SE method with executeWithUserProxy
        fileMetadata = (executeWithUserProxy(seObj.getFileMetadata)
                        (lfnsInSEList,
                         proxyUserName=proxyUserName,
                         proxyUserGroup=proxyUserGroup,
                         executionLock=executionLock))

        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(lfnsInSEList, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.add(lfn)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & onlineLFNs:
        offlineLFNs.pop(lfn)

    # If the file was found staged, ignore possible errors, but print out errors
    for se, failedLfns in failed.items():
        logger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in failedLfns.items():
            if lfn in onlineLFNs:
                logger.warn(reason, 'for %s, but there is an online replica' % lfn)
                failed[se].pop(lfn)
            else:
                logger.error(reason, 'for %s, no online replicas' % lfn)
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)

    # Find the files that do not exist at SE
    if failed:
        logger.error("Error getting metadata",
                     "for %d files" % len(set(lfn for lfnList in failed.itervalues() for lfn in lfnList)))

    for lfn in absentLFNs:
        seList = absentLFNs[lfn]
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR(errno.ENOENT, "File not at %s" % ','.join(seList))['Message']
    # Format the error for absent files
    return S_OK()
def _treatOperation(self, operation):
    """Treat one operation:

    * does the callback if the operation is finished
    * generates new jobs and submits them

    :param operation: the operation to treat

    :return: operation, S_OK()/S_ERROR()
    """
    try:
        threadID = current_process().name
        log = gLogger.getLocalSubLogger("treatOperation/%s" % operation.operationID)

        # If the operation is totally processed
        # we perform the callback
        if operation.isTotallyProcessed():
            log.debug("FTS3Operation %s is totally processed" % operation.operationID)
            res = operation.callback()

            if not res["OK"]:
                log.error("Error performing the callback", res)
                log.info("Putting back the operation")
                dbRes = self.fts3db.persistOperation(operation)
                if not dbRes["OK"]:
                    log.error("Could not persist operation", dbRes)

            return operation, res

        else:
            log.debug("FTS3Operation %s is not totally processed yet" % operation.operationID)

            # This flag is set to False if we want to stop the ongoing processing
            # of an operation, typically when the matching RMS Request has been
            # canceled (see below)
            continueOperationProcessing = True

            # Check the status of the associated RMS Request.
            # If it is canceled or does not exist anymore then we will not create
            # new FTS3Jobs, and mark this FTS3Operation as canceled.
            if operation.rmsReqID:
                res = ReqClient().getRequestStatus(operation.rmsReqID)
                if not res["OK"]:
                    # If the Request does not exist anymore
                    if cmpError(res, errno.ENOENT):
                        log.info(
                            "The RMS Request does not exist anymore, canceling the FTS3Operation",
                            "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                        )
                        operation.status = "Canceled"
                        continueOperationProcessing = False
                    else:
                        log.error("Could not get request status", res)
                        return operation, res
                else:
                    rmsReqStatus = res["Value"]

                    if rmsReqStatus == "Canceled":
                        log.info(
                            "The RMS Request is canceled, canceling the FTS3Operation",
                            "rmsReqID: %s, FTS3OperationID: %s" % (operation.rmsReqID, operation.operationID),
                        )
                        operation.status = "Canceled"
                        continueOperationProcessing = False

            if continueOperationProcessing:
                res = operation.prepareNewJobs(
                    maxFilesPerJob=self.maxFilesPerJob, maxAttemptsPerFile=self.maxAttemptsPerFile
                )

                if not res["OK"]:
                    log.error("Cannot prepare new Jobs", "FTS3Operation %s : %s" % (operation.operationID, res))
                    return operation, res

                newJobs = res["Value"]

                log.debug("FTS3Operation %s: %s new jobs to be submitted" % (operation.operationID, len(newJobs)))

                for ftsJob in newJobs:
                    res = self._serverPolicy.chooseFTS3Server()
                    if not res["OK"]:
                        log.error(res)
                        continue

                    ftsServer = res["Value"]
                    log.debug("Use %s server" % ftsServer)

                    ftsJob.ftsServer = ftsServer

                    res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsServer, threadID=threadID)
                    if not res["OK"]:
                        log.error("Could not get context", res)
                        continue

                    context = res["Value"]

                    try:
                        tpcProtocols = operation.fts3Plugin.selectTPCProtocols(ftsJob=ftsJob)
                    except ValueError as e:
                        log.error("Could not select TPC list", repr(e))
                        continue

                    res = ftsJob.submit(context=context, protocols=tpcProtocols)
                    if not res["OK"]:
                        log.error("Could not submit FTS3Job", "FTS3Operation %s : %s" % (operation.operationID, res))
                        continue

                    operation.ftsJobs.append(ftsJob)

                    submittedFileIds = res["Value"]
                    log.info(
                        "FTS3Operation %s: Submitted job for %s transfers"
                        % (operation.operationID, len(submittedFileIds))
                    )

            # new jobs are put in the DB at the same time
            res = self.fts3db.persistOperation(operation)
            if not res["OK"]:
                log.error("Could not persist operation", res)

            return operation, res

    except Exception as e:
        log.exception("Exception in the thread", repr(e))
        return operation, S_ERROR("Exception %s" % repr(e))
def _monitorJob(self, ftsJob):
    """* query the FTS servers
       * update the FTSFile status
       * update the FTSJob status

    :param ftsJob: FTS job

    :return: ftsJob, S_OK()/S_ERROR()
    """
    # General try/except to avoid that the thread dies
    try:
        threadID = current_process().name
        log = gLogger.getSubLogger("_monitorJob/%s" % ftsJob.jobID, child=True)

        res = self.getFTS3Context(ftsJob.username, ftsJob.userGroup, ftsJob.ftsServer, threadID=threadID)
        if not res['OK']:
            log.error("Error getting context", res)
            return ftsJob, res

        context = res['Value']

        res = ftsJob.monitor(context=context)
        if not res['OK']:
            log.error("Error monitoring job", res)

            # If the job was not found on the server, update the DB
            if cmpError(res, errno.ESRCH):
                res = self.fts3db.cancelNonExistingJob(ftsJob.operationID, ftsJob.ftsGUID)

            return ftsJob, res

        # { fileID : { Status, Error } }
        filesStatus = res['Value']

        # Specify the job ftsGUID to make sure we do not overwrite
        # the status of files already taken by newer jobs
        res = self.fts3db.updateFileStatus(filesStatus, ftsGUID=ftsJob.ftsGUID)
        if not res['OK']:
            log.error("Error updating file fts status", "%s, %s" % (ftsJob.ftsGUID, res))
            return ftsJob, res

        upDict = {
            ftsJob.jobID: {
                'status': ftsJob.status,
                'error': ftsJob.error,
                'completeness': ftsJob.completeness,
                'operationID': ftsJob.operationID,
                'lastMonitor': True,
            }
        }
        res = self.fts3db.updateJobStatus(upDict)

        if ftsJob.status in ftsJob.FINAL_STATES:
            self.__sendAccounting(ftsJob)

        return ftsJob, res

    except Exception as e:
        return ftsJob, S_ERROR(0, "Exception %s" % repr(e))
def execute(self):
    """Main execution method."""

    gMonitor.addMark('Iteration', 1)
    # Get all the transformations
    result = self.transClient.getTransformations({'Status': 'Active', 'Type': self.transformationTypes})
    if not result['OK']:
        self.log.error("InputDataAgent.execute: Failed to get transformations.", result['Message'])
        return S_OK()

    # Process each transformation
    for transDict in result['Value']:
        transID = long(transDict['TransformationID'])
        # res = self.transClient.getTransformationInputDataQuery( transID )
        res = self.transClient.getTransformationMetaQuery(transID, 'Input')
        if not res['OK']:
            if cmpError(res, ENOENT):
                self.log.info("InputDataAgent.execute: No input data query found for transformation", transID)
            else:
                self.log.error("InputDataAgent.execute: Failed to get input data query",
                               "for %d: %s" % (transID, res['Message']))
            continue
        inputDataQuery = res['Value']

        if self.refreshonly:
            # Determine the correct time stamp to use for this transformation
            if transID in self.timeLog:
                if transID in self.fullTimeLog:
                    # If it is more than a day since the last reduced query, make a full query just in case
                    if (datetime.datetime.utcnow() - self.fullTimeLog[transID]) < \
                            datetime.timedelta(seconds=self.fullUpdatePeriod):
                        timeStamp = self.timeLog[transID]
                        if self.dateKey:
                            inputDataQuery[self.dateKey] = \
                                (timeStamp - datetime.timedelta(seconds=10)).strftime('%Y-%m-%d %H:%M:%S')
                        else:
                            self.log.error("DateKey was not set in the CS, cannot use the RefreshOnly")
                    else:
                        self.fullTimeLog[transID] = datetime.datetime.utcnow()
            self.timeLog[transID] = datetime.datetime.utcnow()
            if transID not in self.fullTimeLog:
                self.fullTimeLog[transID] = datetime.datetime.utcnow()

        # Perform the query to the metadata catalog
        self.log.verbose("Using input data query for transformation", "%d: %s" % (transID, str(inputDataQuery)))
        start = time.time()
        result = self.metadataClient.findFilesByMetadata(inputDataQuery)
        rtime = time.time() - start
        self.log.verbose("Metadata catalog query time", ": %.2f seconds." % (rtime))
        if not result['OK']:
            self.log.error("InputDataAgent.execute: Failed to get response from the metadata catalog",
                           result['Message'])
            continue
        lfnList = result['Value']

        # Check if the number of files has changed since the last cycle
        nlfns = len(lfnList)
        self.log.info("files returned for transformation from the metadata catalog: ",
                      "%d -> %d" % (int(transID), nlfns))
        if nlfns == self.fileLog.get(transID):
            self.log.verbose('No new files in metadata catalog since last check')
        self.fileLog[transID] = nlfns

        # Add any new files to the transformation
        addedLfns = []
        if lfnList:
            self.log.verbose('Processing lfns for transformation:', "%d -> %d" % (transID, len(lfnList)))
            # Add the files to the transformation
            self.log.verbose('Adding lfns for transformation:', "%d -> %d" % (transID, len(lfnList)))
            result = self.transClient.addFilesToTransformation(transID, sorted(lfnList))
            if not result['OK']:
                self.log.warn("InputDataAgent.execute: failed to add lfns to transformation", result['Message'])
                self.fileLog[transID] = 0
            else:
                if result['Value']['Failed']:
                    for lfn, error in result['Value']['Failed'].items():
                        self.log.warn("InputDataAgent.execute: Failed to add to transformation:",
                                      "%s: %s" % (lfn, error))
                if result['Value']['Successful']:
                    for lfn, status in result['Value']['Successful'].items():
                        if status == 'Added':
                            addedLfns.append(lfn)
                    self.log.info("InputDataAgent.execute: Added files to transformation", "(%d)" % len(addedLfns))

    return S_OK()
def transferAndRegisterFile(self,
                            fileName,
                            localPath,
                            lfn,
                            destinationSEList,
                            fileMetaDict,
                            fileCatalog=None,
                            masterCatalogOnly=False):
    """Performs the transfer and register operation with failover."""
    errorList = []
    fileGUID = fileMetaDict.get("GUID", None)
    fileChecksum = fileMetaDict.get("Checksum", None)

    for se in destinationSEList:

        # We put here some retry in case the problem comes from the FileCatalog
        # being unavailable. If it is, then the `hasAccess` call would fail,
        # and we would not make any failover request. So the only way is to wait a bit.
        # This keeps the WN busy for a while, but at least we do not lose all the
        # processing time we just spent.
        for sleeptime in (10, 60, 300, 600):
            self.log.info("Attempting dm.putAndRegister",
                          "('%s','%s','%s',guid='%s',catalog='%s', checksum = '%s')" %
                          (lfn, localPath, se, fileGUID, fileCatalog, fileChecksum))

            result = DataManager(catalogs=fileCatalog,
                                 masterCatalogOnly=masterCatalogOnly).putAndRegister(lfn,
                                                                                     localPath,
                                                                                     se,
                                                                                     guid=fileGUID,
                                                                                     checksum=fileChecksum)
            self.log.verbose(result)
            # If the FC is unavailable, we stay in the loop and retry,
            # otherwise we continue without retrying
            if result['OK'] or not cmpError(result, EFCERR):
                break
            self.log.error("transferAndRegisterFile: FC unavailable, retry")
            time.sleep(sleeptime)

        if not result['OK']:
            self.log.error('dm.putAndRegister failed with message', result['Message'])
            errorList.append(result['Message'])
            continue

        if not result['Value']['Failed']:
            self.log.info('dm.putAndRegister successfully uploaded and registered',
                          '%s to %s' % (fileName, se))
            return S_OK({'uploadedSE': se, 'lfn': lfn})

        # Now we know something went wrong
        self.log.warn("Didn't manage to do everything, now adding requests for the missing operation")

        errorDict = result['Value']['Failed'][lfn]
        if 'register' not in errorDict:
            self.log.error('dm.putAndRegister failed with unknown error', str(errorDict))
            errorList.append('Unknown error while attempting upload to %s' % se)
            continue

        # fileDict = errorDict['register']
        # Therefore the registration failed but the upload was successful
        if not fileCatalog:
            fileCatalog = ''

        if masterCatalogOnly:
            fileCatalog = FileCatalog().getMasterCatalogNames()['Value']

        result = self._setRegistrationRequest(lfn, se, fileMetaDict, fileCatalog)
        if not result['OK']:
            self.log.error('Failed to set registration request', 'SE %s and metadata: \n%s' % (se, fileMetaDict))
            errorList.append('Failed to set registration request for: SE %s and metadata: \n%s' % (se, fileMetaDict))
            continue
        else:
            self.log.info('Successfully set registration request',
                          'for: SE %s and metadata: \n%s' % (se, fileMetaDict))
            metadata = {}
            metadata['filedict'] = fileMetaDict
            metadata['uploadedSE'] = se
            metadata['lfn'] = lfn
            metadata['registration'] = 'request'
            return S_OK(metadata)

    self.log.error('Failed to upload output data file', 'Encountered %s errors' % len(errorList))
    return S_ERROR('Failed to upload output data file')
def _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs):
    """Checks on SEs whether the file is NEARLINE or ONLINE.

    onlineLFNs is modified to contain the files found online
    """
    # Only check on storage if it is a tape SE
    failed = {}
    for se, lfnsInSEList in seToLFNs.iteritems():
        seObj = StorageElement(se)
        status = seObj.getStatus()
        if not status['OK']:
            gLogger.error("Could not get SE status", "%s - %s" % (se, status['Message']))
            return status
        tapeSE = status['Value']['TapeSE']

        # File is at a disk SE, no need to stage
        fileMetadata = seObj.getFileMetadata(lfnsInSEList)
        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(lfnsInSEList, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.add(lfn)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & onlineLFNs:
        offlineLFNs.pop(lfn)

    # If the file was found staged, ignore possible errors, but print out errors
    for se, failedLfns in failed.items():
        gLogger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in failedLfns.items():
            if lfn in onlineLFNs:
                gLogger.info('%s: %s, but there is an online replica' % (lfn, reason))
                failed[se].pop(lfn)
            else:
                gLogger.error('%s: %s, no online replicas' % (lfn, reason))
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)

    # Find the files that do not exist at SE
    if failed:
        gLogger.error("Error getting metadata",
                      "for %d files" % len(set(lfn for lfnList in failed.itervalues() for lfn in lfnList)))

    return S_OK()
def prepareNewJobs(self, maxFilesPerJob=100, maxAttemptsPerFile=10):

    log = self._log.getSubLogger("_prepareNewJobs", child=True)

    filesToSubmit = self._getFilesToSubmit(maxAttemptsPerFile=maxAttemptsPerFile)
    log.debug("%s ftsFiles to submit" % len(filesToSubmit))

    newJobs = []

    # {targetSE : [FTS3Files] }
    res = FTS3Utilities.groupFilesByTarget(filesToSubmit)
    if not res['OK']:
        return res
    filesGroupedByTarget = res['Value']

    for targetSE, ftsFiles in filesGroupedByTarget.items():
        res = self._checkSEAccess(targetSE, 'WriteAccess', vo=self.vo)
        if not res['OK']:
            # If the SE is currently banned, we just skip it
            if cmpError(res, errno.EACCES):
                log.info("Write access currently not permitted to %s, skipping." % targetSE)
            else:
                log.error(res)
                for ftsFile in ftsFiles:
                    ftsFile.attempt += 1
            continue

        sourceSEs = self.sourceSEs.split(',') if self.sourceSEs is not None else []
        # { sourceSE : [FTSFiles] }
        res = FTS3Utilities.selectUniqueRandomSource(ftsFiles, allowedSources=sourceSEs)
        if not res['OK']:
            return res

        uniqueTransfersBySource, failedFiles = res['Value']

        # Treat the errors of the failed files
        for ftsFile, errMsg in failedFiles.items():
            log.error("Error when selecting random sources", "%s, %s" % (ftsFile.lfn, errMsg))
            # If the error is that the file does not exist in the catalog, fail it!
            if cmpError(errMsg, errno.ENOENT):
                log.error("The file does not exist, setting it Defunct", "%s" % ftsFile.lfn)
                ftsFile.status = 'Defunct'

        # We don't need to check the source, since it is already filtered by the DataManager
        for sourceSE, ftsFiles in uniqueTransfersBySource.items():

            if self.__needsMultiHopStaging(sourceSE, targetSE):
                log.verbose("Needs multihop staging, max files per job is 1")
                maxFilesPerJob = 1

            for ftsFilesChunk in breakListIntoChunks(ftsFiles, maxFilesPerJob):
                newJob = self._createNewJob('Transfer', ftsFilesChunk, targetSE, sourceSE=sourceSE)
                newJobs.append(newJob)

    return S_OK(newJobs)
def transferAndRegisterFile(
    self,
    fileName,
    localPath,
    lfn,
    destinationSEList,
    fileMetaDict,
    fileCatalog=None,
    masterCatalogOnly=False,
    retryUpload=False,
):
    """Performs the transfer and register operation with failover.

    :param fileName: of absolutely no use except for printing logs.
    :param localPath: path to the file locally
    :param lfn: LFN
    :param destinationSEList: list of possible destinations for the file.
                              Loop over it until one succeeds or we reach the end of it.
    :param fileMetaDict: file metadata for registration
    :param fileCatalog: list of catalogs to use (see :py:class:`DIRAC.DataManagementSystem.Client.DataManager`)
    :param masterCatalogOnly: use only master catalog (see :py:class:`DIRAC.DataManagementSystem.Client.DataManager`)
    :param retryUpload: if set to True, and there is only one output SE in destinationSEList, retry several times.
    """
    errorList = []
    fileGUID = fileMetaDict.get("GUID", None)
    fileChecksum = fileMetaDict.get("Checksum", None)

    for se in destinationSEList:

        # We put here some retry in case the problem comes from the FileCatalog
        # being unavailable. If it is, then the `hasAccess` call would fail,
        # and we would not make any failover request. So the only way is to wait a bit.
        # This keeps the WN busy for a while, but at least we do not lose all the
        # processing time we just spent.
        # This same retry path is taken if we only have one possible stage out SE
        # and retryUpload is True.
        for sleeptime in (10, 60, 300, 600):
            self.log.info(
                "Attempting dm.putAndRegister",
                "('%s','%s','%s',guid='%s',catalog='%s', checksum = '%s')"
                % (lfn, localPath, se, fileGUID, fileCatalog, fileChecksum),
            )

            result = DataManager(catalogs=fileCatalog, masterCatalogOnly=masterCatalogOnly).putAndRegister(
                lfn, localPath, se, guid=fileGUID, checksum=fileChecksum
            )

            # retry on any failure
            if result["OK"]:
                self.log.verbose(result)
                break
            elif cmpError(result, EFCERR):
                self.log.debug("transferAndRegisterFile: FC unavailable, retry")
            elif retryUpload and len(destinationSEList) == 1:
                self.log.debug("transferAndRegisterFile: Failed uploading to the only SE, retry")
            else:
                self.log.debug("dm.putAndRegister failed, but move to the next")
                break

            time.sleep(sleeptime)

        if not result["OK"]:
            self.log.error("dm.putAndRegister failed with message", result["Message"])
            errorList.append(result["Message"])
            continue

        if not result["Value"]["Failed"]:
            self.log.info("dm.putAndRegister successfully uploaded and registered", "%s to %s" % (fileName, se))
            return S_OK({"uploadedSE": se, "lfn": lfn})

        # Now we know something went wrong
        self.log.warn("Didn't manage to do everything, now adding requests for the missing operation")

        errorDict = result["Value"]["Failed"][lfn]
        if "register" not in errorDict:
            self.log.error("dm.putAndRegister failed with unknown error", str(errorDict))
            errorList.append("Unknown error while attempting upload to %s" % se)
            continue

        # fileDict = errorDict['register']
        # Therefore the registration failed but the upload was successful
        if not fileCatalog:
            fileCatalog = ""

        if masterCatalogOnly:
            fileCatalog = FileCatalog().getMasterCatalogNames()["Value"]

        result = self._setRegistrationRequest(lfn, se, fileMetaDict, fileCatalog)
        if not result["OK"]:
            self.log.error("Failed to set registration request", "SE %s and metadata: \n%s" % (se, fileMetaDict))
            errorList.append("Failed to set registration request for: SE %s and metadata: \n%s" % (se, fileMetaDict))
            continue
        else:
            self.log.info("Successfully set registration request",
                          "for: SE %s and metadata: \n%s" % (se, fileMetaDict))
            metadata = {}
            metadata["filedict"] = fileMetaDict
            metadata["uploadedSE"] = se
            metadata["lfn"] = lfn
            metadata["registration"] = "request"
            return S_OK(metadata)

    self.log.error("Failed to upload output data file", "Encountered %s errors" % len(errorList))
    return S_ERROR("Failed to upload output data file")
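# Generic form of the retry loop used in transferAndRegisterFile above: back
# off and retry only when the failure carries DIRAC's "FileCatalog error" code,
# return immediately on success or any other error. EFCERR comes from DErrno;
# `callWithCatalogRetry` itself is a hypothetical helper, not DIRAC API.
import time

from DIRAC.Core.Utilities.DErrno import cmpError, EFCERR


def callWithCatalogRetry(action, backoffs=(10, 60, 300, 600)):
    """Run `action` (a no-argument callable returning S_OK/S_ERROR),
    sleeping between attempts while the catalog is unavailable."""
    for sleeptime in backoffs:
        result = action()
        # Success, or a failure unrelated to the catalog: stop retrying
        if result['OK'] or not cmpError(result, EFCERR):
            return result
        time.sleep(sleeptime)
    return result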
def getFilesToStage(lfnList, jobState=None, checkOnlyTapeSEs=None, jobLog=None):
    """Utility that returns out of a list of LFNs those files that are offline,
    and those for which at least one copy is online
    """
    if not lfnList:
        return S_OK({'onlineLFNs': [], 'offlineLFNs': {}, 'failedLFNs': [], 'absentLFNs': {}})

    dm = DataManager()
    if isinstance(lfnList, basestring):
        lfnList = [lfnList]

    lfnListReplicas = dm.getReplicasForJobs(lfnList, getUrl=False)
    if not lfnListReplicas['OK']:
        return lfnListReplicas

    offlineLFNsDict = {}
    onlineLFNs = {}
    offlineLFNs = {}
    absentLFNs = {}
    failedLFNs = set()
    if lfnListReplicas['Value']['Failed']:
        # Check if files are not existing
        for lfn, reason in lfnListReplicas['Value']['Failed'].iteritems():
            # FIXME: awful check until FC returns a proper error
            if cmpError(reason, errno.ENOENT) or 'No such file' in reason:
                # The file doesn't exist, job must be Failed
                # FIXME: it is not possible to return here an S_ERROR(), return the message only
                absentLFNs[lfn] = S_ERROR(errno.ENOENT, 'File not in FC')['Message']
        if absentLFNs:
            return S_OK({'onlineLFNs': list(onlineLFNs),
                         'offlineLFNs': offlineLFNsDict,
                         'failedLFNs': list(failedLFNs),
                         'absentLFNs': absentLFNs})
        return S_ERROR("Failures in getting replicas")

    lfnListReplicas = lfnListReplicas['Value']['Successful']
    # If a file is reported here at a tape SE, it is not at a disk SE as we use disk in priority
    # We shall check all files anyway in order to make sure they exist
    seToLFNs = dict()
    for lfn, ses in lfnListReplicas.iteritems():
        for se in ses:
            seToLFNs.setdefault(se, list()).append(lfn)

    if seToLFNs:
        if jobState:
            # Get user name and group from the job state
            userName = jobState.getAttribute('Owner')
            if not userName['OK']:
                return userName
            userName = userName['Value']
            userGroup = jobState.getAttribute('OwnerGroup')
            if not userGroup['OK']:
                return userGroup
            userGroup = userGroup['Value']
        else:
            userName = None
            userGroup = None
        # Check whether files are Online or Offline, or missing at SE
        result = _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,  # pylint: disable=unexpected-keyword-arg
                                    checkOnlyTapeSEs=checkOnlyTapeSEs, jobLog=jobLog,
                                    proxyUserName=userName,
                                    proxyUserGroup=userGroup,
                                    executionLock=True)

        if not result['OK']:
            return result
        failedLFNs = set(lfnList) - set(onlineLFNs) - set(offlineLFNs) - set(absentLFNs)

    # Get the online SEs
    dmsHelper = DMSHelpers()
    onlineSEs = set(se for ses in onlineLFNs.values() for se in ses)
    onlineSites = set(dmsHelper.getLocalSiteForSE(se).get('Value') for se in onlineSEs) - {None}
    for lfn in offlineLFNs:
        ses = offlineLFNs[lfn]
        if len(ses) == 1:
            # No choice, let's go
            offlineLFNsDict.setdefault(ses[0], list()).append(lfn)
            continue
        # Try and get an SE at a site already with online files
        found = False
        if onlineSites:
            # If there is at least one online site, select one
            for se in ses:
                site = dmsHelper.getLocalSiteForSE(se)
                if site['OK']:
                    if site['Value'] in onlineSites:
                        offlineLFNsDict.setdefault(se, list()).append(lfn)
                        found = True
                        break
        # No online site found in common, select randomly
        if not found:
            offlineLFNsDict.setdefault(random.choice(ses), list()).append(lfn)

    return S_OK({'onlineLFNs': list(onlineLFNs),
                 'offlineLFNs': offlineLFNsDict,
                 'failedLFNs': list(failedLFNs),
                 'absentLFNs': absentLFNs,
                 'onlineSites': onlineSites})
def execute(self):
    """Read requests from RequestClient and enqueue them into ProcessPool."""
    gMonitor.addMark("Iteration", 1)
    # # requests (and so tasks) counter
    taskCounter = 0
    while taskCounter < self.__requestsPerCycle:
        self.log.debug("execute: executed %d requests in this cycle so far" % taskCounter)
        requestsToExecute = []

        if not self.__bulkRequest:
            self.log.info("execute: ask for a single request")
            getRequest = self.requestClient().getRequest()
            if not getRequest["OK"]:
                self.log.error("execute:", "%s" % getRequest["Message"])
                break
            if not getRequest["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            requestsToExecute = [getRequest["Value"]]
        else:
            numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
            self.log.info("execute: ask for requests", "%s" % numberOfRequest)
            getRequests = self.requestClient().getBulkRequests(numberOfRequest)
            if not getRequests["OK"]:
                self.log.error("execute:", "%s" % getRequests["Message"])
                break
            if not getRequests["Value"]:
                self.log.info("execute: no more 'Waiting' requests to process")
                break
            for rId in getRequests["Value"]["Failed"]:
                self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])
            requestsToExecute = getRequests["Value"]["Successful"].values()

        self.log.info("execute: will execute requests", "%s" % len(requestsToExecute))

        for request in requestsToExecute:
            # # set task id
            taskID = request.RequestID

            self.log.info("processPool status",
                          "tasks idle = %s working = %s" %
                          (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()))

            looping = 0
            while True:
                if not self.processPool().getFreeSlots():
                    if not looping:
                        self.log.info("No free slots available in processPool",
                                      "will wait %d seconds to proceed" % self.__poolSleep)
                    time.sleep(self.__poolSleep)
                    looping += 1
                else:
                    if looping:
                        self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
                    looping = 0

                    # # save current request in cache
                    res = self.cacheRequest(request)
                    if not res['OK']:
                        if cmpError(res, errno.EALREADY):
                            # The request is already in the cache; break out of the while loop to get the next request
                            break
                        # There are too many requests in the cache, commit suicide
                        self.log.error("Too many requests in cache",
                                       '(%d requests): put back all requests and exit cycle. Error %s' %
                                       (len(self.__requestCache), res['Message']))
                        self.putAllRequests()
                        return res

                    # # serialize to JSON
                    result = request.toJSON()
                    if not result['OK']:
                        continue
                    requestJSON = result['Value']
                    self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                    timeOut = self.getTimeout(request)
                    enqueue = self.processPool().createAndQueueTask(RequestTask,
                                                                    kwargs={"requestJSON": requestJSON,
                                                                            "handlersDict": self.handlersDict,
                                                                            "csPath": self.__configPath,
                                                                            "agentName": self.agentName},
                                                                    taskID=taskID,
                                                                    blocking=True,
                                                                    usePoolCallbacks=True,
                                                                    timeOut=timeOut)
                    if not enqueue["OK"]:
                        self.log.error("Could not enqueue task", enqueue["Message"])
                    else:
                        self.log.debug("successfully enqueued task", "'%s'" % taskID)
                        # # update monitor
                        gMonitor.addMark("Processed", 1)
                        # # update request counter
                        taskCounter += 1
                        # # task created, a little time kick to proceed
                        time.sleep(0.1)
                    break

    self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
    processed = self.processPool().processResults()
    # This happens when the result queue is screwed up.
    # Returning S_ERROR proved not to be sufficient,
    # and when in this situation, there is nothing we can do.
    # So we just exit. runit will restart from scratch.
    if processed < 0:
        self.log.fatal("Results queue is screwed up")
        sys.exit(1)
    # # clean return
    return S_OK()
def _checkFilesToStage(seToLFNs, onlineLFNs, offlineLFNs, absentLFNs,
                       checkOnlyTapeSEs=None, jobLog=None,
                       proxyUserName=None,
                       proxyUserGroup=None,
                       executionLock=None):
    """Checks on SEs whether the file is NEARLINE or ONLINE.

    onlineLFNs, offlineLFNs and absentLFNs are modified to contain the files found online
    If checkOnlyTapeSEs is True, disk replicas are not checked
    As soon as a replica is found Online for a file, no further check is made
    """
    # Only check on storage if it is a tape SE
    if jobLog is None:
        logger = gLogger
    else:
        logger = jobLog

    if checkOnlyTapeSEs is None:
        # Default value is True
        checkOnlyTapeSEs = True

    failed = {}
    for se, lfnsInSEList in seToLFNs.iteritems():
        # If we have found already all files online at another SE, no need to check the others
        # but still we want to set the SE as Online if not a TapeSE
        vo = getVOForGroup(proxyUserGroup)
        seObj = StorageElement(se, vo=vo)
        status = seObj.getStatus()
        if not status['OK']:
            return status
        tapeSE = status['Value']['TapeSE']
        diskSE = status['Value']['DiskSE']

        # If requested to check only Tape SEs and the file is at a diskSE, we guess it is Online...
        filesToCheck = []
        for lfn in lfnsInSEList:
            # If the file had already been found accessible at an SE, only check that this one is on disk
            diskIsOK = checkOnlyTapeSEs or (lfn in onlineLFNs)
            if diskIsOK and diskSE:
                onlineLFNs.setdefault(lfn, []).append(se)
            elif not diskIsOK or (tapeSE and (lfn not in onlineLFNs)):
                filesToCheck.append(lfn)
        if not filesToCheck:
            continue

        # We have to use a new SE object because it caches the proxy!
        with UserProxy(proxyUserName=proxyUserName,
                       proxyUserGroup=proxyUserGroup,
                       executionLock=executionLock) as proxyResult:
            if proxyResult['OK']:
                fileMetadata = StorageElement(se, vo=vo).getFileMetadata(filesToCheck)
            else:
                fileMetadata = proxyResult

        if not fileMetadata['OK']:
            failed[se] = dict.fromkeys(filesToCheck, fileMetadata['Message'])
        else:
            if fileMetadata['Value']['Failed']:
                failed[se] = fileMetadata['Value']['Failed']
            # is there at least one replica online?
            for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
                # SRM returns Cached, but others may only return Accessible
                if mDict.get('Cached', mDict['Accessible']):
                    onlineLFNs.setdefault(lfn, []).append(se)
                elif tapeSE:
                    # A file can be staged only at Tape SE
                    offlineLFNs.setdefault(lfn, []).append(se)
                else:
                    # File not available at a diskSE... we shall retry later
                    pass

    # Doesn't matter if some files are Offline if they are also online
    for lfn in set(offlineLFNs) & set(onlineLFNs):
        offlineLFNs.pop(lfn)

    # If the file was found staged, ignore possible errors, but print out errors
    for se, failedLfns in failed.items():
        logger.error("Errors when getting files metadata", 'at %s' % se)
        for lfn, reason in failedLfns.items():
            if lfn in onlineLFNs:
                logger.warn(reason, 'for %s, but there is an online replica' % lfn)
                failed[se].pop(lfn)
            else:
                logger.error(reason, 'for %s, no online replicas' % lfn)
                if cmpError(reason, errno.ENOENT):
                    absentLFNs.setdefault(lfn, []).append(se)
                    failed[se].pop(lfn)
        if not failed[se]:
            failed.pop(se)

    # Find the files that do not exist at SE
    if failed:
        logger.error("Error getting metadata",
                     "for %d files" % len(set(lfn for lfnList in failed.itervalues() for lfn in lfnList)))

    for lfn in absentLFNs:
        seList = absentLFNs[lfn]
        # FIXME: it is not possible to return here an S_ERROR(), return the message only
        absentLFNs[lfn] = S_ERROR(errno.ENOENT, "File not at %s" % ','.join(sorted(seList)))['Message']
    # Format the error for absent files
    return S_OK()