def execute( self ): """ one cycle execution """ log = gLogger.getSubLogger( "execute" ) # # reset FTSGraph if expired now = datetime.datetime.now() if now > self.__ftsGraphValidStamp: log.info( "resetting expired FTS graph..." ) resetFTSGraph = self.resetFTSGraph() if not resetFTSGraph["OK"]: log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] ) return resetFTSGraph self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # update R/W access in FTSGraph if expired if now > self.__rwAccessValidStamp: log.info( "updating expired R/W access for SEs..." ) try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH ) requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] ) if not requestNames["OK"]: log.error( "unable to read scheduled request names: %s" % requestNames["Message"] ) return requestNames if not requestNames["Value"]: requestNames = self.__reqCache.keys() else: requestNames = [ req[0] for req in requestNames["Value"] ] requestNames = list( set ( requestNames + self.__reqCache.keys() ) ) if not requestNames: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestNames ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) ) for requestName in requestNames: request = self.getRequest( requestName ) if not request["OK"]: log.error( request["Message"] ) continue request = request["Value"] sTJId = request.RequestName while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "request '%s' enqueued for execution" % sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK()
def execute( self ): """ read requests from RequestClient and enqueue them into ProcessPool """ gMonitor.addMark( "Iteration", 1 ) # # requests (and so tasks) counter taskCounter = 0 while taskCounter < self.__requestsPerCycle: self.log.debug( "execute: executing %d request in this cycle" % taskCounter ) getRequest = self.requestClient().getRequest() if not getRequest["OK"]: self.log.error( "execute: %s" % getRequest["Message"] ) break if not getRequest["Value"]: self.log.info( "execute: no more 'Waiting' requests to process" ) break # # OK, we've got you request = getRequest["Value"] # # set task id taskID = request.RequestName # # save current request in cache self.cacheRequest( request ) # # serialize to JSON requestJSON = request.toJSON() if not requestJSON["OK"]: self.log.error( "JSON serialization error: %s" % requestJSON["Message"] ) break requestJSON = requestJSON["Value"] self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses() ) ) while True: if not self.processPool().getFreeSlots(): self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep ) time.sleep( self.__poolSleep ) else: self.log.info( "spawning task for request '%s'" % ( request.RequestName ) ) timeOut = self.getTimeout( request ) enqueue = self.processPool().createAndQueueTask( RequestTask, kwargs = { "requestJSON" : requestJSON, "handlersDict" : self.handlersDict, "csPath" : self.__configPath, "agentName": self.agentName }, taskID = taskID, blocking = True, usePoolCallbacks = True, timeOut = timeOut ) if not enqueue["OK"]: self.log.error( enqueue["Message"] ) else: self.log.debug( "successfully enqueued task '%s'" % taskID ) # # update monitor gMonitor.addMark( "Processed", 1 ) # # update request counter taskCounter += 1 # # task created, a little time kick to proceed time.sleep( 0.1 ) break # # clean return return S_OK()
def __checkReplicas( self ): """ check done replicas and update file states """ waitingFiles = dict( [ ( opFile.LFN, opFile ) for opFile in self.operation if opFile.Status in ( "Waiting", "Scheduled" ) ] ) targetSESet = set( self.operation.targetSEList ) replicas = self.fc.getReplicas( waitingFiles.keys() ) if not replicas["OK"]: self.log.error( replicas["Message"] ) return replicas reMissing = re.compile( "no such file or directory" ) for failedLFN, errStr in replicas["Value"]["Failed"].items(): waitingFiles[failedLFN].Error = errStr if reMissing.search( errStr.lower() ): self.log.error( "file %s does not exists" % failedLFN ) gMonitor.addMark( "ReplicateFail", len( targetSESet ) ) waitingFiles[failedLFN].Status = "Failed" for successfulLFN, reps in replicas["Value"]["Successful"].items(): if targetSESet.issubset( set( reps ) ): self.log.info( "file %s has been replicated to all targets" % successfulLFN ) waitingFiles[successfulLFN].Status = "Done" return S_OK()
def execute(self): """ Main execution method """ delta = time.time() - self.last_update if delta > self.reportPeriod: result = self.jobDB.getCounters("Jobs", ["Status", "Site"], {}, "") if not result["OK"]: return S_ERROR("Failed to get data from the Job Database") self.resultDB = result["Value"] self.last_update = time.time() totalDict = {} for status in MONITOR_STATUS: totalDict[status] = 0 for row in self.resultDB: site = row[0]["Site"] status = row[0]["Status"] count = row[1] if site in MONITOR_SITES and status in MONITOR_STATUS: gLogger.verbose("Adding mark %s-%s: " % (status, site) + str(count)) gMonitor.addMark("%s-%s" % (status, site), count) if status in totalDict: totalDict[status] += count for status in MONITOR_STATUS: gLogger.verbose("Adding mark %s-All sites: " % status + str(totalDict[status])) gMonitor.addMark("%s-All sites" % status, totalDict[status]) return S_OK()
def _endReportToMonitoring( self, initialWallTime, initialCPUTime ): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = cpuTime / wallTime * 100. if percentage > 0: gMonitor.addMark( 'CPU', percentage )
def export_requestJob(self, resourceDescription): """ Serve a job to the request of an agent which is the highest priority one matching the agent's site capacity """ result = self.selectJob(resourceDescription) gMonitor.addMark("matchesDone") return result
def execute( self ): """ execution in one cycle """ now = datetime.datetime.utcnow() kickTime = now - datetime.timedelta( hours = self.KICK_GRACE_HOURS ) rmTime = now - datetime.timedelta( days = self.DEL_GRACE_DAYS ) # # kick statusList = [ "Assigned" ] requestNamesList = self.requestClient().getRequestNamesList( statusList, self.DEL_LIMIT ) if not requestNamesList["OK"]: self.log.error( "execute: %s" % requestNamesList["Message"] ) return requestNamesList requestNamesList = requestNamesList["Value"] kicked = 0 for requestName, status, lastUpdate in requestNamesList: if lastUpdate < kickTime: self.log.info( "execute: kick assigned request '%s'" % requestName ) getRequest = self.requestClient().peekRequest( requestName ) if not getRequest["OK"]: self.log.error( "execute: unable to read request '%s': %s" % ( requestName, getRequest["Message"] ) ) continue getRequest = getRequest["Value"] if getRequest: getRequest.Status = "Waiting" putRequest = self.requestClient().putRequest( getRequest ) if not putRequest["OK"]: self.log.error( "execute: unable to put request '%s': %s" % ( requestName, putRequest["Message"] ) ) continue else: self.log.verbose( "Kicked request %d" % putRequest['Value'] ) kicked += 1 # # delete statusList = [ "Done", "Failed" ] if self.DEL_FAILED else [ "Done" ] requestNamesList = self.requestClient().getRequestNamesList( statusList, self.DEL_LIMIT ) if not requestNamesList["OK"]: self.log.error( "execute: %s" % requestNamesList["Message"] ) return requestNamesList requestNamesList = requestNamesList["Value"] deleted = 0 for requestName, status, lastUpdate in requestNamesList: if lastUpdate < rmTime: self.log.info( "execute: deleting request '%s' with status %s" % ( requestName, status ) ) delRequest = self.requestClient().deleteRequest( requestName ) if not delRequest["OK"]: self.log.error( "execute: unable to delete request '%s': %s" % ( requestName, delRequest["Message"] ) ) continue deleted += 1 gMonitor.addMark( "KickedRequests", kicked ) gMonitor.addMark( "DeletedRequests", deleted ) self.log.info( "execute: kicked assigned requests = %s" % kicked ) self.log.info( "execute: deleted finished requests = %s" % deleted ) return S_OK()
def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." ) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK()
def __monitorJob( self, request, ftsJob ): """ execute FTSJob.monitorFTS for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) ) log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND ) if not monitor["OK"]: gMonitor.addMark( "FTSMonitorFail", 1 ) log.error( monitor["Message"] ) if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \ 'was not found' in monitor['Message'] or\ "Not found" in monitor['Message'] or\ 'Unknown transfer state' in monitor['Message']: log.error( "FTSJob not known (expired on server?): delete it" ) for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) # # No way further for that job: delete it res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not res['OK']: log.error( "Unable to delete FTSJob", res['Message'] ) return S_OK( ftsFilesDict ) return monitor monitor = monitor["Value"] log.info( "FTSJob Status = %s Completeness = %s" % ( ftsJob.Status, ftsJob.Completeness ) ) # # monitor status change gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 ) if ftsJob.Status in FTSJob.FINALSTATES: finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob ) if not finalizeFTSJob["OK"]: if 'Unknown transfer state' in finalizeFTSJob['Message']: for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) # # No way further for that job: delete it res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not res['OK']: log.error( "Unable to delete FTSJob", res['Message'] ) else: log.error( finalizeFTSJob["Message"] ) return finalizeFTSJob else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] ) return S_OK( ftsFilesDict )
def execute( self ): self.enableFlag = self.am_getOption( 'EnableFlag', 'True' ) if not self.enableFlag == 'True': self.log.info( 'MigrationMonitoringAgent is disabled by configuration option %s/EnableFlag' % ( self.section ) ) return S_OK( 'Disabled via CS flag' ) gMonitor.addMark( "Iteration", 1 ) self.NewToMigrating() for se in self.storageElements: gMonitor.addMark( "Iteration%s" % se, 1 ) self.MigratingToMigrated( se ) return S_OK()
def execute( self ): """ one cycle execution """ now = datetime.datetime.now() kickTime = now - datetime.timedelta( hours = self.KICK_ASSIGNED_HOURS ) rmTime = now - datetime.timedelta( days = self.DEL_GRACE_DAYS ) kicked = 0 deleted = 0 # # select Assigned FTSJobs assignedFTSJobList = self.ftsClient().getFTSJobList( ["Assigned"], self.KICK_LIMIT ) if not assignedFTSJobList["OK"]: self.log.error( "execute: %s" % assignedFTSJobList["Message"] ) return assignedFTSJobList assignedFTSJobList = assignedFTSJobList["Value"] for ftsJob in assignedFTSJobList: if ftsJob.LastUpdate > kickTime: self.log.debug( "FTSJob %s is Assigned for too long and has to be kicked" % ftsJob.FTSGUID ) kicked += 1 ftsJob.Status = "Submitted" put = self.ftsClient().putFTSJob( ftsJob ) if not put["OK"]: self.log.error( "execute: unable to put back FTSJob %s: %s" % ( ftsJob.FTSGUID, put["Message"] ) ) return put finishedFTSJobList = self.ftsClient().getFTSJobList( list( FTSJob.FINALSTATES ), self.DEL_LIMIT ) if not finishedFTSJobList["OK"]: self.log.error( "execute: %s" % finishedFTSJobList["Message"] ) return finishedFTSJobList finishedFTSJobList = finishedFTSJobList["Value"] for ftsJob in finishedFTSJobList: if ftsJob.LastUpdate > rmTime: self.log.debug( "FTSJob %s is too old and has to be deleted" % ftsJob.FTSGUID ) delJob = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not delJob["OK"]: self.log.error( "execute: %s" % delJob["Message"] ) return delJob else: putJob = self.ftsClient().putFTSJob( ftsJob ) if not putJob["OK"]: self.log.error( "execute: %s" % putJob["Message"] ) return putJob self.log.info( "Assigned FTSJobs kicked %s Finished FTSJobs deleted %s" % ( kicked, deleted ) ) gMonitor.addMark( "KickedFTSJobs", kicked ) gMonitor.addMark( "DeletedFTSJobs", deleted ) return S_OK()
def __call__( self ): """ action for 'removeFile' operation """ # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare waiting file dict toRemoveDict = dict( [ ( opFile.LFN, opFile ) for opFile in waitingFiles ] ) gMonitor.addMark( "RemoveFileAtt", len( toRemoveDict ) ) # # 1st step - bulk removal self.log.debug( "bulk removal of %s files" % len( toRemoveDict ) ) bulkRemoval = self.bulkRemoval( toRemoveDict ) if not bulkRemoval["OK"]: self.log.error( "bulk removal failed: %s" % bulkRemoval["Message"] ) else: gMonitor.addMark( "RemoveFileOK", len( toRemoveDict ) - len( bulkRemoval["Value"] ) ) toRemoveDict = bulkRemoval["Value"] # # 2nd step - single file removal for lfn, opFile in toRemoveDict.items(): self.log.info( "removing single file %s" % lfn ) singleRemoval = self.singleRemoval( opFile ) if not singleRemoval["OK"]: self.log.error( 'Error removing single file', singleRemoval["Message"] ) gMonitor.addMark( "RemoveFileFail", 1 ) else: self.log.info( "file %s has been removed" % lfn ) gMonitor.addMark( "RemoveFileOK", 1 ) # # set failedFiles = [ ( lfn, opFile ) for ( lfn, opFile ) in toRemoveDict.items() if opFile.Status in ( "Failed", "Waiting" ) ] if failedFiles: self.operation.Error = "failed to remove %d files" % len( failedFiles ) return S_OK()
def _startReportToMonitoring( self ): try: now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return ( now, cpuTime ) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB( 'VmRSS:' ) if membytes: mem = membytes / ( 1024. * 1024. ) gMonitor.addMark( 'MEM', mem ) return( now, cpuTime ) except Exception: return False
def export_getReport( self, reportRequest ): """ Plot a accounting Arguments: - viewName : Name of view (easy!) - startTime - endTime - argsDict : Arguments to the view. - grouping - extraArgs """ retVal = self.__checkPlotRequest( reportRequest ) if not retVal[ 'OK' ]: return retVal reporter = MainReporter( gAccountingDB, self.serviceInfoDict[ 'clientSetup' ] ) gMonitor.addMark( "reportsRequested" ) reportRequest[ 'generatePlot' ] = False return reporter.generate( reportRequest, self.getRemoteCredentials() )
def __monitorJob( self, request, ftsJob ): """ execute FTSJob.monitorFTS2 for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "%s/monitor/%s" % ( request.RequestName, ftsJob.FTSGUID ) ) log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) monitor = ftsJob.monitorFTS2() if not monitor["OK"]: gMonitor.addMark( "FTSMonitorFail", 1 ) log.error( monitor["Message"] ) if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]: log.error( "FTSJob not known (expired on server?)" ) for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"] = ftsFile return S_OK( ftsFilesDict ) return monitor monitor = monitor["Value"] log.info( "FTSJob Status = %s Completeness = %s" % ( ftsJob.Status, ftsJob.Completeness ) ) # # monitor status change gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 ) if ftsJob.Status in FTSJob.FINALSTATES: finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob ) if not finalizeFTSJob["OK"]: log.error( finalizeFTSJob["Message"] ) return finalizeFTSJob ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] ) return S_OK( ftsFilesDict )
def resultCallback( self, taskID, taskResult ): """ definition of request callback function :param self: self reference """ self.log.info("%s result callback" % taskID ) if not taskResult["OK"]: self.log.error( "%s result callback: %s" % ( taskID, taskResult["Message"] ) ) if taskResult["Message"] == "Timed out": self.resetRequest( taskID ) self.deleteRequest( taskID ) return self.deleteRequest( taskID ) taskResult = taskResult["Value"] ## add monitoring info monitor = taskResult["monitor"] if "monitor" in taskResult else {} for mark, value in monitor.items(): try: gMonitor.addMark( mark, value ) except Exception, error: self.log.exception( str(error) )
def export_addFile( self, lfns ): """ Register supplied files """ gMonitor.addMark( "AddFile", 1 ) res = gFileCatalogDB.addFile( lfns, self.getRemoteCredentials() ) if res['OK']: gMonitor.addMark( "AddFileSuccessful", len( res.get( 'Value', {} ).get( 'Successful', [] ) ) ) gMonitor.addMark( "AddFileFailed", len( res.get( 'Value', {} ).get( 'Failed', [] ) ) ) return res
def export_removeReplica( self, lfns ): """ Remove the supplied replicas """ gMonitor.addMark( "RemoveReplica", 1 ) res = gFileCatalogDB.removeReplica( lfns, self.getRemoteCredentials() ) if res['OK']: gMonitor.addMark( "RemoveReplicaSuccessful", len( res.get( 'Value', {} ).get( 'Successful', [] ) ) ) gMonitor.addMark( "RemoveReplicaFailed", len( res.get( 'Value', {} ).get( 'Failed', [] ) ) ) return res
def export_addReplica(self, lfns): """ Register supplied replicas """ gMonitor.addMark("AddReplica", 1) res = gFileCatalogDB.addReplica(lfns, self.getRemoteCredentials()) if res["OK"]: gMonitor.addMark("AddReplicaSuccessful", len(res.get("Value", {}).get("Successful", []))) gMonitor.addMark("AddReplicaFailed", len(res.get("Value", {}).get("Failed", []))) return res
def export_removeFile(self, lfns): """ Remove the supplied lfns """ gMonitor.addMark("RemoveFile", 1) res = gFileCatalogDB.removeFile(lfns, self.getRemoteCredentials()) if res["OK"]: gMonitor.addMark("RemoveFileSuccessful", len(res.get("Value", {}).get("Successful", []))) gMonitor.addMark("RemoveFileFailed", len(res.get("Value", {}).get("Failed", []))) return res
def execute(self): """ This agent is the smallest and (cutest) of all the DIRAC agents in existence. """ gMonitor.addMark("Iteration",1) central = PathFinder.getServiceURL("RequestManagement/centralURL") if central: self.central = central local = PathFinder.getServiceURL("RequestManagement/localURL") if local: self.local = local res = self.RequestDBClient.serveRequest(url=self.local) if not res['OK']: gLogger.error("ZuziaAgent.execute: Failed to get request from database.",self.local) return S_OK() elif not res['Value']: gLogger.info("ZuziaAgent.execute: No requests to be executed found.") return S_OK() gMonitor.addMark("Attempted",1) requestString = res['Value']['RequestString'] requestName = res['Value']['RequestName'] gLogger.info("ZuziaAgent.execute: Obtained request %s" % requestName) gLogger.info("ZuziaAgent.execute: Attempting to set request to %s." % self.central) res = self.RequestDBClient.setRequest(requestName,requestString,self.central) if res['OK']: gMonitor.addMark("Successful",1) gLogger.info("ZuziaAgent.execute: Successfully put request.") else: gMonitor.addMark("Failed",1) gLogger.error("ZuziaAgent.execute: Failed to set request to", self.central) gLogger.info("ZuziaAgent.execute: Attempting to set request to %s." % self.local) res = self.RequestDBClient.setRequest(requestName,requestString,self.local) if res['OK']: gLogger.info("ZuziaAgent.execute: Successfully put request.") else: gLogger.error("ZuziaAgent.execute: Failed to set request to", self.local) while not res['OK']: gLogger.info("ZuziaAgent.execute: Attempting to set request to anywhere.") res = self.RequestDBClient.setRequest(requestName,requestString) gLogger.info("ZuziaAgent.execute: Successfully put request to %s." % res["Server"]) return S_OK()
def __call__( self ): """ call me maybe """ # # counter for failed replicas print "YEAHHH" import time time.sleep( 2 ) failedReplicas = 0 # # catalog to use catalog = self.operation.Catalog # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: gMonitor.addMark( "RegisterReplicaAtt", 1 ) # # get LFN lfn = opFile.LFN # # and others replicaTuple = ( lfn , opFile.PFN, self.operation.targetSEList[0] ) # # call ReplicaManager registerReplica = self.replicaManager().registerReplica( replicaTuple, catalog ) # # check results if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]: gMonitor.addMark( "RegisterReplicaFail", 1 ) self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", catalog, "", "RegisterReplica" ) reason = registerReplica["Message"] if not registerReplica["OK"] else registerReplica["Value"]["Failed"][lfn] errorStr = "failed to register LFN %s: %s" % ( lfn, reason ) opFile.Error = errorStr self.log.warn( errorStr ) failedReplicas += 1 else: gMonitor.addMark( "RegisterReplicaOK", 1 ) self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaOK", catalog, "", "RegisterReplica" ) self.log.info( "Replica %s has been registered at %s" % ( lfn, catalog ) ) opFile.Status = "Done" # # final check if failedReplicas: self.log.info( "all replicas processed, %s replicas failed to register" % failedReplicas ) self.operation.Error = "some replicas failed to register" return S_ERROR( self.operation.Error ) return S_OK()
def __call__( self ): """ call me maybe """ # # counter for failed files failedFiles = 0 # # catalog to use catalog = self.operation.Catalog # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: gMonitor.addMark( "RegisterAtt", 1 ) # # get LFN lfn = opFile.LFN # # and others fileTuple = ( lfn , opFile.PFN, opFile.Size, self.operation.targetSEList[0], opFile.GUID, opFile.Checksum ) # # call ReplicaManager registerFile = self.replicaManager().registerFile( fileTuple, catalog ) # # check results if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]: gMonitor.addMark( "RegisterFail", 1 ) self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", catalog, "", "RegisterFile" ) reason = registerFile["Message"] if not registerFile["OK"] else registerFile["Value"]["Failed"][lfn] errorStr = "failed to register LFN %s: %s" % ( lfn, reason ) opFile.Error = errorStr self.log.warn( errorStr ) failedFiles += 1 else: gMonitor.addMark( "RegisterOK", 1 ) self.dataLoggingClient().addFileRecord( lfn, "Register", catalog, "", "RegisterFile" ) self.log.info( "file %s has been registered at %s" % ( lfn, catalog ) ) opFile.Status = "Done" # # final check if failedFiles: self.log.info( "all files processed, %s files failed to register" % failedFiles ) self.operation.Error = "some files failed to register" return S_ERROR( self.operation.Error ) return S_OK()
def __call__(self): """ call me maybe """ # # list of targetSE targetSEs = self.operation.targetSEList if len(targetSEs) != 1: self.log.error( "wrong TargetSE attribute, expecting one entry, got %s" % len(targetSEs)) self.operation.Error = "Wrongly formatted TargetSE" for opFile in self.operation: gMonitor.addMark("RegisterAtt", 1) gMonitor.addMark("RegisterFail", 1) self.dataLoggingClient().addFileRecord(opFile.LFN, "RegisterFail", self.operation.TargetSE, "", "RegisterFile") opFile.Error = "wrongly formatted targetSE" opFile.Status = "Failed" self.operation.Status = "Failed" return S_ERROR(self.operation.Error) targetSE = targetSEs[0] # # counter for failed files failedFiles = 0 # # catalog to use catalog = self.operation.Catalog # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: gMonitor.addMark("RegisterAtt", 1) # # get LFN lfn = opFile.LFN # # and others fileTuple = (lfn, opFile.PFN, opFile.Size, targetSE, opFile.GUID, opFile.Checksum) # # call ReplicaManager registerFile = self.replicaManager().registerFile( fileTuple, catalog) # # check results if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]: gMonitor.addMark("RegisterFail", 1) self.dataLoggingClient().addFileRecord(lfn, "RegisterFail", targetSE, "", "RegisterFile") reason = registerFile["Message"] if not registerFile[ "OK"] else registerFile["Value"]["Failed"][lfn] errorStr = "failed to register LFN %s: %s" % (lfn, reason) opFile.Error = errorStr self.log.warn(errorStr) failedFiles += 1 else: gMonitor.addMark("RegisterOK", 1) self.dataLoggingClient().addFileRecord(lfn, "Register", targetSE, "", "RegisterFile") self.log.info("file %s has been registered at %s" % (lfn, targetSE)) opFile.Status = "Done" # # final check if failedFiles: self.log.info("all files processed, %s files failed to register" % failedFiles) self.operation.Error = "some files failed to register" return S_ERROR(self.operation.Error) return S_OK()
def execute(self): ''' Main execution method ''' gMonitor.addMark('Iteration', 1) # Get all the transformations result = self.transClient.getTransformations({ 'Status': 'Active', 'Type': self.transformationTypes }) if not result['OK']: gLogger.error( "InputDataAgent.execute: Failed to get transformations.", result['Message']) return S_OK() # Process each transformation for transDict in result['Value']: transID = long(transDict['TransformationID']) res = self.transClient.getTransformationInputDataQuery(transID) if not res['OK']: if res['Message'] == 'No InputDataQuery found for transformation': gLogger.info( "InputDataAgent.execute: No input data query found for transformation %d" % transID) else: gLogger.error( "InputDataAgent.execute: Failed to get input data query for %d" % transID, res['Message']) continue inputDataQuery = res['Value'] if self.refreshonly: # Determine the correct time stamp to use for this transformation if self.timeLog.has_key(transID): if self.fullTimeLog.has_key(transID): # If it is more than a day since the last reduced query, make a full query just in case if (datetime.datetime.utcnow() - self.fullTimeLog[transID] ) < datetime.timedelta( seconds=self.fullUpdatePeriod): timeStamp = self.timeLog[transID] if self.dateKey: inputDataQuery[self.dateKey] = ( timeStamp - datetime.timedelta(seconds=10) ).strftime('%Y-%m-%d %H:%M:%S') else: gLogger.error( "DateKey was not set in the CS, cannot use the RefreshOnly" ) else: self.fullTimeLog[ transID] = datetime.datetime.utcnow() self.timeLog[transID] = datetime.datetime.utcnow() if not self.fullTimeLog.has_key(transID): self.fullTimeLog[transID] = datetime.datetime.utcnow() # Perform the query to the metadata catalog gLogger.verbose( "Using input data query for transformation %d: %s" % (transID, str(inputDataQuery))) start = time.time() result = self.metadataClient.findFilesByMetadata(inputDataQuery) rtime = time.time() - start gLogger.verbose("Metadata catalog query time: %.2f seconds." % (rtime)) if not result['OK']: gLogger.error( "InputDataAgent.execute: Failed to get response from the metadata catalog", result['Message']) continue lfnList = result['Value'] # Check if the number of files has changed since the last cycle nlfns = len(lfnList) gLogger.info( "%d files returned for transformation %d from the metadata catalog" % (nlfns, int(transID))) if self.fileLog.has_key(transID): if nlfns == self.fileLog[transID]: gLogger.verbose( 'No new files in metadata catalog since last check') self.fileLog[transID] = nlfns # Add any new files to the transformation addedLfns = [] if lfnList: gLogger.verbose('Processing %d lfns for transformation %d' % (len(lfnList), transID)) # Add the files to the transformation gLogger.verbose('Adding %d lfns for transformation %d' % (len(lfnList), transID)) result = self.transClient.addFilesToTransformation( transID, sortList(lfnList)) if not result['OK']: gLogger.warn( "InputDataAgent.execute: failed to add lfns to transformation", result['Message']) self.fileLog[transID] = 0 else: if result['Value']['Failed']: for lfn, error in res['Value']['Failed'].items(): gLogger.warn( "InputDataAgent.execute: Failed to add %s to transformation" % lfn, error) if result['Value']['Successful']: for lfn, status in result['Value']['Successful'].items( ): if status == 'Added': addedLfns.append(lfn) gLogger.info( "InputDataAgent.execute: Added %d files to transformation" % len(addedLfns)) return S_OK()
def dmTransfer(self, fromFTS=False): """ replicate and register using dataManager """ # # get waiting files. If none just return # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess') if not bannedSource["OK"]: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedSource if bannedSource["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.info(self.operation.Error) return S_OK(self.operation.Error) # # check targetSEs for write bannedTargets = self.checkSEsRSS() if not bannedTargets['OK']: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedTargets if bannedTargets['Value']: self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets['Value']) return S_OK(self.operation.Error) # Can continue now self.log.verbose("No targets banned for writing") waitingFiles = self.getWaitingFilesList() if not waitingFiles: return S_OK() # # loop over files if fromFTS: self.log.info( "Trying transfer using replica manager as FTS failed") else: self.log.info("Transferring files using Data manager...") for opFile in waitingFiles: gMonitor.addMark("ReplicateAndRegisterAtt", 1) opFile.Error = '' lfn = opFile.LFN # Check if replica is at the specified source replicas = self._filterReplicas(opFile) if not replicas["OK"]: self.log.error(replicas["Message"]) continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas['NoReplicas'] badReplicas = replicas['Bad'] noPFN = replicas['NoPFN'] if not validReplicas: gMonitor.addMark("ReplicateFail") if noMetaReplicas: self.log.warn( "unable to replicate '%s', couldn't get metadata at %s" % (opFile.LFN, ','.join(noMetaReplicas))) opFile.Error = "Couldn't get metadata" elif noReplicas: self.log.error( "unable to replicate %s, file doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas))) opFile.Error = 'No replicas found' opFile.Status = 'Failed' elif badReplicas: self.log.error( "unable to replicate %s, all replicas have a bad checksum at %s" % (opFile.LFN, ','.join(badReplicas))) opFile.Error = 'All replicas have a bad checksum' opFile.Status = 'Failed' elif noPFN: self.log.warn( "unable to replicate %s, could not get a PFN" % opFile.LFN) continue # # get the first one in the list if sourceSE not in validReplicas: if sourceSE: self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, validReplicas[0])) sourceSE = validReplicas[0] # # loop over targetSE catalog = self.operation.Catalog for targetSE in self.operation.targetSEList: # # call DataManager if targetSE in validReplicas: self.log.warn( "Request to replicate %s to an existing location: %s" % (lfn, targetSE)) opFile.Status = 'Done' continue res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalog) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn][ "replicate"] prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime) gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] prString += ' and registered in %s s.' % regTime self.log.info(prString) else: gMonitor.addMark("RegisterFail", 1) prString += " but failed to register" self.log.warn(prString) opFile.Error = "Failed to register" # # add register replica operation registerOperation = self.getRegisterOperation( opFile, targetSE, type='RegisterReplica') self.request.insertAfter( registerOperation, self.operation) else: self.log.error("failed to replicate %s to %s." % (lfn, targetSE)) gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error( "failed to replicate and register file %s at %s:" % (lfn, targetSE), reason) opFile.Error = reason else: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "DataManager error: %s" % res["Message"] self.log.error(opFile.Error) if not opFile.Error: if len(self.operation.targetSEList) > 1: self.log.info( "file %s has been replicated to all targetSEs" % lfn) opFile.Status = "Done" return S_OK()
def __updateMigrationAccounting(self, se, migratingFiles, matchingFiles, mismatchingFiles, assumedEndTime, previousMonitorTime): """ Create accounting messages for the overall throughput observed and the total migration time for the files """ allMigrated = matchingFiles + mismatchingFiles gMonitor.addMark("MigratedFiles%s" % se, len(allMigrated)) gMonitor.addMark("TotalMigratedFiles%s" % se, len(allMigrated)) lfnFileID = {} sizesToObtain = [] for fileID in allMigrated: if not migratingFiles[fileID]['Size']: lfn = migratingFiles[fileID]['LFN'] sizesToObtain.append(lfn) lfnFileID[lfn] = fileID if sizesToObtain: res = self.ReplicaManager.getCatalogFileSize(sizesToObtain) if not res['OK']: gLogger.error( "[%s] __updateMigrationAccounting: Failed to obtain file sizes" % se) return res for lfn, error in res['Value']['Failed'].items(): gLogger.error( "[%s] __updateAccounting: Failed to get file size" % se, "%s %s" % (lfn, error)) migratingFiles[lfnFileID[lfn]]['Size'] = 0 for lfn, size in res['Value']['Successful'].items(): migratingFiles[lfnFileID[lfn]]['Size'] = size totalSize = 0 for fileID in allMigrated: size = migratingFiles[fileID]['Size'] totalSize += size submitTime = migratingFiles[fileID]['SubmitTime'] timeDiff = submitTime - assumedEndTime migrationTime = (timeDiff.days * 86400) + (timeDiff.seconds) + ( timeDiff.microseconds / 1000000.0) gMonitor.addMark("MigrationTime%s" % se, migrationTime) gDataStoreClient.addRegister( self.__initialiseAccountingObject('MigrationTime', se, submitTime, assumedEndTime, size)) gDataStoreClient.addRegister( self.__initialiseAccountingObject('MigrationThroughput', se, previousMonitorTime, assumedEndTime, size)) oDataOperation = self.__initialiseAccountingObject( 'MigrationSuccess', se, submitTime, assumedEndTime, size) if fileID in mismatchingFiles: oDataOperation.setValueByKey('TransferOK', 0) oDataOperation.setValueByKey('FinalStatus', 'Failed') gDataStoreClient.addRegister(oDataOperation) gMonitor.addMark("TotalMigratedSize%s" % se, totalSize) gMonitor.addMark("ChecksumMismatches%s" % se, len(mismatchingFiles)) gMonitor.addMark("TotalChecksumMismatches%s" % se, len(mismatchingFiles)) gMonitor.addMark("ChecksumMatches%s" % se, len(matchingFiles)) gMonitor.addMark("TotalChecksumMatches%s" % se, len(matchingFiles)) if allMigrated: gLogger.info( '[%s] __updateMigrationAccounting: Attempting to send accounting message...' % se) return gDataStoreClient.commit() return S_OK()
def ftsTransfer(self): """ replicate and register using FTS """ self.log.info("scheduling files in FTS...") bannedTargets = self.checkSEsRSS() if not bannedTargets['OK']: gMonitor.addMark("FTSScheduleAtt") gMonitor.addMark("FTSScheduleFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value'])) # Can continue now self.log.verbose("No targets banned for writing") toSchedule = {} for opFile in self.getWaitingFilesList(): opFile.Error = '' gMonitor.addMark("FTSScheduleAtt") # # check replicas replicas = self._filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] if not replicas["Valid"] and replicas["Banned"]: self.log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) gMonitor.addMark("FTSScheduleFail") continue validReplicas = replicas["Valid"] bannedReplicas = replicas["Banned"] if not validReplicas and bannedReplicas: self.log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) gMonitor.addMark("FTSScheduleFail") continue if validReplicas: validTargets = list( set(self.operation.targetSEList) - set(validReplicas)) if not validTargets: self.log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" continue toSchedule[opFile.LFN] = [opFile, validReplicas, validTargets] res = self._addMetadataToFiles(toSchedule) if not res['OK']: return res else: filesToScheduleList = res['Value'] if filesToScheduleList: ftsSchedule = self.ftsClient.ftsSchedule( self.request.RequestID, self.operation.OperationID, filesToScheduleList) if not ftsSchedule["OK"]: self.log.error(ftsSchedule["Message"]) return ftsSchedule # might have nothing to schedule ftsSchedule = ftsSchedule["Value"] if not ftsSchedule: return S_OK() for fileID in ftsSchedule["Successful"]: gMonitor.addMark("FTSScheduleOK", 1) for opFile in self.operation: if fileID == opFile.FileID: opFile.Status = "Scheduled" self.log.always("%s has been scheduled for FTS" % opFile.LFN) for fileID, reason in ftsSchedule["Failed"]: gMonitor.addMark("FTSScheduleFail", 1) for opFile in self.operation: if fileID == opFile.FileID: opFile.Error = reason self.log.error("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error)) else: self.log.info("No files to schedule after metadata checks") # Just in case some transfers could not be scheduled, try them with RM return self.rmTransfer(fromFTS=True)
def execute( self ): ''' Main execution method ''' gMonitor.addMark( 'Iteration', 1 ) # Get all the transformations result = self.transClient.getTransformations( condDict = {'Status': self.transfStatuses }, timeout = 320 ) if not result['OK']: gLogger.error( "UpdateTransformationCounters.execute: Failed to get transformations.", result['Message'] ) return S_OK() # Process each transformation jobsStates = self.transClient.getTransformationCountersStatuses( 'Tasks' )['Value'] filesStates = self.transClient.getTransformationCountersStatuses( 'Files' )['Value'] for transDict in result['Value']: transID = long( transDict['TransformationID'] ) gLogger.debug( "Looking at transformationID %d" % transID ) counterDict = {} counterDict['TransformationID'] = transID #Take care of the Tasks' states gLogger.verbose( "Getting the tasks stats for Transformation %s" % transID ) res = self.transClient.getTransformationTaskStats( transID ) if not res['OK']: gLogger.warn( "Could not get Transformation Task Stats for transformation %s : %s" % ( transID, res['Message'] ) ) break else: taskDict = {} if res['Value']: taskDict = res['Value'] gLogger.verbose( "Got %s tasks dict for transformation %s" % ( str( taskDict ), transID ) ) for state in jobsStates: counterDict[state] = taskDict.get( state, 0 ) else: gLogger.warn( "No Task Statuses found" ) break #Now look for the files' states gLogger.verbose( "Getting the files stats for Transformation %s" % transID ) res = self.transClient.getTransformationStats( transID ) if not res['OK']: gLogger.warn( "Could not get Transformation Stats for transformation %s : %s" % ( transID, res['Message'] ) ) break else: fileDict = {} if res['Value']: fileDict = res['Value'] gLogger.debug( "Got %s file dict for transformation %s" % ( str( fileDict ), transID ) ) for state in filesStates: counterDict[state] = fileDict.get( state, 0 ) else: gLogger.warn( "No File Statuses found" ) break gLogger.verbose( "Updating the counters for transformation %s" % transID ) res = self.transClient.updateTransformationCounters( counterDict ) if not res['OK']: gLogger.error( "Failed updating counters for transformation %s: %s" % ( transID, res['Message'] ) ) else: gLogger.verbose( "Updated the counters of transformation %s" % transID ) return S_OK()
def rmTransfer(self): """ replicate and register using ReplicaManager """ self.log.info("transferring files using replica manager...") # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read sourceRead = self.rssSEStatus(sourceSE, "ReadAccess") if not sourceRead["OK"]: self.log.error(sourceRead["Message"]) for opFile in self.operation: opFile.Error = sourceRead["Message"] opFile.Status = "Failed" self.operation.Error = sourceRead["Message"] gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return sourceRead if not sourceRead["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.error(self.operation.Error) return S_ERROR(self.operation.Error) # # list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = [] for targetSE in targetSEs: writeStatus = self.rssSEStatus(targetSE, "WriteAccess") if not writeStatus["OK"]: self.log.error(writeStatus["Message"]) for opFile in self.operation: opFile.Error = "unknown targetSE: %s" % targetSE opFile.Status = "Failed" self.operation.Error = "unknown targetSE: %s" % targetSE return S_ERROR(self.operation.Error) if not writeStatus["Value"]: self.log.error("TargetSE %s in banned for writing right now" % targetSE) bannedTargets.append(targetSE) self.operation.Error += "banned targetSE: %s;" % targetSE # # some targets are banned? return if bannedTargets: return S_ERROR("%s targets are banned for writing" % ",".join(bannedTargets)) # # loop over targetSE for targetSE in targetSEs: # # check target SE targetWrite = self.rssSEStatus(targetSE, "WriteAccess") if not targetWrite["OK"]: self.log.error(targetWrite["Message"]) for opFile in self.operation: opFile.Error = targetWrite["Message"] opFile.Status = "Failed" self.operation.Error = targetWrite["Message"] return targetWrite if not targetWrite["Value"]: reason = "TargetSE %s is banned for writing" % targetSE self.log.error(reason) self.operation.Error = reason continue # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: gMonitor.addMark("ReplicateAndRegisterAtt", 1) lfn = opFile.LFN if not sourceSE: replicas = self._filterReplicas(opFile) if not replicas["OK"]: self.log.error(replicas["Message"]) continue replicas = replicas["Value"] if not replicas["Valid"]: self.log.warn("unable to find valid replicas for %s" % lfn) continue # # get the first one in the list sourceSE = replicas["Valid"][0] # # call ReplicaManager res = self.replicaManager().replicateAndRegister( lfn, targetSE, sourceSE=sourceSE) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn][ "replicate"] self.log.info("file %s replicated at %s in %s s." % (lfn, targetSE, repTime)) gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] self.log.info( "file %s registered at %s in %s s." % (lfn, targetSE, regTime)) else: gMonitor.addMark("RegisterFail", 1) self.log.info("failed to register %s at %s." % (lfn, targetSE)) opFile.Error = "Failed to register" opFile.Status = "Failed" # # add register replica operation self.addRegisterReplica(opFile, targetSE) else: self.log.info("failed to replicate %s to %s." % (lfn, targetSE)) gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error( "failed to replicate and register file %s at %s: %s" % (lfn, targetSE, reason)) opFile.Error = reason else: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "ReplicaManager error: %s" % res["Message"] self.log.error(opFile.Error) if not opFile.Error: self.log.info( "file %s has been replicated to all targetSEs" % lfn) opFile.Status = "Done" return S_OK()
def __call__(self): """ reTransfer operation execution """ # # list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal targetSE = targetSEs[0] bannedTargets = self.checkSEsRSS(targetSE) if not bannedTargets['OK']: gMonitor.addMark("FileReTransferAtt") gMonitor.addMark("FileReTransferFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value'])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare waiting files toRetransfer = dict([(opFile.PFN, opFile) for opFile in waitingFiles]) gMonitor.addMark("FileReTransferAtt", len(toRetransfer)) if len(targetSEs) != 1: error = "only one TargetSE allowed, got %d" % len(targetSEs) for opFile in toRetransfer.values(): opFile.Error = error opFile.Status = "Failed" self.operation.Error = error gMonitor.addMark("FileReTransferFail", len(toRetransfer)) return S_ERROR(error) se = StorageElement(targetSE) for opFile in toRetransfer.values(): reTransfer = se.retransferOnlineFile(opFile.LFN) if not reTransfer["OK"]: opFile.Error = reTransfer["Message"] self.log.error("Retransfer failed", opFile.Error) gMonitor.addMark("FileReTransferFail", 1) continue reTransfer = reTransfer["Value"] if opFile.LFN in reTransfer["Failed"]: opFile.Error = reTransfer["Failed"][opFile.LFN] self.log.error("Retransfer failed", opFile.Error) gMonitor.addMark("FileReTransferFail", 1) continue opFile.Status = "Done" self.log.info("%s retransfer done" % opFile.LFN) gMonitor.addMark("FileReTransferOK", 1) return S_OK()
def export_listDirectory(self, lfns, verbose): """ List the contents of supplied directories """ gMonitor.addMark('ListDirectory', 1) return gFileCatalogDB.listDirectory(lfns, self.getRemoteCredentials(), verbose=verbose)
def __call__( self ): """ LogUpload operation processing """ # # list of targetSEs targetSEs = self.operation.targetSEList if len( targetSEs ) != 1: self.log.error( "wrong value for TargetSE list = %s, should contain only one target!" % targetSEs ) self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE" for opFile in self.operation: opFile.Status = "Failed" opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE" gMonitor.addMark( "LogUploadAtt", 1 ) gMonitor.addMark( "LogUploadFail", 1 ) return S_ERROR( "TargetSE should contain only one target, got %s" % targetSEs ) targetSE = targetSEs[0] targetWrite = self.rssSEStatus( targetSE, "WriteAccess" ) if not targetWrite["OK"]: self.log.error( targetWrite["Message"] ) for opFile in self.operation: opFile.Status = "Failed" opFile.Error = "Wrong parameters: %s" % targetWrite["Message"] gMonitor.addMark( "LogUploadAtt", 1 ) gMonitor.addMark( "LogUploadFail", 1 ) self.operation.Error = targetWrite["Message"] return S_OK() if not targetWrite["Value"]: self.operation.Error = "TargetSE %s is banned for writing" return S_ERROR( self.operation.Error ) # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: # # get LFN lfn = opFile.LFN self.log.info( "processing file %s" % lfn ) gMonitor.addMark( "LogUploadAtt", 1 ) destinationFolder = '/'.join( lfn.split( '/' )[0:-1] ) destinationSubFolder = "%03d" % ( int(( os.path.basename( lfn ) ).split( '_' )[1].split( '.' )[0]) / 1000) destination = destinationFolder + '/' + destinationSubFolder logUpload = self.dm.replicate( lfn, targetSE, destPath = destination, localCache = self.workDirectory ) if not logUpload["OK"]: gMonitor.addMark( "LogUploadFail", 1 ) # self.dataLoggingClient().addFileRecord( lfn, "LogUploadFail", targetSE, "", "LogUpload" ) self.log.error( "completely failed to upload log file: %s" % logUpload["Message"] ) opFile.Error = str( logUpload["Message"] ) opFile.Attempt += 1 self.operation.Error = opFile.Error if 'No such file or directory' in opFile.Error: opFile.Status = 'Failed' continue if lfn in logUpload['Value']: gMonitor.addMark( "LogUploadOK", 1 ) # self.dataLoggingClient().addFileRecord( lfn, "LogUpload", targetSE, "", "LogUpload" ) opFile.Status = 'Done' self.log.info( "Uploaded %s to %s" % ( lfn, targetSE ) ) return S_OK()
def __call__(self): """ request processing """ self.log.debug("about to execute request") gMonitor.addMark("RequestAtt", 1) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: self.log.error(setupProxy["Message"]) self.request.Error = setupProxy["Message"] return self.updateRequest() shifter = setupProxy["Value"]["Shifter"] proxyFile = setupProxy["Value"]["ProxyFile"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error(operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation #%s '%s'" % (operation.Order, operation.Type)) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, handler["Message"])) gMonitor.addMark("%s%s" % (operation.Type, "Fail"), 1) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # # and execute try: gMonitor.addMark("%s%s" % (operation.Type, "Att"), 1) exe = handler() if not exe["OK"]: self.log.error("unable to process operation %s: %s" % (operation.Type, exe["Message"])) gMonitor.addMark("%s%s" % (operation.Type, "Fail"), 1) gMonitor.addMark("RequestFail", 1) except Exception, error: self.log.exception("hit by exception: %s" % str(error)) gMonitor.addMark("%s%s" % (operation.Type, "Fail"), 1) gMonitor.addMark("RequestFail", 1) break # # operation status check if operation.Status == "Done": gMonitor.addMark("%s%s" % (operation.Type, "OK"), 1) elif operation.Status == "Failed": gMonitor.addMark("%s%s" % (operation.Type, "Fail"), 1) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break
def execute(self): """ execution in one cycle """ now = datetime.datetime.utcnow() kickTime = now - datetime.timedelta(hours=self.KICK_GRACE_HOURS) rmTime = now - datetime.timedelta(days=self.DEL_GRACE_DAYS) # # kick statusList = ["Assigned"] requestIDsList = self.requestClient().getRequestIDsList( statusList, self.KICK_LIMIT) if not requestIDsList["OK"]: self.log.error("execute: %s" % requestIDsList["Message"]) return requestIDsList requestIDsList = requestIDsList["Value"] kicked = 0 for requestID, status, lastUpdate in requestIDsList: reqStatus = self.requestClient().getRequestStatus(requestID) if not reqStatus['OK']: self.log.error(("execute: unable to get request status", reqStatus['Message'])) continue status = reqStatus['Value'] if lastUpdate < kickTime and status == 'Assigned': getRequest = self.requestClient().peekRequest(requestID) if not getRequest["OK"]: self.log.error("execute: unable to read request '%s': %s" % (requestID, getRequest["Message"])) continue getRequest = getRequest["Value"] if getRequest and getRequest.LastUpdate < kickTime: self.log.info( "execute: kick assigned request (%s/'%s') in status %s" % (requestID, getRequest.RequestName, getRequest.Status)) putRequest = self.requestClient().putRequest(getRequest) if not putRequest["OK"]: self.log.error( "execute: unable to put request (%s/'%s'): %s" % (requestID, getRequest.RequestName, putRequest["Message"])) continue else: self.log.verbose("Kicked request %d" % putRequest['Value']) kicked += 1 # # delete statusList = ["Done", "Failed", "Canceled" ] if self.DEL_FAILED else ["Done"] requestIDsList = self.requestClient().getRequestIDsList( statusList, self.DEL_LIMIT) if not requestIDsList["OK"]: self.log.error("execute: %s" % requestIDsList["Message"]) return requestIDsList requestIDsList = requestIDsList["Value"] deleted = 0 for requestID, status, lastUpdate in requestIDsList: if lastUpdate < rmTime: self.log.info("execute: deleting request '%s' with status %s" % (requestID, status)) delRequest = self.requestClient().deleteRequest(requestID) if not delRequest["OK"]: self.log.error( "execute: unable to delete request '%s': %s" % (requestID, delRequest["Message"])) continue deleted += 1 gMonitor.addMark("KickedRequests", kicked) gMonitor.addMark("DeletedRequests", deleted) self.log.info("execute: kicked assigned requests = %s" % kicked) self.log.info("execute: deleted finished requests = %s" % deleted) return S_OK()
def __call__(self): """ remove replicas """ # # prepare list of targetSEs targetSEs = self.operation.targetSEList # # get waiting files waitingFiles = self.getWaitingFilesList() # # and prepare dict toRemoveDict = dict([(opFile.LFN, opFile) for opFile in waitingFiles]) self.log.info("found %s replicas to delete from %s sites" % (len(toRemoveDict), len(targetSEs))) gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # check targetSEs for removal bannedTargets = [] for targetSE in targetSEs: # removeStatus = self.rssSEStatus( targetSE, "RemoveAccess" ) removeStatus = self.rssSEStatus(targetSE, "WriteAccess") if not removeStatus["OK"]: self.log.error(removeStatus["Message"]) for opFile in self.operation: opFile.Error = "unknown targetSE: %s" % targetSE opFile.Status = "Failed" self.operation.Error = "unknown targetSE: %s" % targetSE return S_ERROR(self.operation.Error) if not removeStatus["Value"]: self.log.error("%s in banned for remove right now" % targetSE) bannedTargets.append(targetSE) self.operation.Error += "banned targetSE: %s;" % targetSE # # some targets are banned? return if bannedTargets: return S_ERROR("targets %s are banned for removal" % ",".join(bannedTargets)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error(bulkRemoval["Message"]) return bulkRemoval bulkRemoval = bulkRemoval["Value"] # # update removal status for successful files removalOK = [ opFile for opFile in bulkRemoval.values() if not opFile.Error ] for opFile in removalOK: removalStatus[opFile.LFN][targetSE] = "" gMonitor.addMark("RemoveReplicaOK", len(removalOK)) # # 2nd step - process the rest again toRetry = dict([(lfn, opFile) for lfn, opFile in bulkRemoval.items() if opFile.Error]) for lfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: gMonitor.addMark("RemoveReplicaOK", 1) removalStatus[lfn][targetSE] = "" else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = [ error for error in removalStatus[lfn].values() if error ] if errors: failed += 1 opFile.Error = ",".join(errors) if "Write access not permitted for this credential" in opFile.Error: opFile.Status = "Failed" continue opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed return S_OK()
def execute(self): """ execution in one cycle """ now = datetime.datetime.utcnow() kickTime = now - datetime.timedelta(hours=self.KICK_GRACE_HOURS) rmTime = now - datetime.timedelta(days=self.DEL_GRACE_DAYS) # # kick statusList = ["Assigned"] requestNamesList = self.requestClient().getRequestNamesList( statusList, self.DEL_LIMIT) if not requestNamesList["OK"]: self.log.error("execute: %s" % requestNamesList["Message"]) return requestNamesList requestNamesList = requestNamesList["Value"] kicked = 0 for requestName, status, lastUpdate in requestNamesList: if lastUpdate < kickTime: self.log.info("execute: kick assigned request '%s'" % requestName) getRequest = self.requestClient().peekRequest(requestName) if not getRequest["OK"]: self.log.error("execute: unable to read request '%s': %s" % (requestName, getRequest["Message"])) continue getRequest = getRequest["Value"] if getRequest: getRequest.Status = "Waiting" putRequest = self.requestClient().putRequest(getRequest) if not putRequest["OK"]: self.log.error( "execute: unable to put request '%s': %s" % (requestName, putRequest["Message"])) continue kicked += 1 # # delete statusList = ["Done", "Failed"] if self.DEL_FAILED else ["Done"] requestNamesList = self.requestClient().getRequestNamesList( statusList, self.DEL_LIMIT) if not requestNamesList["OK"]: self.log.error("execute: %s" % requestNamesList["Message"]) return requestNamesList requestNamesList = requestNamesList["Value"] deleted = 0 for requestName, status, lastUpdate in requestNamesList: if lastUpdate < rmTime: self.log.info("execute: deleting request '%s' with status %s" % (requestName, status)) delRequest = self.requestClient().deleteRequest(requestName) if not delRequest["OK"]: self.log.error( "execute: unable to delete request '%s': %s" % (requestName, delRequest["Message"])) continue deleted += 1 gMonitor.addMark("KickedRequests", kicked) gMonitor.addMark("DeletedRequests", deleted) self.log.info("execute: kicked assigned requests = %s" % kicked) self.log.info("execute: deleted finished requests = %s" % deleted) return S_OK()
def __call__(self): """ PutAndRegister operation processing """ # # list of targetSEs targetSEs = self.operation.targetSEList if len(targetSEs) != 1: self.log.error( "wrong value for TargetSE list = %s, should contain only one target!" % targetSEs) self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE" for opFile in self.operation: opFile.Status = "Failed" opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE" gMonitor.addMark("PutAtt", 1) gMonitor.addMark("PutFail", 1) return S_ERROR("TargetSE should contain only one target, got %s" % targetSEs) targetSE = targetSEs[0] bannedTargets = self.checkSEsRSS(targetSE) if not bannedTargets['OK']: gMonitor.addMark("PutAtt") gMonitor.addMark("PutFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value'])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # loop over files for opFile in waitingFiles: # # get LFN lfn = opFile.LFN self.log.info("processing file %s" % lfn) gMonitor.addMark("PutAtt", 1) pfn = opFile.PFN guid = opFile.GUID checksum = opFile.Checksum # # call RM at least putAndRegister = DataManager( catalogs=self.operation.Catalog).putAndRegister( lfn, pfn, targetSE, guid=guid, checksum=checksum) if not putAndRegister["OK"]: gMonitor.addMark("PutFail", 1) self.dataLoggingClient().addFileRecord(lfn, "PutFail", targetSE, "", "PutAndRegister") self.log.error( "completely failed to put and register file: %s" % putAndRegister["Message"]) opFile.Error = str(putAndRegister["Message"]) self.operation.Error = str(putAndRegister["Message"]) continue putAndRegister = putAndRegister["Value"] if lfn in putAndRegister["Failed"]: gMonitor.addMark("PutFail", 1) self.dataLoggingClient().addFileRecord(lfn, "PutFail", targetSE, "", "PutAndRegister") reason = putAndRegister["Failed"][lfn] self.log.error("failed to put and register file %s at %s: %s" % (lfn, targetSE, reason)) opFile.Error = str(reason) self.operation.Error = str(reason) continue putAndRegister = putAndRegister["Successful"] if lfn in putAndRegister: if "put" not in putAndRegister[lfn]: gMonitor.addMark("PutFail", 1) self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister") self.log.info("failed to put %s to %s" % (lfn, targetSE)) opFile.Error = "put failed" self.operation.Error = "put failed" continue if "register" not in putAndRegister[lfn]: gMonitor.addMark("PutOK", 1) gMonitor.addMark("RegisterFail", 1) self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister") self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", targetSE, "", "PutAndRegister") self.log.info("put of %s to %s took %s seconds" % (lfn, targetSE, putAndRegister[lfn]["put"])) self.log.error("register of %s to %s failed" % (lfn, targetSE)) opFile.Error = "failed to register %s at %s" % (lfn, targetSE) opFile.Status = "Failed" self.log.info(opFile.Error) registerOperation = self.getRegisterOperation( opFile, targetSE) self.request.insertAfter(registerOperation, self.operation) continue gMonitor.addMark("PutOK", 1) gMonitor.addMark("RegisterOK", 1) self.dataLoggingClient().addFileRecord(lfn, "Put", targetSE, "", "PutAndRegister") self.dataLoggingClient().addFileRecord(lfn, "Register", targetSE, "", "PutAndRegister") opFile.Status = "Done" for op in ("put", "register"): self.log.info("%s of %s to %s took %s seconds" % (op, lfn, targetSE, putAndRegister[lfn][op])) return S_OK()
def selectJob(self, resourceDescription): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self.__processResourceDescription(resourceDescription) credDict = self.getRemoteCredentials() #Check credentials if not generic pilot if Properties.GENERIC_PILOT in credDict['properties']: #You can only match groups in the same VO vo = Registry.getVOForGroup(credDict['group']) result = Registry.getGroupsForVO(vo) if result['OK']: resourceDict['OwnerGroup'] = result['Value'] else: #If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict['properties']: gLogger.notice("Setting the resource DN to the credentials DN") resourceDict['OwnerDN'] = credDict['DN'] #If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict['properties']: resourceDict['OwnerGroup'] = credDict['group'] gLogger.notice( "Setting the resource group to the credentials group") if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN'] != credDict['DN']: ownerDN = resourceDict['OwnerDN'] result = Registry.getGroupsForDN(resourceDict['OwnerDN']) if not result['OK'] or credDict['group'] not in result[ 'Value']: #DN is not in the same group! bad boy. gLogger.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN) resourceDict['OwnerDN'] = credDict['DN'] #Nothing special, group and DN have to be the same else: resourceDict['OwnerDN'] = credDict['DN'] resourceDict['OwnerGroup'] = credDict['group'] # Check the pilot DIRAC version if self.__opsHelper.getValue("Pilot/CheckVersion", True): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: return S_ERROR( 'Version check requested and not provided by Pilot') else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.__opsHelper.getValue("Pilot/Version", []) if validVersions and pilotVersion not in validVersions: return S_ERROR( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) #Check project if requested validProject = self.__opsHelper.getValue("Pilot/Project", "") if validProject: if 'ReleaseProject' not in resourceDict: return S_ERROR( "Version check requested but expected project %s not received" % validProject) if resourceDict['ReleaseProject'] != validProject: return S_ERROR( "Version check requested but expected project %s != received %s" % (validProject, resourceDict['ReleaseProject'])) # Update pilot information pilotInfoReported = False pilotReference = resourceDict.get('PilotReference', '') if pilotReference: if "PilotInfoReportedFlag" in resourceDict and not resourceDict[ 'PilotInfoReportedFlag']: gridCE = resourceDict.get('GridCE', 'Unknown') site = destination = resourceDict.get('Site', 'Unknown') benchmark = benchmark = resourceDict.get('PilotBenchmark', 0.0) gLogger.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference, gridCE, site, benchmark)) result = gPilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=gridCE, destination=site, benchmark=benchmark) if result['OK']: pilotInfoReported = True #Check the site mask if not 'Site' in resourceDict: return S_ERROR('Missing Site Name in Resource JDL') # Get common site mask and check the agent site result = gJobDB.getSiteMask(siteState='Active') if not result['OK']: return S_ERROR('Internal error: can not get site mask') maskList = result['Value'] siteName = resourceDict['Site'] if siteName not in maskList: if 'GridCE' not in resourceDict: return S_ERROR('Site not in mask and GridCE not specified') #Even if the site is banned, if it defines a CE, it must be able to check it del resourceDict['Site'] resourceDict['Setup'] = self.serviceInfoDict['clientSetup'] gLogger.verbose("Resource description:") for key in resourceDict: gLogger.verbose("%s : %s" % (key.rjust(20), resourceDict[key])) negativeCond = self.__limiter.getNegativeCondForSite(siteName) result = gTaskQueueDB.matchAndGetJob(resourceDict, negativeCond=negativeCond) if DEBUG: print result if not result['OK']: return result result = result['Value'] if not result['matchFound']: return S_ERROR('No match found') jobID = result['jobId'] resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status']) if not resAtt['OK']: return S_ERROR('Could not retrieve job attributes') if not resAtt['Value']: return S_ERROR('No attributes returned for job') if not resAtt['Value']['Status'] == 'Waiting': gLogger.error('Job %s matched by the TQ is not in Waiting state' % str(jobID)) result = gTaskQueueDB.deleteJob(jobID) if not result['OK']: return result return S_ERROR("Job %s is not in Waiting state" % str(jobID)) attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', siteName] result = gJobDB.setJobAttributes(jobID, attNames, attValues) # result = gJobDB.setJobStatus( jobID, status = 'Matched', minor = 'Assigned' ) result = gJobLoggingDB.addLoggingRecord(jobID, status='Matched', minor='Assigned', source='Matcher') result = gJobDB.getJobJDL(jobID) if not result['OK']: return S_ERROR('Failed to get the job JDL') resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime gLogger.info("Match time: [%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = gJobDB.getJobOptParameters(jobID) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup']) if not resAtt['OK']: return S_ERROR('Could not retrieve job attributes') if not resAtt['Value']: return S_ERROR('No attributes returned for job') if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True): self.__limiter.updateDelayCounters(siteName, jobID) # Report pilot-job association if pilotReference: result = gPilotAgentsDB.setCurrentJobID(pilotReference, jobID) result = gPilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = pilotInfoReported return S_OK(resultDict)
def ftsTransfer(self): """ replicate and register using FTS """ self.log.info("scheduling files in FTS...") bannedTargets = self.checkSEsRSS() if not bannedTargets['OK']: gMonitor.addMark("FTSScheduleAtt") gMonitor.addMark("FTSScheduleFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value'])) # Can continue now self.log.verbose("No targets banned for writing") toSchedule = {} for opFile in self.getWaitingFilesList(): opFile.Error = '' gMonitor.addMark("FTSScheduleAtt") # # check replicas replicas = self._filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas['NoReplicas'] badReplicas = replicas['Bad'] noPFN = replicas['NoPFN'] if validReplicas: validTargets = list( set(self.operation.targetSEList) - set(validReplicas)) if not validTargets: self.log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" else: toSchedule[opFile.LFN] = [ opFile, validReplicas, validTargets ] else: gMonitor.addMark("FTSScheduleFail") if noMetaReplicas: self.log.warn( "unable to schedule '%s', couldn't get metadata at %s" % (opFile.LFN, ','.join(noMetaReplicas))) opFile.Error = "Couldn't get metadata" elif noReplicas: self.log.error( "unable to schedule %s, file doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas))) opFile.Error = 'No replicas found' opFile.Status = 'Failed' elif badReplicas: self.log.error( "unable to schedule %s, all replicas have a bad checksum at %s" % (opFile.LFN, ','.join(badReplicas))) opFile.Error = 'All replicas have a bad checksum' opFile.Status = 'Failed' elif noPFN: self.log.warn( "unable to schedule %s, could not get a PFN at %s" % (opFile.LFN, ','.join(noPFN))) res = self._addMetadataToFiles(toSchedule) if not res['OK']: return res else: filesToScheduleList = res['Value'] if filesToScheduleList: ftsSchedule = self.ftsClient.ftsSchedule( self.request.RequestID, self.operation.OperationID, filesToScheduleList) if not ftsSchedule["OK"]: self.log.error("Completely failed to schedule to FTS:", ftsSchedule["Message"]) return ftsSchedule # might have nothing to schedule ftsSchedule = ftsSchedule["Value"] if not ftsSchedule: return S_OK() self.log.info("%d files have been scheduled to FTS" % len(ftsSchedule['Successful'])) for opFile in self.operation: fileID = opFile.FileID if fileID in ftsSchedule["Successful"]: gMonitor.addMark("FTSScheduleOK", 1) opFile.Status = "Scheduled" self.log.debug("%s has been scheduled for FTS" % opFile.LFN) elif fileID in ftsSchedule["Failed"]: gMonitor.addMark("FTSScheduleFail", 1) opFile.Error = ftsSchedule["Failed"][fileID] if 'sourceSURL equals to targetSURL' in opFile.Error: # In this case there is no need to continue opFile.Status = 'Failed' self.log.warn("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error)) else: self.log.info("No files to schedule after metadata checks") # Just in case some transfers could not be scheduled, try them with RM return self.dmTransfer(fromFTS=True)
def rmTransfer(self, fromFTS=False): """ replicate and register using ReplicaManager """ # # get waiting files. If none just return waitingFiles = self.getWaitingFilesList() if not waitingFiles: return S_OK() if fromFTS: self.log.info( "Trying transfer using replica manager as FTS failed") else: self.log.info("Transferring files using replica manager...") # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read sourceRead = self.rssSEStatus(sourceSE, "ReadAccess") if not sourceRead["OK"]: self.log.info(sourceRead["Message"]) for opFile in self.operation: opFile.Error = sourceRead["Message"] opFile.Status = "Failed" self.operation.Error = sourceRead["Message"] gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return sourceRead if not sourceRead["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.info(self.operation.Error) return S_OK(self.operation.Error) # # check targetSEs for write bannedTargets = self.checkSEsRSS() if not bannedTargets['OK']: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value'])) # Can continue now self.log.verbose("No targets banned for writing") # # loop over files for opFile in waitingFiles: gMonitor.addMark("ReplicateAndRegisterAtt", 1) opFile.Error = '' lfn = opFile.LFN # Check if replica is at the specified source replicas = self._filterReplicas(opFile) if not replicas["OK"]: self.log.error(replicas["Message"]) continue replicas = replicas["Value"] if not replicas["Valid"]: self.log.warn("unable to find valid replicas for %s" % lfn) continue # # get the first one in the list if sourceSE not in replicas['Valid']: if sourceSE: self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, replicas["Valid"][0])) sourceSE = replicas["Valid"][0] # # loop over targetSE for targetSE in self.operation.targetSEList: # # call ReplicaManager if targetSE == sourceSE: self.log.warn( "Request to replicate %s to the source SE: %s" % (lfn, sourceSE)) continue res = self.rm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn][ "replicate"] prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime) gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] prString += ' and registered in %s s.' % regTime self.log.info(prString) else: gMonitor.addMark("RegisterFail", 1) prString += " but failed to register" self.log.warn(prString) opFile.Error = "Failed to register" opFile.Status = "Failed" # # add register replica operation registerOperation = self.getRegisterOperation( opFile, targetSE) self.request.insertAfter( registerOperation, self.operation) else: self.log.error("failed to replicate %s to %s." % (lfn, targetSE)) gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error( "failed to replicate and register file %s at %s: %s" % (lfn, targetSE, reason)) opFile.Error = reason else: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "ReplicaManager error: %s" % res["Message"] self.log.error(opFile.Error) if not opFile.Error: if len(self.operation.targetSEList) > 1: self.log.info( "file %s has been replicated to all targetSEs" % lfn) opFile.Status = "Done" return S_OK()
def execute( self ): subRequestAttributes = self.argumentsDict.get( 'SubRequestAttributes', {} ) if not subRequestAttributes: return S_ERROR( "SubRequestAttributes not supplied" ) subRequestFiles = self.argumentsDict.get( 'Files', {} ) if not subRequestFiles: return S_ERROR( "Files not supplied" ) targetSE = subRequestAttributes.get( 'TargetSE', '' ) if not targetSE: return S_ERROR( "TargetSE not supplied" ) catalog = subRequestAttributes.get( 'Catalogue', '' ) status = subRequestAttributes.get( 'Status', 'Waiting' ) if status != 'Waiting': return S_OK() for subRequestFile in subRequestFiles: lfn = str( subRequestFile.get( 'LFN', '' ) ) if not lfn: self.log.error( "LFN not supplied" ) continue file = subRequestFile.get( 'PFN', '' ) if not file: self.log.error( "PFN not supplied" ) continue addler = subRequestFile.get( 'Addler', '' ) guid = subRequestFile.get( 'GUID', '' ) status = subRequestFile.get( 'Status', 'Waiting' ) if status != 'Waiting': self.log.info( "%s in %s status and not Waiting" % ( lfn, status ) ) continue gMonitor.addMark( "Put and register", 1 ) self.__getReplicaManager() res = self.rm.putAndRegister( lfn, file, targetSE, guid = guid, checksum = addler, catalog = catalog ) if not res['OK']: gMonitor.addMark( "Put failed", 1 ) self.log.error( "Failed to put file", res['Message'] ) continue successful = res['Value']['Successful'] failed = res['Value']['Failed'] if ( not successful.has_key( lfn ) ) or ( not successful[lfn].has_key( 'put' ) ): gMonitor.addMark( "Put failed", 1 ) self.log.error( "Failed to put file" ) continue if successful[lfn].has_key( 'put' ): gMonitor.addMark( "Put successful", 1 ) gLogger.info( "Successfully put %s to %s in %s seconds." % ( lfn, targetSE, successful[lfn]['put'] ) ) subRequestFile['Status'] = 'Done' if successful[lfn].has_key( 'register' ): gMonitor.addMark( "File registration successful", 1 ) gLogger.info( "Successfully registered %s to %s in %s seconds." % ( lfn, targetSE, successful[lfn]['register'] ) ) else: gMonitor.addMark( "File registration failed", 1 ) gLogger.error( "Failed to register %s to %s." % ( lfn, targetSE ) ) fileDict = failed[lfn]['register'] registerRequestDict = {'Attributes':{'TargetSE': fileDict['TargetSE'], 'Operation':'registerFile'}, 'Files':[{'LFN': fileDict['LFN'], 'PFN':fileDict['PFN'], 'Size':fileDict['Size'], 'Addler':fileDict['Addler'], 'GUID':fileDict['GUID']}]} print registerRequestDict return S_OK()
def __call__(self): """ reTransfer operation execution """ # # list of targetSEs targetSEs = self.operation.targetSEList # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare waiting files toRetransfer = dict([(opFile.PFN, opFile) for opFile in waitingFiles]) gMonitor.addMark("FileReTransferAtt", len(toRetransfer)) if len(targetSEs) != 1: error = "only one TargetSE allowed, got %s" % len(targetSEs) for opFile in toRetransfer.values(): opFile.Error = error opFile.Status = "Failed" self.operation.Error = error gMonitor.addMark("FileReTransferFail", len(toRetransfer)) return S_ERROR(error) # # check targetSEs for removal targetSE = targetSEs[0] writeStatus = self.rssSEStatus(targetSE, "Write") if not writeStatus["OK"]: self.log.error(writeStatus["Message"]) for opFile in self.operation: opFile.Error = "unknown targetSE: %s" % targetSE opFile.Status = "Failed" self.operation.Error = "unknown targetSE: %s" % targetSE gMonitor.addMark("FileReTransferFail", len(toRetransfer)) return S_ERROR(self.operation.Error) if not writeStatus["Value"]: self.log.error("TargetSE %s in banned for writing right now" % targetSE) self.operation.Error += "banned targetSE: %s;" % targetSE gMonitor.addMark("FileReTransferFail", len(toRetransfer)) return S_ERROR(self.operation.Error) for opFile in toRetransfer.values(): reTransfer = self.replicaManager().onlineRetransfer( targetSE, opFile.PFN) if not reTransfer["OK"]: opFile.Error = reTransfer["Message"] self.log.error("%s retransfer failed: %s" % opFile.Error) gMonitor.addMark("FileReTransferFail", 1) continue reTransfer = reTransfer["Value"] if opFile.PFN in reTransfer["Failed"]: opFile.Error = reTransfer["Failed"][opFile.PFN] self.log.error("%s retransfer failed: %s" % opFile.Error) gMonitor.addMark("FileReTransferFail", 1) continue opFile.Status = "Done" self.log.info("%s retransfer done" % opFile.LFN) gMonitor.addMark("FileReTransferOK", 1) return S_OK()
def execute( self ): """ read requests from RequestClient and enqueue them into ProcessPool """ gMonitor.addMark( "Iteration", 1 ) # # requests (and so tasks) counter taskCounter = 0 while taskCounter < self.__requestsPerCycle: self.log.debug( "execute: executing %d request in this cycle" % taskCounter ) requestsToExecute = [] if not self.__bulkRequest: self.log.info( "execute: ask for a single request" ) getRequest = self.requestClient().getRequest() if not getRequest["OK"]: self.log.error( "execute: %s" % getRequest["Message"] ) break if not getRequest["Value"]: self.log.info( "execute: no more 'Waiting' requests to process" ) break requestsToExecute = [getRequest["Value"] ] else: numberOfRequest = min( self.__bulkRequest, self.__requestsPerCycle - taskCounter ) self.log.info( "execute: ask for %s requests" % numberOfRequest ) getRequests = self.requestClient().getBulkRequests( numberOfRequest ) if not getRequests["OK"]: self.log.error( "execute: %s" % getRequests["Message"] ) break if not getRequests["Value"]: self.log.info( "execute: no more 'Waiting' requests to process" ) break for rId in getRequests["Value"]["Failed"]: self.log.error( "execute: %s" % getRequests["Value"]["Failed"][rId] ) requestsToExecute = getRequests["Value"]["Successful"].values() self.log.info( "execute: will execute %s requests " % len( requestsToExecute ) ) for request in requestsToExecute: # # set task id taskID = request.RequestID # # save current request in cache self.cacheRequest( request ) # # serialize to JSON result = request.toJSON() if not result['OK']: continue requestJSON = result['Value'] self.log.info( "processPool tasks idle = %s working = %s" % ( self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses() ) ) looping = 0 while True: if not self.processPool().getFreeSlots(): if not looping: self.log.info( "No free slots available in processPool, will wait %d seconds to proceed" % self.__poolSleep ) time.sleep( self.__poolSleep ) looping += 1 else: if looping: self.log.info( "Free slot found after %d seconds" % looping * self.__poolSleep ) looping = 0 self.log.info( "spawning task for request '%s/%s'" % ( request.RequestID, request.RequestName ) ) timeOut = self.getTimeout( request ) enqueue = self.processPool().createAndQueueTask( RequestTask, kwargs = { "requestJSON" : requestJSON, "handlersDict" : self.handlersDict, "csPath" : self.__configPath, "agentName": self.agentName }, taskID = taskID, blocking = True, usePoolCallbacks = True, timeOut = timeOut ) if not enqueue["OK"]: self.log.error( enqueue["Message"] ) else: self.log.debug( "successfully enqueued task '%s'" % taskID ) # # update monitor gMonitor.addMark( "Processed", 1 ) # # update request counter taskCounter += 1 # # task created, a little time kick to proceed time.sleep( 0.1 ) break # # clean return return S_OK()
def selectJob(self, resourceDescription): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self.__processResourceDescription(resourceDescription) credDict = self.getRemoteCredentials() #Check credentials if Properties.GENERIC_PILOT not in credDict['properties']: #Not a generic pilot and requires a DN??? This smells fishy if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN'] != credDict['DN']: ownerDN = resourceDict['OwnerDN'] if Properties.JOB_SHARING in credDict['properties']: #Job sharing, is the DN in the same group? result = Registry.getGroupsForDN(ownerDN) if not result['OK']: return S_ERROR( "Requested owner DN %s does not have any group!" % ownerDN) groups = result['Value'] if credDict['group'] not in groups: #DN is not in the same group! bad body. gLogger.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN) resourceDict['OwnerDN'] = credDict['DN'] else: #No generic pilot and not JobSharing? DN has to be the same! gLogger.notice("You can only match jobs for your DN (%s)" % credDict['DN']) resourceDict['OwnerDN'] = credDict['DN'] if Properties.PILOT not in credDict['properties']: #No pilot? Group has to be the same! if 'OwnerGroup' in resourceDict and resourceDict[ 'OwnerGroup'] != credDict['group']: gLogger.notice( "You can only match jobs for your group (%s)" % credDict['group']) resourceDict['OwnerGroup'] = credDict['group'] # Check the pilot DIRAC version if self.checkPilotVersion: if not 'DIRACVersion' in resourceDict: return S_ERROR( 'Version check requested and not provided by Pilot') # Check if the matching Request provides a VirtualOrganization if 'VirtualOrganization' in resourceDict: voName = resourceDict['VirtualOrganization'] # Check if the matching Request provides an OwnerGroup elif 'OwnerGroup' in resourceDict: voName = Registry.getVOForGroup(resourceDict['OwnerGroup']) # else take the default VirtualOrganization for the installation else: voName = Registry.getVOForGroup('') self.pilotVersion = gConfig.getValue( '/Operations/%s/%s/Versions/PilotVersion' % (voName, self.setup), '') if self.pilotVersion and resourceDict[ 'DIRACVersion'] != self.pilotVersion: return S_ERROR( 'Pilot version does not match the production version %s:%s' % \ ( resourceDict['DIRACVersion'], self.pilotVersion ) ) # Get common site mask and check the agent site result = gJobDB.getSiteMask(siteState='Active') if result['OK']: maskList = result['Value'] else: return S_ERROR('Internal error: can not get site mask') if not 'Site' in resourceDict: return S_ERROR('Missing Site Name in Resource JDL') siteName = resourceDict['Site'] if resourceDict['Site'] not in maskList: if 'GridCE' in resourceDict: del resourceDict['Site'] else: return S_ERROR('Site not in mask and GridCE not specified') resourceDict['Setup'] = self.serviceInfoDict['clientSetup'] if DEBUG: print "Resource description:" for key, value in resourceDict.items(): print key.rjust(20), value # Check if Job Limits are imposed onto the site extraConditions = {} if self.siteJobLimits: result = self.getExtraConditions(siteName) if result['OK']: extraConditions = result['Value'] if extraConditions: gLogger.info('Job Limits for site %s are: %s' % (siteName, str(extraConditions))) result = gTaskQueueDB.matchAndGetJob(resourceDict, extraConditions=extraConditions) if DEBUG: print result if not result['OK']: return result result = result['Value'] if not result['matchFound']: return S_ERROR('No match found') jobID = result['jobId'] resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status']) if not resAtt['OK']: return S_ERROR('Could not retrieve job attributes') if not resAtt['Value']: return S_ERROR('No attributes returned for job') if not resAtt['Value']['Status'] == 'Waiting': gLogger.error('Job %s matched by the TQ is not in Waiting state' % str(jobID)) result = gTaskQueueDB.deleteJob(jobID) result = gJobDB.setJobStatus(jobID, status='Matched', minor='Assigned') result = gJobLoggingDB.addLoggingRecord(jobID, status='Matched', minor='Assigned', source='Matcher') result = gJobDB.getJobJDL(jobID) if not result['OK']: return S_ERROR('Failed to get the job JDL') resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime gLogger.info("Match time: [%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = gJobDB.getJobOptParameters(jobID) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = gJobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup']) if not resAtt['OK']: return S_ERROR('Could not retrieve job attributes') if not resAtt['Value']: return S_ERROR('No attributes returned for job') resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] return S_OK(resultDict)
def MigratingToMigrated(self, se): """ Obtain the active files from the migration monitoring db and check their status """ # First get the migrating files from the database gLogger.info( "[%s] MigratingToMigrated: Attempting to obtain 'Migrating' files." % se) res = self.__getFiles(se, 'Migrating') if not res['OK']: gLogger.error( "[%s] MigratingToMigrated: Failed to get 'Migrating' files." % se, res['Message']) return res pfnIDs = res['Value']['PFNIDs'] if not pfnIDs: gLogger.info( "[%s] MigratingToMigrated: Found no 'Migrating' files." % se) return S_OK() migratingFiles = res['Value']['Files'] gLogger.info("[%s] MigratingToMigrated: Found %d 'Migrating' files." % (se, len(pfnIDs))) gMonitor.addMark("MigratingFiles%s" % se, len(pfnIDs)) gLogger.info( "[%s] MigratingToMigrated: Obtaining physical file metadata for 'Migrating' files." % se) startTime = datetime.datetime.utcnow() res = self.__getMigratedFiles(se, pfnIDs.keys()) if not res['OK']: gLogger.error( "[%s] MigratingToMigrated: Failed to get 'Migrating' file metadata." % se, res['Message']) return res assumedEndTime = datetime.datetime.utcnow() - ( (datetime.datetime.utcnow() - startTime) / 2 ) # Assumed that the files are found migrated midway through obtaining the metadata previousMonitorTime = self.lastMonitors[se] self.lastMonitors[se] = datetime.datetime.utcnow() terminal = res['Value']['Terminal'] migrated = res['Value']['Migrated'] # Update the problematic files in the integrity DB and update the MigrationMonitoringDB gLogger.info( "[%s] MigratingToMigrated: Found %d terminally failed files." % (se, len(terminal))) if terminal: replicaTuples = [] terminalFileIDs = [] for pfn, prognosis in terminal.items(): fileID = pfnIDs[pfn] terminalFileIDs.append(fileID) lfn = migratingFiles[fileID]['LFN'] se = migratingFiles[fileID]['SE'] replicaTuples.append((lfn, pfn, se, prognosis)) self.__reportProblematicReplicas(replicaTuples) res = self.MigrationMonitoringDB.setMigratingReplicaStatus( terminalFileIDs, 'Failed') if not res['OK']: gLogger.error( "[%s] MigratingToMigrated: Failed to update terminal files." % se, res['Message']) # Update the migrated files and send accounting gLogger.info("[%s] MigratingToMigrated: Found %d migrated files." % (se, len(migrated))) if migrated: migratedFileIDs = {} for pfn, checksum in migrated.items(): migratedFileIDs[pfnIDs[pfn]] = checksum #res = self.MigrationMonitoringDB.setMigratingReplicaStatus(migratedFileIDs.keys(),'Migrated') #if not res['OK']: # gLogger.error("[%s] MigratingToMigrated: Failed to update migrated files." % se, res['Message']) # Check the checksums of the migrated files res = self.__validateChecksums(se, migratedFileIDs, migratingFiles) if not res['OK']: gLogger.error( "[%s] MigratingToMigrated: Failed to perform checksum matching." % se, res['Message']) matchingFiles = [] mismatchingFiles = [] else: matchingFiles = res['Value']['MatchingFiles'] mismatchingFiles = res['Value']['MismatchFiles'] # Create and send the accounting messages res = self.__updateMigrationAccounting(se, migratingFiles, matchingFiles, mismatchingFiles, assumedEndTime, previousMonitorTime) if not res['OK']: gLogger.error( "[%s] MigratingToMigrated: Failed to send accounting for migrated files." % se, res['Message']) return S_OK()
def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict
def __call__( self ): """ request processing """ self.log.debug( "about to execute request" ) gMonitor.addMark( "RequestAtt", 1 ) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: self.request.Error = setupProxy["Message"] if 'has no proxy registered' in setupProxy["Message"]: self.log.error( 'Request set to Failed:', setupProxy["Message"] ) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = 'Failed' operation.Status = 'Failed' else: self.log.error( setupProxy["Message"] ) return S_OK( self.request ) shifter = setupProxy["Value"]["Shifter"] proxyFile = setupProxy["Value"]["ProxyFile"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error( operation["Message"] ) return operation operation = operation["Value"] self.log.info( "executing operation #%s '%s'" % ( operation.Order, operation.Type ) ) # # and handler for it handler = self.getHandler( operation ) if not handler["OK"]: self.log.error( "unable to process operation %s: %s" % ( operation.Type, handler["Message"] ) ) # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 ) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # # and execute pluginName = self.getPluginName( self.handlersDict.get( operation.Type ) ) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: gMonitor.addMark( "%s%s" % ( pluginName, "Att" ), 1 ) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' ) if not exe["OK"]: self.log.error( "unable to process operation %s: %s" % ( operation.Type, exe["Message"] ) ) if pluginName: gMonitor.addMark( "%s%s" % ( pluginName, "Fail" ), 1 ) gMonitor.addMark( "RequestFail", 1 ) if self.request.JobID: # Check if the job exists monitorServer = RPCClient( "WorkloadManagement/JobMonitoring", useCertificates = True ) res = monitorServer.getJobPrimarySummary( int( self.request.JobID ) ) if not res["OK"]: self.log.error( "RequestTask: Failed to get job %d status" % self.request.JobID ) elif not res['Value']: self.log.warn( "RequestTask: job %d does not exist (anymore): failed request" % self.request.JobID ) for opFile in operation: opFile.Status = 'Failed' if operation.Status != 'Failed': operation.Status = 'Failed' self.request.Error = 'Job no longer exists' except Exception, error: self.log.exception( "hit by exception: %s" % str( error ) ) if pluginName: gMonitor.addMark( "%s%s" % ( pluginName, "Fail" ), 1 ) gMonitor.addMark( "RequestFail", 1 ) if useServerCertificate: gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' ) break # # operation status check if operation.Status == "Done" and pluginName: gMonitor.addMark( "%s%s" % ( pluginName, "OK" ), 1 ) elif operation.Status == "Failed" and pluginName: gMonitor.addMark( "%s%s" % ( pluginName, "Fail" ), 1 ) elif operation.Status in ( "Waiting", "Scheduled" ): # # no update for waiting or all files scheduled break
def __call__( self ): """ perform physical removal operation """ # # prepare targetSE list targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = [] for targetSE in targetSEs: removeStatus = self.rssSEStatus( targetSE, "Remove" ) if not removeStatus["OK"]: self.log.error( removeStatus["Message"] ) for opFile in self.operation: opFile.Error = "unknown targetSE: %s" % targetSE opFile.Status = "Failed" self.operation.Error = "unknown targetSE: %s" % targetSE return S_ERROR( self.operation.Error ) if not removeStatus["Value"]: self.log.error( "%s in banned for remove right now" % targetSE ) bannedTargets.append( targetSE ) self.operation.Error += "banned targetSE: %s;" % targetSE # # some targets are banned? return if bannedTargets: return S_ERROR( "targets %s are banned for removal" % ",".join( bannedTargets ) ) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare pfn dict toRemoveDict = dict( [ ( opFile.PFN, opFile ) for opFile in waitingFiles ] ) gMonitor.addMark( "PhysicalRemovalAtt", len( toRemoveDict ) * len( targetSEs ) ) # # keep errors dict removalStatus = dict.fromkeys( toRemoveDict.keys(), None ) for pfn in removalStatus: removalStatus[pfn] = dict.fromkeys( targetSEs, "" ) for targetSE in targetSEs: self.log.info( "removing files from %s" % targetSE ) # # 1st - bulk removal bulkRemoval = self.bulkRemoval( toRemoveDict, targetSE ) if not bulkRemoval["OK"]: self.log.error( bulkRemoval["Message"] ) self.operation.Error = bulkRemoval["Message"] return bulkRemoval bulkRemoval = bulkRemoval["Value"] for pfn, opFile in toRemoveDict.items(): removalStatus[pfn][targetSE] = bulkRemoval["Failed"].get( pfn, "" ) opFile.Error = removalStatus[pfn][targetSE] # # 2nd - single file removal toRetry = dict( [ ( pfn, opFile ) for pfn, opFile in toRemoveDict.items() if pfn in bulkRemoval["Failed"] ] ) for pfn, opFile in toRetry.items(): self.singleRemoval( opFile, targetSE ) if not opFile.Error: removalStatus[pfn][targetSE] = "" else: gMonitor.addMark( "PhysicalRemovalFail", 1 ) removalStatus[pfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = [ error for error in removalStatus[opFile.PFN].values() if error.strip() ] if errors: failed += 1 opFile.Error = ",".join( errors ) if "Write access not permitted for this credential" in opFile.Error: opFile.Status = "Failed" gMonitor.addMark( "PhysicalRemovalFail", len( errors ) ) continue gMonitor.addMark( "PhysicalRemovalOK", len( targetSEs ) ) gMonitor.addMark( "PhysicalRemovalSize", opFile.Size * len( targetSEs ) ) opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s files" % failed return S_OK()
def sendNumTaskQueues(): result = taskQueueDB.getNumTaskQueues() if result[ 'OK' ]: gMonitor.addMark( 'numTQs', result[ 'Value' ] ) else: gLogger.error( "Cannot get the number of task queues", result[ 'Message' ] )
def __call__( self ): """ reTransfer operation execution """ # # list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal targetSE = targetSEs[0] bannedTargets = self.checkSEsRSS( targetSE ) if not bannedTargets['OK']: gMonitor.addMark( "FileReTransferAtt" ) gMonitor.addMark( "FileReTransferFail" ) return bannedTargets if bannedTargets['Value']: return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) ) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare waiting files toRetransfer = dict( [ ( opFile.PFN, opFile ) for opFile in waitingFiles ] ) gMonitor.addMark( "FileReTransferAtt", len( toRetransfer ) ) if len( targetSEs ) != 1: error = "only one TargetSE allowed, got %d" % len( targetSEs ) for opFile in toRetransfer.values(): opFile.Error = error opFile.Status = "Failed" self.operation.Error = error gMonitor.addMark( "FileReTransferFail", len( toRetransfer ) ) return S_ERROR( error ) se = StorageElement( targetSE ) for opFile in toRetransfer.values(): reTransfer = se.retransferOnlineFile( opFile.PFN ) if not reTransfer["OK"]: opFile.Error = reTransfer["Message"] self.log.error( "%s retransfer failed: %s" % opFile.Error ) gMonitor.addMark( "FileReTransferFail", 1 ) continue reTransfer = reTransfer["Value"] if opFile.PFN in reTransfer["Failed"]: opFile.Error = reTransfer["Failed"][opFile.PFN] self.log.error( "%s retransfer failed: %s" % opFile.Error ) gMonitor.addMark( "FileReTransferFail", 1 ) continue opFile.Status = "Done" self.log.info( "%s retransfer done" % opFile.LFN ) gMonitor.addMark( "FileReTransferOK", 1 ) return S_OK()
def selectJob( self, resourceDescription ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() # Check and form the resource description dictionary resourceDict = {} if type( resourceDescription ) in StringTypes: classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): return S_ERROR( 'Illegal Resource JDL' ) gLogger.verbose( classAdAgent.asJDL() ) for name in taskQueueDB.getSingleValueTQDefFields(): if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in taskQueueDB.getMultiValueMatchFields(): if classAdAgent.lookupAttribute( name ): resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) if classAdAgent.lookupAttribute( 'DIRACVersion' ): resourceDict['DIRACVersion'] = classAdAgent.getAttributeString( 'DIRACVersion' ) else: for name in taskQueueDB.getSingleValueTQDefFields(): if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] for name in taskQueueDB.getMultiValueMatchFields(): if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] if resourceDescription.has_key( 'JobID' ): resourceDict['JobID'] = resourceDescription['JobID'] if resourceDescription.has_key( 'DIRACVersion' ): resourceDict['DIRACVersion'] = resourceDescription['DIRACVersion'] # Check the pilot DIRAC version if self.checkPilotVersion: if 'DIRACVersion' in resourceDict: if self.pilotVersion and resourceDict['DIRACVersion'] != self.pilotVersion: return S_ERROR( 'Pilot version does not match the production version %s:%s' % \ ( resourceDict['DIRACVersion'], self.pilotVersion ) ) # Get common site mask and check the agent site result = jobDB.getSiteMask( siteState = 'Active' ) if result['OK']: maskList = result['Value'] else: return S_ERROR( 'Internal error: can not get site mask' ) if not 'Site' in resourceDict: return S_ERROR( 'Missing Site Name in Resource JDL' ) siteName = resourceDict['Site'] if resourceDict['Site'] not in maskList: if 'GridCE' in resourceDict: del resourceDict['Site'] else: return S_ERROR( 'Site not in mask and GridCE not specified' ) resourceDict['Setup'] = self.serviceInfoDict['clientSetup'] if DEBUG: print "Resource description:" for k, v in resourceDict.items(): print k.rjust( 20 ), v # Check if Job Limits are imposed onto the site extraConditions = {} if self.siteJobLimits: result = self.getExtraConditions( siteName ) if result['OK']: extraConditions = result['Value'] if extraConditions: gLogger.info( 'Job Limits for site %s are: %s' % ( siteName, str( extraConditions ) ) ) result = taskQueueDB.matchAndGetJob( resourceDict, extraConditions = extraConditions ) if DEBUG: print result if not result['OK']: return result result = result['Value'] if not result['matchFound']: return S_ERROR( 'No match found' ) jobID = result['jobId'] resAtt = jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: return S_ERROR( 'Could not retrieve job attributes' ) if not resAtt['Value']: return S_ERROR( 'No attributes returned for job' ) if not resAtt['Value']['Status'] == 'Waiting': gLogger.error( 'Job %s matched by the TQ is not in Waiting state' % str( jobID ) ) result = taskQueueDB.deleteJob( jobID ) result = jobDB.setJobStatus( jobID, status = 'Matched', minor = 'Assigned' ) result = jobLoggingDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) result = jobDB.getJobJDL( jobID ) if not result['OK']: return S_ERROR( 'Failed to get the job JDL' ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime gLogger.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: return S_ERROR( 'Could not retrieve job attributes' ) if not resAtt['Value']: return S_ERROR( 'No attributes returned for job' ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] return S_OK( resultDict )
def execute_request(self): """ Takes one DISET request and forward it to the destination service """ gMonitor.addMark("Iteration", 1) if self.RequestDB: res = self.RequestDB.getRequest('diset') else: res = self.RequestDBClient.getRequest('diset', url=self.local) if not res['OK']: gLogger.error( "DISETForwardingAgent.execute: Failed to get request from database.", self.local) return S_OK() elif not res['Value']: gLogger.info( "DISETForwardingAgent.execute: No requests to be executed found." ) return S_OK() gMonitor.addMark("Attempted", 1) requestString = res['Value']['RequestString'] requestName = res['Value']['RequestName'] try: jobID = int(res['Value']['JobID']) except: jobID = 0 gLogger.info("DISETForwardingAgent.execute: Obtained request %s" % requestName) if self.RequestDB: result = self.RequestDB._getRequestAttribute( 'RequestID', requestName=requestName) if not result['OK']: return S_OK('Can not get the request execution order') requestID = result['Value'] result = self.RequestDB.getCurrentExecutionOrder(requestID) else: result = self.RequestDBClient.getCurrentExecutionOrder( requestName, self.local) if result['OK']: currentOrder = result['Value'] else: return S_OK('Can not get the request execution order') oRequest = RequestContainer(request=requestString) requestAttributes = oRequest.getRequestAttributes()['Value'] ################################################ # Find the number of sub-requests from the request res = oRequest.getNumSubRequests('diset') if not res['OK']: errStr = "DISETForwardingAgent.execute: Failed to obtain number of diset subrequests." gLogger.error(errStr, res['Message']) return S_OK() gLogger.info( "DISETForwardingAgent.execute: Found %s sub requests for job %s" % (res['Value'], jobID)) ################################################ # For all the sub-requests in the request modified = False for ind in range(res['Value']): subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'diset')['Value'] subExecutionOrder = int(subRequestAttributes['ExecutionOrder']) subStatus = subRequestAttributes['Status'] gLogger.info( "DISETForwardingAgent.execute: Processing sub-request %s with execution order %d" % (ind, subExecutionOrder)) if subStatus == 'Waiting' and subExecutionOrder <= currentOrder: operation = subRequestAttributes['Operation'] gLogger.info( "DISETForwardingAgent.execute: Attempting to forward %s type." % operation) rpcStubString = subRequestAttributes['Arguments'] rpcStub, length = DEncode.decode(rpcStubString) res = executeRPCStub(rpcStub) if res['OK']: gLogger.info( "DISETForwardingAgent.execute: Successfully forwarded." ) oRequest.setSubRequestStatus(ind, 'diset', 'Done') gMonitor.addMark("Successful", 1) modified = True elif res['Message'] == 'No Matching Job': gLogger.warn( "DISETForwardingAgent.execute: No corresponding job found. Setting to done." ) oRequest.setSubRequestStatus(ind, 'diset', 'Done') else: gLogger.error( "DISETForwardingAgent.execute: Failed to forward request.", res['Message']) else: gLogger.info( "DISETForwardingAgent.execute: Sub-request %s is status '%s' and not to be executed." % (ind, subRequestAttributes['Status'])) ################################################ # Generate the new request string after operation requestString = oRequest.toXML()['Value'] if self.RequestDB: res = self.RequestDB.updateRequest(requestName, requestString) else: res = self.RequestDBClient.updateRequest(requestName, requestString, self.local) if res['OK']: gLogger.info( "DISETForwardingAgent.execute: Successfully updated request.") else: gLogger.error( "DISETForwardingAgent.execute: Failed to update request to", self.local) if modified and jobID: result = self.RequestDBClient.finalizeRequest( requestName, jobID, self.local) return S_OK()
def execute(self): subRequestAttributes = self.argumentsDict.get('SubRequestAttributes', {}) if not subRequestAttributes: return S_ERROR("SubRequestAttributes not supplied") subRequestFiles = self.argumentsDict.get('Files', {}) if not subRequestFiles: return S_ERROR("Files not supplied") targetSE = subRequestAttributes.get('TargetSE', '') if not targetSE: return S_ERROR("TargetSE not supplied") catalog = subRequestAttributes.get('Catalogue', '') status = subRequestAttributes.get('Status', 'Waiting') if status != 'Waiting': return S_OK() for subRequestFile in subRequestFiles: lfn = str(subRequestFile.get('LFN', '')) if not lfn: self.log.error("LFN not supplied") continue file = subRequestFile.get('PFN', '') if not file: self.log.error("PFN not supplied") continue addler = subRequestFile.get('Addler', '') guid = subRequestFile.get('GUID', '') status = subRequestFile.get('Status', 'Waiting') if status != 'Waiting': self.log.info("%s in %s status and not Waiting" % (lfn, status)) continue gMonitor.addMark("Put and register", 1) self.__getReplicaManager() res = self.rm.putAndRegister(lfn, file, targetSE, guid=guid, checksum=addler, catalog=catalog) if not res['OK']: gMonitor.addMark("Put failed", 1) self.log.error("Failed to put file", res['Message']) continue successful = res['Value']['Successful'] failed = res['Value']['Failed'] if (not successful.has_key(lfn)) or ( not successful[lfn].has_key('put')): gMonitor.addMark("Put failed", 1) self.log.error("Failed to put file") continue if successful[lfn].has_key('put'): gMonitor.addMark("Put successful", 1) gLogger.info("Successfully put %s to %s in %s seconds." % (lfn, targetSE, successful[lfn]['put'])) subRequestFile['Status'] = 'Done' if successful[lfn].has_key('register'): gMonitor.addMark("File registration successful", 1) gLogger.info( "Successfully registered %s to %s in %s seconds." % (lfn, targetSE, successful[lfn]['register'])) else: gMonitor.addMark("File registration failed", 1) gLogger.error("Failed to register %s to %s." % (lfn, targetSE)) fileDict = failed[lfn]['register'] registerRequestDict = { 'Attributes': { 'TargetSE': fileDict['TargetSE'], 'Operation': 'registerFile' }, 'Files': [{ 'LFN': fileDict['LFN'], 'PFN': fileDict['PFN'], 'Size': fileDict['Size'], 'Addler': fileDict['Addler'], 'GUID': fileDict['GUID'] }] } print registerRequestDict return S_OK()
def htmlPageTitle(): gMonitor.addMark( "pagesServed" ) path = currentPath() return gWebConfig.getPageTitle( path )
def __call__(self): """ remove replicas """ # # prepare list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = self.checkSEsRSS(targetSEs, access='RemoveAccess') if not bannedTargets['OK']: gMonitor.addMark("RemoveReplicaAtt") gMonitor.addMark("RemoveReplicaFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets['Value'])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # and prepare dict toRemoveDict = dict([(opFile.LFN, opFile) for opFile in waitingFiles]) self.log.info("todo: %s replicas to delete from %s sites" % (len(toRemoveDict), len(targetSEs))) gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error('Bulk replica removal failed', bulkRemoval["Message"]) return bulkRemoval bulkRemoval = bulkRemoval["Value"] # # update removal status for successful files removalOK = [ opFile for opFile in bulkRemoval.values() if not opFile.Error ] for opFile in removalOK: removalStatus[opFile.LFN][targetSE] = "" gMonitor.addMark("RemoveReplicaOK", len(removalOK)) # # 2nd step - process the rest again toRetry = dict([(lfn, opFile) for lfn, opFile in bulkRemoval.items() if opFile.Error]) for lfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: gMonitor.addMark("RemoveReplicaOK", 1) removalStatus[lfn][targetSE] = "" else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = list( set([ error for error in removalStatus[lfn].values() if error ])) if errors: opFile.Error = ",".join(errors) # This seems to be the only offending error if "Write access not permitted for this credential" in opFile.Error: failed += 1 continue opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed return S_OK()
gMonitor.flush() if error: return S_ERROR( error ) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info( 'updating request with status %s' % self.request.Status ) update = self.updateRequest() if not update["OK"]: self.log.error( update["Message"] ) return update self.log.info( "request '%s' is done" % self.request.RequestName ) gMonitor.addMark( "RequestOK", 1 ) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestName, self.request.JobID ) if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request %s: %s, will retry" % ( self.request.RequestName, finalizeRequest["Message"] ) ) self.log.verbose( "Waiting 10 seconds" ) attempts += 1 if attempts == 10: self.log.error( "giving up finalize request after %d attempts" % attempts ) return S_ERROR( 'Could not finalize request' )
def __call__(self): """ perform physical removal operation """ bannedTargets = self.checkSEsRSS(access='RemoveAccess') if not bannedTargets['OK']: gMonitor.addMark("PhysicalRemovalAtt") gMonitor.addMark("PhysicalRemovalFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets['Value'])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare pfn dict toRemoveDict = dict([(opFile.PFN, opFile) for opFile in waitingFiles]) targetSEs = self.operation.targetSEList gMonitor.addMark("PhysicalRemovalAtt", len(toRemoveDict) * len(targetSEs)) # # keep errors dict removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for pfn in removalStatus: removalStatus[pfn] = dict.fromkeys(targetSEs, "") for targetSE in targetSEs: self.log.info("removing files from %s" % targetSE) # # 1st - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error(bulkRemoval["Message"]) self.operation.Error = bulkRemoval["Message"] return bulkRemoval bulkRemoval = bulkRemoval["Value"] for pfn, opFile in toRemoveDict.items(): removalStatus[pfn][targetSE] = bulkRemoval["Failed"].get( pfn, "") opFile.Error = removalStatus[pfn][targetSE] # # 2nd - single file removal toRetry = dict([(pfn, opFile) for pfn, opFile in toRemoveDict.items() if pfn in bulkRemoval["Failed"]]) for pfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: removalStatus[pfn][targetSE] = "" else: gMonitor.addMark("PhysicalRemovalFail", 1) removalStatus[pfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = [ error for error in removalStatus[opFile.PFN].values() if error.strip() ] if errors: failed += 1 opFile.Error = ",".join(errors) if "Write access not permitted for this credential" in opFile.Error: opFile.Status = "Failed" gMonitor.addMark("PhysicalRemovalFail", len(errors)) continue gMonitor.addMark("PhysicalRemovalOK", len(targetSEs)) gMonitor.addMark("PhysicalRemovalSize", opFile.Size * len(targetSEs)) opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s files" % failed return S_OK()
def execute( self ): ''' Main execution method ''' gMonitor.addMark( 'Iteration', 1 ) # Get all the transformations result = self.transClient.getTransformations( {'Status' : 'Active', 'Type' : self.transformationTypes } ) if not result['OK']: gLogger.error( "InputDataAgent.execute: Failed to get transformations.", result['Message'] ) return S_OK() # Process each transformation for transDict in result['Value']: transID = long( transDict['TransformationID'] ) res = self.transClient.getTransformationInputDataQuery( transID ) if not res['OK']: if res['Message'] == 'No InputDataQuery found for transformation': gLogger.info( "InputDataAgent.execute: No input data query found for transformation %d" % transID ) else: gLogger.error( "InputDataAgent.execute: Failed to get input data query for %d" % transID, res['Message'] ) continue inputDataQuery = res['Value'] if self.refreshonly: # Determine the correct time stamp to use for this transformation if self.timeLog.has_key( transID ): if self.fullTimeLog.has_key( transID ): # If it is more than a day since the last reduced query, make a full query just in case if ( datetime.datetime.utcnow() - self.fullTimeLog[transID] ) < datetime.timedelta( seconds = self.fullUpdatePeriod ): timeStamp = self.timeLog[transID] if self.dateKey: inputDataQuery[self.dateKey] = ( timeStamp - datetime.timedelta( seconds = 10 ) ).strftime( '%Y-%m-%d %H:%M:%S' ) else: gLogger.error( "DateKey was not set in the CS, cannot use the RefreshOnly" ) else: self.fullTimeLog[transID] = datetime.datetime.utcnow() self.timeLog[transID] = datetime.datetime.utcnow() if not self.fullTimeLog.has_key( transID ): self.fullTimeLog[transID] = datetime.datetime.utcnow() # Perform the query to the metadata catalog gLogger.verbose( "Using input data query for transformation %d: %s" % ( transID, str( inputDataQuery ) ) ) start = time.time() result = self.metadataClient.findFilesByMetadata( inputDataQuery ) rtime = time.time() - start gLogger.verbose( "Metadata catalog query time: %.2f seconds." % ( rtime ) ) if not result['OK']: gLogger.error( "InputDataAgent.execute: Failed to get response from the metadata catalog", result['Message'] ) continue lfnList = result['Value'] # Check if the number of files has changed since the last cycle nlfns = len( lfnList ) gLogger.info( "%d files returned for transformation %d from the metadata catalog" % ( nlfns, int( transID ) ) ) if self.fileLog.has_key( transID ): if nlfns == self.fileLog[transID]: gLogger.verbose( 'No new files in metadata catalog since last check' ) self.fileLog[transID] = nlfns # Add any new files to the transformation addedLfns = [] if lfnList: gLogger.verbose( 'Processing %d lfns for transformation %d' % ( len( lfnList ), transID ) ) # Add the files to the transformation gLogger.verbose( 'Adding %d lfns for transformation %d' % ( len( lfnList ), transID ) ) result = self.transClient.addFilesToTransformation( transID, sorted( lfnList ) ) if not result['OK']: gLogger.warn( "InputDataAgent.execute: failed to add lfns to transformation", result['Message'] ) self.fileLog[transID] = 0 else: if result['Value']['Failed']: for lfn, error in res['Value']['Failed'].items(): gLogger.warn( "InputDataAgent.execute: Failed to add %s to transformation" % lfn, error ) if result['Value']['Successful']: for lfn, status in result['Value']['Successful'].items(): if status == 'Added': addedLfns.append( lfn ) gLogger.info( "InputDataAgent.execute: Added %d files to transformation" % len( addedLfns ) ) return S_OK()
def ftsTransfer(self): """ replicate and register using FTS """ self.log.info("scheduling files...") targetSEs = self.operation.targetSEList for targetSE in targetSEs: writeStatus = self.rssSEStatus(targetSE, "WriteAccess") if not writeStatus["OK"]: self.log.error(writeStatus["Message"]) for opFile in self.operation: opFile.Error = "unknown targetSE: %s" % targetSE opFile.Status = "Failed" self.operation.Error = "unknown targetSE: %s" % targetSE return S_ERROR(self.operation.Error) toSchedule = [] for opFile in self.getWaitingFilesList(): gMonitor.addMark("FTSScheduleAtt", 1) # # check replicas replicas = self._filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] if not replicas["Valid"] and replicas["Banned"]: self.log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) gMonitor.addMark("FTSScheduleFail", 1) continue validReplicas = replicas["Valid"] bannedReplicas = replicas["Banned"] if not validReplicas and bannedReplicas: self.log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) gMonitor.addMark("FTSScheduleFail", 1) continue if validReplicas: validTargets = list( set(self.operation.targetSEList) - set(validReplicas)) if not validTargets: self.log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" continue toSchedule.append( (opFile.toJSON()["Value"], validReplicas, validTargets)) if toSchedule: self.log.info("found %s files to schedule" % len(toSchedule)) ftsSchedule = self.ftsClient().ftsSchedule( self.request.RequestID, self.operation.OperationID, toSchedule) if not ftsSchedule["OK"]: self.log.error(ftsSchedule["Message"]) return ftsSchedule ftsSchedule = ftsSchedule["Value"] for fileID in ftsSchedule["Successful"]: gMonitor.addMark("FTSScheduleOK", 1) for opFile in self.operation: if fileID == opFile.FileID: opFile.Status = "Scheduled" self.log.always("%s has been scheduled for FTS" % opFile.LFN) for fileID, reason in ftsSchedule["Failed"]: gMonitor.addMark("FTSScheduleFail", 1) for opFile in self.operation: if fileID == opFile.FileID: opFile.Error = reason self.log.error("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error)) return S_OK()