class RegisterFile(OperationHandlerBase):
    """
    .. class:: RegisterFile

    RegisterFile operation handler

    :param self: self reference
    :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance
    :param str csPath: CS path for this handler
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor"""
        OperationHandlerBase.__init__(self, operation, csPath)

    def __call__(self):
        """register the waiting files in the catalog(s)"""
        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # RegisterFile specific monitor info
            gMonitor.registerActivity("RegisterAtt", "Attempted file registrations",
                                      "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK", "Successful file registrations",
                                      "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail", "Failed file registrations",
                                      "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM)

        # # counter for failed files
        failedFiles = 0
        # # catalog(s) to use
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(',')]
        dm = DataManager(catalogs=catalogs)
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        for opFile in waitingFiles:
            if not self.rmsMonitoring:
                gMonitor.addMark("RegisterAtt", 1)
            # # get LFN
            lfn = opFile.LFN
            # # and others
            fileTuple = (lfn, opFile.PFN, opFile.Size, self.operation.targetSEList[0], opFile.GUID, opFile.Checksum)
            # # call DataManager
            registerFile = dm.registerFile(fileTuple)
            # # check results
            if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("RegisterFail", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "RegisterFail", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")
                reason = str(
                    registerFile.get("Message", registerFile.get("Value", {}).get("Failed", {}).get(lfn, 'Unknown')))
                errorStr = "failed to register LFN"
                opFile.Error = "%s: %s" % (errorStr, reason)
                if 'GUID already registered' in reason:
                    opFile.Status = 'Failed'
                    self.log.error(errorStr, "%s: %s" % (lfn, reason))
                elif 'File already registered with no replicas' in reason:
                    self.log.warn(errorStr, "%s: %s, will remove it and retry" % (lfn, reason))
                    dm.removeFile(lfn)
                else:
                    self.log.warn(errorStr, "%s: %s" % (lfn, reason))
                failedFiles += 1
            else:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("RegisterOK", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "Register", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")
                self.log.verbose("file %s has been registered at %s" %
                                 (lfn, ','.join(catalogs) if catalogs else "all catalogs"))
                opFile.Status = "Done"

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # # final check
        if failedFiles:
            self.log.warn("all files processed, %s files failed to register" % failedFiles)
            self.operation.Error = "some files failed to register"
            return S_ERROR(self.operation.Error)
        return S_OK()
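
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the shape of the
# registration tuple handed to DataManager.registerFile above. The LFN, PFN,
# SE name, GUID and checksum values below are made up for the example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    fileTuple = (
        "/vo/user/j/jdoe/data.root",                      # lfn (opFile.LFN)
        "srm://se.example.org/vo/user/j/jdoe/data.root",  # pfn (opFile.PFN)
        1048576,                                          # size in bytes (opFile.Size)
        "SE-DISK",                                        # target SE (targetSEList[0])
        "8DEA3C4B-6E7A-4E9B-9C43-8F1A2B3C4D5E",           # guid (opFile.GUID)
        "ad0be12f",                                       # checksum (opFile.Checksum)
    )
    lfn, pfn, size, se, guid, checksum = fileTuple
    print("registering %s (%d bytes) at %s" % (lfn, size, se))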
class StatesMonitoringAgent(AgentModule):
    """
    The specific agents must provide the following methods:

    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    __summaryKeyFieldsMapping = ['Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
                                 'ApplicationStatus', 'MinorStatus']
    __summaryDefinedFields = [('ApplicationStatus', 'unset'), ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}
    __jobDBFields = []

    jobDB = None
    monitoringReporter = None
    reportPeriod = None

    def initialize(self):
        """Standard initialization"""
        self.jobDB = JobDB()
        self.reportPeriod = 120
        self.am_setOption("PollingTime", self.reportPeriod)
        self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory")
        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """Main execution method"""
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" % ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the JobDB summary", result['Message'])
        else:
            values = result['Value'][1]
            self.log.info("Start sending records!")
            for record in values:
                recordSetup = record[0]
                if recordSetup not in validSetups:
                    self.log.error("Setup %s is not valid" % recordSetup)
                    continue
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)
            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info("The records were successfully sent to the Store!")
            else:
                self.log.warn("Failed to insert the records! It will be retried in the next iteration",
                              retVal['Message'])
        return S_OK()
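
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how a JobDB snapshot
# row is turned into a WMSHistory record by the loops in execute() above. The
# helper name and the fake row below are assumptions chosen for the example;
# the field lists mirror the class attributes.
# ---------------------------------------------------------------------------
def _rowToRecord(row, keyFields, valueFields, defined, rename, timestamp):
    """Replay the mapping logic of StatesMonitoringAgent.execute on one row."""
    record = dict(defined)                    # defaults first, e.g. 'unset'
    for name, value in zip(keyFields, row):   # key fields, possibly renamed
        record[rename.get(name, name)] = value
    for name, value in zip(valueFields, row[len(keyFields):]):
        record[name] = int(value)             # numeric summary fields
    record['timestamp'] = timestamp
    return record

if __name__ == "__main__":
    # Made-up row: Status..MinorStatus (8 key fields), then Jobs, Reschedules
    row = ('Running', 'LCG.CERN.ch', 'jdoe', 'dirac_user', 'none', 'User',
           'unset', 'unset', '42', '3')
    print(_rowToRecord(
        row,
        keyFields=['Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
                   'ApplicationStatus', 'MinorStatus'],
        valueFields=['Jobs', 'Reschedules'],
        defined={'ApplicationStatus': 'unset', 'MinorStatus': 'unset'},
        rename={'JobType': 'JobSplitType'},   # note the rename in the output
        timestamp=1700000000,
    ))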
class TornadoServer(object):
    """
    Tornado webserver

    Initialize and run an HTTPS server for DIRAC services.
    By default it loads all HTTPS services defined in the CS, but you can also give an explicit list.

    The listening port is either:

    * given as parameter
    * loaded from the CS ``/Systems/Tornado/<instance>/Port``
    * defaulted to 8443

    Example 1: Easy way to start tornado::

      # Initialize server and load services
      serverToLaunch = TornadoServer()

      # Start listening when ready
      serverToLaunch.startTornado()

    Example 2: We want to debug service1 and service2 only, and use another port for that::

      services = ['component/service1:port1', 'component/service2']
      endpoints = ['component/endpoint1', 'component/endpoint2']
      serverToLaunch = TornadoServer(services=services, endpoints=endpoints, port=1234)
      serverToLaunch.startTornado()
    """

    def __init__(self, services=True, endpoints=False, port=None):
        """C'tor

        :param list services: (default True) List of service handlers to load.
            If ``True``, loads all described in the CS.
            If ``False``, do not load services.
        :param list endpoints: (default False) List of endpoint handlers to load.
            If ``True``, loads all described in the CS.
            If ``False``, do not load endpoints.
        :param int port: Port to listen to.
            If ``None``, the port is resolved following the logic described in the class documentation.
        """
        self.__startTime = time.time()
        # Application metadata, routes and settings mapping on the ports
        self.__appsSettings = {}
        # Default port, if another one is not given
        if port is None:
            port = gConfig.getValue("/Systems/Tornado/%s/Port" % PathFinder.getSystemInstance("Tornado"), 8443)
        self.port = port

        # Handler manager initialization with default settings
        self.handlerManager = HandlerManager(services, endpoints)

        # temp value for computation, used by the monitoring
        self.__report = None
        # Last update time stamp
        self.__monitorLastStatsUpdate = None
        self.__monitoringLoopDelay = 60  # In secs

        self.activityMonitoring = False
        if "Monitoring" in Operations().getMonitoringBackends(monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True

        # If services are defined, load only these ones (useful for debug purpose or specific services)
        retVal = self.handlerManager.loadServicesHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError("Some services can't be loaded, check the service names and configuration.")
        # Response time to load services
        self.__elapsedTime = time.time() - self.__startTime

        retVal = self.handlerManager.loadEndpointsHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError("Some endpoints can't be loaded, check the endpoint names and configuration.")

    def __calculateAppSettings(self):
        """Calculate application information mapping on the ports"""
        # if no service list is given, load services from configuration
        handlerDict = self.handlerManager.getHandlersDict()
        for data in handlerDict.values():
            port = data.get("Port") or self.port
            for hURL in data["URLs"]:
                if port not in self.__appsSettings:
                    self.__appsSettings[port] = {"routes": [], "settings": {}}
                if hURL not in self.__appsSettings[port]["routes"]:
                    self.__appsSettings[port]["routes"].append(hURL)
        return bool(self.__appsSettings)

    def loadServices(self, services):
        """Load services

        :param services: List of service handlers to load.
            Default value set at initialization.
            If ``True``, loads all services from the CS.
        :type services: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadServicesHandlers(services)

    def loadEndpoints(self, endpoints):
        """Load endpoints

        :param endpoints: List of endpoint handlers to load.
            Default value set at initialization.
            If ``True``, loads all endpoints from the CS.
        :type endpoints: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadEndpointsHandlers(endpoints)

    def addHandlers(self, routes, settings=None, port=None):
        """Add new routes

        :param list routes: routes
        :param dict settings: application settings
        :param int port: port
        """
        port = port or self.port
        if port not in self.__appsSettings:
            self.__appsSettings[port] = {"routes": [], "settings": {}}
        if settings:
            self.__appsSettings[port]["settings"].update(settings)
        for route in routes:
            if route not in self.__appsSettings[port]["routes"]:
                self.__appsSettings[port]["routes"].append(route)
        return S_OK()

    def startTornado(self):
        """
        Starts the Tornado server when ready.
        This method never returns.
        """
        # If there are no services loaded:
        if not self.__calculateAppSettings():
            raise Exception("There are no services loaded, please check your configuration")

        sLog.debug("Starting Tornado")

        # Prepare SSL settings
        certs = Locations.getHostCertificateAndKeyLocation()
        if certs is False:
            sLog.fatal("Host certificates not found! Can't start the server")
            raise ImportError("Unable to load certificates")
        ca = Locations.getCAsLocation()
        ssl_options = {
            "certfile": certs[0],
            "keyfile": certs[1],
            "cert_reqs": M2Crypto.SSL.verify_peer,
            "ca_certs": ca,
            "sslDebug": DEBUG_M2CRYPTO,  # Set to true if you want to see the TLS debug messages
        }

        # Init monitoring
        if self.activityMonitoring:
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

            self.activityMonitoringReporter = MonitoringReporter(monitoringType="ServiceMonitoring")
            self.__monitorLastStatsUpdate = time.time()
            self.__report = self.__startReportToMonitoringLoop()
            # Starting monitoring, IOLoop waiting time in ms, __monitoringLoopDelay is defined in seconds.
            # NB: PeriodicCallback expects a callable, so wrap the call in a lambda
            # instead of invoking __reportToMonitoring right away.
            tornado.ioloop.PeriodicCallback(
                lambda: self.__reportToMonitoring(self.__elapsedTime),
                self.__monitoringLoopDelay * 1000).start()

        # If we are running with python3, Tornado will use asyncio,
        # and we have to convince it to let us run in a different thread
        # Doing this ensures a consistent behavior between py2 and py3
        asyncio.set_event_loop_policy(tornado.platform.asyncio.AnyThreadEventLoopPolicy())

        for port, app in self.__appsSettings.items():
            sLog.debug(" - %s" % "\n - ".join(["%s = %s" % (k, ssl_options[k]) for k in ssl_options]))

            # Default server configuration
            settings = dict(compress_response=True, cookie_secret="secret")

            # Merge application settings
            settings.update(app["settings"])
            # Start server
            router = Application(app["routes"], default_handler_class=NotFoundHandler, **settings)
            server = HTTPServer(router, ssl_options=ssl_options, decompress_request=True)
            try:
                server.listen(int(port))
            except Exception as e:  # pylint: disable=broad-except
                sLog.exception("Exception starting HTTPServer", e)
                raise
            sLog.always("Listening on port %s" % port)

        tornado.ioloop.IOLoop.current().start()

    def __reportToMonitoring(self, responseTime):
        """
        Periodically reports to Monitoring
        """
        # Calculate CPU usage by comparing realtime and cpu time since last report
        percentage = self.__endReportToMonitoringLoop(self.__report[0], self.__report[1])
        # Send record to Monitoring
        self.activityMonitoringReporter.addRecord(
            {
                "timestamp": int(TimeUtilities.toEpoch()),
                "Host": Network.getFQDN(),
                "ServiceName": "Tornado",
                "MemoryUsage": self.__report[2],
                "CpuPercentage": percentage,
                "ResponseTime": responseTime,
            }
        )
        self.activityMonitoringReporter.commit()
        # Save memory usage and save realtime/CPU time for next call
        self.__report = self.__startReportToMonitoringLoop()

    def __startReportToMonitoringLoop(self):
        """
        Snapshot of resources to be taken at the beginning of a monitoring cycle.
        Also sends memory snapshot to the monitoring.

        This is basically copy/paste of Service.py

        :returns: tuple (<time.time()>, cpuTime, mem)
        """
        now = time.time()  # Used to calculate a delta
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        # Always return a 3-tuple, as callers read self.__report[2]
        mem = None
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime, mem)
        # Send CPU consumption mark
        self.__monitorLastStatsUpdate = now
        # Send Memory consumption mark
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoringLoop(self, initialWallTime, initialCPUTime):
        """
        Snapshot of resources to be taken at the end of a monitoring cycle.

        This is basically copy/paste of Service.py

        Determines CPU usage by comparing walltime and cputime and sends it to the monitor.
        """
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.0
        return percentage
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """

    # # process pool
    __processPool = None
    # # request cache
    __requestCache = {}
    # # requests/cycle
    __requestsPerCycle = 100
    # # minimal number of subprocesses running
    __minProcess = 20
    # # maximal number of subprocesses executed at the same time
    __maxProcess = 20
    # # ProcessPool queue size
    __queueSize = 20
    # # file timeout
    __fileTimeout = 300
    # # operation timeout
    __operationTimeout = 300
    # # ProcessPool finalization timeout
    __poolTimeout = 900
    # # ProcessPool sleep time
    __poolSleep = 5
    # # placeholder for RequestClient instance
    __requestClient = None
    # # Size of the bulk if use of getRequests. If 0, use getRequest
    __bulkRequest = 0
    # # Send the monitoring data to ES rather than the Framework/Monitoring
    __rmsMonitoring = False

    def __init__(self, *args, **kwargs):
        """c'tor"""
        # # call base class ctor
        AgentModule.__init__(self, *args, **kwargs)
        # # ProcessPool related stuff
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)
        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)
        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)
        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
        self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)
        self.__rmsMonitoring = self.am_getOption("EnableRMSMonitoring", self.__rmsMonitoring)
        self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring)

        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)

        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        opHandlers = gConfig.getSections(opHandlersPath)
        if not opHandlers["OK"]:
            self.log.error(opHandlers["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
        opHandlers = opHandlers["Value"]

        self.timeOuts = dict()

        # # handlers dict
        self.handlersDict = dict()
        for opHandler in opHandlers:
            opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue
            self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}

            opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
            if opTimeout:
                self.timeOuts[opHandler]["PerOperation"] = opTimeout
            fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
            if fileTimeout:
                self.timeOuts[opHandler]["PerFile"] = fileTimeout

            self.handlersDict[opHandler] = opLocation

        self.log.info("Operation handlers:")
        for i, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            self.log.info(
                "[%s] %s: %s (timeout: %d s + %d s per file)"
                % (
                    i,
                    opHandler,
                    opLocation,
                    self.timeOuts[opHandler]["PerOperation"],
                    self.timeOuts[opHandler]["PerFile"],
                )
            )

        if self.__rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
            gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting)
        else:
            # # common monitor activity
            gMonitor.registerActivity("Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM)
            gMonitor.registerActivity(
                "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )

        # # create request dict
        self.__requestCache = dict()

        # ?? Probably should be removed
        self.FTSMode = self.am_getOption("FTSMode", False)

    def processPool(self):
        """facade for ProcessPool"""
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            maxProcess = max(self.__minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info(
                "REA ProcessPool configuration",
                "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize),
            )
            self.__processPool = ProcessPool(
                minProcess,
                maxProcess,
                queueSize,
                poolCallback=self.resultCallback,
                poolExceptionCallback=self.exceptionCallback,
            )
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """RequestClient getter"""
        if not self.__requestClient:
            self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """put request into requestCache

        :param ~Request.Request request: Request instance
        """
        maxProcess = max(self.__minProcess, self.__maxProcess)
        if len(self.__requestCache) > maxProcess + 50:
            # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
            # We don't know how much is acceptable as it depends on many factors
            self.log.warn("Too many requests in cache", ": %d" % len(self.__requestCache))
            # return S_ERROR( "Too many requests in cache" )
        if request.RequestID in self.__requestCache:
            # We don't call putRequest as we have got back the request that is still being executed. Better keep it.
            # The main reason for this is that it lasted longer than the kick time of CleanReqAgent
            self.log.warn(
                "Duplicate request, keep it but don't execute", ": %d/%s" % (request.RequestID, request.RequestName)
            )
            return S_ERROR(errno.EALREADY, "Request already in cache")
        self.__requestCache[request.RequestID] = request
        return S_OK()

    def putRequest(self, requestID, taskResult=None):
        """put back :requestID: to RequestClient

        :param str requestID: request's id
        """
        if requestID in self.__requestCache:
            request = self.__requestCache.pop(requestID)
            if taskResult:
                if taskResult["OK"]:
                    request = taskResult["Value"]
                    # The RequestTask is putting back the Done tasks, no need to redo it
                    if request.Status == "Done":
                        return S_OK()
                # In case of timeout, we need to increment ourselves all the attempts
                elif cmpError(taskResult, errno.ETIME):
                    waitingOp = request.getWaiting()
                    for rmsFile in waitingOp.get("Value", []):
                        rmsFile.Attempt += 1
            reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
            if not reset["OK"]:
                return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
        else:
            return S_ERROR("Not in cache")
        return S_OK()

    def putAllRequests(self):
        """put back all requests without callback called into requestClient

        :param self: self reference
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        # iterate over a copy of the keys, as putRequest pops from the cache
        for requestID in list(self.__requestCache):
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error("Failed to put request", reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """initialize agent"""
        return S_OK()

    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        if not self.__rmsMonitoring:
            gMonitor.addMark("Iteration", 1)
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])
                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            # NB: parentheses matter here; without them the string
                            # itself would be replicated poolSleep times
                            self.log.info("Free slot found", "after %d seconds" % (looping * self.__poolSleep))
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it.
                                # break out of the while loop to get the next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(Time.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )
                            else:
                                gMonitor.addMark("Processed", 1)
                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                        break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """get timeout for request"""
        timeout = 0
        for op in request:
            if op.Status not in ("Waiting", "Scheduled", "Queued"):
                continue
            if op.Type not in self.timeOuts:
                timeout += self.__operationTimeout
            else:
                perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
                perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
                timeout += perOp + perFiles
        self.log.info(
            "estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout)
        )
        return timeout

    def finalize(self):
        """agent finalization"""
        if self.__processPool:
            self.processPool().finalize(timeout=self.__poolTimeout)
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """definition of request callback function

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        res = self.putRequest(taskID, taskResult)
        self.log.info(
            "callback:",
            "%s result is %s(%s), put %s(%s)"
            % (
                taskID,
                "S_OK" if taskResult["OK"] else "S_ERROR",
                taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                "S_OK" if res["OK"] else "S_ERROR",
                "" if res["OK"] else res["Message"],
            ),
        )

    def exceptionCallback(self, taskID, taskException):
        """definition of exception callback function

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        self.putRequest(taskID)

    def __rmsMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit
        the collected data, which is done by the MonitoringReporter and sent to the 'RMSMonitoring' type.

        :return: True / False
        """
        result = self.rmsMonitoringReporter.commit()
        return result["OK"]
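
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the timeout budget
# computed by getTimeout above, replayed on plain data. The operation types,
# file counts and the timeout table below are made up for the example.
# ---------------------------------------------------------------------------
DEFAULT_PER_OPERATION = 300  # seconds, mirrors __operationTimeout
DEFAULT_PER_FILE = 300       # seconds, mirrors __fileTimeout

def estimate_timeout(operations, timeout_table):
    """operations: list of (opType, status, nFiles) tuples."""
    timeout = 0
    for opType, status, nFiles in operations:
        if status not in ("Waiting", "Scheduled", "Queued"):
            continue  # only not-yet-finished operations count
        perOp = timeout_table.get(opType, {}).get("PerOperation", DEFAULT_PER_OPERATION)
        perFile = timeout_table.get(opType, {}).get("PerFile", DEFAULT_PER_FILE)
        timeout += perOp + perFile * nFiles
    return timeout

if __name__ == "__main__":
    table = {"ReplicateAndRegister": {"PerOperation": 600, "PerFile": 60}}
    ops = [("ReplicateAndRegister", "Waiting", 10),  # 600 + 60*10 = 1200
           ("RemoveFile", "Done", 5),                # skipped: already Done
           ("RemoveReplica", "Queued", 2)]           # 300 + 300*2 = 900
    print(estimate_timeout(ops, table))              # -> 2100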
class Service(object):

    SVC_VALID_ACTIONS = {
        "RPC": "export",
        "FileTransfer": "transfer",
        "Message": "msg",
        "Connection": "Message",
    }
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """
        Init the variables for the service

        :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.:
            {'modName': 'Framework/serviceName',
             'standalone': True,
             'loadName': 'Framework/serviceName',
             'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
             'classObj': <class 'serviceNameHandler.serviceHandler'>}
        """
        self._svcData = serviceData
        self._name = serviceData["modName"]
        self._startTime = datetime.datetime.utcnow()
        self._validNames = [serviceData["modName"]]
        if serviceData["loadName"] not in self._validNames:
            self._validNames.append(serviceData["loadName"])
        self._cfg = ServiceConfiguration(list(self._validNames))
        self._standalone = serviceData["standalone"]
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {"queries": 0, "connections": 0}
        self._authMgr = AuthManager("%s/Authorization" % PathFinder.getServiceSection(serviceData["loadName"]))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        self.__maxFD = 0
        self.activityMonitoring = False
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True

    def setCloneProcessId(self, cloneId):
        self.__cloneId = cloneId

    def _isMetaAction(self, action):
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result["OK"]:
            return result
        self._handler = result["Value"]
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._threadPool = ThreadPoolExecutor(max(0, self._cfg.getMaxThreads()))
        self._msgBroker = MessageBroker("%sMSB" % self._name, threadPool=self._threadPool)
        # Create static dict
        self._serviceInfoDict = {
            "serviceName": self._name,
            "serviceSectionPath": PathFinder.getServiceSection(self._name),
            "URL": self._cfg.getURL(),
            "messageSender": MessageSender(self._name, self._msgBroker),
            "validNames": self._validNames,
            "csPaths": [PathFinder.getServiceSection(svcName) for svcName in self._validNames],
        }
        self.securityLogging = Operations().getValue("EnableSecurityLogging", True) and getServiceOption(
            self._serviceInfoDict, "EnableSecurityLogging", True
        )
        # Initialize Monitoring
        # The import needs to be here because the CS must be initialized before importing
        # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
        from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

        self.activityMonitoringReporter = MonitoringReporter(monitoringType="ServiceMonitoring")
        self._initMonitoring()
        # Call static initialization function
        try:
            self._handler["class"]._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager, self._msgBroker, self.activityMonitoringReporter
            )
            if self._handler["init"]:
                for initFunc in self._handler["init"]:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception("Exception while calling initialization function", lException=excp)
                        return S_ERROR("Exception while calling initialization function: %s" % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR("Service initialization function %s must return S_OK/S_ERROR" % initFunc)
                    if not result["OK"]:
                        return S_ERROR("Error while initializing %s: %s" % (self._name, result["Message"]))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)
        if self.activityMonitoring:
            gThreadScheduler.addPeriodicTask(30, self.__reportActivity)
            gThreadScheduler.addPeriodicTask(100, self.__activityMonitoringReporting)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result["OK"]:
            return result
        self._actions = result["Value"]

        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if "initializeHandler" in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            initFuncs.append(super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and "initializeHandler" in dir(handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs

    def _loadHandlerInit(self):
        handlerClass = self._svcData["classObj"]
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(getattr(self._svcData["moduleObj"], "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose("Not found global initialization function for service")
        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" % len(handlerInitMethods))
        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData["moduleObj"]
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods
        return S_OK(handlerInfo)

    def _loadActions(self):
        handlerClass = self._handler["class"]
        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = "%s_" % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if attribute.find(methodPrefix) != 0:
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" % (actionType, exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName),
                )
                # Look for type and auth rules
                if actionType == "RPC":
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" % (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" % (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj

        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" % (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" % (actionType, authRules[actionType]))

        return S_OK({"methods": methodsList, "auth": authRules, "types": typeCheck})

    def _initMonitoring(self):
        props = [("__doc__", "description")]
        for prop in props:
            try:
                value = getattr(self._handler["module"], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = "unset"
        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" % (self._name, secondaryName))
            self._validNames.append(secondaryName)
        return S_OK()

    def __reportActivity(self):
        initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring()
        pendingQueries = self._threadPool._work_queue.qsize()
        activeQueries = len(self._threadPool._threads)
        percentage = self.__endReportToMonitoring(initialWallTime, initialCPUTime)
        self.activityMonitoringReporter.addRecord(
            {
                "timestamp": int(TimeUtilities.toEpoch()),
                "Host": Network.getFQDN(),
                "ServiceName": "_".join(self._name.split("/")),
                "Location": self._cfg.getURL(),
                "MemoryUsage": mem,
                "CpuPercentage": percentage,
                "PendingQueries": pendingQueries,
                "ActiveQueries": activeQueries,
                "RunningThreads": threading.activeCount(),
                "MaxFD": self.__maxFD,
            }
        )
        self.__maxFD = 0

    def getConfig(self):
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """
        This method may be called by the ServiceReactor.
        It queues the opened connection; another thread reads from this queue
        and handles the connection.

        :param clientTransport: Object which describes the opened connection (PlainTransport or SSLTransport)
        """
        if not self.activityMonitoring:
            self._stats["connections"] += 1
        self._threadPool.submit(self._processInThread, clientTransport)

    @property
    def wantsThrottle(self):
        """Boolean property for if the service wants requests to stop being accepted"""
        nQueued = self._threadPool._work_queue.qsize()
        return nQueued > self._cfg.getMaxWaitingPetitions()

    # Threaded process function
    def _processInThread(self, clientTransport):
        """
        This method handles an RPC, FileTransfer or Connection.
        A connection may be opened via ServiceReactor.__acceptIncomingConnection

        - Do the SSL/TLS handshake (if DIPS is used) and extract credentials
        - Get the action called by the client
        - Check if the client is authorized to perform the action
            - If not, the connection is closed
        - Instantiate the RequestHandler (the RequestHandler contains all callable methods)

        (The following is not directly in this method but it describes what happens at
        # Execute the action)

        - Notify the client we're ready to execute the action (via _processProposal)
          and call RequestHandler._rh_executeAction()
        - Receive arguments/file/something else (depending on the action) in the RequestHandler
        - Execute the action asked by the client

        :param clientTransport: Object which describes the opened connection (SSLTransport or PlainTransport)
        :return: S_OK with "closeTransport", a boolean to indicate if the connection has to be closed,
            e.g. after an RPC, closeTransport=True
        """
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result["OK"]:
                    clientTransport.close()
                    return
            except Exception:
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result["Value"]
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result["Value"]
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result["closeTransport"] or not result["OK"]:
                if not result["OK"]:
                    gLogger.error("Error processing proposal", result["Message"])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(monReport[0], monReport[1])

    @staticmethod
    def _createIdentityString(credDict, clientTransport=None):
        if "username" in credDict:
            if "group" in credDict:
                identity = "[%s:%s]" % (credDict["username"], credDict["group"])
            else:
                identity = "[%s:unknown]" % credDict["username"]
        else:
            identity = "unknown"
        if clientTransport:
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if "DN" in credDict:
            identity += "(%s)" % credDict["DN"]
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """We receive the proposalTuple as a list. Turn it into a tuple again"""
        proposalTuple = tuple(tuple(x) if isinstance(x, list) else x for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal["OK"]:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(credDict, clientTransport), retVal["Message"]),
            )
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal["Value"])
        gLogger.debug("Received action from client", "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" % requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result["OK"]:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions["auth"][actionTuple[0]]
        else:
            if actionTuple[0] == "RPC":
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions["auth"]:
                hardcodedRulesByType = self._actions["auth"][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]
                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict, hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = "/".join([str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query",
                "to %s:%s by %s from %s" % (self._name, "/".join(actionTuple), identity, fromHost),
            )
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()

        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        if self.securityLogging:
            Service.SVC_SECLOG_CLIENT.addMessage(
                result["OK"],
                sourceAddress[0],
                sourceAddress[1],
                identity,
                self._cfg.getHostname(),
                self._cfg.getPort(),
                self._name,
                "/".join(actionTuple),
            )
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """
        Generate an instance of the handler for a given service

        :param int trid: transport ID
        :param tuple proposalTuple: tuple describing the proposed action

        :return: S_OK/S_ERROR, Value is the handler object
        """
        # Generate the client params
        clientParams = {"serviceStartTime": self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams["clientVersion"] = proposalTuple[3] if len(proposalTuple) > 3 else None
            clientParams["clientSetup"] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams["clientVO"] = gConfig.getValue("/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams["clientVO"] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams["clientAddress"] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler["class"](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" % str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal["OK"]:
            return retVal

        messageConnection = False
        if proposalTuple[1] == ("Connection", "new"):
            messageConnection = True

        if messageConnection:
            if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
                result = S_ERROR("Maximum number of connections reached. Try later")
                result["closeTransport"] = True
                return result

            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False,
            )

        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result["OK"] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result["OK"]:
                self._msgBroker.removeTransport(trid)

        result["closeTransport"] = not messageConnection or not result["OK"]
        return result

    def _mbConnect(self, trid, handlerObj=None):
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result["OK"]:
                return result
            handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("connected")

    def _executeAction(self, trid, proposalTuple, handlerObj):
        try:
            response = handlerObj._rh_executeAction(proposalTuple)
            if not response["OK"]:
                return response
            if self.activityMonitoring:
                self.activityMonitoringReporter.addRecord(
                    {
                        "timestamp": int(TimeUtilities.toEpoch()),
                        "Host": Network.getFQDN(),
                        "ServiceName": "_".join(self._name.split("/")),
                        "Location": self._cfg.getURL(),
                        "ResponseTime": response["Value"][1],
                    }
                )
            return response["Value"][0]
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        result = self._authorizeProposal(
            ("Message", msgObj.getName()), trid, self._transportPool.get(trid).getConnectingCredentials()
        )
        if not result["OK"]:
            return result
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        response = handlerObj._rh_executeMessageCallback(msgObj)
        if self.activityMonitoring and response["OK"]:
            self.activityMonitoringReporter.addRecord(
                {
                    "timestamp": int(TimeUtilities.toEpoch()),
                    "Host": Network.getFQDN(),
                    "ServiceName": "_".join(self._name.split("/")),
                    "Location": self._cfg.getURL(),
                    "ResponseTime": response["Value"][1],
                }
            )
        if response["OK"]:
            return response["Value"][0]
        else:
            return response

    def _mbDisconnect(self, trid):
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("drop")

    def __activityMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit
        the collected data, which is done by the MonitoringReporter and sent to the 'ServiceMonitoring' type.

        :return: True / False
        """
        return self.activityMonitoringReporter.commit()

    def __startReportToMonitoring(self):
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        mem = None
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime, mem)
        self.__monitorLastStatsUpdate = now
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.0
        return percentage
class RemoveReplica(DMSRequestOperationsBase):
    """
    .. class:: RemoveReplica
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: operation to execute
        :param str csPath: CS path for this handler
        """
        # # base class ctor
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """remove replicas"""
        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # gMonitor stuff
            gMonitor.registerActivity(
                "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )

        # # prepare list of targetSEs
        targetSEs = self.operation.targetSEList
        # # check targetSEs for removal
        bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("RemoveReplicaAtt")
                gMonitor.addMark("RemoveReplicaFail")
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # and prepare dict
        toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

        self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
        else:
            gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))

        # # keep status for each targetSE
        removalStatus = dict.fromkeys(toRemoveDict, None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, None)

        # # loop over targetSEs
        for targetSE in targetSEs:
            self.log.info("Removing replicas at %s" % targetSE)

            # # 1st step - bulk removal
            bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Bulk replica removal failed", bulkRemoval["Message"])
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.commit()
                return bulkRemoval

            # # report removal status for successful files
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord(
                        "Successful", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                    )
                )
            else:
                gMonitor.addMark(
                    "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                )

            # # 2nd step - process the rest again
            toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error)
            for lfn, opFile in toRetry.items():
                self._removeWithOwnerProxy(opFile, targetSE)
                if opFile.Error:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaFail", 1)
                    removalStatus[lfn][targetSE] = opFile.Error
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaOK", 1)

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = list(set(error for error in removalStatus[opFile.LFN].values() if error))
                if errors:
                    opFile.Error = "\n".join(errors)
                    # This seems to be the only unrecoverable error
                    if "Write access not permitted for this credential" in opFile.Error:
                        failed += 1
                        opFile.Status = "Failed"
                else:
                    opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s replicas" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()

    def _bulkRemoval(self, toRemoveDict, targetSE):
        """remove replicas :toRemoveDict: at :targetSE:

        :param dict toRemoveDict: { lfn: opFile, ... }
        :param str targetSE: target SE name
        """
        # Clear the error
        for opFile in toRemoveDict.values():
            opFile.Error = ""
        removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict))
        if not removeReplicas["OK"]:
            for opFile in toRemoveDict.values():
                opFile.Error = removeReplicas["Message"]
            return removeReplicas
        removeReplicas = removeReplicas["Value"]
        # # filter out failed
        for lfn, opFile in toRemoveDict.items():
            if lfn in removeReplicas["Failed"]:
                errorReason = str(removeReplicas["Failed"][lfn])
                # If the reason is that the file does not exist,
                # we consider the removal successful
                # TODO: use cmpError once the FC returns the proper error msg corresponding to ENOENT
                if "No such file" not in errorReason:
                    opFile.Error = errorReason
                    self.log.error("Failed removing lfn", "%s:%s" % (lfn, opFile.Error))

        return S_OK()

    def _removeWithOwnerProxy(self, opFile, targetSE):
        """remove opFile replica from targetSE using owner proxy

        :param File opFile: File instance
        :param str targetSE: target SE name
        """
        if "Write access not permitted for this credential" in opFile.Error:
            proxyFile = None
            if "DataManager" in self.shifter:
                # # you're a data manager - save current proxy and get a new one for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    fileProxy = self.getProxyForLFN(opFile.LFN)
                    if not fileProxy["OK"]:
                        opFile.Error = fileProxy["Message"]
                    else:
                        proxyFile = fileProxy["Value"]
                        removeReplica = self.dm.removeReplica(targetSE, opFile.LFN)
                        if not removeReplica["OK"]:
                            opFile.Error = removeReplica["Message"]
                        else:
                            # Set or reset the error if all OK
                            opFile.Error = removeReplica["Value"]["Failed"].get(opFile.LFN, "")
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
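
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): the per-LFN/per-SE
# status matrix built in __call__ above, with the final error aggregation.
# The LFNs, SE names and error strings below are made up for the example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    lfns = ["/vo/data/f1", "/vo/data/f2"]
    targetSEs = ["SE-A", "SE-B"]

    # None means "removed fine"; a string is the error for that (lfn, SE) pair
    removalStatus = {lfn: dict.fromkeys(targetSEs, None) for lfn in lfns}
    removalStatus["/vo/data/f2"]["SE-B"] = "Write access not permitted for this credential"

    for lfn in lfns:
        errors = list(set(error for error in removalStatus[lfn].values() if error))
        status = "Failed" if any("Write access not permitted" in e for e in errors) else "Done"
        print(lfn, "->", status, "; errors:", "\n".join(errors) or "none")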
class StatesMonitoringAgent(AgentModule):
    """Collects JobDB summary snapshots and forwards them as WMSHistory records."""

    __summaryKeyFieldsMapping = [
        'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType', 'ApplicationStatus', 'MinorStatus'
    ]
    __summaryDefinedFields = [('ApplicationStatus', 'unset'), ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}
    __jobDBFields = []

    jobDB = None
    monitoringReporter = None

    def initialize(self):
        """Standard initialization"""
        self.jobDB = JobDB()
        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory')
        self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory",
                                                     failoverQueueName=self.messageQueue)
        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """Main execution method"""
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the JobDB summary", "%s: won't commit at this cycle" % result['Message'])
        else:
            values = result['Value'][1]
            self.log.info("Start sending records")
            for record in values:
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)
            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info("Records sent", "(%s)" % retVal['Value'])
            else:
                self.log.error("Failed to insert the records, it will be retried in the next iteration",
                               retVal['Message'])
        return S_OK()
class RemoveFile(DMSRequestOperationsBase): """ .. class:: RemoveFile remove file operation handler """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param Operation operation: Operation to execute :param str csPath: CS path for this handler """ # # call base class ctor DMSRequestOperationsBase.__init__(self, operation, csPath) # # re pattern for not existing files self.reNotExisting = re.compile(r"(no|not) such file.*", re.IGNORECASE) def __call__(self): """action for 'removeFile' operation""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") # # get waiting files waitingFiles = self.getWaitingFilesList() fc = FileCatalog(self.operation.catalogList) res = fc.getReplicas([wf.LFN for wf in waitingFiles]) if not res["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(waitingFiles))) self.rmsMonitoringReporter.commit() return res # We check the status of the SE from the LFN that are successful # No idea what to do with the others... replicas = res["Value"]["Successful"] targetSEs = set([se for lfn in replicas for se in replicas[lfn]]) if targetSEs: # Check if SEs are allowed for remove but don't fail yet the operation if SEs are always banned bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess", failIfBanned=False) if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(replicas))) self.rmsMonitoringReporter.commit() return bannedTargets bannedTargets = set(bannedTargets["Value"]) else: bannedTargets = set() # # prepare waiting file dict # # We take only files that have no replica at the banned SEs... 
If no replica is known at all, keep the file and try to remove it anyway toRemoveDict = dict( ((opFile.LFN, opFile) for opFile in waitingFiles if not bannedTargets or not bannedTargets.intersection(replicas.get(opFile.LFN, [])))) # If some SEs are always banned, set the files that cannot be removed to Failed if bannedTargets and "always banned" in self.operation.Error: for opFile in waitingFiles: if opFile.LFN not in toRemoveDict: # Set the files that cannot be removed to Failed opFile.Error = self.operation.Error opFile.Status = "Failed" if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", len(waitingFiles) - len(toRemoveDict))) self.rmsMonitoringReporter.commit() if not toRemoveDict: # If there are no files that can be removed, exit, else try once to remove them anyway return S_OK("%s targets are always banned for removal" % ",".join(sorted(bannedTargets))) if toRemoveDict: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(toRemoveDict))) # # 1st step - bulk removal self.log.debug("bulk removal of %s files" % len(toRemoveDict)) bulkRemoval = self.bulkRemoval(toRemoveDict) if not bulkRemoval["OK"]: self.log.error("Bulk file removal failed", bulkRemoval["Message"]) else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord( "Successful", len(toRemoveDict) - len(bulkRemoval["Value"]))) # # 2nd step - single file removal for lfn, opFile in toRemoveDict.items(): self.log.info("removing single file %s" % lfn) singleRemoval = self.singleRemoval(opFile) if not singleRemoval["OK"]: self.log.error("Error removing single file", singleRemoval["Message"]) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: self.log.info("file %s has been removed" % lfn) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) # # set failedFiles = [(lfn, opFile) for (lfn, opFile) in toRemoveDict.items() if opFile.Status in ("Failed", "Waiting")] if failedFiles: self.operation.Error = "failed to remove %d files" % len( failedFiles) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() if bannedTargets: return S_OK("%s targets are banned for removal" % ",".join(sorted(bannedTargets))) return S_OK() def bulkRemoval(self, toRemoveDict): """bulk removal using request owner DN :param dict toRemoveDict: { lfn: opFile, ... } :return: S_ERROR or S_OK( { lfn: opFile, ...
} ) -- dict with files still waiting to be removed """ bulkRemoval = self.dm.removeFile(list(toRemoveDict), force=True) if not bulkRemoval["OK"]: error = bulkRemoval["Message"] self.log.error("Bulk file removal failed", error) self.operation.Error = error for opFile in self.operation: opFile.Error = error return bulkRemoval bulkRemoval = bulkRemoval["Value"] # # filter results for lfn, opFile in toRemoveDict.items(): if lfn in bulkRemoval["Successful"]: opFile.Status = "Done" elif lfn in bulkRemoval["Failed"]: error = bulkRemoval["Failed"][lfn] if isinstance(error, dict): error = ";".join( ["%s-%s" % (k, v) for k, v in error.items()]) opFile.Error = error if self.reNotExisting.search(opFile.Error): opFile.Status = "Done" # # return files still waiting toRemoveDict = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Status == "Waiting") return S_OK(toRemoveDict) def singleRemoval(self, opFile): """remove single file :param opFile: File instance """ # # try to remove with owner proxy proxyFile = None if "Write access not permitted for this credential" in opFile.Error: if "DataManager" in self.shifter: # # you're a data manager - get proxy for LFN and retry saveProxy = os.environ["X509_USER_PROXY"] try: fileProxy = self.getProxyForLFN(opFile.LFN) if not fileProxy["OK"]: opFile.Error = "Error getting owner's proxy : %s" % fileProxy[ "Message"] else: proxyFile = fileProxy["Value"] self.log.info( "Trying to remove file with owner's proxy (file %s)" % proxyFile) removeFile = self.dm.removeFile(opFile.LFN, force=True) self.log.always(str(removeFile)) if not removeFile["OK"]: opFile.Error = str(removeFile["Message"]) if self.reNotExisting.search( str(removeFile["Message"]).lower()): opFile.Status = "Done" else: removeFile = removeFile["Value"] if opFile.LFN in removeFile["Failed"]: error = removeFile["Failed"][opFile.LFN] if isinstance(error, dict): error = ";".join([ "%s-%s" % (k, v) for k, v in error.items() ]) if self.reNotExisting.search(error): # This should never happen due to the "force" flag opFile.Status = "Done" else: opFile.Error = error else: opFile.Status = "Done" finally: if proxyFile: os.unlink(proxyFile) # # put back request owner proxy to env os.environ["X509_USER_PROXY"] = saveProxy # # file removed? update its status to 'Done' if opFile.Status == "Done": return S_OK() return S_ERROR(opFile.Error)
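# --- Illustrative sketch (not part of the DIRAC sources) ---------------------
# RemoveFile treats a removal error as harmless when it merely says the file
# is already gone, matching it against the case-insensitive pattern
# r"(no|not) such file.*". A quick stand-alone check of which error strings
# the pattern turns into a "Done" status:

import re

reNotExisting = re.compile(r"(no|not) such file.*", re.IGNORECASE)

for error in ("No such file or directory",
              "SRM: not such file",           # hypothetical backend spelling
              "Write access not permitted for this credential"):
    verdict = "Done" if reNotExisting.search(error) else "still Failed/Waiting"
    print("%-50s -> %s" % (error, verdict))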
class MoveReplica(DMSRequestOperationsBase): """ .. class:: MoveReplica MoveReplica operation handler """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param ~Operation.Operation operation: Operation instance :param str csPath: CS path for this handler """ super(MoveReplica, self).__init__(operation, csPath) # Init ConsistencyInspector: used to check replicas self.ci = ConsistencyInspector() def __call__(self): """ call me maybe """ # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # own gMonitor stuff for files gMonitor.registerActivity("ReplicateAndRegisterAtt", "Replicate and register attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicateOK", "Replications successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicateFail", "Replications failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterOK", "Registrations successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterFail", "Registrations failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # check replicas first res = self.__checkReplicas() if not res["OK"]: self.log.error('Failed to check replicas', res["Message"]) sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess') if not bannedSource["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedSource if bannedSource["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.info(self.operation.Error) return S_OK(self.operation.Error) # # check targetSEs for write bannedTargets = self.checkSEsRSS() if not bannedTargets['OK']: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedTargets if bannedTargets['Value']: self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets['Value']) return S_OK(self.operation.Error) # Can continue now self.log.verbose("No targets banned for writing") # # check sourceSEs for removal # # for removal the targetSEs are the sourceSEs of the replication targetSEs = self.operation.sourceSEList bannedTargets = self.checkSEsRSS(targetSEs, access='RemoveAccess') if not bannedTargets['OK']: if 
self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("RemoveReplicaAtt") gMonitor.addMark("RemoveReplicaFail") return bannedTargets if bannedTargets['Value']: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets['Value'])) # Can continue now self.log.verbose("No targets banned for removal") # Do the transfer # # get waiting files. If none just return waitingFiles = self.getWaitingFilesList() if not waitingFiles: return S_OK() # # loop over files self.log.info("Transferring files using Data manager...") for opFile in waitingFiles: res = self.dmTransfer(opFile) if not res["OK"]: continue else: # Do the replica removal self.log.info("Removing files using Data manager...") toRemoveDict = dict([(opFile.LFN, opFile) for opFile in waitingFiles]) self.log.info("todo: %s replicas to delete from %s sites" % (len(toRemoveDict), len(targetSEs))) self.dmRemoval(toRemoveDict, targetSEs) return S_OK() def __checkReplicas(self): """ check done replicas and update file states """ waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation if opFile.Status in ("Waiting", "Scheduled")]) targetSESet = set(self.operation.targetSEList) # Check replicas res = self.ci._getCatalogReplicas(list(waitingFiles)) if not res["OK"]: self.log.error('Failed to get catalog replicas', res["Message"]) return S_ERROR() allReplicas = res['Value'][0] replicas = self.ci.compareChecksum(list(waitingFiles)) if not replicas["OK"]: self.log.error('Failed to check replicas', replicas["Message"]) return S_ERROR() replicas = replicas["Value"] noReplicas = replicas['NoReplicas'] if noReplicas: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", len(noReplicas))) self.rmsMonitoringReporter.commit() for lfn in noReplicas.keys(): self.log.error("File %s doesn't exist" % lfn) if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", len(targetSESet)) waitingFiles[lfn].Status = "Failed" for lfn, reps in allReplicas.items(): if targetSESet.issubset(set(reps)): self.log.info("file %s has been replicated to all targets" % lfn) waitingFiles[lfn].Status = "Done" return S_OK() def dmRemoval(self, toRemoveDict, targetSEs): if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(toRemoveDict))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error('Bulk replica removal failed', bulkRemoval["Message"]) return bulkRemoval bulkRemoval = bulkRemoval["Value"] # # update removal status for successful files removalOK = [ opFile for opFile in bulkRemoval.values() if not opFile.Error ] for opFile in removalOK: removalStatus[opFile.LFN][targetSE] = "" if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", len(removalOK))) else: gMonitor.addMark("RemoveReplicaOK", len(removalOK)) # # 2nd step - process the rest again toRetry = dict([(lfn, opFile) for lfn, opFile in bulkRemoval.items() if opFile.Error]) for 
lfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RemoveReplicaOK", 1) removalStatus[lfn][targetSE] = "" else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = list( set([ error for error in removalStatus[opFile.LFN].values() if error ])) if errors: opFile.Error = ",".join(errors) # This seems to be the only offending error if "Write access not permitted for this credential" in opFile.Error: failed += 1 continue opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK(removalStatus) def dmTransfer(self, opFile): """ replicate and register using dataManager """ # # get waiting files. If none just return # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", 1)) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", 1) opFile.Error = '' lfn = opFile.LFN # Check replicas res = self.ci._getCatalogReplicas([lfn]) if not res["OK"]: self.log.error('Failed to get catalog replicas', res["Message"]) return S_ERROR() allReplicas = res['Value'][0] replicas = self.ci.compareChecksum([lfn]) if not replicas["OK"]: self.log.error('Failed to check replicas', replicas["Message"]) return S_ERROR() replicas = replicas["Value"] validReplicas = [] noReplicas = replicas['NoReplicas'] missingAllReplicas = replicas["MissingAllReplicas"] missingReplica = replicas["MissingReplica"] someReplicasCorrupted = replicas["SomeReplicasCorrupted"] allReplicasCorrupted = replicas["AllReplicasCorrupted"] if noReplicas: self.log.error("Unable to replicate", "File %s doesn't exist" % (lfn)) opFile.Error = 'No replicas found' opFile.Status = 'Failed' elif missingAllReplicas: self.log.error("Unable to replicate", "%s, all replicas are missing" % (lfn)) opFile.Error = 'Missing all replicas' opFile.Status = 'Failed' elif allReplicasCorrupted: self.log.error("Unable to replicate", "%s, all replicas are corrupted" % (lfn)) opFile.Error = 'All replicas corrupted' opFile.Status = 'Failed' elif someReplicasCorrupted: self.log.error( "Unable to replicate", "%s, replicas corrupted at %s" % (lfn, someReplicasCorrupted[lfn])) opFile.Error = 'At least one replica corrupted' opFile.Status = 'Failed' elif missingReplica: self.log.error( "Unable to replicate", "%s, missing replicas at %s" % (lfn, missingReplica[lfn])) opFile.Error = 'At least one missing replica' opFile.Status = 'Failed' if opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateFail") return S_ERROR() # Check if replica is at the specified source for repSEName in allReplicas[lfn]: validReplicas.append(repSEName) # # get the first one in the list if sourceSE not in validReplicas: if sourceSE: self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, validReplicas[0])) sourceSE = validReplicas[0] # # loop over targetSE catalogs = self.operation.Catalog if
catalogs: catalogs = [cat.strip() for cat in catalogs.split(',')] for targetSE in self.operation.targetSEList: # # call DataManager if targetSE in validReplicas: self.log.warn( "Request to replicate %s to an existing location: %s" % (lfn, targetSE)) continue res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn]["replicate"] prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime) if not self.rmsMonitoring: gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: if not self.rmsMonitoring: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] prString += ' and registered in %s s.' % regTime self.log.info(prString) else: if not self.rmsMonitoring: gMonitor.addMark("RegisterFail", 1) prString += " but failed to register" self.log.warn(prString) opFile.Error = "Failed to register" # # add register replica operation registerOperation = self.getRegisterOperation( opFile, targetSE, type='RegisterReplica') self.request.insertAfter(registerOperation, self.operation) else: self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE)) if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error( "Failed to replicate and register", "File %s at %s: %s" % (lfn, targetSE, reason)) opFile.Error = reason else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "DataManager error: %s" % res["Message"] self.log.error("DataManager error", res["Message"]) if not opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) if len(self.operation.targetSEList) > 1: self.log.info("file %s has been replicated to all targetSEs" % lfn) else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) self.rmsMonitoringReporter.commit() return S_ERROR("dmTransfer failed") if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK() def bulkRemoval(self, toRemoveDict, targetSE): """ remove replicas :toRemoveDict: at :targetSE: :param dict toRemoveDict: { lfn: opFile, ... } :param str targetSE: target SE name :return: toRemoveDict with updated errors """ removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict)) if not removeReplicas["OK"]: for opFile in toRemoveDict.values(): opFile.Error = removeReplicas["Message"] return S_ERROR(removeReplicas["Message"]) removeReplicas = removeReplicas["Value"] # # filter out failed for lfn, opFile in toRemoveDict.items(): if lfn in removeReplicas["Failed"]: opFile.Error = str(removeReplicas["Failed"][lfn]) return S_OK(toRemoveDict) def singleRemoval(self, opFile, targetSE): """ remove opFile replica from targetSE :param ~DIRAC.RequestManagementSystem.Client.File.File opFile: File instance :param str targetSE: target SE name """ proxyFile = None if "Write access not permitted for this credential" in opFile.Error: # # not a DataManager?
set status to failed and return if "DataManager" in self.shifter: # # you're a data manager - save current proxy and get a new one for LFN and retry saveProxy = os.environ["X509_USER_PROXY"] try: fileProxy = self.getProxyForLFN(opFile.LFN) if not fileProxy["OK"]: opFile.Error = fileProxy["Message"] else: proxyFile = fileProxy["Value"] removeReplica = self.dm.removeReplica( targetSE, opFile.LFN) if not removeReplica["OK"]: opFile.Error = removeReplica["Message"] else: removeReplica = removeReplica["Value"] if opFile.LFN in removeReplica["Failed"]: opFile.Error = removeReplica["Failed"][ opFile.LFN] else: # # reset error - replica has been removed this time opFile.Error = "" finally: if proxyFile: os.unlink(proxyFile) # # put back request owner proxy to env os.environ["X509_USER_PROXY"] = saveProxy return S_OK(opFile)
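# --- Illustrative sketch (not part of the DIRAC sources) ---------------------
# dmRemoval() above keeps a two-level status map, removalStatus[lfn][targetSE],
# where "" means the replica was removed at that SE and a non-empty string is
# the error left over after bulk and single removal. A file is fully done only
# when every SE entry is empty. A minimal stand-alone rendition with made-up
# LFNs and SE names:

targetSEs = ["SE-A", "SE-B"]
lfns = ["/vo/data/f1", "/vo/data/f2"]

# initialise: None means "not attempted yet"
removalStatus = {lfn: dict.fromkeys(targetSEs, None) for lfn in lfns}

# pretend outcome of the removal loops
removalStatus["/vo/data/f1"]["SE-A"] = ""
removalStatus["/vo/data/f1"]["SE-B"] = ""
removalStatus["/vo/data/f2"]["SE-A"] = ""
removalStatus["/vo/data/f2"]["SE-B"] = "Write access not permitted for this credential"

for lfn, perSE in removalStatus.items():
    errors = [err for err in perSE.values() if err]
    print("%s -> %s" % (lfn, "Done" if not errors else ",".join(errors)))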
class PutAndRegister(DMSRequestOperationsBase): """ .. class:: PutAndRegister PutAndRegister operation handler. This takes a local file and puts it on a StorageElement before registering it in a catalog """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param Operation operation: Operation instance :param str csPath: CS path for this handler """ # # base class ctor super(PutAndRegister, self).__init__(operation, csPath) self.dm = DataManager() def __call__(self): """PutAndRegister operation processing""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # gMonitor stuff gMonitor.registerActivity("PutAtt", "File put attempts", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("PutFail", "Failed file puts", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("PutOK", "Successful file puts", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterOK", "Successful file registrations", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterFail", "Failed file registrations", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # list of targetSEs targetSEs = self.operation.targetSEList if len(targetSEs) != 1: self.log.error( "Wrong value for TargetSE list, should contain only one target!", "%s" % targetSEs) self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE" for opFile in self.operation: opFile.Status = "Failed" opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE" if not self.rmsMonitoring: gMonitor.addMark("PutAtt", 1) gMonitor.addMark("PutFail", 1) if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() return S_ERROR("TargetSE should contain only one target, got %s" % targetSEs) targetSE = targetSEs[0] bannedTargets = self.checkSEsRSS(targetSE) if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("PutAtt") gMonitor.addMark("PutFail") return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(waitingFiles))) # # loop over files for opFile in waitingFiles: # # get LFN lfn = opFile.LFN self.log.info("processing file %s" % lfn) if not self.rmsMonitoring: gMonitor.addMark("PutAtt", 1) pfn = opFile.PFN guid = opFile.GUID checksum = opFile.Checksum # # call DataManager passing a list of requested catalogs catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(",")] putAndRegister = DataManager(catalogs=catalogs).putAndRegister( lfn, pfn, targetSE, guid=guid, checksum=checksum) if not putAndRegister["OK"]: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("PutFail", 1) #
self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" ) self.log.error("Completely failed to put and register file", putAndRegister["Message"]) opFile.Error = str(putAndRegister["Message"]) self.operation.Error = str(putAndRegister["Message"]) continue putAndRegister = putAndRegister["Value"] if lfn in putAndRegister["Failed"]: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("PutFail", 1) # self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" ) reason = putAndRegister["Failed"][lfn] self.log.error("Failed to put and register file", " %s at %s: %s" % (lfn, targetSE, reason)) opFile.Error = str(reason) self.operation.Error = str(reason) continue putAndRegister = putAndRegister["Successful"] if lfn in putAndRegister: if "put" not in putAndRegister[lfn]: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("PutFail", 1) # self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" ) self.log.info("failed to put %s to %s" % (lfn, targetSE)) opFile.Error = "put failed" self.operation.Error = "put failed" continue if "register" not in putAndRegister[lfn]: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("PutOK", 1) gMonitor.addMark("RegisterFail", 1) # self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" ) # self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", targetSE, "", "PutAndRegister" ) self.log.info("put of %s to %s took %s seconds" % (lfn, targetSE, putAndRegister[lfn]["put"])) self.log.error("Register of lfn to SE failed", "%s to %s" % (lfn, targetSE)) opFile.Error = "failed to register %s at %s" % (lfn, targetSE) opFile.Status = "Failed" self.log.info(opFile.Error) registerOperation = self.getRegisterOperation( opFile, targetSE) self.request.insertAfter(registerOperation, self.operation) continue if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("PutOK", 1) gMonitor.addMark("RegisterOK", 1) # self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" ) # self.dataLoggingClient().addFileRecord( lfn, "Register", targetSE, "", "PutAndRegister" ) opFile.Status = "Done" for op in ("put", "register"): self.log.info("%s of %s to %s took %s seconds" % (op, lfn, targetSE, putAndRegister[lfn][op])) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
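# --- Illustrative sketch (not part of the DIRAC sources) ---------------------
# Both PutAndRegister and RegisterFile read Operation.Catalog as a free-form,
# comma-separated string and normalise it into a list before handing it to the
# DataManager, and PutAndRegister additionally insists on exactly one targetSE.
# The two checks in isolation, exercised with made-up values:

def parseCatalogs(catalogString):
    """'' / None -> None (meaning: all catalogs); 'A, B' -> ['A', 'B']"""
    if not catalogString:
        return None
    return [cat.strip() for cat in catalogString.split(",")]


def checkSingleTarget(targetSEs):
    """Mirror of the 'exactly one targetSE' validation."""
    if len(targetSEs) != 1:
        raise ValueError("TargetSE should contain only one targetSE, got %s" % targetSEs)
    return targetSEs[0]


if __name__ == "__main__":
    print(parseCatalogs("FileCatalog, BookkeepingDB"))  # hypothetical catalog names
    print(checkSingleTarget(["CERN-USER"]))             # hypothetical SE name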
class AgentModule(object): """ Base class for all agent modules This class is used by the AgentReactor Class to steer the execution of DIRAC Agents. For this purpose the following methods are used: - am_initialize() just after instantiation - am_getPollingTime() to set the execution frequency - am_getMaxCycles() to determine the number of cycles - am_go() for the actual execution of one cycle Before each iteration, the following methods are used to determine if the new cycle is to be started. - am_getModuleParam( 'alive' ) - am_checkStopAgentFile() - am_removeStopAgentFile() To start a new execution cycle the following methods are used - am_getCyclesDone() - am_setOption( 'MaxCycles', maxCycles ) At the same time it provides all Agents with a common interface. All Agent classes must inherit from this base class and must implement at least the following method: - execute() main method called in the agent cycle Additionally they may provide: - initialize() for initial settings - finalize() the graceful exit - beginExecution() before each execution cycle - endExecution() at the end of each execution cycle The agent can be stopped either by a signal or by creating a 'stop_agent' file in the controlDirectory defined in the agent configuration """ def __init__(self, agentName, loadName, baseAgentName=False, properties={}): """ Common __init__ method for all Agents. All Agent modules must define: __doc__ __RCSID__ They are used to populate __codeProperties The following Options are used from the Configuration: - /LocalSite/InstancePath - /DIRAC/Setup - Status - Enabled - PollingTime default = 120 - MaxCycles default = 500 - WatchdogTime default = 0 (disabled) - ControlDirectory control/SystemName/AgentName - WorkDirectory work/SystemName/AgentName - shifterProxy '' - shifterProxyLocation WorkDirectory/SystemName/AgentName/.shifterCred It defines the following default Options that can be set via Configuration (above): - MonitoringEnabled True - Enabled True if Status == Active - PollingTime 120 - MaxCycles 500 - ControlDirectory control/SystemName/AgentName - WorkDirectory work/SystemName/AgentName - shifterProxy False - shifterProxyLocation work/SystemName/AgentName/.shifterCred Different defaults can be set in the initialize() method of the Agent using am_setOption() In order to get a shifter proxy in the environment during execute(), the configuration Option 'shifterProxy' must be set; a default may be given in the initialize() method.
""" if baseAgentName and agentName == baseAgentName: self.log = gLogger standaloneModule = True else: self.log = gLogger.getSubLogger(agentName, child=False) standaloneModule = False self.__basePath = gConfig.getValue('/LocalSite/InstancePath', rootPath) self.__agentModule = None self.__codeProperties = {} self.__getCodeInfo() self.__moduleProperties = { 'fullName': agentName, 'loadName': loadName, 'section': PathFinder.getAgentSection(agentName), 'loadSection': PathFinder.getAgentSection(loadName), 'standalone': standaloneModule, 'cyclesDone': 0, 'totalElapsedTime': 0, 'setup': gConfig.getValue("/DIRAC/Setup", "Unknown"), 'alive': True } self.__moduleProperties['system'], self.__moduleProperties[ 'agentName'] = agentName.split("/") self.__configDefaults = {} self.__configDefaults['MonitoringEnabled'] = True self.__configDefaults['Enabled'] = self.am_getOption( "Status", "Active").lower() in ('active') self.__configDefaults['PollingTime'] = self.am_getOption( "PollingTime", 120) self.__configDefaults['MaxCycles'] = self.am_getOption( "MaxCycles", 500) self.__configDefaults['WatchdogTime'] = self.am_getOption( "WatchdogTime", 0) self.__configDefaults['ControlDirectory'] = os.path.join( self.__basePath, 'control', *agentName.split("/")) self.__configDefaults['WorkDirectory'] = os.path.join( self.__basePath, 'work', *agentName.split("/")) self.__configDefaults['shifterProxy'] = '' self.__configDefaults['shifterProxyLocation'] = os.path.join( self.__configDefaults['WorkDirectory'], '.shifterCred') if isinstance(properties, dict): for key in properties: self.__moduleProperties[key] = properties[key] self.__moduleProperties['executors'] = [(self.execute, ())] self.__moduleProperties['shifterProxy'] = False self.__monitorLastStatsUpdate = -1 self.monitor = None self.__initializeMonitor() self.__initialized = False def __getCodeInfo(self): versionVar = "__RCSID__" docVar = "__doc__" try: self.__agentModule = __import__(self.__class__.__module__, globals(), locals(), versionVar) except Exception as excp: self.log.exception("Cannot load agent module", lException=excp) for prop in ((versionVar, "version"), (docVar, "description")): try: self.__codeProperties[prop[1]] = getattr( self.__agentModule, prop[0]) except Exception: self.log.error("Missing property", prop[0]) self.__codeProperties[prop[1]] = 'unset' self.__codeProperties['DIRACVersion'] = DIRAC.version self.__codeProperties['platform'] = DIRAC.getPlatform() def am_initialize(self, *initArgs): """ Common initialization for all the agents. This is executed every time an agent (re)starts. This is called by the AgentReactor, should not be overridden. 
""" agentName = self.am_getModuleParam('fullName') result = self.initialize(*initArgs) if not isReturnStructure(result): return S_ERROR("initialize must return S_OK/S_ERROR") if not result['OK']: return S_ERROR("Error while initializing %s: %s" % (agentName, result['Message'])) mkDir(self.am_getControlDirectory()) workDirectory = self.am_getWorkDirectory() mkDir(workDirectory) # Set the work directory in an environment variable available to subprocesses if needed os.environ['AGENT_WORKDIRECTORY'] = workDirectory self.__moduleProperties['shifterProxy'] = self.am_getOption( 'shifterProxy') if self.am_monitoringEnabled() and not self.activityMonitoring: self.monitor.enable() if len(self.__moduleProperties['executors']) < 1: return S_ERROR("At least one executor method has to be defined") if not self.am_Enabled(): return S_ERROR("Agent is disabled via the configuration") self.log.notice("=" * 40) self.log.notice("Loaded agent module %s" % self.__moduleProperties['fullName']) self.log.notice(" Site: %s" % DIRAC.siteName()) self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup")) self.log.notice(" Base Module version: %s " % __RCSID__) self.log.notice(" Agent version: %s" % self.__codeProperties['version']) self.log.notice(" DIRAC version: %s" % DIRAC.version) self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform()) pollingTime = int(self.am_getOption('PollingTime')) if pollingTime > 3600: self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.)) else: self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime')) self.log.notice(" Control dir: %s" % self.am_getControlDirectory()) self.log.notice(" Work dir: %s" % self.am_getWorkDirectory()) if self.am_getOption('MaxCycles') > 0: self.log.notice(" Cycles: %s" % self.am_getMaxCycles()) else: self.log.notice(" Cycles: unlimited") if self.am_getWatchdogTime() > 0: self.log.notice(" Watchdog interval: %s" % self.am_getWatchdogTime()) else: self.log.notice(" Watchdog interval: disabled ") self.log.notice("=" * 40) self.__initialized = True return S_OK() def am_getControlDirectory(self): return os.path.join(self.__basePath, str(self.am_getOption('ControlDirectory'))) def am_getStopAgentFile(self): return os.path.join(self.am_getControlDirectory(), 'stop_agent') def am_checkStopAgentFile(self): return os.path.isfile(self.am_getStopAgentFile()) def am_createStopAgentFile(self): try: with open(self.am_getStopAgentFile(), 'w') as fd: fd.write('Dirac site agent Stopped at %s' % Time.toString()) except Exception: pass def am_removeStopAgentFile(self): try: os.unlink(self.am_getStopAgentFile()) except Exception: pass def am_getBasePath(self): return self.__basePath def am_getWorkDirectory(self): return os.path.join(self.__basePath, str(self.am_getOption('WorkDirectory'))) def am_getShifterProxyLocation(self): return os.path.join(self.__basePath, str(self.am_getOption('shifterProxyLocation'))) def am_getOption(self, optionName, defaultValue=None): """ Gets an option from the agent's configuration section. The section will be a subsection of the /Systems section in the CS. 
""" if defaultValue is None: if optionName in self.__configDefaults: defaultValue = self.__configDefaults[optionName] if optionName and optionName[0] == "/": return gConfig.getValue(optionName, defaultValue) for section in (self.__moduleProperties['section'], self.__moduleProperties['loadSection']): result = gConfig.getOption("%s/%s" % (section, optionName), defaultValue) if result['OK']: return result['Value'] return defaultValue def am_setOption(self, optionName, value): self.__configDefaults[optionName] = value def am_getModuleParam(self, optionName): return self.__moduleProperties[optionName] def am_setModuleParam(self, optionName, value): self.__moduleProperties[optionName] = value def am_getPollingTime(self): return self.am_getOption("PollingTime") def am_getMaxCycles(self): return self.am_getOption("MaxCycles") def am_getWatchdogTime(self): return int(self.am_getOption("WatchdogTime")) def am_getCyclesDone(self): return self.am_getModuleParam('cyclesDone') def am_Enabled(self): return self.am_getOption("Enabled") def am_disableMonitoring(self): self.am_setOption('MonitoringEnabled', False) def am_monitoringEnabled(self): return self.am_getOption("MonitoringEnabled") def am_stopExecution(self): self.am_setModuleParam('alive', False) def __initializeMonitor(self): """ Initialize the system monitoring. """ # This flag is used to activate ES based monitoring # if the "EnableActivityMonitoring" flag in "yes" or "true" in the cfg file. self.activityMonitoring = ( Operations().getValue("EnableActivityMonitoring", False) or self.am_getOption("EnableActivityMonitoring", False)) if self.activityMonitoring: # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ComponentMonitoring") # With the help of this periodic task we commit the data to ES at an interval of 100 seconds. 
gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) else: if self.__moduleProperties['standalone']: self.monitor = gMonitor else: self.monitor = MonitoringClient() self.monitor.setComponentType(self.monitor.COMPONENT_AGENT) self.monitor.setComponentName(self.__moduleProperties['fullName']) self.monitor.initialize() self.monitor.registerActivity('CPU', "CPU Usage", 'Framework', "CPU,%", self.monitor.OP_MEAN, 600) self.monitor.registerActivity('MEM', "Memory Usage", 'Framework', 'Memory,MB', self.monitor.OP_MEAN, 600) # Component monitor for field in ('version', 'DIRACVersion', 'description', 'platform'): self.monitor.setComponentExtraParam( field, self.__codeProperties[field]) self.monitor.setComponentExtraParam('startTime', Time.dateTime()) self.monitor.setComponentExtraParam('cycles', 0) self.monitor.disable() self.__monitorLastStatsUpdate = time.time() def am_secureCall(self, functor, args=(), name=False): if not name: name = str(functor) try: result = functor(*args) if not isReturnStructure(result): raise Exception( "%s method for %s module has to return S_OK/S_ERROR" % (name, self.__moduleProperties['fullName'])) return result except Exception as e: self.log.exception("Agent exception while calling method %s" % name, lException=e) return S_ERROR("Exception while calling %s method: %s" % (name, str(e))) def _setShifterProxy(self): if self.__moduleProperties["shifterProxy"]: result = setupShifterProxyInEnv( self.__moduleProperties["shifterProxy"], self.am_getShifterProxyLocation()) if not result['OK']: self.log.error("Failed to set shifter proxy", result['Message']) return result return S_OK() def am_go(self): # Set the shifter proxy if required result = self._setShifterProxy() if not result['OK']: return result self.log.notice("-" * 40) self.log.notice("Starting cycle for module %s" % self.__moduleProperties['fullName']) mD = self.am_getMaxCycles() if mD > 0: cD = self.__moduleProperties['cyclesDone'] self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD)) self.log.notice("-" * 40) # use SIGALARM as a watchdog interrupt if enabled watchdogInt = self.am_getWatchdogTime() if watchdogInt > 0: signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.alarm(watchdogInt) elapsedTime = time.time() cpuStats = self._startReportToMonitoring() cycleResult = self.__executeModuleCycle() if cpuStats: self._endReportToMonitoring(*cpuStats) # Increment counters self.__moduleProperties['cyclesDone'] += 1 # Show status elapsedTime = time.time() - elapsedTime self.__moduleProperties['totalElapsedTime'] += elapsedTime self.log.notice("-" * 40) self.log.notice("Agent module %s run summary" % self.__moduleProperties['fullName']) self.log.notice(" Executed %s times previously" % self.__moduleProperties['cyclesDone']) self.log.notice(" Cycle took %.2f seconds" % elapsedTime) averageElapsedTime = self.__moduleProperties[ 'totalElapsedTime'] / self.__moduleProperties['cyclesDone'] self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime)) elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption( 'PollingTime') self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime')) self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate) if cycleResult['OK']: self.log.notice(" Cycle was successful") if self.activityMonitoring: # Here we record the data about the cycle duration along with some basic details about the # component and right now it isn't committed to the ES backend. 
self.activityMonitoringReporter.addRecord({ 'timestamp': int(Time.toEpoch()), 'host': Network.getFQDN(), 'componentType': "agent", 'component': "_".join(self.__moduleProperties['fullName'].split("/")), 'cycleDuration': elapsedTime, 'cycles': 1 }) else: self.log.warn(" Cycle had an error:", cycleResult['Message']) self.log.notice("-" * 40) # Update number of cycles if not self.activityMonitoring: self.monitor.setComponentExtraParam( 'cycles', self.__moduleProperties['cyclesDone']) # cycle finished successfully, cancel watchdog if watchdogInt > 0: signal.alarm(0) return cycleResult def _startReportToMonitoring(self): try: if not self.activityMonitoring: now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return (now, cpuTime) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB('VmRSS:') if membytes: mem = membytes / (1024. * 1024.) gMonitor.addMark('MEM', mem) return (now, cpuTime) else: return False except Exception: return False def _endReportToMonitoring(self, initialWallTime, initialCPUTime): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = 0 if wallTime: percentage = cpuTime / wallTime * 100. if percentage > 0: gMonitor.addMark('CPU', percentage) def __executeModuleCycle(self): # Execute the beginExecution function result = self.am_secureCall(self.beginExecution, name="beginExecution") if not result['OK']: return result # Launch executor functions executors = self.__moduleProperties['executors'] if len(executors) == 1: result = self.am_secureCall(executors[0][0], executors[0][1]) if not result['OK']: return result else: exeThreads = [ threading.Thread(target=executor[0], args=executor[1]) for executor in executors ] for thread in exeThreads: thread.setDaemon(1) thread.start() for thread in exeThreads: thread.join() # Execute the endExecution function return self.am_secureCall(self.endExecution, name="endExecution") def initialize(self, *args, **kwargs): """ Agents should override this method for specific initialization. Executed at every agent (re)start. """ return S_OK() def beginExecution(self): return S_OK() def endExecution(self): return S_OK() def finalize(self): return S_OK() def execute(self): return S_ERROR("Execute method has to be overridden by agent module") def __activityMonitoringReporting(self): """ This method is called by the ThreadScheduler as a periodic task in order to commit the collected data, which is done by the MonitoringReporter, and is sent to the 'ComponentMonitoring' type. :return: True / False """ result = self.activityMonitoringReporter.commit() return result['OK']
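# --- Illustrative sketch (not part of the DIRAC sources) ---------------------
# am_go() above arms a SIGALRM-based watchdog around each cycle: the alarm is
# set to WatchdogTime seconds before the cycle runs and cancelled with
# alarm(0) once it returns, so a stuck cycle receives SIGALRM (left at the
# default action there). The bare pattern, runnable on POSIX, with a short
# timeout and a raising handler instead of SIG_DFL so the demo survives:

import signal
import time


def runCycleWithWatchdog(cycle, watchdogSeconds):
    def onAlarm(signum, frame):
        raise TimeoutError("cycle exceeded %ss watchdog" % watchdogSeconds)

    signal.signal(signal.SIGALRM, onAlarm)
    signal.alarm(watchdogSeconds)      # arm the watchdog
    try:
        return cycle()
    finally:
        signal.alarm(0)                # cycle finished, cancel the watchdog


if __name__ == "__main__":
    print(runCycleWithWatchdog(lambda: "quick cycle ok", 2))
    try:
        runCycleWithWatchdog(lambda: time.sleep(5), 2)
    except TimeoutError as exc:
        print(exc)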
class RequestTask(object): """ .. class:: RequestTask request's processing task """ def __init__(self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None, rmsMonitoring=False): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict handlersDict: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger( "pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error("Cannot setup shifter proxies", shifterProxies["Message"]) # This flag is set and sent from the RequestExecutingAgent and is False by default. self.rmsMonitoring = rmsMonitoring if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient def __setupManagerProxies(self): """setup grid proxy for all defined managers""" oHelper = Operations() shifters = oHelper.getSections("Shifter") if not shifters["OK"]: return shifters shifters = shifters["Value"] for shifter in shifters: shifterDict = oHelper.getOptionsDict("Shifter/%s" % shifter) if not shifterDict["OK"]: self.log.error("Cannot get options dict for shifter", "%s: %s" % (shifter, shifterDict["Message"])) continue userName = shifterDict["Value"].get("User", "") userGroup = shifterDict["Value"].get("Group", "") userDN = Registry.getDNForUsername(userName) if not userDN["OK"]: self.log.error("Cannot get DN For Username", "%s: %s" % (userName, userDN["Message"])) continue userDN = userDN["Value"][0] vomsAttr = Registry.getVOMSAttributeForGroup(userGroup) if vomsAttr: self.log.debug( "getting VOMS [%s] proxy for shifter %s@%s (%s)" % (vomsAttr, userName, userGroup, userDN)) getProxy = gProxyManager.downloadVOMSProxyToFile( userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) else: self.log.debug("getting proxy for shifter %s@%s (%s)" % (userName, userGroup, userDN)) getProxy = gProxyManager.downloadProxyToFile( userDN, userGroup, requiredTimeLeft=1200, cacheTime=4 * 43200) if not getProxy["OK"]: return S_ERROR("unable to setup shifter proxy for %s: %s" % (shifter, getProxy["Message"])) chain = getProxy["chain"] fileName = getProxy["Value"] self.log.debug("got %s: %s %s" % (shifter, userName, userGroup)) self.__managersDict[shifter] = { "ShifterDN": userDN, "ShifterName": userName, "ShifterGroup": userGroup, "Chain": chain, "ProxyFile": fileName, } return S_OK() def setupProxy(self): """download and dump request owner proxy to file and env :return: S_OK with name of newly created owner proxy file and shifter name if any """ self.__managersDict = {}
shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error(shifterProxies["Message"]) ownerDN = self.request.OwnerDN ownerGroup = self.request.OwnerGroup isShifter = [] for shifter, creds in self.__managersDict.items(): if creds["ShifterDN"] == ownerDN and creds[ "ShifterGroup"] == ownerGroup: isShifter.append(shifter) if isShifter: proxyFile = self.__managersDict[isShifter[0]]["ProxyFile"] os.environ["X509_USER_PROXY"] = proxyFile return S_OK({"Shifter": isShifter, "ProxyFile": proxyFile}) # # if we're here owner is not a shifter at all ownerProxyFile = gProxyManager.downloadVOMSProxyToFile( ownerDN, ownerGroup) if not ownerProxyFile["OK"] or not ownerProxyFile["Value"]: reason = ownerProxyFile.get( "Message", "No valid proxy found in ProxyManager.") return S_ERROR("Change proxy error for '%s'@'%s': %s" % (ownerDN, ownerGroup, reason)) ownerProxyFile = ownerProxyFile["Value"] os.environ["X509_USER_PROXY"] = ownerProxyFile return S_OK({"Shifter": isShifter, "ProxyFile": ownerProxyFile}) @staticmethod def getPluginName(pluginPath): """return plugin name""" if not pluginPath: return "" if "/" in pluginPath: pluginPath = ".".join( [chunk for chunk in pluginPath.split("/") if chunk]) return pluginPath.split(".")[-1] def loadHandler(self, pluginPath): """Create an instance of requested plugin class, loading and importing it when needed. This function could raise ImportError when the plugin cannot be found, or TypeError when the loaded class object doesn't inherit from the BaseOperation class. :param str pluginPath: dotted path to plugin, specified as in import statement, i.e. "DIRAC.CheesShopSystem.private.Cheddar" or alternatively in 'normal' path format "DIRAC/CheesShopSystem/private/Cheddar" :return: object instance This function tries to load and instantiate an object from the given path. It is assumed that: * `pluginPath` points to a module directory "importable" by the python interpreter, i.e. its package's top-level directory is in the $PYTHONPATH env variable, * the module contains a class definition named after the module, * the class itself is inherited from DIRAC.RequestManagementSystem.private.BaseOperation.BaseOperation If the above conditions aren't met, the function raises exceptions: :raises ImportError: when class cannot be imported :raises TypeError: when class isn't inherited from OperationHandlerBase """ if "/" in pluginPath: pluginPath = ".".join( [chunk for chunk in pluginPath.split("/") if chunk]) pluginName = pluginPath.split(".")[-1] if pluginName not in globals(): mod = __import__(pluginPath, globals(), fromlist=[pluginName]) pluginClassObj = getattr(mod, pluginName) else: pluginClassObj = globals()[pluginName] if not issubclass(pluginClassObj, OperationHandlerBase): raise TypeError( "operation handler '%s' isn't inherited from OperationHandlerBase class" % pluginName) if not self.rmsMonitoring: for key, status in (("Att", "Attempted"), ("OK", "Successful"), ("Fail", "Failed")): gMonitor.registerActivity( "%s%s" % (pluginName, key), "%s operations %s" % (pluginName, status), "RequestExecutingAgent", "Operations/min", gMonitor.OP_SUM, ) # # return an instance return pluginClassObj def getHandler(self, operation): """return instance of a handler for a given operation type on demand; all created handlers are kept in the self.handlers dict for further use :param ~Operation.Operation operation: Operation instance """ if operation.Type not in self.handlersDict: return S_ERROR("handler for operation '%s' not set" % operation.Type) handler = self.handlers.get(operation.Type, None) if not handler: try: handlerCls = self.loadHandler( self.handlersDict[operation.Type]) self.handlers[operation.Type] = handlerCls( csPath="%s/OperationHandlers/%s" % (self.csPath, operation.Type)) handler = self.handlers[operation.Type] except (ImportError, TypeError) as error: self.log.exception("Error getting Handler", "%s" % error, lException=error) return S_ERROR(str(error)) # # set operation for this handler handler.setOperation(operation) # # and return return S_OK(handler) def updateRequest(self): """put back request to the RequestDB""" updateRequest = self.requestClient.putRequest(self.request, useFailoverProxy=False, retryMainService=2) if not updateRequest["OK"]: self.log.error("Cannot updateRequest", updateRequest["Message"]) return updateRequest def __call__(self): """request processing""" self.log.debug("about to execute request") if not self.rmsMonitoring: gMonitor.addMark("RequestAtt", 1) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: userSuspended = "User is currently suspended" self.request.Error = setupProxy["Message"] # In case the user does not have a proxy if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND): self.log.error("Error setting proxy.
Request set to Failed:", setupProxy["Message"]) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = "Failed" operation.Status = "Failed" elif userSuspended in setupProxy["Message"]: # If user is suspended, wait for a long time self.request.delayNextExecution(6 * 60) self.request.Error = userSuspended self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN) else: self.log.error("Error setting proxy", setupProxy["Message"]) return S_OK(self.request) shifter = setupProxy["Value"]["Shifter"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error("Cannot get waiting operation", operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation", "%s" % operation.Type) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"])) # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 ) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # set rmsMonitoring flag for the RequestOperation handler.rmsMonitoring = self.rmsMonitoring # # and execute pluginName = self.getPluginName( self.handlersDict.get(operation.Type)) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Attempted", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Att"), 1) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "false") exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") if not exe["OK"]: self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"])) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("RequestFail", 1) if self.request.JobID: # Check if the job exists monitorServer = JobMonitoringClient( useCertificates=True) res = monitorServer.getJobSummary( int(self.request.JobID)) if not res["OK"]: self.log.error( "RequestTask: Failed to get job status", "%d" % self.request.JobID) elif not res["Value"]: self.log.warn( "RequestTask: job does not exist (anymore): failed request", "JobID: %d" % self.request.JobID, ) for opFile in operation: opFile.Status = "Failed" if operation.Status != "Failed": operation.Status = "Failed" self.request.Error = "Job no longer exists" except 
Exception as e: error = str(e) self.log.exception("hit by exception:", "%s" % error) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("RequestFail", 1) if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") break # # operation status check if operation.Status == "Done" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Successful", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "OK"), 1) elif operation.Status == "Failed" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break if not self.rmsMonitoring: gMonitor.flush() if error: return S_ERROR(error) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info("Updating request status:", "%s" % self.request.Status) update = self.updateRequest() if not update["OK"]: self.log.error("Cannot update request status", update["Message"]) return update self.log.info("request is done", "%s" % self.request.RequestName) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": getattr(self.request, "RequestID", 0), "status": "Successful", "nbObject": 1, }) else: gMonitor.addMark("RequestOK", 1) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestID, self.request.JobID # pylint: disable=no-member ) if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request, will retry", "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]), ) self.log.debug("Waiting 10 seconds") attempts += 1 if attempts == 10: self.log.error("Giving up finalize request") return S_ERROR("Could not finalize request") time.sleep(10) else: self.log.info( "request is finalized", "ReqName %s %s" % (self.request.RequestName, (" after %d attempts" % attempts) if attempts else ""), ) break # Commit all the data to the ES Backend if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # Request will be updated by the callBack method self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status) return S_OK(self.request)
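# --- Illustrative sketch (not part of the DIRAC sources) ---------------------
# loadHandler() above implements the DIRAC plugin convention: a dotted (or
# slash-separated) path whose last component is both the module name and the
# class name, imported on demand and type-checked against a base class. A
# stand-alone rendition (using importlib instead of the raw __import__ call,
# and without the globals() cache) that fabricates a tiny plugin on disk so
# the demo stays runnable:

import importlib
import os
import sys
import tempfile


class OperationHandlerBase(object):   # stand-in for the real base class
    pass


def loadHandler(pluginPath, base=OperationHandlerBase):
    if "/" in pluginPath:
        pluginPath = ".".join(chunk for chunk in pluginPath.split("/") if chunk)
    pluginName = pluginPath.split(".")[-1]
    mod = importlib.import_module(pluginPath)
    cls = getattr(mod, pluginName)
    if not issubclass(cls, base):
        raise TypeError("'%s' isn't inherited from %s" % (pluginName, base.__name__))
    return cls


if __name__ == "__main__":
    # fabricate a module 'Cheddar' containing a class 'Cheddar'
    tmp = tempfile.mkdtemp()
    with open(os.path.join(tmp, "Cheddar.py"), "w") as fd:
        fd.write("from __main__ import OperationHandlerBase\n"
                 "class Cheddar(OperationHandlerBase):\n    pass\n")
    sys.path.insert(0, tmp)
    print(loadHandler("Cheddar"))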
class ReTransfer(DMSRequestOperationsBase): """ .. class:: ReTransfer online ReTransfer operation handler :param self: self reference :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance :param str csPath: CS path for this handler """ def __init__(self, operation=None, csPath=None): """c'tor""" # # base class ctor DMSRequestOperationsBase.__init__(self, operation, csPath) def __call__(self): """reTransfer operation execution""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") # # list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal targetSE = targetSEs[0] bannedTargets = self.checkSEsRSS(targetSE) if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for writing" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare waiting files toRetransfer = dict([(opFile.PFN, opFile) for opFile in waitingFiles]) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(toRetransfer))) if len(targetSEs) != 1: error = "only one TargetSE allowed, got %d" % len(targetSEs) for opFile in toRetransfer.values(): opFile.Error = error opFile.Status = "Failed" self.operation.Error = error if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", len(toRetransfer))) self.rmsMonitoringReporter.commit() return S_ERROR(error) se = StorageElement(targetSE) for opFile in toRetransfer.values(): reTransfer = se.retransferOnlineFile(opFile.LFN) if not reTransfer["OK"]: opFile.Error = reTransfer["Message"] self.log.error("Retransfer failed", opFile.Error) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) continue reTransfer = reTransfer["Value"] if opFile.LFN in reTransfer["Failed"]: opFile.Error = reTransfer["Failed"][opFile.LFN] self.log.error("Retransfer failed", opFile.Error) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) continue opFile.Status = "Done" self.log.info("%s retransfer done" % opFile.LFN) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
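# Illustration, not part of the original handler: the return contract the loop above
# relies on. StorageElement.retransferOnlineFile is expected to yield a DIRAC return
# structure whose "Value" splits LFNs into "Successful" and "Failed"; the payload
# below is a hypothetical example.
exampleReTransferResult = S_OK({
    "Successful": {"/vo/data/file_a": True},
    "Failed": {"/vo/data/file_b": "No such file or directory"},
})
# The handler marks an opFile "Done" only when its LFN is absent from "Failed";
# otherwise the per-LFN message is copied into opFile.Error.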
class DataOperationSender:
    """
    class:: DataOperationSender
    It reads the MonitoringBackends option to decide whether to send and commit
    data operation records to Accounting, to Monitoring, or to both.
    """

    # Initialize the object so that the Reporters are created only once
    def __init__(self):
        monitoringType = "DataOperation"
        # Will use the `MonitoringBackends/Default` value as monitoring backend unless a flag
        # for `MonitoringBackends/DataOperation` is set.
        self.monitoringOptions = Operations().getMonitoringBackends(monitoringType)
        if "Monitoring" in self.monitoringOptions:
            self.dataOperationReporter = MonitoringReporter(monitoringType)
        if "Accounting" in self.monitoringOptions:
            self.dataOp = DataOperation()

    def sendData(self, baseDict, commitFlag=False, delayedCommit=False, startTime=False, endTime=False):
        """
        Sends the input to Monitoring or Accounting based on the monitoringOptions

        :param dict baseDict: contains a key/value pair
        :param bool commitFlag: decides whether to commit the record or not.
        :param bool delayedCommit: decides whether to commit the record with delay (only for sending to Accounting)
        :param int startTime: epoch time, start time of the plot
        :param int endTime: epoch time, end time of the plot
        """

        # The nested helpers are closures over baseDict and the flags; they are looked up
        # in locals() by backend name and called without arguments, so they must not take
        # a `self` parameter, and they must return S_OK/S_ERROR for the dispatch loop below.
        def sendMonitoring():
            baseDict["ExecutionSite"] = DIRAC.siteName()
            baseDict["Channel"] = baseDict["Source"] + "->" + baseDict["Destination"]
            self.dataOperationReporter.addRecord(baseDict)
            if commitFlag:
                result = self.dataOperationReporter.commit()
                sLog.debug("Committing data operation to monitoring")
                if not result["OK"]:
                    sLog.error("Could not commit data operation to monitoring", result["Message"])
                    return result
                sLog.debug("Done committing to monitoring")
            return S_OK()

        def sendAccounting():
            self.dataOp.setValuesFromDict(baseDict)
            if startTime:
                self.dataOp.setStartTime(startTime)
                self.dataOp.setEndTime(endTime)
            else:
                self.dataOp.setStartTime()
                self.dataOp.setEndTime()
            # Adding only to register
            if not commitFlag and not delayedCommit:
                return gDataStoreClient.addRegister(self.dataOp)
            # Adding to register and committing
            if commitFlag and not delayedCommit:
                gDataStoreClient.addRegister(self.dataOp)
                result = gDataStoreClient.commit()
                sLog.debug("Committing data operation to accounting")
                if not result["OK"]:
                    sLog.error("Could not commit data operation to accounting", result["Message"])
                    return result
                sLog.debug("Done committing to accounting")
            # Only late committing
            else:
                result = self.dataOp.delayedCommit()
                if not result["OK"]:
                    sLog.error("Could not delay-commit data operation to accounting", result["Message"])
                    return result
            return S_OK()

        # Send data and commit, prioritizing the first monitoring option in the list
        for backend in self.monitoringOptions:
            func = locals()[f"send{backend}"]
            res = func()
            if not res["OK"]:
                return res
        return S_OK()

    # Call this method in order to commit all records added but not yet committed
    # to Accounting and Monitoring
    def concludeSending(self):
        def commitAccounting():
            result = gDataStoreClient.commit()
            sLog.debug("Concluding the sending and committing data operation to accounting")
            if not result["OK"]:
                sLog.error("Could not commit data operation to accounting", result["Message"])
            sLog.debug("Committing to accounting concluded")
            return result

        def commitMonitoring():
            result = self.dataOperationReporter.commit()
            sLog.debug("Committing data operation to monitoring")
            if not result["OK"]:
                sLog.error("Could not commit data operation to monitoring", result["Message"])
            sLog.debug("Committing to monitoring concluded")
            return result

        # Commit data, prioritizing the first monitoring option in the list
        for backend in self.monitoringOptions:
            func = locals()[f"commit{backend}"]
            res = func()
            if not res["OK"]:
                return res
        return S_OK()
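# A minimal usage sketch (illustrative, not part of the original class). Only "Source"
# and "Destination" are required by sendMonitoring() above, which derives "Channel"
# from them; the remaining keys are assumptions modelled on typical DataOperation
# accounting fields.
def _exampleSendDataUsage():
    sender = DataOperationSender()
    record = {
        "OperationType": "putAndRegister",  # assumed field
        "Source": "CERN-USER",  # hypothetical SE names
        "Destination": "CNAF-USER",
        "TransferTotal": 1,  # assumed field
        "TransferOK": 1,  # assumed field
    }
    # Register and commit in one go; sendData() dispatches to every configured backend.
    result = sender.sendData(record, commitFlag=True)
    if not result["OK"]:
        sLog.error(result["Message"])
    # Flush anything that was added/registered but not yet committed.
    return sender.concludeSending()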
class ReplicateAndRegister(DMSRequestOperationsBase):
    """
    .. class:: ReplicateAndRegister

    ReplicateAndRegister operation handler
    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        super(ReplicateAndRegister, self).__init__(operation, csPath)
        # # SE cache
        # Clients
        self.fc = FileCatalog()

    def __call__(self):
        """call me maybe"""
        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # own gMonitor stuff for files
            gMonitor.registerActivity(
                "ReplicateAndRegisterAtt",
                "Replicate and register attempted",
                "RequestExecutingAgent",
                "Files/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity(
                "ReplicateOK", "Replications successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "ReplicateFail", "Replications failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RegisterOK", "Registrations successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RegisterFail", "Registrations failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            # # for FTS
            gMonitor.registerActivity(
                "FTSScheduleAtt", "Files schedule attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "FTSScheduleOK", "File schedule successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "FTSScheduleFail", "File schedule failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )

        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error("Failed to check replicas", checkReplicas["Message"])
        if hasattr(self, "FTSMode") and getattr(self, "FTSMode"):
            bannedGroups = getattr(self, "FTSBannedGroups") if hasattr(self, "FTSBannedGroups") else ()
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose("usage of FTS system is banned for request's owner")
                return self.dmTransfer()
            return self.fts3Transfer()
        return self.dmTransfer()

    def __checkReplicas(self):
        """check done replicas and update file states"""
        waitingFiles = dict(
            [(opFile.LFN, opFile) for opFile in self.operation if opFile.Status in ("Waiting", "Scheduled")]
        )
        targetSESet = set(self.operation.targetSEList)

        replicas = self.fc.getReplicas(list(waitingFiles))
        if not replicas["OK"]:
            self.log.error("Failed to get replicas", replicas["Message"])
            return replicas

        reMissing = re.compile(r".*such file.*")
        for failedLFN, errStr in replicas["Value"]["Failed"].items():
            waitingFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                self.log.error("File does not exist", failedLFN)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[failedLFN].Status = "Failed"

        for successfulLFN, reps in replicas["Value"]["Successful"].items():
            if targetSESet.issubset(set(reps)):
                self.log.info("file replicated to all targets", successfulLFN)
                waitingFiles[successfulLFN].Status = "Done"

        return S_OK()

    def _addMetadataToFiles(self, toSchedule):
        """Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary: {'lfn1': opFile, 'lfn2': opFile}
        """
        if toSchedule:
self.log.info( "found %s files to schedule, getting metadata from FC" % len(toSchedule)) else: self.log.verbose("No files to schedule") return S_OK([]) res = self.fc.getFileMetadata(list(toSchedule)) if not res["OK"]: return res else: if res["Value"]["Failed"]: self.log.warn( "Can't schedule %d files: problems getting the metadata: %s" % (len(res["Value"]["Failed"]), ", ".join( res["Value"]["Failed"]))) metadata = res["Value"]["Successful"] filesToSchedule = {} for lfn, lfnMetadata in metadata.items(): opFileToSchedule = toSchedule[lfn][0] opFileToSchedule.GUID = lfnMetadata["GUID"] # In principle this is defined already in filterReplicas() if not opFileToSchedule.Checksum: opFileToSchedule.Checksum = metadata[lfn]["Checksum"] opFileToSchedule.ChecksumType = metadata[lfn]["ChecksumType"] opFileToSchedule.Size = metadata[lfn]["Size"] filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule return S_OK(filesToSchedule) def _filterReplicas(self, opFile): """filter out banned/invalid source SEs""" return filterReplicas(opFile, logger=self.log, dataManager=self.dm, opSources=self.operation.sourceSEList) def _checkExistingFTS3Operations(self): """ Check if there are ongoing FTS3Operation for the current RMS Operation Under some conditions, we can be trying to schedule files while there is still an FTS transfer going on. This typically happens when the REA hangs. To prevent further race condition, we check if there are FTS3Operations in a non Final state matching the current operation ID. If so, we put the corresponding files in scheduled mode. We will then wait till the FTS3 Operation performs the callback :returns: S_OK with True if we can go on, False if we should stop the processing """ res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID) if not res["OK"]: self.log.debug("Could not get FTS3Operations matching OperationID", self.operation.OperationID) return res existingFTSOperations = res["Value"] # It is ok to have FTS Operations in a final state, so we # care only about the others unfinishedFTSOperations = [ ops for ops in existingFTSOperations if ops.status not in FTS3TransferOperation.FINAL_STATES ] if not unfinishedFTSOperations: self.log.debug("No ongoing FTS3Operations, all good") return S_OK(True) self.log.warn( "Some FTS3Operations already exist for the RMS Operation:", [op.operationID for op in unfinishedFTSOperations], ) # This would really be a screwed up situation ! 
if len(unfinishedFTSOperations) > 1: self.log.warn("That's a serious problem !!") # We take the rmsFileID of the files in the Operations, # find the corresponding File object, and set them scheduled rmsFileIDsToSetScheduled = set([ ftsFile.rmsFileID for ftsOp in unfinishedFTSOperations for ftsFile in ftsOp.ftsFiles ]) for opFile in self.operation: # If it is in the DB, it has a FileID opFileID = opFile.FileID if opFileID in rmsFileIDsToSetScheduled: self.log.warn("Setting RMSFile as already scheduled", opFileID) opFile.Status = "Scheduled" # We return here such that the Request is set back to Scheduled in the DB # With no further modification return S_OK(False) def fts3Transfer(self): """replicate and register using FTS3""" self.log.info("scheduling files in FTS3...") # Check first if we do not have ongoing transfers res = self._checkExistingFTS3Operations() if not res["OK"]: return res # if res['Value'] is False # it means that there are ongoing transfers # and we should stop here if res["Value"] is False: # return S_OK such that the request is put back return S_OK() fts3Files = [] toSchedule = {} # Dict which maps the FileID to the object rmsFilesIds = {} if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(self.getWaitingFilesList()))) for opFile in self.getWaitingFilesList(): rmsFilesIds[opFile.FileID] = opFile opFile.Error = "" if not self.rmsMonitoring: gMonitor.addMark("FTSScheduleAtt") # # check replicas replicas = self._filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas["NoReplicas"] badReplicas = replicas["Bad"] noPFN = replicas["NoPFN"] if validReplicas: validTargets = list( set(self.operation.targetSEList) - set(validReplicas)) if not validTargets: self.log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" else: toSchedule[opFile.LFN] = [opFile, validTargets] else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("FTSScheduleFail") if noMetaReplicas: self.log.warn( "unable to schedule file", "'%s': couldn't get metadata at %s" % (opFile.LFN, ",".join(noMetaReplicas)), ) opFile.Error = "Couldn't get metadata" elif noReplicas: self.log.error( "Unable to schedule transfer", "File %s doesn't exist at %s" % (opFile.LFN, ",".join(noReplicas)), ) opFile.Error = "No replicas found" opFile.Status = "Failed" elif badReplicas: self.log.error( "Unable to schedule transfer", "File %s, all replicas have a bad checksum at %s" % (opFile.LFN, ",".join(badReplicas)), ) opFile.Error = "All replicas have a bad checksum" opFile.Status = "Failed" elif noPFN: self.log.warn( "unable to schedule %s, could not get a PFN at %s" % (opFile.LFN, ",".join(noPFN))) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() res = self._addMetadataToFiles(toSchedule) if not res["OK"]: return res else: filesToSchedule = res["Value"] for lfn in filesToSchedule: opFile = filesToSchedule[lfn] validTargets = toSchedule[lfn][1] for targetSE in validTargets: ftsFile = FTS3File.fromRMSFile(opFile, targetSE) fts3Files.append(ftsFile) if fts3Files: res = Registry.getUsernameForDN(self.request.OwnerDN) if not res["OK"]: self.log.error( "Cannot get username for DN", "%s %s" % (self.request.OwnerDN, res["Message"])) return res username = res["Value"] fts3Operation = FTS3TransferOperation.fromRMSObjects( self.request, self.operation, username) 
fts3Operation.ftsFiles = fts3Files try: if not fts3Operation.activity: vo = getVOfromProxyGroup().get("Value") fts3Plugin = getFTS3Plugin(vo=vo) fts3Operation.activity = fts3Plugin.inferFTSActivity( fts3Operation, self.request, self.operation) except Exception: pass ftsSchedule = FTS3Client().persistOperation(fts3Operation) if not ftsSchedule["OK"]: self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"]) return ftsSchedule # might have nothing to schedule ftsSchedule = ftsSchedule["Value"] self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule) self.log.info("%d files have been scheduled to FTS3" % len(fts3Files)) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", len(fts3Files))) for ftsFile in fts3Files: opFile = rmsFilesIds[ftsFile.rmsFileID] if not self.rmsMonitoring: gMonitor.addMark("FTSScheduleOK", 1) opFile.Status = "Scheduled" self.log.debug("%s has been scheduled for FTS" % opFile.LFN) else: self.log.info("No files to schedule after metadata checks") if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # Just in case some transfers could not be scheduled, try them with RM return self.dmTransfer(fromFTS=True) def dmTransfer(self, fromFTS=False): """replicate and register using dataManager""" # # get waiting files. If none just return # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read bannedSource = self.checkSEsRSS(sourceSE, "ReadAccess") if not bannedSource["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedSource if bannedSource["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.info(self.operation.Error) return S_OK(self.operation.Error) # # check targetSEs for write bannedTargets = self.checkSEsRSS() if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedTargets if bannedTargets["Value"]: self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets["Value"]) return S_OK(self.operation.Error) # Can continue now self.log.verbose("No targets banned for writing") waitingFiles = self.getWaitingFilesList() if not waitingFiles: return S_OK() # # loop over files if fromFTS: self.log.info( "Trying transfer using replica manager as FTS failed") else: self.log.info("Transferring files using Data manager...") errors = defaultdict(int) delayExecution = 0 if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(waitingFiles))) for opFile in waitingFiles: if opFile.Error in ( "Couldn't get metadata", "File doesn't exist", "No active replica found", "All replicas have a bad checksum", ): err = "File already in error status" errors[err] += 1 if not self.rmsMonitoring: gMonitor.addMark("ReplicateAndRegisterAtt", 1) opFile.Error = "" lfn = opFile.LFN # Check if replica is at the specified source replicas = self._filterReplicas(opFile) if not replicas["OK"]: 
self.log.error("Failed to check replicas", replicas["Message"]) continue replicas = replicas["Value"] validReplicas = replicas.get("Valid") noMetaReplicas = replicas.get("NoMetadata") noReplicas = replicas.get("NoReplicas") badReplicas = replicas.get("Bad") noActiveReplicas = replicas.get("NoActiveReplicas") if not validReplicas: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("ReplicateFail") if noMetaReplicas: err = "Couldn't get metadata" errors[err] += 1 self.log.verbose( "unable to replicate '%s', couldn't get metadata at %s" % (opFile.LFN, ",".join(noMetaReplicas))) opFile.Error = err elif noReplicas: err = "File doesn't exist" errors[err] += 1 self.log.verbose( "Unable to replicate", "File %s doesn't exist at %s" % (opFile.LFN, ",".join(noReplicas))) opFile.Error = err opFile.Status = "Failed" elif badReplicas: err = "All replicas have a bad checksum" errors[err] += 1 self.log.error( "Unable to replicate", "%s, all replicas have a bad checksum at %s" % (opFile.LFN, ",".join(badReplicas)), ) opFile.Error = err opFile.Status = "Failed" elif noActiveReplicas: err = "No active replica found" errors[err] += 1 self.log.verbose( "Unable to schedule transfer", "%s, %s at %s" % (opFile.LFN, err, ",".join(noActiveReplicas))) opFile.Error = err # All source SEs are banned, delay execution by 1 hour delayExecution = 60 continue # # get the first one in the list if sourceSE not in validReplicas: if sourceSE: err = "File not at specified source" errors[err] += 1 self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, validReplicas[0])) sourceSE = validReplicas[0] # # loop over targetSE catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(",")] for targetSE in self.operation.targetSEList: # # call DataManager if targetSE in validReplicas: self.log.warn( "Request to replicate %s to an existing location: %s" % (lfn, targetSE)) continue res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn][ "replicate"] prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime) if not self.rmsMonitoring: gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: if not self.rmsMonitoring: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] prString += " and registered in %s s." 
% regTime self.log.info(prString) else: if not self.rmsMonitoring: gMonitor.addMark("RegisterFail", 1) prString += " but failed to register" self.log.warn(prString) opFile.Error = "Failed to register" # # add register replica operation registerOperation = self.getRegisterOperation( opFile, targetSE, type="RegisterReplica") self.request.insertAfter( registerOperation, self.operation) else: self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE)) if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error("Failed to replicate and register", "File %s at %s:" % (lfn, targetSE), reason) opFile.Error = reason else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "DataManager error: %s" % res["Message"] self.log.error("DataManager error", res["Message"]) if not opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) if len(self.operation.targetSEList) > 1: self.log.info( "file %s has been replicated to all targetSEs" % lfn) opFile.Status = "Done" elif self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) # Log error counts if delayExecution: self.log.info("Delay execution of the request by %d minutes" % delayExecution) self.request.delayNextExecution(delayExecution) for error, count in errors.items(): self.log.error(error, "for %d files" % count) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
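# Illustration, not part of the original handler: dmTransfer() above unpacks
# DataManager.replicateAndRegister results of this shape (LFN and timings are
# hypothetical). "replicate" and "register" carry the per-step durations that get
# logged; a successful "replicate" without a "register" key is what triggers the
# follow-up RegisterReplica operation inserted into the request.
exampleReplicateResult = S_OK({
    "Successful": {"/vo/data/file_a": {"replicate": 12.3, "register": 0.4}},
    "Failed": {},
})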
class PhysicalRemoval(DMSRequestOperationsBase): """ .. class:: PhysicalRemoval """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance :param str csPath: cs config path """ DMSRequestOperationsBase.__init__(self, operation, csPath) def __call__(self): """perform physical removal operation""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") bannedTargets = self.checkSEsRSS(access="RemoveAccess") if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare lfn dict toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles) targetSEs = self.operation.targetSEList if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict))) self.rmsMonitoringReporter.commit() # # keep errors dict removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, "") for targetSE in targetSEs: self.log.info("removing files from %s" % targetSE) # # 1st - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error("Failed bulk removal", bulkRemoval["Message"]) self.operation.Error = bulkRemoval["Message"] return bulkRemoval bulkRemoval = bulkRemoval["Value"] for lfn, opFile in toRemoveDict.items(): removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get(lfn, "") opFile.Error = removalStatus[lfn][targetSE] # # 2nd - single file removal toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"]) for lfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: removalStatus[lfn][targetSE] = "" else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) removalStatus[lfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = [error for error in removalStatus[opFile.LFN].values() if error.strip()] if errors: failed += 1 opFile.Error = ",".join(errors) if "Write access not permitted for this credential" in opFile.Error: opFile.Status = "Failed" if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) continue if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s files" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK() def bulkRemoval(self, toRemoveDict, targetSE): """bulk removal of lfns from :targetSE: :param dict toRemoveDict: { lfn : opFile, ... 
} :param str targetSE: target SE name """ bulkRemoval = StorageElement(targetSE).removeFile(toRemoveDict) return bulkRemoval def singleRemoval(self, opFile, targetSE): """remove single file from :targetSE:""" proxyFile = None if "Write access not permitted for this credential" in opFile.Error: # # not a DataManger? set status to failed and return if "DataManager" not in self.shifter: opFile.Status = "Failed" elif not opFile.LFN: opFile.Error = "LFN not set" opFile.Status = "Failed" else: # # you're a data manager - save current proxy and get a new one for LFN and retry saveProxy = os.environ["X509_USER_PROXY"] try: proxyFile = self.getProxyForLFN(opFile.LFN) if not proxyFile["OK"]: opFile.Error = proxyFile["Message"] else: proxyFile = proxyFile["Value"] removeFile = StorageElement(targetSE).removeFile(opFile.LFN) if not removeFile["OK"]: opFile.Error = removeFile["Message"] else: removeFile = removeFile["Value"] if opFile.LFN in removeFile["Failed"]: opFile.Error = removeFile["Failed"][opFile.LFN] else: # # reset error - replica has been removed this time opFile.Error = "" finally: if proxyFile: os.unlink(proxyFile) # # put back request owner proxy to env os.environ["X509_USER_PROXY"] = saveProxy return S_OK(opFile)
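# Illustration, not part of the original handler: bulkRemoval() above returns the
# StorageElement.removeFile structure that __call__ consumes (LFNs and messages are
# hypothetical). Every LFN that shows up under "Failed" is retried one by one through
# singleRemoval(), possibly with a per-LFN proxy when the shifter is a DataManager.
exampleBulkRemovalResult = S_OK({
    "Successful": {"/vo/data/file_a": True},
    "Failed": {"/vo/data/file_b": "Write access not permitted for this credential"},
})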
class StatesAccountingAgent(AgentModule):
    """Agent that every 15 minutes will report to the AccountingDB (MySQL) or the Monitoring DB (ElasticSearch),
    or both, a snapshot of the JobDB.

    Also sends a snapshot of PilotAgentsDB to Monitoring.
    """

    # WMSHistory fields
    __summaryKeyFieldsMapping = [
        "Status",
        "Site",
        "User",
        "UserGroup",
        "JobGroup",
        "JobType",
        "ApplicationStatus",
        "MinorStatus",
    ]
    __summaryDefinedFields = [("ApplicationStatus", "unset"), ("MinorStatus", "unset")]
    __summaryValueFieldsMapping = ["Jobs", "Reschedules"]
    __renameFieldsMapping = {"JobType": "JobSplitType"}

    # PilotsHistory fields
    __pilotsMapping = ["TaskQueueID", "GridSite", "GridType", "Status", "NumOfPilots"]

    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        # Check whether to send to Monitoring or Accounting or both
        self.jobMonitoringOption = Operations().getMonitoringBackends(monitoringType="WMSHistory")
        self.pilotMonitoringOption = Operations().getMonitoringBackends(monitoringType="PilotsHistory")
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")

        self.datastores = {}  # For storing the clients to Accounting and Monitoring

        if "Accounting" in self.jobMonitoringOption:
            self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900)
        if "Monitoring" in self.jobMonitoringOption:
            self.datastores["Monitoring"] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue
            )
        if "Monitoring" in self.pilotMonitoringOption:
            self.pilotReporter = MonitoringReporter(monitoringType="PilotsHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == "User":
                field = "Owner"
            elif field == "UserGroup":
                field = "OwnerGroup"
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """Main execution method"""
        # PilotsHistory to Monitoring
        if "Monitoring" in self.pilotMonitoringOption:
            self.log.info("Committing PilotsHistory to Monitoring")

            result = PilotAgentsDB().getSummarySnapshot()
            now = datetime.datetime.utcnow()
            if not result["OK"]:
                self.log.error(
                    "Can't get the PilotAgentsDB summary",
                    "%s: won't commit PilotsHistory at this cycle" % result["Message"],
                )
            else:
                values = result["Value"][1]
                for record in values:
                    rD = {}
                    for iP, _ in enumerate(self.__pilotsMapping):
                        rD[self.__pilotsMapping[iP]] = record[iP]
                    rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                    self.pilotReporter.addRecord(rD)

                self.log.info("Committing to Monitoring...")
                result = self.pilotReporter.commit()
                if not result["OK"]:
                    self.log.error("Could not commit to Monitoring", result["Message"])
                self.log.verbose("Done committing PilotsHistory to Monitoring")

        # WMSHistory to Monitoring or Accounting
        self.log.info("Committing WMSHistory to %s backend" % " and ".join(self.jobMonitoringOption))
        result = JobDB().getSummarySnapshot(self.__jobDBFields)
        now = datetime.datetime.utcnow()
        if not result["OK"]:
            self.log.error("Can't get the JobDB summary", "%s: won't commit WMSHistory at this cycle" % result["Message"])
            return S_ERROR()

        values = result["Value"][1]
        self.log.info("Start sending WMSHistory records")
        for record in values:
            record = record[1:]
            rD = {}
            for fV in self.__summaryDefinedFields:
                rD[fV[0]] = fV[1]
            for iP, _ in enumerate(self.__summaryKeyFieldsMapping):
                fieldName = self.__summaryKeyFieldsMapping[iP]
                rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP]
            record = record[len(self.__summaryKeyFieldsMapping):]
            for iP, _ in enumerate(self.__summaryValueFieldsMapping):
                rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])

            for backend in self.datastores:
                if backend.lower() == "monitoring":
                    rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                    self.datastores["Monitoring"].addRecord(rD)
                elif backend.lower() == "accounting":
                    acWMS = WMSHistory()
                    acWMS.setStartTime(now)
                    acWMS.setEndTime(now)
                    acWMS.setValuesFromDict(rD)
                    retVal = acWMS.checkValues()
                    if not retVal["OK"]:
                        self.log.error("Invalid WMSHistory accounting record", "%s -> %s" % (retVal["Message"], rD))
                    else:
                        self.datastores["Accounting"].addRegister(acWMS)

        for backend, datastore in self.datastores.items():
            self.log.info("Committing WMSHistory records to %s backend" % backend)
            result = datastore.commit()
            if not result["OK"]:
                self.log.error("Couldn't commit WMSHistory to %s" % backend, result["Message"])
                return S_ERROR()
            self.log.verbose("Done committing WMSHistory to %s backend" % backend)

        return S_OK()
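# Worked example (illustrative, not part of the agent): how execute() above turns one
# JobDB snapshot row into a WMSHistory record. The row values are hypothetical. The
# leading element is dropped by `record = record[1:]`, the next eight columns fill the
# key fields (with "JobType" renamed to "JobSplitType"), and the trailing two columns
# become the integer counters "Jobs" and "Reschedules".
exampleRow = ("Production", "Running", "LCG.CERN.ch", "jdoe", "dirac_user",
              "prodID-42", "User", "unset", "unset", "12", "1")
# Resulting record (plus "timestamp" when sent to Monitoring):
# {"ApplicationStatus": "unset", "MinorStatus": "unset", "Status": "Running",
#  "Site": "LCG.CERN.ch", "User": "jdoe", "UserGroup": "dirac_user",
#  "JobGroup": "prodID-42", "JobSplitType": "User", "Jobs": 12, "Reschedules": 1}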
class StatesMonitoringAgent(AgentModule):
    """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    __summaryKeyFieldsMapping = [
        'Status',
        'Site',
        'User',
        'UserGroup',
        'JobGroup',
        'JobType',
        'ApplicationStatus',
        'MinorStatus',
    ]
    __summaryDefinedFields = [('ApplicationStatus', 'unset'), ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    __jobDBFields = []

    jobDB = None
    monitoringReporter = None

    def initialize(self):
        """ Standard constructor """
        self.jobDB = JobDB()
        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory')
        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory", failoverQueueName=self.messageQueue)
        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """ Main execution method """
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" % ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the jobdb summary", result['Message'])
        else:
            values = result['Value'][1]
            self.log.info("Start sending records!")
            for record in values:
                recordSetup = record[0]
                if recordSetup not in validSetups:
                    self.log.error("Setup %s is not valid" % recordSetup)
                    continue
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)
            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info("The records are successfully sent to the Store!")
            else:
                self.log.warn(
                    "Failed to insert the records! It will be retried in the next iteration",
                    retVal['Message'])
        return S_OK()
class RegisterReplica(DMSRequestOperationsBase): """ .. class:: RegisterReplica RegisterReplica operation handler :param self: self reference :param ~DIRAC.RequestManagementSystem.Client.Operation.Operation operation: Operation instance :param str csPath: CS path for this handler """ def __init__(self, operation=None, csPath=None): """c'tor""" DMSRequestOperationsBase.__init__(self, operation, csPath) def __call__(self): """call me maybe""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # RegisterReplica specific monitor info gMonitor.registerActivity( "RegisterReplicaAtt", "Attempted replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) gMonitor.registerActivity( "RegisterReplicaOK", "Successful replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) gMonitor.registerActivity( "RegisterReplicaFail", "Failed replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) # # counter for failed replicas failedReplicas = 0 # # catalog to use catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(",")] # # get waiting files waitingFiles = self.getWaitingFilesList() if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(waitingFiles))) # # loop over files registerOperations = {} successReplicas = 0 for opFile in waitingFiles: if not self.rmsMonitoring: gMonitor.addMark("RegisterReplicaAtt", 1) # # get LFN lfn = opFile.LFN # # and others targetSE = self.operation.targetSEList[0] replicaTuple = (lfn, opFile.PFN, targetSE) # # call ReplicaManager registerReplica = self.dm.registerReplica(replicaTuple, catalogs) # # check results if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]: # There have been some errors if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RegisterReplicaFail", 1) # self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterReplica" ) reason = registerReplica.get( "Message", registerReplica.get("Value", {}).get("Failed", {}).get(lfn, "Unknown") ) errorStr = "failed to register LFN %s: %s" % (lfn, str(reason)) # FIXME: this is incompatible with the change made in the DM that we # ignore failures if successful in at least one catalog if lfn in registerReplica.get("Value", {}).get("Successful", {}) and isinstance(reason, dict): # As we managed, let's create a new operation for just the remaining registration errorStr += " - adding registerReplica operations to request" for failedCatalog in reason: key = "%s/%s" % (targetSE, failedCatalog) newOperation = self.getRegisterOperation( opFile, targetSE, type="RegisterReplica", catalog=failedCatalog ) if key not in registerOperations: registerOperations[key] = newOperation else: registerOperations[key].addFile(newOperation[0]) opFile.Status = "Done" else: opFile.Error = errorStr catMaster = True if isinstance(reason, dict): from DIRAC.Resources.Catalog.FileCatalog import FileCatalog for failedCatalog in reason: catMaster = catMaster and FileCatalog()._getCatalogConfigDetails(failedCatalog).get( "Value", {} ).get("Master", False) # If one targets explicitly a catalog and 
it fails or if it fails on the master catalog if (catalogs or catMaster) and ( "file does not exist" in opFile.Error.lower() or "no such file" in opFile.Error.lower() ): # Check if the file really exists in SE, if not, consider this file registration as Done res = self.dm.getReplicaMetadata(lfn, targetSE) notExist = bool("No such file" in res.get("Value", {}).get("Failed", {}).get(lfn, "")) if not notExist: opFile.Status = "Failed" else: opFile.Status = "Done" if opFile.Status != "Done": failedReplicas += 1 self.log.warn(errorStr) else: # All is OK if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RegisterReplicaOK", 1) successReplicas += 1 self.log.verbose( "Replica %s has been registered at %s" % (lfn, ",".join(catalogs) if catalogs else "all catalogs") ) opFile.Status = "Done" # # if we have new replications to take place, put them at the end if registerOperations: self.log.info("adding %d operations to the request" % len(registerOperations)) for operation in registerOperations.values(): self.operation._parent.addOperation(operation) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # # final check infoStr = "" if successReplicas: infoStr = "%d replicas successfully registered" % successReplicas if failedReplicas: infoStr += ", %d replicas failed to register" % failedReplicas self.log.info("All replicas processed", infoStr) if failedReplicas: self.operation.Error = "some replicas failed to register" return S_ERROR(self.operation.Error) return S_OK()
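# Illustration, not part of the original handler: the partial-failure shape that
# __call__ above handles specially. When an LFN registered successfully in at least
# one catalog, "Failed" maps it to a dict keyed by the failing catalog(s), and the
# handler spawns one follow-up RegisterReplica operation per failed catalog instead
# of failing the file. Catalog names and the message are hypothetical.
examplePartialRegisterResult = S_OK({
    "Successful": {"/vo/data/file_a": {"DIRACFileCatalog": True}},
    "Failed": {"/vo/data/file_a": {"SecondaryCatalog": "Connection timed out"}},
})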
class AgentModule:
    """Base class for all agent modules

    This class is used by the AgentReactor Class to steer the execution of
    DIRAC Agents.

    For this purpose the following methods are used:
      - am_initialize()      just after instantiation
      - am_getPollingTime()  to set the execution frequency
      - am_getMaxCycles()    to determine the number of cycles
      - am_go()              for the actual execution of one cycle

    Before each iteration, the following methods are used to determine
    if the new cycle is to be started:
      - am_getModuleParam( 'alive' )
      - am_checkStopAgentFile()
      - am_removeStopAgentFile()

    To start a new execution cycle the following methods are used:
      - am_getCyclesDone()
      - am_setOption( 'MaxCycles', maxCycles )

    At the same time it provides all Agents with a common interface.
    All Agent classes must inherit from this base class and must implement
    at least the following method:
      - execute()            main method called in the agent cycle

    Additionally they may provide:
      - initialize()         for initial settings
      - finalize()           the graceful exit
      - beginExecution()     before each execution cycle
      - endExecution()       at the end of each execution cycle

    The agent can be stopped either by a signal or by creating a 'stop_agent' file
    in the controlDirectory defined in the agent configuration
    """

    def __init__(self, agentName, loadName, baseAgentName=False, properties={}):
        """
        Common __init__ method for all Agents.
        All Agent modules must define: __doc__
        They are used to populate __codeProperties

        The following Options are used from the Configuration:
          - /DIRAC/Setup
          - Status
          - Enabled
          - PollingTime            default = 120
          - MaxCycles              default = 500
          - WatchdogTime           default = 0 (disabled)
          - ControlDirectory       control/SystemName/AgentName
          - WorkDirectory          work/SystemName/AgentName
          - shifterProxy           ''
          - shifterProxyLocation   WorkDirectory/SystemName/AgentName/.shifterCred

        It defines the following default Options that can be set via Configuration (above):
          - MonitoringEnabled     True
          - Enabled               True if Status == Active
          - PollingTime           120
          - MaxCycles             500
          - ControlDirectory      control/SystemName/AgentName
          - WorkDirectory         work/SystemName/AgentName
          - shifterProxy          False
          - shifterProxyLocation  work/SystemName/AgentName/.shifterCred

        different defaults can be set in the initialize() method of the Agent using am_setOption()

        In order to get a shifter proxy in the environment during the execute()
        the configuration Option 'shifterProxy' must be set, a default may be
        given in the initialize() method.
        """
        self.log = gLogger.getSubLogger(agentName)
        self.__basePath = rootPath
        self.__agentModule = None
        self.agentName = agentName
        self.__codeProperties = {}
        self.__getCodeInfo()

        self.__moduleProperties = {
            "fullName": agentName,
            "loadName": loadName,
            "section": PathFinder.getAgentSection(agentName),
            "loadSection": PathFinder.getAgentSection(loadName),
            "cyclesDone": 0,
            "totalElapsedTime": 0,
            "setup": gConfig.getValue("/DIRAC/Setup", "Unknown"),
            "alive": True,
        }
        self.__moduleProperties["system"], self.__moduleProperties["agentName"] = agentName.split("/")
        self.__configDefaults = {}
        self.__configDefaults["MonitoringEnabled"] = True
        # NB: the membership test must be against a tuple; a bare ("active") is just the
        # string "active", which would turn this into a substring check.
        self.__configDefaults["Enabled"] = self.am_getOption("Status", "Active").lower() in ("active",)
        self.__configDefaults["PollingTime"] = self.am_getOption("PollingTime", 120)
        self.__configDefaults["MaxCycles"] = self.am_getOption("MaxCycles", 500)
        self.__configDefaults["WatchdogTime"] = self.am_getOption("WatchdogTime", 0)
        self.__configDefaults["ControlDirectory"] = os.path.join(self.__basePath, "control", *agentName.split("/"))
        self.__configDefaults["WorkDirectory"] = os.path.join(self.__basePath, "work", *agentName.split("/"))
        self.__configDefaults["shifterProxy"] = ""
        self.__configDefaults["shifterProxyLocation"] = os.path.join(
            self.__configDefaults["WorkDirectory"], ".shifterCred"
        )

        if isinstance(properties, dict):
            for key in properties:
                self.__moduleProperties[key] = properties[key]
            self.__moduleProperties["executors"] = [(self.execute, ())]
            self.__moduleProperties["shifterProxy"] = False

        self.__monitorLastStatsUpdate = -1
        self.activityMonitoring = False
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(monitoringType="AgentMonitoring"):
            self.activityMonitoring = True

    def __getCodeInfo(self):
        try:
            self.__codeProperties["version"] = importlib.metadata.version(
                inspect.getmodule(self).__package__.split(".")[0]
            )
        except Exception:
            self.log.exception(f"Failed to find version for {self!r}")
            self.__codeProperties["version"] = "unset"
        try:
            self.__agentModule = __import__(self.__class__.__module__, globals(), locals(), "__doc__")
        except Exception as excp:
            self.log.exception("Cannot load agent module", lException=excp)
        try:
            self.__codeProperties["description"] = getattr(self.__agentModule, "__doc__")
        except Exception:
            self.log.error("Missing property __doc__")
            self.__codeProperties["description"] = "unset"
        self.__codeProperties["DIRACVersion"] = DIRAC.version
        self.__codeProperties["platform"] = DIRAC.getPlatform()

    def am_initialize(self, *initArgs):
        """Common initialization for all the agents.
        This is executed every time an agent (re)starts.
        This is called by the AgentReactor, should not be overridden.
""" agentName = self.am_getModuleParam("fullName") self.__initializeMonitor() result = self.initialize(*initArgs) if not isReturnStructure(result): return S_ERROR("initialize must return S_OK/S_ERROR") if not result["OK"]: return S_ERROR("Error while initializing %s: %s" % (agentName, result["Message"])) mkDir(self.am_getControlDirectory()) workDirectory = self.am_getWorkDirectory() mkDir(workDirectory) # Set the work directory in an environment variable available to subprocesses if needed os.environ["AGENT_WORKDIRECTORY"] = workDirectory self.__moduleProperties["shifterProxy"] = self.am_getOption( "shifterProxy") if len(self.__moduleProperties["executors"]) < 1: return S_ERROR("At least one executor method has to be defined") if not self.am_Enabled(): return S_ERROR("Agent is disabled via the configuration") self.log.notice("=" * 40) self.log.notice("Loaded agent module %s" % self.__moduleProperties["fullName"]) self.log.notice(" Site: %s" % DIRAC.siteName()) self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup")) self.log.notice(" Agent version: %s" % self.__codeProperties["version"]) self.log.notice(" DIRAC version: %s" % DIRAC.version) self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform()) pollingTime = int(self.am_getOption("PollingTime")) if pollingTime > 3600: self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.0)) else: self.log.notice(" Polling time: %s seconds" % self.am_getOption("PollingTime")) self.log.notice(" Control dir: %s" % self.am_getControlDirectory()) self.log.notice(" Work dir: %s" % self.am_getWorkDirectory()) if self.am_getOption("MaxCycles") > 0: self.log.notice(" Cycles: %s" % self.am_getMaxCycles()) else: self.log.notice(" Cycles: unlimited") if self.am_getWatchdogTime() > 0: self.log.notice(" Watchdog interval: %s" % self.am_getWatchdogTime()) else: self.log.notice(" Watchdog interval: disabled ") self.log.notice("=" * 40) return S_OK() def am_getControlDirectory(self): return os.path.join(self.__basePath, str(self.am_getOption("ControlDirectory"))) def am_getStopAgentFile(self): return os.path.join(self.am_getControlDirectory(), "stop_agent") def am_checkStopAgentFile(self): return os.path.isfile(self.am_getStopAgentFile()) def am_createStopAgentFile(self): try: with open(self.am_getStopAgentFile(), "w") as fd: fd.write("Dirac site agent Stopped at %s" % str(datetime.datetime.utcnow())) except Exception: pass def am_removeStopAgentFile(self): try: os.unlink(self.am_getStopAgentFile()) except Exception: pass def am_getBasePath(self): return self.__basePath def am_getWorkDirectory(self): return os.path.join(self.__basePath, str(self.am_getOption("WorkDirectory"))) def am_getShifterProxyLocation(self): return os.path.join(self.__basePath, str(self.am_getOption("shifterProxyLocation"))) def am_getOption(self, optionName, defaultValue=None): """Gets an option from the agent's configuration section. The section will be a subsection of the /Systems section in the CS. 
""" if defaultValue is None: if optionName in self.__configDefaults: defaultValue = self.__configDefaults[optionName] if optionName and optionName[0] == "/": return gConfig.getValue(optionName, defaultValue) for section in (self.__moduleProperties["section"], self.__moduleProperties["loadSection"]): result = gConfig.getOption("%s/%s" % (section, optionName), defaultValue) if result["OK"]: return result["Value"] return defaultValue def am_setOption(self, optionName, value): self.__configDefaults[optionName] = value def am_getModuleParam(self, optionName): return self.__moduleProperties[optionName] def am_setModuleParam(self, optionName, value): self.__moduleProperties[optionName] = value def am_getPollingTime(self): return self.am_getOption("PollingTime") def am_getMaxCycles(self): return self.am_getOption("MaxCycles") def am_getWatchdogTime(self): return int(self.am_getOption("WatchdogTime")) def am_getCyclesDone(self): return self.am_getModuleParam("cyclesDone") def am_Enabled(self): return self.am_getOption("Enabled") def am_stopExecution(self): self.am_setModuleParam("alive", False) def __initializeMonitor(self): """ Initialize the system monitoring. """ # This flag is used to activate ES based monitoring if self.activityMonitoring: self.log.debug("Monitoring of the agent is enabled.") # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="AgentMonitoring") # With the help of this periodic task we commit the data to ES at an interval of 100 seconds. gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) self.__monitorLastStatsUpdate = time.time() def am_secureCall(self, functor, args=(), name=False): if not name: name = str(functor) try: result = functor(*args) if not isReturnStructure(result): raise Exception( "%s method for %s module has to return S_OK/S_ERROR" % (name, self.__moduleProperties["fullName"])) return result except Exception as e: self.log.exception("Agent exception while calling method %s" % name, lException=e) return S_ERROR("Exception while calling %s method: %s" % (name, str(e))) def _setShifterProxy(self): if self.__moduleProperties["shifterProxy"]: result = setupShifterProxyInEnv( self.__moduleProperties["shifterProxy"], self.am_getShifterProxyLocation()) if not result["OK"]: self.log.error("Failed to set shifter proxy", result["Message"]) return result return S_OK() def am_go(self): # Set the shifter proxy if required result = self._setShifterProxy() if not result["OK"]: return result self.log.notice("-" * 40) self.log.notice("Starting cycle for module %s" % self.__moduleProperties["fullName"]) mD = self.am_getMaxCycles() if mD > 0: cD = self.__moduleProperties["cyclesDone"] self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD)) self.log.notice("-" * 40) # use SIGALARM as a watchdog interrupt if enabled watchdogInt = self.am_getWatchdogTime() if watchdogInt > 0: signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.alarm(watchdogInt) elapsedTime = time.time() if self.activityMonitoring: initialWallTime, initialCPUTime, mem = self._startReportToMonitoring( ) cycleResult = self.__executeModuleCycle() if self.activityMonitoring and initialWallTime and initialCPUTime: cpuPercentage = self._endReportToMonitoring( initialWallTime, initialCPUTime) # Increment counters self.__moduleProperties["cyclesDone"] 
+= 1 # Show status elapsedTime = time.time() - elapsedTime self.__moduleProperties["totalElapsedTime"] += elapsedTime self.log.notice("-" * 40) self.log.notice("Agent module %s run summary" % self.__moduleProperties["fullName"]) self.log.notice(" Executed %s times previously" % self.__moduleProperties["cyclesDone"]) self.log.notice(" Cycle took %.2f seconds" % elapsedTime) averageElapsedTime = self.__moduleProperties[ "totalElapsedTime"] / self.__moduleProperties["cyclesDone"] self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime)) elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption( "PollingTime") self.log.notice(" Polling time: %s seconds" % self.am_getOption("PollingTime")) self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate) if cycleResult["OK"]: self.log.notice(" Cycle was successful") if self.activityMonitoring: # Here we record the data about the cycle duration along with some basic details about the # agent and right now it isn't committed to the ES backend. self.activityMonitoringReporter.addRecord({ "AgentName": self.agentName, "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "MemoryUsage": mem, "CpuPercentage": cpuPercentage, "CycleDuration": elapsedTime, }) else: self.log.warn(" Cycle had an error:", cycleResult["Message"]) self.log.notice("-" * 40) # Update number of cycles # cycle finished successfully, cancel watchdog if watchdogInt > 0: signal.alarm(0) return cycleResult def _startReportToMonitoring(self): now = time.time() stats = os.times() mem = None cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return (now, cpuTime, mem) self.__monitorLastStatsUpdate = now membytes = MemStat.VmB("VmRSS:") if membytes: mem = membytes / (1024.0 * 1024.0) return (now, cpuTime, mem) def _endReportToMonitoring(self, initialWallTime, initialCPUTime): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = 0 if wallTime: percentage = cpuTime / wallTime * 100.0 return percentage def __executeModuleCycle(self): # Execute the beginExecution function result = self.am_secureCall(self.beginExecution, name="beginExecution") if not result["OK"]: return result # Launch executor functions executors = self.__moduleProperties["executors"] if len(executors) == 1: result = self.am_secureCall(executors[0][0], executors[0][1]) if not result["OK"]: return result else: exeThreads = [ threading.Thread(target=executor[0], args=executor[1]) for executor in executors ] for thread in exeThreads: thread.setDaemon(1) thread.start() for thread in exeThreads: thread.join() # Execute the endExecution function return self.am_secureCall(self.endExecution, name="endExecution") def initialize(self, *args, **kwargs): """Agents should override this method for specific initialization. Executed at every agent (re)start. """ return S_OK() def beginExecution(self): return S_OK() def endExecution(self): return S_OK() def finalize(self): return S_OK() def execute(self): return S_ERROR("Execute method has to be overwritten by agent module") def __activityMonitoringReporting(self): """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which is done by the MonitoringReporter and is send to the 'ComponentMonitoring' type. :return: True / False """ return self.activityMonitoringReporter.commit()["OK"]
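# A minimal concrete agent (illustrative, not part of the base class), following the
# contract documented above: inherit from AgentModule, optionally override
# initialize()/beginExecution()/endExecution()/finalize(), and always override
# execute(), which must return S_OK/S_ERROR. The agent name is hypothetical.
class HelloWorldAgent(AgentModule):
    def initialize(self):
        # different defaults can be set here via am_setOption()
        self.am_setOption("PollingTime", 300)
        return S_OK()

    def execute(self):
        self.log.notice("Hello from %s" % self.am_getModuleParam("fullName"))
        return S_OK()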