def _mbReceivedMsg(self, trid, msgObj): result = self._authorizeProposal( ("Message", msgObj.getName()), trid, self._transportPool.get(trid).getConnectingCredentials()) if not result["OK"]: return result result = self._instantiateHandler(trid) if not result["OK"]: return result handlerObj = result["Value"] response = handlerObj._rh_executeMessageCallback(msgObj) if self.activityMonitoring and response["OK"]: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) if response["OK"]: return response["Value"][0] else: return response
def __reportToMonitoring(self, responseTime): """ Periodically reports to Monitoring """ # Calculate CPU usage by comparing realtime and cpu time since last report percentage = self.__endReportToMonitoringLoop(self.__report[0], self.__report[1]) # Send record to Monitoring self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "Tornado", "MemoryUsage": self.__report[2], "CpuPercentage": percentage, "ResponseTime": responseTime, }) self.activityMonitoringReporter.commit() # Save memory usage and save realtime/CPU time for next call self.__report = self.__startReportToMonitoringLoop()
def _mbReceivedMsg(self, trid, msgObj): result = self._authorizeProposal( ('Message', msgObj.getName()), trid, self._transportPool.get(trid).getConnectingCredentials()) if not result['OK']: return result result = self._instantiateHandler(trid) if not result['OK']: return result handlerObj = result['Value'] response = handlerObj._rh_executeMessageCallback(msgObj) if self.activityMonitoring and response["OK"]: self.activityMonitoringReporter.addRecord({ 'timestamp': int(Time.toEpoch()), 'host': Network.getFQDN(), 'componentType': 'service', 'component': "_".join(self._name.split("/")), 'componentLocation': self._cfg.getURL(), 'ServiceResponseTime': response["Value"][1] }) if response["OK"]: return response["Value"][0] else: return response
def __reportActivity(self): initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring() pendingQueries = self._threadPool._work_queue.qsize() activeQuereies = len(self._threadPool._threads) percentage = self.__endReportToMonitoring(initialWallTime, initialCPUTime) self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "MemoryUsage": mem, "CpuPercentage": percentage, "PendingQueries": pendingQueries, "ActiveQueries": activeQuereies, "RunningThreads": threading.activeCount(), "MaxFD": self.__maxFD, }) self.__maxFD = 0
def initialize( self ): self.logger = gLogger.getSubLogger( "Monitoring" ) self.logger.debug( "Initializing Monitoring Client" ) self.sourceDict[ 'setup' ] = gConfig.getValue( "/DIRAC/Setup" ) self.sourceDict[ 'site' ] = DIRAC.siteName() if self.sourceDict[ 'componentType' ] == self.COMPONENT_SERVICE: self.cfgSection = PathFinder.getSystemSection( self.sourceDict[ 'componentName' ] ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_AGENT: self.cfgSection = PathFinder.getAgentSection( self.sourceDict[ 'componentName' ] ) self.setComponentLocation( Network.getFQDN() ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_WEB: self.cfgSection = "/WebApp" self.setComponentLocation( 'http://%s' % Network.getFQDN() ) self.setComponentName( 'WebApp' ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_SCRIPT: self.cfgSection = "/Script" else: raise Exception( "Component type has not been defined" ) gMonitoringFlusher.registerMonitoringClient( self ) # ExitCallback.registerExitCallback( self.forceFlush ) self.__initialized = True
def initialize( self ): self.logger = gLogger.getSubLogger( "Monitoring" ) self.logger.debug( "Initializing Monitoring Client" ) self.sourceDict[ 'setup' ] = gConfig.getValue( "/DIRAC/Setup" ) self.sourceDict[ 'site' ] = DIRAC.siteName() if self.sourceDict[ 'componentType' ] == self.COMPONENT_SERVICE: self.cfgSection = PathFinder.getSystemSection( self.sourceDict[ 'componentName' ] ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_AGENT: self.cfgSection = PathFinder.getAgentSection( self.sourceDict[ 'componentName' ] ) self.setComponentLocation( Network.getFQDN() ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_WEB: self.cfgSection = "/Website" self.setComponentLocation( 'http://%s' % Network.getFQDN() ) self.setComponentName( 'Web' ) elif self.sourceDict[ 'componentType' ] == self.COMPONENT_SCRIPT: self.cfgSection = "/Script" else: raise Exception( "Component type has not been defined" ) gMonitoringFlusher.registerMonitoringClient( self ) #ExitCallback.registerExitCallback( self.forceFlush ) self.__initialized = True
def initialize(self): self.logger = gLogger.getSubLogger("Monitoring") self.logger.debug("Initializing Monitoring Client") self.sourceDict["setup"] = gConfig.getValue("/DIRAC/Setup") self.sourceDict["site"] = DIRAC.siteName() if self.sourceDict["componentType"] == self.COMPONENT_SERVICE: self.cfgSection = PathFinder.getSystemSection( self.sourceDict["componentName"]) elif self.sourceDict["componentType"] == self.COMPONENT_AGENT: self.cfgSection = PathFinder.getAgentSection( self.sourceDict["componentName"]) self.setComponentLocation(Network.getFQDN()) elif self.sourceDict["componentType"] == self.COMPONENT_WEB: self.cfgSection = "/WebApp" self.setComponentLocation("http://%s" % Network.getFQDN()) self.setComponentName("WebApp") elif self.sourceDict["componentType"] == self.COMPONENT_SCRIPT: self.cfgSection = "/Script" elif self.sourceDict["componentType"] == self.COMPONENT_TORNADO: self.cfgSection = "/Tornado" else: raise Exception("Component type has not been defined") gMonitoringFlusher.registerMonitoringClient(self) self.__initialized = True
def __init__( self, optionsDictionary ): threading.Thread.__init__( self ) self.__interactive = optionsDictionary[ 'Interactive' ] self.__sleep = optionsDictionary[ 'SleepTime' ] self._messageQueue = Queue.Queue() self._Transactions = [] self._alive = True self._site = optionsDictionary[ 'Site' ] self._hostname = Network.getFQDN() self._logLevels = LogLevels() self._negativeLevel = self._logLevels.getLevelValue( 'ERROR' ) self._positiveLevel = self._logLevels.getLevelValue( 'ALWAYS' ) self._maxBundledMessages = 20 self.setDaemon(1) self.start()
def _executeAction(self, trid, proposalTuple, handlerObj): try: response = handlerObj._rh_executeAction(proposalTuple) if not response["OK"]: return response if self.activityMonitoring: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) return response["Value"][0] except Exception as e: gLogger.exception("Exception while executing handler action") return S_ERROR("Server error while executing action: %s" % str(e))
def siteName(): """ Determine and return DIRAC name for current site """ global __siteName if not __siteName: #FIXME: does this ever happen that we have to use the defaultValue if getValue ??? from DIRAC.Core.Utilities import Network # Some Defaults if not present in the configuration fqdn = Network.getFQDN() if len( fqdn.split( '.' ) ) > 2 : # Use the last component of the FQDN as country code if there are more than 2 components _siteName = 'DIRAC.Client.%s' % fqdn.split( '.' )[-1] else: # else use local as country code _siteName = 'DIRAC.Client.local' __siteName = gConfig.getValue( '/LocalSite/Site', _siteName ) return __siteName
def _executeAction(self, trid, proposalTuple, handlerObj): try: response = handlerObj._rh_executeAction(proposalTuple) if self.activityMonitoring and response["OK"]: self.activityMonitoringReporter.addRecord({ 'timestamp': int(Time.toEpoch()), 'host': Network.getFQDN(), 'componentType': 'service', 'component': "_".join(self._name.split("/")), 'componentLocation': self._cfg.getURL(), 'ServiceResponseTime': response["Value"][1] }) return response["Value"][0] except Exception as e: gLogger.exception("Exception while executing handler action") return S_ERROR("Server error while executing action: %s" % str(e))
def createRMSRecord(self, status, nbObject): """ This method is used to create a record given some parameters for sending it to the ES backend. It is used inside DMS/Agent/RequestOperations and this method is designed particularly for file type of objects. :param status: This can be one of these i.e. Attempted, Failed, or Successful. :param nbObject: This is number of objects in question. :returns: a dictionary. """ record = { "timestamp": int(TimeUtilities.toEpoch()), "host": Network.getFQDN(), "objectType": "File", "operationType": self.operation.Type, "status": status, "nbObject": nbObject, "parentID": self.operation.OperationID, } return record
def __init__(self, sleepTime, interactive, site): """ Initialization of the ServerHandler. The queue is initialized with the hostname and the start of the thread. :params sleepTime: integer, representing time in seconds where the handler can send messages. :params interactive: not used at the moment. :params site: the site where the log messages come from. """ super(ServerHandler, self).__init__() threading.Thread.__init__(self) self.__logQueue = Queue.Queue() self.__sleepTime = sleepTime self.__interactive = interactive self.__site = site self.__transactions = [] self.__hostname = Network.getFQDN() self.__alive = True self.__maxBundledLogs = 20 self.setDaemon(True) self.start()
def __generateUniqueClientName(self): hashStr = ":".join( (str(datetime.datetime.utcnow()), str(random.random()), Network.getFQDN(), gLogger.getName()) ) hexHash = md5(hashStr.encode()).hexdigest() return hexHash
def getHostname( self ): hostname = self.getOption( "/DIRAC/Hostname" ) if not hostname: return Network.getFQDN() return hostname
def __call__(self): """request processing""" self.log.debug("about to execute request") if not self.rmsMonitoring: gMonitor.addMark("RequestAtt", 1) # # setup proxy for request owner setupProxy = self.setupProxy() if not setupProxy["OK"]: userSuspended = "User is currently suspended" self.request.Error = setupProxy["Message"] # In case the user does not have proxy if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND): self.log.error("Error setting proxy. Request set to Failed:", setupProxy["Message"]) # If user is no longer registered, fail the request for operation in self.request: for opFile in operation: opFile.Status = "Failed" operation.Status = "Failed" elif userSuspended in setupProxy["Message"]: # If user is suspended, wait for a long time self.request.delayNextExecution(6 * 60) self.request.Error = userSuspended self.log.error("Error setting proxy: " + userSuspended, self.request.OwnerDN) else: self.log.error("Error setting proxy", setupProxy["Message"]) return S_OK(self.request) shifter = setupProxy["Value"]["Shifter"] error = None while self.request.Status == "Waiting": # # get waiting operation operation = self.request.getWaiting() if not operation["OK"]: self.log.error("Cannot get waiting operation", operation["Message"]) return operation operation = operation["Value"] self.log.info("executing operation", "%s" % operation.Type) # # and handler for it handler = self.getHandler(operation) if not handler["OK"]: self.log.error("Unable to process operation", "%s: %s" % (operation.Type, handler["Message"])) # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 ) operation.Error = handler["Message"] break handler = handler["Value"] # # set shifters list in the handler handler.shifter = shifter # set rmsMonitoring flag for the RequestOperation handler.rmsMonitoring = self.rmsMonitoring # # and execute pluginName = self.getPluginName( self.handlersDict.get(operation.Type)) if self.standalone: useServerCertificate = gConfig.useServerCertificate() else: # Always use server certificates if executed within an agent useServerCertificate = True try: if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Attempted", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Att"), 1) # Always use request owner proxy if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "false") exe = handler() if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") if not exe["OK"]: self.log.error("unable to process operation", "%s: %s" % (operation.Type, exe["Message"])) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("RequestFail", 1) if self.request.JobID: # Check if the job exists monitorServer = JobMonitoringClient( useCertificates=True) res = monitorServer.getJobSummary( int(self.request.JobID)) if not res["OK"]: self.log.error( "RequestTask: Failed to get job status", "%d" % self.request.JobID) elif not res["Value"]: self.log.warn( "RequestTask: job does not exist (anymore): failed request", "JobID: %d" % self.request.JobID, ) for opFile in operation: opFile.Status = "Failed" if operation.Status != "Failed": operation.Status = "Failed" self.request.Error = "Job no longer exists" except Exception as e: error = str(e) self.log.exception("hit by exception:", "%s" % error) if pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("RequestFail", 1) if useServerCertificate: gConfigurationData.setOptionInCFG( "/DIRAC/Security/UseServerCertificate", "true") break # # operation status check if operation.Status == "Done" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Successful", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "OK"), 1) elif operation.Status == "Failed" and pluginName: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Operation", "operationType": pluginName, "objectID": operation.OperationID, "parentID": operation.RequestID, "status": "Failed", "nbObject": 1, }) else: gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1) elif operation.Status in ("Waiting", "Scheduled"): # # no update for waiting or all files scheduled break if not self.rmsMonitoring: gMonitor.flush() if error: return S_ERROR(error) # # request done? if self.request.Status == "Done": # # update request to the RequestDB self.log.info("Updating request status:", "%s" % self.request.Status) update = self.updateRequest() if not update["OK"]: self.log.error("Cannot update request status", update["Message"]) return update self.log.info("request is done", "%s" % self.request.RequestName) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord({ "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "objectID": getattr(self.request, "RequestID", 0), "status": "Successful", "nbObject": 1, }) else: gMonitor.addMark("RequestOK", 1) # # and there is a job waiting for it? finalize! if self.request.JobID: attempts = 0 while True: finalizeRequest = self.requestClient.finalizeRequest( self.request.RequestID, self.request.JobID # pylint: disable=no-member ) if not finalizeRequest["OK"]: if not attempts: self.log.error( "unable to finalize request, will retry", "ReqName %s:%s" % (self.request.RequestName, finalizeRequest["Message"]), ) self.log.debug("Waiting 10 seconds") attempts += 1 if attempts == 10: self.log.error("Giving up finalize request") return S_ERROR("Could not finalize request") time.sleep(10) else: self.log.info( "request is finalized", "ReqName %s %s" % (self.request.RequestName, (" after %d attempts" % attempts) if attempts else ""), ) break # Commit all the data to the ES Backend if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # Request will be updated by the callBack method self.log.verbose("RequestTasks exiting", "request %s" % self.request.Status) return S_OK(self.request)
def __generateUniqueClientName(self): hashStr = ":".join((Time.toString(), str(random.random()), Network.getFQDN(), gLogger.getName())) hexHash = md5(hashStr).hexdigest() return hexHash
def getHostname(self): hostname = self.getOption("/DIRAC/Hostname") if not hostname: return Network.getFQDN() return hostname
def am_go(self): # Set the shifter proxy if required result = self._setShifterProxy() if not result['OK']: return result self.log.notice("-" * 40) self.log.notice("Starting cycle for module %s" % self.__moduleProperties['fullName']) mD = self.am_getMaxCycles() if mD > 0: cD = self.__moduleProperties['cyclesDone'] self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD)) self.log.notice("-" * 40) # use SIGALARM as a watchdog interrupt if enabled watchdogInt = self.am_getWatchdogTime() if watchdogInt > 0: signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.alarm(watchdogInt) elapsedTime = time.time() cpuStats = self._startReportToMonitoring() cycleResult = self.__executeModuleCycle() if cpuStats: self._endReportToMonitoring(*cpuStats) # Increment counters self.__moduleProperties['cyclesDone'] += 1 # Show status elapsedTime = time.time() - elapsedTime self.__moduleProperties['totalElapsedTime'] += elapsedTime self.log.notice("-" * 40) self.log.notice("Agent module %s run summary" % self.__moduleProperties['fullName']) self.log.notice(" Executed %s times previously" % self.__moduleProperties['cyclesDone']) self.log.notice(" Cycle took %.2f seconds" % elapsedTime) averageElapsedTime = self.__moduleProperties[ 'totalElapsedTime'] / self.__moduleProperties['cyclesDone'] self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime)) elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption( 'PollingTime') self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime')) self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate) if cycleResult['OK']: self.log.notice(" Cycle was successful") if self.activityMonitoring: # Here we record the data about the cycle duration along with some basic details about the # component and right now it isn't committed to the ES backend. self.activityMonitoringReporter.addRecord({ 'timestamp': int(Time.toEpoch()), 'host': Network.getFQDN(), 'componentType': "agent", 'component': "_".join(self.__moduleProperties['fullName'].split("/")), 'cycleDuration': elapsedTime, 'cycles': 1 }) else: self.log.warn(" Cycle had an error:", cycleResult['Message']) self.log.notice("-" * 40) # Update number of cycles if not self.activityMonitoring: self.monitor.setComponentExtraParam( 'cycles', self.__moduleProperties['cyclesDone']) # cycle finished successfully, cancel watchdog if watchdogInt > 0: signal.alarm(0) return cycleResult
def execute(self): """read requests from RequestClient and enqueue them into ProcessPool""" if not self.__rmsMonitoring: gMonitor.addMark("Iteration", 1) # # requests (and so tasks) counter taskCounter = 0 while taskCounter < self.__requestsPerCycle: self.log.debug("execute: executing %d request in this cycle" % taskCounter) requestsToExecute = [] if not self.__bulkRequest: self.log.info("execute: ask for a single request") getRequest = self.requestClient().getRequest() if not getRequest["OK"]: self.log.error("execute:", "%s" % getRequest["Message"]) break if not getRequest["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break requestsToExecute = [getRequest["Value"]] else: numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter) self.log.info("execute: ask for requests", "%s" % numberOfRequest) getRequests = self.requestClient().getBulkRequests(numberOfRequest) if not getRequests["OK"]: self.log.error("execute:", "%s" % getRequests["Message"]) break if not getRequests["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break for rId in getRequests["Value"]["Failed"]: self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId]) requestsToExecute = list(getRequests["Value"]["Successful"].values()) self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute)) for request in requestsToExecute: # # set task id taskID = request.RequestID self.log.info( "processPool status", "tasks idle = %s working = %s" % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()), ) looping = 0 while True: if not self.processPool().getFreeSlots(): if not looping: self.log.info( "No free slots available in processPool", "will wait %d seconds to proceed" % self.__poolSleep, ) time.sleep(self.__poolSleep) looping += 1 else: if looping: self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep) looping = 0 # # save current request in cache res = self.cacheRequest(request) if not res["OK"]: if cmpError(res, errno.EALREADY): # The request is already in the cache, skip it. break out of the while loop to get next request break # There are too many requests in the cache, commit suicide self.log.error( "Too many requests in cache", "(%d requests): put back all requests and exit cycle. Error %s" % (len(self.__requestCache), res["Message"]), ) self.putAllRequests() return res # # serialize to JSON result = request.toJSON() if not result["OK"]: continue requestJSON = result["Value"] self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName)) timeOut = self.getTimeout(request) enqueue = self.processPool().createAndQueueTask( RequestTask, kwargs={ "requestJSON": requestJSON, "handlersDict": self.handlersDict, "csPath": self.__configPath, "agentName": self.agentName, "rmsMonitoring": self.__rmsMonitoring, }, taskID=taskID, blocking=True, usePoolCallbacks=True, timeOut=timeOut, ) if not enqueue["OK"]: self.log.error("Could not enqueue task", enqueue["Message"]) else: self.log.debug("successfully enqueued task", "'%s'" % taskID) # # update monitor if self.__rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "status": "Attempted", "objectID": request.RequestID, "nbObject": 1, } ) else: gMonitor.addMark("Processed", 1) # # update request counter taskCounter += 1 # # task created, a little time kick to proceed time.sleep(0.1) break self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache)) processed = self.processPool().processResults() # This happens when the result queue is screwed up. # Returning S_ERROR proved not to be sufficient, # and when in this situation, there is nothing we can do. # So we just exit. runit will restart from scratch. if processed < 0: self.log.fatal("Results queue is screwed up") sys.exit(1) # # clean return return S_OK()
def am_go(self): # Set the shifter proxy if required result = self._setShifterProxy() if not result["OK"]: return result self.log.notice("-" * 40) self.log.notice("Starting cycle for module %s" % self.__moduleProperties["fullName"]) mD = self.am_getMaxCycles() if mD > 0: cD = self.__moduleProperties["cyclesDone"] self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD)) self.log.notice("-" * 40) # use SIGALARM as a watchdog interrupt if enabled watchdogInt = self.am_getWatchdogTime() if watchdogInt > 0: signal.signal(signal.SIGALRM, signal.SIG_DFL) signal.alarm(watchdogInt) elapsedTime = time.time() if self.activityMonitoring: initialWallTime, initialCPUTime, mem = self._startReportToMonitoring( ) cycleResult = self.__executeModuleCycle() if self.activityMonitoring and initialWallTime and initialCPUTime: cpuPercentage = self._endReportToMonitoring( initialWallTime, initialCPUTime) # Increment counters self.__moduleProperties["cyclesDone"] += 1 # Show status elapsedTime = time.time() - elapsedTime self.__moduleProperties["totalElapsedTime"] += elapsedTime self.log.notice("-" * 40) self.log.notice("Agent module %s run summary" % self.__moduleProperties["fullName"]) self.log.notice(" Executed %s times previously" % self.__moduleProperties["cyclesDone"]) self.log.notice(" Cycle took %.2f seconds" % elapsedTime) averageElapsedTime = self.__moduleProperties[ "totalElapsedTime"] / self.__moduleProperties["cyclesDone"] self.log.notice(" Average execution time: %.2f seconds" % (averageElapsedTime)) elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption( "PollingTime") self.log.notice(" Polling time: %s seconds" % self.am_getOption("PollingTime")) self.log.notice(" Average execution/polling time: %.2f%%" % elapsedPollingRate) if cycleResult["OK"]: self.log.notice(" Cycle was successful") if self.activityMonitoring: # Here we record the data about the cycle duration along with some basic details about the # agent and right now it isn't committed to the ES backend. self.activityMonitoringReporter.addRecord({ "AgentName": self.agentName, "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "MemoryUsage": mem, "CpuPercentage": cpuPercentage, "CycleDuration": elapsedTime, }) else: self.log.warn(" Cycle had an error:", cycleResult["Message"]) self.log.notice("-" * 40) # Update number of cycles # cycle finished successfully, cancel watchdog if watchdogInt > 0: signal.alarm(0) return cycleResult