Beispiel #1
0
 def _mbReceivedMsg(self, trid, msgObj):
     result = self._authorizeProposal(
         ("Message", msgObj.getName()), trid,
         self._transportPool.get(trid).getConnectingCredentials())
     if not result["OK"]:
         return result
     result = self._instantiateHandler(trid)
     if not result["OK"]:
         return result
     handlerObj = result["Value"]
     response = handlerObj._rh_executeMessageCallback(msgObj)
     if self.activityMonitoring and response["OK"]:
         self.activityMonitoringReporter.addRecord({
             "timestamp":
             int(TimeUtilities.toEpoch()),
             "Host":
             Network.getFQDN(),
             "ServiceName":
             "_".join(self._name.split("/")),
             "Location":
             self._cfg.getURL(),
             "ResponseTime":
             response["Value"][1],
         })
     if response["OK"]:
         return response["Value"][0]
     else:
         return response
Beispiel #2
0
    def __reportToMonitoring(self, responseTime):
        """
        Periodically reports to Monitoring
        """

        # Calculate CPU usage by comparing realtime and cpu time since last report
        percentage = self.__endReportToMonitoringLoop(self.__report[0],
                                                      self.__report[1])
        # Send record to Monitoring
        self.activityMonitoringReporter.addRecord({
            "timestamp":
            int(TimeUtilities.toEpoch()),
            "Host":
            Network.getFQDN(),
            "ServiceName":
            "Tornado",
            "MemoryUsage":
            self.__report[2],
            "CpuPercentage":
            percentage,
            "ResponseTime":
            responseTime,
        })
        self.activityMonitoringReporter.commit()
        # Save memory usage and save realtime/CPU time for next call
        self.__report = self.__startReportToMonitoringLoop()
Beispiel #3
0
 def _mbReceivedMsg(self, trid, msgObj):
     result = self._authorizeProposal(
         ('Message', msgObj.getName()), trid,
         self._transportPool.get(trid).getConnectingCredentials())
     if not result['OK']:
         return result
     result = self._instantiateHandler(trid)
     if not result['OK']:
         return result
     handlerObj = result['Value']
     response = handlerObj._rh_executeMessageCallback(msgObj)
     if self.activityMonitoring and response["OK"]:
         self.activityMonitoringReporter.addRecord({
             'timestamp':
             int(Time.toEpoch()),
             'host':
             Network.getFQDN(),
             'componentType':
             'service',
             'component':
             "_".join(self._name.split("/")),
             'componentLocation':
             self._cfg.getURL(),
             'ServiceResponseTime':
             response["Value"][1]
         })
     if response["OK"]:
         return response["Value"][0]
     else:
         return response
Beispiel #4
0
 def __reportActivity(self):
     initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring()
     pendingQueries = self._threadPool._work_queue.qsize()
     activeQuereies = len(self._threadPool._threads)
     percentage = self.__endReportToMonitoring(initialWallTime,
                                               initialCPUTime)
     self.activityMonitoringReporter.addRecord({
         "timestamp":
         int(TimeUtilities.toEpoch()),
         "Host":
         Network.getFQDN(),
         "ServiceName":
         "_".join(self._name.split("/")),
         "Location":
         self._cfg.getURL(),
         "MemoryUsage":
         mem,
         "CpuPercentage":
         percentage,
         "PendingQueries":
         pendingQueries,
         "ActiveQueries":
         activeQuereies,
         "RunningThreads":
         threading.activeCount(),
         "MaxFD":
         self.__maxFD,
     })
     self.__maxFD = 0
Beispiel #5
0
 def initialize( self ):
   self.logger = gLogger.getSubLogger( "Monitoring" )
   self.logger.debug( "Initializing Monitoring Client" )
   self.sourceDict[ 'setup' ] = gConfig.getValue( "/DIRAC/Setup" )
   self.sourceDict[ 'site' ] = DIRAC.siteName()
   if self.sourceDict[ 'componentType' ] == self.COMPONENT_SERVICE:
     self.cfgSection = PathFinder.getSystemSection( self.sourceDict[ 'componentName' ] )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_AGENT:
     self.cfgSection = PathFinder.getAgentSection( self.sourceDict[ 'componentName' ] )
     self.setComponentLocation( Network.getFQDN() )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_WEB:
     self.cfgSection = "/WebApp"
     self.setComponentLocation( 'http://%s' % Network.getFQDN() )
     self.setComponentName( 'WebApp' )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_SCRIPT:
     self.cfgSection = "/Script"
   else:
     raise Exception( "Component type has not been defined" )
   gMonitoringFlusher.registerMonitoringClient( self )
   # ExitCallback.registerExitCallback( self.forceFlush )
   self.__initialized = True
Beispiel #6
0
 def initialize( self ):
   self.logger = gLogger.getSubLogger( "Monitoring" )
   self.logger.debug( "Initializing Monitoring Client" )
   self.sourceDict[ 'setup' ] = gConfig.getValue( "/DIRAC/Setup" )
   self.sourceDict[ 'site' ] = DIRAC.siteName()
   if self.sourceDict[ 'componentType' ] == self.COMPONENT_SERVICE:
     self.cfgSection = PathFinder.getSystemSection( self.sourceDict[ 'componentName' ] )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_AGENT:
     self.cfgSection = PathFinder.getAgentSection( self.sourceDict[ 'componentName' ] )
     self.setComponentLocation( Network.getFQDN() )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_WEB:
     self.cfgSection = "/Website"
     self.setComponentLocation( 'http://%s' % Network.getFQDN() )
     self.setComponentName( 'Web' )
   elif self.sourceDict[ 'componentType' ] == self.COMPONENT_SCRIPT:
     self.cfgSection = "/Script"
   else:
     raise Exception( "Component type has not been defined" )
   gMonitoringFlusher.registerMonitoringClient( self )
   #ExitCallback.registerExitCallback( self.forceFlush )
   self.__initialized = True
Beispiel #7
0
 def initialize(self):
     self.logger = gLogger.getSubLogger("Monitoring")
     self.logger.debug("Initializing Monitoring Client")
     self.sourceDict["setup"] = gConfig.getValue("/DIRAC/Setup")
     self.sourceDict["site"] = DIRAC.siteName()
     if self.sourceDict["componentType"] == self.COMPONENT_SERVICE:
         self.cfgSection = PathFinder.getSystemSection(
             self.sourceDict["componentName"])
     elif self.sourceDict["componentType"] == self.COMPONENT_AGENT:
         self.cfgSection = PathFinder.getAgentSection(
             self.sourceDict["componentName"])
         self.setComponentLocation(Network.getFQDN())
     elif self.sourceDict["componentType"] == self.COMPONENT_WEB:
         self.cfgSection = "/WebApp"
         self.setComponentLocation("http://%s" % Network.getFQDN())
         self.setComponentName("WebApp")
     elif self.sourceDict["componentType"] == self.COMPONENT_SCRIPT:
         self.cfgSection = "/Script"
     elif self.sourceDict["componentType"] == self.COMPONENT_TORNADO:
         self.cfgSection = "/Tornado"
     else:
         raise Exception("Component type has not been defined")
     gMonitoringFlusher.registerMonitoringClient(self)
     self.__initialized = True
Beispiel #8
0
 def __init__( self, optionsDictionary ):
   threading.Thread.__init__( self )
   self.__interactive = optionsDictionary[ 'Interactive' ]
   self.__sleep = optionsDictionary[ 'SleepTime' ]
   self._messageQueue = Queue.Queue()
   self._Transactions = []
   self._alive = True
   self._site = optionsDictionary[ 'Site' ]
   self._hostname = Network.getFQDN()
   self._logLevels = LogLevels()
   self._negativeLevel = self._logLevels.getLevelValue( 'ERROR' )
   self._positiveLevel = self._logLevels.getLevelValue( 'ALWAYS' )
   self._maxBundledMessages = 20
   self.setDaemon(1)
   self.start()
Beispiel #9
0
 def __init__( self, optionsDictionary ):
   threading.Thread.__init__( self )
   self.__interactive = optionsDictionary[ 'Interactive' ]
   self.__sleep = optionsDictionary[ 'SleepTime' ]
   self._messageQueue = Queue.Queue()
   self._Transactions = []
   self._alive = True
   self._site = optionsDictionary[ 'Site' ]
   self._hostname = Network.getFQDN()
   self._logLevels = LogLevels()
   self._negativeLevel = self._logLevels.getLevelValue( 'ERROR' )
   self._positiveLevel = self._logLevels.getLevelValue( 'ALWAYS' )
   self._maxBundledMessages = 20
   self.setDaemon(1)
   self.start()
Beispiel #10
0
 def _executeAction(self, trid, proposalTuple, handlerObj):
     try:
         response = handlerObj._rh_executeAction(proposalTuple)
         if not response["OK"]:
             return response
         if self.activityMonitoring:
             self.activityMonitoringReporter.addRecord({
                 "timestamp":
                 int(TimeUtilities.toEpoch()),
                 "Host":
                 Network.getFQDN(),
                 "ServiceName":
                 "_".join(self._name.split("/")),
                 "Location":
                 self._cfg.getURL(),
                 "ResponseTime":
                 response["Value"][1],
             })
         return response["Value"][0]
     except Exception as e:
         gLogger.exception("Exception while executing handler action")
         return S_ERROR("Server error while executing action: %s" % str(e))
Beispiel #11
0
def siteName():
  """
  Determine and return DIRAC name for current site
  """
  global __siteName

  if not __siteName:
  
    #FIXME: does this ever happen that we have to use the defaultValue if getValue ???
    from DIRAC.Core.Utilities import Network
    # Some Defaults if not present in the configuration
    fqdn = Network.getFQDN()
    if len( fqdn.split( '.' ) ) > 2 :
    # Use the last component of the FQDN as country code if there are more than 2 components
      _siteName = 'DIRAC.Client.%s' % fqdn.split( '.' )[-1]
    else:
      # else use local as country code
      _siteName = 'DIRAC.Client.local'
    
    __siteName = gConfig.getValue( '/LocalSite/Site', _siteName )

  return __siteName
Beispiel #12
0
 def _executeAction(self, trid, proposalTuple, handlerObj):
     try:
         response = handlerObj._rh_executeAction(proposalTuple)
         if self.activityMonitoring and response["OK"]:
             self.activityMonitoringReporter.addRecord({
                 'timestamp':
                 int(Time.toEpoch()),
                 'host':
                 Network.getFQDN(),
                 'componentType':
                 'service',
                 'component':
                 "_".join(self._name.split("/")),
                 'componentLocation':
                 self._cfg.getURL(),
                 'ServiceResponseTime':
                 response["Value"][1]
             })
         return response["Value"][0]
     except Exception as e:
         gLogger.exception("Exception while executing handler action")
         return S_ERROR("Server error while executing action: %s" % str(e))
Beispiel #13
0
    def createRMSRecord(self, status, nbObject):
        """
        This method is used to create a record given some parameters for sending it to the ES backend.
        It is used inside DMS/Agent/RequestOperations and this method is designed particularly for file
        type of objects.

        :param status: This can be one of these i.e. Attempted, Failed, or Successful.
        :param nbObject: This is number of objects in question.

        :returns: a dictionary.
        """
        record = {
            "timestamp": int(TimeUtilities.toEpoch()),
            "host": Network.getFQDN(),
            "objectType": "File",
            "operationType": self.operation.Type,
            "status": status,
            "nbObject": nbObject,
            "parentID": self.operation.OperationID,
        }

        return record
Beispiel #14
0
    def __init__(self, sleepTime, interactive, site):
        """
    Initialization of the ServerHandler.
    The queue is initialized with the hostname and the start of the thread.

    :params sleepTime: integer, representing time in seconds where the handler can send messages.
    :params interactive: not used at the moment.
    :params site: the site where the log messages come from.
    """
        super(ServerHandler, self).__init__()
        threading.Thread.__init__(self)
        self.__logQueue = Queue.Queue()

        self.__sleepTime = sleepTime
        self.__interactive = interactive
        self.__site = site
        self.__transactions = []
        self.__hostname = Network.getFQDN()
        self.__alive = True
        self.__maxBundledLogs = 20

        self.setDaemon(True)
        self.start()
Beispiel #15
0
 def __generateUniqueClientName(self):
     hashStr = ":".join(
         (str(datetime.datetime.utcnow()), str(random.random()), Network.getFQDN(), gLogger.getName())
     )
     hexHash = md5(hashStr.encode()).hexdigest()
     return hexHash
 def getHostname( self ):
   hostname = self.getOption( "/DIRAC/Hostname" )
   if not hostname:
     return Network.getFQDN()
   return hostname
Beispiel #17
0
    def __call__(self):
        """request processing"""

        self.log.debug("about to execute request")
        if not self.rmsMonitoring:
            gMonitor.addMark("RequestAtt", 1)

        # # setup proxy for request owner
        setupProxy = self.setupProxy()
        if not setupProxy["OK"]:
            userSuspended = "User is currently suspended"
            self.request.Error = setupProxy["Message"]
            # In case the user does not have proxy
            if DErrno.cmpError(setupProxy, DErrno.EPROXYFIND):
                self.log.error("Error setting proxy. Request set to Failed:",
                               setupProxy["Message"])
                # If user is no longer registered, fail the request
                for operation in self.request:
                    for opFile in operation:
                        opFile.Status = "Failed"
                    operation.Status = "Failed"
            elif userSuspended in setupProxy["Message"]:
                # If user is suspended, wait for a long time
                self.request.delayNextExecution(6 * 60)
                self.request.Error = userSuspended
                self.log.error("Error setting proxy: " + userSuspended,
                               self.request.OwnerDN)
            else:
                self.log.error("Error setting proxy", setupProxy["Message"])
            return S_OK(self.request)
        shifter = setupProxy["Value"]["Shifter"]

        error = None

        while self.request.Status == "Waiting":

            # # get waiting operation
            operation = self.request.getWaiting()
            if not operation["OK"]:
                self.log.error("Cannot get waiting operation",
                               operation["Message"])
                return operation
            operation = operation["Value"]
            self.log.info("executing operation", "%s" % operation.Type)

            # # and handler for it
            handler = self.getHandler(operation)
            if not handler["OK"]:
                self.log.error("Unable to process operation",
                               "%s: %s" % (operation.Type, handler["Message"]))
                # gMonitor.addMark( "%s%s" % ( operation.Type, "Fail" ), 1 )
                operation.Error = handler["Message"]
                break

            handler = handler["Value"]
            # # set shifters list in the handler
            handler.shifter = shifter
            # set rmsMonitoring flag for the RequestOperation
            handler.rmsMonitoring = self.rmsMonitoring
            # # and execute
            pluginName = self.getPluginName(
                self.handlersDict.get(operation.Type))
            if self.standalone:
                useServerCertificate = gConfig.useServerCertificate()
            else:
                # Always use server certificates if executed within an agent
                useServerCertificate = True
            try:
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Attempted",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Att"), 1)
                # Always use request owner proxy
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "false")
                exe = handler()
                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                if not exe["OK"]:
                    self.log.error("unable to process operation",
                                   "%s: %s" % (operation.Type, exe["Message"]))
                    if pluginName:
                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord({
                                "timestamp":
                                int(Time.toEpoch()),
                                "host":
                                Network.getFQDN(),
                                "objectType":
                                "Operation",
                                "operationType":
                                pluginName,
                                "objectID":
                                operation.OperationID,
                                "parentID":
                                operation.RequestID,
                                "status":
                                "Failed",
                                "nbObject":
                                1,
                            })
                        else:
                            gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Request",
                            "objectID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("RequestFail", 1)

                    if self.request.JobID:
                        # Check if the job exists
                        monitorServer = JobMonitoringClient(
                            useCertificates=True)
                        res = monitorServer.getJobSummary(
                            int(self.request.JobID))
                        if not res["OK"]:
                            self.log.error(
                                "RequestTask: Failed to get job status",
                                "%d" % self.request.JobID)
                        elif not res["Value"]:
                            self.log.warn(
                                "RequestTask: job does not exist (anymore): failed request",
                                "JobID: %d" % self.request.JobID,
                            )
                            for opFile in operation:
                                opFile.Status = "Failed"
                            if operation.Status != "Failed":
                                operation.Status = "Failed"
                            self.request.Error = "Job no longer exists"
            except Exception as e:
                error = str(e)
                self.log.exception("hit by exception:", "%s" % error)
                if pluginName:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord({
                            "timestamp":
                            int(Time.toEpoch()),
                            "host":
                            Network.getFQDN(),
                            "objectType":
                            "Operation",
                            "operationType":
                            pluginName,
                            "objectID":
                            operation.OperationID,
                            "parentID":
                            operation.RequestID,
                            "status":
                            "Failed",
                            "nbObject":
                            1,
                        })
                    else:
                        gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Request",
                        "objectID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("RequestFail", 1)

                if useServerCertificate:
                    gConfigurationData.setOptionInCFG(
                        "/DIRAC/Security/UseServerCertificate", "true")
                break

            # # operation status check
            if operation.Status == "Done" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Successful",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "OK"), 1)
            elif operation.Status == "Failed" and pluginName:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord({
                        "timestamp":
                        int(Time.toEpoch()),
                        "host":
                        Network.getFQDN(),
                        "objectType":
                        "Operation",
                        "operationType":
                        pluginName,
                        "objectID":
                        operation.OperationID,
                        "parentID":
                        operation.RequestID,
                        "status":
                        "Failed",
                        "nbObject":
                        1,
                    })
                else:
                    gMonitor.addMark("%s%s" % (pluginName, "Fail"), 1)
            elif operation.Status in ("Waiting", "Scheduled"):
                # # no update for waiting or all files scheduled
                break

        if not self.rmsMonitoring:
            gMonitor.flush()

        if error:
            return S_ERROR(error)

        # # request done?
        if self.request.Status == "Done":
            # # update request to the RequestDB
            self.log.info("Updating request status:",
                          "%s" % self.request.Status)
            update = self.updateRequest()
            if not update["OK"]:
                self.log.error("Cannot update request status",
                               update["Message"])
                return update
            self.log.info("request is done", "%s" % self.request.RequestName)
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord({
                    "timestamp":
                    int(Time.toEpoch()),
                    "host":
                    Network.getFQDN(),
                    "objectType":
                    "Request",
                    "objectID":
                    getattr(self.request, "RequestID", 0),
                    "status":
                    "Successful",
                    "nbObject":
                    1,
                })
            else:
                gMonitor.addMark("RequestOK", 1)
            # # and there is a job waiting for it? finalize!
            if self.request.JobID:
                attempts = 0
                while True:
                    finalizeRequest = self.requestClient.finalizeRequest(
                        self.request.RequestID,
                        self.request.JobID  # pylint: disable=no-member
                    )
                    if not finalizeRequest["OK"]:
                        if not attempts:
                            self.log.error(
                                "unable to finalize request, will retry",
                                "ReqName %s:%s" % (self.request.RequestName,
                                                   finalizeRequest["Message"]),
                            )
                        self.log.debug("Waiting 10 seconds")
                        attempts += 1
                        if attempts == 10:
                            self.log.error("Giving up finalize request")
                            return S_ERROR("Could not finalize request")

                        time.sleep(10)

                    else:
                        self.log.info(
                            "request is finalized",
                            "ReqName %s %s" % (self.request.RequestName,
                                               (" after %d attempts" %
                                                attempts) if attempts else ""),
                        )
                        break

        # Commit all the data to the ES Backend
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()
        # Request will be updated by the callBack method
        self.log.verbose("RequestTasks exiting",
                         "request %s" % self.request.Status)
        return S_OK(self.request)
Beispiel #18
0
 def __generateUniqueClientName(self):
   hashStr = ":".join((Time.toString(), str(random.random()), Network.getFQDN(), gLogger.getName()))
   hexHash = md5(hashStr).hexdigest()
   return hexHash
Beispiel #19
0
 def getHostname(self):
     hostname = self.getOption("/DIRAC/Hostname")
     if not hostname:
         return Network.getFQDN()
     return hostname
Beispiel #20
0
 def am_go(self):
     # Set the shifter proxy if required
     result = self._setShifterProxy()
     if not result['OK']:
         return result
     self.log.notice("-" * 40)
     self.log.notice("Starting cycle for module %s" %
                     self.__moduleProperties['fullName'])
     mD = self.am_getMaxCycles()
     if mD > 0:
         cD = self.__moduleProperties['cyclesDone']
         self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD))
     self.log.notice("-" * 40)
     # use SIGALARM as a watchdog interrupt if enabled
     watchdogInt = self.am_getWatchdogTime()
     if watchdogInt > 0:
         signal.signal(signal.SIGALRM, signal.SIG_DFL)
         signal.alarm(watchdogInt)
     elapsedTime = time.time()
     cpuStats = self._startReportToMonitoring()
     cycleResult = self.__executeModuleCycle()
     if cpuStats:
         self._endReportToMonitoring(*cpuStats)
     # Increment counters
     self.__moduleProperties['cyclesDone'] += 1
     # Show status
     elapsedTime = time.time() - elapsedTime
     self.__moduleProperties['totalElapsedTime'] += elapsedTime
     self.log.notice("-" * 40)
     self.log.notice("Agent module %s run summary" %
                     self.__moduleProperties['fullName'])
     self.log.notice(" Executed %s times previously" %
                     self.__moduleProperties['cyclesDone'])
     self.log.notice(" Cycle took %.2f seconds" % elapsedTime)
     averageElapsedTime = self.__moduleProperties[
         'totalElapsedTime'] / self.__moduleProperties['cyclesDone']
     self.log.notice(" Average execution time: %.2f seconds" %
                     (averageElapsedTime))
     elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption(
         'PollingTime')
     self.log.notice(" Polling time: %s seconds" %
                     self.am_getOption('PollingTime'))
     self.log.notice(" Average execution/polling time: %.2f%%" %
                     elapsedPollingRate)
     if cycleResult['OK']:
         self.log.notice(" Cycle was successful")
         if self.activityMonitoring:
             # Here we record the data about the cycle duration along with some basic details about the
             # component and right now it isn't committed to the ES backend.
             self.activityMonitoringReporter.addRecord({
                 'timestamp':
                 int(Time.toEpoch()),
                 'host':
                 Network.getFQDN(),
                 'componentType':
                 "agent",
                 'component':
                 "_".join(self.__moduleProperties['fullName'].split("/")),
                 'cycleDuration':
                 elapsedTime,
                 'cycles':
                 1
             })
     else:
         self.log.warn(" Cycle had an error:", cycleResult['Message'])
     self.log.notice("-" * 40)
     # Update number of cycles
     if not self.activityMonitoring:
         self.monitor.setComponentExtraParam(
             'cycles', self.__moduleProperties['cyclesDone'])
     # cycle finished successfully, cancel watchdog
     if watchdogInt > 0:
         signal.alarm(0)
     return cycleResult
Beispiel #21
0
    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        if not self.__rmsMonitoring:
            gMonitor.addMark("Iteration", 1)
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep)
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it. break out of the while loop to get next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(Time.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )
                            else:
                                gMonitor.addMark("Processed", 1)

                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                            break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()
Beispiel #22
0
 def am_go(self):
     # Set the shifter proxy if required
     result = self._setShifterProxy()
     if not result["OK"]:
         return result
     self.log.notice("-" * 40)
     self.log.notice("Starting cycle for module %s" %
                     self.__moduleProperties["fullName"])
     mD = self.am_getMaxCycles()
     if mD > 0:
         cD = self.__moduleProperties["cyclesDone"]
         self.log.notice("Remaining %s of %s cycles" % (mD - cD, mD))
     self.log.notice("-" * 40)
     # use SIGALARM as a watchdog interrupt if enabled
     watchdogInt = self.am_getWatchdogTime()
     if watchdogInt > 0:
         signal.signal(signal.SIGALRM, signal.SIG_DFL)
         signal.alarm(watchdogInt)
     elapsedTime = time.time()
     if self.activityMonitoring:
         initialWallTime, initialCPUTime, mem = self._startReportToMonitoring(
         )
     cycleResult = self.__executeModuleCycle()
     if self.activityMonitoring and initialWallTime and initialCPUTime:
         cpuPercentage = self._endReportToMonitoring(
             initialWallTime, initialCPUTime)
     # Increment counters
     self.__moduleProperties["cyclesDone"] += 1
     # Show status
     elapsedTime = time.time() - elapsedTime
     self.__moduleProperties["totalElapsedTime"] += elapsedTime
     self.log.notice("-" * 40)
     self.log.notice("Agent module %s run summary" %
                     self.__moduleProperties["fullName"])
     self.log.notice(" Executed %s times previously" %
                     self.__moduleProperties["cyclesDone"])
     self.log.notice(" Cycle took %.2f seconds" % elapsedTime)
     averageElapsedTime = self.__moduleProperties[
         "totalElapsedTime"] / self.__moduleProperties["cyclesDone"]
     self.log.notice(" Average execution time: %.2f seconds" %
                     (averageElapsedTime))
     elapsedPollingRate = averageElapsedTime * 100 / self.am_getOption(
         "PollingTime")
     self.log.notice(" Polling time: %s seconds" %
                     self.am_getOption("PollingTime"))
     self.log.notice(" Average execution/polling time: %.2f%%" %
                     elapsedPollingRate)
     if cycleResult["OK"]:
         self.log.notice(" Cycle was successful")
         if self.activityMonitoring:
             # Here we record the data about the cycle duration along with some basic details about the
             # agent and right now it isn't committed to the ES backend.
             self.activityMonitoringReporter.addRecord({
                 "AgentName":
                 self.agentName,
                 "timestamp":
                 int(TimeUtilities.toEpoch()),
                 "Host":
                 Network.getFQDN(),
                 "MemoryUsage":
                 mem,
                 "CpuPercentage":
                 cpuPercentage,
                 "CycleDuration":
                 elapsedTime,
             })
     else:
         self.log.warn(" Cycle had an error:", cycleResult["Message"])
     self.log.notice("-" * 40)
     # Update number of cycles
     # cycle finished successfully, cancel watchdog
     if watchdogInt > 0:
         signal.alarm(0)
     return cycleResult