Example #1
    def __call__(self):
        """call me maybe"""

        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")

        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error("Failed to check replicas",
                           checkReplicas["Message"])
        if getattr(self, "FTSMode", False):
            bannedGroups = getattr(self, "FTSBannedGroups", ())
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose(
                    "usage of FTS system is banned for request's owner")
                return self.dmTransfer()

            return self.fts3Transfer()

        return self.dmTransfer()
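
The comments above refer to `createRMSRecord`, defined in OperationHandlerBase, which builds the Elasticsearch record that MonitoringReporter buffers. As a minimal sketch of that pattern (the record fields below are assumptions for illustration, not taken from these examples; only `addRecord` and `commit` are the MonitoringReporter calls actually used on this page):

    import socket
    import time

    def createRMSRecord(status, nbObject):
        # Hypothetical record layout; the real schema is defined in OperationHandlerBase.
        return {
            "timestamp": int(time.time()),
            "host": socket.getfqdn(),
            "objectType": "Operation",
            "status": status,
            "nbObject": nbObject,
        }

    # reporter = MonitoringReporter(monitoringType="RMSMonitoring")
    # reporter.addRecord(createRMSRecord("Attempted", 10))
    # reporter.commit()  # flush the buffered records to Elasticsearch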
Example #2
    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        # Check whether to send to Monitoring or Accounting or both
        self.jobMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="WMSHistory")
        self.pilotMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="PilotsHistory")
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")
        self.datastores = {}  # For storing the clients to Accounting and Monitoring

        if "Accounting" in self.jobMonitoringOption:
            self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900)
        if "Monitoring" in self.jobMonitoringOption:
            self.datastores["Monitoring"] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue)
        if "Monitoring" in self.pilotMonitoringOption:
            self.pilotReporter = MonitoringReporter(
                monitoringType="PilotsHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == "User":
                field = "Owner"
            elif field == "UserGroup":
                field = "OwnerGroup"
            self.__jobDBFields.append(field)
        return S_OK()
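
This `initialize` only creates the clients; records are buffered and flushed later in the agent cycle. A hedged sketch of that commit step, assuming the agent pairs the two backends as below (the WMSHistory fields are illustrative; `addRecord`/`commit` on MonitoringReporter appear elsewhere on this page, and `commit` also exists on DataStoreClient):

    # Sketch of a commit step later in the agent's execute() cycle.
    if "Monitoring" in self.datastores:
        # MonitoringReporter buffers plain dicts
        self.datastores["Monitoring"].addRecord(
            {"Status": "Running", "Jobs": 42, "Site": "LCG.CERN.ch"})  # illustrative fields
    for name, client in self.datastores.items():
        result = client.commit()  # flush to the Accounting DB or to Elasticsearch
        if not result["OK"]:
            self.log.error("Failed to commit to %s" % name, result["Message"])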
Example #3
    def __call__(self):
        """call me maybe"""

        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # own gMonitor stuff for files
            gMonitor.registerActivity(
                "ReplicateAndRegisterAtt",
                "Replicate and register attempted",
                "RequestExecutingAgent",
                "Files/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            # # for FTS
            gMonitor.registerActivity("FTSScheduleAtt",
                                      "Files schedule attempted",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleOK",
                                      "File schedule successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleFail",
                                      "File schedule failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error("Failed to check replicas",
                           checkReplicas["Message"])
        if getattr(self, "FTSMode", False):
            bannedGroups = getattr(self, "FTSBannedGroups", ())
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose(
                    "usage of FTS system is banned for request's owner")
                return self.dmTransfer()

            return self.fts3Transfer()

        return self.dmTransfer()
Example #4
  def __init__(
          self,
          requestJSON,
          handlersDict,
          csPath,
          agentName,
          standalone=False,
          requestClient=None,
          rmsMonitoring=False):
    """c'tor

    :param self: self reference
    :param str requestJSON: request serialized to JSON
    :param dict handlersDict: operation handlers
    """
    self.request = Request(requestJSON)
    # # csPath
    self.csPath = csPath
    # # agent name
    self.agentName = agentName
    # # standalone flag
    self.standalone = standalone
    # # handlers dict
    self.handlersDict = handlersDict
    # # handlers class def
    self.handlers = {}
    # # own sublogger
    self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName))
    # # get shifters info
    self.__managersDict = {}
    shifterProxies = self.__setupManagerProxies()
    if not shifterProxies["OK"]:
      self.log.error("Cannot setup shifter proxies", shifterProxies["Message"])

    # This flag is set and sent from the RequestExecutingAgent and is False by default.
    self.rmsMonitoring = rmsMonitoring

    if self.rmsMonitoring:
      self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
    else:
      # # initialize gMonitor
      gMonitor.setComponentType(gMonitor.COMPONENT_AGENT)
      gMonitor.setComponentName(self.agentName)
      gMonitor.initialize()

      # # own gMonitor activities
      gMonitor.registerActivity("RequestAtt", "Requests processed",
                                "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
      gMonitor.registerActivity("RequestFail", "Requests failed",
                                "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)
      gMonitor.registerActivity("RequestOK", "Requests done",
                                "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM)

    if requestClient is None:
      self.requestClient = ReqClient()
    else:
      self.requestClient = requestClient
Example #5
    def __init__(self):
        monitoringType = "DataOperation"
        # Will use the `MonitoringBackends/Default` value as monitoring backend
        # unless a flag for `MonitoringBackends/DataOperation` is set.
        self.monitoringOptions = Operations().getMonitoringBackends(monitoringType)
        if "Monitoring" in self.monitoringOptions:
            self.dataOperationReporter = MonitoringReporter(monitoringType)
        if "Accounting" in self.monitoringOptions:
            self.dataOp = DataOperation()
Example #6
    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        self.backends = self.am_getOption("Backends", "Accounting").replace(" ", "").split(",")
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")

        self.log.info("Committing to %s backend" % " and ".join(self.backends))

        self.datastores = {}  # For storing the clients to Accounting and Monitoring

        if "Accounting" in self.backends:
            self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900)
        if "Monitoring" in self.backends:
            self.datastores["Monitoring"] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == "User":
                field = "Owner"
            elif field == "UserGroup":
                field = "OwnerGroup"
            self.__jobDBFields.append(field)
        return S_OK()
Example #7
    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        self.backends = self.am_getOption("Backends", "Accounting").replace(' ', '').split(',')
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")

        self.datastores = {}  # For storing the clients to Accounting and Monitoring

        if 'Accounting' in self.backends:
            self.datastores['Accounting'] = DataStoreClient(retryGraceTime=900)
        if 'Monitoring' in self.backends:
            self.datastores['Monitoring'] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)
        return S_OK()
Example #8
    def initializeHandler(cls, serviceInfo):
        """
    Handler class initialization
    """

        # Check the flag for monitoring of the state of the host
        hostMonitoring = cls.srv_getCSOption('HostMonitoring', True)

        if hostMonitoring:
            gThreadScheduler.addPeriodicTask(60, cls.__storeHostInfo)
            # the SystemAdministrator service does not have to use the client to report data about the host.

        # Check the flag for dynamic monitoring
        dynamicMonitoring = cls.srv_getCSOption('DynamicMonitoring', False)
        messageQueue = cls.srv_getCSOption('MessageQueue',
                                           'dirac.componentmonitoring')

        if dynamicMonitoring:
            global gMonitoringReporter
            gMonitoringReporter = MonitoringReporter(
                monitoringType="ComponentMonitoring",
                failoverQueueName=messageQueue)
            gThreadScheduler.addPeriodicTask(120, cls.__storeProfiling)

        keepSoftwareVersions = cls.srv_getCSOption('KeepSoftwareVersions', 0)
        if keepSoftwareVersions > 0:
            gLogger.info(
                "The last %s software version will be kept and the rest will be deleted!"
                % keepSoftwareVersions)
            gThreadScheduler.addPeriodicTask(
                600,
                cls.__deleteOldSoftware, (keepSoftwareVersions, ),
                executions=2)  # it is enough to try 2 times

        return S_OK('Initialization went well')
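
`gThreadScheduler.addPeriodicTask(120, cls.__storeProfiling)` invokes the callback every 120 seconds, but its body is not shown on this page. A minimal sketch of how it might look inside the handler class, assuming the ComponentMonitoring record is a plain dict (all field names are illustrative assumptions):

    import socket
    import time

    @classmethod
    def __storeProfiling(cls):
        # Hypothetical implementation: gather component metrics and ship them
        # through the global reporter created in initializeHandler.
        gMonitoringReporter.addRecord({
            "timestamp": int(time.time()),
            "host": socket.getfqdn(),
            "component": "Framework/SystemAdministrator",  # assumed identifier
        })
        return gMonitoringReporter.commit()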
Example #9
    def __initializeMonitor(self):
        """
        Initialize the system monitoring.
        """
        # This flag is used to activate ES based monitoring
        if self.activityMonitoring:
            self.log.debug("Monitoring of the agent is enabled.")
            # The import needs to be here because the CS must be initialized before importing
            # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="AgentMonitoring")
            # With the help of this periodic task we commit the data to ES at an interval of 100 seconds.
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)
            self.__monitorLastStatsUpdate = time.time()
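
The `__activityMonitoringReporting` callback registered above is likewise not shown. A minimal sketch, assuming it pushes one status record and flushes the buffer (the record content is an assumption; `addRecord` and `commit` are the MonitoringReporter calls used throughout these examples):

    import time

    def __activityMonitoringReporting(self):
        # Hypothetical periodic commit for the AgentMonitoring type.
        self.activityMonitoringReporter.addRecord({
            "timestamp": int(time.time()),
            "Status": "Running",  # illustrative field
        })
        return self.activityMonitoringReporter.commit()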
Example #10
    def initialize(self):
        """ Standard constructor
    """

        self.jobDB = JobDB()

        self.reportPeriod = 120
        self.am_setOption("PollingTime", self.reportPeriod)

        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory")

        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)

        return S_OK()
Example #11
    def initialize(self):
        """ Standard constructor
    """

        self.jobDB = JobDB()

        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue',
                                              'dirac.wmshistory')

        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)

        return S_OK()
Example #12
    def __initializeMonitor(self):
        """
        Initialize the system monitoring.
        """
        # This flag is used to activate ES based monitoring
        # if the "EnableActivityMonitoring" flag is "yes" or "true" in the cfg file.
        self.activityMonitoring = (
            Operations().getValue("EnableActivityMonitoring", False)
            or self.am_getOption("EnableActivityMonitoring", False))
        if self.activityMonitoring:
            # The import needs to be here because the CS must be initialized before importing
            # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="ComponentMonitoring")
            # With the help of this periodic task we commit the data to ES at an interval of 100 seconds.
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)
        else:
            if self.__moduleProperties['standalone']:
                self.monitor = gMonitor
            else:
                self.monitor = MonitoringClient()
            self.monitor.setComponentType(self.monitor.COMPONENT_AGENT)
            self.monitor.setComponentName(self.__moduleProperties['fullName'])
            self.monitor.initialize()
            self.monitor.registerActivity('CPU', "CPU Usage", 'Framework',
                                          "CPU,%", self.monitor.OP_MEAN, 600)
            self.monitor.registerActivity('MEM', "Memory Usage", 'Framework',
                                          'Memory,MB', self.monitor.OP_MEAN, 600)
            # Component monitor
            for field in ('version', 'DIRACVersion', 'description', 'platform'):
                self.monitor.setComponentExtraParam(
                    field, self.__codeProperties[field])
            self.monitor.setComponentExtraParam('startTime', Time.dateTime())
            self.monitor.setComponentExtraParam('cycles', 0)
            self.monitor.disable()
            self.__monitorLastStatsUpdate = time.time()
Example #13
    def __init__(
        self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None, rmsMonitoring=False
    ):
        """c'tor

        :param self: self reference
        :param str requestJSON: request serialized to JSON
        :param dict handlersDict: operation handlers
        """
        self.request = Request(requestJSON)
        # # csPath
        self.csPath = csPath
        # # agent name
        self.agentName = agentName
        # # standalone flag
        self.standalone = standalone
        # # handlers dict
        self.handlersDict = handlersDict
        # # handlers class def
        self.handlers = {}
        # # own sublogger
        self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName))
        # # get shifters info
        self.__managersDict = {}
        shifterProxies = self.__setupManagerProxies()
        if not shifterProxies["OK"]:
            self.log.error("Cannot setup shifter proxies", shifterProxies["Message"])

        # This flag is set and sent from the RequestExecutingAgent and is False by default.
        self.rmsMonitoring = rmsMonitoring

        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")

        if requestClient is None:
            self.requestClient = ReqClient()
        else:
            self.requestClient = requestClient
Example #14
  def initialize(self):
    """ Standard constructor
    """

    self.jobDB = JobDB()

    self.reportPeriod = 120
    self.am_setOption("PollingTime", self.reportPeriod)

    self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory")

    for field in self.__summaryKeyFieldsMapping:
      if field == 'User':
        field = 'Owner'
      elif field == 'UserGroup':
        field = 'OwnerGroup'
      self.__jobDBFields.append(field)

    return S_OK()
Example #15
  def initialize(self):
    """ Standard constructor
    """

    self.jobDB = JobDB()

    self.am_setOption("PollingTime", 900)
    self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory')

    self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

    for field in self.__summaryKeyFieldsMapping:
      if field == 'User':
        field = 'Owner'
      elif field == 'UserGroup':
        field = 'OwnerGroup'
      self.__jobDBFields.append(field)

    return S_OK()
Example #16
  def initializeHandler(cls, serviceInfo):
    """
    Handler class initialization
    """

    # Check the flag for monitoring of the state of the host
    hostMonitoring = cls.srv_getCSOption('HostMonitoring', True)

    if hostMonitoring:
      gThreadScheduler.addPeriodicTask(60, cls.__storeHostInfo)
      # the SystemAdministrator service does not have to use the client to report data about the host.

    # Check the flag for dynamic monitoring
    dynamicMonitoring = cls.srv_getCSOption('DynamicMonitoring', False)

    if dynamicMonitoring:
      global gMonitoringReporter
      gMonitoringReporter = MonitoringReporter(monitoringType="ComponentMonitoring")
      gThreadScheduler.addPeriodicTask(120, cls.__storeProfiling)

    return S_OK('Initialization went well')
Example #17
class ReplicateAndRegister(DMSRequestOperationsBase):
    """
    .. class:: ReplicateAndRegister

    ReplicateAndRegister operation handler
    """
    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: Operation instance
        :param str csPath: CS path for this handler
        """
        super(ReplicateAndRegister, self).__init__(operation, csPath)

        # # SE cache

        # Clients
        self.fc = FileCatalog()

    def __call__(self):
        """call me maybe"""

        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # own gMonitor stuff for files
            gMonitor.registerActivity(
                "ReplicateAndRegisterAtt",
                "Replicate and register attempted",
                "RequestExecutingAgent",
                "Files/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity("ReplicateOK", "Replications successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("ReplicateFail", "Replications failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK", "Registrations successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail", "Registrations failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            # # for FTS
            gMonitor.registerActivity("FTSScheduleAtt",
                                      "Files schedule attempted",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleOK",
                                      "File schedule successful",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("FTSScheduleFail",
                                      "File schedule failed",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # check replicas first
        checkReplicas = self.__checkReplicas()
        if not checkReplicas["OK"]:
            self.log.error("Failed to check replicas",
                           checkReplicas["Message"])
        if getattr(self, "FTSMode", False):
            bannedGroups = getattr(self, "FTSBannedGroups", ())
            if self.request.OwnerGroup in bannedGroups:
                self.log.verbose(
                    "usage of FTS system is banned for request's owner")
                return self.dmTransfer()

            return self.fts3Transfer()

        return self.dmTransfer()

    def __checkReplicas(self):
        """check done replicas and update file states"""
        waitingFiles = {opFile.LFN: opFile for opFile in self.operation
                        if opFile.Status in ("Waiting", "Scheduled")}
        targetSESet = set(self.operation.targetSEList)

        replicas = self.fc.getReplicas(list(waitingFiles))
        if not replicas["OK"]:
            self.log.error("Failed to get replicas", replicas["Message"])
            return replicas

        reMissing = re.compile(r".*such file.*")
        for failedLFN, errStr in replicas["Value"]["Failed"].items():
            waitingFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                self.log.error("File does not exists", failedLFN)
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("ReplicateFail", len(targetSESet))
                waitingFiles[failedLFN].Status = "Failed"

        for successfulLFN, reps in replicas["Value"]["Successful"].items():
            if targetSESet.issubset(set(reps)):
                self.log.info("file replicated to all targets", successfulLFN)
                waitingFiles[successfulLFN].Status = "Done"

        return S_OK()

    def _addMetadataToFiles(self, toSchedule):
        """Add metadata to those files that need to be scheduled through FTS

        toSchedule is a dictionary:
        {'lfn1': opFile, 'lfn2': opFile}
        """
        if toSchedule:
            self.log.info(
                "found %s files to schedule, getting metadata from FC" %
                len(toSchedule))
        else:
            self.log.verbose("No files to schedule")
            return S_OK([])

        res = self.fc.getFileMetadata(list(toSchedule))
        if not res["OK"]:
            return res
        else:
            if res["Value"]["Failed"]:
                self.log.warn(
                    "Can't schedule %d files: problems getting the metadata: %s"
                    % (len(res["Value"]["Failed"]), ", ".join(
                        res["Value"]["Failed"])))
            metadata = res["Value"]["Successful"]

        filesToSchedule = {}

        for lfn, lfnMetadata in metadata.items():
            opFileToSchedule = toSchedule[lfn][0]
            opFileToSchedule.GUID = lfnMetadata["GUID"]
            # In principle this is defined already in filterReplicas()
            if not opFileToSchedule.Checksum:
                opFileToSchedule.Checksum = lfnMetadata["Checksum"]
                opFileToSchedule.ChecksumType = lfnMetadata["ChecksumType"]
            opFileToSchedule.Size = lfnMetadata["Size"]

            filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule

        return S_OK(filesToSchedule)

    def _filterReplicas(self, opFile):
        """filter out banned/invalid source SEs"""
        return filterReplicas(opFile,
                              logger=self.log,
                              dataManager=self.dm,
                              opSources=self.operation.sourceSEList)

    def _checkExistingFTS3Operations(self):
        """
        Check if there are ongoing FTS3Operation for the current RMS Operation

        Under some conditions, we can be trying to schedule files while
        there is still an FTS transfer going on. This typically happens
        when the REA hangs. To prevent further race condition, we check
        if there are FTS3Operations in a non Final state matching the
        current operation ID. If so, we put the corresponding files in
        scheduled mode. We will then wait till the FTS3 Operation performs
        the callback

        :returns: S_OK with True if we can go on, False if we should stop the processing
        """

        res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID)

        if not res["OK"]:
            self.log.debug("Could not get FTS3Operations matching OperationID",
                           self.operation.OperationID)
            return res

        existingFTSOperations = res["Value"]
        # It is ok to have FTS Operations in a final state, so we
        # care only about the others
        unfinishedFTSOperations = [
            ops for ops in existingFTSOperations
            if ops.status not in FTS3TransferOperation.FINAL_STATES
        ]

        if not unfinishedFTSOperations:
            self.log.debug("No ongoing FTS3Operations, all good")
            return S_OK(True)

        self.log.warn(
            "Some FTS3Operations already exist for the RMS Operation:",
            [op.operationID for op in unfinishedFTSOperations],
        )

        # This would really be a screwed up situation !
        if len(unfinishedFTSOperations) > 1:
            self.log.warn("That's a serious problem !!")

        # We take the rmsFileID of the files in the Operations,
        # find the corresponding File object, and set them scheduled
        rmsFileIDsToSetScheduled = set([
            ftsFile.rmsFileID for ftsOp in unfinishedFTSOperations
            for ftsFile in ftsOp.ftsFiles
        ])

        for opFile in self.operation:
            # If it is in the DB, it has a FileID
            opFileID = opFile.FileID
            if opFileID in rmsFileIDsToSetScheduled:
                self.log.warn("Setting RMSFile as already scheduled", opFileID)
                opFile.Status = "Scheduled"

        # We return here such that the Request is set back to Scheduled in the DB
        # With no further modification
        return S_OK(False)

    def fts3Transfer(self):
        """replicate and register using FTS3"""

        self.log.info("scheduling files in FTS3...")

        # Check first if we do not have ongoing transfers

        res = self._checkExistingFTS3Operations()
        if not res["OK"]:
            return res

        # if res['Value'] is False
        # it means that there are ongoing transfers
        # and we should stop here
        if res["Value"] is False:
            # return S_OK such that the request is put back
            return S_OK()

        fts3Files = []
        toSchedule = {}

        # Dict which maps the FileID to the object
        rmsFilesIds = {}

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted",
                                     len(self.getWaitingFilesList())))

        for opFile in self.getWaitingFilesList():
            rmsFilesIds[opFile.FileID] = opFile

            opFile.Error = ""
            if not self.rmsMonitoring:
                gMonitor.addMark("FTSScheduleAtt")
            # # check replicas
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]

            validReplicas = replicas["Valid"]
            noMetaReplicas = replicas["NoMetadata"]
            noReplicas = replicas["NoReplicas"]
            badReplicas = replicas["Bad"]
            noPFN = replicas["NoPFN"]

            if validReplicas:
                validTargets = list(
                    set(self.operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    self.log.info("file %s is already present at all targets" %
                                  opFile.LFN)
                    opFile.Status = "Done"
                else:
                    toSchedule[opFile.LFN] = [opFile, validTargets]

            else:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("FTSScheduleFail")
                if noMetaReplicas:
                    self.log.warn(
                        "unable to schedule file",
                        "'%s': couldn't get metadata at %s" %
                        (opFile.LFN, ",".join(noMetaReplicas)),
                    )
                    opFile.Error = "Couldn't get metadata"
                elif noReplicas:
                    self.log.error(
                        "Unable to schedule transfer",
                        "File %s doesn't exist at %s" %
                        (opFile.LFN, ",".join(noReplicas)),
                    )
                    opFile.Error = "No replicas found"
                    opFile.Status = "Failed"
                elif badReplicas:
                    self.log.error(
                        "Unable to schedule transfer",
                        "File %s, all replicas have a bad checksum at %s" %
                        (opFile.LFN, ",".join(badReplicas)),
                    )
                    opFile.Error = "All replicas have a bad checksum"
                    opFile.Status = "Failed"
                elif noPFN:
                    self.log.warn(
                        "unable to schedule %s, could not get a PFN at %s" %
                        (opFile.LFN, ",".join(noPFN)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        res = self._addMetadataToFiles(toSchedule)
        if not res["OK"]:
            return res
        else:
            filesToSchedule = res["Value"]

            for lfn in filesToSchedule:
                opFile = filesToSchedule[lfn]
                validTargets = toSchedule[lfn][1]
                for targetSE in validTargets:
                    ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
                    fts3Files.append(ftsFile)

        if fts3Files:
            res = Registry.getUsernameForDN(self.request.OwnerDN)
            if not res["OK"]:
                self.log.error(
                    "Cannot get username for DN",
                    "%s %s" % (self.request.OwnerDN, res["Message"]))
                return res

            username = res["Value"]
            fts3Operation = FTS3TransferOperation.fromRMSObjects(
                self.request, self.operation, username)
            fts3Operation.ftsFiles = fts3Files

            try:
                if not fts3Operation.activity:
                    vo = getVOfromProxyGroup().get("Value")
                    fts3Plugin = getFTS3Plugin(vo=vo)
                    fts3Operation.activity = fts3Plugin.inferFTSActivity(
                        fts3Operation, self.request, self.operation)
            except Exception:
                pass

            ftsSchedule = FTS3Client().persistOperation(fts3Operation)
            if not ftsSchedule["OK"]:
                self.log.error("Completely failed to schedule to FTS3:",
                               ftsSchedule["Message"])
                return ftsSchedule

            # might have nothing to schedule
            ftsSchedule = ftsSchedule["Value"]
            self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

            self.log.info("%d files have been scheduled to FTS3" %
                          len(fts3Files))

            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Successful", len(fts3Files)))

            for ftsFile in fts3Files:
                opFile = rmsFilesIds[ftsFile.rmsFileID]
                if not self.rmsMonitoring:
                    gMonitor.addMark("FTSScheduleOK", 1)
                opFile.Status = "Scheduled"
                self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
        else:
            self.log.info("No files to schedule after metadata checks")

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # Just in case some transfers could not be scheduled, try them with RM
        return self.dmTransfer(fromFTS=True)

    def dmTransfer(self, fromFTS=False):
        """replicate and register using dataManager"""
        # # get waiting files. If none just return
        # # source SE
        sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
        if sourceSE:
            # # check source se for read
            bannedSource = self.checkSEsRSS(sourceSE, "ReadAccess")
            if not bannedSource["OK"]:
                if self.rmsMonitoring:
                    for status in ["Attempted", "Failed"]:
                        self.rmsMonitoringReporter.addRecord(
                            self.createRMSRecord(status, len(self.operation)))
                    self.rmsMonitoringReporter.commit()
                else:
                    gMonitor.addMark("ReplicateAndRegisterAtt",
                                     len(self.operation))
                    gMonitor.addMark("ReplicateFail", len(self.operation))
                return bannedSource

            if bannedSource["Value"]:
                self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
                self.log.info(self.operation.Error)
                return S_OK(self.operation.Error)

        # # check targetSEs for write
        bannedTargets = self.checkSEsRSS()
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("ReplicateAndRegisterAtt",
                                 len(self.operation))
                gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedTargets

        if bannedTargets["Value"]:
            self.operation.Error = "%s targets are banned for writing" % ",".join(
                bannedTargets["Value"])
            return S_OK(self.operation.Error)

        # Can continue now
        self.log.verbose("No targets banned for writing")

        waitingFiles = self.getWaitingFilesList()
        if not waitingFiles:
            return S_OK()
        # # loop over files
        if fromFTS:
            self.log.info(
                "Trying transfer using replica manager as FTS failed")
        else:
            self.log.info("Transferring files using Data manager...")
        errors = defaultdict(int)
        delayExecution = 0

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(waitingFiles)))

        for opFile in waitingFiles:
            if opFile.Error in (
                    "Couldn't get metadata",
                    "File doesn't exist",
                    "No active replica found",
                    "All replicas have a bad checksum",
            ):
                err = "File already in error status"
                errors[err] += 1

            if not self.rmsMonitoring:
                gMonitor.addMark("ReplicateAndRegisterAtt", 1)

            opFile.Error = ""
            lfn = opFile.LFN

            # Check if replica is at the specified source
            replicas = self._filterReplicas(opFile)
            if not replicas["OK"]:
                self.log.error("Failed to check replicas", replicas["Message"])
                continue
            replicas = replicas["Value"]
            validReplicas = replicas.get("Valid")
            noMetaReplicas = replicas.get("NoMetadata")
            noReplicas = replicas.get("NoReplicas")
            badReplicas = replicas.get("Bad")
            noActiveReplicas = replicas.get("NoActiveReplicas")

            if not validReplicas:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("ReplicateFail")
                if noMetaReplicas:
                    err = "Couldn't get metadata"
                    errors[err] += 1
                    self.log.verbose(
                        "unable to replicate '%s', couldn't get metadata at %s"
                        % (opFile.LFN, ",".join(noMetaReplicas)))
                    opFile.Error = err
                elif noReplicas:
                    err = "File doesn't exist"
                    errors[err] += 1
                    self.log.verbose(
                        "Unable to replicate", "File %s doesn't exist at %s" %
                        (opFile.LFN, ",".join(noReplicas)))
                    opFile.Error = err
                    opFile.Status = "Failed"
                elif badReplicas:
                    err = "All replicas have a bad checksum"
                    errors[err] += 1
                    self.log.error(
                        "Unable to replicate",
                        "%s, all replicas have a bad checksum at %s" %
                        (opFile.LFN, ",".join(badReplicas)),
                    )
                    opFile.Error = err
                    opFile.Status = "Failed"
                elif noActiveReplicas:
                    err = "No active replica found"
                    errors[err] += 1
                    self.log.verbose(
                        "Unable to schedule transfer", "%s, %s at %s" %
                        (opFile.LFN, err, ",".join(noActiveReplicas)))
                    opFile.Error = err
                    # All source SEs are banned, delay execution by 1 hour
                    delayExecution = 60
                continue
            # # get the first one in the list
            if sourceSE not in validReplicas:
                if sourceSE:
                    err = "File not at specified source"
                    errors[err] += 1
                    self.log.warn(
                        "%s is not at specified sourceSE %s, changed to %s" %
                        (lfn, sourceSE, validReplicas[0]))
                sourceSE = validReplicas[0]

            # # loop over targetSE
            catalogs = self.operation.Catalog
            if catalogs:
                catalogs = [cat.strip() for cat in catalogs.split(",")]

            for targetSE in self.operation.targetSEList:

                # # call DataManager
                if targetSE in validReplicas:
                    self.log.warn(
                        "Request to replicate %s to an existing location: %s" %
                        (lfn, targetSE))
                    continue
                res = self.dm.replicateAndRegister(lfn,
                                                   targetSE,
                                                   sourceSE=sourceSE,
                                                   catalog=catalogs)
                if res["OK"]:

                    if lfn in res["Value"]["Successful"]:

                        if "replicate" in res["Value"]["Successful"][lfn]:

                            repTime = res["Value"]["Successful"][lfn][
                                "replicate"]
                            prString = "file %s replicated at %s in %s s." % (
                                lfn, targetSE, repTime)

                            if not self.rmsMonitoring:
                                gMonitor.addMark("ReplicateOK", 1)

                            if "register" in res["Value"]["Successful"][lfn]:

                                if not self.rmsMonitoring:
                                    gMonitor.addMark("RegisterOK", 1)

                                regTime = res["Value"]["Successful"][lfn][
                                    "register"]
                                prString += " and registered in %s s." % regTime
                                self.log.info(prString)
                            else:

                                if not self.rmsMonitoring:
                                    gMonitor.addMark("RegisterFail", 1)
                                prString += " but failed to register"
                                self.log.warn(prString)

                                opFile.Error = "Failed to register"
                                # # add register replica operation
                                registerOperation = self.getRegisterOperation(
                                    opFile, targetSE, type="RegisterReplica")
                                self.request.insertAfter(
                                    registerOperation, self.operation)

                        else:

                            self.log.error("Failed to replicate",
                                           "%s to %s" % (lfn, targetSE))
                            if not self.rmsMonitoring:
                                gMonitor.addMark("ReplicateFail", 1)
                            opFile.Error = "Failed to replicate"

                    else:

                        if not self.rmsMonitoring:
                            gMonitor.addMark("ReplicateFail", 1)
                        reason = res["Value"]["Failed"][lfn]
                        self.log.error("Failed to replicate and register",
                                       "File %s at %s:" % (lfn, targetSE),
                                       reason)
                        opFile.Error = reason

                else:

                    if not self.rmsMonitoring:
                        gMonitor.addMark("ReplicateFail", 1)
                    opFile.Error = "DataManager error: %s" % res["Message"]
                    self.log.error("DataManager error", res["Message"])

            if not opFile.Error:
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Successful", 1))

                if len(self.operation.targetSEList) > 1:
                    self.log.info(
                        "file %s has been replicated to all targetSEs" % lfn)
                opFile.Status = "Done"
            elif self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord("Failed", 1))
        # Log error counts
        if delayExecution:
            self.log.info("Delay execution of the request by %d minutes" %
                          delayExecution)
            self.request.delayNextExecution(delayExecution)
        for error, count in errors.items():
            self.log.error(error, "for %d files" % count)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
Example #18
# TODO: use WMSHistory_testData.json as in Test_MonitoringDB.py

# pylint: disable=invalid-name,wrong-import-position
import DIRAC

DIRAC.initialize()  # Initialize configuration

from DIRAC import gLogger
from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

gLogger.setLevel("INFO")

wmsMonitoringReporter = MonitoringReporter(monitoringType="WMSHistory")
agentMonitoringReporter = MonitoringReporter(monitoringType="AgentMonitoring")
serviceMonitoringReporter = MonitoringReporter(
    monitoringType="ServiceMonitoring")
pilotMonitoringReporter = MonitoringReporter(
    monitoringType="PilotSubmissionMonitoring")
pilotsHistoryReporter = MonitoringReporter(monitoringType="PilotsHistory")
data = [
    {
        "Status": "Waiting",
        "Jobs": 2,
        "timestamp": 1458130176,
        "JobSplitType": "MCStripping",
        "MinorStatus": "unset",
        "Site": "LCG.GRIDKA.de",
        "Reschedules": 0,
Example #19
    def __call__(self):
        """ call me maybe """

        # The flag 'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(
                monitoringType="RMSMonitoring")
        else:
            # # RegisterFile specific monitor info
            gMonitor.registerActivity("RegisterAtt",
                                      "Attempted file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterOK",
                                      "Successful file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)
            gMonitor.registerActivity("RegisterFail",
                                      "Failed file registrations",
                                      "RequestExecutingAgent", "Files/min",
                                      gMonitor.OP_SUM)

        # # counter for failed files
        failedFiles = 0
        # # catalog(s) to use
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(',')]
        dm = DataManager(catalogs=catalogs)
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        for opFile in waitingFiles:

            if not self.rmsMonitoring:
                gMonitor.addMark("RegisterAtt", 1)

            # # get LFN
            lfn = opFile.LFN
            # # and others
            fileTuple = (lfn, opFile.PFN, opFile.Size,
                         self.operation.targetSEList[0], opFile.GUID,
                         opFile.Checksum)
            # # call DataManager
            registerFile = dm.registerFile(fileTuple)
            # # check results
            if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("RegisterFail", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "RegisterFail", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")

                reason = str(
                    registerFile.get(
                        "Message",
                        registerFile.get("Value",
                                         {}).get("Failed",
                                                 {}).get(lfn, 'Unknown')))
                errorStr = "failed to register LFN"
                opFile.Error = "%s: %s" % (errorStr, reason)
                if 'GUID already registered' in reason:
                    opFile.Status = 'Failed'
                    self.log.error(errorStr, "%s: %s" % (lfn, reason))
                elif 'File already registered with no replicas' in reason:
                    self.log.warn(
                        errorStr,
                        "%s: %s, will remove it and retry" % (lfn, reason))
                    dm.removeFile(lfn)
                else:
                    self.log.warn(errorStr, "%s: %s" % (lfn, reason))
                failedFiles += 1

            else:

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(
                        self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("RegisterOK", 1)
                # self.dataLoggingClient().addFileRecord(
                #     lfn, "Register", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile")

                self.log.verbose(
                    "file %s has been registered at %s" %
                    (lfn, ','.join(catalogs) if catalogs else "all catalogs"))
                opFile.Status = "Done"

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # # final check
        if failedFiles:
            self.log.warn("all files processed, %s files failed to register" %
                          failedFiles)
            self.operation.Error = "some files failed to register"
            return S_ERROR(self.operation.Error)

        return S_OK()
Example #20
class TornadoServer(object):
    """
    Tornado webserver

    Initialize and run an HTTPS Server for DIRAC services.
    By default it loads all HTTPS services defined in the CS,
    but you can also give an explicit list.

    The listening port is either:

    * Given as parameter
    * Loaded from the CS ``/Systems/Tornado/<instance>/Port``
    * Default to 8443


    Example 1: Easy way to start tornado::

      # Initialize server and load services
      serverToLaunch = TornadoServer()

      # Start listening when ready
      serverToLaunch.startTornado()

    Example 2: We want to debug service1 and service2 only, and use another port for that::

      services = ['component/service1:port1', 'component/service2']
      endpoints = ['component/endpoint1', 'component/endpoint2']
      serverToLaunch = TornadoServer(services=services, endpoints=endpoints, port=1234)
      serverToLaunch.startTornado()

    """
    def __init__(self, services=True, endpoints=False, port=None):
        """C'r

        :param list services: (default True) List of service handlers to load.
            If ``True``, loads all described in the CS
            If ``False``, do not load services
        :param list endpoints: (default False) List of endpoint handlers to load.
            If ``True``, loads all described in the CS
            If ``False``, do not load endpoints
        :param int port: Port to listen to.
            If ``None``, the port is resolved following the logic described in the class documentation
        """
        self.__startTime = time.time()
        # Application metadata, routes and settings mapping on the ports
        self.__appsSettings = {}
        # Default port, if another one is not discovered
        if port is None:
            port = gConfig.getValue(
                "/Systems/Tornado/%s/Port" %
                PathFinder.getSystemInstance("Tornado"), 8443)
        self.port = port

        # Handler manager initialization with default settings
        self.handlerManager = HandlerManager(services, endpoints)

        # temp value for computation, used by the monitoring
        self.__report = None
        # Last update time stamp
        self.__monitorLastStatsUpdate = None
        self.__monitoringLoopDelay = 60  # In secs

        self.activityMonitoring = False
        if "Monitoring" in Operations().getMonitoringBackends(
                monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True
        # If services are defined, load only these ones (useful for debug purpose or specific services)
        retVal = self.handlerManager.loadServicesHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError(
                "Some services can't be loaded, check the service names and configuration."
            )
        # Time taken to load the services; reported later as the response time
        self.__elapsedTime = time.time() - self.__startTime
        retVal = self.handlerManager.loadEndpointsHandlers()
        if not retVal["OK"]:
            sLog.error(retVal["Message"])
            raise ImportError(
                "Some endpoints can't be loaded, check the endpoint names and configuration."
            )

    def __calculateAppSettings(self):
        """Calculate application information mapping on the ports"""
        # if no service list is given, load services from configuration
        handlerDict = self.handlerManager.getHandlersDict()
        for data in handlerDict.values():
            port = data.get("Port") or self.port
            for hURL in data["URLs"]:
                if port not in self.__appsSettings:
                    self.__appsSettings[port] = {"routes": [], "settings": {}}
                if hURL not in self.__appsSettings[port]["routes"]:
                    self.__appsSettings[port]["routes"].append(hURL)
        return bool(self.__appsSettings)

    def loadServices(self, services):
        """Load a services

        :param services: List of service handlers to load. Default value set at initialization
            If ``True``, loads all services from CS
        :type services: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadServicesHandlers(services)

    def loadEndpoints(self, endpoints):
        """Load a endpoints

        :param endpoints: List of service handlers to load. Default value set at initialization
            If ``True``, loads all endpoints from CS
        :type endpoints: bool or list

        :return: S_OK()/S_ERROR()
        """
        return self.handlerManager.loadEndpointsHandlers(endpoints)

    def addHandlers(self, routes, settings=None, port=None):
        """Add new routes

        :param list routes: routes
        :param dict settings: application settings
        :param int port: port
        """
        port = port or self.port
        if port not in self.__appsSettings:
            self.__appsSettings[port] = {"routes": [], "settings": {}}
        if settings:
            self.__appsSettings[port]["settings"].update(settings)
        for route in routes:
            if route not in self.__appsSettings[port]["routes"]:
                self.__appsSettings[port]["routes"].append(route)

        return S_OK()

    def startTornado(self):
        """
        Starts the tornado server when ready.
        This method never returns.
        """
        # If no services are loaded:
        if not self.__calculateAppSettings():
            raise Exception(
                "No services loaded, please check your configuration")

        sLog.debug("Starting Tornado")

        # Prepare SSL settings
        certs = Locations.getHostCertificateAndKeyLocation()
        if certs is False:
            sLog.fatal("Host certificates not found ! Can't start the Server")
            raise ImportError("Unable to load certificates")
        ca = Locations.getCAsLocation()
        ssl_options = {
            "certfile": certs[0],
            "keyfile": certs[1],
            "cert_reqs": M2Crypto.SSL.verify_peer,
            "ca_certs": ca,
            "sslDebug": DEBUG_M2CRYPTO,  # Set to True to see the TLS debug messages
        }

        # Init monitoring
        if self.activityMonitoring:
            from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

            self.activityMonitoringReporter = MonitoringReporter(
                monitoringType="ServiceMonitoring")
            self.__monitorLastStatsUpdate = time.time()
            self.__report = self.__startReportToMonitoringLoop()
            # Start the periodic reporting loop: PeriodicCallback expects a callable
            # and a period in ms (__monitoringLoopDelay is defined in seconds)
            tornado.ioloop.PeriodicCallback(
                lambda: self.__reportToMonitoring(self.__elapsedTime),
                self.__monitoringLoopDelay * 1000).start()

            # If we are running with python3, Tornado will use asyncio,
            # and we have to convince it to let us run in a different thread
            # Doing this ensures a consistent behavior between py2 and py3
            asyncio.set_event_loop_policy(
                tornado.platform.asyncio.AnyThreadEventLoopPolicy())

        for port, app in self.__appsSettings.items():
            sLog.debug(" - %s" % "\n - ".join(
                ["%s = %s" % (k, ssl_options[k]) for k in ssl_options]))

            # Default server configuration
            settings = dict(compress_response=True, cookie_secret="secret")

            # Merge application settings
            settings.update(app["settings"])
            # Start server
            router = Application(app["routes"],
                                 default_handler_class=NotFoundHandler,
                                 **settings)
            server = HTTPServer(router,
                                ssl_options=ssl_options,
                                decompress_request=True)
            try:
                server.listen(int(port))
            except Exception as e:  # pylint: disable=broad-except
                sLog.exception("Exception starting HTTPServer", e)
                raise
            sLog.always("Listening on port %s" % port)

        tornado.ioloop.IOLoop.current().start()
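
A note on the lambda-wrapped PeriodicCallback above: Tornado expects a callable and a period in milliseconds, so the report call has to be wrapped rather than invoked at registration time. A minimal standalone sketch, not DIRAC code (names and values are illustrative):

import tornado.ioloop

def report(elapsed):
    # Stands in for __reportToMonitoring(); 'elapsed' is captured once at setup
    print("reporting, startup took %.2f s" % elapsed)

loop = tornado.ioloop.IOLoop.current()
# Wrap the call in a lambda so Tornado invokes it every second,
# instead of receiving the result of a single immediate call
cb = tornado.ioloop.PeriodicCallback(lambda: report(1.23), 1000)
cb.start()
loop.call_later(3.5, loop.stop)  # let it fire a few times, then exit
loop.start()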

    def __reportToMonitoring(self, responseTime):
        """
        Periodically reports to Monitoring
        """

        # Calculate CPU usage by comparing realtime and cpu time since last report
        percentage = self.__endReportToMonitoringLoop(self.__report[0],
                                                      self.__report[1])
        # Send record to Monitoring
        self.activityMonitoringReporter.addRecord({
            "timestamp": int(TimeUtilities.toEpoch()),
            "Host": Network.getFQDN(),
            "ServiceName": "Tornado",
            "MemoryUsage": self.__report[2],
            "CpuPercentage": percentage,
            "ResponseTime": responseTime,
        })
        self.activityMonitoringReporter.commit()
        # Save memory usage and save realtime/CPU time for next call
        self.__report = self.__startReportToMonitoringLoop()

    def __startReportToMonitoringLoop(self):
        """
        Snapshot of resources to be taken at the beginning
        of a monitoring cycle.
        Also sends memory snapshot to the monitoring.

        This is basically copy/paste of Service.py

        :returns: tuple ( time.time(), cpuTime, memory )

        """
        now = time.time()  # Used to calculate a delta
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        mem = 0.0  # Default so the memory slot is always present in the tuple
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime, mem)
        # Send CPU consumption mark
        self.__monitorLastStatsUpdate = now
        # Send Memory consumption mark
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoringLoop(self, initialWallTime, initialCPUTime):
        """
        Snapshot of resources to be taken at the end
        of a monitoring cycle.

        This is basically copy/paste of Service.py

        Determines CPU usage by comparing wall-clock time and CPU time, and returns the percentage
        """
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.0
        return percentage
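
The two snapshot helpers above reduce to comparing elapsed wall-clock time against consumed CPU time from os.times(). A self-contained sketch of the same measurement (the CPU-burning loop is just for demonstration):

import os
import time

def snapshot():
    stats = os.times()
    # Wall clock, plus user CPU time of the process and its finished children:
    # the same stats[0] + stats[2] combination used in the helpers above
    return time.time(), stats[0] + stats[2]

wall0, cpu0 = snapshot()
sum(i * i for i in range(10 ** 6))  # burn a little CPU
wall1, cpu1 = snapshot()
percentage = (cpu1 - cpu0) / (wall1 - wall0) * 100.0
print("CPU usage over the interval: %.1f%%" % percentage)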
Exemple #22
0
class StatesMonitoringAgent(AgentModule):
    """
      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used
                   for the agent restart
  """

    __summaryKeyFieldsMapping = [
        'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType',
        'ApplicationStatus', 'MinorStatus'
    ]
    __summaryDefinedFields = [('ApplicationStatus', 'unset'),
                              ('MinorStatus', 'unset')]
    __summaryValueFieldsMapping = ['Jobs', 'Reschedules']
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    __jobDBFields = []

    jobDB = None
    monitoringReporter = None

    def initialize(self):
        """ Standard constructor
    """

        self.jobDB = JobDB()

        self.am_setOption("PollingTime", 900)
        self.messageQueue = self.am_getOption('MessageQueue',
                                              'dirac.wmshistory')

        self.monitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

        for field in self.__summaryKeyFieldsMapping:
            if field == 'User':
                field = 'Owner'
            elif field == 'UserGroup':
                field = 'OwnerGroup'
            self.__jobDBFields.append(field)

        return S_OK()

    def execute(self):
        """ Main execution method
    """
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" %
                      ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error("Can't get the jobdb summary", result['Message'])
        else:
            values = result['Value'][1]
            self.log.info("Start sending records!")
            for record in values:
                recordSetup = record[0]
                if recordSetup not in validSetups:
                    self.log.error("Setup %s is not valid" % recordSetup)
                    continue
                record = record[1:]
                rD = {}
                for fV in self.__summaryDefinedFields:
                    rD[fV[0]] = fV[1]
                for iP in range(len(self.__summaryKeyFieldsMapping)):
                    fieldName = self.__summaryKeyFieldsMapping[iP]
                    rD[self.__renameFieldsMapping.get(fieldName,
                                                      fieldName)] = record[iP]
                record = record[len(self.__summaryKeyFieldsMapping):]
                for iP in range(len(self.__summaryValueFieldsMapping)):
                    rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])
                rD['timestamp'] = int(Time.toEpoch(now))
                self.monitoringReporter.addRecord(rD)
            retVal = self.monitoringReporter.commit()
            if retVal['OK']:
                self.log.info(
                    "The records are successfully sent to the Store!")
            else:
                self.log.warn(
                    "Failed to insert the records! It will be retried in the next iteration",
                    retVal['Message'])

        return S_OK()
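
The record-building loop in execute() above maps each flat snapshot row onto a dict: the predefined fields first, then the key fields (renaming JobType to JobSplitType), then the integer value fields. A standalone sketch with a made-up row (field values are illustrative):

keyFields = ["Status", "Site", "User", "UserGroup", "JobGroup", "JobType",
             "ApplicationStatus", "MinorStatus"]
valueFields = ["Jobs", "Reschedules"]
renameFields = {"JobType": "JobSplitType"}

# Hypothetical snapshot row: key field values followed by value field values
row = ("Running", "LCG.CERN.ch", "jdoe", "dirac_user", "prod-123",
       "MCSimulation", "unset", "unset", 42, 3)

rD = {"ApplicationStatus": "unset", "MinorStatus": "unset"}
for iP, fieldName in enumerate(keyFields):
    rD[renameFields.get(fieldName, fieldName)] = row[iP]
for iP, fieldName in enumerate(valueFields):
    rD[fieldName] = int(row[len(keyFields) + iP])
print(rD)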
Exemple #23
0
class RemoveReplica(DMSRequestOperationsBase):
    """
    .. class:: RemoveReplica

    """

    def __init__(self, operation=None, csPath=None):
        """c'tor

        :param self: self reference
        :param Operation operation: operation to execute
        :param str csPath: CS path for this handler
        """
        # # base class ctor
        DMSRequestOperationsBase.__init__(self, operation, csPath)

    def __call__(self):
        """remove replicas"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # gMonitor stuff
            gMonitor.registerActivity(
                "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )

        # # prepare list of targetSEs
        targetSEs = self.operation.targetSEList
        # # check targetSEs for removal
        bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("RemoveReplicaAtt")
                gMonitor.addMark("RemoveReplicaFail")
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # and prepare dict
        toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

        self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
        else:
            gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))

        # # keep status for each targetSE
        removalStatus = dict.fromkeys(toRemoveDict, None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, None)

        # # loop over targetSEs
        for targetSE in targetSEs:

            self.log.info("Removing replicas at %s" % targetSE)

            # # 1st step - bulk removal
            bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Bulk replica removal failed", bulkRemoval["Message"])

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.commit()

                return bulkRemoval

            # # report removal status for successful files
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord(
                        "Successful", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                    )
                )
            else:
                gMonitor.addMark(
                    "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                )

            # # 2nd step - process the rest again
            toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error)
            for lfn, opFile in toRetry.items():
                self._removeWithOwnerProxy(opFile, targetSE)
                if opFile.Error:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaFail", 1)
                    removalStatus[lfn][targetSE] = opFile.Error
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaOK", 1)

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = list(set(error for error in removalStatus[opFile.LFN].values() if error))
                if errors:
                    opFile.Error = "\n".join(errors)
                    # This seems to be the only unrecoverable error
                    if "Write access not permitted for this credential" in opFile.Error:
                        failed += 1
                        opFile.Status = "Failed"
                else:
                    opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s replicas" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()

    def _bulkRemoval(self, toRemoveDict, targetSE):
        """remove replicas :toRemoveDict: at :targetSE:

        :param dict toRemoveDict: { lfn: opFile, ... }
        :param str targetSE: target SE name
        """
        # Clear the error
        for opFile in toRemoveDict.values():
            opFile.Error = ""
        removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict))
        if not removeReplicas["OK"]:
            for opFile in toRemoveDict.values():
                opFile.Error = removeReplicas["Message"]
            return removeReplicas
        removeReplicas = removeReplicas["Value"]
        # # filter out failed
        for lfn, opFile in toRemoveDict.items():
            if lfn in removeReplicas["Failed"]:
                errorReason = str(removeReplicas["Failed"][lfn])
                # If the reason is that the file does not exist,
                # we consider the removal successful
                # TODO: use cmpError once the FC returns the proper error msg corresponding to ENOENT
                if "No such file" not in errorReason:
                    opFile.Error = errorReason
                    self.log.error("Failed removing lfn", "%s:%s" % (lfn, opFile.Error))

        return S_OK()

    def _removeWithOwnerProxy(self, opFile, targetSE):
        """remove opFile replica from targetSE using owner proxy

        :param File opFile: File instance
        :param str targetSE: target SE name
        """
        if "Write access not permitted for this credential" in opFile.Error:
            proxyFile = None
            if "DataManager" in self.shifter:
                # #  you're a data manager - save current proxy and get a new one for LFN and retry
                saveProxy = os.environ["X509_USER_PROXY"]
                try:
                    fileProxy = self.getProxyForLFN(opFile.LFN)
                    if not fileProxy["OK"]:
                        opFile.Error = fileProxy["Message"]
                    else:
                        proxyFile = fileProxy["Value"]
                        removeReplica = self.dm.removeReplica(targetSE, opFile.LFN)
                        if not removeReplica["OK"]:
                            opFile.Error = removeReplica["Message"]
                        else:
                            # Set or reset the error if all OK
                            opFile.Error = removeReplica["Value"]["Failed"].get(opFile.LFN, "")
                finally:
                    if proxyFile:
                        os.unlink(proxyFile)
                    # # put back request owner proxy to env
                    os.environ["X509_USER_PROXY"] = saveProxy
Exemple #24
0
class StatesAccountingAgent(AgentModule):
    """Agent that every 15 minutes will report
    to the AccountingDB (MySQL) or the Monitoring DB (ElasticSearch), or both,
    a snapshot of the JobDB.
    Also sends a snapshot of PilotAgentsDB to Monitoring.
    """

    # WMSHistory fields
    __summaryKeyFieldsMapping = [
        "Status",
        "Site",
        "User",
        "UserGroup",
        "JobGroup",
        "JobType",
        "ApplicationStatus",
        "MinorStatus",
    ]
    __summaryDefinedFields = [("ApplicationStatus", "unset"),
                              ("MinorStatus", "unset")]
    __summaryValueFieldsMapping = ["Jobs", "Reschedules"]
    __renameFieldsMapping = {"JobType": "JobSplitType"}

    # PilotsHistory fields
    __pilotsMapping = [
        "TaskQueueID", "GridSite", "GridType", "Status", "NumOfPilots"
    ]

    def initialize(self):
        """Standard initialization"""
        # This agent will always loop every 15 minutes
        self.am_setOption("PollingTime", 900)

        # Check whether to send to Monitoring or Accounting or both
        self.jobMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="WMSHistory")
        self.pilotMonitoringOption = Operations().getMonitoringBackends(
            monitoringType="PilotsHistory")
        messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory")
        self.datastores = {}  # For storing the clients to Accounting and Monitoring

        if "Accounting" in self.jobMonitoringOption:
            self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900)
        if "Monitoring" in self.jobMonitoringOption:
            self.datastores["Monitoring"] = MonitoringReporter(
                monitoringType="WMSHistory", failoverQueueName=messageQueue)
        if "Monitoring" in self.pilotMonitoringOption:
            self.pilotReporter = MonitoringReporter(
                monitoringType="PilotsHistory", failoverQueueName=messageQueue)

        self.__jobDBFields = []
        for field in self.__summaryKeyFieldsMapping:
            if field == "User":
                field = "Owner"
            elif field == "UserGroup":
                field = "OwnerGroup"
            self.__jobDBFields.append(field)
        return S_OK()

    def execute(self):
        """Main execution method"""
        # PilotsHistory to Monitoring
        if "Monitoring" in self.pilotMonitoringOption:
            self.log.info("Committing PilotsHistory to Monitoring")
            result = PilotAgentsDB().getSummarySnapshot()
            now = datetime.datetime.utcnow()
            if not result["OK"]:
                self.log.error(
                    "Can't get the PilotAgentsDB summary",
                    "%s: won't commit PilotsHistory at this cycle" %
                    result["Message"],
                )

            values = result["Value"][1]
            for record in values:
                rD = {}
                for iP, _ in enumerate(self.__pilotsMapping):
                    rD[self.__pilotsMapping[iP]] = record[iP]
                rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                self.pilotReporter.addRecord(rD)

            self.log.info("Committing to Monitoring...")
            result = self.pilotReporter.commit()
            if not result["OK"]:
                self.log.error("Could not commit to Monitoring",
                               result["Message"])
            self.log.verbose("Done committing PilotsHistory to Monitoring")

        # WMSHistory to Monitoring or Accounting
        self.log.info("Committing WMSHistory to %s backend" %
                      "and ".join(self.jobMonitoringOption))
        result = JobDB().getSummarySnapshot(self.__jobDBFields)
        now = datetime.datetime.utcnow()
        if not result["OK"]:
            self.log.error(
                "Can't get the JobDB summary",
                "%s: won't commit WMSHistory at this cycle" %
                result["Message"])
            return S_ERROR()

        values = result["Value"][1]

        self.log.info("Start sending WMSHistory records")
        for record in values:
            record = record[1:]
            rD = {}
            for fV in self.__summaryDefinedFields:
                rD[fV[0]] = fV[1]
            for iP, _ in enumerate(self.__summaryKeyFieldsMapping):
                fieldName = self.__summaryKeyFieldsMapping[iP]
                rD[self.__renameFieldsMapping.get(fieldName,
                                                  fieldName)] = record[iP]
            record = record[len(self.__summaryKeyFieldsMapping):]
            for iP, _ in enumerate(self.__summaryValueFieldsMapping):
                rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP])

            for backend in self.datastores:
                if backend.lower() == "monitoring":
                    rD["timestamp"] = int(TimeUtilities.toEpoch(now))
                    self.datastores["Monitoring"].addRecord(rD)

                elif backend.lower() == "accounting":
                    acWMS = WMSHistory()
                    acWMS.setStartTime(now)
                    acWMS.setEndTime(now)
                    acWMS.setValuesFromDict(rD)
                    retVal = acWMS.checkValues()
                    if not retVal["OK"]:
                        self.log.error("Invalid WMSHistory accounting record ",
                                       "%s -> %s" % (retVal["Message"], rD))
                    else:
                        self.datastores["Accounting"].addRegister(acWMS)

        for backend, datastore in self.datastores.items():
            self.log.info("Committing WMSHistory records to %s backend" %
                          backend)
            result = datastore.commit()
            if not result["OK"]:
                self.log.error("Couldn't commit WMSHistory to %s" % backend,
                               result["Message"])
                return S_ERROR()
            self.log.verbose("Done committing WMSHistory to %s backend" %
                             backend)

        return S_OK()
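
The commit loop above iterates over whatever clients were configured into self.datastores and aborts the cycle on the first failing backend. A minimal sketch of that dispatch with stub clients (stand-ins for DataStoreClient and MonitoringReporter, not the real APIs):

class StubClient(object):
    # Stands in for DataStoreClient / MonitoringReporter
    def __init__(self, name, ok=True):
        self.name, self.ok = name, ok

    def commit(self):
        return {"OK": self.ok,
                "Message": "" if self.ok else "%s unreachable" % self.name}

datastores = {"Monitoring": StubClient("Monitoring"),
              "Accounting": StubClient("Accounting", ok=False)}

for backend, datastore in datastores.items():
    result = datastore.commit()
    if not result["OK"]:
        print("Couldn't commit WMSHistory to %s: %s" % (backend, result["Message"]))
        break
    print("Committed WMSHistory records to %s backend" % backend)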
Exemple #26
0
class RequestExecutingAgent(AgentModule):
    """
    .. class:: RequestExecutingAgent

    request processing agent using ProcessPool, Operation handlers and RequestTask
    """

    # # process pool
    __processPool = None
    # # request cache
    __requestCache = {}
    # # requests/cycle
    __requestsPerCycle = 100
    # # minimal nb of subprocess running
    __minProcess = 20
    # # maximal nb of subprocess executed same time
    __maxProcess = 20
    # # ProcessPool queue size
    __queueSize = 20
    # # file timeout
    __fileTimeout = 300
    # # operation timeout
    __operationTimeout = 300
    # # ProcessPool finalization timeout
    __poolTimeout = 900
    # # ProcessPool sleep time
    __poolSleep = 5
    # # placeholder for RequestClient instance
    __requestClient = None
    # # Size of the bulk if use of getRequests. If 0, use getRequest
    __bulkRequest = 0
    # # Send the monitoring data to ES rather than the Framework/Monitoring
    __rmsMonitoring = False

    def __init__(self, *args, **kwargs):
        """c'tor"""
        # # call base class ctor
        AgentModule.__init__(self, *args, **kwargs)
        # # ProcessPool related stuff
        self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle)
        self.log.info("Requests/cycle = %d" % self.__requestsPerCycle)
        self.__minProcess = self.am_getOption("MinProcess", self.__minProcess)
        self.log.info("ProcessPool min process = %d" % self.__minProcess)
        self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess)
        self.log.info("ProcessPool max process = %d" % self.__maxProcess)
        self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize)
        self.log.info("ProcessPool queue size = %d" % self.__queueSize)
        self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout))
        self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout)
        self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep))
        self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep)
        self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest)
        self.log.info("Bulk request size = %d" % self.__bulkRequest)
        self.__rmsMonitoring = self.am_getOption("EnableRMSMonitoring", self.__rmsMonitoring)
        self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring)

        # # keep config path and agent name
        self.agentName = self.am_getModuleParam("fullName")
        self.__configPath = PathFinder.getAgentSection(self.agentName)

        # # operation handlers over here
        opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers")
        opHandlers = gConfig.getSections(opHandlersPath)
        if not opHandlers["OK"]:
            self.log.error(opHandlers["Message"])
            raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath)
        opHandlers = opHandlers["Value"]

        self.timeOuts = dict()

        # # handlers dict
        self.handlersDict = dict()
        for opHandler in opHandlers:
            opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler)
            opLocation = gConfig.getValue(opHandlerPath, "")
            if not opLocation:
                self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler))
                continue
            self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout}

            opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0)
            if opTimeout:
                self.timeOuts[opHandler]["PerOperation"] = opTimeout
            fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0)
            if fileTimeout:
                self.timeOuts[opHandler]["PerFile"] = fileTimeout

            self.handlersDict[opHandler] = opLocation

        self.log.info("Operation handlers:")
        for i, (opHandler, opLocation) in enumerate(self.handlersDict.items()):
            self.log.info(
                "[%s] %s: %s (timeout: %d s + %d s per file)"
                % (
                    i,
                    opHandler,
                    opLocation,
                    self.timeOuts[opHandler]["PerOperation"],
                    self.timeOuts[opHandler]["PerFile"],
                )
            )

        if self.__rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
            gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting)
        else:
            # # common monitor activity
            gMonitor.registerActivity("Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM)
            gMonitor.registerActivity(
                "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM
            )

        # # create request dict
        self.__requestCache = dict()

        # ?? Probably should be removed
        self.FTSMode = self.am_getOption("FTSMode", False)

    def processPool(self):
        """facade for ProcessPool"""
        if not self.__processPool:
            minProcess = max(1, self.__minProcess)
            maxProcess = max(self.__minProcess, self.__maxProcess)
            queueSize = abs(self.__queueSize)
            self.log.info(
                "REA ProcessPool configuration",
                "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize),
            )
            self.__processPool = ProcessPool(
                minProcess,
                maxProcess,
                queueSize,
                poolCallback=self.resultCallback,
                poolExceptionCallback=self.exceptionCallback,
            )
            self.__processPool.daemonize()
        return self.__processPool

    def requestClient(self):
        """RequestClient getter"""
        if not self.__requestClient:
            self.__requestClient = ReqClient()
        return self.__requestClient

    def cacheRequest(self, request):
        """put request into requestCache

        :param ~Request.Request request: Request instance
        """
        maxProcess = max(self.__minProcess, self.__maxProcess)
        if len(self.__requestCache) > maxProcess + 50:
            # For the time being we just print a warning... If the ProcessPool is working well, this is not needed
            # We don't know how much is acceptable as it depends on many factors
            self.log.warn("Too many requests in cache", ": %d" % len(self.__requestCache))
        #      return S_ERROR( "Too many requests in cache" )
        if request.RequestID in self.__requestCache:
            # We don't call putRequest since we got back a request that is still being executed; better keep it.
            # The main reason is that it lasted longer than the kick time of the CleanReqAgent.
            self.log.warn(
                "Duplicate request, keep it but don't execute", ": %d/%s" % (request.RequestID, request.RequestName)
            )
            return S_ERROR(errno.EALREADY, "Request already in cache")
        self.__requestCache[request.RequestID] = request
        return S_OK()

    def putRequest(self, requestID, taskResult=None):
        """put back :requestID: to RequestClient

        :param str requestID: request's id
        """
        if requestID in self.__requestCache:
            request = self.__requestCache.pop(requestID)
            if taskResult:
                if taskResult["OK"]:
                    request = taskResult["Value"]
                    # The RequestTask is putting back the Done tasks, no need to redo it
                    if request.Status == "Done":
                        return S_OK()
                # In case of timeout, we need to increment ourselves all the attempts
                elif cmpError(taskResult, errno.ETIME):
                    waitingOp = request.getWaiting()
                    for rmsFile in waitingOp.get("Value", []):
                        rmsFile.Attempt += 1

            reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2)
            if not reset["OK"]:
                return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"]))
        else:
            return S_ERROR("Not in cache")
        return S_OK()

    def putAllRequests(self):
        """put back all requests without callback called into requestClient

        :param self: self reference
        """
        self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache))
        for requestID in self.__requestCache.keys():
            reset = self.putRequest(requestID)
            if not reset["OK"]:
                self.log.error("Failed to put request", reset["Message"])
            else:
                self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID)
        return S_OK()

    def initialize(self):
        """initialize agent"""
        return S_OK()

    def execute(self):
        """read requests from RequestClient and enqueue them into ProcessPool"""
        if not self.__rmsMonitoring:
            gMonitor.addMark("Iteration", 1)
        # # requests (and so tasks) counter
        taskCounter = 0
        while taskCounter < self.__requestsPerCycle:
            self.log.debug("execute: executing %d request in this cycle" % taskCounter)

            requestsToExecute = []

            if not self.__bulkRequest:
                self.log.info("execute: ask for a single request")
                getRequest = self.requestClient().getRequest()
                if not getRequest["OK"]:
                    self.log.error("execute:", "%s" % getRequest["Message"])
                    break
                if not getRequest["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                requestsToExecute = [getRequest["Value"]]
            else:
                numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter)
                self.log.info("execute: ask for requests", "%s" % numberOfRequest)
                getRequests = self.requestClient().getBulkRequests(numberOfRequest)
                if not getRequests["OK"]:
                    self.log.error("execute:", "%s" % getRequests["Message"])
                    break
                if not getRequests["Value"]:
                    self.log.info("execute: no more 'Waiting' requests to process")
                    break
                for rId in getRequests["Value"]["Failed"]:
                    self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId])

                requestsToExecute = list(getRequests["Value"]["Successful"].values())

            self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute))

            for request in requestsToExecute:
                # # set task id
                taskID = request.RequestID

                self.log.info(
                    "processPool status",
                    "tasks idle = %s working = %s"
                    % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()),
                )

                looping = 0
                while True:
                    if not self.processPool().getFreeSlots():
                        if not looping:
                            self.log.info(
                                "No free slots available in processPool",
                                "will wait %d seconds to proceed" % self.__poolSleep,
                            )
                        time.sleep(self.__poolSleep)
                        looping += 1
                    else:
                        if looping:
                            self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep)
                        looping = 0
                        # # save current request in cache
                        res = self.cacheRequest(request)
                        if not res["OK"]:
                            if cmpError(res, errno.EALREADY):
                                # The request is already in the cache, skip it. break out of the while loop to get next request
                                break
                            # There are too many requests in the cache, commit suicide
                            self.log.error(
                                "Too many requests in cache",
                                "(%d requests): put back all requests and exit cycle. Error %s"
                                % (len(self.__requestCache), res["Message"]),
                            )
                            self.putAllRequests()
                            return res
                        # # serialize to JSON
                        result = request.toJSON()
                        if not result["OK"]:
                            continue
                        requestJSON = result["Value"]
                        self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName))
                        timeOut = self.getTimeout(request)
                        enqueue = self.processPool().createAndQueueTask(
                            RequestTask,
                            kwargs={
                                "requestJSON": requestJSON,
                                "handlersDict": self.handlersDict,
                                "csPath": self.__configPath,
                                "agentName": self.agentName,
                                "rmsMonitoring": self.__rmsMonitoring,
                            },
                            taskID=taskID,
                            blocking=True,
                            usePoolCallbacks=True,
                            timeOut=timeOut,
                        )
                        if not enqueue["OK"]:
                            self.log.error("Could not enqueue task", enqueue["Message"])
                        else:
                            self.log.debug("successfully enqueued task", "'%s'" % taskID)
                            # # update monitor
                            if self.__rmsMonitoring:
                                self.rmsMonitoringReporter.addRecord(
                                    {
                                        "timestamp": int(Time.toEpoch()),
                                        "host": Network.getFQDN(),
                                        "objectType": "Request",
                                        "status": "Attempted",
                                        "objectID": request.RequestID,
                                        "nbObject": 1,
                                    }
                                )
                            else:
                                gMonitor.addMark("Processed", 1)

                            # # update request counter
                            taskCounter += 1
                            # # task created, a little time kick to proceed
                            time.sleep(0.1)
                            break

        self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache))
        processed = self.processPool().processResults()
        # This happens when the result queue is screwed up.
        # Returning S_ERROR proved not to be sufficient,
        # and when in this situation, there is nothing we can do.
        # So we just exit. runit will restart from scratch.
        if processed < 0:
            self.log.fatal("Results queue is screwed up")
            sys.exit(1)
        # # clean return
        return S_OK()

    def getTimeout(self, request):
        """get timeout for request"""
        timeout = 0
        for op in request:
            if op.Status not in ("Waiting", "Scheduled", "Queued"):
                continue
            if op.Type not in self.timeOuts:
                timeout += self.__operationTimeout
            else:
                perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout)
                perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op)
                timeout += perOp + perFiles
        self.log.info(
            "estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout)
        )
        return timeout

    def finalize(self):
        """agent finalization"""
        if self.__processPool:
            self.processPool().finalize(timeout=self.__poolTimeout)
        self.putAllRequests()
        return S_OK()

    def resultCallback(self, taskID, taskResult):
        """definition of request callback function

        :param str taskID: Request.RequestID
        :param dict taskResult: task result S_OK(Request)/S_ERROR(Message)
        """
        # # clean cache
        res = self.putRequest(taskID, taskResult)
        self.log.info(
            "callback:",
            "%s result is %s(%s), put %s(%s)"
            % (
                taskID,
                "S_OK" if taskResult["OK"] else "S_ERROR",
                taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"],
                "S_OK" if res["OK"] else "S_ERROR",
                "" if res["OK"] else res["Message"],
            ),
        )

    def exceptionCallback(self, taskID, taskException):
        """definition of exception callback function

        :param str taskID: Request.RequestID
        :param Exception taskException: Exception instance
        """
        self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException))
        self.putRequest(taskID)

    def __rmsMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which
        is done by the MonitoringReporter and is send to the 'RMSMonitoring' type.
        :return: True / False
        """
        result = self.rmsMonitoringReporter.commit()
        return result["OK"]
Exemple #27
0
    def __call__(self):
        """perform physical removal operation"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")

        bannedTargets = self.checkSEsRSS(access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # prepare lfn dict
        toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

        targetSEs = self.operation.targetSEList

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
            self.rmsMonitoringReporter.commit()

        # # keep errors dict
        removalStatus = dict.fromkeys(toRemoveDict.keys(), None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, "")

        for targetSE in targetSEs:

            self.log.info("removing files from %s" % targetSE)

            # # 1st - bulk removal
            bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Failed bulk removal", bulkRemoval["Message"])
                self.operation.Error = bulkRemoval["Message"]
                return bulkRemoval

            bulkRemoval = bulkRemoval["Value"]

            for lfn, opFile in toRemoveDict.items():
                removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get(lfn, "")
                opFile.Error = removalStatus[lfn][targetSE]

            # # 2nd - single file removal
            toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"])
            for lfn, opFile in toRetry.items():
                self.singleRemoval(opFile, targetSE)
                if not opFile.Error:
                    removalStatus[lfn][targetSE] = ""
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    removalStatus[lfn][targetSE] = opFile.Error

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = [error for error in removalStatus[opFile.LFN].values() if error.strip()]
                if errors:
                    failed += 1
                    opFile.Error = ",".join(errors)
                    if "Write access not permitted for this credential" in opFile.Error:
                        opFile.Status = "Failed"

                        if self.rmsMonitoring:
                            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))

                    continue

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s files" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
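
For reference, the RMSMonitoring records pushed through MonitoringReporter.addRecord() by these handlers are plain dicts; the shape below mirrors the one built explicitly in RequestExecutingAgent.execute() above (field values here are illustrative):

import time

rmsRecord = {
    "timestamp": int(time.time()),          # epoch seconds
    "host": "dirac-agent-01.example.org",   # hypothetical agent FQDN
    "objectType": "Request",
    "status": "Attempted",
    "objectID": 12345,                      # hypothetical RequestID
    "nbObject": 1,
}
print(rmsRecord)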
Exemple #28
0
    def __call__(self):
        """remove replicas"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # gMonitor stuff
            gMonitor.registerActivity(
                "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )
            gMonitor.registerActivity(
                "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM
            )

        # # prepare list of targetSEs
        targetSEs = self.operation.targetSEList
        # # check targetSEs for removal
        bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess")
        if not bannedTargets["OK"]:
            if self.rmsMonitoring:
                for status in ["Attempted", "Failed"]:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation)))
                self.rmsMonitoringReporter.commit()
            else:
                gMonitor.addMark("RemoveReplicaAtt")
                gMonitor.addMark("RemoveReplicaFail")
            return bannedTargets

        if bannedTargets["Value"]:
            return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"]))

        # # get waiting files
        waitingFiles = self.getWaitingFilesList()
        # # and prepare dict
        toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles)

        self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs)))

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict)))
        else:
            gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))

        # # keep status for each targetSE
        removalStatus = dict.fromkeys(toRemoveDict, None)
        for lfn in removalStatus:
            removalStatus[lfn] = dict.fromkeys(targetSEs, None)

        # # loop over targetSEs
        for targetSE in targetSEs:

            self.log.info("Removing replicas at %s" % targetSE)

            # # 1st step - bulk removal
            bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE)
            if not bulkRemoval["OK"]:
                self.log.error("Bulk replica removal failed", bulkRemoval["Message"])

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.commit()

                return bulkRemoval

            # # report removal status for successful files
            if self.rmsMonitoring:
                self.rmsMonitoringReporter.addRecord(
                    self.createRMSRecord(
                        "Successful", len(([opFile for opFile in toRemoveDict.values() if not opFile.Error]))
                    )
                )
            else:
                gMonitor.addMark(
                    "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error])
                )

            # # 2nd step - process the rest again
            toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error)
            for lfn, opFile in toRetry.items():
                self._removeWithOwnerProxy(opFile, targetSE)
                if opFile.Error:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaFail", 1)
                    removalStatus[lfn][targetSE] = opFile.Error
                else:
                    if self.rmsMonitoring:
                        self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                    else:
                        gMonitor.addMark("RemoveReplicaOK", 1)

        # # update file status for waiting files
        failed = 0
        for opFile in self.operation:
            if opFile.Status == "Waiting":
                errors = list(set(error for error in removalStatus[opFile.LFN].values() if error))
                if errors:
                    opFile.Error = "\n".join(errors)
                    # This seems to be the only unrecoverable error
                    if "Write access not permitted for this credential" in opFile.Error:
                        failed += 1
                        opFile.Status = "Failed"
                else:
                    opFile.Status = "Done"

        if failed:
            self.operation.Error = "failed to remove %s replicas" % failed

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        return S_OK()
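
A minimal standalone sketch of the per-file/per-SE status bookkeeping used in the handler above (LFN and SE names are made up):

lfns = ["/lhcb/data/file1", "/lhcb/data/file2"]
targetSEs = ["CERN-USER", "RAL-USER"]
removalStatus = dict.fromkeys(lfns, None)
for lfn in removalStatus:
    # one fresh dict per LFN; dict.fromkeys(lfns, {}) would share a single dict
    removalStatus[lfn] = dict.fromkeys(targetSEs, None)
removalStatus["/lhcb/data/file1"]["RAL-USER"] = "Permission denied"
errors = [error for error in removalStatus["/lhcb/data/file1"].values() if error]
assert errors == ["Permission denied"]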
Exemple #29
0
class Service(object):

    SVC_VALID_ACTIONS = {
        "RPC": "export",
        "FileTransfer": "transfer",
        "Message": "msg",
        "Connection": "Message"
    }
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """
        Init the variables for the service

        :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.:
          {'modName': 'Framework/serviceName',
          'standalone': True,
          'loadName': 'Framework/serviceName',
          'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
          'classObj': <class 'serviceNameHandler.serviceHandler'>}

        """
        self._svcData = serviceData
        self._name = serviceData["modName"]
        self._startTime = datetime.datetime.utcnow()
        self._validNames = [serviceData["modName"]]
        if serviceData["loadName"] not in self._validNames:
            self._validNames.append(serviceData["loadName"])
        self._cfg = ServiceConfiguration(list(self._validNames))
        self._standalone = serviceData["standalone"]
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {"queries": 0, "connections": 0}
        self._authMgr = AuthManager(
            "%s/Authorization" %
            PathFinder.getServiceSection(serviceData["loadName"]))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        self.__maxFD = 0
        self.activityMonitoring = False
        # Check if monitoring is enabled
        if "Monitoring" in Operations().getMonitoringBackends(
                monitoringType="ServiceMonitoring"):
            self.activityMonitoring = True

    def setCloneProcessId(self, cloneId):
        self.__cloneId = cloneId

    def _isMetaAction(self, action):
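        """Return the real action type behind a meta action, or False.

        Illustrative reading of SVC_VALID_ACTIONS above: "Connection" maps to
        "Message", which is itself a key of the table, so "Connection" is a
        meta action resolving to "Message"; "RPC" maps to "export", which is
        not a key, so "RPC" is a plain action type.
        """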
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result["OK"]:
            return result
        self._handler = result["Value"]
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._threadPool = ThreadPoolExecutor(max(0,
                                                  self._cfg.getMaxThreads()))
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                        threadPool=self._threadPool)
        # Create static dict
        self._serviceInfoDict = {
            "serviceName": self._name,
            "serviceSectionPath": PathFinder.getServiceSection(self._name),
            "URL": self._cfg.getURL(),
            "messageSender": MessageSender(self._name, self._msgBroker),
            "validNames": self._validNames,
            "csPaths": [PathFinder.getServiceSection(svcName) for svcName in self._validNames],
        }
        self.securityLogging = Operations().getValue(
            "EnableSecurityLogging", True) and getServiceOption(
                self._serviceInfoDict, "EnableSecurityLogging", True)
        # Initialize Monitoring
        # The import needs to be here because the CS must be initialized before
        # importing this class (see https://github.com/DIRACGrid/DIRAC/issues/4793)
        from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter

        self.activityMonitoringReporter = MonitoringReporter(
            monitoringType="ServiceMonitoring")

        self._initMonitoring()
        # Call static initialization function
        try:
            self._handler["class"]._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self.activityMonitoringReporter)
            if self._handler["init"]:
                for initFunc in self._handler["init"]:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception(
                            "Exception while calling initialization function",
                            lException=excp)
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result["OK"]:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result["Message"]))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)
        if self.activityMonitoring:
            gThreadScheduler.addPeriodicTask(30, self.__reportActivity)
            gThreadScheduler.addPeriodicTask(
                100, self.__activityMonitoringReporting)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result["OK"]:
            return result
        self._actions = result["Value"]

        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if "initializeHandler" in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            initFuncs.append(
                super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and "initializeHandler" in dir(
                handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs
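
    # Reading of the recursion above: ancestors are visited first, so for a
    # hierarchy Handler(RequestHandler) where both classes define
    # initializeHandler, the base-class method lands in initFuncs before the
    # handler's own and therefore runs first.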

    def _loadHandlerInit(self):
        handlerClass = self._svcData["classObj"]
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(
                getattr(self._svcData["moduleObj"],
                        "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose(
                "No global initialization function found for the service")

        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" %
                         len(handlerInitMethods))

        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData["moduleObj"]
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods

        return S_OK(handlerInfo)

    def _loadActions(self):

        handlerClass = self._handler["class"]

        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = "%s_" % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if attribute.find(methodPrefix) != 0:
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" %
                                (actionType, exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName))
                # Look for type and auth rules
                if actionType == "RPC":
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" %
                                    (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" %
                                    (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj

        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" %
                            (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" %
                            (actionType, authRules[actionType]))

        return S_OK({
            "methods": methodsList,
            "auth": authRules,
            "types": typeCheck
        })
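
    # Discovery convention used by _loadActions, shown on a hypothetical handler:
    #
    #     class PingHandler(RequestHandler):
    #         types_ping = []         # type check for the RPC method below
    #         auth_ping = ["all"]     # auth rule for the RPC method below
    #         def export_ping(self):  # exported as RPC action "ping"
    #             return S_OK("pong")
    #
    # RPC methods use the bare types_<name>/auth_<name> attributes; the other
    # action types are prefixed, e.g. types_transfer_<name> for FileTransfer.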

    def _initMonitoring(self):
        props = [("__doc__", "description")]
        for prop in props:
            try:
                value = getattr(self._handler["module"], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = "unset"

        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" %
                         (self._name, secondaryName))
            self._validNames.append(secondaryName)
        return S_OK()

    def __reportActivity(self):
        initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring()
        pendingQueries = self._threadPool._work_queue.qsize()
        activeQueries = len(self._threadPool._threads)
        percentage = self.__endReportToMonitoring(initialWallTime,
                                                  initialCPUTime)
        self.activityMonitoringReporter.addRecord({
            "timestamp": int(TimeUtilities.toEpoch()),
            "Host": Network.getFQDN(),
            "ServiceName": "_".join(self._name.split("/")),
            "Location": self._cfg.getURL(),
            "MemoryUsage": mem,
            "CpuPercentage": percentage,
            "PendingQueries": pendingQueries,
            "ActiveQueries": activeQueries,
            "RunningThreads": threading.activeCount(),
            "MaxFD": self.__maxFD,
        })
        self.__maxFD = 0

    def getConfig(self):
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """
        This method may be called by ServiceReactor.
        The method stacks the opened connection in a queue; another thread
        reads this queue and handles the connection.

        :param clientTransport: Object which describes opened connection (PlainTransport or SSLTransport)
        """
        if not self.activityMonitoring:
            self._stats["connections"] += 1
        self._threadPool.submit(self._processInThread, clientTransport)

    @property
    def wantsThrottle(self):
        """Boolean property for if the service wants requests to stop being accepted"""
        nQueued = self._threadPool._work_queue.qsize()
        return nQueued > self._cfg.getMaxWaitingPetitions()
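
    # NB: wantsThrottle and __reportActivity read private ThreadPoolExecutor
    # attributes (_work_queue, _threads); concurrent.futures exposes no public
    # queue-depth API, so this relies on implementation details of the executor.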

    # Threaded process function
    def _processInThread(self, clientTransport):
        """
        This method handles an RPC, FileTransfer or Connection.
        Connections may be opened via ServiceReactor.__acceptIncomingConnection


        - Do the SSL/TLS handshake (if dips is used) and extract credentials
        - Get the action called by the client
        - Check if the client is authorized to perform the action
          - If not, the connection is closed
        - Instantiate the RequestHandler (the RequestHandler contains all the callable methods)

        (The following is not directly in this method but describes what happens at
        #Execute the action)
        - Notify the client we're ready to execute the action (via _processProposal)
          and call RequestHandler._rh_executeAction()
        - Receive arguments/file/something else (depending on the action) in the RequestHandler
        - Execute the action asked for by the client

        :param clientTransport: Object which describes the opened connection (SSLTransport or PlainTransport)

        :return: S_OK with "closeTransport", a boolean telling whether the connection has to be closed,
                e.g. after an RPC, closeTransport=True

        """
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result["OK"]:
                    clientTransport.close()
                    return
            except Exception:
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result["Value"]
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result["OK"]:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result["Value"]
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result["closeTransport"] or not result["OK"]:
                if not result["OK"]:
                    gLogger.error("Error processing proposal",
                                  result["Message"])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(monReport[0], monReport[1])

    @staticmethod
    def _createIdentityString(credDict, clientTransport=None):
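        """Build a log-friendly identity string from a credentials dict.

        A sketch with made-up values::

            >>> Service._createIdentityString({"username": "alice", "group": "dirac_user"})
            '[alice:dirac_user]'

        Note that the remote address, when available, is formatted below but
        not currently appended to the returned string.
        """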
        if "username" in credDict:
            if "group" in credDict:
                identity = "[%s:%s]" % (credDict["username"],
                                        credDict["group"])
            else:
                identity = "[%s:unknown]" % credDict["username"]
        else:
            identity = "unknown"
        if clientTransport:
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if "DN" in credDict:
            identity += "(%s)" % credDict["DN"]
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """We receive the proposalTuple as a list.
        Turn it into a tuple again
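
        Illustrative round-trip (values are made up)::

            >>> Service._deserializeProposalTuple([["Framework/Svc", "setup"], ["RPC", "ping"], 0])
            (('Framework/Svc', 'setup'), ('RPC', 'ping'), 0)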
        """
        proposalTuple = tuple(
            tuple(x) if isinstance(x, list) else x for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal["OK"]:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal["Message"]),
            )
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal["Value"])
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" %
                           requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result["OK"]:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions["auth"][actionTuple[0]]
        else:
            if actionTuple[0] == "RPC":
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions["auth"]:
                hardcodedRulesByType = self._actions["auth"][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]

                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict,
                                       hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = "/".join(
                    [str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query", "to %s:%s by %s from %s" %
                (self._name, "/".join(actionTuple), identity, fromHost))
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()

        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        if self.securityLogging:
            Service.SVC_SECLOG_CLIENT.addMessage(
                result["OK"],
                sourceAddress[0],
                sourceAddress[1],
                identity,
                self._cfg.getHostname(),
                self._cfg.getPort(),
                self._name,
                "/".join(actionTuple),
            )
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """
        Generate an instance of the handler for a given service

        :param int trid: transport ID
        :param tuple proposalTuple: tuple describing the proposed action

        :return: S_OK/S_ERROR, Value is the handler object
        """
        # Generate the client params
        clientParams = {"serviceStartTime": self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams["clientVersion"] = proposalTuple[3] if len(
                proposalTuple) > 3 else None
            clientParams["clientSetup"] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams["clientVO"] = gConfig.getValue(
                    "/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams["clientVO"] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams["clientAddress"] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler["class"](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" %
                              str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal["OK"]:
            return retVal

        messageConnection = False
        if proposalTuple[1] == ("Connection", "new"):
            messageConnection = True

        if messageConnection:

            if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
                result = S_ERROR(
                    "Maximum number of connections reached. Try later")
                result["closeTransport"] = True
                return result

            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False,
            )

        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result["OK"] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result["OK"]:
                self._msgBroker.removeTransport(trid)

        result["closeTransport"] = not messageConnection or not result["OK"]
        return result

    def _mbConnect(self, trid, handlerObj=None):
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result["OK"]:
                return result
            handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("connected")

    def _executeAction(self, trid, proposalTuple, handlerObj):
        try:
            response = handlerObj._rh_executeAction(proposalTuple)
            if not response["OK"]:
                return response
            if self.activityMonitoring:
                self.activityMonitoringReporter.addRecord({
                    "timestamp": int(TimeUtilities.toEpoch()),
                    "Host": Network.getFQDN(),
                    "ServiceName": "_".join(self._name.split("/")),
                    "Location": self._cfg.getURL(),
                    "ResponseTime": response["Value"][1],
                })
            return response["Value"][0]
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        result = self._authorizeProposal(
            ("Message", msgObj.getName()), trid,
            self._transportPool.get(trid).getConnectingCredentials())
        if not result["OK"]:
            return result
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        response = handlerObj._rh_executeMessageCallback(msgObj)
        if self.activityMonitoring and response["OK"]:
            self.activityMonitoringReporter.addRecord({
                "timestamp": int(TimeUtilities.toEpoch()),
                "Host": Network.getFQDN(),
                "ServiceName": "_".join(self._name.split("/")),
                "Location": self._cfg.getURL(),
                "ResponseTime": response["Value"][1],
            })
        if response["OK"]:
            return response["Value"][0]
        else:
            return response

    def _mbDisconnect(self, trid):
        result = self._instantiateHandler(trid)
        if not result["OK"]:
            return result
        handlerObj = result["Value"]
        return handlerObj._rh_executeConnectionCallback("drop")

    def __activityMonitoringReporting(self):
        """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which
        is done by the MonitoringReporter and is send to the 'ComponentMonitoring' type.

        :return: True / False
        """
        return self.activityMonitoringReporter.commit()

    def __startReportToMonitoring(self):
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        mem = None
        # NB: this guard can never trigger (time.time() is non-decreasing),
        # so the stats below are refreshed on every call
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime, mem)
        self.__monitorLastStatsUpdate = now
        membytes = MemStat.VmB("VmRSS:")
        if membytes:
            mem = membytes / (1024.0 * 1024.0)
        return (now, cpuTime, mem)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.0
        return percentage
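
The start/end pair above is plain wall-clock vs. CPU-time bookkeeping; the same computation as a standalone sketch (note the original divides by the elapsed wall time, so back-to-back calls with near-zero elapsed time would divide by zero; the sketch guards against that):

import os
import time

def cpu_percentage(work):
    """Run ``work`` and return its CPU usage as a percentage of wall time."""
    wall0 = time.time()
    t = os.times()
    cpu0 = t[0] + t[2]  # user time + children's user time, as in the service code
    work()
    wall = max(time.time() - wall0, 1e-9)  # guard: the original has no such check
    t = os.times()
    return (t[0] + t[2] - cpu0) / wall * 100.0

print(cpu_percentage(lambda: sum(i * i for i in range(10**6))))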
Exemple #30
0
    def __call__(self):
        """call me maybe"""

        # The flag  'rmsMonitoring' is set by the RequestTask and is False by default.
        # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase.
        if self.rmsMonitoring:
            self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring")
        else:
            # # RegisterReplica specific monitor info
            gMonitor.registerActivity(
                "RegisterReplicaAtt",
                "Attempted replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity(
                "RegisterReplicaOK",
                "Successful replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )
            gMonitor.registerActivity(
                "RegisterReplicaFail",
                "Failed replicas registrations",
                "RequestExecutingAgent",
                "Replicas/min",
                gMonitor.OP_SUM,
            )

        # # counter for failed replicas
        failedReplicas = 0
        # # catalog to use
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(",")]
        # # get waiting files
        waitingFiles = self.getWaitingFilesList()

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(waitingFiles)))

        # # loop over files
        registerOperations = {}
        successReplicas = 0
        for opFile in waitingFiles:

            if not self.rmsMonitoring:
                gMonitor.addMark("RegisterReplicaAtt", 1)

            # # get LFN
            lfn = opFile.LFN
            # # and others
            targetSE = self.operation.targetSEList[0]
            replicaTuple = (lfn, opFile.PFN, targetSE)
            # # call ReplicaManager
            registerReplica = self.dm.registerReplica(replicaTuple, catalogs)
            # # check results
            if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]:
                # There have been some errors

                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1))
                else:
                    gMonitor.addMark("RegisterReplicaFail", 1)
                #        self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterReplica" )

                reason = registerReplica.get(
                    "Message", registerReplica.get("Value", {}).get("Failed", {}).get(lfn, "Unknown")
                )
                errorStr = "failed to register LFN %s: %s" % (lfn, str(reason))
                # FIXME: this is incompatible with the change made in the DM that we
                # ignore failures if successful in at least one catalog
                if lfn in registerReplica.get("Value", {}).get("Successful", {}) and isinstance(reason, dict):
                    # As we managed, let's create a new operation for just the remaining registration
                    errorStr += " - adding registerReplica operations to request"
                    for failedCatalog in reason:
                        key = "%s/%s" % (targetSE, failedCatalog)
                        newOperation = self.getRegisterOperation(
                            opFile, targetSE, type="RegisterReplica", catalog=failedCatalog
                        )
                        if key not in registerOperations:
                            registerOperations[key] = newOperation
                        else:
                            registerOperations[key].addFile(newOperation[0])
                    opFile.Status = "Done"
                else:
                    opFile.Error = errorStr
                    catMaster = True
                    if isinstance(reason, dict):
                        from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

                        for failedCatalog in reason:
                            catMaster = catMaster and FileCatalog()._getCatalogConfigDetails(failedCatalog).get(
                                "Value", {}
                            ).get("Master", False)
                    # If a catalog was explicitly targeted and failed, or if the failure is on the master catalog
                    if (catalogs or catMaster) and (
                        "file does not exist" in opFile.Error.lower() or "no such file" in opFile.Error.lower()
                    ):
                        # Check if the file really exists in the SE; if not, consider this file registration as Done
                        res = self.dm.getReplicaMetadata(lfn, targetSE)
                        notExist = bool("No such file" in res.get("Value", {}).get("Failed", {}).get(lfn, ""))
                        if not notExist:
                            opFile.Status = "Failed"
                        else:
                            opFile.Status = "Done"
                    if opFile.Status != "Done":
                        failedReplicas += 1
                self.log.warn(errorStr)

            else:
                # All is OK
                if self.rmsMonitoring:
                    self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1))
                else:
                    gMonitor.addMark("RegisterReplicaOK", 1)
                # count and log the success whichever monitoring backend is in use
                successReplicas += 1
                self.log.verbose(
                    "Replica %s has been registered at %s" % (lfn, ",".join(catalogs) if catalogs else "all catalogs")
                )

                opFile.Status = "Done"

        # # if we have new registrations to take place, put them at the end
        if registerOperations:
            self.log.info("adding %d operations to the request" % len(registerOperations))
        for operation in registerOperations.values():
            self.operation._parent.addOperation(operation)

        if self.rmsMonitoring:
            self.rmsMonitoringReporter.commit()

        # # final check
        infoStr = ""
        if successReplicas:
            infoStr = "%d replicas successfully registered" % successReplicas
        if failedReplicas:
            infoStr += ", %d replicas failed to register" % failedReplicas
        self.log.info("All replicas processed", infoStr)
        if failedReplicas:
            self.operation.Error = "some replicas failed to register"
            return S_ERROR(self.operation.Error)

        return S_OK()
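
The failure-reason lookup above probes S_OK/S_ERROR-shaped result dicts; a standalone sketch with hand-made results (the LFN and catalog names are illustrative):

lfn = "/lhcb/data/someFile"
hard_error = {"OK": False, "Message": "connection refused"}
partial_failure = {"OK": True, "Value": {"Successful": {}, "Failed": {lfn: {"CatalogX": "timeout"}}}}

def reason(result, lfn):
    # prefer the top-level Message of an S_ERROR; otherwise fall back to the per-LFN entry
    return result.get("Message", result.get("Value", {}).get("Failed", {}).get(lfn, "Unknown"))

assert reason(hard_error, lfn) == "connection refused"
assert reason(partial_failure, lfn) == {"CatalogX": "timeout"}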
Exemple #31
0
    def setUp(self):
        gLogger.setLevel('INFO')

        self.wmsMonitoringReporter = MonitoringReporter(
            monitoringType="WMSHistory")
        self.componentMonitoringReporter = MonitoringReporter(
            monitoringType="ComponentMonitoring")

        self.data = [{
            "Status": "Waiting",
            "Jobs": 2,
            "timestamp": 1458130176,
            "JobSplitType": "MCStripping",
            "MinorStatus": "unset",
            "Site": "LCG.GRIDKA.de",
            "Reschedules": 0,
            "ApplicationStatus": "unset",
            "User": "******",
            "JobGroup": "00049848",
            "UserGroup": "lhcb_mc",
            "metric": "WMSHistory"
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458130176,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.PIC.es',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'olupton',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458130176,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'olupton',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458130176,
            u'JobSplitType': u'MCStripping',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049845',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 34,
            u'timestamp': 1458141578,
            u'JobSplitType': u'DataStripping',
            u'MinorStatus': u'unset',
            u'Site': u'Group.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050299',
            u'UserGroup': u'lhcb_data',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 120,
            u'timestamp': 1458141578,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CERN.ch',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'mvesteri',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458141578,
            u'JobSplitType': u'MCStripping',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049845',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 2,
            u'timestamp': 1458141578,
            u'JobSplitType': u'MCStripping',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049848',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458141578,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050286',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 95,
            u'timestamp': 1458199202,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'Multiple',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'mamartin',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 3,
            u'timestamp': 1458199202,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'Multiple',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'olupton',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 129,
            u'timestamp': 1458199202,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'Multiple',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049844',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 5,
            u'timestamp': 1458217812,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.IHEP.su',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050232',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 7,
            u'timestamp': 1458217812,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.IHEP.su',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050234',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458217812,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.IHEP.su',
            u'Reschedules': 1,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050236',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 3,
            u'timestamp': 1458217812,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.IHEP.su',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050238',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 2,
            u'timestamp': 1458217812,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.IHEP.su',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050248',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 12,
            u'timestamp': 1458218413,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050248',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 5,
            u'timestamp': 1458218413,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050250',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 4,
            u'timestamp': 1458218413,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050251',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458218413,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.CNAF.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050280',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 24,
            u'timestamp': 1458219012,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.NIKHEF.nl',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050248',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 3,
            u'timestamp': 1458219012,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.NIKHEF.nl',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050251',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458222013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bologna.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050303',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 7,
            u'timestamp': 1458222013,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bristol.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'clangenb',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 2,
            u'timestamp': 1458222013,
            u'JobSplitType': u'User',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bristol.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'mrwillia',
            u'JobGroup': u'lhcb',
            u'UserGroup': u'lhcb_user',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458222013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bari.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050244',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 11,
            u'timestamp': 1458222013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bari.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050246',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 22,
            u'timestamp': 1458222013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.Bari.it',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050248',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 23,
            u'timestamp': 1458225013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.DESYZN.de',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049844',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 18,
            u'timestamp': 1458225013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.DESYZN.de',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00049847',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458225013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.DESYZN.de',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050238',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Running',
            'Jobs': 1,
            u'timestamp': 1458225013,
            u'JobSplitType': u'MCSimulation',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.DESYZN.de',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050246',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RRCKI.ru',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050243',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RRCKI.ru',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050251',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCStripping',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RRCKI.ru',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050256',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050229',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050241',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 1,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050243',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }, {
            u'Status': u'Waiting',
            'Jobs': 2,
            u'timestamp': 1458226213,
            u'JobSplitType': u'MCReconstruction',
            u'MinorStatus': u'unset',
            u'Site': u'LCG.RAL.uk',
            u'Reschedules': 0,
            u'ApplicationStatus': u'unset',
            u'User': u'phicharp',
            u'JobGroup': u'00050247',
            u'UserGroup': u'lhcb_mc',
            u'metric': u'WMSHistory'
        }]
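All of the records above share one flat WMSHistory schema. As a minimal sketch (not part of the original example; the EXPECTED_KEYS name and the helper function are introduced here purely for illustration), a sanity check for such a record could look like this:

EXPECTED_KEYS = {
    'Status', 'Jobs', 'timestamp', 'JobSplitType', 'MinorStatus', 'Site',
    'Reschedules', 'ApplicationStatus', 'User', 'JobGroup', 'UserGroup', 'metric',
}

def looksLikeWMSHistoryRecord(record):
    """Return True if 'record' carries exactly the WMSHistory fields seen above."""
    return set(record) == EXPECTED_KEYS and record['metric'] == 'WMSHistory'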
Exemple #33
# Imports this snippet relies on (standard DIRAC locations for this old-style agent)
from DIRAC import S_OK, gConfig
from DIRAC.Core.Base.AgentModule import AgentModule
from DIRAC.Core.Utilities import Time
from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB


class StatesMonitoringAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the module, usually used
                 before an agent restart
  """

  __summaryKeyFieldsMapping = [ 'Status',
                                'Site',
                                'User',
                                'UserGroup',
                                'JobGroup',
                                'JobType',
                                'ApplicationStatus',
                                'MinorStatus']
  __summaryDefinedFields = [ ( 'ApplicationStatus', 'unset' ), ( 'MinorStatus', 'unset' ) ]
  __summaryValueFieldsMapping = [ 'Jobs',
                                  'Reschedules']
  __renameFieldsMapping = { 'JobType' : 'JobSplitType' }

  __jobDBFields = []
  
  jobDB = None
  monitoringReporter = None
  reportPeriod = None
    
  def initialize( self ):
    """ Standard constructor
    """
    
    self.jobDB = JobDB()
    
    self.reportPeriod = 120
    self.am_setOption( "PollingTime", self.reportPeriod )
    
    self.monitoringReporter = MonitoringReporter( monitoringType = "WMSHistory" )
    
    for field in self.__summaryKeyFieldsMapping:
      if field == 'User':
        field = 'Owner'
      elif field == 'UserGroup':
        field = 'OwnerGroup'
      self.__jobDBFields.append( field )
    
    return S_OK()
   
  def execute( self ):
    """ Main execution method
    """
    result = gConfig.getSections( "/DIRAC/Setups" )
    if not result[ 'OK' ]:
      return result
    validSetups = result[ 'Value' ]
    self.log.info( "Valid setups for this cycle are %s" % ", ".join( validSetups ) )
    # Get the WMS Snapshot!
    result = self.jobDB.getSummarySnapshot( self.__jobDBFields )
    now = Time.dateTime()
    if not result[ 'OK' ]:
      self.log.error( "Can't get the jobdb summary", result[ 'Message' ] )
    else:
      values = result[ 'Value' ][1]
      self.log.info( "Start sending records!" )
      for record in values:
        recordSetup = record[0]
        if recordSetup not in validSetups:
          self.log.error( "Setup %s is not valid" % recordSetup )
          continue
        record = record[1:]
        rD = {}
        for fV in self.__summaryDefinedFields:
          rD[ fV[0] ] = fV[1]
        for iP in range( len( self.__summaryKeyFieldsMapping ) ):
          fieldName = self.__summaryKeyFieldsMapping[iP]
          rD[ self.__renameFieldsMapping.get( fieldName, fieldName ) ] = record[iP]
        record = record[ len( self.__summaryKeyFieldsMapping ): ]
        for iP in range( len( self.__summaryValueFieldsMapping ) ):
          rD[ self.__summaryValueFieldsMapping[iP] ] = int( record[iP] )
        rD['timestamp'] = int( Time.toEpoch( now ) )       
        self.monitoringReporter.addRecord( rD )
      retVal = self.monitoringReporter.commit()
      if retVal['OK']:
        self.log.info( "The records are successfully sent to the Store!" )
      else:
        self.log.warn( "Faild to insert the records! It will be retried in the next iteration", retVal['Message'] )
        
    return S_OK()
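The heart of execute() above is the positional mapping from a JobDB snapshot row (key fields first, then value counters) to a flat WMSHistory record. Here is a minimal, self-contained sketch of just that transformation; the mapSnapshotRow helper and the sample row are invented here for illustration, the field lists mirror the class attributes of StatesMonitoringAgent, and time.time() stands in for DIRAC's Time.toEpoch:

import time

SUMMARY_KEY_FIELDS = [ 'Status', 'Site', 'User', 'UserGroup',
                       'JobGroup', 'JobType', 'ApplicationStatus', 'MinorStatus' ]
SUMMARY_DEFINED_FIELDS = [ ( 'ApplicationStatus', 'unset' ), ( 'MinorStatus', 'unset' ) ]
SUMMARY_VALUE_FIELDS = [ 'Jobs', 'Reschedules' ]
RENAME_FIELDS = { 'JobType': 'JobSplitType' }

def mapSnapshotRow( row ):
  """ Turn one snapshot row into a WMSHistory record, as execute() does above. """
  rD = dict( SUMMARY_DEFINED_FIELDS )  # defaults; overwritten when the row carries a value
  for iP, fieldName in enumerate( SUMMARY_KEY_FIELDS ):
    rD[ RENAME_FIELDS.get( fieldName, fieldName ) ] = row[ iP ]
  for iP, valueName in enumerate( SUMMARY_VALUE_FIELDS ):
    rD[ valueName ] = int( row[ len( SUMMARY_KEY_FIELDS ) + iP ] )
  rD[ 'timestamp' ] = int( time.time() )
  return rD

# Example: one 'Running' row with 18 jobs, matching the records shown earlier
print( mapSnapshotRow( ( 'Running', 'LCG.DESYZN.de', 'phicharp', 'lhcb_mc',
                         '00049847', 'MCSimulation', 'unset', 'unset', 18, 0 ) ) )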