def __call__(self): """call me maybe""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") # # check replicas first checkReplicas = self.__checkReplicas() if not checkReplicas["OK"]: self.log.error("Failed to check replicas", checkReplicas["Message"]) if hasattr(self, "FTSMode") and getattr(self, "FTSMode"): bannedGroups = getattr(self, "FTSBannedGroups") if hasattr( self, "FTSBannedGroups") else () if self.request.OwnerGroup in bannedGroups: self.log.verbose( "usage of FTS system is banned for request's owner") return self.dmTransfer() return self.fts3Transfer() return self.dmTransfer()
def initialize(self): """Standard initialization""" # This agent will always loop every 15 minutes self.am_setOption("PollingTime", 900) # Check whether to send to Monitoring or Accounting or both self.jobMonitoringOption = Operations().getMonitoringBackends( monitoringType="WMSHistory") self.pilotMonitoringOption = Operations().getMonitoringBackends( monitoringType="PilotsHistory") messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory") self.datastores = { } # For storing the clients to Accounting and Monitoring if "Accounting" in self.jobMonitoringOption: self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900) if "Monitoring" in self.jobMonitoringOption: self.datastores["Monitoring"] = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=messageQueue) if "Monitoring" in self.pilotMonitoringOption: self.pilotReporter = MonitoringReporter( monitoringType="PilotsHistory", failoverQueueName=messageQueue) self.__jobDBFields = [] for field in self.__summaryKeyFieldsMapping: if field == "User": field = "Owner" elif field == "UserGroup": field = "OwnerGroup" self.__jobDBFields.append(field) return S_OK()
def __call__(self): """call me maybe""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # own gMonitor stuff for files gMonitor.registerActivity( "ReplicateAndRegisterAtt", "Replicate and register attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM, ) gMonitor.registerActivity("ReplicateOK", "Replications successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicateFail", "Replications failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterOK", "Registrations successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterFail", "Registrations failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # for FTS gMonitor.registerActivity("FTSScheduleAtt", "Files schedule attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("FTSScheduleOK", "File schedule successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("FTSScheduleFail", "File schedule failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # check replicas first checkReplicas = self.__checkReplicas() if not checkReplicas["OK"]: self.log.error("Failed to check replicas", checkReplicas["Message"]) if hasattr(self, "FTSMode") and getattr(self, "FTSMode"): bannedGroups = getattr(self, "FTSBannedGroups") if hasattr( self, "FTSBannedGroups") else () if self.request.OwnerGroup in bannedGroups: self.log.verbose( "usage of FTS system is banned for request's owner") return self.dmTransfer() return self.fts3Transfer() return self.dmTransfer()
def __init__( self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None, rmsMonitoring=False): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error("Cannot setup shifter proxies", shifterProxies["Message"]) # This flag which is set and sent from the RequestExecutingAgent and is False by default. self.rmsMonitoring = rmsMonitoring if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # initialize gMonitor gMonitor.setComponentType(gMonitor.COMPONENT_AGENT) gMonitor.setComponentName(self.agentName) gMonitor.initialize() # # own gMonitor activities gMonitor.registerActivity("RequestAtt", "Requests processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestFail", "Requests failed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("RequestOK", "Requests done", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM) if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient
def __init__(self): monitoringType = "DataOperation" # Will use the `MonitoringBackends/Default` value as monitoring backend unless a flag for `MonitoringBackends/DataOperation` is set. self.monitoringOptions = Operations().getMonitoringBackends( monitoringType) if "Monitoring" in self.monitoringOptions: self.dataOperationReporter = MonitoringReporter(monitoringType) if "Accounting" in self.monitoringOptions: self.dataOp = DataOperation()
def initialize(self): """Standard initialization""" # This agent will always loop every 15 minutes self.am_setOption("PollingTime", 900) self.backends = self.am_getOption("Backends", "Accounting").replace(" ", "").split(",") messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory") self.log.info("Committing to %s backend" % "and ".join(self.backends)) self.datastores = { } # For storing the clients to Accounting and Monitoring if "Accounting" in self.backends: self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900) if "Monitoring" in self.backends: self.datastores["Monitoring"] = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=messageQueue) self.__jobDBFields = [] for field in self.__summaryKeyFieldsMapping: if field == "User": field = "Owner" elif field == "UserGroup": field = "OwnerGroup" self.__jobDBFields.append(field) return S_OK()
def initialize(self): """ Standard initialization """ # This agent will always loop every 15 minutes self.am_setOption("PollingTime", 900) self.backends = self.am_getOption("Backends", "Accounting").replace(' ', '').split(',') messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory") self.datastores = { } # For storing the clients to Accounting and Monitoring if 'Accounting' in self.backends: self.datastores['Accounting'] = DataStoreClient(retryGraceTime=900) if 'Monitoring' in self.backends: self.datastores['Monitoring'] = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=messageQueue) self.__jobDBFields = [] for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append(field) return S_OK()
def initializeHandler(cls, serviceInfo): """ Handler class initialization """ # Check the flag for monitoring of the state of the host hostMonitoring = cls.srv_getCSOption('HostMonitoring', True) if hostMonitoring: gThreadScheduler.addPeriodicTask(60, cls.__storeHostInfo) # the SystemAdministrator service does not has to use the client to report data about the host. # Check the flag for dynamic monitoring dynamicMonitoring = cls.srv_getCSOption('DynamicMonitoring', False) messageQueue = cls.srv_getCSOption('MessageQueue', 'dirac.componentmonitoring') if dynamicMonitoring: global gMonitoringReporter gMonitoringReporter = MonitoringReporter( monitoringType="ComponentMonitoring", failoverQueueName=messageQueue) gThreadScheduler.addPeriodicTask(120, cls.__storeProfiling) keepSoftwareVersions = cls.srv_getCSOption('KeepSoftwareVersions', 0) if keepSoftwareVersions > 0: gLogger.info( "The last %s software version will be kept and the rest will be deleted!" % keepSoftwareVersions) gThreadScheduler.addPeriodicTask( 600, cls.__deleteOldSoftware, (keepSoftwareVersions, ), executions=2) # it is enough to try 2 times return S_OK('Initialization went well')
def __initializeMonitor(self): """ Initialize the system monitoring. """ # This flag is used to activate ES based monitoring if self.activityMonitoring: self.log.debug("Monitoring of the agent is enabled.") # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="AgentMonitoring") # With the help of this periodic task we commit the data to ES at an interval of 100 seconds. gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) self.__monitorLastStatsUpdate = time.time()
def initialize(self): """ Standard constructor """ self.jobDB = JobDB() self.reportPeriod = 120 self.am_setOption("PollingTime", self.reportPeriod) self.monitoringReporter = MonitoringReporter( monitoringType="WMSHistory") for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append(field) return S_OK()
def initialize(self): """ Standard constructor """ self.jobDB = JobDB() self.am_setOption("PollingTime", 900) self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory') self.monitoringReporter = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=self.messageQueue) for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append(field) return S_OK()
def __initializeMonitor(self): """ Initialize the system monitoring. """ # This flag is used to activate ES based monitoring # if the "EnableActivityMonitoring" flag in "yes" or "true" in the cfg file. self.activityMonitoring = ( Operations().getValue("EnableActivityMonitoring", False) or self.am_getOption("EnableActivityMonitoring", False)) if self.activityMonitoring: # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ComponentMonitoring") # With the help of this periodic task we commit the data to ES at an interval of 100 seconds. gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) else: if self.__moduleProperties['standalone']: self.monitor = gMonitor else: self.monitor = MonitoringClient() self.monitor.setComponentType(self.monitor.COMPONENT_AGENT) self.monitor.setComponentName(self.__moduleProperties['fullName']) self.monitor.initialize() self.monitor.registerActivity('CPU', "CPU Usage", 'Framework', "CPU,%", self.monitor.OP_MEAN, 600) self.monitor.registerActivity('MEM', "Memory Usage", 'Framework', 'Memory,MB', self.monitor.OP_MEAN, 600) # Component monitor for field in ('version', 'DIRACVersion', 'description', 'platform'): self.monitor.setComponentExtraParam( field, self.__codeProperties[field]) self.monitor.setComponentExtraParam('startTime', Time.dateTime()) self.monitor.setComponentExtraParam('cycles', 0) self.monitor.disable() self.__monitorLastStatsUpdate = time.time()
def __init__( self, requestJSON, handlersDict, csPath, agentName, standalone=False, requestClient=None, rmsMonitoring=False ): """c'tor :param self: self reference :param str requestJSON: request serialized to JSON :param dict opHandlers: operation handlers """ self.request = Request(requestJSON) # # csPath self.csPath = csPath # # agent name self.agentName = agentName # # standalone flag self.standalone = standalone # # handlers dict self.handlersDict = handlersDict # # handlers class def self.handlers = {} # # own sublogger self.log = gLogger.getSubLogger("pid_%s/%s" % (os.getpid(), self.request.RequestName)) # # get shifters info self.__managersDict = {} shifterProxies = self.__setupManagerProxies() if not shifterProxies["OK"]: self.log.error("Cannot setup shifter proxies", shifterProxies["Message"]) # This flag which is set and sent from the RequestExecutingAgent and is False by default. self.rmsMonitoring = rmsMonitoring if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") if requestClient is None: self.requestClient = ReqClient() else: self.requestClient = requestClient
def initialize( self ): """ Standard constructor """ self.jobDB = JobDB() self.reportPeriod = 120 self.am_setOption( "PollingTime", self.reportPeriod ) self.monitoringReporter = MonitoringReporter( monitoringType = "WMSHistory" ) for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append( field ) return S_OK()
def initialize(self): """ Standard constructor """ self.jobDB = JobDB() self.am_setOption("PollingTime", 900) self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory') self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory", failoverQueueName=self.messageQueue) for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append(field) return S_OK()
def initializeHandler( cls, serviceInfo ): """ Handler class initialization """ # Check the flag for monitoring of the state of the host hostMonitoring = cls.srv_getCSOption( 'HostMonitoring', True ) if hostMonitoring: gThreadScheduler.addPeriodicTask( 60, cls.__storeHostInfo ) #the SystemAdministrator service does not has to use the client to report data about the host. # Check the flag for dynamic monitoring dynamicMonitoring = cls.srv_getCSOption( 'DynamicMonitoring', False ) if dynamicMonitoring: global gMonitoringReporter gMonitoringReporter = MonitoringReporter( monitoringType = "ComponentMonitoring" ) gThreadScheduler.addPeriodicTask( 120, cls.__storeProfiling ) return S_OK( 'Initialization went well' )
class ReplicateAndRegister(DMSRequestOperationsBase): """ .. class:: ReplicateAndRegister ReplicateAndRegister operation handler """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param Operation operation: Operation instance :param str csPath: CS path for this handler """ super(ReplicateAndRegister, self).__init__(operation, csPath) # # SE cache # Clients self.fc = FileCatalog() def __call__(self): """call me maybe""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # own gMonitor stuff for files gMonitor.registerActivity( "ReplicateAndRegisterAtt", "Replicate and register attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM, ) gMonitor.registerActivity("ReplicateOK", "Replications successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicateFail", "Replications failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterOK", "Registrations successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterFail", "Registrations failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # for FTS gMonitor.registerActivity("FTSScheduleAtt", "Files schedule attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("FTSScheduleOK", "File schedule successful", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("FTSScheduleFail", "File schedule failed", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # check replicas first checkReplicas = self.__checkReplicas() if not checkReplicas["OK"]: self.log.error("Failed to check replicas", checkReplicas["Message"]) if hasattr(self, "FTSMode") and getattr(self, "FTSMode"): bannedGroups = getattr(self, "FTSBannedGroups") if hasattr( self, "FTSBannedGroups") else () if self.request.OwnerGroup in bannedGroups: self.log.verbose( "usage of FTS system is banned for request's owner") return self.dmTransfer() return self.fts3Transfer() return self.dmTransfer() def __checkReplicas(self): """check done replicas and update file states""" waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation if opFile.Status in ("Waiting", "Scheduled")]) targetSESet = set(self.operation.targetSEList) replicas = self.fc.getReplicas(list(waitingFiles)) if not replicas["OK"]: self.log.error("Failed to get replicas", replicas["Message"]) return replicas reMissing = re.compile(r".*such file.*") for failedLFN, errStr in replicas["Value"]["Failed"].items(): waitingFiles[failedLFN].Error = errStr if reMissing.search(errStr.lower()): self.log.error("File does not exists", failedLFN) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("ReplicateFail", len(targetSESet)) waitingFiles[failedLFN].Status = "Failed" for successfulLFN, reps in replicas["Value"]["Successful"].items(): if targetSESet.issubset(set(reps)): self.log.info("file replicated to all targets", successfulLFN) waitingFiles[successfulLFN].Status = "Done" return S_OK() def _addMetadataToFiles(self, toSchedule): """Add metadata to those files that need to be scheduled through FTS toSchedule is a dictionary: {'lfn1': opFile, 'lfn2': opFile} """ if toSchedule: self.log.info( "found %s files to schedule, getting metadata from FC" % len(toSchedule)) else: self.log.verbose("No files to schedule") return S_OK([]) res = self.fc.getFileMetadata(list(toSchedule)) if not res["OK"]: return res else: if res["Value"]["Failed"]: self.log.warn( "Can't schedule %d files: problems getting the metadata: %s" % (len(res["Value"]["Failed"]), ", ".join( res["Value"]["Failed"]))) metadata = res["Value"]["Successful"] filesToSchedule = {} for lfn, lfnMetadata in metadata.items(): opFileToSchedule = toSchedule[lfn][0] opFileToSchedule.GUID = lfnMetadata["GUID"] # In principle this is defined already in filterReplicas() if not opFileToSchedule.Checksum: opFileToSchedule.Checksum = metadata[lfn]["Checksum"] opFileToSchedule.ChecksumType = metadata[lfn]["ChecksumType"] opFileToSchedule.Size = metadata[lfn]["Size"] filesToSchedule[opFileToSchedule.LFN] = opFileToSchedule return S_OK(filesToSchedule) def _filterReplicas(self, opFile): """filter out banned/invalid source SEs""" return filterReplicas(opFile, logger=self.log, dataManager=self.dm, opSources=self.operation.sourceSEList) def _checkExistingFTS3Operations(self): """ Check if there are ongoing FTS3Operation for the current RMS Operation Under some conditions, we can be trying to schedule files while there is still an FTS transfer going on. This typically happens when the REA hangs. To prevent further race condition, we check if there are FTS3Operations in a non Final state matching the current operation ID. If so, we put the corresponding files in scheduled mode. We will then wait till the FTS3 Operation performs the callback :returns: S_OK with True if we can go on, False if we should stop the processing """ res = FTS3Client().getOperationsFromRMSOpID(self.operation.OperationID) if not res["OK"]: self.log.debug("Could not get FTS3Operations matching OperationID", self.operation.OperationID) return res existingFTSOperations = res["Value"] # It is ok to have FTS Operations in a final state, so we # care only about the others unfinishedFTSOperations = [ ops for ops in existingFTSOperations if ops.status not in FTS3TransferOperation.FINAL_STATES ] if not unfinishedFTSOperations: self.log.debug("No ongoing FTS3Operations, all good") return S_OK(True) self.log.warn( "Some FTS3Operations already exist for the RMS Operation:", [op.operationID for op in unfinishedFTSOperations], ) # This would really be a screwed up situation ! if len(unfinishedFTSOperations) > 1: self.log.warn("That's a serious problem !!") # We take the rmsFileID of the files in the Operations, # find the corresponding File object, and set them scheduled rmsFileIDsToSetScheduled = set([ ftsFile.rmsFileID for ftsOp in unfinishedFTSOperations for ftsFile in ftsOp.ftsFiles ]) for opFile in self.operation: # If it is in the DB, it has a FileID opFileID = opFile.FileID if opFileID in rmsFileIDsToSetScheduled: self.log.warn("Setting RMSFile as already scheduled", opFileID) opFile.Status = "Scheduled" # We return here such that the Request is set back to Scheduled in the DB # With no further modification return S_OK(False) def fts3Transfer(self): """replicate and register using FTS3""" self.log.info("scheduling files in FTS3...") # Check first if we do not have ongoing transfers res = self._checkExistingFTS3Operations() if not res["OK"]: return res # if res['Value'] is False # it means that there are ongoing transfers # and we should stop here if res["Value"] is False: # return S_OK such that the request is put back return S_OK() fts3Files = [] toSchedule = {} # Dict which maps the FileID to the object rmsFilesIds = {} if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(self.getWaitingFilesList()))) for opFile in self.getWaitingFilesList(): rmsFilesIds[opFile.FileID] = opFile opFile.Error = "" if not self.rmsMonitoring: gMonitor.addMark("FTSScheduleAtt") # # check replicas replicas = self._filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas["NoReplicas"] badReplicas = replicas["Bad"] noPFN = replicas["NoPFN"] if validReplicas: validTargets = list( set(self.operation.targetSEList) - set(validReplicas)) if not validTargets: self.log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" else: toSchedule[opFile.LFN] = [opFile, validTargets] else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("FTSScheduleFail") if noMetaReplicas: self.log.warn( "unable to schedule file", "'%s': couldn't get metadata at %s" % (opFile.LFN, ",".join(noMetaReplicas)), ) opFile.Error = "Couldn't get metadata" elif noReplicas: self.log.error( "Unable to schedule transfer", "File %s doesn't exist at %s" % (opFile.LFN, ",".join(noReplicas)), ) opFile.Error = "No replicas found" opFile.Status = "Failed" elif badReplicas: self.log.error( "Unable to schedule transfer", "File %s, all replicas have a bad checksum at %s" % (opFile.LFN, ",".join(badReplicas)), ) opFile.Error = "All replicas have a bad checksum" opFile.Status = "Failed" elif noPFN: self.log.warn( "unable to schedule %s, could not get a PFN at %s" % (opFile.LFN, ",".join(noPFN))) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() res = self._addMetadataToFiles(toSchedule) if not res["OK"]: return res else: filesToSchedule = res["Value"] for lfn in filesToSchedule: opFile = filesToSchedule[lfn] validTargets = toSchedule[lfn][1] for targetSE in validTargets: ftsFile = FTS3File.fromRMSFile(opFile, targetSE) fts3Files.append(ftsFile) if fts3Files: res = Registry.getUsernameForDN(self.request.OwnerDN) if not res["OK"]: self.log.error( "Cannot get username for DN", "%s %s" % (self.request.OwnerDN, res["Message"])) return res username = res["Value"] fts3Operation = FTS3TransferOperation.fromRMSObjects( self.request, self.operation, username) fts3Operation.ftsFiles = fts3Files try: if not fts3Operation.activity: vo = getVOfromProxyGroup().get("Value") fts3Plugin = getFTS3Plugin(vo=vo) fts3Operation.activity = fts3Plugin.inferFTSActivity( fts3Operation, self.request, self.operation) except Exception: pass ftsSchedule = FTS3Client().persistOperation(fts3Operation) if not ftsSchedule["OK"]: self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"]) return ftsSchedule # might have nothing to schedule ftsSchedule = ftsSchedule["Value"] self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule) self.log.info("%d files have been scheduled to FTS3" % len(fts3Files)) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", len(fts3Files))) for ftsFile in fts3Files: opFile = rmsFilesIds[ftsFile.rmsFileID] if not self.rmsMonitoring: gMonitor.addMark("FTSScheduleOK", 1) opFile.Status = "Scheduled" self.log.debug("%s has been scheduled for FTS" % opFile.LFN) else: self.log.info("No files to schedule after metadata checks") if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # Just in case some transfers could not be scheduled, try them with RM return self.dmTransfer(fromFTS=True) def dmTransfer(self, fromFTS=False): """replicate and register using dataManager""" # # get waiting files. If none just return # # source SE sourceSE = self.operation.SourceSE if self.operation.SourceSE else None if sourceSE: # # check source se for read bannedSource = self.checkSEsRSS(sourceSE, "ReadAccess") if not bannedSource["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedSource if bannedSource["Value"]: self.operation.Error = "SourceSE %s is banned for reading" % sourceSE self.log.info(self.operation.Error) return S_OK(self.operation.Error) # # check targetSEs for write bannedTargets = self.checkSEsRSS() if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord( self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation)) gMonitor.addMark("ReplicateFail", len(self.operation)) return bannedTargets if bannedTargets["Value"]: self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets["Value"]) return S_OK(self.operation.Error) # Can continue now self.log.verbose("No targets banned for writing") waitingFiles = self.getWaitingFilesList() if not waitingFiles: return S_OK() # # loop over files if fromFTS: self.log.info( "Trying transfer using replica manager as FTS failed") else: self.log.info("Transferring files using Data manager...") errors = defaultdict(int) delayExecution = 0 if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(waitingFiles))) for opFile in waitingFiles: if opFile.Error in ( "Couldn't get metadata", "File doesn't exist", "No active replica found", "All replicas have a bad checksum", ): err = "File already in error status" errors[err] += 1 if not self.rmsMonitoring: gMonitor.addMark("ReplicateAndRegisterAtt", 1) opFile.Error = "" lfn = opFile.LFN # Check if replica is at the specified source replicas = self._filterReplicas(opFile) if not replicas["OK"]: self.log.error("Failed to check replicas", replicas["Message"]) continue replicas = replicas["Value"] validReplicas = replicas.get("Valid") noMetaReplicas = replicas.get("NoMetadata") noReplicas = replicas.get("NoReplicas") badReplicas = replicas.get("Bad") noActiveReplicas = replicas.get("NoActiveReplicas") if not validReplicas: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("ReplicateFail") if noMetaReplicas: err = "Couldn't get metadata" errors[err] += 1 self.log.verbose( "unable to replicate '%s', couldn't get metadata at %s" % (opFile.LFN, ",".join(noMetaReplicas))) opFile.Error = err elif noReplicas: err = "File doesn't exist" errors[err] += 1 self.log.verbose( "Unable to replicate", "File %s doesn't exist at %s" % (opFile.LFN, ",".join(noReplicas))) opFile.Error = err opFile.Status = "Failed" elif badReplicas: err = "All replicas have a bad checksum" errors[err] += 1 self.log.error( "Unable to replicate", "%s, all replicas have a bad checksum at %s" % (opFile.LFN, ",".join(badReplicas)), ) opFile.Error = err opFile.Status = "Failed" elif noActiveReplicas: err = "No active replica found" errors[err] += 1 self.log.verbose( "Unable to schedule transfer", "%s, %s at %s" % (opFile.LFN, err, ",".join(noActiveReplicas))) opFile.Error = err # All source SEs are banned, delay execution by 1 hour delayExecution = 60 continue # # get the first one in the list if sourceSE not in validReplicas: if sourceSE: err = "File not at specified source" errors[err] += 1 self.log.warn( "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, validReplicas[0])) sourceSE = validReplicas[0] # # loop over targetSE catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(",")] for targetSE in self.operation.targetSEList: # # call DataManager if targetSE in validReplicas: self.log.warn( "Request to replicate %s to an existing location: %s" % (lfn, targetSE)) continue res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs) if res["OK"]: if lfn in res["Value"]["Successful"]: if "replicate" in res["Value"]["Successful"][lfn]: repTime = res["Value"]["Successful"][lfn][ "replicate"] prString = "file %s replicated at %s in %s s." % ( lfn, targetSE, repTime) if not self.rmsMonitoring: gMonitor.addMark("ReplicateOK", 1) if "register" in res["Value"]["Successful"][lfn]: if not self.rmsMonitoring: gMonitor.addMark("RegisterOK", 1) regTime = res["Value"]["Successful"][lfn][ "register"] prString += " and registered in %s s." % regTime self.log.info(prString) else: if not self.rmsMonitoring: gMonitor.addMark("RegisterFail", 1) prString += " but failed to register" self.log.warn(prString) opFile.Error = "Failed to register" # # add register replica operation registerOperation = self.getRegisterOperation( opFile, targetSE, type="RegisterReplica") self.request.insertAfter( registerOperation, self.operation) else: self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE)) if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "Failed to replicate" else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) reason = res["Value"]["Failed"][lfn] self.log.error("Failed to replicate and register", "File %s at %s:" % (lfn, targetSE), reason) opFile.Error = reason else: if not self.rmsMonitoring: gMonitor.addMark("ReplicateFail", 1) opFile.Error = "DataManager error: %s" % res["Message"] self.log.error("DataManager error", res["Message"]) if not opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) if len(self.operation.targetSEList) > 1: self.log.info( "file %s has been replicated to all targetSEs" % lfn) opFile.Status = "Done" elif self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) # Log error counts if delayExecution: self.log.info("Delay execution of the request by %d minutes" % delayExecution) self.request.delayNextExecution(delayExecution) for error, count in errors.items(): self.log.error(error, "for %d files" % count) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
} """ # TODO: use WMSHistory_testData.json as in Test_MonitoringDB.py # pylint: disable=invalid-name,wrong-import-position import DIRAC DIRAC.initialize() # Initialize configuration from DIRAC import gLogger from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter gLogger.setLevel("INFO") wmsMonitoringReporter = MonitoringReporter(monitoringType="WMSHistory") agentMonitoringReporter = MonitoringReporter(monitoringType="AgentMonitoring") serviceMonitoringReporter = MonitoringReporter( monitoringType="ServiceMonitoring") pilotMonitoringReporter = MonitoringReporter( monitoringType="PilotSubmissionMonitoring") pilotsHistoryReporter = MonitoringReporter(monitoringType="PilotsHistory") data = [ { "Status": "Waiting", "Jobs": 2, "timestamp": 1458130176, "JobSplitType": "MCStripping", "MinorStatus": "unset", "Site": "LCG.GRIDKA.de", "Reschedules": 0,
def __call__(self): """ call me maybe """ # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter( monitoringType="RMSMonitoring") else: # # RegisterFile specific monitor info gMonitor.registerActivity("RegisterAtt", "Attempted file registrations", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterOK", "Successful file registrations", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) gMonitor.registerActivity("RegisterFail", "Failed file registrations", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM) # # counter for failed files failedFiles = 0 # # catalog(s) to use catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(',')] dm = DataManager(catalogs=catalogs) # # get waiting files waitingFiles = self.getWaitingFilesList() if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Attempted", len(waitingFiles))) # # loop over files for opFile in waitingFiles: if not self.rmsMonitoring: gMonitor.addMark("RegisterAtt", 1) # # get LFN lfn = opFile.LFN # # and others fileTuple = (lfn, opFile.PFN, opFile.Size, self.operation.targetSEList[0], opFile.GUID, opFile.Checksum) # # call DataManager registerFile = dm.registerFile(fileTuple) # # check results if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RegisterFail", 1) # self.dataLoggingClient().addFileRecord( # lfn, "RegisterFail", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile") reason = str( registerFile.get( "Message", registerFile.get("Value", {}).get("Failed", {}).get(lfn, 'Unknown'))) errorStr = "failed to register LFN" opFile.Error = "%s: %s" % (errorStr, reason) if 'GUID already registered' in reason: opFile.Status = 'Failed' self.log.error(errorStr, "%s: %s" % (lfn, reason)) elif 'File already registered with no replicas' in reason: self.log.warn( errorStr, "%s: %s, will remove it and retry" % (lfn, reason)) dm.removeFile(lfn) else: self.log.warn(errorStr, "%s: %s" % (lfn, reason)) failedFiles += 1 else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RegisterOK", 1) # self.dataLoggingClient().addFileRecord( # lfn, "Register", ','.join(catalogs) if catalogs else "all catalogs", "", "RegisterFile") self.log.verbose( "file %s has been registered at %s" % (lfn, ','.join(catalogs) if catalogs else "all catalogs")) opFile.Status = "Done" if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # # final check if failedFiles: self.log.warn("all files processed, %s files failed to register" % failedFiles) self.operation.Error = "some files failed to register" return S_ERROR(self.operation.Error) return S_OK()
class TornadoServer(object): """ Tornado webserver Initialize and run an HTTPS Server for DIRAC services. By default it load all https services defined in the CS, but you can also give an explicit list. The listening port is either: * Given as parameter * Loaded from the CS ``/Systems/Tornado/<instance>/Port`` * Default to 8443 Example 1: Easy way to start tornado:: # Initialize server and load services serverToLaunch = TornadoServer() # Start listening when ready serverToLaunch.startTornado() Example 2:We want to debug service1 and service2 only, and use another port for that :: services = ['component/service1:port1', 'component/service2'] endpoints = ['component/endpoint1', 'component/endpoint2'] serverToLaunch = TornadoServer(services=services, endpoints=endpoints, port=1234) serverToLaunch.startTornado() """ def __init__(self, services=True, endpoints=False, port=None): """C'r :param list services: (default True) List of service handlers to load. If ``True``, loads all described in the CS If ``False``, do not load services :param list endpoints: (default False) List of endpoint handlers to load. If ``True``, loads all described in the CS If ``False``, do not load endpoints :param int port: Port to listen to. If ``None``, the port is resolved following the logic described in the class documentation """ self.__startTime = time.time() # Application metadata, routes and settings mapping on the ports self.__appsSettings = {} # Default port, if enother is not discover if port is None: port = gConfig.getValue( "/Systems/Tornado/%s/Port" % PathFinder.getSystemInstance("Tornado"), 8443) self.port = port # Handler manager initialization with default settings self.handlerManager = HandlerManager(services, endpoints) # temp value for computation, used by the monitoring self.__report = None # Last update time stamp self.__monitorLastStatsUpdate = None self.__monitoringLoopDelay = 60 # In secs self.activityMonitoring = False if "Monitoring" in Operations().getMonitoringBackends( monitoringType="ServiceMonitoring"): self.activityMonitoring = True # If services are defined, load only these ones (useful for debug purpose or specific services) retVal = self.handlerManager.loadServicesHandlers() if not retVal["OK"]: sLog.error(retVal["Message"]) raise ImportError( "Some services can't be loaded, check the service names and configuration." ) # Response time to load services self.__elapsedTime = time.time() - self.__startTime retVal = self.handlerManager.loadEndpointsHandlers() if not retVal["OK"]: sLog.error(retVal["Message"]) raise ImportError( "Some endpoints can't be loaded, check the endpoint names and configuration." ) def __calculateAppSettings(self): """Calculate application information mapping on the ports""" # if no service list is given, load services from configuration handlerDict = self.handlerManager.getHandlersDict() for data in handlerDict.values(): port = data.get("Port") or self.port for hURL in data["URLs"]: if port not in self.__appsSettings: self.__appsSettings[port] = {"routes": [], "settings": {}} if hURL not in self.__appsSettings[port]["routes"]: self.__appsSettings[port]["routes"].append(hURL) return bool(self.__appsSettings) def loadServices(self, services): """Load a services :param services: List of service handlers to load. Default value set at initialization If ``True``, loads all services from CS :type services: bool or list :return: S_OK()/S_ERROR() """ return self.handlerManager.loadServicesHandlers(services) def loadEndpoints(self, endpoints): """Load a endpoints :param endpoints: List of service handlers to load. Default value set at initialization If ``True``, loads all endpoints from CS :type endpoints: bool or list :return: S_OK()/S_ERROR() """ return self.handlerManager.loadEndpointsHandlers(endpoints) def addHandlers(self, routes, settings=None, port=None): """Add new routes :param list routes: routes :param dict settings: application settings :param int port: port """ port = port or self.port if port not in self.__appsSettings: self.__appsSettings[port] = {"routes": [], "settings": {}} if settings: self.__appsSettings[port]["settings"].update(settings) for route in routes: if route not in self.__appsSettings[port]["routes"]: self.__appsSettings[port]["routes"].append(route) return S_OK() def startTornado(self): """ Starts the tornado server when ready. This method never returns. """ # If there is no services loaded: if not self.__calculateAppSettings(): raise Exception( "There is no services loaded, please check your configuration") sLog.debug("Starting Tornado") # Prepare SSL settings certs = Locations.getHostCertificateAndKeyLocation() if certs is False: sLog.fatal("Host certificates not found ! Can't start the Server") raise ImportError("Unable to load certificates") ca = Locations.getCAsLocation() ssl_options = { "certfile": certs[0], "keyfile": certs[1], "cert_reqs": M2Crypto.SSL.verify_peer, "ca_certs": ca, "sslDebug": DEBUG_M2CRYPTO, # Set to true if you want to see the TLS debug messages } # Init monitoring if self.activityMonitoring: from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ServiceMonitoring") self.__monitorLastStatsUpdate = time.time() self.__report = self.__startReportToMonitoringLoop() # Response time # Starting monitoring, IOLoop waiting time in ms, __monitoringLoopDelay is defined in seconds tornado.ioloop.PeriodicCallback( self.__reportToMonitoring(self.__elapsedTime), self.__monitoringLoopDelay * 1000).start() # If we are running with python3, Tornado will use asyncio, # and we have to convince it to let us run in a different thread # Doing this ensures a consistent behavior between py2 and py3 asyncio.set_event_loop_policy( tornado.platform.asyncio.AnyThreadEventLoopPolicy()) for port, app in self.__appsSettings.items(): sLog.debug(" - %s" % "\n - ".join( ["%s = %s" % (k, ssl_options[k]) for k in ssl_options])) # Default server configuration settings = dict(compress_response=True, cookie_secret="secret") # Merge appllication settings settings.update(app["settings"]) # Start server router = Application(app["routes"], default_handler_class=NotFoundHandler, **settings) server = HTTPServer(router, ssl_options=ssl_options, decompress_request=True) try: server.listen(int(port)) except Exception as e: # pylint: disable=broad-except sLog.exception("Exception starting HTTPServer", e) raise sLog.always("Listening on port %s" % port) tornado.ioloop.IOLoop.current().start() def __reportToMonitoring(self, responseTime): """ Periodically reports to Monitoring """ # Calculate CPU usage by comparing realtime and cpu time since last report percentage = self.__endReportToMonitoringLoop(self.__report[0], self.__report[1]) # Send record to Monitoring self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "Tornado", "MemoryUsage": self.__report[2], "CpuPercentage": percentage, "ResponseTime": responseTime, }) self.activityMonitoringReporter.commit() # Save memory usage and save realtime/CPU time for next call self.__report = self.__startReportToMonitoringLoop() def __startReportToMonitoringLoop(self): """ Snapshot of resources to be taken at the beginning of a monitoring cycle. Also sends memory snapshot to the monitoring. This is basically copy/paste of Service.py :returns: tuple (<time.time(), cpuTime ) """ now = time.time() # Used to calulate a delta stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 0: return (now, cpuTime) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB("VmRSS:") if membytes: mem = membytes / (1024.0 * 1024.0) return (now, cpuTime, mem) def __endReportToMonitoringLoop(self, initialWallTime, initialCPUTime): """ Snapshot of resources to be taken at the end of a monitoring cycle. This is basically copy/paste of Service.py Determines CPU usage by comparing walltime and cputime and send it to monitor """ wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = cpuTime / wallTime * 100.0 return percentage
def startTornado(self): """ Starts the tornado server when ready. This method never returns. """ # If there is no services loaded: if not self.__calculateAppSettings(): raise Exception( "There is no services loaded, please check your configuration") sLog.debug("Starting Tornado") # Prepare SSL settings certs = Locations.getHostCertificateAndKeyLocation() if certs is False: sLog.fatal("Host certificates not found ! Can't start the Server") raise ImportError("Unable to load certificates") ca = Locations.getCAsLocation() ssl_options = { "certfile": certs[0], "keyfile": certs[1], "cert_reqs": M2Crypto.SSL.verify_peer, "ca_certs": ca, "sslDebug": DEBUG_M2CRYPTO, # Set to true if you want to see the TLS debug messages } # Init monitoring if self.activityMonitoring: from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ServiceMonitoring") self.__monitorLastStatsUpdate = time.time() self.__report = self.__startReportToMonitoringLoop() # Response time # Starting monitoring, IOLoop waiting time in ms, __monitoringLoopDelay is defined in seconds tornado.ioloop.PeriodicCallback( self.__reportToMonitoring(self.__elapsedTime), self.__monitoringLoopDelay * 1000).start() # If we are running with python3, Tornado will use asyncio, # and we have to convince it to let us run in a different thread # Doing this ensures a consistent behavior between py2 and py3 asyncio.set_event_loop_policy( tornado.platform.asyncio.AnyThreadEventLoopPolicy()) for port, app in self.__appsSettings.items(): sLog.debug(" - %s" % "\n - ".join( ["%s = %s" % (k, ssl_options[k]) for k in ssl_options])) # Default server configuration settings = dict(compress_response=True, cookie_secret="secret") # Merge appllication settings settings.update(app["settings"]) # Start server router = Application(app["routes"], default_handler_class=NotFoundHandler, **settings) server = HTTPServer(router, ssl_options=ssl_options, decompress_request=True) try: server.listen(int(port)) except Exception as e: # pylint: disable=broad-except sLog.exception("Exception starting HTTPServer", e) raise sLog.always("Listening on port %s" % port) tornado.ioloop.IOLoop.current().start()
class StatesMonitoringAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ __summaryKeyFieldsMapping = [ 'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType', 'ApplicationStatus', 'MinorStatus' ] __summaryDefinedFields = [('ApplicationStatus', 'unset'), ('MinorStatus', 'unset')] __summaryValueFieldsMapping = ['Jobs', 'Reschedules'] __renameFieldsMapping = {'JobType': 'JobSplitType'} __jobDBFields = [] jobDB = None monitoringReporter = None def initialize(self): """ Standard constructor """ self.jobDB = JobDB() self.am_setOption("PollingTime", 900) self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory') self.monitoringReporter = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=self.messageQueue) for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append(field) return S_OK() def execute(self): """ Main execution method """ result = gConfig.getSections("/DIRAC/Setups") if not result['OK']: return result validSetups = result['Value'] self.log.info("Valid setups for this cycle are %s" % ", ".join(validSetups)) # Get the WMS Snapshot! result = self.jobDB.getSummarySnapshot(self.__jobDBFields) now = Time.dateTime() if not result['OK']: self.log.error("Can't get the jobdb summary", result['Message']) else: values = result['Value'][1] self.log.info("Start sending records!") for record in values: recordSetup = record[0] if recordSetup not in validSetups: self.log.error("Setup %s is not valid" % recordSetup) continue record = record[1:] rD = {} for fV in self.__summaryDefinedFields: rD[fV[0]] = fV[1] for iP in range(len(self.__summaryKeyFieldsMapping)): fieldName = self.__summaryKeyFieldsMapping[iP] rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP] record = record[len(self.__summaryKeyFieldsMapping):] for iP in range(len(self.__summaryValueFieldsMapping)): rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP]) rD['timestamp'] = int(Time.toEpoch(now)) self.monitoringReporter.addRecord(rD) retVal = self.monitoringReporter.commit() if retVal['OK']: self.log.info( "The records are successfully sent to the Store!") else: self.log.warn( "Faild to insert the records! It will be retried in the next iteration", retVal['Message']) return S_OK()
class RemoveReplica(DMSRequestOperationsBase): """ .. class:: RemoveReplica """ def __init__(self, operation=None, csPath=None): """c'tor :param self: self reference :param Operation operation: operation to execute :param str csPath: CS path for this handler """ # # base class ctor DMSRequestOperationsBase.__init__(self, operation, csPath) def __call__(self): """remove replicas""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # gMonitor stuff gMonitor.registerActivity( "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) # # prepare list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess") if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("RemoveReplicaAtt") gMonitor.addMark("RemoveReplicaFail") return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # and prepare dict toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles) self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs))) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict))) else: gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict, None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("Removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error("Bulk replica removal failed", bulkRemoval["Message"]) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return bulkRemoval # # report removal status for successful files if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord( "Successful", len(([opFile for opFile in toRemoveDict.values() if not opFile.Error])) ) ) else: gMonitor.addMark( "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error]) ) # # 2nd step - process the rest again toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error) for lfn, opFile in toRetry.items(): self._removeWithOwnerProxy(opFile, targetSE) if opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RemoveReplicaOK", 1) # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = list(set(error for error in removalStatus[opFile.LFN].values() if error)) if errors: opFile.Error = "\n".join(errors) # This seems to be the only unrecoverable error if "Write access not permitted for this credential" in opFile.Error: failed += 1 opFile.Status = "Failed" else: opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK() def _bulkRemoval(self, toRemoveDict, targetSE): """remove replicas :toRemoveDict: at :targetSE: :param dict toRemoveDict: { lfn: opFile, ... } :param str targetSE: target SE name """ # Clear the error for opFile in toRemoveDict.values(): opFile.Error = "" removeReplicas = self.dm.removeReplica(targetSE, list(toRemoveDict)) if not removeReplicas["OK"]: for opFile in toRemoveDict.values(): opFile.Error = removeReplicas["Message"] return removeReplicas removeReplicas = removeReplicas["Value"] # # filter out failed for lfn, opFile in toRemoveDict.items(): if lfn in removeReplicas["Failed"]: errorReason = str(removeReplicas["Failed"][lfn]) # If the reason is that the file does not exist, # we consider the removal successful # TODO: use cmpError once the FC returns the proper error msg corresponding to ENOENT if "No such file" not in errorReason: opFile.Error = errorReason self.log.error("Failed removing lfn", "%s:%s" % (lfn, opFile.Error)) return S_OK() def _removeWithOwnerProxy(self, opFile, targetSE): """remove opFile replica from targetSE using owner proxy :param File opFile: File instance :param str targetSE: target SE name """ if "Write access not permitted for this credential" in opFile.Error: proxyFile = None if "DataManager" in self.shifter: # # you're a data manager - save current proxy and get a new one for LFN and retry saveProxy = os.environ["X509_USER_PROXY"] try: fileProxy = self.getProxyForLFN(opFile.LFN) if not fileProxy["OK"]: opFile.Error = fileProxy["Message"] else: proxyFile = fileProxy["Value"] removeReplica = self.dm.removeReplica(targetSE, opFile.LFN) if not removeReplica["OK"]: opFile.Error = removeReplica["Message"] else: # Set or reset the error if all OK opFile.Error = removeReplica["Value"]["Failed"].get(opFile.LFN, "") finally: if proxyFile: os.unlink(proxyFile) # # put back request owner proxy to env os.environ["X509_USER_PROXY"] = saveProxy
def __init__(self, *args, **kwargs): """c'tor""" # # call base class ctor AgentModule.__init__(self, *args, **kwargs) # # ProcessPool related stuff self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle) self.log.info("Requests/cycle = %d" % self.__requestsPerCycle) self.__minProcess = self.am_getOption("MinProcess", self.__minProcess) self.log.info("ProcessPool min process = %d" % self.__minProcess) self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess) self.log.info("ProcessPool max process = %d" % self.__maxProcess) self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize) self.log.info("ProcessPool queue size = %d" % self.__queueSize) self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout)) self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout) self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep)) self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep) self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest) self.log.info("Bulk request size = %d" % self.__bulkRequest) self.__rmsMonitoring = self.am_getOption("EnableRMSMonitoring", self.__rmsMonitoring) self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring) # # keep config path and agent name self.agentName = self.am_getModuleParam("fullName") self.__configPath = PathFinder.getAgentSection(self.agentName) # # operation handlers over here opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers") opHandlers = gConfig.getSections(opHandlersPath) if not opHandlers["OK"]: self.log.error(opHandlers["Message"]) raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath) opHandlers = opHandlers["Value"] self.timeOuts = dict() # # handlers dict self.handlersDict = dict() for opHandler in opHandlers: opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler) opLocation = gConfig.getValue(opHandlerPath, "") if not opLocation: self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler)) continue self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout} opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0) if opTimeout: self.timeOuts[opHandler]["PerOperation"] = opTimeout fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0) if fileTimeout: self.timeOuts[opHandler]["PerFile"] = fileTimeout self.handlersDict[opHandler] = opLocation self.log.info("Operation handlers:") for item in enumerate(self.handlersDict.items()): opHandler = item[1][0] self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( item[0], item[1][0], item[1][1], self.timeOuts[opHandler]["PerOperation"], self.timeOuts[opHandler]["PerFile"], ) ) if self.__rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting) else: # # common monitor activity gMonitor.registerActivity("Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM) gMonitor.registerActivity( "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM ) # # create request dict self.__requestCache = dict() # ?? Probably should be removed self.FTSMode = self.am_getOption("FTSMode", False)
class StatesAccountingAgent(AgentModule): """Agent that every 15 minutes will report to the AccountingDB (MySQL) or the Monitoring DB (ElasticSearch), or both, a snapshot of the JobDB. Also sends a snapshot of PilotAgentsDB to Monitoring. """ # WMSHistory fields __summaryKeyFieldsMapping = [ "Status", "Site", "User", "UserGroup", "JobGroup", "JobType", "ApplicationStatus", "MinorStatus", ] __summaryDefinedFields = [("ApplicationStatus", "unset"), ("MinorStatus", "unset")] __summaryValueFieldsMapping = ["Jobs", "Reschedules"] __renameFieldsMapping = {"JobType": "JobSplitType"} # PilotsHistory fields __pilotsMapping = [ "TaskQueueID", "GridSite", "GridType", "Status", "NumOfPilots" ] def initialize(self): """Standard initialization""" # This agent will always loop every 15 minutes self.am_setOption("PollingTime", 900) # Check whether to send to Monitoring or Accounting or both self.jobMonitoringOption = Operations().getMonitoringBackends( monitoringType="WMSHistory") self.pilotMonitoringOption = Operations().getMonitoringBackends( monitoringType="PilotsHistory") messageQueue = self.am_getOption("MessageQueue", "dirac.wmshistory") self.datastores = { } # For storing the clients to Accounting and Monitoring if "Accounting" in self.jobMonitoringOption: self.datastores["Accounting"] = DataStoreClient(retryGraceTime=900) if "Monitoring" in self.jobMonitoringOption: self.datastores["Monitoring"] = MonitoringReporter( monitoringType="WMSHistory", failoverQueueName=messageQueue) if "Monitoring" in self.pilotMonitoringOption: self.pilotReporter = MonitoringReporter( monitoringType="PilotsHistory", failoverQueueName=messageQueue) self.__jobDBFields = [] for field in self.__summaryKeyFieldsMapping: if field == "User": field = "Owner" elif field == "UserGroup": field = "OwnerGroup" self.__jobDBFields.append(field) return S_OK() def execute(self): """Main execution method""" # PilotsHistory to Monitoring if "Monitoring" in self.pilotMonitoringOption: self.log.info("Committing PilotsHistory to Monitoring") result = PilotAgentsDB().getSummarySnapshot() now = datetime.datetime.utcnow() if not result["OK"]: self.log.error( "Can't get the PilotAgentsDB summary", "%s: won't commit PilotsHistory at this cycle" % result["Message"], ) values = result["Value"][1] for record in values: rD = {} for iP, _ in enumerate(self.__pilotsMapping): rD[self.__pilotsMapping[iP]] = record[iP] rD["timestamp"] = int(TimeUtilities.toEpoch(now)) self.pilotReporter.addRecord(rD) self.log.info("Committing to Monitoring...") result = self.pilotReporter.commit() if not result["OK"]: self.log.error("Could not commit to Monitoring", result["Message"]) self.log.verbose("Done committing PilotsHistory to Monitoring") # WMSHistory to Monitoring or Accounting self.log.info("Committing WMSHistory to %s backend" % "and ".join(self.jobMonitoringOption)) result = JobDB().getSummarySnapshot(self.__jobDBFields) now = datetime.datetime.utcnow() if not result["OK"]: self.log.error( "Can't get the JobDB summary", "%s: won't commit WMSHistory at this cycle" % result["Message"]) return S_ERROR() values = result["Value"][1] self.log.info("Start sending WMSHistory records") for record in values: record = record[1:] rD = {} for fV in self.__summaryDefinedFields: rD[fV[0]] = fV[1] for iP, _ in enumerate(self.__summaryKeyFieldsMapping): fieldName = self.__summaryKeyFieldsMapping[iP] rD[self.__renameFieldsMapping.get(fieldName, fieldName)] = record[iP] record = record[len(self.__summaryKeyFieldsMapping):] for iP, _ in enumerate(self.__summaryValueFieldsMapping): rD[self.__summaryValueFieldsMapping[iP]] = int(record[iP]) for backend in self.datastores: if backend.lower() == "monitoring": rD["timestamp"] = int(TimeUtilities.toEpoch(now)) self.datastores["Monitoring"].addRecord(rD) elif backend.lower() == "accounting": acWMS = WMSHistory() acWMS.setStartTime(now) acWMS.setEndTime(now) acWMS.setValuesFromDict(rD) retVal = acWMS.checkValues() if not retVal["OK"]: self.log.error("Invalid WMSHistory accounting record ", "%s -> %s" % (retVal["Message"], rD)) else: self.datastores["Accounting"].addRegister(acWMS) for backend, datastore in self.datastores.items(): self.log.info("Committing WMSHistory records to %s backend" % backend) result = datastore.commit() if not result["OK"]: self.log.error("Couldn't commit WMSHistory to %s" % backend, result["Message"]) return S_ERROR() self.log.verbose("Done committing WMSHistory to %s backend" % backend) return S_OK()
class RequestExecutingAgent(AgentModule): """ .. class:: RequestExecutingAgent request processing agent using ProcessPool, Operation handlers and RequestTask """ # # process pool __processPool = None # # request cache __requestCache = {} # # requests/cycle __requestsPerCycle = 100 # # minimal nb of subprocess running __minProcess = 20 # # maximal nb of subprocess executed same time __maxProcess = 20 # # ProcessPool queue size __queueSize = 20 # # file timeout __fileTimeout = 300 # # operation timeout __operationTimeout = 300 # # ProcessPool finalization timeout __poolTimeout = 900 # # ProcessPool sleep time __poolSleep = 5 # # placeholder for RequestClient instance __requestClient = None # # Size of the bulk if use of getRequests. If 0, use getRequest __bulkRequest = 0 # # Send the monitoring data to ES rather than the Framework/Monitoring __rmsMonitoring = False def __init__(self, *args, **kwargs): """c'tor""" # # call base class ctor AgentModule.__init__(self, *args, **kwargs) # # ProcessPool related stuff self.__requestsPerCycle = self.am_getOption("RequestsPerCycle", self.__requestsPerCycle) self.log.info("Requests/cycle = %d" % self.__requestsPerCycle) self.__minProcess = self.am_getOption("MinProcess", self.__minProcess) self.log.info("ProcessPool min process = %d" % self.__minProcess) self.__maxProcess = self.am_getOption("MaxProcess", self.__maxProcess) self.log.info("ProcessPool max process = %d" % self.__maxProcess) self.__queueSize = self.am_getOption("ProcessPoolQueueSize", self.__queueSize) self.log.info("ProcessPool queue size = %d" % self.__queueSize) self.__poolTimeout = int(self.am_getOption("ProcessPoolTimeout", self.__poolTimeout)) self.log.info("ProcessPool timeout = %d seconds" % self.__poolTimeout) self.__poolSleep = int(self.am_getOption("ProcessPoolSleep", self.__poolSleep)) self.log.info("ProcessPool sleep time = %d seconds" % self.__poolSleep) self.__bulkRequest = self.am_getOption("BulkRequest", self.__bulkRequest) self.log.info("Bulk request size = %d" % self.__bulkRequest) self.__rmsMonitoring = self.am_getOption("EnableRMSMonitoring", self.__rmsMonitoring) self.log.info("Enable ES RMS Monitoring = %s" % self.__rmsMonitoring) # # keep config path and agent name self.agentName = self.am_getModuleParam("fullName") self.__configPath = PathFinder.getAgentSection(self.agentName) # # operation handlers over here opHandlersPath = "%s/%s" % (self.__configPath, "OperationHandlers") opHandlers = gConfig.getSections(opHandlersPath) if not opHandlers["OK"]: self.log.error(opHandlers["Message"]) raise AgentConfigError("OperationHandlers section not found in CS under %s" % self.__configPath) opHandlers = opHandlers["Value"] self.timeOuts = dict() # # handlers dict self.handlersDict = dict() for opHandler in opHandlers: opHandlerPath = "%s/%s/Location" % (opHandlersPath, opHandler) opLocation = gConfig.getValue(opHandlerPath, "") if not opLocation: self.log.error("%s not set for %s operation handler" % (opHandlerPath, opHandler)) continue self.timeOuts[opHandler] = {"PerFile": self.__fileTimeout, "PerOperation": self.__operationTimeout} opTimeout = gConfig.getValue("%s/%s/TimeOut" % (opHandlersPath, opHandler), 0) if opTimeout: self.timeOuts[opHandler]["PerOperation"] = opTimeout fileTimeout = gConfig.getValue("%s/%s/TimeOutPerFile" % (opHandlersPath, opHandler), 0) if fileTimeout: self.timeOuts[opHandler]["PerFile"] = fileTimeout self.handlersDict[opHandler] = opLocation self.log.info("Operation handlers:") for item in enumerate(self.handlersDict.items()): opHandler = item[1][0] self.log.info( "[%s] %s: %s (timeout: %d s + %d s per file)" % ( item[0], item[1][0], item[1][1], self.timeOuts[opHandler]["PerOperation"], self.timeOuts[opHandler]["PerFile"], ) ) if self.__rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") gThreadScheduler.addPeriodicTask(100, self.__rmsMonitoringReporting) else: # # common monitor activity gMonitor.registerActivity("Iteration", "Agent Loops", "RequestExecutingAgent", "Loops/min", gMonitor.OP_SUM) gMonitor.registerActivity( "Processed", "Request Processed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "Done", "Request Completed", "RequestExecutingAgent", "Requests/min", gMonitor.OP_SUM ) # # create request dict self.__requestCache = dict() # ?? Probably should be removed self.FTSMode = self.am_getOption("FTSMode", False) def processPool(self): """facade for ProcessPool""" if not self.__processPool: minProcess = max(1, self.__minProcess) maxProcess = max(self.__minProcess, self.__maxProcess) queueSize = abs(self.__queueSize) self.log.info( "REA ProcessPool configuration", "minProcess = %d maxProcess = %d queueSize = %d" % (minProcess, maxProcess, queueSize), ) self.__processPool = ProcessPool( minProcess, maxProcess, queueSize, poolCallback=self.resultCallback, poolExceptionCallback=self.exceptionCallback, ) self.__processPool.daemonize() return self.__processPool def requestClient(self): """RequestClient getter""" if not self.__requestClient: self.__requestClient = ReqClient() return self.__requestClient def cacheRequest(self, request): """put request into requestCache :param ~Request.Request request: Request instance """ maxProcess = max(self.__minProcess, self.__maxProcess) if len(self.__requestCache) > maxProcess + 50: # For the time being we just print a warning... If the ProcessPool is working well, this is not needed # We don't know how much is acceptable as it depends on many factors self.log.warn("Too many requests in cache", ": %d" % len(self.__requestCache)) # return S_ERROR( "Too many requests in cache" ) if request.RequestID in self.__requestCache: # We don't call putRequest as we have got back the request that is still being executed. Better keep it # The main reason for this is that it lasted longer than the kick time of CleanReqAgent self.log.warn( "Duplicate request, keep it but don't execute", ": %d/%s" % (request.RequestID, request.RequestName) ) return S_ERROR(errno.EALREADY, "Request already in cache") self.__requestCache[request.RequestID] = request return S_OK() def putRequest(self, requestID, taskResult=None): """put back :requestID: to RequestClient :param str requestID: request's id """ if requestID in self.__requestCache: request = self.__requestCache.pop(requestID) if taskResult: if taskResult["OK"]: request = taskResult["Value"] # The RequestTask is putting back the Done tasks, no need to redo it if request.Status == "Done": return S_OK() # In case of timeout, we need to increment ourselves all the attempts elif cmpError(taskResult, errno.ETIME): waitingOp = request.getWaiting() for rmsFile in waitingOp.get("Value", []): rmsFile.Attempt += 1 reset = self.requestClient().putRequest(request, useFailoverProxy=False, retryMainService=2) if not reset["OK"]: return S_ERROR("putRequest: unable to reset request %s: %s" % (requestID, reset["Message"])) else: return S_ERROR("Not in cache") return S_OK() def putAllRequests(self): """put back all requests without callback called into requestClient :param self: self reference """ self.log.info("putAllRequests: will put back requests", "%s" % len(self.__requestCache)) for requestID in self.__requestCache.keys(): reset = self.putRequest(requestID) if not reset["OK"]: self.log.error("Failed to put request", reset["Message"]) else: self.log.debug("putAllRequests: request %s has been put back with its initial state" % requestID) return S_OK() def initialize(self): """initialize agent""" return S_OK() def execute(self): """read requests from RequestClient and enqueue them into ProcessPool""" if not self.__rmsMonitoring: gMonitor.addMark("Iteration", 1) # # requests (and so tasks) counter taskCounter = 0 while taskCounter < self.__requestsPerCycle: self.log.debug("execute: executing %d request in this cycle" % taskCounter) requestsToExecute = [] if not self.__bulkRequest: self.log.info("execute: ask for a single request") getRequest = self.requestClient().getRequest() if not getRequest["OK"]: self.log.error("execute:", "%s" % getRequest["Message"]) break if not getRequest["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break requestsToExecute = [getRequest["Value"]] else: numberOfRequest = min(self.__bulkRequest, self.__requestsPerCycle - taskCounter) self.log.info("execute: ask for requests", "%s" % numberOfRequest) getRequests = self.requestClient().getBulkRequests(numberOfRequest) if not getRequests["OK"]: self.log.error("execute:", "%s" % getRequests["Message"]) break if not getRequests["Value"]: self.log.info("execute: no more 'Waiting' requests to process") break for rId in getRequests["Value"]["Failed"]: self.log.error("execute:", "%s" % getRequests["Value"]["Failed"][rId]) requestsToExecute = list(getRequests["Value"]["Successful"].values()) self.log.info("execute: will execute requests ", "%s" % len(requestsToExecute)) for request in requestsToExecute: # # set task id taskID = request.RequestID self.log.info( "processPool status", "tasks idle = %s working = %s" % (self.processPool().getNumIdleProcesses(), self.processPool().getNumWorkingProcesses()), ) looping = 0 while True: if not self.processPool().getFreeSlots(): if not looping: self.log.info( "No free slots available in processPool", "will wait %d seconds to proceed" % self.__poolSleep, ) time.sleep(self.__poolSleep) looping += 1 else: if looping: self.log.info("Free slot found", "after %d seconds" % looping * self.__poolSleep) looping = 0 # # save current request in cache res = self.cacheRequest(request) if not res["OK"]: if cmpError(res, errno.EALREADY): # The request is already in the cache, skip it. break out of the while loop to get next request break # There are too many requests in the cache, commit suicide self.log.error( "Too many requests in cache", "(%d requests): put back all requests and exit cycle. Error %s" % (len(self.__requestCache), res["Message"]), ) self.putAllRequests() return res # # serialize to JSON result = request.toJSON() if not result["OK"]: continue requestJSON = result["Value"] self.log.info("spawning task for request", "'%s/%s'" % (request.RequestID, request.RequestName)) timeOut = self.getTimeout(request) enqueue = self.processPool().createAndQueueTask( RequestTask, kwargs={ "requestJSON": requestJSON, "handlersDict": self.handlersDict, "csPath": self.__configPath, "agentName": self.agentName, "rmsMonitoring": self.__rmsMonitoring, }, taskID=taskID, blocking=True, usePoolCallbacks=True, timeOut=timeOut, ) if not enqueue["OK"]: self.log.error("Could not enqueue task", enqueue["Message"]) else: self.log.debug("successfully enqueued task", "'%s'" % taskID) # # update monitor if self.__rmsMonitoring: self.rmsMonitoringReporter.addRecord( { "timestamp": int(Time.toEpoch()), "host": Network.getFQDN(), "objectType": "Request", "status": "Attempted", "objectID": request.RequestID, "nbObject": 1, } ) else: gMonitor.addMark("Processed", 1) # # update request counter taskCounter += 1 # # task created, a little time kick to proceed time.sleep(0.1) break self.log.info("Flushing callbacks", "(%d requests still in cache)" % len(self.__requestCache)) processed = self.processPool().processResults() # This happens when the result queue is screwed up. # Returning S_ERROR proved not to be sufficient, # and when in this situation, there is nothing we can do. # So we just exit. runit will restart from scratch. if processed < 0: self.log.fatal("Results queue is screwed up") sys.exit(1) # # clean return return S_OK() def getTimeout(self, request): """get timeout for request""" timeout = 0 for op in request: if op.Status not in ("Waiting", "Scheduled", "Queued"): continue if op.Type not in self.timeOuts: timeout += self.__operationTimeout else: perOp = self.timeOuts[op.Type].get("PerOperation", self.__operationTimeout) perFiles = self.timeOuts[op.Type].get("PerFile", self.__fileTimeout) * len(op) timeout += perOp + perFiles self.log.info( "estimated timeOut for request", "(%s/%s) is %s" % (request.RequestID, request.RequestName, timeout) ) return timeout def finalize(self): """agent finalization""" if self.__processPool: self.processPool().finalize(timeout=self.__poolTimeout) self.putAllRequests() return S_OK() def resultCallback(self, taskID, taskResult): """definition of request callback function :param str taskID: Request.RequestID :param dict taskResult: task result S_OK(Request)/S_ERROR(Message) """ # # clean cache res = self.putRequest(taskID, taskResult) self.log.info( "callback:", "%s result is %s(%s), put %s(%s)" % ( taskID, "S_OK" if taskResult["OK"] else "S_ERROR", taskResult["Value"].Status if taskResult["OK"] else taskResult["Message"], "S_OK" if res["OK"] else "S_ERROR", "" if res["OK"] else res["Message"], ), ) def exceptionCallback(self, taskID, taskException): """definition of exception callback function :param str taskID: Request.RequestID :param Exception taskException: Exception instance """ self.log.error("exceptionCallback:", "%s was hit by exception %s" % (taskID, taskException)) self.putRequest(taskID) def __rmsMonitoringReporting(self): """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which is done by the MonitoringReporter and is send to the 'RMSMonitoring' type. :return: True / False """ result = self.rmsMonitoringReporter.commit() return result["OK"]
def __call__(self): """perform physical removal operation""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") bannedTargets = self.checkSEsRSS(access="RemoveAccess") if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # prepare lfn dict toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles) targetSEs = self.operation.targetSEList if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict))) self.rmsMonitoringReporter.commit() # # keep errors dict removalStatus = dict.fromkeys(toRemoveDict.keys(), None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, "") for targetSE in targetSEs: self.log.info("removing files from %s" % targetSE) # # 1st - bulk removal bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error("Failed bulk removal", bulkRemoval["Message"]) self.operation.Error = bulkRemoval["Message"] return bulkRemoval bulkRemoval = bulkRemoval["Value"] for lfn, opFile in toRemoveDict.items(): removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get(lfn, "") opFile.Error = removalStatus[lfn][targetSE] # # 2nd - single file removal toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"]) for lfn, opFile in toRetry.items(): self.singleRemoval(opFile, targetSE) if not opFile.Error: removalStatus[lfn][targetSE] = "" else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) removalStatus[lfn][targetSE] = opFile.Error # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = [error for error in removalStatus[opFile.LFN].values() if error.strip()] if errors: failed += 1 opFile.Error = ",".join(errors) if "Write access not permitted for this credential" in opFile.Error: opFile.Status = "Failed" if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) continue if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s files" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
def __call__(self): """remove replicas""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # gMonitor stuff gMonitor.registerActivity( "RemoveReplicaAtt", "Replica removals attempted", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveReplicaOK", "Successful replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveReplicaFail", "Failed replica removals", "RequestExecutingAgent", "Files/min", gMonitor.OP_SUM ) # # prepare list of targetSEs targetSEs = self.operation.targetSEList # # check targetSEs for removal bannedTargets = self.checkSEsRSS(targetSEs, access="RemoveAccess") if not bannedTargets["OK"]: if self.rmsMonitoring: for status in ["Attempted", "Failed"]: self.rmsMonitoringReporter.addRecord(self.createRMSRecord(status, len(self.operation))) self.rmsMonitoringReporter.commit() else: gMonitor.addMark("RemoveReplicaAtt") gMonitor.addMark("RemoveReplicaFail") return bannedTargets if bannedTargets["Value"]: return S_OK("%s targets are banned for removal" % ",".join(bannedTargets["Value"])) # # get waiting files waitingFiles = self.getWaitingFilesList() # # and prepare dict toRemoveDict = dict((opFile.LFN, opFile) for opFile in waitingFiles) self.log.info("Todo: %s replicas to delete from %s SEs" % (len(toRemoveDict), len(targetSEs))) if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(toRemoveDict))) else: gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs)) # # keep status for each targetSE removalStatus = dict.fromkeys(toRemoveDict, None) for lfn in removalStatus: removalStatus[lfn] = dict.fromkeys(targetSEs, None) # # loop over targetSEs for targetSE in targetSEs: self.log.info("Removing replicas at %s" % targetSE) # # 1st step - bulk removal bulkRemoval = self._bulkRemoval(toRemoveDict, targetSE) if not bulkRemoval["OK"]: self.log.error("Bulk replica removal failed", bulkRemoval["Message"]) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return bulkRemoval # # report removal status for successful files if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord( self.createRMSRecord( "Successful", len(([opFile for opFile in toRemoveDict.values() if not opFile.Error])) ) ) else: gMonitor.addMark( "RemoveReplicaOK", len([opFile for opFile in toRemoveDict.values() if not opFile.Error]) ) # # 2nd step - process the rest again toRetry = dict((lfn, opFile) for lfn, opFile in toRemoveDict.items() if opFile.Error) for lfn, opFile in toRetry.items(): self._removeWithOwnerProxy(opFile, targetSE) if opFile.Error: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RemoveReplicaFail", 1) removalStatus[lfn][targetSE] = opFile.Error else: if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RemoveReplicaOK", 1) # # update file status for waiting files failed = 0 for opFile in self.operation: if opFile.Status == "Waiting": errors = list(set(error for error in removalStatus[opFile.LFN].values() if error)) if errors: opFile.Error = "\n".join(errors) # This seems to be the only unrecoverable error if "Write access not permitted for this credential" in opFile.Error: failed += 1 opFile.Status = "Failed" else: opFile.Status = "Done" if failed: self.operation.Error = "failed to remove %s replicas" % failed if self.rmsMonitoring: self.rmsMonitoringReporter.commit() return S_OK()
class Service(object): SVC_VALID_ACTIONS = { "RPC": "export", "FileTransfer": "transfer", "Message": "msg", "Connection": "Message" } SVC_SECLOG_CLIENT = SecurityLogClient() def __init__(self, serviceData): """ Init the variables for the service :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.: {'modName': 'Framework/serviceName', 'standalone': True, 'loadName': 'Framework/serviceName', 'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>, 'classObj': <class 'serviceNameHandler.serviceHandler'>} """ self._svcData = serviceData self._name = serviceData["modName"] self._startTime = datetime.datetime.utcnow() self._validNames = [serviceData["modName"]] if serviceData["loadName"] not in self._validNames: self._validNames.append(serviceData["loadName"]) self._cfg = ServiceConfiguration(list(self._validNames)) self._standalone = serviceData["standalone"] self.__monitorLastStatsUpdate = time.time() self._stats = {"queries": 0, "connections": 0} self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection(serviceData["loadName"])) self._transportPool = getGlobalTransportPool() self.__cloneId = 0 self.__maxFD = 0 self.activityMonitoring = False # Check if monitoring is enabled if "Monitoring" in Operations().getMonitoringBackends( monitoringType="ServiceMonitoring"): self.activityMonitoring = True def setCloneProcessId(self, cloneId): self.__cloneId = cloneId def _isMetaAction(self, action): referedAction = Service.SVC_VALID_ACTIONS[action] if referedAction in Service.SVC_VALID_ACTIONS: return referedAction return False def initialize(self): # Build the URLs self._url = self._cfg.getURL() if not self._url: return S_ERROR("Could not build service URL for %s" % self._name) gLogger.verbose("Service URL is %s" % self._url) # Load handler result = self._loadHandlerInit() if not result["OK"]: return result self._handler = result["Value"] # Initialize lock manager self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions()) self._threadPool = ThreadPoolExecutor(max(0, self._cfg.getMaxThreads())) self._msgBroker = MessageBroker("%sMSB" % self._name, threadPool=self._threadPool) # Create static dict self._serviceInfoDict = { "serviceName": self._name, "serviceSectionPath": PathFinder.getServiceSection(self._name), "URL": self._cfg.getURL(), "messageSender": MessageSender(self._name, self._msgBroker), "validNames": self._validNames, "csPaths": [ PathFinder.getServiceSection(svcName) for svcName in self._validNames ], } self.securityLogging = Operations().getValue( "EnableSecurityLogging", True) and getServiceOption( self._serviceInfoDict, "EnableSecurityLogging", True) # Initialize Monitoring # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ServiceMonitoring") self._initMonitoring() # Call static initialization function try: self._handler["class"]._rh__initializeClass( dict(self._serviceInfoDict), self._lockManager, self._msgBroker, self.activityMonitoringReporter) if self._handler["init"]: for initFunc in self._handler["init"]: gLogger.verbose("Executing initialization function") try: result = initFunc(dict(self._serviceInfoDict)) except Exception as excp: gLogger.exception( "Exception while calling initialization function", lException=excp) return S_ERROR( "Exception while calling initialization function: %s" % str(excp)) if not isReturnStructure(result): return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc) if not result["OK"]: return S_ERROR("Error while initializing %s: %s" % (self._name, result["Message"])) except Exception as e: errMsg = "Exception while initializing %s" % self._name gLogger.exception(e) gLogger.exception(errMsg) return S_ERROR(errMsg) if self.activityMonitoring: gThreadScheduler.addPeriodicTask(30, self.__reportActivity) gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) # Load actions after the handler has initialized itself result = self._loadActions() if not result["OK"]: return result self._actions = result["Value"] return S_OK() def __searchInitFunctions(self, handlerClass, currentClass=None): if not currentClass: currentClass = handlerClass initFuncs = [] ancestorHasInit = False for ancestor in currentClass.__bases__: initFuncs += self.__searchInitFunctions(handlerClass, ancestor) if "initializeHandler" in dir(ancestor): ancestorHasInit = True if ancestorHasInit: initFuncs.append( super(currentClass, handlerClass).initializeHandler) if currentClass == handlerClass and "initializeHandler" in dir( handlerClass): initFuncs.append(handlerClass.initializeHandler) return initFuncs def _loadHandlerInit(self): handlerClass = self._svcData["classObj"] handlerName = handlerClass.__name__ handlerInitMethods = self.__searchInitFunctions(handlerClass) try: handlerInitMethods.append( getattr(self._svcData["moduleObj"], "initialize%s" % handlerName)) except AttributeError: gLogger.verbose( "Not found global initialization function for service") if handlerInitMethods: gLogger.info("Found %s initialization methods" % len(handlerInitMethods)) handlerInfo = {} handlerInfo["name"] = handlerName handlerInfo["module"] = self._svcData["moduleObj"] handlerInfo["class"] = handlerClass handlerInfo["init"] = handlerInitMethods return S_OK(handlerInfo) def _loadActions(self): handlerClass = self._handler["class"] authRules = {} typeCheck = {} methodsList = {} for actionType in Service.SVC_VALID_ACTIONS: if self._isMetaAction(actionType): continue authRules[actionType] = {} typeCheck[actionType] = {} methodsList[actionType] = [] handlerAttributeList = dir(handlerClass) for actionType in Service.SVC_VALID_ACTIONS: if self._isMetaAction(actionType): continue methodPrefix = "%s_" % Service.SVC_VALID_ACTIONS[actionType] for attribute in handlerAttributeList: if attribute.find(methodPrefix) != 0: continue exportedName = attribute[len(methodPrefix):] methodsList[actionType].append(exportedName) gLogger.verbose("+ Found %s method %s" % (actionType, exportedName)) # Create lock for method self._lockManager.createLock( "%s/%s" % (actionType, exportedName), self._cfg.getMaxThreadsForMethod(actionType, exportedName)) # Look for type and auth rules if actionType == "RPC": typeAttr = "types_%s" % exportedName authAttr = "auth_%s" % exportedName else: typeAttr = "types_%s_%s" % ( Service.SVC_VALID_ACTIONS[actionType], exportedName) authAttr = "auth_%s_%s" % ( Service.SVC_VALID_ACTIONS[actionType], exportedName) if typeAttr in handlerAttributeList: obj = getattr(handlerClass, typeAttr) gLogger.verbose("|- Found type definition %s: %s" % (typeAttr, str(obj))) typeCheck[actionType][exportedName] = obj if authAttr in handlerAttributeList: obj = getattr(handlerClass, authAttr) gLogger.verbose("|- Found auth rules %s: %s" % (authAttr, str(obj))) authRules[actionType][exportedName] = obj for actionType in Service.SVC_VALID_ACTIONS: referedAction = self._isMetaAction(actionType) if not referedAction: continue gLogger.verbose("Action %s is a meta action for %s" % (actionType, referedAction)) authRules[actionType] = [] for method in authRules[referedAction]: for prop in authRules[referedAction][method]: if prop not in authRules[actionType]: authRules[actionType].append(prop) gLogger.verbose("Meta action %s props are %s" % (actionType, authRules[actionType])) return S_OK({ "methods": methodsList, "auth": authRules, "types": typeCheck }) def _initMonitoring(self): props = [("__doc__", "description")] for prop in props: try: value = getattr(self._handler["module"], prop[0]) except Exception as e: gLogger.exception(e) gLogger.error("Missing property", prop[0]) value = "unset" for secondaryName in self._cfg.registerAlsoAs(): gLogger.info("Registering %s also as %s" % (self._name, secondaryName)) self._validNames.append(secondaryName) return S_OK() def __reportActivity(self): initialWallTime, initialCPUTime, mem = self.__startReportToMonitoring() pendingQueries = self._threadPool._work_queue.qsize() activeQuereies = len(self._threadPool._threads) percentage = self.__endReportToMonitoring(initialWallTime, initialCPUTime) self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "MemoryUsage": mem, "CpuPercentage": percentage, "PendingQueries": pendingQueries, "ActiveQueries": activeQuereies, "RunningThreads": threading.activeCount(), "MaxFD": self.__maxFD, }) self.__maxFD = 0 def getConfig(self): return self._cfg # End of initialization functions def handleConnection(self, clientTransport): """ This method may be called by ServiceReactor. The method stacks openened connection in a queue, another thread read this queue and handle connection. :param clientTransport: Object which describes opened connection (PlainTransport or SSLTransport) """ if not self.activityMonitoring: self._stats["connections"] += 1 self._threadPool.submit(self._processInThread, clientTransport) @property def wantsThrottle(self): """Boolean property for if the service wants requests to stop being accepted""" nQueued = self._threadPool._work_queue.qsize() return nQueued > self._cfg.getMaxWaitingPetitions() # Threaded process function def _processInThread(self, clientTransport): """ This method handles a RPC, FileTransfer or Connection. Connection may be opened via ServiceReactor.__acceptIncomingConnection - Do the SSL/TLS Handshake (if dips is used) and extract credentials - Get the action called by the client - Check if the client is authorized to perform ation - If not, connection is closed - Instanciate the RequestHandler (RequestHandler contain all methods callable) (Following is not directly in this method but it describe what happen at #Execute the action) - Notify the client we're ready to execute the action (via _processProposal) and call RequestHandler._rh_executeAction() - Receive arguments/file/something else (depending on action) in the RequestHandler - Executing the action asked by the client :param clientTransport: Object which describe the opened connection (SSLTransport or PlainTransport) :return: S_OK with "closeTransport" a boolean to indicate if th connection have to be closed e.g. after RPC, closeTransport=True """ self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno()) self._lockManager.lockGlobal() try: monReport = self.__startReportToMonitoring() except Exception: monReport = False try: # Handshake try: result = clientTransport.handshake() if not result["OK"]: clientTransport.close() return except Exception: return # Add to the transport pool trid = self._transportPool.add(clientTransport) if not trid: return # Receive and check proposal result = self._receiveAndCheckProposal(trid) if not result["OK"]: self._transportPool.sendAndClose(trid, result) return proposalTuple = result["Value"] # Instantiate handler result = self._instantiateHandler(trid, proposalTuple) if not result["OK"]: self._transportPool.sendAndClose(trid, result) return handlerObj = result["Value"] # Execute the action result = self._processProposal(trid, proposalTuple, handlerObj) # Close the connection if required if result["closeTransport"] or not result["OK"]: if not result["OK"]: gLogger.error("Error processing proposal", result["Message"]) self._transportPool.close(trid) return result finally: self._lockManager.unlockGlobal() if monReport: self.__endReportToMonitoring(monReport[0], monReport[1]) @staticmethod def _createIdentityString(credDict, clientTransport=None): if "username" in credDict: if "group" in credDict: identity = "[%s:%s]" % (credDict["username"], credDict["group"]) else: identity = "[%s:unknown]" % credDict["username"] else: identity = "unknown" if clientTransport: addr = clientTransport.getRemoteAddress() if addr: addr = "{%s:%s}" % (addr[0], addr[1]) if "DN" in credDict: identity += "(%s)" % credDict["DN"] return identity @staticmethod def _deserializeProposalTuple(serializedProposal): """We receive the proposalTuple as a list. Turn it into a tuple again """ proposalTuple = tuple( tuple(x) if isinstance(x, list) else x for x in serializedProposal) return proposalTuple def _receiveAndCheckProposal(self, trid): clientTransport = self._transportPool.get(trid) # Get the peer credentials credDict = clientTransport.getConnectingCredentials() # Receive the action proposal retVal = clientTransport.receiveData(1024) if not retVal["OK"]: gLogger.error( "Invalid action proposal", "%s %s" % (self._createIdentityString( credDict, clientTransport), retVal["Message"]), ) return S_ERROR("Invalid action proposal") proposalTuple = Service._deserializeProposalTuple(retVal["Value"]) gLogger.debug("Received action from client", "/".join(list(proposalTuple[1]))) # Check if there are extra credentials if proposalTuple[2]: clientTransport.setExtraCredentials(proposalTuple[2]) # Check if this is the requested service requestedService = proposalTuple[0][0] if requestedService not in self._validNames: return S_ERROR("%s is not up in this server" % requestedService) # Check if the action is valid requestedActionType = proposalTuple[1][0] if requestedActionType not in Service.SVC_VALID_ACTIONS: return S_ERROR("%s is not a known action type" % requestedActionType) # Check if it's authorized result = self._authorizeProposal(proposalTuple[1], trid, credDict) if not result["OK"]: return result # Proposal is OK return S_OK(proposalTuple) def _authorizeProposal(self, actionTuple, trid, credDict): # Find CS path for the Auth rules referedAction = self._isMetaAction(actionTuple[0]) if referedAction: csAuthPath = "%s/Default" % actionTuple[0] hardcodedMethodAuth = self._actions["auth"][actionTuple[0]] else: if actionTuple[0] == "RPC": csAuthPath = actionTuple[1] else: csAuthPath = "/".join(actionTuple) # Find if there are hardcoded auth rules in the code hardcodedMethodAuth = False if actionTuple[0] in self._actions["auth"]: hardcodedRulesByType = self._actions["auth"][actionTuple[0]] if actionTuple[0] == "FileTransfer": methodName = actionTuple[1][0].lower() + actionTuple[1][1:] else: methodName = actionTuple[1] if methodName in hardcodedRulesByType: hardcodedMethodAuth = hardcodedRulesByType[methodName] # Auth time! if not self._authMgr.authQuery(csAuthPath, credDict, hardcodedMethodAuth): # Get the identity string identity = self._createIdentityString(credDict) fromHost = "unknown host" tr = self._transportPool.get(trid) if tr: fromHost = "/".join( [str(item) for item in tr.getRemoteAddress()]) gLogger.warn( "Unauthorized query", "to %s:%s by %s from %s" % (self._name, "/".join(actionTuple), identity, fromHost)) result = S_ERROR(ENOAUTH, "Unauthorized query") else: result = S_OK() # Security log tr = self._transportPool.get(trid) if not tr: return S_ERROR("Client disconnected") sourceAddress = tr.getRemoteAddress() identity = self._createIdentityString(credDict) if self.securityLogging: Service.SVC_SECLOG_CLIENT.addMessage( result["OK"], sourceAddress[0], sourceAddress[1], identity, self._cfg.getHostname(), self._cfg.getPort(), self._name, "/".join(actionTuple), ) return result def _instantiateHandler(self, trid, proposalTuple=None): """ Generate an instance of the handler for a given service :param int trid: transport ID :param tuple proposalTuple: tuple describing the proposed action :return: S_OK/S_ERROR, Value is the handler object """ # Generate the client params clientParams = {"serviceStartTime": self._startTime} if proposalTuple: # The 4th element is the client version clientParams["clientVersion"] = proposalTuple[3] if len( proposalTuple) > 3 else None clientParams["clientSetup"] = proposalTuple[0][1] if len(proposalTuple[0]) < 3: clientParams["clientVO"] = gConfig.getValue( "/DIRAC/VirtualOrganization", "unknown") else: clientParams["clientVO"] = proposalTuple[0][2] clientTransport = self._transportPool.get(trid) if clientTransport: clientParams["clientAddress"] = clientTransport.getRemoteAddress() # Generate handler dict with per client info handlerInitDict = dict(self._serviceInfoDict) for key in clientParams: handlerInitDict[key] = clientParams[key] # Instantiate and initialize try: handlerInstance = self._handler["class"](handlerInitDict, trid) handlerInstance.initialize() except Exception as e: gLogger.exception("Server error while loading handler: %s" % str(e)) return S_ERROR("Server error while loading handler") return S_OK(handlerInstance) def _processProposal(self, trid, proposalTuple, handlerObj): # Notify the client we're ready to execute the action retVal = self._transportPool.send(trid, S_OK()) if not retVal["OK"]: return retVal messageConnection = False if proposalTuple[1] == ("Connection", "new"): messageConnection = True if messageConnection: if self._msgBroker.getNumConnections( ) > self._cfg.getMaxMessagingConnections(): result = S_ERROR( "Maximum number of connections reached. Try later") result["closeTransport"] = True return result # This is a stable connection self._msgBroker.addTransportId( trid, self._name, receiveMessageCallback=self._mbReceivedMsg, disconnectCallback=self._mbDisconnect, listenToConnection=False, ) result = self._executeAction(trid, proposalTuple, handlerObj) if result["OK"] and messageConnection: self._msgBroker.listenToTransport(trid) result = self._mbConnect(trid, handlerObj) if not result["OK"]: self._msgBroker.removeTransport(trid) result["closeTransport"] = not messageConnection or not result["OK"] return result def _mbConnect(self, trid, handlerObj=None): if not handlerObj: result = self._instantiateHandler(trid) if not result["OK"]: return result handlerObj = result["Value"] return handlerObj._rh_executeConnectionCallback("connected") def _executeAction(self, trid, proposalTuple, handlerObj): try: response = handlerObj._rh_executeAction(proposalTuple) if not response["OK"]: return response if self.activityMonitoring: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) return response["Value"][0] except Exception as e: gLogger.exception("Exception while executing handler action") return S_ERROR("Server error while executing action: %s" % str(e)) def _mbReceivedMsg(self, trid, msgObj): result = self._authorizeProposal( ("Message", msgObj.getName()), trid, self._transportPool.get(trid).getConnectingCredentials()) if not result["OK"]: return result result = self._instantiateHandler(trid) if not result["OK"]: return result handlerObj = result["Value"] response = handlerObj._rh_executeMessageCallback(msgObj) if self.activityMonitoring and response["OK"]: self.activityMonitoringReporter.addRecord({ "timestamp": int(TimeUtilities.toEpoch()), "Host": Network.getFQDN(), "ServiceName": "_".join(self._name.split("/")), "Location": self._cfg.getURL(), "ResponseTime": response["Value"][1], }) if response["OK"]: return response["Value"][0] else: return response def _mbDisconnect(self, trid): result = self._instantiateHandler(trid) if not result["OK"]: return result handlerObj = result["Value"] return handlerObj._rh_executeConnectionCallback("drop") def __activityMonitoringReporting(self): """This method is called by the ThreadScheduler as a periodic task in order to commit the collected data which is done by the MonitoringReporter and is send to the 'ComponentMonitoring' type. :return: True / False """ return self.activityMonitoringReporter.commit() def __startReportToMonitoring(self): now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] mem = None if now - self.__monitorLastStatsUpdate < 0: return (now, cpuTime, mem) self.__monitorLastStatsUpdate = now membytes = MemStat.VmB("VmRSS:") if membytes: mem = membytes / (1024.0 * 1024.0) return (now, cpuTime, mem) def __endReportToMonitoring(self, initialWallTime, initialCPUTime): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = cpuTime / wallTime * 100.0 return percentage
def __call__(self): """call me maybe""" # The flag 'rmsMonitoring' is set by the RequestTask and is False by default. # Here we use 'createRMSRecord' to create the ES record which is defined inside OperationHandlerBase. if self.rmsMonitoring: self.rmsMonitoringReporter = MonitoringReporter(monitoringType="RMSMonitoring") else: # # RegisterReplica specific monitor info gMonitor.registerActivity( "RegisterReplicaAtt", "Attempted replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) gMonitor.registerActivity( "RegisterReplicaOK", "Successful replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) gMonitor.registerActivity( "RegisterReplicaFail", "Failed replicas registrations", "RequestExecutingAgent", "Replicas/min", gMonitor.OP_SUM, ) # # counter for failed replicas failedReplicas = 0 # # catalog to use catalogs = self.operation.Catalog if catalogs: catalogs = [cat.strip() for cat in catalogs.split(",")] # # get waiting files waitingFiles = self.getWaitingFilesList() if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Attempted", len(waitingFiles))) # # loop over files registerOperations = {} successReplicas = 0 for opFile in waitingFiles: if not self.rmsMonitoring: gMonitor.addMark("RegisterReplicaAtt", 1) # # get LFN lfn = opFile.LFN # # and others targetSE = self.operation.targetSEList[0] replicaTuple = (lfn, opFile.PFN, targetSE) # # call ReplicaManager registerReplica = self.dm.registerReplica(replicaTuple, catalogs) # # check results if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]: # There have been some errors if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Failed", 1)) else: gMonitor.addMark("RegisterReplicaFail", 1) # self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", ','.join( catalogs ) if catalogs else "all catalogs", "", "RegisterReplica" ) reason = registerReplica.get( "Message", registerReplica.get("Value", {}).get("Failed", {}).get(lfn, "Unknown") ) errorStr = "failed to register LFN %s: %s" % (lfn, str(reason)) # FIXME: this is incompatible with the change made in the DM that we # ignore failures if successful in at least one catalog if lfn in registerReplica.get("Value", {}).get("Successful", {}) and isinstance(reason, dict): # As we managed, let's create a new operation for just the remaining registration errorStr += " - adding registerReplica operations to request" for failedCatalog in reason: key = "%s/%s" % (targetSE, failedCatalog) newOperation = self.getRegisterOperation( opFile, targetSE, type="RegisterReplica", catalog=failedCatalog ) if key not in registerOperations: registerOperations[key] = newOperation else: registerOperations[key].addFile(newOperation[0]) opFile.Status = "Done" else: opFile.Error = errorStr catMaster = True if isinstance(reason, dict): from DIRAC.Resources.Catalog.FileCatalog import FileCatalog for failedCatalog in reason: catMaster = catMaster and FileCatalog()._getCatalogConfigDetails(failedCatalog).get( "Value", {} ).get("Master", False) # If one targets explicitly a catalog and it fails or if it fails on the master catalog if (catalogs or catMaster) and ( "file does not exist" in opFile.Error.lower() or "no such file" in opFile.Error.lower() ): # Check if the file really exists in SE, if not, consider this file registration as Done res = self.dm.getReplicaMetadata(lfn, targetSE) notExist = bool("No such file" in res.get("Value", {}).get("Failed", {}).get(lfn, "")) if not notExist: opFile.Status = "Failed" else: opFile.Status = "Done" if opFile.Status != "Done": failedReplicas += 1 self.log.warn(errorStr) else: # All is OK if self.rmsMonitoring: self.rmsMonitoringReporter.addRecord(self.createRMSRecord("Successful", 1)) else: gMonitor.addMark("RegisterReplicaOK", 1) successReplicas += 1 self.log.verbose( "Replica %s has been registered at %s" % (lfn, ",".join(catalogs) if catalogs else "all catalogs") ) opFile.Status = "Done" # # if we have new replications to take place, put them at the end if registerOperations: self.log.info("adding %d operations to the request" % len(registerOperations)) for operation in registerOperations.values(): self.operation._parent.addOperation(operation) if self.rmsMonitoring: self.rmsMonitoringReporter.commit() # # final check infoStr = "" if successReplicas: infoStr = "%d replicas successfully registered" % successReplicas if failedReplicas: infoStr += ", %d replicas failed to register" % failedReplicas self.log.info("All replicas processed", infoStr) if failedReplicas: self.operation.Error = "some replicas failed to register" return S_ERROR(self.operation.Error) return S_OK()
def initialize(self): # Build the URLs self._url = self._cfg.getURL() if not self._url: return S_ERROR("Could not build service URL for %s" % self._name) gLogger.verbose("Service URL is %s" % self._url) # Load handler result = self._loadHandlerInit() if not result["OK"]: return result self._handler = result["Value"] # Initialize lock manager self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions()) self._threadPool = ThreadPoolExecutor(max(0, self._cfg.getMaxThreads())) self._msgBroker = MessageBroker("%sMSB" % self._name, threadPool=self._threadPool) # Create static dict self._serviceInfoDict = { "serviceName": self._name, "serviceSectionPath": PathFinder.getServiceSection(self._name), "URL": self._cfg.getURL(), "messageSender": MessageSender(self._name, self._msgBroker), "validNames": self._validNames, "csPaths": [ PathFinder.getServiceSection(svcName) for svcName in self._validNames ], } self.securityLogging = Operations().getValue( "EnableSecurityLogging", True) and getServiceOption( self._serviceInfoDict, "EnableSecurityLogging", True) # Initialize Monitoring # The import needs to be here because of the CS must be initialized before importing # this class (see https://github.com/DIRACGrid/DIRAC/issues/4793) from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter self.activityMonitoringReporter = MonitoringReporter( monitoringType="ServiceMonitoring") self._initMonitoring() # Call static initialization function try: self._handler["class"]._rh__initializeClass( dict(self._serviceInfoDict), self._lockManager, self._msgBroker, self.activityMonitoringReporter) if self._handler["init"]: for initFunc in self._handler["init"]: gLogger.verbose("Executing initialization function") try: result = initFunc(dict(self._serviceInfoDict)) except Exception as excp: gLogger.exception( "Exception while calling initialization function", lException=excp) return S_ERROR( "Exception while calling initialization function: %s" % str(excp)) if not isReturnStructure(result): return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc) if not result["OK"]: return S_ERROR("Error while initializing %s: %s" % (self._name, result["Message"])) except Exception as e: errMsg = "Exception while initializing %s" % self._name gLogger.exception(e) gLogger.exception(errMsg) return S_ERROR(errMsg) if self.activityMonitoring: gThreadScheduler.addPeriodicTask(30, self.__reportActivity) gThreadScheduler.addPeriodicTask( 100, self.__activityMonitoringReporting) # Load actions after the handler has initialized itself result = self._loadActions() if not result["OK"]: return result self._actions = result["Value"] return S_OK()
def setUp(self): gLogger.setLevel('INFO') self.wmsMonitoringReporter = MonitoringReporter( monitoringType="WMSHistory") self.componentMonitoringReporter = MonitoringReporter( monitoringType="ComponentMonitoring") self.data = [{ "Status": "Waiting", "Jobs": 2, "timestamp": 1458130176, "JobSplitType": "MCStripping", "MinorStatus": "unset", "Site": "LCG.GRIDKA.de", "Reschedules": 0, "ApplicationStatus": "unset", "User": "******", "JobGroup": "00049848", "UserGroup": "lhcb_mc", "metric": "WMSHistory" }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458130176, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'LCG.PIC.es', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'olupton', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458130176, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'olupton', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458130176, u'JobSplitType': u'MCStripping', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049845', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 34, u'timestamp': 1458141578, u'JobSplitType': u'DataStripping', u'MinorStatus': u'unset', u'Site': u'Group.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050299', u'UserGroup': u'lhcb_data', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 120, u'timestamp': 1458141578, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'LCG.CERN.ch', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'mvesteri', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458141578, u'JobSplitType': u'MCStripping', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049845', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 2, u'timestamp': 1458141578, u'JobSplitType': u'MCStripping', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049848', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458141578, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050286', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 95, u'timestamp': 1458199202, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'Multiple', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'mamartin', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 3, u'timestamp': 1458199202, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'Multiple', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'olupton', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 129, u'timestamp': 1458199202, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'Multiple', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049844', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 5, u'timestamp': 1458217812, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.IHEP.su', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050232', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 7, u'timestamp': 1458217812, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.IHEP.su', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050234', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458217812, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.IHEP.su', u'Reschedules': 1, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050236', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 3, u'timestamp': 1458217812, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.IHEP.su', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050238', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 2, u'timestamp': 1458217812, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.IHEP.su', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050248', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 12, u'timestamp': 1458218413, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050248', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 5, u'timestamp': 1458218413, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050250', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 4, u'timestamp': 1458218413, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050251', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458218413, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.CNAF.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050280', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 24, u'timestamp': 1458219012, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.NIKHEF.nl', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050248', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 3, u'timestamp': 1458219012, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.NIKHEF.nl', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050251', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458222013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.Bologna.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050303', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 7, u'timestamp': 1458222013, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'LCG.Bristol.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'clangenb', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 2, u'timestamp': 1458222013, u'JobSplitType': u'User', u'MinorStatus': u'unset', u'Site': u'LCG.Bristol.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'mrwillia', u'JobGroup': u'lhcb', u'UserGroup': u'lhcb_user', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458222013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.Bari.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050244', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 11, u'timestamp': 1458222013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.Bari.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050246', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 22, u'timestamp': 1458222013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.Bari.it', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050248', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 23, u'timestamp': 1458225013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.DESYZN.de', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049844', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 18, u'timestamp': 1458225013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.DESYZN.de', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00049847', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458225013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.DESYZN.de', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050238', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Running', 'Jobs': 1, u'timestamp': 1458225013, u'JobSplitType': u'MCSimulation', u'MinorStatus': u'unset', u'Site': u'LCG.DESYZN.de', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050246', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RRCKI.ru', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050243', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RRCKI.ru', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050251', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCStripping', u'MinorStatus': u'unset', u'Site': u'LCG.RRCKI.ru', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050256', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050229', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050241', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 1, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050243', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }, { u'Status': u'Waiting', 'Jobs': 2, u'timestamp': 1458226213, u'JobSplitType': u'MCReconstruction', u'MinorStatus': u'unset', u'Site': u'LCG.RAL.uk', u'Reschedules': 0, u'ApplicationStatus': u'unset', u'User': u'phicharp', u'JobGroup': u'00050247', u'UserGroup': u'lhcb_mc', u'metric': u'WMSHistory' }]
class StatesMonitoringAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ __summaryKeyFieldsMapping = [ 'Status', 'Site', 'User', 'UserGroup', 'JobGroup', 'JobType', 'ApplicationStatus', 'MinorStatus'] __summaryDefinedFields = [ ( 'ApplicationStatus', 'unset' ), ( 'MinorStatus', 'unset' ) ] __summaryValueFieldsMapping = [ 'Jobs', 'Reschedules'] __renameFieldsMapping = { 'JobType' : 'JobSplitType' } __jobDBFields = [] jobDB = None monitoringReporter = None reportPeriod = None def initialize( self ): """ Standard constructor """ self.jobDB = JobDB() self.reportPeriod = 120 self.am_setOption( "PollingTime", self.reportPeriod ) self.monitoringReporter = MonitoringReporter( monitoringType = "WMSHistory" ) for field in self.__summaryKeyFieldsMapping: if field == 'User': field = 'Owner' elif field == 'UserGroup': field = 'OwnerGroup' self.__jobDBFields.append( field ) return S_OK() def execute( self ): """ Main execution method """ result = gConfig.getSections( "/DIRAC/Setups" ) if not result[ 'OK' ]: return result validSetups = result[ 'Value' ] self.log.info( "Valid setups for this cycle are %s" % ", ".join( validSetups ) ) # Get the WMS Snapshot! result = self.jobDB.getSummarySnapshot( self.__jobDBFields ) now = Time.dateTime() if not result[ 'OK' ]: self.log.error( "Can't get the jobdb summary", result[ 'Message' ] ) else: values = result[ 'Value' ][1] self.log.info( "Start sending records!" ) for record in values: recordSetup = record[0] if recordSetup not in validSetups: self.log.error( "Setup %s is not valid" % recordSetup ) continue record = record[1:] rD = {} for fV in self.__summaryDefinedFields: rD[ fV[0] ] = fV[1] for iP in range( len( self.__summaryKeyFieldsMapping ) ): fieldName = self.__summaryKeyFieldsMapping[iP] rD[ self.__renameFieldsMapping.get( fieldName, fieldName ) ] = record[iP] record = record[ len( self.__summaryKeyFieldsMapping ): ] for iP in range( len( self.__summaryValueFieldsMapping ) ): rD[ self.__summaryValueFieldsMapping[iP] ] = int( record[iP] ) rD['timestamp'] = int( Time.toEpoch( now ) ) self.monitoringReporter.addRecord( rD ) retVal = self.monitoringReporter.commit() if retVal['OK']: self.log.info( "The records are successfully sent to the Store!" ) else: self.log.warn( "Faild to insert the records! It will be retried in the next iteration", retVal['Message'] ) return S_OK()