def initialize( self ):
  """ Agent initialization.

      The extensions MUST provide in the initialize method the following data members:
      - TransformationClient objects (self.transClient),
      - set the shifterProxy if different from the default one set here ('ProductionManager')
      - list of transformation types to be looked (self.transType)

      :return: S_OK
  """
  # Monitoring counter for tasks this agent submits automatically
  gMonitor.registerActivity( "SubmittedTasks", "Automatically submitted tasks",
                             "Transformation Monitoring", "Tasks", gMonitor.OP_ACUM )

  # Module path of the TaskManager plugin to load (overridable in the CS)
  self.pluginLocation = self.am_getOption( 'PluginLocation',
                                           'DIRAC.TransformationSystem.Client.TaskManagerPlugin' )

  # Default clients
  self.transClient = TransformationClient()

  # Bulk submission flag: when True tasks are submitted in bulk rather than one by one
  self.bulkSubmissionFlag = self.am_getOption( 'BulkSubmission', False )

  # setting up the threading: a fixed pool where every thread runs _execute
  # with its own index as argument
  maxNumberOfThreads = self.am_getOption( 'maxNumberOfThreads', 15 )
  threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads )
  self.log.verbose( "Multithreaded with %d threads" % maxNumberOfThreads )
  for i in xrange( maxNumberOfThreads ):
    threadPool.generateJobAndQueueIt( self._execute, [i] )

  return S_OK()
def initialize(self):
    """ Agent initialization.

        The extensions MUST provide in the initialize method the following data members:
        - TransformationClient objects (self.transClient),
        - set the shifterProxy if different from the default one set here ('ProductionManager')
        - list of transformation types to be looked (self.transType)

        :return: S_OK
    """
    # Counter of automatically submitted tasks, for the monitoring system
    gMonitor.registerActivity("SubmittedTasks", "Automatically submitted tasks",
                              "Transformation Monitoring", "Tasks", gMonitor.OP_ACUM)

    # Where to find the TaskManager plugin (configurable)
    self.pluginLocation = self.am_getOption('PluginLocation',
                                            'DIRAC.TransformationSystem.Client.TaskManagerPlugin')

    # Default clients
    self.transClient = TransformationClient()

    # Whether tasks are submitted in bulk
    self.bulkSubmissionFlag = self.am_getOption('BulkSubmission', False)

    # Spawn the worker pool: each worker executes _execute with its slot index
    nThreads = self.am_getOption('maxNumberOfThreads', 15)
    pool = ThreadPool(nThreads, nThreads)
    self.log.verbose("Multithreaded with %d threads" % nThreads)
    for slot in xrange(nThreads):
        pool.generateJobAndQueueIt(self._execute, [slot])

    return S_OK()
class MyProxyRenewalAgent(AgentModule):
  """ Agent that renews, via MyProxy, the proxies stored in the ProxyDB that
      are about to expire, using a small thread pool to parallelise renewals.
  """

  def initialize(self):
    """ Read lifetime options, create the ProxyDB client and the thread pool.

        :return: S_OK
    """
    requiredLifeTime = self.am_getOption( "MinimumLifeTime", 3600 )
    renewedLifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    # NOTE: the MyProxy server actually used is the one known to the ProxyDB
    # (logged below via getMyProxyServer()); the previously read
    # /DIRAC/VOPolicy/MyProxyServer CS value was never used and is removed.
    self.proxyDB = ProxyDB( requireVoms = True,
                            useMyProxy = True )

    gLogger.info( "Minimum Life time : %s" % requiredLifeTime )
    gLogger.info( "Life time on renew : %s" % renewedLifeTime )
    gLogger.info( "MyProxy server : %s" % self.proxyDB.getMyProxyServer() )
    gLogger.info( "MyProxy max proxy time : %s" % self.proxyDB.getMyProxyMaxLifeTime() )

    # Pool of 1 to 10 threads executing the renewals in parallel
    self.__threadPool = ThreadPool( 1, 10 )
    return S_OK()

  def __renewProxyForCredentials( self, userDN, userGroup ):
    """ Renew via MyProxy the proxy of userDN/userGroup.
        Runs inside a pool thread; outcome is only logged.
    """
    lifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    gLogger.info( "Renewing for %s@%s %s secs" % ( userDN, userGroup, lifeTime ) )
    retVal = self.proxyDB.renewFromMyProxy( userDN,
                                            userGroup,
                                            lifeTime = lifeTime )
    if not retVal[ 'OK' ]:
      gLogger.error( "Failed to renew for %s@%s : %s" % ( userDN, userGroup, retVal[ 'Message' ] ) )
    else:
      gLogger.info( "Renewed proxy for %s@%s" % ( userDN, userGroup ) )

  def __treatRenewalCallback( self, oTJ, exceptionList ):
    """ Exception callback for the thread pool: log the exception(s). """
    gLogger.exception( lException = exceptionList )

  def execute(self):
    """ The main agent execution method.

        Purges logs and expired requests/proxies, then queues one renewal
        job per credential about to expire and waits for all of them.

        :return: S_OK, or the error from getCredentialsAboutToExpire
    """
    self.proxyDB.purgeLogs()
    gLogger.info( "Purging expired requests" )
    retVal = self.proxyDB.purgeExpiredRequests()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s requests" % retVal[ 'Value' ] )
    gLogger.info( "Purging expired proxies" )
    retVal = self.proxyDB.purgeExpiredProxies()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s proxies" % retVal[ 'Value' ] )
    retVal = self.proxyDB.getCredentialsAboutToExpire( self.am_getOption( "MinimumLifeTime", 3600 ) )
    if not retVal[ 'OK' ]:
      return retVal
    data = retVal[ 'Value' ]
    gLogger.info( "Renewing %s proxies..." % len( data ) )
    for record in data:
      userDN = record[0]
      userGroup = record[1]
      self.__threadPool.generateJobAndQueueIt( self.__renewProxyForCredentials,
                                               args = ( userDN, userGroup ),
                                               oExceptionCallback = self.__treatRenewalCallback )
    # Block until all queued renewals have completed
    self.__threadPool.processAllResults()
    return S_OK()
class MyProxyRenewalAgent(AgentModule):
  """ Agent renewing through MyProxy the proxies stored in the ProxyDB
      that are close to expiration.
  """

  def initialize(self):
    # Lifetimes (seconds): below requiredLifeTime a proxy becomes a renewal
    # candidate; renewedLifeTime is the lifetime requested on renewal.
    requiredLifeTime = self.am_getOption( "MinimumLifeTime", 3600 )
    renewedLifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    self.proxyDB = ProxyDB( useMyProxy = True )
    gLogger.info( "Minimum Life time : %s" % requiredLifeTime )
    gLogger.info( "Life time on renew : %s" % renewedLifeTime )
    gLogger.info( "MyProxy server : %s" % self.proxyDB.getMyProxyServer() )
    gLogger.info( "MyProxy max proxy time : %s" % self.proxyDB.getMyProxyMaxLifeTime() )
    # Pool of 1 to 10 threads performing the renewals in parallel
    self.__threadPool = ThreadPool( 1, 10 )
    return S_OK()

  def __renewProxyForCredentials( self, userDN, userGroup ):
    # Executed in a pool thread: renew one credential via MyProxy; the
    # outcome is logged only, there is no return value consumed by callers.
    lifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    gLogger.info( "Renewing for %s@%s %s secs" % ( userDN, userGroup, lifeTime ) )
    retVal = self.proxyDB.renewFromMyProxy( userDN, userGroup, lifeTime = lifeTime )
    if not retVal[ 'OK' ]:
      gLogger.error( "Failed to renew proxy", "for %s@%s : %s" %( userDN, userGroup, retVal[ 'Message' ] ) )
    else:
      gLogger.info( "Renewed proxy for %s@%s" % ( userDN, userGroup ) )

  def __treatRenewalCallback( self, oTJ, exceptionList ):
    # Exception callback for the thread pool: just log the exception(s)
    gLogger.exception( lException = exceptionList )

  def execute(self):
    """ The main agent execution method.

        Purges logs and expired requests/proxies, then queues one renewal
        job per credential about to expire and waits for completion.
    """
    self.proxyDB.purgeLogs()
    gLogger.info( "Purging expired requests" )
    retVal = self.proxyDB.purgeExpiredRequests()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s requests" % retVal[ 'Value' ] )
    gLogger.info( "Purging expired proxies" )
    retVal = self.proxyDB.purgeExpiredProxies()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s proxies" % retVal[ 'Value' ] )
    retVal = self.proxyDB.getCredentialsAboutToExpire( self.am_getOption( "MinimumLifeTime" , 3600 ) )
    if not retVal[ 'OK' ]:
      return retVal
    data = retVal[ 'Value' ]
    gLogger.info( "Renewing %s proxies..." % len( data ) )
    # One renewal job per credential; exceptions in threads are routed to
    # the __treatRenewalCallback logger.
    for record in data:
      userDN = record[0]
      userGroup = record[1]
      self.__threadPool.generateJobAndQueueIt( self.__renewProxyForCredentials,
                                               args = ( userDN, userGroup ),
                                               oExceptionCallback = self.__treatRenewalCallback )
    # Block until all queued renewals have finished
    self.__threadPool.processAllResults()
    return S_OK()
def initialize(self): """ standard initialize """ # few parameters self.pluginLocation = self.am_getOption( 'PluginLocation', 'DIRAC.TransformationSystem.Agent.TransformationPlugin') self.transformationStatus = self.am_getOption( 'transformationStatus', ['Active', 'Completing', 'Flush']) # Prepare to change the name of the CS option as MaxFiles is ambiguous self.maxFiles = self.am_getOption('MaxFilesToProcess', self.am_getOption('MaxFiles', 5000)) agentTSTypes = self.am_getOption('TransformationTypes', []) if agentTSTypes: self.transformationTypes = sorted(agentTSTypes) else: dataProc = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge']) dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal']) self.transformationTypes = sorted(dataProc + dataManip) # clients self.transfClient = TransformationClient() # for caching using a pickle file self.workDirectory = self.am_getWorkDirectory() self.cacheFile = os.path.join(self.workDirectory, 'ReplicaCache.pkl') self.controlDirectory = self.am_getControlDirectory() # remember the offset if any in TS self.lastFileOffset = {} # Validity of the cache self.replicaCache = {} self.replicaCacheValidity = self.am_getOption('ReplicaCacheValidity', 2) self.noUnusedDelay = self.am_getOption('NoUnusedDelay', 6) # Get it threaded maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1) threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads) self.log.info("Multithreaded with %d threads" % maxNumberOfThreads) for i in xrange(maxNumberOfThreads): threadPool.generateJobAndQueueIt(self._execute, [i]) self.log.info("Will treat the following transformation types: %s" % str(self.transformationTypes)) return S_OK()
class SystemAdministratorIntegrator:
    """ Broadcasts a SystemAdministrator RPC call to a set of hosts and
        collects the per-host results into a single dictionary.
    """

    def __init__(self, **kwargs):
        """ Constructor """
        # Explicit host list takes precedence; otherwise ask the Registry
        if 'hosts' in kwargs:
            self.__hosts = kwargs['hosts']
            del kwargs['hosts']
        else:
            result = Registry.getHosts()
            if result['OK']:
                self.__hosts = result['Value']
            else:
                self.__hosts = []
        # Remaining kwargs are forwarded verbatim to each client
        self.__kwargs = dict(kwargs)
        # NOTE(review): with an empty host list this creates ThreadPool(0) —
        # confirm the ThreadPool implementation tolerates zero threads.
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def __getattr__(self, name):
        # Any unknown attribute is treated as an RPC method name: remember it
        # and hand back execute(), which performs the actual broadcast.
        # NOTE(review): this stores state on attribute lookup, so the object
        # is not safe for concurrent use from several threads — confirm.
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *parms, **kwargs):
        """ Execute RPC method on a given host """
        hostName = Registry.getHostOption(host, 'Host', host)
        client = SystemAdministratorClient(hostName, **self.__kwargs)
        result = getattr(client, method)(*parms, **kwargs)
        # Tag the result so the callback can attribute it to the right host
        result['Host'] = host
        return result

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        host = result['Host']
        del result['Host']
        self.__resultDict[host] = result

    def execute(self, *args, **kwargs):
        """ Main execution method """
        self.__resultDict = {}
        # Fan the call out to every host; __processResult gathers the answers
        for host in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[host, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        # Wait for all queued calls before returning the aggregate
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
class SystemAdministratorIntegrator:
    """ Fans a SystemAdministrator RPC call out to many hosts and merges the
        individual replies into one host-keyed dictionary.
    """

    def __init__(self, **kwargs):
        """ Constructor """
        # An explicit 'hosts' argument wins; otherwise query the Registry
        if 'hosts' in kwargs:
            self.__hosts = kwargs.pop('hosts')
        else:
            hostsLookup = Registry.getHosts()
            self.__hosts = hostsLookup['Value'] if hostsLookup['OK'] else []
        self.__kwargs = dict(kwargs)
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def __getattr__(self, name):
        # Record the RPC method being requested; execute() does the broadcast
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *rpcArgs, **rpcKwargs):
        """ Execute RPC method on a given host """
        hostName = Registry.getHostOption(host, 'Host', host)
        client = SystemAdministratorClient(hostName, **self.__kwargs)
        result = getattr(client, method)(*rpcArgs, **rpcKwargs)
        result['Host'] = host
        return result

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        host = result.pop('Host')
        self.__resultDict[host] = result

    def execute(self, *args, **kwargs):
        """ Main execution method """
        self.__resultDict = {}
        for host in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[host, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
def _updateServiceConfiguration(self, urlSet, fromMaster=False):
    """ Update configuration in a set of service in parallel

    :param set urlSet: a set of service URLs
    :param fromMaster: flag to force updating from the master CS
    :return: Nothing
    """
    workerPool = ThreadPool(len(urlSet))
    for serviceURL in urlSet:
        workerPool.generateJobAndQueueIt(self._forceServiceUpdate,
                                         args=[serviceURL, fromMaster],
                                         kwargs={},
                                         oCallback=self.__processResults)
    # Wait until every service has been contacted
    workerPool.processAllResults()
def __updateServiceConfiguration(self, urlSet, fromMaster=False):
    """ Update configuration in a set of service in parallel

    :param set urlSet: a set of service URLs
    :param fromMaster: flag to force updating from the master CS
    :return: S_OK/S_ERROR, Value Successful/Failed dict with service URLs
    """
    workerPool = ThreadPool(len(urlSet))
    for serviceURL in urlSet:
        workerPool.generateJobAndQueueIt(self.__forceServiceUpdate,
                                         args=[serviceURL, fromMaster],
                                         kwargs={},
                                         oCallback=self.__processResults)
    # Block until every update attempt has reported back
    workerPool.processAllResults()
    return S_OK(self.__updateResultDict)
def initialize(self):
    """ standard initialize

        Reads configuration options, creates the TransformationClient,
        sets up replica-cache bookkeeping and launches the worker threads.

        :return: S_OK
    """
    # Plugin module and which transformation statuses are eligible
    self.pluginLocation = self.am_getOption('PluginLocation',
                                            'DIRAC.TransformationSystem.Agent.TransformationPlugin')
    self.transformationStatus = self.am_getOption('transformationStatus',
                                                  ['Active', 'Completing', 'Flush'])
    # MaxFilesToProcess supersedes the ambiguous MaxFiles option, which
    # remains as the fallback default
    self.maxFiles = self.am_getOption('MaxFilesToProcess',
                                      self.am_getOption('MaxFiles', 5000))

    configuredTypes = self.am_getOption('TransformationTypes', [])
    if configuredTypes:
        self.transformationTypes = sorted(configuredTypes)
    else:
        # Nothing configured: handle both processing and manipulation types
        processingTypes = Operations().getValue('Transformations/DataProcessing',
                                                ['MCSimulation', 'Merge'])
        manipulationTypes = Operations().getValue('Transformations/DataManipulation',
                                                  ['Replication', 'Removal'])
        self.transformationTypes = sorted(processingTypes + manipulationTypes)

    # Client used to talk to the Transformation System
    self.transfClient = TransformationClient()

    # Replica cache persisted as a pickle in the agent work directory
    self.workDirectory = self.am_getWorkDirectory()
    self.cacheFile = os.path.join(self.workDirectory, 'ReplicaCache.pkl')
    self.controlDirectory = self.am_getControlDirectory()

    # Per-transformation file offsets remembered between cycles
    self.lastFileOffset = {}

    # Cache container and its validity options
    self.replicaCache = {}
    self.replicaCacheValidity = self.am_getOption('ReplicaCacheValidity', 2)
    self.noUnusedDelay = self.am_getOption('NoUnusedDelay', 6)

    # Launch the worker pool; every worker runs _execute with its slot index
    poolSize = self.am_getOption('maxThreadsInPool', 1)
    pool = ThreadPool(poolSize, poolSize)
    self.log.info("Multithreaded with %d threads" % poolSize)
    for slot in xrange(poolSize):
        pool.generateJobAndQueueIt(self._execute, [slot])

    self.log.info("Will treat the following transformation types: %s" % str(self.transformationTypes))

    return S_OK()
def initialize( self ): """ standard initialize """ # few parameters self.pluginLocation = self.am_getOption( 'PluginLocation', 'DIRAC.TransformationSystem.Agent.TransformationPlugin' ) self.transformationStatus = self.am_getOption( 'transformationStatus', ['Active', 'Completing', 'Flush'] ) self.maxFiles = self.am_getOption( 'MaxFiles', 5000 ) agentTSTypes = self.am_getOption( 'TransformationTypes', [] ) if agentTSTypes: self.transformationTypes = sorted( agentTSTypes ) else: dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] ) self.transformationTypes = sorted( dataProc + dataManip ) # clients self.transfClient = TransformationClient() # shifter self.am_setOption( 'shifterProxy', 'ProductionManager' ) # for caching using a pickle file self.__readCache() self.workDirectory = self.am_getWorkDirectory() self.cacheFile = os.path.join( self.workDirectory, 'ReplicaCache.pkl' ) self.controlDirectory = self.am_getControlDirectory() self.replicaCacheValidity = self.am_getOption( 'ReplicaCacheValidity', 2 ) self.noUnusedDelay = self.am_getOption( 'NoUnusedDelay', 6 ) self.dateWriteCache = datetime.datetime.utcnow() # Get it threaded maxNumberOfThreads = self.am_getOption( 'maxThreadsInPool', 1 ) threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads ) self.log.info( "Multithreaded with %d threads" % maxNumberOfThreads ) for i in xrange( maxNumberOfThreads ): threadPool.generateJobAndQueueIt( self._execute, [i] ) self.log.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) ) return S_OK()
def initialize( self ):
  """ standard initialize

      Restores the cache from disk, configures the shifter proxy and starts
      the worker threads.

      :return: S_OK
  """
  self.__readCache()
  self.dateWriteCache = datetime.datetime.utcnow()

  self.am_setOption( 'shifterProxy', 'ProductionManager' )

  # Get it threaded: one _execute() loop per worker slot
  poolSize = self.am_getOption( 'maxThreadsInPool', 1 )
  workerPool = ThreadPool( poolSize, poolSize )
  self.log.info( "Multithreaded with %d threads" % poolSize )
  for slot in xrange( poolSize ):
    workerPool.generateJobAndQueueIt( self._execute, [slot] )

  return S_OK()
def initialize(self):
    """Sets default parameters

       Creates the job DB clients, fixes the polling time and starts the
       worker thread pool.

       :return: S_OK
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    # Poll once per hour
    self.am_setOption('PollingTime', 60 * 60)

    if not self.am_getOption('Enable', True):
        self.log.info('Stalled Job Agent running in disabled mode')

    # setting up the threading: workers all run the same _execute loop
    numThreads = self.am_getOption('MaxNumberOfThreads', 15)
    pool = ThreadPool(numThreads, numThreads)
    self.log.verbose("Multithreaded with %d threads" % numThreads)
    for _ in range(numThreads):
        pool.generateJobAndQueueIt(self._execute)

    return S_OK()
def initialize(self):
    """ Make the necessary initializations.

        The ThreadPool is created here, the _execute() method is what each
        thread will execute. The time log and pending BK queries are restored
        from a pickle file if one exists.

        :return: S_OK
    """
    self.fullUpdatePeriod = self.am_getOption('FullUpdatePeriod', self.fullUpdatePeriod)
    self.bkUpdateLatency = self.am_getOption('BKUpdateLatency', self.bkUpdateLatency)
    self.debug = self.am_getOption('verbose', self.debug)
    self.pickleFile = os.path.join(self.am_getWorkDirectory(), self.pickleFile)
    self.chunkSize = self.am_getOption('maxFilesPerChunk', self.chunkSize)
    self.pluginsWithNoRunInfo = Operations().getValue('TransformationPlugins/PluginsWithNoRunInfo',
                                                      self.pluginsWithNoRunInfo)

    self._logInfo('Full Update Period: %d seconds' % self.fullUpdatePeriod)
    self._logInfo('BK update latency : %d seconds' % self.bkUpdateLatency)
    self._logInfo('Plugins with no run info: %s' % ', '.join(self.pluginsWithNoRunInfo))

    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()

    try:
        # Pickle files must be opened in binary mode: text mode ('r') corrupts
        # the stream on platforms with newline translation and fails on Python 3.
        with open(self.pickleFile, 'rb') as pf:
            self.timeLog = pickle.load(pf)
            self.fullTimeLog = pickle.load(pf)
            self.bkQueries = pickle.load(pf)
        self._logInfo("successfully loaded Log from", self.pickleFile, "initialize")
    except (EOFError, IOError, pickle.UnpicklingError):
        # Missing, truncated or corrupted cache: start with empty state
        self._logInfo("failed loading Log from", self.pickleFile, "initialize")
        self.timeLog = {}
        self.fullTimeLog = {}
        self.bkQueries = {}

    # Worker pool: each thread runs _execute with its slot index
    maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
    for i in xrange(maxNumberOfThreads):
        threadPool.generateJobAndQueueIt(self._execute, [i])

    gMonitor.registerActivity("Iteration", "Agent Loops", AGENT_NAME, "Loops/min", gMonitor.OP_SUM)
    return S_OK()
def initialize( self ):
  """ standard initialize

      Restores the cache, configures the shifter proxy and launches the
      worker threads.

      :return: S_OK
  """
  self.__readCache()
  self.dateWriteCache = datetime.datetime.utcnow()

  self.am_setOption( 'shifterProxy', 'ProductionManager' )

  # Get it threaded: every worker runs _execute with its slot index
  poolSize = self.am_getOption( 'maxThreadsInPool', 1 )
  workerPool = ThreadPool( poolSize, poolSize )
  self.log.info( "Multithreaded with %d threads" % poolSize )
  for slot in xrange( poolSize ):
    workerPool.generateJobAndQueueIt( self._execute, [slot] )

  self.log.info( "Will treat the following transformation types: %s" % str( self.transformationTypes ) )

  return S_OK()
def __init__( self, **kwargs ):
  """ Constructor

      :param list hosts: optional explicit list of hosts to address;
                         defaults to all hosts known to the Registry
      :param list exclude: optional hosts to leave out
  """
  if 'hosts' in kwargs:
    self.__hosts = kwargs['hosts']
    del kwargs['hosts']
  else:
    result = Registry.getHosts()
    if result['OK']:
      self.__hosts = result['Value']
    else:
      self.__hosts = []

  # Excluded hosts
  # NOTE(review): unlike 'hosts', the 'exclude' key is not deleted from
  # kwargs, so it ends up in self.__kwargs and is forwarded to every
  # SystemAdministratorClient below — confirm this is intended.
  if 'exclude' in kwargs:
    self.__hosts = list ( set( self.__hosts ) - set( kwargs[ 'exclude' ] ) )

  # Ping the hosts to remove those that don't have a SystemAdministrator service
  sysAdminHosts = []
  self.silentHosts = []
  self.__resultDict = {}
  self.__kwargs = {}
  # NOTE(review): an empty host list yields ThreadPool(0) — confirm the
  # ThreadPool implementation tolerates zero threads.
  pool = ThreadPool( len( self.__hosts ) )
  for host in self.__hosts:
    pool.generateJobAndQueueIt( self.__executeClient,
                                args = [ host, "ping" ],
                                kwargs = {},
                                oCallback = self.__processResult )
  pool.processAllResults()
  # Classify hosts: responders stay, the rest are remembered as silent
  for host, result in self.__resultDict.items():
    if result['OK']:
      sysAdminHosts.append( host )
    else:
      self.silentHosts.append( host )
  del pool

  self.__hosts = sysAdminHosts
  self.__kwargs = dict( kwargs )
  # Fresh pool sized to the responding hosts only
  self.__pool = ThreadPool( len( self.__hosts ) )
  self.__resultDict = {}
def __init__(self, **kwargs):
    """ Constructor

        :param list hosts: optional explicit list of hosts to address;
                           defaults to all hosts known to the Registry
        :param list exclude: optional hosts to leave out

        Any remaining keyword arguments are forwarded verbatim to each
        SystemAdministratorClient.
    """
    if 'hosts' in kwargs:
        self.__hosts = kwargs['hosts']
        del kwargs['hosts']
    else:
        result = Registry.getHosts()
        if result['OK']:
            self.__hosts = result['Value']
        else:
            self.__hosts = []

    # Excluded hosts
    if 'exclude' in kwargs:
        self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
        # 'exclude' is consumed here: remove it so it is not forwarded to the
        # SystemAdministratorClient constructor (mirrors the 'hosts' handling)
        del kwargs['exclude']

    # Ping the hosts to remove those that don't have a SystemAdministrator service
    sysAdminHosts = []
    self.silentHosts = []
    self.__resultDict = {}
    self.__kwargs = {}
    pool = ThreadPool(len(self.__hosts))
    for host in self.__hosts:
        pool.generateJobAndQueueIt(self.__executeClient,
                                   args=[host, "ping"],
                                   kwargs={},
                                   oCallback=self.__processResult)
    pool.processAllResults()
    # Responders are kept; the rest are remembered as silent hosts
    for host, result in self.__resultDict.items():
        if result['OK']:
            sysAdminHosts.append(host)
        else:
            self.silentHosts.append(host)
    del pool

    self.__hosts = sysAdminHosts
    self.__kwargs = dict(kwargs)
    # Fresh pool sized to the responding hosts only
    self.__pool = ThreadPool(len(self.__hosts))
    self.__resultDict = {}
class SeSInspectorAgent(AgentModule):
  """ Class SeSInspectorAgent is in charge of going through Services table,
      and pass Service and Status to the PEP
  """

#############################################################################

  def initialize(self):
    """ Standard constructor """
    try:
      self.rsDB = ResourceStatusDB()
      self.rmDB = ResourceManagementDB()
      # Queue of services waiting to be checked, and the list of names
      # currently in flight (used for de-duplication in execute())
      self.ServicesToBeChecked = Queue.Queue()
      self.ServiceNamesInCheck = []
      self.maxNumberOfThreads = self.am_getOption( 'maxThreadsInPool', 1 )
      self.threadPool = ThreadPool( self.maxNumberOfThreads,
                                    self.maxNumberOfThreads )
      if not self.threadPool:
        self.log.error('Can not create Thread Pool')
        return S_ERROR('Can not create Thread Pool')
      self.setup = getSetup()['Value']
      # Import the (possibly VO-extended) policy configuration module
      self.VOExtension = getExt()
      configModule = __import__(self.VOExtension+"DIRAC.ResourceStatusSystem.Policy.Configurations",
                                globals(), locals(), ['*'])
      self.Services_check_freq = copy.deepcopy(configModule.Services_check_freq)
      self.nc = NotificationClient()
      self.diracAdmin = DiracAdmin()
      self.csAPI = CSAPI()
      # Each worker thread loops in _executeCheck
      for i in xrange(self.maxNumberOfThreads):
        self.threadPool.generateJobAndQueueIt(self._executeCheck, args = (None, ) )
      return S_OK()
    except Exception:
      errorStr = "SeSInspectorAgent initialization"
      gLogger.exception(errorStr)
      return S_ERROR(errorStr)

#############################################################################

  def execute(self):
    """ The main SSInspectorAgent execution method.

        Calls :meth:`DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.getResourcesToCheck`
        and put result in self.ServicesToBeChecked (a Queue) and in
        self.ServiceNamesInCheck (a list)
    """
    try:
      res = self.rsDB.getStuffToCheck('Services', self.Services_check_freq)
      for resourceTuple in res:
        # NOTE(review): 'break' aborts the whole scan as soon as one service
        # is already in check; 'continue' (skip just this one) looks like the
        # intended behaviour — confirm before changing.
        if resourceTuple[0] in self.ServiceNamesInCheck:
          break
        # Prepend the granularity tag before the tuple fields
        resourceL = ['Service']
        for x in resourceTuple:
          resourceL.append(x)
        self.ServiceNamesInCheck.insert(0, resourceL[1])
        self.ServicesToBeChecked.put(resourceL)
      return S_OK()
    except Exception, x:
      errorStr = where(self, self.execute)
      gLogger.exception(errorStr,lException=x)
      return S_ERROR(errorStr)
class SeSInspectorAgent( AgentModule ):
  '''
    The SeSInspector agent ( ServiceInspectorAgent ) is one of the four
    InspectorAgents of the RSS.

    This Agent takes care of the Service. In order to do so, it gathers
    the eligible ones and then evaluates their statuses with the PEP.

    If you want to know more about the SeSInspectorAgent, scroll down to the
    end of the file.
  '''

  # Too many public methods
  # pylint: disable-msg=R0904

  def initialize( self ):
    ''' Create the RSS client, the checking-frequency map, the work queue
        and the pool of checker threads.

        :return: S_OK, or S_ERROR on any failure
    '''
    # Attribute defined outside __init__
    # pylint: disable-msg=W0201

    try:
      self.rsClient = ResourceStatusClient()
      self.servicesFreqs = CS.getTypedDictRootedAtOperations( 'CheckingFreqs/ServicesFreqs' )
      # Work queue consumed by the checker threads
      self.queue = Queue.Queue()

      self.maxNumberOfThreads = self.am_getOption( 'maxThreadsInPool', 1 )
      self.threadPool = ThreadPool( self.maxNumberOfThreads,
                                    self.maxNumberOfThreads )

      if not self.threadPool:
        self.log.error( 'Can not create Thread Pool' )
        return S_ERROR( 'Can not create Thread Pool' )

      # Each worker thread loops in _executeCheck
      for _i in xrange( self.maxNumberOfThreads ):
        self.threadPool.generateJobAndQueueIt( self._executeCheck )

      return S_OK()

    except Exception:
      errorStr = "SeSInspectorAgent initialization"
      self.log.exception( errorStr )
      return S_ERROR( errorStr )

  def execute( self ):
    ''' Gather the services eligible for a check and enqueue them for the
        checker threads.

        :return: S_OK, the error from getStuffToCheck, or S_ERROR on exception
    '''
    try:
      kwargs = { 'meta' : {} }
      kwargs['meta']['columns'] = [ 'ServiceName', 'StatusType', 'Status',
                                    'FormerStatus', 'SiteType', 'ServiceType',
                                    'TokenOwner' ]
      kwargs[ 'tokenOwner' ] = 'RS_SVC'

      resQuery = self.rsClient.getStuffToCheck( 'Service', self.servicesFreqs, **kwargs )
      if not resQuery[ 'OK' ]:
        self.log.error( resQuery[ 'Message' ] )
        return resQuery

      resQuery = resQuery[ 'Value' ]
      self.log.info( 'Found %d candidates to be checked.' % len( resQuery ) )

      for service in resQuery:
        resourceL = [ 'Service' ] + service
        # Here we peek INSIDE the Queue to know if the item is already
        # here. It's ok _here_ since (i.e. I know what I'm doing):
        # - It is a read only operation.
        # - We do not need exact accuracy, it's ok to have 2 times the same
        #   item in the queue sometimes.
        if resourceL not in self.queue.queue:
          self.queue.put( resourceL )

      return S_OK()

    except Exception, x:
      errorStr = where( self, self.execute )
      self.log.exception( errorStr, lException = x )
      return S_ERROR( errorStr )
class Service(object):
    """ A DISET service: builds its URL, loads the request handler, discovers
        the handler's exported actions and serves incoming client connections
        from a thread pool.
    """

    # Maps each action type to the handler-method prefix implementing it.
    # 'Connection' is a meta action: it refers to 'Message' (see _isMetaAction).
    SVC_VALID_ACTIONS = {
        'RPC': 'export',
        'FileTransfer': 'transfer',
        'Message': 'msg',
        'Connection': 'Message'
    }
    # Security-log client shared by every Service instance in the process
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """ Init the variables for the service

            :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj.  e.g.:
              {'modName': 'Framework/serviceName',
               'standalone': True,
               'loadName': 'Framework/serviceName',
               'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
               'classObj': <class 'serviceNameHandler.serviceHandler'>}

              Standalone is true if there is only one service started
              If it's false, every service is linked to a different MonitoringClient
        """
        self._svcData = serviceData
        self._name = serviceData['modName']
        self._startTime = Time.dateTime()
        # Both the module name and the load name are accepted in client proposals
        self._validNames = [serviceData['modName']]
        if serviceData['loadName'] not in self._validNames:
            self._validNames.append(serviceData['loadName'])
        self._cfg = ServiceConfiguration(list(self._validNames))
        if serviceData['standalone']:
            self._monitor = gMonitor
        else:
            self._monitor = MonitoringClient()
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {'queries': 0, 'connections': 0}
        self._authMgr = AuthManager(
            "%s/Authorization" % PathFinder.getServiceSection(serviceData['loadName']))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        # Highest client-socket file descriptor seen since the last report
        self.__maxFD = 0

    def setCloneProcessId(self, cloneId):
        """ Record the clone id and rename the monitoring component accordingly. """
        self.__cloneId = cloneId
        self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

    def _isMetaAction(self, action):
        """ Return the concrete action a meta action refers to, or False if
            *action* is itself concrete.
        """
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        """ Full service initialization: URL, handler, lock manager,
            monitoring, thread pool, message broker, handler init functions
            and action discovery.

            :return: S_OK / S_ERROR
        """
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._initMonitoring()
        # TODO: remove ThreadPool
        if useThreadPoolExecutor:
            self._threadPool = ThreadPoolExecutor(
                max(0, self._cfg.getMaxThreads()))
        else:
            self._threadPool = ThreadPool(max(1, self._cfg.getMinThreads()),
                                          max(0, self._cfg.getMaxThreads()),
                                          self._cfg.getMaxWaitingPetitions())
            self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                        threadPool=self._threadPool)
        # Create static dict shared (copied) into every handler instance
        self._serviceInfoDict = {
            'serviceName': self._name,
            'serviceSectionPath': PathFinder.getServiceSection(self._name),
            'URL': self._cfg.getURL(),
            'messageSender': MessageSender(self._name, self._msgBroker),
            'validNames': self._validNames,
            'csPaths': [
                PathFinder.getServiceSection(svcName)
                for svcName in self._validNames
            ]
        }
        # Call static initialization function
        try:
            self._handler['class']._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self._monitor)
            if self._handler['init']:
                for initFunc in self._handler['init']:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception(
                            "Exception while calling initialization function",
                            lException=excp)
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result['OK']:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result['Message']))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result['OK']:
            return result
        self._actions = result['Value']

        # Periodically publish thread-pool occupancy to the monitor
        gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)

        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
        """ Recursively collect the 'initializeHandler' functions defined along
            the handler class hierarchy, ancestors first.
        """
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if 'initializeHandler' in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            # Bind the ancestor's initializeHandler through the MRO
            initFuncs.append(
                super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and 'initializeHandler' in dir(
                handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs

    def _loadHandlerInit(self):
        """ Gather the handler class, its module and all of its initialization
            methods (class-level ones plus the optional module-level
            'initialize<HandlerName>' function).

            :return: S_OK with dict keys: name, module, class, init
        """
        handlerClass = self._svcData['classObj']
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(
                getattr(self._svcData['moduleObj'],
                        "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose(
                "Not found global initialization function for service")
        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" %
                         len(handlerInitMethods))
        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData['moduleObj']
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods
        return S_OK(handlerInfo)

    def _loadActions(self):
        """ Inspect the handler class for exported methods (attributes named
            '<prefix>_<method>'), their type definitions and auth rules.

            :return: S_OK with {'methods': ..., 'auth': ..., 'types': ...}
        """
        handlerClass = self._handler['class']
        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if attribute.find(methodPrefix) != 0:
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" % (actionType,
                                                          exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName))
                # Look for type and auth rules
                if actionType == 'RPC':
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" %
                                    (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" %
                                    (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj
        # Meta actions inherit the union of the refered action's auth props
        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" %
                            (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" %
                            (actionType, authRules[actionType]))
        return S_OK({
            'methods': methodsList,
            'auth': authRules,
            'types': typeCheck
        })

    def _initMonitoring(self):
        """ Register this service's monitoring activities and static metadata
            (version, platform, start time) with the monitor client.
        """
        # Init extra bits of monitoring
        self._monitor.setComponentType(MonitoringClient.COMPONENT_SERVICE)
        self._monitor.setComponentName(self._name)
        self._monitor.setComponentLocation(self._cfg.getURL())
        self._monitor.initialize()
        self._monitor.registerActivity("Connections", "Connections received",
                                       "Framework", "connections",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity("Queries", "Queries served",
                                       "Framework", "queries",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity('CPU', "CPU Usage", 'Framework',
                                       "CPU,%", MonitoringClient.OP_MEAN, 600)
        self._monitor.registerActivity('MEM', "Memory Usage", 'Framework',
                                       'Memory,MB',
                                       MonitoringClient.OP_MEAN, 600)
        self._monitor.registerActivity('PendingQueries', "Pending queries",
                                       'Framework', 'queries',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('ActiveQueries', "Active queries",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('RunningThreads', "Running threads",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('MaxFD', "Max File Descriptors",
                                       'Framework', 'fd',
                                       MonitoringClient.OP_MEAN)
        self._monitor.setComponentExtraParam('DIRACVersion', DIRAC.version)
        self._monitor.setComponentExtraParam('platform', DIRAC.getPlatform())
        self._monitor.setComponentExtraParam('startTime', Time.dateTime())
        # Publish the handler module's version and docstring, if present
        for prop in (("__RCSID__", "version"), ("__doc__", "description")):
            try:
                value = getattr(self._handler['module'], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = 'unset'
            self._monitor.setComponentExtraParam(prop[1], value)
        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" % (self._name,
                                                        secondaryName))
            self._validNames.append(secondaryName)
        return S_OK()

    def __reportThreadPoolContents(self):
        """ Periodic task: push thread-pool occupancy and max-FD marks to the
            monitor, then reset the FD high-water mark.
        """
        # TODO: remove later
        if useThreadPoolExecutor:
            pendingQueries = self._threadPool._work_queue.qsize()
            activeQuereies = len(self._threadPool._threads)
        else:
            pendingQueries = self._threadPool.pendingJobs()
            activeQuereies = self._threadPool.numWorkingThreads()

        self._monitor.addMark('PendingQueries', pendingQueries)
        self._monitor.addMark('ActiveQueries', activeQuereies)
        self._monitor.addMark('RunningThreads', threading.activeCount())
        self._monitor.addMark('MaxFD', self.__maxFD)
        self.__maxFD = 0

    def getConfig(self):
        """ Return the ServiceConfiguration object. """
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """ This method may be called by ServiceReactor.
            The method stacks opened connection in a queue, another thread
            reads this queue and handles the connection.

            :param clientTransport: Object which describes the opened
                                    connection (PlainTransport or SSLTransport)
        """
        self._stats['connections'] += 1
        # NOTE(review): this publishes the *connection* count under the
        # 'queries' extra param -- confirm this naming is intended.
        self._monitor.setComponentExtraParam('queries',
                                             self._stats['connections'])
        # TODO: remove later
        if useThreadPoolExecutor:
            self._threadPool.submit(self._processInThread, clientTransport)
        else:
            self._threadPool.generateJobAndQueueIt(self._processInThread,
                                                   args=(clientTransport, ))

    # Threaded process function
    def _processInThread(self, clientTransport):
        """ This method handles a RPC, FileTransfer or Connection.
            Connection may be opened via ServiceReactor.__acceptIncomingConnection

            - Do the SSL/TLS Handshake (if dips is used) and extract credentials
            - Get the action called by the client
            - Check if the client is authorized to perform action
              - If not, connection is closed
            - Instantiate the RequestHandler (RequestHandler contains all
              methods callable)

            (Following is not directly in this method but it describes what
            happens at #Execute the action)

            - Notify the client we're ready to execute the action (via
              _processProposal) and call RequestHandler._rh_executeAction()
            - Receive arguments/file/something else (depending on action) in
              the RequestHandler
            - Execute the action asked by the client

            :param clientTransport: Object describing the opened connection
                                    (SSLTransport or PlainTransport)

            :return: S_OK with "closeTransport" a boolean to indicate if the
                     connection has to be closed, e.g. after RPC,
                     closeTransport=True
        """
        # Track the largest FD seen, reported by __reportThreadPoolContents
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result['OK']:
                    clientTransport.close()
                    return
            except BaseException:
                # Failed handshake: drop the connection silently
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result['Value']
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result['Value']
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result['closeTransport'] or not result['OK']:
                if not result['OK']:
                    gLogger.error("Error processing proposal",
                                  result['Message'])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(*monReport)

    def _createIdentityString(self, credDict, clientTransport=None):
        """ Build a human-readable "[user:group]@{host:port}(DN)" identity
            string from the credentials dict, for logging purposes.
        """
        if 'username' in credDict:
            if 'group' in credDict:
                identity = "[%s:%s]" % (credDict['username'],
                                        credDict['group'])
            else:
                identity = "[%s:unknown]" % credDict['username']
        else:
            identity = 'unknown'
        if clientTransport:
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if 'DN' in credDict:
            identity += "(%s)" % credDict['DN']
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """ We receive the proposalTuple as a list.
            Turn it into a tuple again
        """
        proposalTuple = tuple(
            tuple(x) if isinstance(x, list) else x
            for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        """ Read the client's action proposal from the transport, validate the
            requested service/action and authorize it.

            :param trid: transport ID in the transport pool
            :return: S_OK(proposalTuple) / S_ERROR
        """
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal['OK']:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal['Message']))
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal['Value'])
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" %
                           requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result['OK']:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        """ Check the client credentials against the CS auth rules and the
            hardcoded rules of the handler; log the outcome to the security log.

            :param actionTuple: (actionType, method) tuple from the proposal
            :param trid: transport ID
            :param credDict: peer credentials
            :return: S_OK / S_ERROR
        """
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions['auth'][actionTuple[0]]
        else:
            if actionTuple[0] == 'RPC':
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions['auth']:
                hardcodedRulesByType = self._actions['auth'][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    # FileTransfer methods are stored lowerCamelCased
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]
                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict,
                                       hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = '/'.join(
                    [str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query", "to %s:%s by %s from %s" %
                (self._name, "/".join(actionTuple), identity, fromHost))
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()
        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        Service.SVC_SECLOG_CLIENT.addMessage(result['OK'], sourceAddress[0],
                                             sourceAddress[1], identity,
                                             self._cfg.getHostname(),
                                             self._cfg.getPort(), self._name,
                                             "/".join(actionTuple))
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """ Generate an instance of the handler for a given service

            :param int trid: transport ID
            :param tuple proposalTuple: tuple describing the proposed action

            :return: S_OK/S_ERROR, Value is the handler object
        """
        # Generate the client params
        clientParams = {'serviceStartTime': self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams['clientVersion'] = proposalTuple[3] if len(
                proposalTuple) > 3 else None
            clientParams['clientSetup'] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams['clientVO'] = gConfig.getValue(
                    "/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams['clientVO'] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams['clientAddress'] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler['class'](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" %
                              str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        """ Acknowledge the proposal to the client, register message
            connections with the broker and execute the requested action.

            :return: the action result, extended with 'closeTransport'
        """
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal['OK']:
            return retVal
        messageConnection = False
        if proposalTuple[1] == ('Connection', 'new'):
            messageConnection = True
        if messageConnection:
            if self._msgBroker.getNumConnections(
            ) > self._cfg.getMaxMessagingConnections():
                result = S_ERROR(
                    "Maximum number of connections reached. Try later")
                result['closeTransport'] = True
                return result
            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False)
        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result['OK'] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result['OK']:
                self._msgBroker.removeTransport(trid)
        # Message connections stay open on success; everything else closes
        result['closeTransport'] = not messageConnection or not result['OK']
        return result

    def _mbConnect(self, trid, handlerObj=None):
        """ Run the handler's 'connected' connection callback, instantiating a
            handler first if none was given.
        """
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result['OK']:
                return result
            handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('connected')

    def _executeAction(self, trid, proposalTuple, handlerObj):
        """ Delegate the proposal execution to the handler, shielding the
            service from handler exceptions.
        """
        try:
            return handlerObj._rh_executeAction(proposalTuple)
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        """ Message-broker callback: authorize the message and dispatch it to a
            freshly instantiated handler.
        """
        result = self._authorizeProposal(
            ('Message', msgObj.getName()), trid,
            self._transportPool.get(trid).getConnectingCredentials())
        if not result['OK']:
            return result
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeMessageCallback(msgObj)

    def _mbDisconnect(self, trid):
        """ Message-broker callback: run the handler's 'drop' connection
            callback when a message connection disappears.
        """
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('drop')

    def __startReportToMonitoring(self):
        """ Mark a query and snapshot wall-clock / CPU time; the returned pair
            is fed back to __endReportToMonitoring after the request.
        """
        self._monitor.addMark("Queries")
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        # Guard against the clock having moved backwards
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime)
        # Send CPU consumption mark
        # NOTE(review): wallClock is computed but never used below -- looks
        # like a leftover; confirm before removing.
        wallClock = now - self.__monitorLastStatsUpdate
        self.__monitorLastStatsUpdate = now
        # Send Memory consumption mark
        membytes = MemStat.VmB('VmRSS:')
        if membytes:
            mem = membytes / (1024. * 1024.)
            self._monitor.addMark('MEM', mem)
        return (now, cpuTime)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        """ Report the CPU percentage used while serving the request started
            at (initialWallTime, initialCPUTime).
        """
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        percentage = cpuTime / wallTime * 100.
        if percentage > 0:
            self._monitor.addMark('CPU', percentage)
class ElementInspectorAgent(AgentModule):
    '''
      The ElementInspector agent is a generic agent used to check the elements
      of one of the elementTypes ( e.g. Site, Resource, Node ).

      This Agent takes care of the Elements. In order to do so, it gathers the
      eligible ones and then evaluates their statuses with the PEP.
    '''

    # Max number of worker threads by default
    __maxNumberOfThreads = 5
    # ElementType, to be defined among Site, Resource or Node
    __elementType = None
    # Inspection freqs, defaults, the lower, the higher priority to be checked.
    # Error state usually means there is a glitch somewhere, so it has the
    # highest priority.
    __checkingFreqs = {
        'Default': {
            'Active': 60,
            'Degraded': 30,
            'Probing': 30,
            'Banned': 30,
            'Unknown': 15,
            'Error': 15
        }
    }
    # queue size limit to stop feeding
    __limitQueueFeeder = 15

    def __init__(self, *args, **kwargs):
        ''' c'tor '''
        AgentModule.__init__(self, *args, **kwargs)

        # members initialization, overridden from the CS in initialize()
        self.maxNumberOfThreads = self.__maxNumberOfThreads
        self.elementType = self.__elementType
        self.checkingFreqs = self.__checkingFreqs
        self.limitQueueFeeder = self.__limitQueueFeeder

        self.elementsToBeChecked = None
        self.threadPool = None
        self.rsClient = None
        self.clients = {}

    def initialize(self):
        ''' Standard initialize.
            Uses the ProductionManager shifterProxy to modify the
            ResourceStatus DB.

            :return: S_OK
        '''
        self.maxNumberOfThreads = self.am_getOption('maxNumberOfThreads',
                                                    self.maxNumberOfThreads)
        self.elementType = self.am_getOption('elementType', self.elementType)
        self.checkingFreqs = self.am_getOption('checkingFreqs',
                                               self.checkingFreqs)
        self.limitQueueFeeder = self.am_getOption('limitQueueFeeder',
                                                  self.limitQueueFeeder)

        self.elementsToBeChecked = Queue.Queue()
        self.threadPool = ThreadPool(self.maxNumberOfThreads,
                                     self.maxNumberOfThreads)

        self.rsClient = ResourceStatusClient()

        # Client instances shared with the PEP, so policies reuse them
        self.clients['ResourceStatusClient'] = self.rsClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()
        self.clients['PilotsDB'] = PilotAgentsDB()

        return S_OK()

    def execute(self):
        ''' Standard execute.
            Selects the elements of self.elementType whose last check is older
            than their per-status checking frequency and queues them for the
            worker threads, then starts as many threads as the backlog needs.

            :return: S_OK / S_ERROR
        '''
        # If there are elements in the queue to be processed, we wait: the
        # worker threads keep consuming items from the queue in the background,
        # and we only feed again once the backlog drops below the configured
        # absolute limit ( limitQueueFeeder ).
        qsize = self.elementsToBeChecked.qsize()
        if qsize > self.limitQueueFeeder:
            self.log.warn('Queue not empty ( %s > %s ), skipping feeding loop'
                          % (qsize, self.limitQueueFeeder))
            return S_OK()

        # We get all the elements, then we filter.
        elements = self.rsClient.selectStatusElement(self.elementType,
                                                     'Status')
        if not elements['OK']:
            self.log.error(elements['Message'])
            return elements

        utcnow = datetime.datetime.utcnow().replace(microsecond=0)

        # filter elements by Type
        for element in elements['Value']:

            # Maybe an overkill, but this way I have NEVER again to worry about
            # order of elements returned by mySQL on tuples
            elemDict = dict(zip(elements['Columns'], element))

            # We skip the elements with token different than "rs_svc"
            if elemDict['TokenOwner'] != 'rs_svc':
                self.log.info('Skipping %s ( %s ) with token %s' %
                              (elemDict['Name'], elemDict['StatusType'],
                               elemDict['TokenOwner']))
                continue

            # Per-ElementType frequency if configured, 'Default' otherwise
            if not elemDict['ElementType'] in self.checkingFreqs:
                #self.log.warn( '"%s" not in inspectionFreqs, getting default' % elemDict[ 'ElementType' ] )
                timeToNextCheck = self.checkingFreqs['Default'][
                    elemDict['Status']]
            else:
                timeToNextCheck = self.checkingFreqs[elemDict['ElementType']][
                    elemDict['Status']]

            if utcnow - datetime.timedelta(
                    minutes=timeToNextCheck) > elemDict['LastCheckTime']:

                # We are not checking if the item is already on the queue or
                # not. It may be there, but in any case, it is not a big
                # problem.

                lowerElementDict = {'element': self.elementType}
                for key, value in elemDict.items():
                    lowerElementDict[key[0].lower() + key[1:]] = value

                # We add lowerElementDict to the queue
                self.elementsToBeChecked.put(lowerElementDict)
                self.log.verbose('%s # "%s" # "%s" # %s # %s' %
                                 (elemDict['Name'], elemDict['ElementType'],
                                  elemDict['StatusType'], elemDict['Status'],
                                  elemDict['LastCheckTime']))

        # Measure size of the queue, more or less, to know how many threads
        # should we start !
        queueSize = self.elementsToBeChecked.qsize()
        # 30, could have been other number.. but it works reasonably well.
        # ( +1 to get ceil )
        # FIX: use explicit floor division '//' -- identical on Python 2 ints,
        # and avoids a float thread count under Python 3 / future division.
        threadsToStart = max(
            min(self.maxNumberOfThreads, (queueSize // 30) + 1), 1)
        threadsRunning = self.threadPool.numWorkingThreads()

        self.log.info('Needed %d threads to process %d elements' %
                      (threadsToStart, queueSize))
        if threadsRunning:
            self.log.info('Already %d threads running' % threadsRunning)
            threadsToStart = max(0, threadsToStart - threadsRunning)

        self.log.info('Starting %d threads to process %d elements' %
                      (threadsToStart, queueSize))

        # It may happen that we start two threads, 0 and 1. 1 goes DOWN, but 0
        # keeps running. In next loop we will start a new thread, and will be
        # called 0 again. To have a mechanism to see which thread is where, we
        # append the cycle number before the threadId.
        # ( name-mangled access to the AgentModule private properties )
        cycle = self._AgentModule__moduleProperties['cyclesDone']

        for _x in xrange(threadsToStart):
            threadId = '%s_%s' % (cycle, _x)
            jobUp = self.threadPool.generateJobAndQueueIt(self._execute,
                                                          args=(threadId, ))
            if not jobUp['OK']:
                self.log.error(jobUp['Message'])

        return S_OK()

    def finalize(self):
        ''' Standard finalize.
            Blocks until every queued element has been processed
            ( Queue.task_done / Queue.join pairing with _execute ).
        '''
        self.log.info('draining queue... blocking until empty')
        # block until all tasks are done
        self.elementsToBeChecked.join()
        return S_OK()

    ## Private methods #########################################################

    def _execute(self, threadNumber):
        '''
          Method run by the thread pool. It enters a loop until there are no
          elements on the queue. On each iteration, it evaluates the policies
          for such element and enforces the necessary actions. If there are no
          more elements in the queue, the loop is finished.

          :param threadNumber: "cycle_index" identifier used only for logging
          :return: S_OK
        '''
        tHeader = '%sJob%s' % ('* ' * 30, threadNumber)

        self.log.info('%s UP' % tHeader)

        pep = PEP(clients=self.clients)

        while True:
            try:
                element = self.elementsToBeChecked.get_nowait()
            except Queue.Empty:
                self.log.info('%s DOWN' % tHeader)
                return S_OK()

            self.log.info('%s ( %s / %s ) being processed' %
                          (element['name'], element['status'],
                           element['statusType']))

            resEnforce = pep.enforce(element)
            if not resEnforce['OK']:
                self.log.error(resEnforce['Message'])
                # task_done must be called even on failure, or finalize's
                # join() would block forever
                self.elementsToBeChecked.task_done()
                continue

            resEnforce = resEnforce['Value']

            oldStatus = resEnforce['decissionParams']['status']
            statusType = resEnforce['decissionParams']['statusType']
            newStatus = resEnforce['policyCombinedResult']['Status']
            reason = resEnforce['policyCombinedResult']['Reason']

            if oldStatus != newStatus:
                self.log.info('%s (%s) is now %s ( %s ), before %s' %
                              (element['name'], statusType, newStatus, reason,
                               oldStatus))

            # Used together with join !
            self.elementsToBeChecked.task_done()

        self.log.info('%s DOWN' % tHeader)

        return S_OK()
class Service( object ):
  """ DISET service engine: loads a request handler, exposes its exported
      methods over the network, and dispatches incoming client connections
      to handler instances through a thread pool. """

  # Maps an action type to the handler-method prefix implementing it.
  # 'Connection' is a meta action refering to 'Message' (see _isMetaAction).
  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceData ):
    """ Store service metadata and create configuration, auth manager,
        monitoring and transport-pool helpers.

        :param serviceData: dict with at least 'modName', 'loadName',
                            'standalone' and (later used) 'classObj'/'moduleObj'
    """
    self._svcData = serviceData
    self._name = serviceData[ 'modName' ]
    self._startTime = Time.dateTime()
    self._validNames = [ serviceData[ 'modName' ] ]
    if serviceData[ 'loadName' ] not in self._validNames:
      self._validNames.append( serviceData[ 'loadName' ] )
    self._cfg = ServiceConfiguration( list( self._validNames ) )
    # Standalone services share the global monitor; embedded ones get their own.
    if serviceData[ 'standalone' ]:
      self._monitor = gMonitor
    else:
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection( serviceData[ 'loadName' ] ) )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    self.__maxFD = 0

  def setCloneProcessId( self, cloneId ):
    """ Tag this (forked) service clone in the monitoring component name. """
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    """ Return the refered action type if 'action' is a meta action
        (e.g. 'Connection' -> 'Message'), otherwise False. """
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    """ Build the service URL, load and initialize the handler, create the
        lock manager, thread pool and message broker, and register the
        exported actions. Returns S_OK/S_ERROR. """
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Load handler
    result = self._loadHandlerInit()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    #Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    self._initMonitoring()
    self._threadPool = ThreadPool( max( 1, self._cfg.getMinThreads() ),
                                   max( 0, self._cfg.getMaxThreads() ),
                                   self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    #Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                              'serviceSectionPath' : PathFinder.getServiceSection( self._name ),
                              'URL' : self._cfg.getURL(),
                              'messageSender' : MessageSender( self._name, self._msgBroker ),
                              'validNames' : self._validNames,
                              'csPaths' : [ PathFinder.getServiceSection( svcName ) for svcName in self._validNames ]
                            }
    #Call static initialization function
    try:
      # A copy of the info dict is passed so handlers cannot mutate the original.
      self._handler[ 'class' ]._rh__initializeClass( dict( self._serviceInfoDict ),
                                                     self._lockManager,
                                                     self._msgBroker,
                                                     self._monitor )
      if self._handler[ 'init' ]:
        for initFunc in self._handler[ 'init' ]:
          gLogger.verbose( "Executing initialization function" )
          try:
            result = initFunc( dict( self._serviceInfoDict ) )
          except Exception as excp:
            gLogger.exception( "Exception while calling initialization function", lException = excp )
            return S_ERROR( "Exception while calling initialization function: %s" % str( excp ) )
          if not isReturnStructure( result ):
            return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc )
          if not result[ 'OK' ]:
            return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception as e:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception( e )
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )

    #Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]

    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )

    return S_OK()

  def __searchInitFunctions( self, handlerClass, currentClass = None ):
    """ Recursively collect 'initializeHandler' methods up the MRO of
        handlerClass so ancestor initializers run before the class's own. """
    if not currentClass:
      currentClass = handlerClass
    initFuncs = []
    ancestorHasInit = False
    for ancestor in currentClass.__bases__:
      initFuncs += self.__searchInitFunctions( handlerClass, ancestor )
      if 'initializeHandler' in dir( ancestor ):
        ancestorHasInit = True
    if ancestorHasInit:
      initFuncs.append( super( currentClass, handlerClass ).initializeHandler )
    if currentClass == handlerClass and 'initializeHandler' in dir( handlerClass ):
      initFuncs.append( handlerClass.initializeHandler )
    return initFuncs

  def _loadHandlerInit( self ):
    """ Gather class-level and module-level ('initialize<Handler>') init
        functions for the handler; return S_OK with a handler info dict. """
    handlerClass = self._svcData[ 'classObj' ]
    handlerName = handlerClass.__name__
    handlerInitMethods = self.__searchInitFunctions( handlerClass )
    try:
      handlerInitMethods.append( getattr( self._svcData[ 'moduleObj' ], "initialize%s" % handlerName ) )
    except AttributeError:
      gLogger.verbose( "Not found global initialization function for service" )
    if handlerInitMethods:
      gLogger.info( "Found %s initialization methods" % len( handlerInitMethods ) )
    handlerInfo = {}
    handlerInfo[ "name" ] = handlerName
    handlerInfo[ "module" ] = self._svcData[ 'moduleObj' ]
    handlerInfo[ "class" ] = handlerClass
    handlerInfo[ "init" ] = handlerInitMethods
    return S_OK( handlerInfo )

  def _loadActions( self ):
    """ Introspect the handler class for exported methods (export_*, transfer_*,
        msg_*), per-method type checks and auth rules, and flatten auth rules
        for meta actions. Returns S_OK({'methods', 'auth', 'types'}). """
    handlerClass = self._handler[ 'class' ]
    authRules = {}
    typeCheck = {}
    methodsList = {}
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      authRules[ actionType ] = {}
      typeCheck[ actionType ] = {}
      methodsList[ actionType ] = []
    handlerAttributeList = dir( handlerClass )
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[ actionType ]
      for attribute in handlerAttributeList:
        if attribute.find( methodPrefix ) != 0:
          continue
        exportedName = attribute[ len( methodPrefix ) : ]
        methodsList[ actionType ].append( exportedName )
        gLogger.verbose( "+ Found %s method %s" % ( actionType, exportedName ) )
        #Create lock for method
        self._lockManager.createLock( "%s/%s" % ( actionType, exportedName ),
                                      self._cfg.getMaxThreadsForMethod( actionType, exportedName ) )
        #Look for type and auth rules
        if actionType == 'RPC':
          typeAttr = "types_%s" % exportedName
          authAttr = "auth_%s" % exportedName
        else:
          typeAttr = "types_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
          authAttr = "auth_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
        if typeAttr in handlerAttributeList:
          obj = getattr( handlerClass, typeAttr )
          gLogger.verbose( "|- Found type definition %s: %s" % ( typeAttr, str( obj ) ) )
          typeCheck[ actionType ][ exportedName ] = obj
        if authAttr in handlerAttributeList:
          obj = getattr( handlerClass, authAttr )
          gLogger.verbose( "|- Found auth rules %s: %s" % ( authAttr, str( obj ) ) )
          authRules[ actionType ][ exportedName ] = obj
    # Meta actions inherit the union of the refered action's auth properties.
    for actionType in Service.SVC_VALID_ACTIONS:
      referedAction = self._isMetaAction( actionType )
      if not referedAction:
        continue
      gLogger.verbose( "Action %s is a meta action for %s" % ( actionType, referedAction ) )
      authRules[ actionType ] = []
      for method in authRules[ referedAction ]:
        for prop in authRules[ referedAction ][ method ]:
          if prop not in authRules[ actionType ]:
            authRules[ actionType ].append( prop )
      gLogger.verbose( "Meta action %s props are %s" % ( actionType, authRules[ actionType ] ) )
    return S_OK( { 'methods' : methodsList, 'auth' : authRules, 'types' : typeCheck } )

  def _initMonitoring( self ):
    """ Register the standard service monitoring activities and extra
        component parameters (version, platform, start time, etc.). """
    #Init extra bits of monitoring
    self._monitor.setComponentType( MonitoringClient.COMPONENT_SERVICE )
    self._monitor.setComponentName( self._name )
    self._monitor.setComponentLocation( self._cfg.getURL() )
    self._monitor.initialize()
    self._monitor.registerActivity( "Connections", "Connections received", "Framework", "connections", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( "Queries", "Queries served", "Framework", "queries", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( 'CPU', "CPU Usage", 'Framework', "CPU,%", MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'MEM', "Memory Usage", 'Framework', 'Memory,MB', MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'PendingQueries', "Pending queries", 'Framework', 'queries', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'ActiveQueries', "Active queries", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'RunningThreads', "Running threads", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'MaxFD', "Max File Descriptors", 'Framework', 'fd', MonitoringClient.OP_MEAN )
    self._monitor.setComponentExtraParam( 'DIRACVersion', DIRAC.version )
    self._monitor.setComponentExtraParam( 'platform', DIRAC.getPlatform() )
    self._monitor.setComponentExtraParam( 'startTime', Time.dateTime() )
    for prop in ( ( "__RCSID__", "version" ), ( "__doc__", "description" ) ):
      try:
        value = getattr( self._handler[ 'module' ], prop[0] )
      except Exception as e:
        gLogger.exception( e )
        gLogger.error( "Missing property", prop[0] )
        value = 'unset'
      self._monitor.setComponentExtraParam( prop[1], value )
    for secondaryName in self._cfg.registerAlsoAs():
      gLogger.info( "Registering %s also as %s" % ( self._name, secondaryName ) )
      self._validNames.append( secondaryName )
    return S_OK()

  def __reportThreadPoolContents( self ):
    """ Periodic task: publish thread-pool and FD usage marks, then reset
        the per-interval max-FD counter. """
    self._monitor.addMark( 'PendingQueries', self._threadPool.pendingJobs() )
    self._monitor.addMark( 'ActiveQueries', self._threadPool.numWorkingThreads() )
    self._monitor.addMark( 'RunningThreads', threading.activeCount() )
    self._monitor.addMark( 'MaxFD', self.__maxFD )
    self.__maxFD = 0

  def getConfig( self ):
    """ Return the ServiceConfiguration object. """
    return self._cfg

  #End of initialization functions

  def handleConnection( self, clientTransport ):
    """ Account the new connection and queue its processing in the thread pool. """
    self._stats[ 'connections' ] += 1
    self._monitor.setComponentExtraParam( 'queries', self._stats[ 'connections' ] )
    self._threadPool.generateJobAndQueueIt( self._processInThread,
                                            args = ( clientTransport, ) )

  #Threaded process function
  def _processInThread( self, clientTransport ):
    """ Full life cycle of one client request, run inside a pool thread:
        handshake, proposal reception/authorization, handler instantiation,
        action execution and transport cleanup. """
    self.__maxFD = max( self.__maxFD, clientTransport.oSocket.fileno() )
    self._lockManager.lockGlobal()
    try:
      monReport = self.__startReportToMonitoring()
    except Exception:
      # Monitoring must never break request processing.
      monReport = False
    try:
      #Handshake
      try:
        result = clientTransport.handshake()
        if not result[ 'OK' ]:
          clientTransport.close()
          return
      except:
        return
      #Add to the transport pool
      trid = self._transportPool.add( clientTransport )
      if not trid:
        return
      #Receive and check proposal
      result = self._receiveAndCheckProposal( trid )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      proposalTuple = result[ 'Value' ]
      #Instantiate handler
      result = self._instantiateHandler( trid, proposalTuple )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      handlerObj = result[ 'Value' ]
      #Execute the action
      result = self._processProposal( trid, proposalTuple, handlerObj )
      #Close the connection if required
      if result[ 'closeTransport' ] or not result[ 'OK' ]:
        if not result[ 'OK' ]:
          gLogger.error( "Error processing proposal", result[ 'Message' ] )
        self._transportPool.close( trid )
      return result
    finally:
      self._lockManager.unlockGlobal()
      if monReport:
        self.__endReportToMonitoring( *monReport )

  def _createIdentityString( self, credDict, clientTransport = None ):
    """ Build a human-readable "[user:group](DN){host:port}" identity string
        for logging, from whatever fields credDict provides. """
    if 'username' in credDict:
      if 'group' in credDict:
        identity = "[%s:%s]" % ( credDict[ 'username' ], credDict[ 'group' ] )
      else:
        identity = "[%s:unknown]" % credDict[ 'username' ]
    else:
      identity = 'unknown'
    if clientTransport:
      addr = clientTransport.getRemoteAddress()
      if addr:
        addr = "{%s:%s}" % ( addr[0], addr[1] )
    if 'DN' in credDict:
      identity += "(%s)" % credDict[ 'DN' ]
    return identity

  def _receiveAndCheckProposal( self, trid ):
    """ Receive the client's action proposal on transport 'trid', validate
        service name and action type, and authorize it. """
    clientTransport = self._transportPool.get( trid )
    #Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    #Receive the action proposal
    retVal = clientTransport.receiveData( 1024 )
    if not retVal[ 'OK' ]:
      gLogger.error( "Invalid action proposal", "%s %s" % ( self._createIdentityString( credDict,
                                                                                        clientTransport ),
                                                            retVal[ 'Message' ] ) )
      return S_ERROR( "Invalid action proposal" )
    proposalTuple = retVal[ 'Value' ]
    gLogger.debug( "Received action from client", "/".join( list( proposalTuple[1] ) ) )
    #Check if there are extra credentials
    if proposalTuple[2]:
      clientTransport.setExtraCredentials( proposalTuple[2] )
    #Check if this is the requested service
    requestedService = proposalTuple[0][0]
    if requestedService not in self._validNames:
      return S_ERROR( "%s is not up in this server" % requestedService )
    #Check if the action is valid
    requestedActionType = proposalTuple[1][0]
    if requestedActionType not in Service.SVC_VALID_ACTIONS:
      return S_ERROR( "%s is not a known action type" % requestedActionType )
    #Check if it's authorized
    result = self._authorizeProposal( proposalTuple[1], trid, credDict )
    if not result[ 'OK' ]:
      return result
    #Proposal is OK
    return S_OK( proposalTuple )

  def _authorizeProposal( self, actionTuple, trid, credDict ):
    """ Check the (actionType, method) tuple against CS and hardcoded auth
        rules, and record the outcome in the security log. """
    #Find CS path for the Auth rules
    referedAction = self._isMetaAction( actionTuple[0] )
    if referedAction:
      csAuthPath = "%s/Default" % actionTuple[0]
      hardcodedMethodAuth = self._actions[ 'auth' ][ actionTuple[0] ]
    else:
      if actionTuple[0] == 'RPC':
        csAuthPath = actionTuple[1]
      else:
        csAuthPath = "/".join( actionTuple )
      #Find if there are hardcoded auth rules in the code
      hardcodedMethodAuth = False
      if actionTuple[0] in self._actions[ 'auth' ]:
        hardcodedRulesByType = self._actions[ 'auth' ][ actionTuple[0] ]
        if actionTuple[0] == "FileTransfer":
          # FileTransfer methods are looked up with a lower-cased first letter.
          methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
        else:
          methodName = actionTuple[1]
        if methodName in hardcodedRulesByType:
          hardcodedMethodAuth = hardcodedRulesByType[ methodName ]
    #Auth time!
    if not self._authMgr.authQuery( csAuthPath, credDict, hardcodedMethodAuth ):
      #Get the identity string
      identity = self._createIdentityString( credDict )
      fromHost = "unknown host"
      tr = self._transportPool.get( trid )
      if tr:
        fromHost = '/'.join( [ str( item ) for item in tr.getRemoteAddress() ] )
      gLogger.warn( "Unauthorized query", "to %s:%s by %s from %s" % ( self._name,
                                                                      "/".join( actionTuple ),
                                                                      identity, fromHost ) )
      result = S_ERROR( "Unauthorized query" )
    else:
      result = S_OK()
    #Security log
    tr = self._transportPool.get( trid )
    if not tr:
      return S_ERROR( "Client disconnected" )
    sourceAddress = tr.getRemoteAddress()
    identity = self._createIdentityString( credDict )
    Service.SVC_SECLOG_CLIENT.addMessage( result[ 'OK' ], sourceAddress[0], sourceAddress[1], identity,
                                          self._cfg.getHostname(),
                                          self._cfg.getPort(),
                                          self._name, "/".join( actionTuple ) )
    return result

  def _instantiateHandler( self, trid, proposalTuple = None ):
    """ Generate an instance of the handler for a given service
    """
    #Generate the client params
    clientParams = { 'serviceStartTime' : self._startTime }
    if proposalTuple:
      clientParams[ 'clientSetup' ] = proposalTuple[0][1]
      if len( proposalTuple[0] ) < 3:
        clientParams[ 'clientVO' ] = gConfig.getValue( "/DIRAC/VirtualOrganization", "unknown" )
      else:
        clientParams[ 'clientVO' ] = proposalTuple[0][2]
    clientTransport = self._transportPool.get( trid )
    if clientTransport:
      clientParams[ 'clientAddress' ] = clientTransport.getRemoteAddress()
    #Generate handler dict with per client info
    handlerInitDict = dict( self._serviceInfoDict )
    for key in clientParams:
      handlerInitDict[ key ] = clientParams[ key ]
    #Instantiate and initialize
    try:
      handlerInstance = self._handler[ 'class' ]( handlerInitDict, trid )
      handlerInstance.initialize()
    except Exception as e:
      gLogger.exception( "Server error while loading handler: %s" % str( e ) )
      return S_ERROR( "Server error while loading handler" )
    return S_OK( handlerInstance )

  def _processProposal( self, trid, proposalTuple, handlerObj ):
    """ Ack the proposal to the client, handle message-connection setup for
        ('Connection', 'new') proposals and execute the requested action.
        The returned dict carries a 'closeTransport' flag for the caller. """
    #Notify the client we're ready to execute the action
    retVal = self._transportPool.send( trid, S_OK() )
    if not retVal[ 'OK' ]:
      return retVal
    messageConnection = False
    if proposalTuple[1] == ( 'Connection', 'new' ):
      messageConnection = True
    if messageConnection:
      if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
        result = S_ERROR( "Maximum number of connections reached. Try later" )
        result[ 'closeTransport' ] = True
        return result
      #This is a stable connection
      self._msgBroker.addTransportId( trid, self._name,
                                      receiveMessageCallback = self._mbReceivedMsg,
                                      disconnectCallback = self._mbDisconnect,
                                      listenToConnection = False )
    result = self._executeAction( trid, proposalTuple, handlerObj )
    if result[ 'OK' ] and messageConnection:
      self._msgBroker.listenToTransport( trid )
      result = self._mbConnect( trid, handlerObj )
      if not result[ 'OK' ]:
        self._msgBroker.removeTransport( trid )
    result[ 'closeTransport' ] = not messageConnection or not result[ 'OK' ]
    return result

  def _mbConnect( self, trid, handlerObj = None ):
    """ Run the handler's 'connected' callback for a message connection. """
    if not handlerObj:
      result = self._instantiateHandler( trid )
      if not result[ 'OK' ]:
        return result
      handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'connected' )

  def _executeAction( self, trid, proposalTuple, handlerObj ):
    """ Execute the proposed action on the handler, shielding callers from
        handler exceptions. """
    try:
      return handlerObj._rh_executeAction( proposalTuple )
    except Exception as e:
      gLogger.exception( "Exception while executing handler action" )
      return S_ERROR( "Server error while executing action: %s" % str( e ) )

  def _mbReceivedMsg( self, trid, msgObj ):
    """ Authorize and dispatch an incoming message on a stable connection. """
    result = self._authorizeProposal( ( 'Message', msgObj.getName() ),
                                      trid,
                                      self._transportPool.get( trid ).getConnectingCredentials() )
    if not result[ 'OK' ]:
      return result
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeMessageCallback( msgObj )

  def _mbDisconnect( self, trid ):
    """ Run the handler's 'drop' callback when a message connection closes. """
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'drop' )

  def __startReportToMonitoring( self ):
    """ Mark the query and snapshot (wall time, CPU time); also push a
        memory mark. Returns the snapshot for __endReportToMonitoring. """
    self._monitor.addMark( "Queries" )
    now = time.time()
    stats = os.times()
    cpuTime = stats[0] + stats[2]
    if now - self.__monitorLastStatsUpdate < 0:
      return ( now, cpuTime )
    # Send CPU consumption mark
    wallClock = now - self.__monitorLastStatsUpdate
    self.__monitorLastStatsUpdate = now
    # Send Memory consumption mark
    membytes = MemStat.VmB( 'VmRSS:' )
    if membytes:
      mem = membytes / ( 1024. * 1024. )
      self._monitor.addMark( 'MEM', mem )
    return ( now, cpuTime )

  def __endReportToMonitoring( self, initialWallTime, initialCPUTime ):
    """ Convert the elapsed CPU over wall time into a CPU% mark. """
    wallTime = time.time() - initialWallTime
    stats = os.times()
    cpuTime = stats[0] + stats[2] - initialCPUTime
    percentage = cpuTime / wallTime * 100.
    if percentage > 0:
      self._monitor.addMark( 'CPU', percentage )
class FTSMonitorAgent( AgentModule ):
  """ .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs.

  Pulls active FTS requests from the TransferDB and monitors each one in a
  thread-pool worker (:meth:`monitorTransfer`). Terminal requests are
  finalized: per-file states are propagated to the FileToFTS, Channel and
  FileToCat tables.
  """
  # # transfer DB handle (set in initialize)
  transferDB = None
  # # thread pool (set in initialize)
  threadPool = None
  # # min threads
  minThreads = 1
  # # max threads
  maxThreads = 10

  # # missing source regexp patterns: FTS server messages that mean the
  # # source file does not exist (such transfers are failed, not retried)
  missingSourceErrors = [
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
    re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
               " Command failed. : open error: No such file or directory" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

  def initialize( self ):
    """ agent's initialisation

    Creates the TransferDB handle, reads the thread limits from the agent
    options (normalising so that min <= max) and daemonizes the thread pool.
    """
    self.transferDB = TransferDB()
    self.am_setOption( "shifterProxy", "DataManager" )
    self.minThreads = self.am_getOption( "MinThreads", self.minThreads )
    self.maxThreads = self.am_getOption( "MaxThreads", self.maxThreads )
    # guard against negative or swapped option values
    minmax = ( abs( self.minThreads ), abs( self.maxThreads ) )
    self.minThreads, self.maxThreads = min( minmax ), max( minmax )
    self.log.info( "ThreadPool min threads = %s" % self.minThreads )
    self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
    self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
    self.threadPool.daemonize()
    return S_OK()

  def execute( self ):
    """ push jobs to the thread pool

    One :meth:`monitorTransfer` job is queued per FTS request; if the pool
    is saturated we sleep one second and retry until the job is accepted.
    """
    self.log.info( "Obtaining requests to monitor" )
    res = self.transferDB.getFTSReq()
    if not res["OK"]:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res["Value"]:
      self.log.info( "No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res["Value"]
    self.log.info( "Found %s FTS jobs" % len( ftsReqs ) )
    i = 1
    for ftsJob in ftsReqs:
      while True:
        self.log.debug( "submitting FTS Job %s FTSReqID=%s to monitor" % ( i, ftsJob["FTSReqID"] ) )
        ret = self.threadPool.generateJobAndQueueIt( self.monitorTransfer, args = ( ftsJob, ), )
        if ret["OK"]:
          i += 1
          break
        # # sleep 1 second to proceed
        time.sleep( 1 )

    self.threadPool.processAllResults()
    return S_OK()

  def ftsJobExpired( self, ftsReqID, channelID ):
    """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )
    fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not fileIDs["OK"]:
      log.error( "Unable to retrieve FileIDs associated to %s request" % ftsReqID )
      return fileIDs
    fileIDs = fileIDs["Value"]

    # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
    for fileID in fileIDs:
      fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID,
                                                              "Status", "Failed" )
      if not fileStatus["OK"]:
        log.error( "Unable to set FileToFTS status to 'Failed' for FileID %s: %s" \
                   % ( fileID, fileStatus["Message"] ) )

      failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID,
                                                              "Reason", "FTS job expired on server" )
      if not failReason["OK"]:
        log.error( "Unable to set FileToFTS reason for FileID %s: %s" \
                   % ( fileID, failReason["Message"] ) )
    # # update Channel table
    resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
    if not resetChannels["OK"]:
      log.error( "Failed to reset Channel table for files to retry" )
      return resetChannels

    # # update FTSReq table
    log.info( "Setting FTS request status to 'Finished'" )
    ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, "Finished" )
    if not ftsReqStatus["OK"]:
      log.error( "Failed update FTS Request status", ftsReqStatus["Message"] )
      return ftsReqStatus

    # # if we land here, everything should be OK
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ monitors transfer obtained from TransferDB

    :param dict ftsReqDict: FTS job dictionary
    """
    ftsReqID = ftsReqDict.get( "FTSReqID" )
    ftsGUID = ftsReqDict.get( "FTSGuid" )
    ftsServer = ftsReqDict.get( "FTSServer" )
    channelID = ftsReqDict.get( "ChannelID" )
    sourceSE = ftsReqDict.get( "SourceSE" )
    targetSE = ftsReqDict.get( "TargetSE" )

    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )

    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )

    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    log.info( "Perform summary update of the FTS Request" )
    infoStr = [ "glite-transfer-status -s %s -l %s" % ( ftsServer, ftsGUID ) ]
    infoStr.append( "FTS GUID: %s" % ftsGUID )
    infoStr.append( "FTS Server: %s" % ftsServer )
    log.info( "\n".join( infoStr ) )
    res = oFTSRequest.summary()
    self.transferDB.setFTSReqLastMonitor( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to update the FTS request summary", res["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
        # the server forgot about this job: clean it up on our side too
        log.error( "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side" )
        cleanUp = self.ftsJobExpired( ftsReqID, channelID )
        if not cleanUp["OK"]:
          log.error( cleanUp["Message"] )
        return cleanUp
      return res

    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      log.error( "Failed to get FTS request summary", res["Message"] )
      return res
    log.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      log.error( "Failed to get FTS percentage complete", res["Message"] )
      return res
    log.info( 'FTS Request found to be %.1f percent complete' % res["Value"] )
    self.transferDB.setFTSReqAttribute( ftsReqID, "PercentageComplete", res["Value"] )
    self.transferDB.addLoggingEvent( ftsReqID, res["Value"] )

    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res["OK"]:
      log.error( "Failed to determine whether FTS request terminal", res["Message"] )
      return res
    if not res["Value"]:
      return S_OK()
    # # request is terminal
    return self.terminalRequest( oFTSRequest, ftsReqID, channelID, sourceSE )

  def terminalRequest( self, oFTSRequest, ftsReqID, channelID, sourceSE ):
    """ process terminal FTS job

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    log.info( "FTS Request found to be terminal, updating file states" )
    #########################################################################
    # Get the LFNS associated to the FTS request
    log.info( "Obtaining the LFNs associated to this request" )
    res = self.transferDB.getFTSReqLFNs( ftsReqID, channelID, sourceSE )
    if not res["OK"]:
      log.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res["Value"]
    if not files:
      log.error( "No files present for transfer" )
      return S_ERROR( "No files were found in the DB" )

    lfns = files.keys()
    log.debug( "Obtained %s files" % len( lfns ) )
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )

    res = oFTSRequest.monitor()
    if not res["OK"]:
      log.error( "Failed to perform detailed monitoring of FTS request", res["Message"] )
      return res
    res = oFTSRequest.getFailed()
    if not res["OK"]:
      log.error( "Failed to obtained failed files for FTS request", res["Message"] )
      return res
    failedFiles = res["Value"]
    res = oFTSRequest.getDone()
    if not res["OK"]:
      log.error( "Failed to obtained successful files for FTS request", res["Message"] )
      return res
    completedFiles = res["Value"]

    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs
    res = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to get FileIDs associated to FTS Request", res["Message"] )
      return res
    fileIDs = res["Value"]
    res = self.transferDB.getAttributesForFilesList( fileIDs, ["LFN"] )
    if not res["OK"]:
      log.error( "Failed to get LFNs associated to FTS Request", res["Message"] )
      return res
    fileIDDict = res["Value"]

    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []

    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in completedFiles:
        completedFileIDs.append( fileID )
        transferTime = 0
        res = oFTSRequest.getTransferTime( lfn )
        if res["OK"]:
          transferTime = res["Value"]
        fileToFTSUpdates.append( ( fileID, "Completed", "", 0, transferTime ) )

      if lfn in failedFiles:
        failReason = ""
        res = oFTSRequest.getFailReason( lfn )
        if res["OK"]:
          failReason = res["Value"]
        if "Source file/user checksum mismatch" in failReason:
          filesToFail.append( fileID )
          continue
        if self.missingSource( failReason ):
          log.error( "The source SURL does not exist.", "%s %s" % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
          filesToFail.append( fileID )
        else:
          filesToRetry.append( fileID )
        log.error( "Failed to replicate file on channel.", "%s %s" % ( channelID, failReason ) )
        fileToFTSUpdates.append( ( fileID, "Failed", failReason, 0, 0 ) )

    # # update TransferDB.FileToFTS table
    updateFileToFTS = self.updateFileToFTS( ftsReqID, channelID,
                                            filesToRetry, filesToFail,
                                            completedFileIDs, fileToFTSUpdates )

    if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
      res = oFTSRequest.finalize()
      if not res["OK"]:
        log.error( "Failed to perform the finalization for the FTS request", res["Message"] )
        return res

      log.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.transferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed to add logging event for FTS Request', res['Message'] )

      # update TransferDB.FileToCat table
      updateFileToCat = self.updateFileToCat( oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail )
      if not updateFileToCat["OK"]:
        log.error( updateFileToCat["Message"] )

      log.debug( "Updating FTS request status" )
      res = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed update FTS Request status', res['Message'] )
    return S_OK()

  def updateFileToFTS( self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates ):
    """ update TransferDB.FileToFTS table for finished request

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: files completed
    :param list fileToFTSUpdates: per-file ( fileID, status, reason, retries, transferTime ) tuples

    :return: S_OK( allUpdated ) where allUpdated is True only if every DB
             update succeeded
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    allUpdated = True

    res = self.transferDB.resetFileChannelStatus( channelID, filesToRetry ) if filesToRetry else S_OK()
    if not res["OK"]:
      log.error( "Failed to update the Channel table for file to retry.", res["Message"] )
      allUpdated = False

    for fileID in filesToFail:
      log.info( "Updating the Channel table for files to reschedule" )
      res = self.transferDB.setFileToReschedule( fileID )
      if not res["OK"]:
        log.error( "Failed to update Channel table for failed files.", res["Message"] )
        allUpdated = False
      elif res["Value"] == "max reschedule attempt reached":
        # BUGFIX: the format string was missing the '%s' placeholder, so the
        # '%' operator raised TypeError instead of logging the message.
        log.error( "setting Channel status to 'Failed' : %s" % res["Value"] )
        res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
        if not res["OK"]:
          log.error( "Failed to update Channel table for failed files.", res["Message"] )
          allUpdated = False

    if completedFileIDs:
      res = self.transferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( "Failed to update the Channel table for successful files.", res["Message"] )
        allUpdated = False
      res = self.transferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False

    if fileToFTSUpdates:
      res = self.transferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res["OK"]:
        log.error( "Failed to update the FileToFTS table for files.", res["Message"] )
        allUpdated = False

    return S_OK( allUpdated )

  def updateFileToCat( self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail ):
    """ update TransferDB.FileToCat table for finished request

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileIDs dictionary
    :param list completedFiles: LFNs transferred successfully
    :param list filesToFail: FileIDs for failed files
    """
    res = oFTSRequest.getFailedRegistrations()
    # NOTE(review): res["OK"] is not checked before taking "Value" — looks like
    # getFailedRegistrations cannot fail here; confirm against FTSRequest.
    failedRegistrations = res["Value"]
    regFailedFileIDs = []
    regDoneFileIDs = []
    regForgetFileIDs = []
    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in failedRegistrations:
        regFailedFileIDs.append( fileID )
        # if the LFN appears more than once, FileToCat needs to be reset only once
        del failedRegistrations[lfn]
      elif lfn in completedFiles:
        regDoneFileIDs.append( fileID )
      elif fileID in filesToFail:
        regForgetFileIDs.append( fileID )

    res = self.transferDB.setRegistrationWaiting( channelID, regFailedFileIDs ) if regFailedFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
      return res

    res = self.transferDB.setRegistrationDone( channelID, regDoneFileIDs ) if regDoneFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    # This entries could also be set to Failed, but currently there is no method to do so.
    res = self.transferDB.setRegistrationDone( channelID, regForgetFileIDs ) if regForgetFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    return S_OK()

  @classmethod
  def missingSource( cls, failReason ):
    """ check if message sent by FTS server is concering missing source file

    :param str failReason: message sent by FTS server
    """
    for error in cls.missingSourceErrors:
      if error.search( failReason ):
        return 1
    return 0
class Publisher:
    """
    Class Publisher is in charge of getting dispersed information,
    to be published on the web.
    """

    #############################################################################

    def __init__(self, VOExtension, rsDBIn = None, commandCallerIn = None, infoGetterIn = None,
                 WMSAdminIn = None):
        """
        Standard constructor

        :params:
          :attr:`VOExtension`: string, VO Extension (e.g. 'LHCb')

          :attr:`rsDBIn`: optional ResourceStatusDB object
          (see :class:`DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.ResourceStatusDB`)

          :attr:`commandCallerIn`: optional CommandCaller object
          (see :class:`DIRAC.ResourceStatusSystem.Command.CommandCaller.CommandCaller`)

          :attr:`infoGetterIn`: optional InfoGetter object
          (see :class:`DIRAC.ResourceStatusSystem.Utilities.InfoGetter.InfoGetter`)

          :attr:`WMSAdminIn`: optional RPCClient object for WMSAdmin
          (see :class:`DIRAC.Core.DISET.RPCClient.RPCClient`)
        """
        # VO-specific policy configuration module
        self.configModule = Utils.voimport("DIRAC.ResourceStatusSystem.Policy.Configurations", VOExtension)

        # Imports below are local so the (heavy) default clients are only loaded
        # when the caller did not inject its own instances.
        if rsDBIn is not None:
            self.rsDB = rsDBIn
        else:
            from DIRAC.ResourceStatusSystem.DB.ResourceStatusDB import ResourceStatusDB
            self.rsDB = ResourceStatusDB()

        from DIRAC.ResourceStatusSystem.DB.ResourceManagementDB import ResourceManagementDB
        self.rmDB = ResourceManagementDB()

        if commandCallerIn is not None:
            self.cc = commandCallerIn
        else:
            from DIRAC.ResourceStatusSystem.Command.CommandCaller import CommandCaller
            self.cc = CommandCaller()

        if infoGetterIn is not None:
            self.ig = infoGetterIn
        else:
            from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import InfoGetter
            self.ig = InfoGetter(VOExtension)

        if WMSAdminIn is not None:
            self.WMSAdmin = WMSAdminIn
        else:
            from DIRAC.Core.DISET.RPCClient import RPCClient
            self.WMSAdmin = RPCClient("WorkloadManagement/WMSAdministrator")

        # pool used by getInfo() to run getInfoForPanel() concurrently
        self.threadPool = ThreadPool( 2, 5 )
        # guards infoForPanel_res, which is written by the worker threads
        self.lockObj = threading.RLock()
        # shared scratch dict: policy name -> {'Status', 'Reason', 'infos', 'desc'}
        self.infoForPanel_res = {}

    #############################################################################

    def getInfo(self, granularity, name, useNewRes = False):
        """
        Standard method to get all the info to be published

        This method uses a ThreadPool (:class:`DIRAC.Core.Utilities.ThreadPool.ThreadPool`)
        with 2-5 threads. The threaded method is
        :meth:`DIRAC.ResourceStatusSystem.Utilities.Publisher.Publisher.getInfoForPanel`

        :params:
          :attr:`granularity`: string - a ValidRes

          :attr:`name`: string - name of the Validres

          :attr:`useNewRes`: boolean. When set to true, will get new results,
          otherwise it will get cached results (where available).

        :return: dict with keys 'TotalRecords', 'ParameterNames', 'Records', 'Extras',
          or an error string when `name` is unknown.
        """
        if granularity not in ValidRes:
            raise InvalidRes, Utils.where(self, self.getInfo)

        self.infoForPanel_res = {}

        status = None
        formerStatus = None
        siteType = None
        serviceType = None
        resourceType = None

        if granularity in ('Resource', 'Resources'):
            try:
                resourceType = self.rsDB.getMonitoredsList('Resource', ['ResourceType'],
                                                           resourceName = name)[0][0]
            except IndexError:
                # empty result set: the resource is not known to the DB
                return "%s does not exist!" %name

        if granularity in ('StorageElement', 'StorageElements'):
            try:
                siteType = self.rsDB.getMonitoredsList('StorageElement', ['SiteType'],
                                                       storageElementName = name)[0][0]
            except IndexError:
                return "%s does not exist!" %name

        paramNames = ['Type', 'Group', 'Name', 'Policy', 'DIRAC Status',
                      'RSS Status', 'Reason', 'Description']

        infoToGet = self.ig.getInfoToApply(('view_info', ), granularity, status = status,
                                           formerStatus = formerStatus, siteType = siteType,
                                           serviceType = serviceType, resourceType = resourceType,
                                           useNewRes = useNewRes)[0]['Panels']
        infoToGet_res = {}

        recordsList = []

        infosForPolicy = {}

        for panel in infoToGet.keys():

            (granularityForPanel, nameForPanel) = self.__getNameForPanel(granularity, name, panel)

            if not self._resExist(granularityForPanel, nameForPanel):
                continue

            # take composite RSS result for name
            nameStatus_res = self._getStatus(nameForPanel, panel)

            recordBase = [None, None, None, None, None, None, None, None]

            recordBase[1] = panel.replace('_Panel', '')
            recordBase[2] = nameForPanel #nameForPanel
            try:
                # DIRACStatus is only present for Site_Panel / SE_Panel
                recordBase[4] = nameStatus_res[nameForPanel]['DIRACStatus'] #DIRAC Status
            except:
                pass
            recordBase[5] = nameStatus_res[nameForPanel]['RSSStatus'] #RSS Status

            record = copy.deepcopy(recordBase)
            record[0] = 'ResultsForResource'

            recordsList.append(record)

            # take info that goes into the panel
            infoForPanel = infoToGet[panel]

            for info in infoForPanel:
                # each worker publishes into self.infoForPanel_res under lockObj
                self.threadPool.generateJobAndQueueIt(self.getInfoForPanel,
                                                      args = (info, granularityForPanel, nameForPanel) )

            # wait for all workers of this panel before reading infoForPanel_res
            self.threadPool.processAllResults()

            for policy in [x.keys()[0] for x in infoForPanel]:
                record = copy.deepcopy(recordBase)
                record[0] = 'SpecificInformation'
                record[3] = policy #policyName
                record[4] = None #DIRAC Status
                record[5] = self.infoForPanel_res[policy]['Status'] #RSS status for the policy
                record[6] = self.infoForPanel_res[policy]['Reason'] #Reason
                record[7] = self.infoForPanel_res[policy]['desc'] #Description
                recordsList.append(record)

                infosForPolicy[policy] = self.infoForPanel_res[policy]['infos']

        infoToGet_res['TotalRecords'] = len(recordsList)
        infoToGet_res['ParameterNames'] = paramNames
        infoToGet_res['Records'] = recordsList

        infoToGet_res['Extras'] = infosForPolicy

        return infoToGet_res

    #############################################################################

    def getInfoForPanel(self, info, granularityForPanel, nameForPanel):
        """ Threaded worker: fetch the cached policy result and extra panel info
            for one policy, and publish them into self.infoForPanel_res.

            All writes to the shared dict are done under self.lockObj.
        """
        # get single RSS policy results
        policyResToGet = info.keys()[0]
        pol_res = self.rmDB.getPolicyRes(nameForPanel, policyResToGet)
        if pol_res != []:
            pol_res_dict = {'Status' : pol_res[0], 'Reason' : pol_res[1]}
        else:
            pol_res_dict = {'Status' : 'Unknown', 'Reason' : 'Unknown'}
        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet] = pol_res_dict
        finally:
            self.lockObj.release()

        # get policy description
        desc = self._getPolicyDesc(policyResToGet)

        # get other info
        othersInfo = info.values()[0]
        if not isinstance(othersInfo, list):
            othersInfo = [othersInfo]

        info_res = {}

        for oi in othersInfo:
            # each entry maps a format label ('RSS' or a command spec) to what to fetch
            format_ = oi.keys()[0]
            what = oi.values()[0]

            info_bit_got = self._getInfo(granularityForPanel, nameForPanel, format_, what)

            info_res[format_] = info_bit_got

        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet]['infos'] = info_res
            self.infoForPanel_res[policyResToGet]['desc'] = desc
        finally:
            self.lockObj.release()

    #############################################################################

    def _getStatus(self, name, panel):
        """ Return { name : {'RSSStatus': ..., ['DIRACStatus': ...]} }.

            DIRACStatus is only filled for the Site_Panel (site mask logging)
            and SE_Panel (read/write access) cases.
        """
        # get RSS status
        RSSStatus = self._getInfoFromRSSDB(name, panel)[0][1]

        # get DIRAC status
        if panel in ('Site_Panel', 'SE_Panel'):

            if panel == 'Site_Panel':
                DIRACStatus = self.WMSAdmin.getSiteMaskLogging(name)
                if DIRACStatus['OK']:
                    # last entry of the mask logging history for this site
                    DIRACStatus = DIRACStatus['Value'][name].pop()[0]
                else:
                    raise RSSException, Utils.where(self, self._getStatus)

            elif panel == 'SE_Panel':
                ra = getStorageElementStatus(name, 'ReadAccess')['Value']
                wa = getStorageElementStatus(name, 'WriteAccess')['Value']
                DIRACStatus = {'ReadAccess': ra, 'WriteAccess': wa}

            status = { name : { 'RSSStatus': RSSStatus, 'DIRACStatus': DIRACStatus } }

        else:
            status = { name : { 'RSSStatus': RSSStatus} }

        return status

    #############################################################################

    def _getInfo(self, granularity, name, format_, what):
        """ Fetch one piece of panel info: either from the RSS DB (format_ == 'RSS')
            or by invoking a command through the CommandCaller.
        """
        if format_ == 'RSS':
            info_bit_got = self._getInfoFromRSSDB(name, what)
        else:
            if isinstance(what, dict):
                command = what['CommandIn']
                extraArgs = what['args']
            else:
                command = what
                extraArgs = None

            info_bit_got = self.cc.commandInvocation(granularity, name, None,
                                                     None, command, extraArgs)

            try:
                # commands usually wrap their payload in a 'Result' key;
                # best-effort unwrap, keep raw value otherwise
                info_bit_got = info_bit_got['Result']
            except:
                pass

        return info_bit_got

    #############################################################################

    def _getInfoFromRSSDB(self, name, what):
        """ Query the monitoreds list from the RSS DB for the given panel/relation
            keyword `what`, translating it into granularity + filter parameters.

            NOTE(review): `gran` is only assigned inside the elif chain; an
            unrecognised `what` would raise UnboundLocalError at the query below.
        """
        paramsL = ['Status']

        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None
        serviceType = None
        gridSiteName = None

        if what == 'ServiceOfSite':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            paramsL.append('Reason')
            siteName = name
        elif what == 'ResOfCompService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            # name is of the form '<serviceType>@<DIRAC site>'
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorEl':
            gran = 'StorageElements'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            storageElementName = name
        elif what == 'StorageElementsOfSite':
            gran = 'StorageElements'
            paramsL.insert(0, 'StorageElementName')
            paramsL.append('Reason')
            if '@' in name:
                DIRACsiteName = name.split('@').pop()
            else:
                DIRACsiteName = name
            gridSiteName = getGOCSiteName(DIRACsiteName)
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'Site_Panel':
            gran = 'Site'
            paramsL.insert(0, 'SiteName')
            siteName = name
        elif what == 'Service_Computing_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_Storage_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VO-BOX_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VOMS_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Resource_Panel':
            gran = 'Resource'
            paramsL.insert(0, 'ResourceName')
            resourceName = name
        elif what == 'SE_Panel':
            gran = 'StorageElement'
            paramsL.insert(0, 'StorageElementName')
            storageElementName = name

        info_bit_got = self.rsDB.getMonitoredsList(gran, paramsList = paramsL,
                                                   siteName = siteName,
                                                   serviceName = serviceName,
                                                   serviceType = serviceType,
                                                   resourceName = resourceName,
                                                   storageElementName = storageElementName,
                                                   gridSiteName = gridSiteName)

        return info_bit_got

    #############################################################################

    def _getPolicyDesc(self, policyName):
        """ Human-readable description of a policy, from the VO configuration module. """
        return self.configModule.Policies[policyName]['Description']

    #############################################################################

    def __getNameForPanel(self, granularity, name, panel):
        """ Map a Site-level (granularity, name) to the Service-level pair a
            service panel actually refers to; other inputs pass through unchanged.
        """
        if granularity in ('Site', 'Sites'):
            if panel == 'Service_Computing_Panel':
                granularity = 'Service'
                name = 'Computing@' + name
            elif panel == 'Service_Storage_Panel':
                granularity = 'Service'
                name = 'Storage@' + name
            elif panel == 'OtherServices_Panel':
                granularity = 'Service'
                name = 'OtherS@' + name
            elif panel == 'Service_VOMS_Panel':
                granularity = 'Service'
                name = 'VOMS@' + name
            elif panel == 'Service_VO-BOX_Panel':
                granularity = 'Service'
                name = 'VO-BOX@' + name

        return (granularity, name)

    #############################################################################

    def _resExist(self, granularity, name):
        """ True when the monitored entity exists in the RSS DB, False otherwise. """
        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None

        if granularity in ('Site', 'Sites'):
            siteName = name
        elif granularity in ('Service', 'Services'):
            serviceName = name
        elif granularity in ('Resource', 'Resources'):
            resourceName = name
        elif granularity in ('StorageElement', 'StorageElements'):
            storageElementName = name

        res = self.rsDB.getMonitoredsList(granularity, siteName = siteName,
                                          serviceName = serviceName,
                                          resourceName = resourceName,
                                          storageElementName = storageElementName)

        if res == []:
            return False
        else:
            return True
class RSInspectorAgent(AgentModule):
    """ Agent that walks the Resources table and hands each Resource,
        together with its current status, to the PEP for evaluation.
    """

    #############################################################################

    def initialize(self):
        """ Set up the DB clients, the check queue and the worker thread pool.

            Any failure during setup is logged and reported via S_ERROR.
        """
        try:
            self.rsDB = ResourceStatusDB()
            self.rmDB = ResourceManagementDB()
            self.ResourcesToBeChecked = Queue.Queue()
            self.ResourceNamesInCheck = []

            self.maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
            self.threadPool = ThreadPool(self.maxNumberOfThreads, self.maxNumberOfThreads)
            if not self.threadPool:
                self.log.error('Can not create Thread Pool')
                return S_ERROR('Can not create Thread Pool')

            self.setup = getSetup()['Value']

            # load the VO-specific policy configuration module
            self.VOExtension = getExt()
            configModule = __import__(self.VOExtension + "DIRAC.ResourceStatusSystem.Policy.Configurations",
                                      globals(), locals(), ['*'])
            self.Resources_check_freq = copy.deepcopy(configModule.Resources_check_freq)

            self.nc = NotificationClient()
            self.diracAdmin = DiracAdmin()
            self.csAPI = CSAPI()

            # spawn one long-lived checker per pool slot
            for _ in xrange(self.maxNumberOfThreads):
                self.threadPool.generateJobAndQueueIt(self._executeCheck, args=(None, ))

            return S_OK()

        except Exception:
            errorStr = "RSInspectorAgent initialization"
            gLogger.exception(errorStr)
            return S_ERROR(errorStr)

    #############################################################################

    def execute(self):
        """ Main execution method: fetch the resources due for a check and
            enqueue them for the checker threads.

            Feeds self.ResourcesToBeChecked (a Queue) and records the names in
            self.ResourceNamesInCheck (a list).
        """
        try:
            candidates = self.rsDB.getStuffToCheck('Resources', self.Resources_check_freq)

            for resourceTuple in candidates:
                # NOTE(review): 'break' stops the whole scan on the first
                # already-in-check resource (it does not merely skip it) --
                # confirm this is intended rather than 'continue'
                if resourceTuple[0] in self.ResourceNamesInCheck:
                    break
                resourceL = ['Resource']
                resourceL.extend(resourceTuple)
                self.ResourceNamesInCheck.insert(0, resourceL[1])
                self.ResourcesToBeChecked.put(resourceL)

            return S_OK()

        except Exception as x:
            errorStr = where(self, self.execute)
            gLogger.exception(errorStr, lException=x)
            return S_ERROR(errorStr)
class SiteInspectorAgent(AgentModule):
    """ SiteInspectorAgent

    Agent that collects all the site names and triggers the PEP to
    evaluate their status.
    """

    # default cap on worker threads
    __maxNumberOfThreads = 15

    # Default inspection frequencies; the lower the value, the higher the
    # checking priority. 'Error' usually means a glitch somewhere, so it gets
    # the highest priority.
    __checkingFreqs = {'Active': 20,
                       'Degraded': 20,
                       'Probing': 20,
                       'Banned': 15,
                       'Unknown': 10,
                       'Error': 5}

    def __init__(self, *args, **kwargs):
        AgentModule.__init__(self, *args, **kwargs)

        # queue of site dictionaries, filled each cycle by execute()
        self.sitesToBeChecked = None
        self.threadPool = None
        self.siteClient = None
        # clients handed to the PEP by the worker threads
        self.clients = {}

    def initialize(self):
        """ Standard initialize: create the thread pool and the RSS clients. """
        poolSize = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads)
        self.threadPool = ThreadPool(poolSize, poolSize)

        self.siteClient = SiteStatus()
        self.clients['SiteStatus'] = self.siteClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()

        return S_OK()

    def execute(self):
        """ execute

        Main method of the agent. Loads the sites from the database into a
        queue, spawns enough worker threads to drain it within the polling
        time, and blocks until every site has been processed.
        """
        sitesToBeChecked = self.getSitesToBeChecked()
        if not sitesToBeChecked['OK']:
            self.log.error(sitesToBeChecked['Message'])
            return sitesToBeChecked
        self.sitesToBeChecked = sitesToBeChecked['Value']

        queueSize = self.sitesToBeChecked.qsize()
        pollingTime = self.am_getPollingTime()

        # Size the pool on the fly so the polling time is exhausted without
        # spawning too many threads; assume ~10 s per element (in practice it
        # takes roughly 1 s each).
        numberOfThreads = int(math.ceil(queueSize * 10. / pollingTime))

        self.log.info('Needed %d threads to process %d elements' % (numberOfThreads, queueSize))

        for _x in xrange(numberOfThreads):
            jobUp = self.threadPool.generateJobAndQueueIt(self._execute)
            if not jobUp['OK']:
                self.log.error(jobUp['Message'])

        self.log.info('blocking until all sites have been processed')
        # do not finish the cycle until every queued site is task_done()
        self.sitesToBeChecked.join()
        self.log.info('done')
        return S_OK()

    def getSitesToBeChecked(self):
        """ getElementsToBeChecked

        Collects every site name from the SiteStatus table, looks up its
        current status and queues one descriptor dictionary per site.
        """
        toBeChecked = Queue.Queue()

        res = self.siteClient.getSites('All')
        if not res['OK']:
            return res

        # get the current status
        res = self.siteClient.getSiteStatuses(res['Value'])
        if not res['OK']:
            return res

        statuses = res['Value']
        for site in statuses:
            toBeChecked.put({'status': statuses.get(site, 'Unknown'),
                             'name': site,
                             'site': site,
                             'element': 'Site',
                             'statusType': 'all',
                             'elementType': 'Site'})

        return S_OK(toBeChecked)

    # Private methods ............................................................

    def _execute(self):
        """ Worker-thread body.

        Loops draining the site queue: for each site it runs the PEP policy
        enforcement, then marks the queue item done (paired with the join()
        in execute()). Returns once the queue is empty.
        """
        pep = PEP(clients=self.clients)

        while True:
            try:
                site = self.sitesToBeChecked.get_nowait()
            except Queue.Empty:
                return S_OK()

            resEnforce = pep.enforce(site)
            if not resEnforce['OK']:
                self.log.error('Failed policy enforcement', resEnforce['Message'])

            # exactly one task_done() per get_nowait(), success or failure
            self.sitesToBeChecked.task_done()
class FTSMonitorAgent(AgentModule):
    """
    .. class:: FTSMonitorAgent

    Monitor submitted FTS jobs.

    Every cycle each FTSReq found in the TransferDB is polled on its FTS
    server; once a job is terminal the per-file outcome is propagated to the
    FileToFTS, FileToCat, Channel and FTSReq tables.
    """
    # # transfer DB handle
    transferDB = None
    # # thread pool
    threadPool = None
    # # min threads
    minThreads = 1
    # # max threads
    maxThreads = 10

    # # regexp patterns identifying a missing-source failure reported by the FTS server
    missingSourceErrors = [
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed"),
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory"),
        re.compile(r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed"),
        re.compile(r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist"),
        re.compile(r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"
                   " Command failed. : open error: No such file or directory"),
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist")]

    def initialize(self):
        """ agent's initialisation

        Reads the MinThreads/MaxThreads options -- the two values are
        normalised so that min <= max regardless of how they are configured --
        and creates the daemonized monitoring thread pool.

        :return: S_OK
        """
        self.transferDB = TransferDB()
        self.am_setOption("shifterProxy", "DataManager")
        self.minThreads = self.am_getOption("MinThreads", self.minThreads)
        self.maxThreads = self.am_getOption("MaxThreads", self.maxThreads)
        minmax = (abs(self.minThreads), abs(self.maxThreads))
        self.minThreads, self.maxThreads = min(minmax), max(minmax)
        self.log.info("ThreadPool min threads = %s" % self.minThreads)
        self.log.info("ThreadPool max threads = %s" % self.maxThreads)
        self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
        self.threadPool.daemonize()
        return S_OK()

    def execute(self):
        """ push jobs to the thread pool

        Queues one :meth:`monitorTransfer` task per FTSReq found in the
        TransferDB; when the pool is saturated, retries every second until
        the task is accepted.
        """
        self.log.info("Obtaining requests to monitor")
        res = self.transferDB.getFTSReq()
        if not res["OK"]:
            self.log.error("Failed to get FTS requests", res['Message'])
            return res
        if not res["Value"]:
            self.log.info("No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res["Value"]
        self.log.info("Found %s FTS jobs" % len(ftsReqs))
        i = 1
        for ftsJob in ftsReqs:
            while True:
                self.log.debug("submitting FTS Job %s FTSReqID=%s to monitor" % (i, ftsJob["FTSReqID"]))
                ret = self.threadPool.generateJobAndQueueIt(self.monitorTransfer, args=(ftsJob, ), )
                if ret["OK"]:
                    i += 1
                    break
                # # sleep 1 second before retrying a saturated pool
                time.sleep(1)
        self.threadPool.processAllResults()
        return S_OK()

    def ftsJobExpired(self, ftsReqID, channelID):
        """ clean up when FTS job had expired on the server side

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :return: S_OK when the Channel and FTSReq tables were updated, the
          failing result otherwise
        """
        log = gLogger.getSubLogger("@%s" % str(ftsReqID))
        fileIDs = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not fileIDs["OK"]:
            log.error("Unable to retrieve FileIDs associated to %s request" % ftsReqID)
            return fileIDs
        fileIDs = fileIDs["Value"]

        # # update FileToFTS table; this is just a clean up, failures here are
        # # logged but do not abort the expiry handling
        for fileID in fileIDs:
            fileStatus = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID,
                                                                   "Status", "Failed")
            if not fileStatus["OK"]:
                log.error("Unable to set FileToFTS status to 'Failed' for FileID %s: %s" %
                          (fileID, fileStatus["Message"]))

            failReason = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID,
                                                                   "Reason", "FTS job expired on server")
            if not failReason["OK"]:
                log.error("Unable to set FileToFTS reason for FileID %s: %s" %
                          (fileID, failReason["Message"]))

        # # update Channel table so the files are retried
        resetChannels = self.transferDB.resetFileChannelStatus(channelID, fileIDs)
        if not resetChannels["OK"]:
            log.error("Failed to reset Channel table for files to retry")
            return resetChannels

        # # update FTSReq table so the job is not monitored again
        log.info("Setting FTS request status to 'Finished'")
        ftsReqStatus = self.transferDB.setFTSReqStatus(ftsReqID, "Finished")
        if not ftsReqStatus["OK"]:
            log.error("Failed update FTS Request status", ftsReqStatus["Message"])
            return ftsReqStatus

        # # if we land here, everything should be OK
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """ monitors transfer obtained from TransferDB

        :param dict ftsReqDict: FTS job dictionary
        """
        ftsReqID = ftsReqDict.get("FTSReqID")
        ftsGUID = ftsReqDict.get("FTSGuid")
        ftsServer = ftsReqDict.get("FTSServer")
        channelID = ftsReqDict.get("ChannelID")
        sourceSE = ftsReqDict.get("SourceSE")
        targetSE = ftsReqDict.get("TargetSE")

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        log = gLogger.getSubLogger("@%s" % str(ftsReqID))

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        log.info("Perform summary update of the FTS Request")
        infoStr = ["glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)]
        infoStr.append("FTS GUID: %s" % ftsGUID)
        infoStr.append("FTS Server: %s" % ftsServer)
        log.info("\n".join(infoStr))
        res = oFTSRequest.summary()
        # record the monitoring attempt whatever the outcome of summary()
        self.transferDB.setFTSReqLastMonitor(ftsReqID)
        if not res["OK"]:
            log.error("Failed to update the FTS request summary", res["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
                # the server forgot the job: expire it on our side as well
                log.error("FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side")
                cleanUp = self.ftsJobExpired(ftsReqID, channelID)
                if not cleanUp["OK"]:
                    log.error(cleanUp["Message"])
                return cleanUp
            return res

        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            log.error("Failed to get FTS request summary", res["Message"])
            return res
        log.info(res['Value'])
        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            log.error("Failed to get FTS percentage complete", res["Message"])
            return res
        log.info('FTS Request found to be %.1f percent complete' % res["Value"])
        self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete", res["Value"])
        self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res["OK"]:
            log.error("Failed to determine whether FTS request terminal", res["Message"])
            return res
        if not res["Value"]:
            return S_OK()
        # # request is terminal
        return self.terminalRequest(oFTSRequest, ftsReqID, channelID, sourceSE)

    def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
        """ process terminal FTS job

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param str sourceSE: FTSReq.SourceSE
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        log.info("FTS Request found to be terminal, updating file states")

        #########################################################################
        # Get the LFNS associated to the FTS request
        log.info("Obtaining the LFNs associated to this request")
        res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
        if not res["OK"]:
            log.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res["Value"]
        if not files:
            log.error("No files present for transfer")
            return S_ERROR("No files were found in the DB")

        lfns = files.keys()
        log.debug("Obtained %s files" % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res["OK"]:
            log.error("Failed to perform detailed monitoring of FTS request", res["Message"])
            return res
        res = oFTSRequest.getFailed()
        if not res["OK"]:
            log.error("Failed to obtained failed files for FTS request", res["Message"])
            return res
        failedFiles = res["Value"]
        res = oFTSRequest.getDone()
        if not res["OK"]:
            log.error("Failed to obtained successful files for FTS request", res["Message"])
            return res
        completedFiles = res["Value"]

        # An LFN can be included more than once if it was entered into more than one Request.
        # FTS will only do the transfer once. We need to identify all FileIDs
        res = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not res["OK"]:
            log.error("Failed to get FileIDs associated to FTS Request", res["Message"])
            return res
        fileIDs = res["Value"]
        res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
        if not res["OK"]:
            log.error("Failed to get LFNs associated to FTS Request", res["Message"])
            return res
        fileIDDict = res["Value"]

        fileToFTSUpdates = []
        completedFileIDs = []
        filesToRetry = []
        filesToFail = []

        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in completedFiles:
                completedFileIDs.append(fileID)
                transferTime = 0
                res = oFTSRequest.getTransferTime(lfn)
                if res["OK"]:
                    transferTime = res["Value"]
                fileToFTSUpdates.append((fileID, "Completed", "", 0, transferTime))

            if lfn in failedFiles:
                failReason = ""
                res = oFTSRequest.getFailReason(lfn)
                if res["OK"]:
                    failReason = res["Value"]
                if "Source file/user checksum mismatch" in failReason:
                    # checksum mismatch: fail the file without a FileToFTS update
                    filesToFail.append(fileID)
                    continue
                if self.missingSource(failReason):
                    log.error("The source SURL does not exist.",
                              "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                    filesToFail.append(fileID)
                else:
                    filesToRetry.append(fileID)
                log.error("Failed to replicate file on channel.", "%s %s" % (channelID, failReason))
                fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

        # # update TransferDB.FileToFTS table
        updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                               filesToRetry, filesToFail,
                                               completedFileIDs, fileToFTSUpdates)

        if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
            res = oFTSRequest.finalize()
            if not res["OK"]:
                log.error("Failed to perform the finalization for the FTS request", res["Message"])
                return res

            log.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to add logging event for FTS Request', res['Message'])

            # update TransferDB.FileToCat table
            updateFileToCat = self.updateFileToCat(oFTSRequest, channelID,
                                                   fileIDDict, completedFiles, filesToFail)
            if not updateFileToCat["OK"]:
                log.error(updateFileToCat["Message"])

            log.debug("Updating FTS request status")
            res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed update FTS Request status', res['Message'])
        return S_OK()

    def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail,
                        completedFileIDs, fileToFTSUpdates):
        """ update TransferDB.FileToFTS table for finished request

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param list filesToRetry: FileIDs to retry
        :param list filesToFail: FileIDs for failed files
        :param list completedFileIDs: files completed
        :param list fileToFTSUpdates: (fileID, status, reason, retries, transferTime) tuples
        :return: S_OK( allUpdated ) -- allUpdated is False when any DB update failed
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)

        allUpdated = True

        res = self.transferDB.resetFileChannelStatus(channelID, filesToRetry) \
            if filesToRetry else S_OK()
        if not res["OK"]:
            log.error("Failed to update the Channel table for file to retry.", res["Message"])
            allUpdated = False

        for fileID in filesToFail:
            log.info("Updating the Channel table for files to reschedule")
            res = self.transferDB.setFileToReschedule(fileID)
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.", res["Message"])
                allUpdated = False
            elif res["Value"] == "max reschedule attempt reached":
                # FIX: the original format string had no %s placeholder, so this
                # log call raised TypeError instead of logging the reason
                log.error("setting Channel status to 'Failed' : %s" % res["Value"])
                res = self.transferDB.setFileChannelStatus(channelID, fileID, 'Failed')
                if not res["OK"]:
                    log.error("Failed to update Channel table for failed files.", res["Message"])
                    allUpdated = False

        if completedFileIDs:
            res = self.transferDB.updateCompletedChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error("Failed to update the Channel table for successful files.", res["Message"])
                allUpdated = False
            res = self.transferDB.updateAncestorChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error('Failed to update the Channel table for ancestors of successful files.',
                          res['Message'])
                allUpdated = False

        if fileToFTSUpdates:
            res = self.transferDB.setFileToFTSFileAttributes(ftsReqID, channelID, fileToFTSUpdates)
            if not res["OK"]:
                log.error("Failed to update the FileToFTS table for files.", res["Message"])
                allUpdated = False

        return S_OK(allUpdated)

    def updateFileToCat(self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail):
        """ update TransferDB.FileToCat table for finished request

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int channelID: FTSReq.ChannelID
        :param dict fileIDDict: fileIDs dictionary
        :param list completedFiles: LFNs transferred successfully
        :param list filesToFail: FileIDs for permanently failed files
        """
        res = oFTSRequest.getFailedRegistrations()
        if not res["OK"]:
            # FIX: res["Value"] used to be read unconditionally, so a failed
            # call raised KeyError here; propagate the error instead
            res["Message"] = "Failed to get failed registrations: %s" % res["Message"]
            return res
        failedRegistrations = res["Value"]

        regFailedFileIDs = []
        regDoneFileIDs = []
        regForgetFileIDs = []
        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in failedRegistrations:
                regFailedFileIDs.append(fileID)
                # if the LFN appears more than once, FileToCat needs to be reset only once
                del failedRegistrations[lfn]
            elif lfn in completedFiles:
                regDoneFileIDs.append(fileID)
            elif fileID in filesToFail:
                regForgetFileIDs.append(fileID)

        res = self.transferDB.setRegistrationWaiting(channelID, regFailedFileIDs) \
            if regFailedFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
            return res

        res = self.transferDB.setRegistrationDone(channelID, regDoneFileIDs) \
            if regDoneFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        # These entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone(channelID, regForgetFileIDs) \
            if regForgetFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        return S_OK()

    @classmethod
    def missingSource(cls, failReason):
        """ check if message sent by FTS server is concerning a missing source file

        :param str failReason: message sent by FTS server
        :return: 1 when the reason matches a known missing-source pattern, 0 otherwise
        """
        for error in cls.missingSourceErrors:
            if error.search(failReason):
                return 1
        return 0
class Publisher:
    """
    Class Publisher is in charge of getting dispersed information,
    to be published on the web.
    """

    #############################################################################

    def __init__(self, VOExtension, rsDBIn=None, commandCallerIn=None,
                 infoGetterIn=None, WMSAdminIn=None):
        """ Standard constructor

        :params:
          :attr:`VOExtension`: string, VO Extension (e.g. 'LHCb')

          :attr:`rsDBIn`: optional ResourceStatusDB object
          (see :class: `DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.ResourceStatusDB`)

          :attr:`commandCallerIn`: optional CommandCaller object
          (see :class: `DIRAC.ResourceStatusSystem.Command.CommandCaller.CommandCaller`)

          :attr:`infoGetterIn`: optional InfoGetter object
          (see :class: `DIRAC.ResourceStatusSystem.Utilities.InfoGetter.InfoGetter`)

          :attr:`WMSAdminIn`: optional RPCClient object for WMSAdmin
          (see :class: `DIRAC.Core.DISET.RPCClient.RPCClient`)
        """
        # VO-specific policy configuration module, imported dynamically
        self.configModule = __import__(
            VOExtension + "DIRAC.ResourceStatusSystem.Policy.Configurations",
            globals(), locals(), ['*'])

        # Each dependency can be injected (for testing) or lazily constructed here
        if rsDBIn is not None:
            self.rsDB = rsDBIn
        else:
            from DIRAC.ResourceStatusSystem.DB.ResourceStatusDB import ResourceStatusDB
            self.rsDB = ResourceStatusDB()

        if commandCallerIn is not None:
            self.cc = commandCallerIn
        else:
            from DIRAC.ResourceStatusSystem.Command.CommandCaller import CommandCaller
            self.cc = CommandCaller()

        if infoGetterIn is not None:
            self.ig = infoGetterIn
        else:
            from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import InfoGetter
            self.ig = InfoGetter(VOExtension)

        if WMSAdminIn is not None:
            self.WMSAdmin = WMSAdminIn
        else:
            from DIRAC.Core.DISET.RPCClient import RPCClient
            self.WMSAdmin = RPCClient("WorkloadManagement/WMSAdministrator")

        # 2-5 worker threads collect per-policy info concurrently in getInfo()
        self.threadPool = ThreadPool(2, 5)

        # Protects self.infoForPanel_res, written by the worker threads
        self.lockObj = threading.RLock()

        self.infoForPanel_res = {}

    #############################################################################

    def getInfo(self, granularity, name, useNewRes=False):
        """ Standard method to get all the info to be published

        This method uses a ThreadPool (:class:`DIRAC.Core.Utilities.ThreadPool.ThreadPool`)
        with 2-5 threads. The threaded method is
        :meth:`DIRAC.ResourceStatusSystem.Utilities.Publisher.Publisher.getInfoForPanel`

        :params:
          :attr:`granularity`: string - a ValidRes

          :attr:`name`: string - name of the Validres

          :attr:`useNewRes`: boolean. When set to true, will get new results,
          otherwise it will get cached results (where available).
        """

        if granularity not in ValidRes:
            raise InvalidRes, where(self, self.getInfo)

        # reset shared per-call accumulator filled by getInfoForPanel threads
        self.infoForPanel_res = {}

        status = None
        formerStatus = None
        siteType = None
        serviceType = None
        resourceType = None

        if granularity in ('Resource', 'Resources'):
            try:
                resourceType = self.rsDB.getMonitoredsList(
                    'Resource', ['ResourceType'], resourceName=name)[0][0]
            except IndexError:
                # NOTE: returns a plain string here instead of an S_ERROR-like dict
                return "%s does not exist!" % name

        if granularity in ('StorageElement', 'StorageElements'):
            try:
                siteType = self.rsDB.getMonitoredsList(
                    'StorageElement', ['SiteType'], storageElementName=name)[0][0]
            except IndexError:
                return "%s does not exist!" % name

        paramNames = [
            'Type', 'Group', 'Name', 'Policy', 'DIRAC Status', 'RSS Status',
            'Reason', 'Description'
        ]

        # panels to fill for this granularity/name, as configured in InfoGetter
        infoToGet = self.ig.getInfoToApply(('view_info', ),
                                           granularity,
                                           status=status,
                                           formerStatus=formerStatus,
                                           siteType=siteType,
                                           serviceType=serviceType,
                                           resourceType=resourceType,
                                           useNewRes=useNewRes)[0]['Panels']
        infoToGet_res = {}

        recordsList = []
        infosForPolicy = {}

        for panel in infoToGet.keys():

            (granularityForPanel, nameForPanel) = self.__getNameForPanel(granularity, name, panel)

            if not self._resExist(granularityForPanel, nameForPanel):
                # completeInfoForPanel_res = None
                continue

            # take composite RSS result for name
            nameStatus_res = self._getStatus(nameForPanel, panel)

            recordBase = [None, None, None, None, None, None, None, None]

            recordBase[1] = panel.replace('_Panel', '')
            recordBase[2] = nameForPanel  # nameForPanel
            try:
                recordBase[4] = nameStatus_res[nameForPanel]['DIRACStatus']  # DIRAC Status
            except:
                # not every panel carries a DIRAC status (only Site/SE panels do)
                pass
            recordBase[5] = nameStatus_res[nameForPanel]['RSSStatus']  # RSS Status

            record = copy.deepcopy(recordBase)
            record[0] = 'ResultsForResource'

            recordsList.append(record)

            # take info that goes into the panel
            infoForPanel = infoToGet[panel]

            # one thread per policy; results land in self.infoForPanel_res
            for info in infoForPanel:
                self.threadPool.generateJobAndQueueIt(
                    self.getInfoForPanel,
                    args=(info, granularityForPanel, nameForPanel))

            # block until every policy of this panel has been collected
            self.threadPool.processAllResults()

            for policy in [x.keys()[0] for x in infoForPanel]:
                record = copy.deepcopy(recordBase)
                record[0] = 'SpecificInformation'
                record[3] = policy  # policyName
                record[4] = None  # DIRAC Status
                record[5] = self.infoForPanel_res[policy]['Status']  # RSS status for the policy
                record[6] = self.infoForPanel_res[policy]['Reason']  # Reason
                record[7] = self.infoForPanel_res[policy]['desc']  # Description
                recordsList.append(record)

                infosForPolicy[policy] = self.infoForPanel_res[policy]['infos']

        infoToGet_res['TotalRecords'] = len(recordsList)
        infoToGet_res['ParameterNames'] = paramNames
        infoToGet_res['Records'] = recordsList
        infoToGet_res['Extras'] = infosForPolicy

        return infoToGet_res

    #############################################################################

    def getInfoForPanel(self, info, granularityForPanel, nameForPanel):
        """ Threaded worker: evaluate one policy for one panel and store the
        result (status/reason/description/extra infos) in self.infoForPanel_res
        under the policy name. Runs inside the ThreadPool started by getInfo().
        """

        # get single RSS policy results
        policyResToGet = info.keys()[0]
        pol_res = self.rsDB.getPolicyRes(nameForPanel, policyResToGet)
        if pol_res != []:
            pol_res_dict = {'Status': pol_res[0], 'Reason': pol_res[1]}
        else:
            pol_res_dict = {'Status': 'Unknown', 'Reason': 'Unknown'}
        # shared-dict write: must hold the lock
        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet] = pol_res_dict
        finally:
            self.lockObj.release()

        # get policy description
        desc = self._getPolicyDesc(policyResToGet)

        # get other info
        othersInfo = info.values()[0]
        if not isinstance(othersInfo, list):
            othersInfo = [othersInfo]

        info_res = {}

        for oi in othersInfo:
            format = oi.keys()[0]
            what = oi.values()[0]

            info_bit_got = self._getInfo(granularityForPanel, nameForPanel, format, what)

            info_res[format] = info_bit_got

        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet]['infos'] = info_res
            self.infoForPanel_res[policyResToGet]['desc'] = desc
        finally:
            self.lockObj.release()

    #############################################################################

    def _getStatus(self, name, panel):
        """ Return { name : { 'RSSStatus' : ..., 'DIRACStatus' : ... } }.
        DIRACStatus is only filled for the Site and SE panels.
        """

        # get RSS status
        RSSStatus = self._getInfoFromRSSDB(name, panel)[0][1]

        # get DIRAC status
        if panel in ('Site_Panel', 'SE_Panel'):

            if panel == 'Site_Panel':
                DIRACStatus = self.WMSAdmin.getSiteMaskLogging(name)
                if DIRACStatus['OK']:
                    # most recent mask-logging entry for this site
                    DIRACStatus = DIRACStatus['Value'][name].pop()[0]
                else:
                    raise RSSException, where(self, self._getStatus)

            elif panel == 'SE_Panel':
                ra = getStorageElementStatus(name, 'ReadAccess')['Value']
                wa = getStorageElementStatus(name, 'WriteAccess')['Value']
                DIRACStatus = {'ReadAccess': ra, 'WriteAccess': wa}

            status = {
                name: {
                    'RSSStatus': RSSStatus,
                    'DIRACStatus': DIRACStatus
                }
            }

        else:
            status = {name: {'RSSStatus': RSSStatus}}

        return status

    #############################################################################

    def _getInfo(self, granularity, name, format, what):
        """ Dispatch one info request: 'RSS' formats are read from the RSS DB,
        anything else is executed through the CommandCaller.
        """

        if format == 'RSS':
            info_bit_got = self._getInfoFromRSSDB(name, what)
        else:
            if isinstance(what, dict):
                command = what['CommandIn']
                extraArgs = what['args']
            else:
                command = what
                extraArgs = None

            info_bit_got = self.cc.commandInvocation(granularity, name, None,
                                                     None, command, extraArgs)

            try:
                info_bit_got = info_bit_got['Result']
            except:
                # some commands return the bare value instead of a dict
                pass

        return info_bit_got

    #############################################################################

    def _getInfoFromRSSDB(self, name, what):
        """ Translate a panel/info keyword into a getMonitoredsList query and
        run it against the RSS DB. Raises RSSException on GOC name resolution
        failures.
        """

        paramsL = ['Status']

        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None
        serviceType = None
        gridSiteName = None

        if what == 'ServiceOfSite':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            paramsL.append('Reason')
            siteName = name
        elif what == 'ResOfCompService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorEl':
            gran = 'StorageElements'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            storageElementName = name
        elif what == 'StorageElementsOfSite':
            gran = 'StorageElements'
            paramsL.insert(0, 'StorageElementName')
            paramsL.append('Reason')
            if '@' in name:
                DIRACsiteName = name.split('@').pop()
            else:
                DIRACsiteName = name
            gridSiteName = getGOCSiteName(DIRACsiteName)
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'Site_Panel':
            gran = 'Site'
            paramsL.insert(0, 'SiteName')
            siteName = name
        elif what == 'Service_Computing_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_Storage_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VO-BOX_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VOMS_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Resource_Panel':
            gran = 'Resource'
            paramsL.insert(0, 'ResourceName')
            resourceName = name
        elif what == 'SE_Panel':
            gran = 'StorageElement'
            paramsL.insert(0, 'StorageElementName')
            storageElementName = name

        info_bit_got = self.rsDB.getMonitoredsList(
            gran,
            paramsList=paramsL,
            siteName=siteName,
            serviceName=serviceName,
            serviceType=serviceType,
            resourceName=resourceName,
            storageElementName=storageElementName,
            gridSiteName=gridSiteName)

        return info_bit_got

    #############################################################################

    def _getPolicyDesc(self, policyName):
        """ Look up the human-readable description of a policy in the
        VO-specific Configurations module. """

        return self.configModule.Policies[policyName]['Description']

    #############################################################################

    def __getNameForPanel(self, granularity, name, panel):
        """ Map a Site-level (granularity, name) to the Service-level pair a
        given panel refers to (e.g. Site X + Service_Computing_Panel ->
        ('Service', 'Computing@X')). Other granularities pass through. """

        if granularity in ('Site', 'Sites'):
            if panel == 'Service_Computing_Panel':
                granularity = 'Service'
                name = 'Computing@' + name
            elif panel == 'Service_Storage_Panel':
                granularity = 'Service'
                name = 'Storage@' + name
            elif panel == 'OtherServices_Panel':
                granularity = 'Service'
                name = 'OtherS@' + name
            elif panel == 'Service_VOMS_Panel':
                granularity = 'Service'
                name = 'VOMS@' + name
            elif panel == 'Service_VO-BOX_Panel':
                granularity = 'Service'
                name = 'VO-BOX@' + name
        # else:
        #   granularity = granularity
        #   name = name
        # else:
        #   granularity = granularity
        #   name = name

        return (granularity, name)

    #############################################################################

    def _resExist(self, granularity, name):
        """ Check whether (granularity, name) is present in the RSS DB. """

        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None

        if granularity in ('Site', 'Sites'):
            siteName = name
        elif granularity in ('Service', 'Services'):
            serviceName = name
        elif granularity in ('Resource', 'Resources'):
            resourceName = name
        elif granularity in ('StorageElement', 'StorageElements'):
            storageElementName = name

        res = self.rsDB.getMonitoredsList(
            granularity,
            siteName=siteName,
            serviceName=serviceName,
            resourceName=resourceName,
            storageElementName=storageElementName)

        if res == []:
            return False
        else:
            return True
class StElWriteInspectorAgent( AgentModule ):
  """ Class StElWriteInspectorAgent is in charge of going through StorageElements
      table, and pass StorageElement and Status to the PEP
  """

#############################################################################

  def initialize( self ):
    """ Standard constructor

        Sets up the DB clients, a queue of StorageElements to be checked and a
        pool of worker threads, each running self._executeCheck in a loop.
    """

    try:
      self.rsDB = ResourceStatusDB()
      self.rmDB = ResourceManagementDB()

      # work queue fed by execute(), drained by the _executeCheck threads
      self.StorageElementToBeChecked = Queue.Queue()
      # names currently queued/being checked, to avoid double submission
      self.StorageElementInCheck = []

      self.maxNumberOfThreads = self.am_getOption( 'maxThreadsInPool', 1 )
      self.threadPool = ThreadPool( self.maxNumberOfThreads,
                                    self.maxNumberOfThreads )

      if not self.threadPool:
        self.log.error( 'Can not create Thread Pool' )
        return S_ERROR( 'Can not create Thread Pool' )

      self.setup = getSetup()[ 'Value' ]
      self.VOExtension = getExt()
      self.StorageElsWriteFreqs = CheckingFreqs[ 'StorageElsWriteFreqs' ]
      self.nc = NotificationClient()
      self.diracAdmin = DiracAdmin()
      self.csAPI = CSAPI()

      # long-lived worker threads; each one consumes from the queue forever
      for _i in xrange( self.maxNumberOfThreads ):
        self.threadPool.generateJobAndQueueIt( self._executeCheck, args = ( None, ) )

      return S_OK()

    except Exception:
      errorStr = "StElWriteInspectorAgent initialization"
      gLogger.exception( errorStr )
      return S_ERROR( errorStr )

#############################################################################

  def execute( self ):
    """ The main RSInspectorAgent execution method.
        Calls :meth:`DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.getResourcesToCheck`
        and put result in self.StorageElementToBeChecked (a Queue) and in
        self.StorageElementInCheck (a list)
    """

    try:
      res = self.rsDB.getStuffToCheck( 'StorageElementsWrite',
                                       self.StorageElsWriteFreqs )

      for resourceTuple in res:
        # NOTE(review): 'break' stops feeding as soon as ONE element is already
        # in check, skipping all remaining eligible elements this cycle —
        # confirm this is intended (vs. 'continue')
        if resourceTuple[ 0 ] in self.StorageElementInCheck:
          break
        # prepend the check type so workers know what they are evaluating
        resourceL = [ 'StorageElementWrite' ]
        for x in resourceTuple:
          resourceL.append( x )
        # resourceL[1] is the element name
        self.StorageElementInCheck.insert( 0, resourceL[ 1 ] )
        self.StorageElementToBeChecked.put( resourceL )

      return S_OK()

    except Exception, x:
      errorStr = where( self, self.execute )
      gLogger.exception( errorStr, lException = x )
      return S_ERROR( errorStr )
class OutputDataExecutor:
  """ Drains locally produced output files: for every transfer path defined in
      the CS it lists the files at InputPath (local disk or a FileCatalog),
      uploads them to the configured OutputSE/OutputFC through a thread pool,
      and removes them from the input location once safely transferred.
  """

  def __init__( self, csPath = "" ):
    """ :param str csPath: CS section with the transfer path definitions;
        defaults to /Operations/<vo>/OutputData
    """
    self.log = gLogger.getSubLogger( "OutputDataExecutor" )
    if not csPath:
      vo = gConfig.getValue( "/DIRAC/VirtualOrganization", "" )
      self.__transfersCSPath = '/Operations/%s/OutputData' % vo
    else:
      self.__transfersCSPath = csPath
    self.log.verbose( "Reading transfer paths from %s" % self.__transfersCSPath )
    # every transfer path section must define all of these options
    self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']
    self.__threadPool = ThreadPool( gConfig.getValue( "%s/MinTransfers" % self.__transfersCSPath, 1 ),
                                    gConfig.getValue( "%s/MaxTransfers" % self.__transfersCSPath, 4 ),
                                    gConfig.getValue( "%s/MaxQueuedTransfers" % self.__transfersCSPath, 100 ) )
    self.__threadPool.daemonize()
    # basenames currently queued or being transferred (guarded by @transferSync)
    self.__processingFiles = set()
    self.__okTransferredFiles = 0
    self.__okTransferredBytes = 0
    # basename -> consecutive failure count
    self.__failedFiles = {}

  def getNumOKTransferredFiles( self ):
    """ Number of files successfully transferred since startup """
    return self.__okTransferredFiles

  def getNumOKTransferredBytes( self ):
    """ Total bytes successfully transferred since startup """
    return self.__okTransferredBytes

  def transfersPending( self ):
    """ True while the thread pool still has queued or running transfers """
    return self.__threadPool.isWorking()

  def getDefinedTransferPaths( self ):
    """ Read all transfer path sections from the CS.

        :return: S_OK( { name : optionsDict } ); sections missing any required
                 option are skipped with an error message
    """
    result = gConfig.getSections( self.__transfersCSPath )
    if not result['OK']:
      self.log.info( 'No Input/Output Pair defined in CS' )
      return S_OK()
    pathList = result['Value']
    tPaths = {}
    for name in pathList:
      csPath = self.__transfersCSPath + '/%s' % name
      result = gConfig.getOptionsDict( csPath )
      if not result['OK']:
        continue
      transferDict = result['Value']
      ok = True
      for i in self.__requiredCSOptions:
        if i not in transferDict:
          self.log.error( 'Missing Option %s in %s' % ( i, csPath ) )
          ok = False
          break
      if not ok:
        continue
      tPaths[ name ] = transferDict
    return S_OK( tPaths )

  def getNumLocalOutgoingFiles( self ):
    """ Count files waiting on local disk across all LocalDisk transfer paths """
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return 0
    localOutgoing = 0
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferDict = tPaths[ name ]
      if 'LocalDisk' != transferDict['InputFC']:
        continue
      localOutgoing += len( self.getOutgoingFiles( transferDict ) )
    return localOutgoing

  def getOutgoingFiles( self, transferDict ):
    """ Get list of files to be processed from InputPath

        :param dict transferDict: one transfer path definition
        :return: list of file names (basenames); empty list on any error
    """
    inputFCName = transferDict['InputFC']
    inputPath = transferDict['InputPath']
    if inputFCName == 'LocalDisk':
      files = []
      try:
        for fileName in os.listdir( inputPath ):
          if os.path.isfile( os.path.join( inputPath, fileName ) ):
            files.append( fileName )
      except:
        # best effort: an unreadable/missing directory yields an empty list
        pass
      return files
    inputFC = FileCatalog( [inputFCName] )
    result = inputFC.listDirectory( inputPath, True )
    if not result['OK']:
      self.log.error( result['Message'] )
      return []
    if not inputPath in result['Value']['Successful']:
      self.log.error( result['Value']['Failed'][inputPath] )
      return []
    subDirs = result['Value']['Successful'][inputPath]['SubDirs']
    files = result['Value']['Successful'][inputPath]['Files']
    for subDir in subDirs:
      self.log.info( 'Ignoring subdirectory:', subDir )
    return files.keys()

  def checkForTransfers( self ):
    """ Check for transfers to do and start them
    """
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return result
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferPath = tPaths[ name ]
      self.log.verbose( "Checking %s transfer path" % name )
      filesToTransfer = self.getOutgoingFiles( tPaths[ name ] )
      self.log.info( "Transfer path %s has %d files" % ( name, len( filesToTransfer ) ) )
      ret = self.__addFilesToThreadPool( filesToTransfer, transferPath )
      if not ret['OK']:
        # The thread pool got full
        break

  def processAllPendingTransfers( self ):
    """ Block until all queued transfers have been processed """
    self.__threadPool.processAllResults()

  @transferSync
  def __addFilesToThreadPool( self, files, transferDict ):
    """ Queue every file not already being processed; returns S_ERROR as soon
        as the thread pool refuses a job (pool full)
    """
    for fileName in files:
      fileName = os.path.basename( fileName )
      if fileName in self.__processingFiles:
        continue
      self.__processingFiles.add( fileName )
      # throttle submission so we do not flood the pool in one burst
      time.sleep( 1 )
      ret = self.__threadPool.generateJobAndQueueIt( self.__transferIfNotRegistered,
                                                     args = ( fileName, transferDict ),
                                                     oCallback = self.transferCallback,
                                                     blocking = False )
      if not ret['OK']:
        # The thread pool got full
        return ret
    return S_OK()

  def __transferIfNotRegistered( self, file, transferDict ):
    """ Transfer one file unless it is already registered in the output
        catalog; in that case delete the leftover input copy instead.
    """
    result = self.isRegisteredInOutputCatalog( file, transferDict )
    if not result[ 'OK' ]:
      self.log.error( result[ 'Message' ] )
      return result
    # Already registered. Need to delete
    if result[ 'Value' ]:
      self.log.info( "Transfer file %s is already registered in the output catalog" % file )
      # Delete
      filePath = os.path.join( transferDict[ 'InputPath' ], file )
      if transferDict[ 'InputFC' ] == 'LocalDisk':
        os.unlink( filePath )
      else:
        # BUGFIX: this branch used an undefined name 'inFile' (flagged by a
        # FIXME in the original); the input LFN is filePath, built above
        inputFC = FileCatalog( [ transferDict['InputFC'] ] )
        replicaDict = inputFC.getReplicas( filePath )
        if not replicaDict['OK']:
          self.log.error( "Error deleting file", replicaDict['Message'] )
        elif filePath not in replicaDict['Value']['Successful']:
          self.log.error( "Error deleting file", replicaDict['Value']['Failed'][filePath] )
        else:
          seList = replicaDict['Value']['Successful'][filePath].keys()
          for se in seList:
            se = StorageElement( se )
            self.log.info( 'Removing from %s:' % se.name, filePath )
            se.removeFile( filePath )
          # remove the catalog entry by its full LFN (was the bare basename)
          inputFC.removeFile( filePath )
      self.log.info( "File %s deleted from %s" % ( file, transferDict[ 'InputFC' ] ) )
      self.__processingFiles.discard( file )
      return S_OK( file )
    # Do the transfer
    return self.__retrieveAndUploadFile( file, transferDict )

  def isRegisteredInOutputCatalog( self, file, transferDict ):
    """ :return: S_OK( True ) if the file is already registered in the output
        catalog on one of the target SEs, S_OK( False ) otherwise
    """
    fc = FileCatalog( [ transferDict[ 'OutputFC' ] ] )
    lfn = os.path.join( transferDict['OutputPath'], os.path.basename( file ) )
    result = fc.getReplicas( lfn )
    if not result[ 'OK' ]:
      return result
    if lfn not in result[ 'Value' ][ 'Successful' ]:
      return S_OK( False )
    replicas = result[ 'Value' ][ 'Successful' ][ lfn ]
    for seName in List.fromChar( transferDict[ 'OutputSE' ], "," ):
      if seName in replicas:
        self.log.verbose( "Transfer file %s is already registered in %s SE" % ( file, seName ) )
        return S_OK( True )
    return S_OK( False )

  def __retrieveAndUploadFile( self, file, outputDict ):
    """ Retrieve, Upload, and remove
    """
    fileName = file
    inputPath = outputDict['InputPath']
    inputFCName = outputDict['InputFC']
    inBytes = 0
    if inputFCName == 'LocalDisk':
      inFile = file
      file = os.path.join( inputPath, file )
    else:
      inputFC = FileCatalog( [inputFCName] )

      inFile = os.path.join( inputPath, file )
      replicaDict = inputFC.getReplicas( inFile )
      if not replicaDict['OK']:
        self.log.error( replicaDict['Message'] )
        return S_ERROR( fileName )
      if not inFile in replicaDict['Value']['Successful']:
        self.log.error( replicaDict['Value']['Failed'][inFile] )
        return S_ERROR( fileName )
      seList = replicaDict['Value']['Successful'][inFile].keys()

      inputSE = StorageElement( seList[0] )
      self.log.info( 'Retrieving from %s:' % inputSE.name, inFile )
      # ret = inputSE.getFile( inFile )
      # lcg_util binding prevent multithreading, use subprocess instead
      res = pythonCall( 2 * 3600, inputSE.getFile, inFile )
      if not res['OK']:
        self.log.error( res['Message'] )
        return S_ERROR( fileName )
      ret = res['Value']
      if not ret['OK']:
        self.log.error( ret['Message'] )
        return S_ERROR( fileName )
      if not inFile in ret['Value']['Successful']:
        self.log.error( ret['Value']['Failed'][inFile] )
        return S_ERROR( fileName )

    if os.path.isfile( file ):
      inBytes = os.stat( file )[6]

    outputPath = outputDict['OutputPath']
    outputFCName = outputDict['OutputFC']
    replicaManager = ReplicaManager()
    outFile = os.path.join( outputPath, os.path.basename( file ) )
    transferOK = False
    for outputSEName in List.fromChar( outputDict['OutputSE'], "," ):
      outputSE = StorageElement( outputSEName )
      self.log.info( 'Trying to upload to %s:' % outputSE.name, outFile )
      # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
      # lcg_util binding prevent multithreading, use subprocess instead
      result = pythonCall( 2 * 3600, replicaManager.putAndRegister, outFile,
                           os.path.realpath( file ), outputSE.name, catalog = outputFCName )
      if result['OK'] and result['Value']['OK']:
        if outFile in result['Value']['Value']['Successful']:
          transferOK = True
          break
        else:
          self.log.error( result['Value']['Value']['Failed'][outFile] )
      else:
        if result['OK']:
          self.log.error( result['Value']['Message'] )
        else:
          self.log.error( result['Message'] )

    if not transferOK:
      return S_ERROR( fileName )

    if result['OK'] or not inputFCName == 'LocalDisk':
      os.unlink( file )

    if not result['OK']:
      # BUGFIX: logged ret['Message'] before, but 'ret' here is the successful
      # getFile result (no 'Message' key); the upload outcome is in 'result'
      self.log.error( result['Message'] )
      return S_ERROR( fileName )

    self.log.info( "Finished transferring %s [%s bytes]" % ( inFile, inBytes ) )
    self.__okTransferredFiles += 1
    self.__okTransferredBytes += inBytes

    if inputFCName == 'LocalDisk':
      return S_OK( fileName )

    # Now the file is on final SE/FC, remove from input SE/FC
    for se in seList:
      se = StorageElement( se )
      self.log.info( 'Removing from %s:' % se.name, inFile )
      se.removeFile( inFile )

    inputFC.removeFile( inFile )

    return S_OK( fileName )

  @transferSync
  def transferCallback( self, threadedJob, submitResult ):
    """ Thread-pool callback: record the per-file outcome and release the file
        from the in-progress set
    """
    if not submitResult['OK']:
      fileName = submitResult['Message']
      if fileName not in self.__failedFiles:
        self.__failedFiles[fileName] = 0
      self.__failedFiles[fileName] += 1
    else:
      fileName = submitResult['Value']
      if fileName in self.__failedFiles:
        del self.__failedFiles[fileName]
    # Take out from processing files
    if fileName in self.__processingFiles:
      self.__processingFiles.discard( fileName )
class ElementInspectorAgent( AgentModule ):
  """ ElementInspectorAgent

  The ElementInspector agent is a generic agent used to check the elements
  of one of the elementTypes ( e.g. Site, Resource, Node ).

  This Agent takes care of the Elements. In order to do so, it gathers
  the eligible ones and then evaluates their statuses with the PEP.
  """

  # Max number of worker threads by default
  __maxNumberOfThreads = 15

  # Inspection freqs, defaults, the lower, the higher priority to be checked.
  # Error state usually means there is a glitch somewhere, so it has the highest
  # priority.
  # Values are minutes between consecutive checks of an element in that status.
  __checkingFreqs = { 'Active'   : 20,
                      'Degraded' : 20,
                      'Probing'  : 20,
                      'Banned'   : 15,
                      'Unknown'  : 10,
                      'Error'    : 5 }

  def __init__( self, *args, **kwargs ):
    """ c'tor """

    AgentModule.__init__( self, *args, **kwargs )

    # ElementType, to be defined among Site, Resource or Node
    self.elementType = ''
    self.elementsToBeChecked = None
    self.threadPool = None
    self.rsClient = None
    self.clients = {}

  def initialize( self ):
    """ Standard initialize.

    Creates the thread pool and the RSS clients; fails if no elementType
    has been configured.
    """

    maxNumberOfThreads = self.am_getOption( 'maxNumberOfThreads',
                                            self.__maxNumberOfThreads )
    self.threadPool = ThreadPool( maxNumberOfThreads, maxNumberOfThreads )

    self.elementType = self.am_getOption( 'elementType', self.elementType )
    self.rsClient = ResourceStatusClient()

    # clients dict is handed to the PEP in _execute
    self.clients[ 'ResourceStatusClient' ] = self.rsClient
    self.clients[ 'ResourceManagementClient' ] = ResourceManagementClient()

    if not self.elementType:
      return S_ERROR( 'Missing elementType' )

    return S_OK()

  def execute( self ):
    """ execute

    This is the main method of the agent. It gets the elements from the
    Database which are eligible to be re-checked, calculates how many threads
    should be started and spawns them. Each thread will get an element from
    the queue until it is empty. At the end, the method will join the queue
    such that the agent will not terminate a cycle until all elements have
    been processed.
    """

    # Gets elements to be checked ( returns a Queue )
    elementsToBeChecked = self.getElementsToBeChecked()
    if not elementsToBeChecked[ 'OK' ]:
      self.log.error( elementsToBeChecked[ 'Message' ] )
      return elementsToBeChecked
    self.elementsToBeChecked = elementsToBeChecked[ 'Value' ]

    queueSize = self.elementsToBeChecked.qsize()
    pollingTime = self.am_getPollingTime()

    # Assigns number of threads on the fly such that we exhaust the PollingTime
    # without having to spawn too many threads. We assume 10 seconds per element
    # to be processed ( actually, it takes something like 1 sec per element ):
    # numberOfThreads = elements * 10(s/element) / pollingTime
    numberOfThreads = int( math.ceil( queueSize * 10. / pollingTime ) )

    self.log.info( 'Needed %d threads to process %d elements' % ( numberOfThreads, queueSize ) )

    for _x in xrange( numberOfThreads ):
      jobUp = self.threadPool.generateJobAndQueueIt( self._execute )
      if not jobUp[ 'OK' ]:
        self.log.error( jobUp[ 'Message' ] )

    self.log.info( 'blocking until all elements have been processed' )
    # block until all tasks are done
    self.elementsToBeChecked.join()
    self.log.info( 'done')

    return S_OK()

  def getElementsToBeChecked( self ):
    """ getElementsToBeChecked

    This method gets all the rows in the <self.elementType>Status table, and
    then discards entries with TokenOwner != rs_svc. On top of that, there are
    check frequencies that are applied: depending on the current status of the
    element, they will be checked more or less often.
    """

    toBeChecked = Queue.Queue()

    # We get all the elements, then we filter.
    elements = self.rsClient.selectStatusElement( self.elementType, 'Status' )
    if not elements[ 'OK' ]:
      return elements

    utcnow = datetime.datetime.utcnow().replace( microsecond = 0 )

    # filter elements by Type
    for element in elements[ 'Value' ]:

      # Maybe an overkill, but this way I have NEVER again to worry about order
      # of elements returned by mySQL on tuples
      elemDict = dict( zip( elements[ 'Columns' ], element ) )

      # This if-clause skips all the elements that are should not be checked yet
      timeToNextCheck = self.__checkingFreqs[ elemDict[ 'Status' ] ]
      if utcnow <= elemDict[ 'LastCheckTime' ] + datetime.timedelta( minutes = timeToNextCheck ):
        continue

      # We skip the elements with token different than "rs_svc"
      if elemDict[ 'TokenOwner' ] != 'rs_svc':
        self.log.verbose( 'Skipping %s ( %s ) with token %s' % ( elemDict[ 'Name' ],
                                                                 elemDict[ 'StatusType' ],
                                                                 elemDict[ 'TokenOwner' ] ))
        continue

      # We are not checking if the item is already on the queue or not. It may
      # be there, but in any case, it is not a big problem.

      # lower-case the first letter of each column name, as the PEP expects
      lowerElementDict = { 'element' : self.elementType }
      for key, value in elemDict.items():
        lowerElementDict[ key[0].lower() + key[1:] ] = value

      # We add lowerElementDict to the queue
      toBeChecked.put( lowerElementDict )
      self.log.verbose( '%s # "%s" # "%s" # %s # %s' % ( elemDict[ 'Name' ],
                                                         elemDict[ 'ElementType' ],
                                                         elemDict[ 'StatusType' ],
                                                         elemDict[ 'Status' ],
                                                         elemDict[ 'LastCheckTime' ]) )
    return S_OK( toBeChecked )

  # Private methods ............................................................

  def _execute( self ):
    """ Method run by the thread pool. It enters a loop until there are no
    elements on the queue. On each iteration, it evaluates the policies for
    such element and enforces the necessary actions. If there are no more
    elements in the queue, the loop is finished.
    """

    pep = PEP( clients = self.clients )

    while True:

      try:
        element = self.elementsToBeChecked.get_nowait()
      except Queue.Empty:
        return S_OK()

      self.log.verbose( '%s ( %s / %s ) being processed' % ( element[ 'name' ],
                                                             element[ 'status' ],
                                                             element[ 'statusType' ] ) )

      resEnforce = pep.enforce( element )
      if not resEnforce[ 'OK' ]:
        self.log.error( 'Failed policy enforcement', resEnforce[ 'Message' ] )
        # every get() must be matched by task_done() for the join() in execute()
        self.elementsToBeChecked.task_done()
        continue

      resEnforce = resEnforce[ 'Value' ]

      oldStatus = resEnforce[ 'decissionParams' ][ 'status' ]
      statusType = resEnforce[ 'decissionParams' ][ 'statusType' ]
      newStatus = resEnforce[ 'policyCombinedResult' ][ 'Status' ]
      reason = resEnforce[ 'policyCombinedResult' ][ 'Reason' ]

      if oldStatus != newStatus:
        self.log.info( '%s (%s) is now %s ( %s ), before %s' % ( element[ 'name' ],
                                                                 statusType,
                                                                 newStatus,
                                                                 reason,
                                                                 oldStatus ) )

      # Used together with join !
      self.elementsToBeChecked.task_done()
class ElementInspectorAgent( AgentModule ):
  '''
    The ElementInspector agent is a generic agent used to check the elements
    of one of the elementTypes ( e.g. Site, Resource, Node ).

    This Agent takes care of the Elements. In order to do so, it gathers
    the eligible ones and then evaluates their statuses with the PEP.
  '''

  # Max number of worker threads by default
  __maxNumberOfThreads = 5
  # ElementType, to be defined among Site, Resource or Node
  __elementType = None

  # Inspection freqs, defaults, the lower, the higher priority to be checked.
  # Error state usually means there is a glitch somewhere, so it has the highest
  # priority.
  # Minutes between consecutive checks, per elementType and status; 'Default'
  # is used for any elementType without its own entry.
  __checkingFreqs = { 'Default' : { 'Active'   : 60,
                                    'Degraded' : 30,
                                    'Probing'  : 30,
                                    'Banned'   : 30,
                                    'Unknown'  : 15,
                                    'Error'    : 15 } }
  # queue size limit to stop feeding
  __limitQueueFeeder = 15

  def __init__( self, *args, **kwargs ):
    ''' c'tor '''

    AgentModule.__init__( self, *args, **kwargs )

    # members initialization
    self.maxNumberOfThreads = self.__maxNumberOfThreads
    self.elementType = self.__elementType
    self.checkingFreqs = self.__checkingFreqs
    self.limitQueueFeeder = self.__limitQueueFeeder

    self.elementsToBeChecked = None
    self.threadPool = None
    self.rsClient = None
    self.clients = {}

  def initialize( self ):
    ''' Standard initialize.
        Uses the ProductionManager shifterProxy to modify the ResourceStatus DB
    '''

    self.maxNumberOfThreads = self.am_getOption( 'maxNumberOfThreads',
                                                 self.maxNumberOfThreads )
    self.elementType = self.am_getOption( 'elementType', self.elementType )
    self.checkingFreqs = self.am_getOption( 'checkingFreqs', self.checkingFreqs )
    self.limitQueueFeeder = self.am_getOption( 'limitQueueFeeder',
                                               self.limitQueueFeeder )

    self.elementsToBeChecked = Queue.Queue()
    self.threadPool = ThreadPool( self.maxNumberOfThreads,
                                  self.maxNumberOfThreads )

    self.rsClient = ResourceStatusClient()

    # clients dict is handed to the PEP in _execute
    self.clients[ 'ResourceStatusClient' ] = self.rsClient
    self.clients[ 'ResourceManagementClient' ] = ResourceManagementClient()

    return S_OK()

  def execute( self ):
    ''' Feeding cycle: refill the queue with eligible elements and spawn as
        many worker threads as the backlog requires (capped by
        maxNumberOfThreads and reduced by the threads already running).
    '''

    # If there are elements in the queue to be processed, we wait ( we know how
    # many elements in total we can have, so if there are more than 15% of them
    # on the queue, we do not add anything ), but the threads are running and
    # processing items from the queue on background.
    qsize = self.elementsToBeChecked.qsize()
    if qsize > self.limitQueueFeeder:
      self.log.warn( 'Queue not empty ( %s > %s ), skipping feeding loop' % ( qsize,
                                                                              self.limitQueueFeeder ) )
      return S_OK()

    # We get all the elements, then we filter.
    elements = self.rsClient.selectStatusElement( self.elementType, 'Status' )
    if not elements[ 'OK' ]:
      self.log.error( elements[ 'Message' ] )
      return elements

    utcnow = datetime.datetime.utcnow().replace( microsecond = 0 )

    # filter elements by Type
    for element in elements[ 'Value' ]:

      # Maybe an overkill, but this way I have NEVER again to worry about order
      # of elements returned by mySQL on tuples
      elemDict = dict( zip( elements[ 'Columns' ], element ) )

      # We skip the elements with token different than "rs_svc"
      if elemDict[ 'TokenOwner' ] != 'rs_svc':
        self.log.info( 'Skipping %s ( %s ) with token %s' % ( elemDict[ 'Name' ],
                                                              elemDict[ 'StatusType' ],
                                                              elemDict[ 'TokenOwner' ] ))
        continue

      if not elemDict[ 'ElementType' ] in self.checkingFreqs:
        #self.log.warn( '"%s" not in inspectionFreqs, getting default' % elemDict[ 'ElementType' ] )
        timeToNextCheck = self.checkingFreqs[ 'Default' ][ elemDict[ 'Status' ] ]
      else:
        timeToNextCheck = self.checkingFreqs[ elemDict[ 'ElementType' ] ][ elemDict[ 'Status' ] ]

      # only enqueue elements whose last check is older than their frequency
      if utcnow - datetime.timedelta( minutes = timeToNextCheck ) > elemDict[ 'LastCheckTime' ]:

        # We are not checking if the item is already on the queue or not. It may
        # be there, but in any case, it is not a big problem.

        # lower-case the first letter of each column name, as the PEP expects
        lowerElementDict = { 'element' : self.elementType }
        for key, value in elemDict.items():
          lowerElementDict[ key[0].lower() + key[1:] ] = value

        # We add lowerElementDict to the queue
        self.elementsToBeChecked.put( lowerElementDict )
        self.log.verbose( '%s # "%s" # "%s" # %s # %s' % ( elemDict[ 'Name' ],
                                                           elemDict[ 'ElementType' ],
                                                           elemDict[ 'StatusType' ],
                                                           elemDict[ 'Status' ],
                                                           elemDict[ 'LastCheckTime' ]) )

    # Measure size of the queue, more or less, to know how many threads should
    # we start !
    queueSize = self.elementsToBeChecked.qsize()
    # 30, could have been other number.. but it works reasonably well. ( +1 to get ceil )
    threadsToStart = max( min( self.maxNumberOfThreads, ( queueSize / 30 ) + 1 ), 1 )
    threadsRunning = self.threadPool.numWorkingThreads()
    self.log.info( 'Needed %d threads to process %d elements' % ( threadsToStart, queueSize ) )
    if threadsRunning:
      self.log.info( 'Already %d threads running' % threadsRunning )
      threadsToStart = max( 0, threadsToStart - threadsRunning )
      self.log.info( 'Starting %d threads to process %d elements' % ( threadsToStart, queueSize ) )

    # It may happen that we start two threads, 0 and 1. 1 goes DOWN, but 0 keeps
    # running. In next loop we will start a new thread, and will be called 0
    # again. To have a mechanism to see which thread is where, we append the
    # cycle number before the threadId.
    cycle = self._AgentModule__moduleProperties[ 'cyclesDone' ]
    for _x in xrange( threadsToStart ):
      threadId = '%s_%s' % ( cycle, _x )
      jobUp = self.threadPool.generateJobAndQueueIt( self._execute, args = ( threadId, ) )
      if not jobUp[ 'OK' ]:
        self.log.error( jobUp[ 'Message' ] )

    return S_OK()

  def finalize( self ):
    ''' Graceful shutdown: wait until the workers have drained the queue. '''
    self.log.info( 'draining queue... blocking until empty' )
    # block until all tasks are done
    self.elementsToBeChecked.join()
    return S_OK()

## Private methods #############################################################

  def _execute( self, threadNumber ):
    '''
      Method run by the thread pool. It enters a loop until there are no
      elements on the queue. On each iteration, it evaluates the policies for
      such element and enforces the necessary actions. If there are no more
      elements in the queue, the loop is finished.
    '''

    tHeader = '%sJob%s' % ( '* '*30, threadNumber )

    self.log.info( '%s UP' % tHeader )

    pep = PEP( clients = self.clients )

    while True:

      try:
        element = self.elementsToBeChecked.get_nowait()
      except Queue.Empty:
        self.log.info( '%s DOWN' % tHeader )
        return S_OK()

      self.log.info( '%s ( %s / %s ) being processed' % ( element[ 'name' ],
                                                          element[ 'status' ],
                                                          element[ 'statusType' ] ) )

      resEnforce = pep.enforce( element )
      if not resEnforce[ 'OK' ]:
        self.log.error( resEnforce[ 'Message' ] )
        # every get() must be matched by task_done() for the join() in finalize()
        self.elementsToBeChecked.task_done()
        continue

      resEnforce = resEnforce[ 'Value' ]

      oldStatus = resEnforce[ 'decissionParams' ][ 'status' ]
      statusType = resEnforce[ 'decissionParams' ][ 'statusType' ]
      newStatus = resEnforce[ 'policyCombinedResult' ][ 'Status' ]
      reason = resEnforce[ 'policyCombinedResult' ][ 'Reason' ]

      if oldStatus != newStatus:
        self.log.info( '%s (%s) is now %s ( %s ), before %s' % ( element[ 'name' ],
                                                                 statusType,
                                                                 newStatus,
                                                                 reason,
                                                                 oldStatus ) )

      # Used together with join !
      self.elementsToBeChecked.task_done()

    self.log.info( '%s DOWN' % tHeader )

    return S_OK()
class SystemAdministratorIntegrator(object):
    """Fan-out client for the SystemAdministrator service.

    Any attribute access that is not a defined method is interpreted as an
    RPC method name (see __getattr__): the returned callable executes that
    method on every responding host in parallel and returns a dictionary of
    per-host results.
    """

    def __init__(self, **kwargs):
        """ Constructor

        :param list hosts: optional list of hosts to contact; defaults to all
                           hosts known to the Registry
        :param list exclude: optional list of hosts to leave out

        Any remaining keyword arguments are forwarded verbatim to each
        SystemAdministratorClient constructor.
        """
        if 'hosts' in kwargs:
            self.__hosts = kwargs['hosts']
            del kwargs['hosts']
        else:
            result = Registry.getHosts()
            if result['OK']:
                self.__hosts = result['Value']
            else:
                # no host information available; operate on an empty set
                self.__hosts = []
        # Excluded hosts
        if 'exclude' in kwargs:
            self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
            # BUGFIX: 'exclude' is consumed here and must not remain in kwargs,
            # otherwise it would be forwarded to SystemAdministratorClient via
            # self.__kwargs below
            del kwargs['exclude']

        # Ping the hosts to remove those that don't have a SystemAdministrator service
        sysAdminHosts = []
        self.silentHosts = []
        self.__resultDict = {}
        self.__kwargs = {}
        pool = ThreadPool(len(self.__hosts))
        for host in self.__hosts:
            pool.generateJobAndQueueIt(self.__executeClient,
                                       args=[host, "ping"],
                                       kwargs={},
                                       oCallback=self.__processResult)
        pool.processAllResults()
        for host, result in self.__resultDict.items():
            if result['OK']:
                sysAdminHosts.append(host)
            else:
                self.silentHosts.append(host)
        del pool

        self.__hosts = sysAdminHosts
        # client options forwarded to every SystemAdministratorClient
        self.__kwargs = dict(kwargs)
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def getSilentHosts(self):
        """ Get a list of non-responding hosts

        :return: list of hosts
        """
        return self.silentHosts

    def getRespondingHosts(self):
        """ Get a list of responding hosts

        :return: list of hosts
        """
        return self.__hosts

    def __getattr__(self, name):
        # Interpret any unknown attribute as an RPC method name; execute()
        # dispatches it to all responding hosts.
        # NOTE(review): the method name is stored on the instance, so using one
        # integrator concurrently from several threads is unsafe — confirm.
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *parms, **kwargs):
        """ Execute RPC method on a given host """
        # hosts may define an alternative contact address under 'Host'
        hostName = Registry.getHostOption(host, 'Host', host)
        client = SystemAdministratorClient(hostName, **self.__kwargs)
        result = getattr(client, method)(*parms, **kwargs)
        result['Host'] = host
        return result

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        host = result['Host']
        del result['Host']
        self.__resultDict[host] = result

    def execute(self, *args, **kwargs):
        """ Main execution method

        Runs the RPC method captured by __getattr__ on every responding host.

        :return: S_OK( { host: result } )
        """
        self.__resultDict = {}
        for host in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[host, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
class SystemAdministratorIntegrator( object ):
  # Fan-out client for the SystemAdministrator service: any unknown attribute
  # is interpreted as an RPC method name ( see __getattr__ ) and executed on
  # every responding host in parallel.

  def __init__( self, **kwargs ):
    """ Constructor

        :param hosts: optional list of hosts to contact; defaults to all hosts
                      known to the Registry
        :param exclude: optional list of hosts to leave out
        Remaining keyword arguments are forwarded to each
        SystemAdministratorClient.
    """
    if 'hosts' in kwargs:
      self.__hosts = kwargs['hosts']
      del kwargs['hosts']
    else:
      result = Registry.getHosts()
      if result['OK']:
        self.__hosts = result['Value']
      else:
        # no host information available; operate on an empty set
        self.__hosts = []
    # Excluded hosts
    if 'exclude' in kwargs:
      self.__hosts = list ( set( self.__hosts ) - set( kwargs[ 'exclude' ] ) )
      # NOTE(review): 'exclude' is not removed from kwargs here, so it is later
      # copied into self.__kwargs and forwarded to SystemAdministratorClient —
      # confirm and delete the key after use

    # Ping the hosts to remove those that don't have a SystemAdministrator service
    sysAdminHosts = []
    self.silentHosts = []
    self.__resultDict = {}
    self.__kwargs = {}
    pool = ThreadPool( len( self.__hosts ) )
    for host in self.__hosts:
      pool.generateJobAndQueueIt( self.__executeClient,
                                  args = [ host, "ping" ],
                                  kwargs = {},
                                  oCallback = self.__processResult )
    pool.processAllResults()
    for host, result in self.__resultDict.items():
      if result['OK']:
        sysAdminHosts.append( host )
      else:
        self.silentHosts.append( host )
    del pool

    self.__hosts = sysAdminHosts
    # client options forwarded to every SystemAdministratorClient
    self.__kwargs = dict( kwargs )
    self.__pool = ThreadPool( len( self.__hosts ) )
    self.__resultDict = {}

  def getSilentHosts( self ):
    """ Get a list of non-responding hosts

        :return: list of hosts
    """
    return self.silentHosts

  def getRespondingHosts( self ):
    """ Get a list of responding hosts

        :return: list of hosts
    """
    return self.__hosts

  def __getattr__( self, name ):
    # Interpret any unknown attribute as an RPC method name; execute() will
    # dispatch it to all responding hosts.
    # NOTE(review): the method name is stored on the instance, so concurrent
    # use of one integrator from several threads is unsafe — confirm.
    self.call = name
    return self.execute

  def __executeClient( self, host, method, *parms, **kwargs ):
    """ Execute RPC method on a given host """
    # hosts may define an alternative contact address under 'Host'
    hostName = Registry.getHostOption( host, 'Host', host)
    client = SystemAdministratorClient( hostName, **self.__kwargs )
    result = getattr( client, method )( *parms, **kwargs )
    result['Host'] = host
    return result

  def __processResult( self, id_, result ):
    """ Collect results in the final structure """
    host = result['Host']
    del result['Host']
    self.__resultDict[host] = result

  def execute(self, *args, **kwargs ):
    """ Main execution method

        Runs the RPC method captured by __getattr__ on every responding host
        and returns S_OK( { host : result } ).
    """
    self.__resultDict = {}
    for host in self.__hosts:
      self.__pool.generateJobAndQueueIt( self.__executeClient,
                                         args = [ host, self.call ] + list(args),
                                         kwargs = kwargs,
                                         oCallback = self.__processResult )
    self.__pool.processAllResults()
    return S_OK( self.__resultDict )
class SiteInspectorAgent(AgentModule):
  """ SiteInspectorAgent

    The SiteInspectorAgent agent is an agent that is used to get the all the site
    names and trigger PEP to evaluate their status.
  """

  # Max number of worker threads by default
  __maxNumberOfThreads = 15

  # Inspection freqs, defaults, the lower, the higher priority to be checked.
  # Error state usually means there is a glitch somewhere, so it has the highest
  # priority.
  # NOTE(review): not referenced anywhere in the visible code — confirm it is
  # used elsewhere before removing
  __checkingFreqs = {'Active': 20,
                     'Degraded': 20,
                     'Probing': 20,
                     'Banned': 15,
                     'Unknown': 10,
                     'Error': 5}

  def __init__(self, *args, **kwargs):
    AgentModule.__init__(self, *args, **kwargs)

    # ElementType, to be defined among Site, Resource or Node
    self.sitesToBeChecked = None    # Queue.Queue of site dicts, rebuilt each cycle
    self.threadPool = None          # worker pool consuming the queue
    self.siteClient = None
    self.clients = {}               # clients handed over to the PEP

  def initialize(self):
    """ Standard initialize.

        Creates the thread pool and the RSS clients shared by the workers.
    """

    maxNumberOfThreads = self.am_getOption('maxNumberOfThreads', self.__maxNumberOfThreads)
    self.threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

    self.siteClient = SiteStatus()

    self.clients['SiteStatus'] = self.siteClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()

  def execute(self):
    """ execute

      This is the main method of the agent. It gets the sites from the Database,
      calculates how many threads should be started and spawns them. Each thread
      will get a site from the queue until it is empty. At the end, the method
      will join the queue such that the agent will not terminate a cycle until
      all sites have been processed.
    """

    # Gets sites to be checked ( returns a Queue )
    sitesToBeChecked = self.getSitesToBeChecked()
    if not sitesToBeChecked['OK']:
      self.log.error(sitesToBeChecked['Message'])
      return sitesToBeChecked
    self.sitesToBeChecked = sitesToBeChecked['Value']

    queueSize = self.sitesToBeChecked.qsize()
    pollingTime = self.am_getPollingTime()

    # Assigns number of threads on the fly such that we exhaust the PollingTime
    # without having to spawn too many threads. We assume 10 seconds per element
    # to be processed ( actually, it takes something like 1 sec per element ):
    # numberOfThreads = elements * 10(s/element) / pollingTime
    numberOfThreads = int(math.ceil(queueSize * 10. / pollingTime))

    self.log.info('Needed %d threads to process %d elements' % (numberOfThreads, queueSize))

    # Spawn the workers; extra jobs beyond the pool size are simply queued,
    # and workers exit on their own once the queue is drained
    for _x in xrange(numberOfThreads):
      jobUp = self.threadPool.generateJobAndQueueIt(self._execute)
      if not jobUp['OK']:
        self.log.error(jobUp['Message'])

    self.log.info('blocking until all sites have been processed')
    # block until all tasks are done
    self.sitesToBeChecked.join()
    self.log.info('done')

    return S_OK()

  def getSitesToBeChecked(self):
    """ getElementsToBeChecked

      This method gets all the site names from the SiteStatus table, after that
      it get the details of each site (status, name, etc..) and adds them to a
      queue.

      :return: S_OK( Queue.Queue of site description dicts ) / S_ERROR
    """

    toBeChecked = Queue.Queue()

    res = self.siteClient.getSites('All')
    if not res['OK']:
      return res

    # get the current status
    res = self.siteClient.getSiteStatuses(res['Value'])
    if not res['OK']:
      return res

    # filter elements
    for site in res['Value']:
      status = res['Value'].get(site, 'Unknown')

      toBeChecked.put({'status': status,
                       'name': site,
                       'site': site,
                       'element': 'Site',
                       'statusType': 'all',
                       'elementType': 'Site'})

    return S_OK(toBeChecked)

  # Private methods ............................................................

  def _execute(self):
    """
      Method run by each of the thread that is in the ThreadPool.
      It enters a loop until there are no sites on the queue.

      On each iteration, it evaluates the policies for such site
      and enforces the necessary actions. If there are no more sites in the
      queue, the loop is finished.
    """

    # Policy Enforcement Point; shares the cached clients among worker threads
    pep = PEP(clients=self.clients)

    while True:
      try:
        site = self.sitesToBeChecked.get_nowait()
      except Queue.Empty:
        # queue drained: this worker is done
        return S_OK()

      resEnforce = pep.enforce(site)
      if not resEnforce['OK']:
        self.log.error('Failed policy enforcement', resEnforce['Message'])
        # task_done must be called even on failure, otherwise execute's join()
        # would block forever
        self.sitesToBeChecked.task_done()
        continue

      # Used together with join !
      self.sitesToBeChecked.task_done()
class SSInspectorAgent(AgentModule):
  '''
    The SSInspector agent ( SiteInspectorAgent ) is one of the four
    InspectorAgents of the RSS.

    This Agent takes care of the Sites. In order to do so, it gathers
    the eligible ones and then evaluates their statuses with the PEP.

    If you want to know more about the SSInspectorAgent, scroll down to the
    end of the file.
  '''

  # Too many public methods
  # pylint: disable-msg=R0904

  def initialize(self):
    '''
      Creates the RSS client, the queue of sites to be checked and the pool
      of worker threads that consumes it.
    '''
    # Attribute defined outside __init__
    # pylint: disable-msg=W0201
    try:
      self.rsClient = ResourceStatusClient()
      self.sitesFreqs = CS.getTypedDictRootedAtOperations('CheckingFreqs/SitesFreqs')
      self.sitesToBeChecked = Queue.Queue()
      self.siteNamesInCheck = []

      self.maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
      self.threadPool = ThreadPool(self.maxNumberOfThreads,
                                   self.maxNumberOfThreads)
      if not self.threadPool:
        self.log.error('Can not create Thread Pool')
        return S_ERROR('Can not create Thread Pool')

      # Workers block on the queue and process sites as execute() feeds them in
      for _ in xrange(self.maxNumberOfThreads):
        self.threadPool.generateJobAndQueueIt(self._executeCheck, args=(None, ))

      return S_OK()

    except Exception:
      initError = "SSInspectorAgent initialization"
      self.log.exception(initError)
      return S_ERROR(initError)

  def execute(self):
    '''
      Queries the DB for eligible sites and pushes those not already being
      processed onto the checking queue.
    '''
    try:
      queryArgs = {
          'meta': {'columns': ['SiteName', 'StatusType', 'Status',
                               'FormerStatus', 'SiteType', 'TokenOwner']},
          'tokenOwner': 'RS_SVC'
      }

      resQuery = self.rsClient.getStuffToCheck('Site', self.sitesFreqs, **queryArgs)
      if not resQuery['OK']:
        self.log.error(resQuery['Message'])
        return resQuery

      candidates = resQuery['Value']
      self.log.info('Found %d candidates to be checked.' % len(candidates))

      for siteTuple in candidates:
        nameAndType = (siteTuple[0], siteTuple[1])
        # skip anything a worker is already looking at
        if nameAndType in self.siteNamesInCheck:
          self.log.info('%s(%s) discarded, already on the queue' % nameAndType)
          continue
        self.siteNamesInCheck.insert(0, nameAndType)
        self.sitesToBeChecked.put(['Site'] + siteTuple)

      return S_OK()

    except Exception as x:
      errorStr = where(self, self.execute)
      self.log.exception(errorStr, lException=x)
      return S_ERROR(errorStr)
class RSInspectorAgent(AgentModule):
  """
    The RSInspector agent ( ResourceInspectorAgent ) is one of the four
    InspectorAgents of the RSS.

    This Agent takes care of the Resources. In order to do so, it gathers
    the eligible ones and then evaluates their statuses with the PEP.

    If you want to know more about the RSInspectorAgent, scroll down to the
    end of the file.
  """

  # Too many public methods
  # pylint: disable-msg=R0904

  def initialize(self):
    """
      Creates the RSS client, the queue of resources to be checked and the
      pool of worker threads that consumes it.
    """
    # Attribute defined outside __init__
    # pylint: disable-msg=W0201
    try:
      self.rsClient = ResourceStatusClient()
      self.resourcesFreqs = CS.getTypedDictRootedAtOperations('CheckingFreqs/ResourcesFreqs')
      self.resourcesToBeChecked = Queue.Queue()
      self.resourceNamesInCheck = []

      self.maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
      self.threadPool = ThreadPool(self.maxNumberOfThreads,
                                   self.maxNumberOfThreads)
      if not self.threadPool:
        self.log.error('Can not create Thread Pool')
        return S_ERROR('Can not create Thread Pool')

      # Workers block on the queue and process resources as execute() feeds them
      for _ in xrange(self.maxNumberOfThreads):
        self.threadPool.generateJobAndQueueIt(self._executeCheck, args=(None, ))

      return S_OK()

    except Exception:
      initError = "RSInspectorAgent initialization"
      self.log.exception(initError)
      return S_ERROR(initError)

  def execute(self):
    """
      Queries the DB for eligible resources and pushes those not already being
      processed onto the checking queue.
    """
    try:
      queryArgs = {
          'meta': {'columns': ['ResourceName', 'StatusType', 'Status',
                               'FormerStatus', 'SiteType', 'ResourceType',
                               'TokenOwner']},
          'tokenOwner': 'RS_SVC'
      }

      resQuery = self.rsClient.getStuffToCheck('Resource', self.resourcesFreqs, **queryArgs)
      if not resQuery['OK']:
        self.log.error(resQuery['Message'])
        return resQuery

      candidates = resQuery['Value']
      self.log.info('Found %d candidates to be checked.' % len(candidates))

      for resourceTuple in candidates:
        nameAndType = (resourceTuple[0], resourceTuple[1])
        # skip anything a worker is already looking at
        if nameAndType in self.resourceNamesInCheck:
          self.log.info('%s(%s) discarded, already on the queue' % nameAndType)
          continue
        self.resourceNamesInCheck.insert(0, nameAndType)
        self.resourcesToBeChecked.put(['Resource'] + resourceTuple)

      return S_OK()

    except Exception as x:
      errorStr = where(self, self.execute)
      self.log.exception(errorStr, lException=x)
      return S_ERROR(errorStr)
class OutputDataExecutor:
    """Moves files from configured input locations (local disk or a file
    catalog) to configured output SEs/catalogs, using a thread pool.

    Transfer-path definitions are read from the CS under
    /Operations/<vo>/OutputData (or a caller-supplied CS path); each path must
    define InputPath, InputFC, OutputPath, OutputFC and OutputSE.
    """

    def __init__(self, csPath=""):
        self.log = gLogger.getSubLogger("OutputDataExecutor")
        if not csPath:
            vo = gConfig.getValue("/DIRAC/VirtualOrganization", "")
            self.__transfersCSPath = '/Operations/%s/OutputData' % vo
        else:
            self.__transfersCSPath = csPath
        self.log.verbose("Reading transfer paths from %s" % self.__transfersCSPath)
        # every transfer path section must define all of these options
        self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']
        self.__threadPool = ThreadPool(gConfig.getValue("%s/MinTransfers" % self.__transfersCSPath, 1),
                                       gConfig.getValue("%s/MaxTransfers" % self.__transfersCSPath, 4),
                                       gConfig.getValue("%s/MaxQueuedTransfers" % self.__transfersCSPath, 100))
        self.__threadPool.daemonize()
        self.__processingFiles = set()   # basenames currently queued/running
        self.__okTransferredFiles = 0
        self.__okTransferredBytes = 0
        self.__failedFiles = {}          # file -> consecutive failure count

    def getNumOKTransferredFiles(self):
        """Number of files successfully transferred so far."""
        return self.__okTransferredFiles

    def getNumOKTransferredBytes(self):
        """Total number of bytes successfully transferred so far."""
        return self.__okTransferredBytes

    def transfersPending(self):
        """True while the thread pool still has queued or running transfers."""
        return self.__threadPool.isWorking()

    def getDefinedTransferPaths(self):
        """Read the transfer-path definitions from the CS.

        :return: S_OK( { name : transferDict } ), keeping only entries that
                 define all required options; an empty dict when the CS
                 section is missing.
        """
        result = gConfig.getSections(self.__transfersCSPath)
        if not result['OK']:
            self.log.info('No Input/Output Pair defined in CS')
            # BUGFIX: return an empty dict instead of a bare S_OK() so that
            # callers iterating result['Value'] do not crash on None
            return S_OK({})
        pathList = result['Value']
        tPaths = {}
        for name in pathList:
            csPath = self.__transfersCSPath + '/%s' % name
            result = gConfig.getOptionsDict(csPath)
            if not result['OK']:
                continue
            transferDict = result['Value']
            ok = True
            for i in self.__requiredCSOptions:
                if i not in transferDict:
                    self.log.error('Missing Option %s in %s' % (i, csPath))
                    ok = False
                    break
            if not ok:
                continue
            tPaths[name] = transferDict
        return S_OK(tPaths)

    def getNumLocalOutgoingFiles(self):
        """Count files waiting on local disk across all LocalDisk input paths."""
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return 0
        localOutgoing = 0
        tPaths = result['Value']
        for name in tPaths:
            transferDict = tPaths[name]
            if 'LocalDisk' != transferDict['InputFC']:
                continue
            localOutgoing += len(self.getOutgoingFiles(transferDict))
        return localOutgoing

    def getOutgoingFiles(self, transferDict):
        """
        Get list of files to be processed from InputPath

        :return: list of file names (basenames for LocalDisk, LFNs otherwise)
        """
        inputFCName = transferDict['InputFC']
        inputPath = transferDict['InputPath']
        if inputFCName == 'LocalDisk':
            files = []
            try:
                for entry in os.listdir(inputPath):
                    if os.path.isfile(os.path.join(inputPath, entry)):
                        files.append(entry)
            except OSError:
                # input directory may not exist (yet): nothing to transfer.
                # Narrowed from a bare 'except' that hid every error.
                pass
            return files
        inputFC = FileCatalog([inputFCName])
        result = inputFC.listDirectory(inputPath, True)
        if not result['OK']:
            self.log.error(result['Message'])
            return []
        if inputPath not in result['Value']['Successful']:
            self.log.error(result['Value']['Failed'][inputPath])
            return []
        subDirs = result['Value']['Successful'][inputPath]['SubDirs']
        files = result['Value']['Successful'][inputPath]['Files']
        for subDir in subDirs:
            self.log.info('Ignoring subdirectory:', subDir)
        return list(files.keys())

    def checkForTransfers(self):
        """
        Check for transfers to do and start them
        """
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return result
        tPaths = result['Value']
        for name in tPaths:
            transferPath = tPaths[name]
            self.log.verbose("Checking %s transfer path" % name)
            filesToTransfer = self.getOutgoingFiles(transferPath)
            self.log.info("Transfer path %s has %d files" % (name, len(filesToTransfer)))
            ret = self.__addFilesToThreadPool(filesToTransfer, transferPath)
            if not ret['OK']:
                # The thread pool got full
                break

    def processAllPendingTransfers(self):
        """Block until all queued transfers have completed."""
        self.__threadPool.processAllResults()

    @transferSync
    def __addFilesToThreadPool(self, files, transferDict):
        """Queue one transfer job per file not already being processed.

        :return: S_OK, or the pool's S_ERROR as soon as it refuses a job.
        """
        for fileName in files:
            fileName = os.path.basename(fileName)
            if fileName in self.__processingFiles:
                continue
            self.__processingFiles.add(fileName)
            # NOTE(review): deliberate throttle between submissions — confirm
            # it is still needed before removing
            time.sleep(1)
            ret = self.__threadPool.generateJobAndQueueIt(self.__transferIfNotRegistered,
                                                          args=(fileName, transferDict),
                                                          oCallback=self.transferCallback,
                                                          blocking=False)
            if not ret['OK']:
                # The thread pool got full
                return ret
        return S_OK()

    def __transferIfNotRegistered(self, file, transferDict):
        """Transfer *file*, or just delete the input copy when the file is
        already registered in the output catalog.

        :return: S_OK( file name ) / S_ERROR
        """
        result = self.isRegisteredInOutputCatalog(file, transferDict)
        if not result['OK']:
            self.log.error(result['Message'])
            return result
        # Already registered. Need to delete
        if result['Value']:
            self.log.info(
                "Transfer file %s is already registered in the output catalog" % file)
            # Delete
            filePath = os.path.join(transferDict['InputPath'], file)
            if transferDict['InputFC'] == 'LocalDisk':
                os.unlink(filePath)
            else:
                inputFC = FileCatalog([transferDict['InputFC']])
                replicaDict = inputFC.getReplicas(filePath)
                if not replicaDict['OK']:
                    self.log.error("Error deleting file", replicaDict['Message'])
                # BUGFIX: the original referenced the undefined name 'inFile'
                # throughout this branch (NameError at runtime); the LFN being
                # handled here is 'filePath'
                elif filePath not in replicaDict['Value']['Successful']:
                    self.log.error("Error deleting file",
                                   replicaDict['Value']['Failed'][filePath])
                else:
                    seList = list(replicaDict['Value']['Successful'][filePath].keys())
                    for se in seList:
                        se = StorageElement(se)
                        self.log.info('Removing from %s:' % se.name, filePath)
                        se.removeFile(filePath)
                    # BUGFIX: remove the full LFN from the catalog, not the
                    # bare basename
                    inputFC.removeFile(filePath)
            self.log.info("File %s deleted from %s" % (file, transferDict['InputFC']))
            self.__processingFiles.discard(file)
            return S_OK(file)
        # Do the transfer
        return self.__retrieveAndUploadFile(file, transferDict)

    def isRegisteredInOutputCatalog(self, file, transferDict):
        """Check whether *file* already has a replica on one of the OutputSEs.

        :return: S_OK( bool ) / S_ERROR
        """
        fc = FileCatalog([transferDict['OutputFC']])
        lfn = os.path.join(transferDict['OutputPath'], os.path.basename(file))
        result = fc.getReplicas(lfn)
        if not result['OK']:
            return result
        if lfn not in result['Value']['Successful']:
            return S_OK(False)
        replicas = result['Value']['Successful'][lfn]
        for seName in List.fromChar(transferDict['OutputSE'], ","):
            if seName in replicas:
                self.log.verbose(
                    "Transfer file %s is already registered in %s SE" % (file, seName))
                return S_OK(True)
        return S_OK(False)

    def __retrieveAndUploadFile(self, file, outputDict):
        """
        Retrieve, Upload, and remove
        """
        fileName = file
        inputPath = outputDict['InputPath']
        inputFCName = outputDict['InputFC']
        inBytes = 0
        if inputFCName == 'LocalDisk':
            inFile = file
            file = os.path.join(inputPath, file)
        else:
            inputFC = FileCatalog([inputFCName])
            inFile = os.path.join(inputPath, file)
            replicaDict = inputFC.getReplicas(inFile)
            if not replicaDict['OK']:
                self.log.error(replicaDict['Message'])
                return S_ERROR(fileName)
            if inFile not in replicaDict['Value']['Successful']:
                self.log.error(replicaDict['Value']['Failed'][inFile])
                return S_ERROR(fileName)
            seList = list(replicaDict['Value']['Successful'][inFile].keys())
            inputSE = StorageElement(seList[0])
            self.log.info('Retrieving from %s:' % inputSE.name, inFile)
            # ret = inputSE.getFile( inFile )
            # lcg_util binding prevent multithreading, use subprocess instead
            res = pythonCall(2 * 3600, inputSE.getFile, inFile)
            if not res['OK']:
                self.log.error(res['Message'])
                return S_ERROR(fileName)
            ret = res['Value']
            if not ret['OK']:
                self.log.error(ret['Message'])
                return S_ERROR(fileName)
            if inFile not in ret['Value']['Successful']:
                self.log.error(ret['Value']['Failed'][inFile])
                return S_ERROR(fileName)

        if os.path.isfile(file):
            inBytes = os.stat(file)[6]  # st_size

        outputPath = outputDict['OutputPath']
        outputFCName = outputDict['OutputFC']
        replicaManager = ReplicaManager()
        outFile = os.path.join(outputPath, os.path.basename(file))
        transferOK = False
        for outputSEName in List.fromChar(outputDict['OutputSE'], ","):
            outputSE = StorageElement(outputSEName)
            self.log.info('Trying to upload to %s:' % outputSE.name, outFile)
            # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
            # lcg_util binding prevent multithreading, use subprocess instead
            result = pythonCall(2 * 3600, replicaManager.putAndRegister, outFile,
                                os.path.realpath(file), outputSE.name,
                                catalog=outputFCName)
            if result['OK'] and result['Value']['OK']:
                if outFile in result['Value']['Value']['Successful']:
                    transferOK = True
                    break
                else:
                    self.log.error(result['Value']['Value']['Failed'][outFile])
            else:
                if result['OK']:
                    self.log.error(result['Value']['Message'])
                else:
                    self.log.error(result['Message'])

        if not transferOK:
            return S_ERROR(fileName)

        # Upload succeeded: remove the local copy (the downloaded temp file, or
        # the original when input is LocalDisk).  transferOK implies the last
        # putAndRegister returned OK, so the original dead-code re-check of
        # result['OK'] (which logged the wrong variable) has been dropped.
        os.unlink(file)

        self.log.info("Finished transferring %s [%s bytes]" % (inFile, inBytes))
        self.__okTransferredFiles += 1
        self.__okTransferredBytes += inBytes
        if inputFCName == 'LocalDisk':
            return S_OK(fileName)
        # Now the file is on final SE/FC, remove from input SE/FC
        for se in seList:
            se = StorageElement(se)
            self.log.info('Removing from %s:' % se.name, inFile)
            se.removeFile(inFile)
        inputFC.removeFile(inFile)
        return S_OK(fileName)

    @transferSync
    def transferCallback(self, threadedJob, submitResult):
        """Thread-pool callback: maintain per-file failure counters and release
        the file from the in-progress set."""
        if not submitResult['OK']:
            file = submitResult['Message']
            if file not in self.__failedFiles:
                self.__failedFiles[file] = 0
            self.__failedFiles[file] += 1
        else:
            file = submitResult['Value']
            if file in self.__failedFiles:
                del self.__failedFiles[file]
        # Take out from processing files
        self.__processingFiles.discard(file)