class MyProxyRenewalAgent(AgentModule):
  """ Agent that keeps the proxies stored in the ProxyDB alive: every cycle it
      purges stale DB entries and renews, through MyProxy, each credential
      that is about to expire.
  """

  def initialize(self):
    """ Read the agent options, create the ProxyDB helper and the thread pool
        used to renew several credentials in parallel.

        :return: S_OK()
    """
    requiredLifeTime = self.am_getOption( "MinimumLifeTime", 3600 )
    renewedLifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    # Fix: dropped the unused local read of /DIRAC/VOPolicy/MyProxyServer —
    # the value was never used; the MyProxy server is resolved by ProxyDB
    # itself (see the getMyProxyServer() call below).
    self.proxyDB = ProxyDB( requireVoms = True,
                            useMyProxy = True )
    gLogger.info( "Minimum Life time : %s" % requiredLifeTime )
    gLogger.info( "Life time on renew : %s" % renewedLifeTime )
    gLogger.info( "MyProxy server : %s" % self.proxyDB.getMyProxyServer() )
    gLogger.info( "MyProxy max proxy time : %s" % self.proxyDB.getMyProxyMaxLifeTime() )
    # one persistent thread, up to ten queued renewal jobs
    self.__threadPool = ThreadPool( 1, 10 )
    return S_OK()

  def __renewProxyForCredentials( self, userDN, userGroup ):
    """ Thread-pool job: renew via MyProxy the proxy of userDN/userGroup. """
    lifeTime = self.am_getOption( "RenewedLifeTime", 54000 )
    gLogger.info( "Renewing for %s@%s %s secs" % ( userDN, userGroup, lifeTime ) )
    retVal = self.proxyDB.renewFromMyProxy( userDN,
                                            userGroup,
                                            lifeTime = lifeTime )
    if not retVal[ 'OK' ]:
      # Fix: use the two-argument gLogger.error( fixedText, variableText )
      # convention, consistent with the sibling version of this agent.
      gLogger.error( "Failed to renew proxy", "for %s@%s : %s" % ( userDN, userGroup, retVal[ 'Message' ] ) )
    else:
      gLogger.info( "Renewed proxy for %s@%s" % ( userDN, userGroup ) )

  def __treatRenewalCallback( self, oTJ, exceptionList ):
    """ Exception callback for renewal jobs: just log the traceback. """
    gLogger.exception( lException = exceptionList )

  def execute(self):
    """ The main agent execution method """
    self.proxyDB.purgeLogs()
    gLogger.info( "Purging expired requests" )
    retVal = self.proxyDB.purgeExpiredRequests()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s requests" % retVal[ 'Value' ] )
    gLogger.info( "Purging expired proxies" )
    retVal = self.proxyDB.purgeExpiredProxies()
    if retVal[ 'OK' ]:
      gLogger.info( " purged %s proxies" % retVal[ 'Value' ] )
    retVal = self.proxyDB.getCredentialsAboutToExpire( self.am_getOption( "MinimumLifeTime" , 3600 ) )
    if not retVal[ 'OK' ]:
      return retVal
    data = retVal[ 'Value' ]
    gLogger.info( "Renewing %s proxies..." % len( data ) )
    # each record carries ( userDN, userGroup, ... ); queue one job per pair
    for record in data:
      userDN = record[0]
      userGroup = record[1]
      self.__threadPool.generateJobAndQueueIt( self.__renewProxyForCredentials,
                                               args = ( userDN, userGroup ),
                                               oExceptionCallback = self.__treatRenewalCallback )
    self.__threadPool.processAllResults()
    return S_OK()
class MyProxyRenewalAgent(AgentModule):
  """ Keeps stored proxies alive: each cycle purges stale ProxyDB entries and
      queues a MyProxy renewal job for every credential close to expiration.
  """

  def initialize(self):
    """ Configure the agent: ProxyDB helper plus a pool of renewal threads. """
    minimumLifeTime = self.am_getOption( "MinimumLifeTime", 3600 )
    lifeTimeOnRenew = self.am_getOption( "RenewedLifeTime", 54000 )
    self.proxyDB = ProxyDB( useMyProxy = True )
    for message in ( "Minimum Life time : %s" % minimumLifeTime,
                     "Life time on renew : %s" % lifeTimeOnRenew,
                     "MyProxy server : %s" % self.proxyDB.getMyProxyServer(),
                     "MyProxy max proxy time : %s" % self.proxyDB.getMyProxyMaxLifeTime() ):
      gLogger.info( message )
    # one persistent thread, up to ten queued renewal jobs
    self.__threadPool = ThreadPool( 1, 10 )
    return S_OK()

  def __renewProxyForCredentials( self, userDN, userGroup ):
    """ Thread-pool job: renew the proxy of a single DN/group pair. """
    secondsRequested = self.am_getOption( "RenewedLifeTime", 54000 )
    gLogger.info( "Renewing for %s@%s %s secs" % ( userDN, userGroup, secondsRequested ) )
    renewResult = self.proxyDB.renewFromMyProxy( userDN, userGroup, lifeTime = secondsRequested )
    if renewResult[ 'OK' ]:
      gLogger.info( "Renewed proxy for %s@%s" % ( userDN, userGroup ) )
    else:
      gLogger.error( "Failed to renew proxy", "for %s@%s : %s" %( userDN, userGroup, renewResult[ 'Message' ] ) )

  def __treatRenewalCallback( self, oTJ, exceptionList ):
    """ Exception callback of the thread pool: log the captured traceback. """
    gLogger.exception( lException = exceptionList )

  def execute(self):
    """ The main agent execution method """
    self.proxyDB.purgeLogs()
    gLogger.info( "Purging expired requests" )
    purgeRequests = self.proxyDB.purgeExpiredRequests()
    if purgeRequests[ 'OK' ]:
      gLogger.info( " purged %s requests" % purgeRequests[ 'Value' ] )
    gLogger.info( "Purging expired proxies" )
    purgeProxies = self.proxyDB.purgeExpiredProxies()
    if purgeProxies[ 'OK' ]:
      gLogger.info( " purged %s proxies" % purgeProxies[ 'Value' ] )
    expiring = self.proxyDB.getCredentialsAboutToExpire( self.am_getOption( "MinimumLifeTime", 3600 ) )
    if not expiring[ 'OK' ]:
      return expiring
    data = expiring[ 'Value' ]
    gLogger.info( "Renewing %s proxies..." % len( data ) )
    for record in data:
      credentialDN, credentialGroup = record[0], record[1]
      self.__threadPool.generateJobAndQueueIt( self.__renewProxyForCredentials,
                                               args = ( credentialDN, credentialGroup ),
                                               oExceptionCallback = self.__treatRenewalCallback )
    self.__threadPool.processAllResults()
    return S_OK()
class SystemAdministratorIntegrator:
    """ Fan a single SystemAdministrator RPC call out to many hosts and
        gather the per-host results into one dictionary keyed by host.
    """

    def __init__(self, **kwargs):
        """ Constructor

        :param kwargs: may contain 'hosts' (explicit target host list);
                       everything else is forwarded to each client.
        """
        if 'hosts' in kwargs:
            self.__hosts = kwargs.pop('hosts')
        else:
            hostsLookup = Registry.getHosts()
            self.__hosts = hostsLookup['Value'] if hostsLookup['OK'] else []
        self.__kwargs = dict(kwargs)
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def __getattr__(self, name):
        # Any unknown attribute is taken as the RPC method to invoke:
        # remember its name and hand back the generic executor.
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *parms, **kwargs):
        """ Execute RPC method on a given host """
        resolvedHost = Registry.getHostOption(host, 'Host', host)
        adminClient = SystemAdministratorClient(resolvedHost, **self.__kwargs)
        callResult = getattr(adminClient, method)(*parms, **kwargs)
        callResult['Host'] = host
        return callResult

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        originHost = result.pop('Host')
        self.__resultDict[originHost] = result

    def execute(self, *args, **kwargs):
        """ Main execution method """
        self.__resultDict = {}
        for targetHost in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[targetHost, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
class SystemAdministratorIntegrator:
    """ Broadcasts a single SystemAdministrator RPC call to a set of hosts
        and collects the per-host results in one dictionary keyed by host.
    """

    def __init__(self, **kwargs):
        """ Constructor

        :param kwargs: 'hosts' may carry an explicit list of target hosts;
                       any remaining keyword arguments are forwarded to each
                       SystemAdministratorClient.
        """
        if 'hosts' in kwargs:
            self.__hosts = kwargs['hosts']
            del kwargs['hosts']
        else:
            result = Registry.getHosts()
            if result['OK']:
                self.__hosts = result['Value']
            else:
                # Registry lookup failed: operate on an empty host list
                self.__hosts = []
        # remaining kwargs are client construction options
        self.__kwargs = dict(kwargs)
        # one pool worker per target host
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def __getattr__(self, name):
        # Any attribute not found normally is taken as the RPC method name:
        # remember it and return the generic executor.
        # NOTE(review): the pending call name is stored on the instance, so
        # concurrent calls through the same object would race — confirm this
        # is only used from a single thread.
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *parms, **kwargs):
        """ Execute RPC method on a given host """
        # 'Host' option may override the address actually contacted
        hostName = Registry.getHostOption(host, 'Host', host)
        client = SystemAdministratorClient(hostName, **self.__kwargs)
        result = getattr(client, method)(*parms, **kwargs)
        # tag the result so the callback can attribute it to its host
        result['Host'] = host
        return result

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        host = result['Host']
        del result['Host']
        self.__resultDict[host] = result

    def execute(self, *args, **kwargs):
        """ Main execution method """
        self.__resultDict = {}
        for host in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[host, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
def _updateServiceConfiguration(self, urlSet, fromMaster=False):
    """ Update configuration in a set of service in parallel

    :param set urlSet: a set of service URLs
    :param fromMaster: flag to force updating from the master CS
    :return: Nothing
    """
    # one pool worker per service to refresh
    refreshPool = ThreadPool(len(urlSet))
    for serviceUrl in urlSet:
        refreshPool.generateJobAndQueueIt(self._forceServiceUpdate,
                                          args=[serviceUrl, fromMaster],
                                          kwargs={},
                                          oCallback=self.__processResults)
    refreshPool.processAllResults()
def __updateServiceConfiguration(self, urlSet, fromMaster=False):
    """ Update configuration in a set of service in parallel

    :param set urlSet: a set of service URLs
    :param fromMaster: flag to force updating from the master CS
    :return: S_OK/S_ERROR, Value Successful/Failed dict with service URLs
    """
    # one pool worker per service to refresh
    updatePool = ThreadPool(len(urlSet))
    for serviceUrl in urlSet:
        updatePool.generateJobAndQueueIt(self.__forceServiceUpdate,
                                         args=[serviceUrl, fromMaster],
                                         kwargs={},
                                         oCallback=self.__processResults)
    updatePool.processAllResults()
    # per-URL outcomes accumulated by the callback
    return S_OK(self.__updateResultDict)
def __init__(self, **kwargs):
    """ Constructor

    Determine the target hosts, ping each one, and keep only those that
    run a responding SystemAdministrator service.
    """
    if 'hosts' in kwargs:
        self.__hosts = kwargs.pop('hosts')
    else:
        hostsLookup = Registry.getHosts()
        self.__hosts = hostsLookup['Value'] if hostsLookup['OK'] else []
    # Excluded hosts
    if 'exclude' in kwargs:
        # NOTE(review): 'exclude' stays in kwargs and is later forwarded to
        # SystemAdministratorClient via self.__kwargs — confirm the client
        # tolerates this extra keyword.
        self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
    # Ping the hosts to remove those that don't have a SystemAdministrator service
    respondingHosts = []
    self.silentHosts = []
    self.__resultDict = {}
    # no client options during the ping round
    self.__kwargs = {}
    pingPool = ThreadPool(len(self.__hosts))
    for candidateHost in self.__hosts:
        pingPool.generateJobAndQueueIt(self.__executeClient,
                                       args=[candidateHost, "ping"],
                                       kwargs={},
                                       oCallback=self.__processResult)
    pingPool.processAllResults()
    for pingedHost, pingOutcome in self.__resultDict.items():
        if pingOutcome['OK']:
            respondingHosts.append(pingedHost)
        else:
            self.silentHosts.append(pingedHost)
    del pingPool
    # keep only the responding hosts and build the real working pool
    self.__hosts = respondingHosts
    self.__kwargs = dict(kwargs)
    self.__pool = ThreadPool(len(self.__hosts))
    self.__resultDict = {}
def __init__( self, **kwargs ):
  """ Constructor

      Determines the target hosts, then pings each one and keeps only those
      that run a responding SystemAdministrator service.
  """
  if 'hosts' in kwargs:
    # explicit host list supplied by the caller
    self.__hosts = kwargs['hosts']
    del kwargs['hosts']
  else:
    result = Registry.getHosts()
    if result['OK']:
      self.__hosts = result['Value']
    else:
      # Registry lookup failed: fall back to an empty host list
      self.__hosts = []
  # Excluded hosts
  if 'exclude' in kwargs:
    # NOTE(review): 'exclude' is not removed from kwargs here, so it is later
    # forwarded to SystemAdministratorClient via self.__kwargs — confirm the
    # client tolerates this extra keyword.
    self.__hosts = list ( set( self.__hosts ) - set( kwargs[ 'exclude' ] ) )
  # Ping the hosts to remove those that don't have a SystemAdministrator service
  sysAdminHosts = []
  self.silentHosts = []
  self.__resultDict = {}
  # no client options during the ping round
  self.__kwargs = {}
  pool = ThreadPool( len( self.__hosts ) )
  for host in self.__hosts:
    pool.generateJobAndQueueIt( self.__executeClient,
                                args = [ host, "ping" ],
                                kwargs = {},
                                oCallback = self.__processResult )
  pool.processAllResults()
  for host, result in self.__resultDict.items():
    if result['OK']:
      sysAdminHosts.append( host )
    else:
      self.silentHosts.append( host )
  del pool
  # keep only the responding hosts and build the real working pool
  self.__hosts = sysAdminHosts
  self.__kwargs = dict( kwargs )
  self.__pool = ThreadPool( len( self.__hosts ) )
  self.__resultDict = {}
class Publisher:
  """
  Class Publisher is in charge of getting dispersed information,
  to be published on the web.
  """

#############################################################################

  def __init__(self, VOExtension, rsDBIn=None, commandCallerIn=None, infoGetterIn=None, WMSAdminIn=None):
    """ Standard constructor

        :params:
          :attr:`VOExtension`: string, VO Extension (e.g. 'LHCb')

          :attr:`rsDBIn`: optional ResourceStatusDB object
          (see :class:`DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.ResourceStatusDB`)

          :attr:`commandCallerIn`: optional CommandCaller object
          (see :class:`DIRAC.ResourceStatusSystem.Command.CommandCaller.CommandCaller`)

          :attr:`infoGetterIn`: optional InfoGetter object
          (see :class:`DIRAC.ResourceStatusSystem.Utilities.InfoGetter.InfoGetter`)

          :attr:`WMSAdminIn`: optional RPCClient object for WMSAdmin
          (see :class:`DIRAC.Core.DISET.RPCClient.RPCClient`)
    """
    # VO-specific policy configuration module, loaded dynamically
    self.configModule = __import__(VOExtension + "DIRAC.ResourceStatusSystem.Policy.Configurations",
                                   globals(), locals(), ['*'])
    # each collaborator may be injected (for tests) or built here lazily
    if rsDBIn is not None:
      self.rsDB = rsDBIn
    else:
      from DIRAC.ResourceStatusSystem.DB.ResourceStatusDB import ResourceStatusDB
      self.rsDB = ResourceStatusDB()
    if commandCallerIn is not None:
      self.cc = commandCallerIn
    else:
      from DIRAC.ResourceStatusSystem.Command.CommandCaller import CommandCaller
      self.cc = CommandCaller()
    if infoGetterIn is not None:
      self.ig = infoGetterIn
    else:
      from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import InfoGetter
      self.ig = InfoGetter(VOExtension)
    if WMSAdminIn is not None:
      self.WMSAdmin = WMSAdminIn
    else:
      from DIRAC.Core.DISET.RPCClient import RPCClient
      self.WMSAdmin = RPCClient("WorkloadManagement/WMSAdministrator")
    # pool used by getInfo() to run getInfoForPanel() jobs in parallel
    self.threadPool = ThreadPool(2, 5)
    # guards infoForPanel_res, which is shared with the pool threads
    self.lockObj = threading.RLock()
    self.infoForPanel_res = {}

#############################################################################

  def getInfo(self, granularity, name, useNewRes=False):
    """ Standard method to get all the info to be published

        This method uses a ThreadPool (:class:`DIRAC.Core.Utilities.ThreadPool.ThreadPool`)
        with 2-5 threads. The threaded method is
        :meth:`DIRAC.ResourceStatusSystem.Utilities.Publisher.Publisher.getInfoForPanel`

        :params:
          :attr:`granularity`: string - a ValidRes

          :attr:`name`: string - name of the Validres

          :attr:`useNewRes`: boolean. When set to true, will get new results,
          otherwise it will get cached results (where available).
    """
    if granularity not in ValidRes:
      raise InvalidRes, where(self, self.getInfo)

    self.infoForPanel_res = {}

    status = None
    formerStatus = None
    siteType = None
    serviceType = None
    resourceType = None

    if granularity in ('Resource', 'Resources'):
      try:
        resourceType = self.rsDB.getMonitoredsList('Resource', ['ResourceType'],
                                                   resourceName=name)[0][0]
      except IndexError:
        return "%s does not exist!" % name

    if granularity in ('StorageElement', 'StorageElements'):
      try:
        siteType = self.rsDB.getMonitoredsList('StorageElement', ['SiteType'],
                                               storageElementName=name)[0][0]
      except IndexError:
        return "%s does not exist!" % name

    paramNames = ['Type', 'Group', 'Name', 'Policy', 'DIRAC Status',
                  'RSS Status', 'Reason', 'Description']

    # panels (and the info each should show) applicable to this resource
    infoToGet = self.ig.getInfoToApply(('view_info', ), granularity, status=status,
                                       formerStatus=formerStatus, siteType=siteType,
                                       serviceType=serviceType, resourceType=resourceType,
                                       useNewRes=useNewRes)[0]['Panels']
    infoToGet_res = {}

    recordsList = []
    infosForPolicy = {}

    for panel in infoToGet.keys():

      (granularityForPanel, nameForPanel) = self.__getNameForPanel(granularity, name, panel)

      if not self._resExist(granularityForPanel, nameForPanel):
#        completeInfoForPanel_res = None
        continue

      # take composite RSS result for name
      nameStatus_res = self._getStatus(nameForPanel, panel)

      recordBase = [None, None, None, None, None, None, None, None]

      recordBase[1] = panel.replace('_Panel', '')
      recordBase[2] = nameForPanel  # nameForPanel
      try:
        recordBase[4] = nameStatus_res[nameForPanel]['DIRACStatus']  # DIRAC Status
      except:
        # not every panel carries a DIRAC status (see _getStatus)
        pass
      recordBase[5] = nameStatus_res[nameForPanel]['RSSStatus']  # RSS Status

      record = copy.deepcopy(recordBase)
      record[0] = 'ResultsForResource'

      recordsList.append(record)

      # take info that goes into the panel
      infoForPanel = infoToGet[panel]

      for info in infoForPanel:
        # each job fills self.infoForPanel_res[policyName] under the lock
        self.threadPool.generateJobAndQueueIt(self.getInfoForPanel,
                                              args=(info, granularityForPanel, nameForPanel))

      self.threadPool.processAllResults()

      for policy in [x.keys()[0] for x in infoForPanel]:
        record = copy.deepcopy(recordBase)
        record[0] = 'SpecificInformation'
        record[3] = policy  # policyName
        record[4] = None  # DIRAC Status
        record[5] = self.infoForPanel_res[policy]['Status']  # RSS status for the policy
        record[6] = self.infoForPanel_res[policy]['Reason']  # Reason
        record[7] = self.infoForPanel_res[policy]['desc']  # Description
        recordsList.append(record)

        infosForPolicy[policy] = self.infoForPanel_res[policy]['infos']

    infoToGet_res['TotalRecords'] = len(recordsList)
    infoToGet_res['ParameterNames'] = paramNames
    infoToGet_res['Records'] = recordsList
    infoToGet_res['Extras'] = infosForPolicy

    return infoToGet_res

#############################################################################

  def getInfoForPanel(self, info, granularityForPanel, nameForPanel):
    """ Threaded worker: fetch the stored result of one policy plus the extra
        info the panel displays, and store both in self.infoForPanel_res.
    """
    # get single RSS policy results
    policyResToGet = info.keys()[0]
    pol_res = self.rsDB.getPolicyRes(nameForPanel, policyResToGet)
    if pol_res != []:
      pol_res_dict = {'Status': pol_res[0], 'Reason': pol_res[1]}
    else:
      pol_res_dict = {'Status': 'Unknown', 'Reason': 'Unknown'}
    self.lockObj.acquire()
    try:
      self.infoForPanel_res[policyResToGet] = pol_res_dict
    finally:
      self.lockObj.release()

    # get policy description
    desc = self._getPolicyDesc(policyResToGet)

    # get other info
    othersInfo = info.values()[0]
    if not isinstance(othersInfo, list):
      othersInfo = [othersInfo]

    info_res = {}

    for oi in othersInfo:
      # each entry maps a format ('RSS' or a command spec) to what to fetch
      format = oi.keys()[0]
      what = oi.values()[0]

      info_bit_got = self._getInfo(granularityForPanel, nameForPanel, format, what)

      info_res[format] = info_bit_got

    self.lockObj.acquire()
    try:
      self.infoForPanel_res[policyResToGet]['infos'] = info_res
      self.infoForPanel_res[policyResToGet]['desc'] = desc
    finally:
      self.lockObj.release()

#############################################################################

  def _getStatus(self, name, panel):
    """ Return { name: {'RSSStatus': ..., ['DIRACStatus': ...]} } — the DIRAC
        status is added only for the Site and SE panels.
    """
    # get RSS status
    RSSStatus = self._getInfoFromRSSDB(name, panel)[0][1]

    # get DIRAC status
    if panel in ('Site_Panel', 'SE_Panel'):

      if panel == 'Site_Panel':
        DIRACStatus = self.WMSAdmin.getSiteMaskLogging(name)
        if DIRACStatus['OK']:
          # most recent mask-logging entry for this site
          DIRACStatus = DIRACStatus['Value'][name].pop()[0]
        else:
          raise RSSException, where(self, self._getStatus)

      elif panel == 'SE_Panel':
        ra = getStorageElementStatus(name, 'ReadAccess')['Value']
        wa = getStorageElementStatus(name, 'WriteAccess')['Value']
        DIRACStatus = {'ReadAccess': ra, 'WriteAccess': wa}

      status = {name: {'RSSStatus': RSSStatus, 'DIRACStatus': DIRACStatus}}

    else:
      status = {name: {'RSSStatus': RSSStatus}}

    return status

#############################################################################

  def _getInfo(self, granularity, name, format, what):
    """ Fetch one piece of panel info either from the RSS DB (format 'RSS')
        or by invoking a command through the CommandCaller.
    """
    if format == 'RSS':
      info_bit_got = self._getInfoFromRSSDB(name, what)
    else:
      if isinstance(what, dict):
        command = what['CommandIn']
        extraArgs = what['args']
      else:
        command = what
        extraArgs = None

      info_bit_got = self.cc.commandInvocation(granularity, name, None,
                                               None, command, extraArgs)

      try:
        info_bit_got = info_bit_got['Result']
      except:
        # some commands return the bare value rather than a {'Result': ...}
        pass

    return info_bit_got

#############################################################################

  def _getInfoFromRSSDB(self, name, what):
    """ Translate a panel/section identifier into a getMonitoredsList query
        and return its rows.
    """
    paramsL = ['Status']

    siteName = None
    serviceName = None
    resourceName = None
    storageElementName = None
    serviceType = None
    gridSiteName = None

    if what == 'ServiceOfSite':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      paramsL.append('Reason')
      siteName = name
    elif what == 'ResOfCompService':
      gran = 'Resources'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      # name is of the form serviceType@DIRACSiteName
      serviceType = name.split('@')[0]
      gridSiteName = getGOCSiteName(name.split('@')[1])
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'ResOfStorService':
      gran = 'Resources'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      serviceType = name.split('@')[0]
      gridSiteName = getGOCSiteName(name.split('@')[1])
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'ResOfStorEl':
      gran = 'StorageElements'
      paramsL.insert(0, 'ResourceName')
      paramsL.append('Reason')
      storageElementName = name
    elif what == 'StorageElementsOfSite':
      gran = 'StorageElements'
      paramsL.insert(0, 'StorageElementName')
      paramsL.append('Reason')
      if '@' in name:
        DIRACsiteName = name.split('@').pop()
      else:
        DIRACsiteName = name
      gridSiteName = getGOCSiteName(DIRACsiteName)
      if not gridSiteName['OK']:
        raise RSSException, gridSiteName['Message']
      gridSiteName = gridSiteName['Value']
    elif what == 'Site_Panel':
      gran = 'Site'
      paramsL.insert(0, 'SiteName')
      siteName = name
    elif what == 'Service_Computing_Panel':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_Storage_Panel':
      gran = 'Service'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_VO-BOX_Panel':
      gran = 'Services'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Service_VOMS_Panel':
      gran = 'Services'
      paramsL.insert(0, 'ServiceName')
      serviceName = name
    elif what == 'Resource_Panel':
      gran = 'Resource'
      paramsL.insert(0, 'ResourceName')
      resourceName = name
    elif what == 'SE_Panel':
      gran = 'StorageElement'
      paramsL.insert(0, 'StorageElementName')
      storageElementName = name

    info_bit_got = self.rsDB.getMonitoredsList(gran, paramsList=paramsL,
                                               siteName=siteName,
                                               serviceName=serviceName,
                                               serviceType=serviceType,
                                               resourceName=resourceName,
                                               storageElementName=storageElementName,
                                               gridSiteName=gridSiteName)

    return info_bit_got

#############################################################################

  def _getPolicyDesc(self, policyName):
    # description comes from the VO-specific Configurations module
    return self.configModule.Policies[policyName]['Description']

#############################################################################

  def __getNameForPanel(self, granularity, name, panel):
    """ For site-level views some panels actually describe a service of the
        site: map (granularity, name) accordingly.
    """
    if granularity in ('Site', 'Sites'):
      if panel == 'Service_Computing_Panel':
        granularity = 'Service'
        name = 'Computing@' + name
      elif panel == 'Service_Storage_Panel':
        granularity = 'Service'
        name = 'Storage@' + name
      elif panel == 'OtherServices_Panel':
        granularity = 'Service'
        name = 'OtherS@' + name
      elif panel == 'Service_VOMS_Panel':
        granularity = 'Service'
        name = 'VOMS@' + name
      elif panel == 'Service_VO-BOX_Panel':
        granularity = 'Service'
        name = 'VO-BOX@' + name
#      else:
#        granularity = granularity
#        name = name
#    else:
#      granularity = granularity
#      name = name

    return (granularity, name)

#############################################################################

  def _resExist(self, granularity, name):
    """ Return True when the monitored resource exists in the RSS DB. """
    siteName = None
    serviceName = None
    resourceName = None
    storageElementName = None

    if granularity in ('Site', 'Sites'):
      siteName = name
    elif granularity in ('Service', 'Services'):
      serviceName = name
    elif granularity in ('Resource', 'Resources'):
      resourceName = name
    elif granularity in ('StorageElement', 'StorageElements'):
      storageElementName = name

    res = self.rsDB.getMonitoredsList(granularity, siteName=siteName,
                                      serviceName=serviceName,
                                      resourceName=resourceName,
                                      storageElementName=storageElementName)

    if res == []:
      return False
    else:
      return True
class SystemAdministratorIntegrator(object):
    """ Broadcasts a single SystemAdministrator RPC call to a set of hosts
        and collects the per-host results in one dictionary keyed by host.
        On construction, the target hosts are pinged and only the responding
        ones are kept.
    """

    def __init__(self, **kwargs):
        """ Constructor

        :param kwargs: 'hosts' may carry an explicit target host list and
                       'exclude' a list of hosts to drop; the remaining
                       keyword arguments are forwarded to each client.
        """
        if 'hosts' in kwargs:
            self.__hosts = kwargs['hosts']
            del kwargs['hosts']
        else:
            result = Registry.getHosts()
            if result['OK']:
                self.__hosts = result['Value']
            else:
                # Registry lookup failed: operate on an empty host list
                self.__hosts = []
        # Excluded hosts
        if 'exclude' in kwargs:
            # NOTE(review): 'exclude' is not removed from kwargs here, so it is
            # later forwarded to SystemAdministratorClient via self.__kwargs —
            # confirm the client tolerates this extra keyword.
            self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))
        # Ping the hosts to remove those that don't have a SystemAdministrator service
        sysAdminHosts = []
        self.silentHosts = []
        self.__resultDict = {}
        # no client options during the ping round
        self.__kwargs = {}
        pool = ThreadPool(len(self.__hosts))
        for host in self.__hosts:
            pool.generateJobAndQueueIt(self.__executeClient,
                                       args=[host, "ping"],
                                       kwargs={},
                                       oCallback=self.__processResult)
        pool.processAllResults()
        for host, result in self.__resultDict.items():
            if result['OK']:
                sysAdminHosts.append(host)
            else:
                self.silentHosts.append(host)
        del pool
        # keep only the responding hosts and build the real working pool
        self.__hosts = sysAdminHosts
        self.__kwargs = dict(kwargs)
        self.__pool = ThreadPool(len(self.__hosts))
        self.__resultDict = {}

    def getSilentHosts(self):
        """ Get a list of non-responding hosts

        :return: list of hosts
        """
        return self.silentHosts

    def getRespondingHosts(self):
        """ Get a list of responding hosts

        :return: list of hosts
        """
        return self.__hosts

    def __getattr__(self, name):
        # Any attribute not found normally is taken as the RPC method name:
        # remember it and return the generic executor.
        # NOTE(review): the pending call name is stored on the instance, so
        # concurrent calls through the same object would race — confirm this
        # is only used from a single thread.
        self.call = name
        return self.execute

    def __executeClient(self, host, method, *parms, **kwargs):
        """ Execute RPC method on a given host """
        # 'Host' option may override the address actually contacted
        hostName = Registry.getHostOption(host, 'Host', host)
        client = SystemAdministratorClient(hostName, **self.__kwargs)
        result = getattr(client, method)(*parms, **kwargs)
        # tag the result so the callback can attribute it to its host
        result['Host'] = host
        return result

    def __processResult(self, id_, result):
        """ Collect results in the final structure """
        host = result['Host']
        del result['Host']
        self.__resultDict[host] = result

    def execute(self, *args, **kwargs):
        """ Main execution method """
        self.__resultDict = {}
        for host in self.__hosts:
            self.__pool.generateJobAndQueueIt(self.__executeClient,
                                              args=[host, self.call] + list(args),
                                              kwargs=kwargs,
                                              oCallback=self.__processResult)
        self.__pool.processAllResults()
        return S_OK(self.__resultDict)
class RemovalAgent( AgentModule, RequestAgentMixIn ): """ This Agent takes care of executing "removal" request from the RequestManagement system """ def __init__( self, *args ): """ Initialize the base class and define some extra data members """ AgentModule.__init__( self, *args ) self.requestDBClient = None self.replicaManager = None self.maxNumberOfThreads = 4 self.maxRequestsInQueue = 100 self.threadPool = None def initialize( self ): """ Called by the framework upon startup, before any cycle (execute method bellow) """ self.requestDBClient = RequestClient() self.replicaManager = ReplicaManager() gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM ) gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveFileDone", "File removal done", 
"RemovalAgent", "Removal/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM ) self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads ) self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue ) self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue ) # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted self.threadPool.daemonize() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) return S_OK() def execute( self ): """ Fill the TreadPool with ThreadJobs """ while True: requestExecutor = ThreadedJob( self.executeRequest ) ret = self.threadPool.queueJob( requestExecutor ) if not ret['OK']: break return S_OK() def executeRequest( self ): """ Do the actual work in the Thread """ ################################################ # Get a request from request DB gMonitor.addMark( "Iteration", 1 ) res = self.requestDBClient.getRequest( 'removal' ) if not res['OK']: gLogger.info( "RemovalAgent.execute: Failed to get request from database." ) return S_OK() elif not res['Value']: gLogger.info( "RemovalAgent.execute: No requests to be executed found." 
) return S_OK() requestString = res['Value']['RequestString'] requestName = res['Value']['RequestName'] sourceServer = res['Value']['Server'] try: jobID = int( res['Value']['JobID'] ) except ValueError: jobID = 0 gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName ) result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer ) if result['OK']: currentOrder = result['Value'] else: gLogger.error( 'Can not get the request execution order' ) return S_OK( 'Can not get the request execution order' ) oRequest = RequestContainer( request = requestString ) ################################################ # Find the number of sub-requests from the request res = oRequest.getNumSubRequests( 'removal' ) if not res['OK']: errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests." gLogger.error( errStr, res['Message'] ) return S_OK() gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] ) ################################################ # For all the sub-requests in the request modified = False for ind in range( res['Value'] ): gMonitor.addMark( "Execute", 1 ) gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind ) subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value'] subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] ) subStatus = subRequestAttributes['Status'] if subStatus == 'Waiting' and subExecutionOrder <= currentOrder: subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value'] operation = subRequestAttributes['Operation'] ################################################ # If the sub-request is a physical removal operation if operation == 'physicalRemoval': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." 
% operation ) diracSEs = subRequestAttributes['TargetSE'].split( ',' ) physicalFiles = [] pfnToLfn = {} for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': pfn = str( subRequestFile['PFN'] ) lfn = str( subRequestFile['LFN'] ) pfnToLfn[pfn] = lfn physicalFiles.append( pfn ) gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) ) failed = {} errMsg = {} for diracSE in diracSEs: res = self.replicaManager.removeStorageFile( physicalFiles, diracSE ) if res['OK']: for pfn in res['Value']['Failed'].keys(): if not failed.has_key( pfn ): failed[pfn] = {} failed[pfn][diracSE] = res['Value']['Failed'][pfn] else: errMsg[diracSE] = res['Message'] for pfn in physicalFiles: if not failed.has_key( pfn ): failed[pfn] = {} failed[pfn][diracSE] = 'Completely' # Now analyse the results failedPFNs = failed.keys() pfnsOK = [pfn for pfn in physicalFiles if not pfn in failedPFNs] gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) ) for pfn in pfnsOK: gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) ) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' ) if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) ) modified = True if failed: gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) ) for pfn in failedPFNs: for diracSE in failed[pfn].keys(): if type( failed[pfn][diracSE] ) in StringTypes: if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ): gLogger.info( "RemovalAgent.execute: File did not exist.", pfn ) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' ) if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) ) modified = True else: gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) ) if 
errMsg: for diracSE in errMsg.keys(): errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE gLogger.error( errStr, errMsg[diracSE] ) ################################################ # If the sub-request is a physical removal operation elif operation == 'removeFile': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation ) lfns = [] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': lfn = str( subRequestFile['LFN'] ) lfns.append( lfn ) gMonitor.addMark( 'RemoveFileAtt', len( lfns ) ) res = self.replicaManager.removeFile( lfns ) if res['OK']: gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) ) for lfn in res['Value']['Successful'].keys(): gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn ) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' ) if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) ) modified = True gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) ) for lfn in res['Value']['Failed'].keys(): if type( res['Value']['Failed'][lfn] ) in StringTypes: if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ): gLogger.info( "RemovalAgent.execute: File did not exist.", lfn ) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' ) if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) ) modified = True else: gLogger.info( "RemovalAgent.execute: Failed to remove file:", "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) ) else: gMonitor.addMark( 'RemoveFileFail', len( lfns ) ) errStr = "RemovalAgent.execute: Completely failed to remove files files." 
gLogger.error( errStr, res['Message'] ) ################################################ # If the sub-request is a physical removal operation elif operation == 'replicaRemoval': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation ) diracSEs = subRequestAttributes['TargetSE'].split( ',' ) lfns = [] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': lfn = str( subRequestFile['LFN'] ) lfns.append( lfn ) gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) ) failed = {} errMsg = {} for diracSE in diracSEs: res = self.replicaManager.removeReplica( diracSE, lfns ) if res['OK']: for lfn in res['Value']['Failed'].keys(): if not failed.has_key( lfn ): failed[lfn] = {} failed[lfn][diracSE] = res['Value']['Failed'][lfn] else: errMsg[diracSE] = res['Message'] for lfn in lfns: if not failed.has_key( lfn ): failed[lfn] = {} failed[lfn][diracSE] = 'Completely' # Now analyse the results failedLFNs = failed.keys() lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs] gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) ) for lfn in lfnsOK: gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) ) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' ) if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) ) modified = True if failed: gMonitor.addMark( 'PhysicalRemovalFail', len( failedLFNs ) ) for lfn in failedLFNs: for diracSE in failed[lfn].keys(): if type( failed[lfn][diracSE] ) in StringTypes: if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ): gLogger.info( "RemovalAgent.execute: File did not exist.", lfn ) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' ) if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) ) modified = True else: gLogger.info( "RemovalAgent.execute: 
Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) ) if errMsg: for diracSE in errMsg.keys(): errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE gLogger.error( errStr, errMsg[diracSE] ) ################################################ # If the sub-request is a request to the online system to retransfer elif operation == 'reTransfer': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation ) diracSE = subRequestAttributes['TargetSE'] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': pfn = str( subRequestFile['PFN'] ) lfn = str( subRequestFile['LFN'] ) res = self.replicaManager.onlineRetransfer( diracSE, pfn ) if res['OK']: if res['Value']['Successful'].has_key( pfn ): gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn ) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' ) if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) ) modified = True else: errStr = "RemovalAgent.execute: Failed to request retransfer." gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) ) else: errStr = "RemovalAgent.execute: Completely failed to request retransfer." gLogger.error( errStr, res['Message'] ) else: gLogger.info( "RemovalAgent.execute: File already completed." 
) ################################################ # If the sub-request is none of the above types else: gLogger.error( "RemovalAgent.execute: Operation not supported.", operation ) ################################################ # Determine whether there are any active files if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']: oRequest.setSubRequestStatus( ind, 'removal', 'Done' ) gMonitor.addMark( "Done", 1 ) ################################################ # If the sub-request is already in terminal state else: gLogger.info( "RemovalAgent.execute:", "Sub-request %s is status '%s' and not to be executed." % ( ind, subRequestAttributes['Status'] ) ) ################################################ # Generate the new request string after operation requestString = oRequest.toXML()['Value'] res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer ) if modified and jobID: result = self.finalizeRequest( requestName, jobID, sourceServer ) return S_OK() def finalize( self ): """ Called by the Agent framework to cleanly end execution. In this case this module will wait until all pending ThreadedJbos in the ThreadPool get executed """ self.threadPool.processAllResults() return S_OK()
class RemovalAgent(AgentModule, RequestAgentMixIn):
    """
    This Agent takes care of executing "removal" requests from the
    RequestManagement system.

    Requests are pulled from the RequestDB one at a time by worker threads
    (see execute/executeRequest); each request may contain several removal
    sub-requests (physicalRemoval, removeFile, replicaRemoval, reTransfer).
    """

    def __init__(self, *args):
        """ Initialize the base class and define some extra data members
        """
        AgentModule.__init__(self, *args)
        self.requestDBClient = None
        self.replicaManager = None
        self.maxNumberOfThreads = 4
        self.maxRequestsInQueue = 100
        self.threadPool = None
        # counts gfal timeouts observed while removing replicas; reported once per cycle
        self.timeOutCounter = 0
        # cleared by executeRequest when the DB has no more waiting requests
        self.pendingRequests = True

    def initialize(self):
        """ Called by the framework upon startup, before any cycle (execute method below)
        """
        self.requestDBClient = RequestClient()
        # the RequestAgentMixIn needs the capitalized version, until it is fixed keep this.
        self.RequestDBClient = self.requestDBClient
        self.replicaManager = ReplicaManager()

        gMonitor.registerActivity("Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM)
        gMonitor.registerActivity("ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RemoveFileDone", "File removal done", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM)

        self.maxNumberOfThreads = self.am_getOption('NumberOfThreads', self.maxNumberOfThreads)
        self.maxRequestsInQueue = self.am_getOption('RequestsInQueue', self.maxRequestsInQueue)
        self.threadPool = ThreadPool(1, self.maxNumberOfThreads, self.maxRequestsInQueue)
        # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
        self.threadPool.daemonize()

        self.maxRequests = self.am_getOption('MaxRequestsPerCycle', 1200.)

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        return S_OK()

    def execute(self):
        """ Fill the ThreadPool with ThreadedJobs, up to MaxRequestsPerCycle per cycle
        """
        self.pendingRequests = True
        # hard cap at 10000 requests per cycle regardless of configuration
        self.maxRequests = min(10000., self.am_getOption('MaxRequestsPerCycle', self.maxRequests))
        requestCounter = 0
        while self.pendingRequests:
            if requestCounter > self.maxRequests:
                break
            requestCounter += 1
            requestExecutor = ThreadedJob(self.executeRequest)
            ret = self.threadPool.queueJob(requestExecutor)
            if not ret['OK']:
                # queue full: stop submitting for this cycle
                break
            time.sleep(0.1)

        if self.timeOutCounter:
            gLogger.error('Timeouts during removal execution:', self.timeOutCounter)

        return S_OK()

    def executeRequest(self):
        """ Do the actual work in the Thread: fetch one 'removal' request, execute
            each waiting sub-request, and push the updated request back to the DB.
        """
        ################################################
        # Get a request from request DB
        gMonitor.addMark("Iteration", 1)
        res = self.requestDBClient.getRequest('removal')
        if not res['OK']:
            gLogger.info("RemovalAgent.execute: Failed to get request from database.")
            return S_OK()
        elif not res['Value']:
            gLogger.info("RemovalAgent.execute: No requests to be executed found.")
            self.pendingRequests = False
            return S_OK()

        requestString = res['Value']['RequestString']
        requestName = res['Value']['RequestName']
        sourceServer = res['Value']['Server']

        jobID = 0
        try:
            jobID = int(res['Value']['JobID'])
        except (KeyError, TypeError, ValueError):
            # narrowed from a bare except: only the "absent or malformed" cases are expected
            gLogger.warn("RemovalAgent.execute: JobID not present or malformed in request '%s', will use 0 instead." % requestName)

        gLogger.info("RemovalAgent.execute: Obtained request %s" % requestName)

        try:
            result = self.requestDBClient.getCurrentExecutionOrder(requestName, sourceServer)
            if result['OK']:
                currentOrder = result['Value']
            else:
                gLogger.error('Can not get the request execution order')
                self.requestDBClient.updateRequest(requestName, requestString, sourceServer)
                return S_OK('Can not get the request execution order')

            oRequest = RequestContainer(request=requestString)

            ################################################
            # Find the number of sub-requests from the request
            res = oRequest.getNumSubRequests('removal')
            if not res['OK']:
                errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
                gLogger.error(errStr, res['Message'])
                return S_OK()

            gLogger.info("RemovalAgent.execute: Found %s sub requests." % res['Value'])

            ################################################
            # For all the sub-requests in the request
            modified = False
            for ind in range(res['Value']):
                gMonitor.addMark("Execute", 1)
                gLogger.info("RemovalAgent.execute: Processing sub-request %s." % ind)
                subRequestAttributes = oRequest.getSubRequestAttributes(ind, 'removal')['Value']
                subExecutionOrder = int(subRequestAttributes['ExecutionOrder'])
                subStatus = subRequestAttributes['Status']
                if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
                    subRequestFiles = oRequest.getSubRequestFiles(ind, 'removal')['Value']
                    operation = subRequestAttributes['Operation']

                    ################################################
                    # If the sub-request is a physical removal operation
                    if operation == 'physicalRemoval':
                        gLogger.info("RemovalAgent.execute: Attempting to execute %s sub-request." % operation)
                        diracSEs = subRequestAttributes['TargetSE'].split(',')
                        physicalFiles = []
                        pfnToLfn = {}
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                pfn = str(subRequestFile['PFN'])
                                lfn = str(subRequestFile['LFN'])
                                pfnToLfn[pfn] = lfn
                                physicalFiles.append(pfn)
                        gMonitor.addMark('PhysicalRemovalAtt', len(physicalFiles))
                        # failed: pfn -> { SE -> reason }, errMsg: SE -> overall error
                        failed = {}
                        errMsg = {}
                        for diracSE in diracSEs:
                            res = self.replicaManager.removeStorageFile(physicalFiles, diracSE)
                            if res['OK']:
                                for pfn in res['Value']['Failed']:
                                    failed.setdefault(pfn, {})[diracSE] = res['Value']['Failed'][pfn]
                            else:
                                # whole-SE failure: every file counts as completely failed there
                                errMsg[diracSE] = res['Message']
                                for pfn in physicalFiles:
                                    failed.setdefault(pfn, {})[diracSE] = 'Completely'
                        # Now analyse the results
                        failedPFNs = list(failed)
                        pfnsOK = [pfn for pfn in physicalFiles if pfn not in failedPFNs]
                        gMonitor.addMark('PhysicalRemovalDone', len(pfnsOK))
                        for pfn in pfnsOK:
                            gLogger.info("RemovalAgent.execute: Successfully removed %s at %s" % (pfn, str(diracSEs)))
                            res = oRequest.setSubRequestFileAttributeValue(ind, 'removal', pfnToLfn[pfn], 'Status', 'Done')
                            if not res['OK']:
                                gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', pfnToLfn[pfn]))
                            modified = True
                        if failed:
                            gMonitor.addMark('PhysicalRemovalFail', len(failedPFNs))
                            for pfn in failedPFNs:
                                for diracSE in failed[pfn]:
                                    if type(failed[pfn][diracSE]) in StringTypes:
                                        # a missing file is as good as removed
                                        if re.search('no such file or directory', failed[pfn][diracSE].lower()):
                                            gLogger.info("RemovalAgent.execute: File did not exist.", pfn)
                                            res = oRequest.setSubRequestFileAttributeValue(ind, 'removal', pfnToLfn[pfn], 'Status', 'Done')
                                            if not res['OK']:
                                                gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', pfnToLfn[pfn]))
                                            modified = True
                                        else:
                                            gLogger.info("RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % (pfn, diracSE, failed[pfn][diracSE]))
                        if errMsg:
                            for diracSE in errMsg:
                                # fix: was "...At %s", diracSE which built a tuple, not a string
                                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
                                gLogger.error(errStr, errMsg[diracSE])

                    ################################################
                    # If the sub-request is a file removal operation
                    elif operation == 'removeFile':
                        gLogger.info("RemovalAgent.execute: Attempting to execute %s sub-request." % operation)
                        lfns = []
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                lfns.append(str(subRequestFile['LFN']))
                        gMonitor.addMark('RemoveFileAtt', len(lfns))
                        res = self.replicaManager.removeFile(lfns)
                        if res['OK']:
                            gMonitor.addMark('RemoveFileDone', len(res['Value']['Successful']))
                            for lfn in res['Value']['Successful']:
                                gLogger.info("RemovalAgent.execute: Successfully removed %s." % lfn)
                                result = oRequest.setSubRequestFileAttributeValue(ind, 'removal', lfn, 'Status', 'Done')
                                if not result['OK']:
                                    gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn))
                                modified = True
                            gMonitor.addMark('RemoveFileFail', len(res['Value']['Failed']))
                            for lfn in res['Value']['Failed']:
                                if type(res['Value']['Failed'][lfn]) in StringTypes:
                                    if re.search('no such file or directory', res['Value']['Failed'][lfn].lower()):
                                        gLogger.info("RemovalAgent.execute: File did not exist.", lfn)
                                        result = oRequest.setSubRequestFileAttributeValue(ind, 'removal', lfn, 'Status', 'Done')
                                        if not result['OK']:
                                            gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn))
                                        modified = True
                                    else:
                                        gLogger.info("RemovalAgent.execute: Failed to remove file:", "%s %s" % (lfn, res['Value']['Failed'][lfn]))
                        else:
                            gMonitor.addMark('RemoveFileFail', len(lfns))
                            # fix: message read "remove files files"
                            errStr = "RemovalAgent.execute: Completely failed to remove files."
                            gLogger.error(errStr, res['Message'])

                    ################################################
                    # If the sub-request is a replica removal operation
                    elif operation == 'replicaRemoval':
                        gLogger.info("RemovalAgent.execute: Attempting to execute %s sub-request." % operation)
                        diracSEs = subRequestAttributes['TargetSE'].split(',')
                        lfns = []
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                lfns.append(str(subRequestFile['LFN']))
                        gMonitor.addMark('ReplicaRemovalAtt', len(lfns))
                        failed = {}
                        errMsg = {}
                        for diracSE in diracSEs:
                            res = self.replicaManager.removeReplica(diracSE, lfns)
                            if res['OK']:
                                for lfn in res['Value']['Failed']:
                                    errorMessage = str(res['Value']['Failed'][lfn])
                                    if errorMessage.find('Write access not permitted for this credential.') != -1:
                                        # retry once with a proxy belonging to the file owner
                                        if self.__getProxyAndRemoveReplica(diracSE, lfn):
                                            continue
                                    if errorMessage.find('seconds timeout for "__gfal_wrapper" call') != -1:
                                        self.timeOutCounter += 1
                                    failed.setdefault(lfn, {})[diracSE] = res['Value']['Failed'][lfn]
                            else:
                                errMsg[diracSE] = res['Message']
                                for lfn in lfns:
                                    failed.setdefault(lfn, {})[diracSE] = 'Completely'
                        # Now analyse the results
                        failedLFNs = list(failed)
                        lfnsOK = [lfn for lfn in lfns if lfn not in failedLFNs]
                        gMonitor.addMark('ReplicaRemovalDone', len(lfnsOK))
                        for lfn in lfnsOK:
                            gLogger.info("RemovalAgent.execute: Successfully removed %s at %s" % (lfn, str(diracSEs)))
                            res = oRequest.setSubRequestFileAttributeValue(ind, 'removal', lfn, 'Status', 'Done')
                            if not res['OK']:
                                gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn))
                            modified = True
                        if failed:
                            # fix: this branch previously marked 'PhysicalRemovalFail'
                            gMonitor.addMark('ReplicaRemovalFail', len(failedLFNs))
                            for lfn in failedLFNs:
                                for diracSE in failed[lfn]:
                                    if type(failed[lfn][diracSE]) in StringTypes:
                                        if re.search('no such file or directory', failed[lfn][diracSE].lower()):
                                            gLogger.info("RemovalAgent.execute: File did not exist.", lfn)
                                            res = oRequest.setSubRequestFileAttributeValue(ind, 'removal', lfn, 'Status', 'Done')
                                            if not res['OK']:
                                                gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn))
                                            modified = True
                                        else:
                                            gLogger.info("RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % (lfn, diracSE, failed[lfn][diracSE]))
                        if errMsg:
                            for diracSE in errMsg:
                                # fix: was "...At %s", diracSE which built a tuple, not a string
                                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
                                gLogger.error(errStr, errMsg[diracSE])

                    ################################################
                    # If the sub-request is a request to the online system to retransfer
                    elif operation == 'reTransfer':
                        gLogger.info("RemovalAgent.execute: Attempting to execute %s sub-request." % operation)
                        diracSE = subRequestAttributes['TargetSE']
                        for subRequestFile in subRequestFiles:
                            if subRequestFile['Status'] == 'Waiting':
                                pfn = str(subRequestFile['PFN'])
                                lfn = str(subRequestFile['LFN'])
                                res = self.replicaManager.onlineRetransfer(diracSE, pfn)
                                if res['OK']:
                                    if pfn in res['Value']['Successful']:
                                        gLogger.info("RemovalAgent.execute: Successfully requested retransfer of %s." % pfn)
                                        result = oRequest.setSubRequestFileAttributeValue(ind, 'removal', lfn, 'Status', 'Done')
                                        if not result['OK']:
                                            gLogger.error("RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn))
                                        modified = True
                                    else:
                                        errStr = "RemovalAgent.execute: Failed to request retransfer."
                                        gLogger.error(errStr, "%s %s %s" % (pfn, diracSE, res['Value']['Failed'][pfn]))
                                else:
                                    errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                                    gLogger.error(errStr, res['Message'])
                            else:
                                gLogger.info("RemovalAgent.execute: File already completed.")

                    ################################################
                    # If the sub-request is none of the above types
                    else:
                        gLogger.error("RemovalAgent.execute: Operation not supported.", operation)

                    ################################################
                    # Determine whether there are any active files
                    if oRequest.isSubRequestEmpty(ind, 'removal')['Value']:
                        oRequest.setSubRequestStatus(ind, 'removal', 'Done')
                        gMonitor.addMark("Done", 1)

                ################################################
                # If the sub-request is already in terminal state
                else:
                    gLogger.info("RemovalAgent.execute:",
                                 "Sub-request %s is status '%s' and not to be executed." % (ind, subRequestAttributes['Status']))

            ################################################
            # Generate the new request string after operation
            newrequestString = oRequest.toXML()['Value']
        except Exception:
            # if something fails return the original request back to the server;
            # log the traceback instead of swallowing it silently (was a bare except)
            gLogger.exception("RemovalAgent.execute: Exception while processing request '%s'" % requestName)
            self.requestDBClient.updateRequest(requestName, requestString, sourceServer)
            return S_OK()

        res = self.requestDBClient.updateRequest(requestName, newrequestString, sourceServer)
        if modified and jobID:
            result = self.finalizeRequest(requestName, jobID, sourceServer)

        return S_OK()

    def __getProxyAndRemoveReplica(self, diracSE, lfn):
        """
        Get a proxy from the owner of the file and try to remove it.

        :param diracSE: name of the SE holding the replica
        :param lfn: LFN whose replica should be removed
        :return: True if the removal succeeds, False otherwise
        """
        result = self.replicaManager.getCatalogDirectoryMetadata(lfn, singleFile=True)
        if not result['OK']:
            gLogger.error("Could not get metadata info", result['Message'])
            return False
        ownerRole = result['Value']['OwnerRole']
        ownerDN = result['Value']['OwnerDN']
        if ownerRole[0] != "/":
            ownerRole = "/%s" % ownerRole
        userProxy = ''
        # try every group carrying the owner's VOMS role until a proxy is obtained
        for ownerGroup in Registry.getGroupsWithVOMSAttribute(ownerRole):
            result = gProxyManager.downloadVOMSProxy(ownerDN, ownerGroup, limited=True,
                                                     requiredVOMSAttribute=ownerRole)
            if not result['OK']:
                gLogger.verbose('Failed to retrieve voms proxy for %s : %s:' % (ownerDN, ownerRole),
                                result['Message'])
                continue
            userProxy = result['Value']
            gLogger.verbose("Got proxy for %s@%s [%s]" % (ownerDN, ownerGroup, ownerRole))
            break
        if not userProxy:
            return False

        result = userProxy.dumpAllToFile()
        if not result['OK']:
            gLogger.verbose(result['Message'])
            return False
        upFile = result['Value']
        # fix: direct indexing raised KeyError when X509_USER_PROXY was not set
        prevProxyEnv = os.environ.get('X509_USER_PROXY')
        os.environ['X509_USER_PROXY'] = upFile
        try:
            res = self.replicaManager.removeReplica(diracSE, lfn)
            if res['OK'] and lfn in res['Value']['Successful']:
                gLogger.verbose('Removed %s from %s' % (lfn, diracSE))
                return True
        finally:
            # always restore the previous environment and drop the temporary proxy file
            if prevProxyEnv is None:
                os.environ.pop('X509_USER_PROXY', None)
            else:
                os.environ['X509_USER_PROXY'] = prevProxyEnv
            os.unlink(upFile)
        return False

    def finalize(self):
        """ Called by the Agent framework to cleanly end execution.
            In this case this module will wait until all pending ThreadedJobs in
            the ThreadPool get executed
        """
        self.threadPool.processAllResults()
        return S_OK()
class FTSMonitorAgent( AgentModule ):
  """
  .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs: poll their summaries, and when a job reaches a
  terminal state update the TransferDB file/channel tables accordingly.
  """
  # # transfer DB handle
  transferDB = None
  # # thread pool
  threadPool = None
  # # min threads
  minThreads = 1
  # # max threads
  maxThreads = 10

  # # missing source regexp patterns
  missingSourceErrors = [
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
    re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
" Command failed. : open error: No such file or directory" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

  def initialize( self ):
    """ agent's initialisation """
    self.transferDB = TransferDB()
    self.am_setOption( "shifterProxy", "DataManager" )
    self.minThreads = self.am_getOption( "MinThreads", self.minThreads )
    self.maxThreads = self.am_getOption( "MaxThreads", self.maxThreads )
    # normalise possibly-negative or swapped configuration values
    minmax = ( abs( self.minThreads ), abs( self.maxThreads ) )
    self.minThreads, self.maxThreads = min( minmax ), max( minmax )
    self.log.info( "ThreadPool min threads = %s" % self.minThreads )
    self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
    self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
    self.threadPool.daemonize()
    return S_OK()

  def execute( self ):
    """ push jobs to the thread pool """
    self.log.info( "Obtaining requests to monitor" )
    res = self.transferDB.getFTSReq()
    if not res["OK"]:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res["Value"]:
      self.log.info( "No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res["Value"]
    self.log.info( "Found %s FTS jobs" % len( ftsReqs ) )
    i = 1
    for ftsJob in ftsReqs:
      # retry until the pool accepts the job (queue may be momentarily full)
      while True:
        self.log.debug( "submitting FTS Job %s FTSReqID=%s to monitor" % ( i, ftsJob["FTSReqID"] ) )
        ret = self.threadPool.generateJobAndQueueIt( self.monitorTransfer, args = ( ftsJob, ), )
        if ret["OK"]:
          i += 1
          break
        # # sleep 1 second to proceed
        time.sleep( 1 )
    self.threadPool.processAllResults()
    return S_OK()

  def ftsJobExpired( self, ftsReqID, channelID ):
    """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )
    fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not fileIDs["OK"]:
      log.error( "Unable to retrieve FileIDs associated to %s request" % ftsReqID )
      return fileIDs
    fileIDs = fileIDs["Value"]

    # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
    for fileID in fileIDs:
      fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Status", "Failed" )
      if not fileStatus["OK"]:
        log.error( "Unable to set FileToFTS status to 'Failed' for FileID %s: %s" % ( fileID, fileStatus["Message"] ) )
      failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Reason", "FTS job expired on server" )
      if not failReason["OK"]:
        log.error( "Unable to set FileToFTS reason for FileID %s: %s" % ( fileID, failReason["Message"] ) )

    # # update Channel table
    resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
    if not resetChannels["OK"]:
      log.error( "Failed to reset Channel table for files to retry" )
      return resetChannels

    # # update FTSReq table
    log.info( "Setting FTS request status to 'Finished'" )
    ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, "Finished" )
    if not ftsReqStatus["OK"]:
      log.error( "Failed update FTS Request status", ftsReqStatus["Message"] )
      return ftsReqStatus

    # # if we land here, everything should be OK
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ monitors transfer obtained from TransferDB

    :param dict ftsReqDict: FTS job dictionary
    """
    ftsReqID = ftsReqDict.get( "FTSReqID" )
    ftsGUID = ftsReqDict.get( "FTSGuid" )
    ftsServer = ftsReqDict.get( "FTSServer" )
    channelID = ftsReqDict.get( "ChannelID" )
    sourceSE = ftsReqDict.get( "SourceSE" )
    targetSE = ftsReqDict.get( "TargetSE" )

    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )

    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )

    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    log.info( "Perform summary update of the FTS Request" )
    infoStr = [ "glite-transfer-status -s %s -l %s" % ( ftsServer, ftsGUID ) ]
    infoStr.append( "FTS GUID:   %s" % ftsGUID )
    infoStr.append( "FTS Server: %s" % ftsServer )
    log.info( "\n".join( infoStr ) )
    res = oFTSRequest.summary()
    self.transferDB.setFTSReqLastMonitor( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to update the FTS request summary", res["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
        log.error( "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side" )
        cleanUp = self.ftsJobExpired( ftsReqID, channelID )
        if not cleanUp["OK"]:
          log.error( cleanUp["Message"] )
        return cleanUp
      return res

    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      log.error( "Failed to get FTS request summary", res["Message"] )
      return res
    log.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      log.error( "Failed to get FTS percentage complete", res["Message"] )
      return res
    log.info( 'FTS Request found to be %.1f percent complete' % res["Value"] )
    self.transferDB.setFTSReqAttribute( ftsReqID, "PercentageComplete", res["Value"] )
    self.transferDB.addLoggingEvent( ftsReqID, res["Value"] )

    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res["OK"]:
      log.error( "Failed to determine whether FTS request terminal", res["Message"] )
      return res
    if not res["Value"]:
      return S_OK()
    # # request is terminal
    return self.terminalRequest( oFTSRequest, ftsReqID, channelID, sourceSE )

  def terminalRequest( self, oFTSRequest, ftsReqID, channelID, sourceSE ):
    """ process terminal FTS job

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    log.info( "FTS Request found to be terminal, updating file states" )
    #########################################################################
    # Get the LFNS associated to the FTS request
    log.info( "Obtaining the LFNs associated to this request" )
    res = self.transferDB.getFTSReqLFNs( ftsReqID, channelID, sourceSE )
    if not res["OK"]:
      log.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res["Value"]
    if not files:
      log.error( "No files present for transfer" )
      return S_ERROR( "No files were found in the DB" )

    lfns = files.keys()
    log.debug( "Obtained %s files" % len( lfns ) )
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )

    res = oFTSRequest.monitor()
    if not res["OK"]:
      log.error( "Failed to perform detailed monitoring of FTS request", res["Message"] )
      return res
    res = oFTSRequest.getFailed()
    if not res["OK"]:
      log.error( "Failed to obtained failed files for FTS request", res["Message"] )
      return res
    failedFiles = res["Value"]
    res = oFTSRequest.getDone()
    if not res["OK"]:
      log.error( "Failed to obtained successful files for FTS request", res["Message"] )
      return res
    completedFiles = res["Value"]

    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs
    res = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to get FileIDs associated to FTS Request", res["Message"] )
      return res
    fileIDs = res["Value"]
    res = self.transferDB.getAttributesForFilesList( fileIDs, ["LFN"] )
    if not res["OK"]:
      log.error( "Failed to get LFNs associated to FTS Request", res["Message"] )
      return res
    fileIDDict = res["Value"]

    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []

    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in completedFiles:
        completedFileIDs.append( fileID )
        transferTime = 0
        res = oFTSRequest.getTransferTime( lfn )
        if res["OK"]:
          transferTime = res["Value"]
        fileToFTSUpdates.append( ( fileID, "Completed", "", 0, transferTime ) )
      if lfn in failedFiles:
        failReason = ""
        res = oFTSRequest.getFailReason( lfn )
        if res["OK"]:
          failReason = res["Value"]
        if "Source file/user checksum mismatch" in failReason:
          filesToFail.append( fileID )
          continue
        if self.missingSource( failReason ):
          log.error( "The source SURL does not exist.", "%s %s" % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
          filesToFail.append( fileID )
        else:
          filesToRetry.append( fileID )
        log.error( "Failed to replicate file on channel.", "%s %s" % ( channelID, failReason ) )
        fileToFTSUpdates.append( ( fileID, "Failed", failReason, 0, 0 ) )

    # # update TransferDB.FileToFTS table
    updateFileToFTS = self.updateFileToFTS( ftsReqID, channelID,
                                            filesToRetry, filesToFail,
                                            completedFileIDs, fileToFTSUpdates )

    if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
      res = oFTSRequest.finalize()
      if not res["OK"]:
        log.error( "Failed to perform the finalization for the FTS request", res["Message"] )
        return res

      log.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.transferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed to add logging event for FTS Request', res['Message'] )

      # update TransferDB.FileToCat table
      updateFileToCat = self.updateFileToCat( oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail )
      if not updateFileToCat["OK"]:
        log.error( updateFileToCat["Message"] )

      log.debug( "Updating FTS request status" )
      res = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed update FTS Request status', res['Message'] )
    return S_OK()

  def updateFileToFTS( self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates ):
    """ update TransferDB.FileToFTS table for finished request

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: files completed
    :param list fileToFTSUpdates: tuples ( fileID, status, reason, retries, transferTime )
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    allUpdated = True

    res = self.transferDB.resetFileChannelStatus( channelID, filesToRetry ) if filesToRetry else S_OK()
    if not res["OK"]:
      log.error( "Failed to update the Channel table for file to retry.", res["Message"] )
      allUpdated = False

    for fileID in filesToFail:
      log.info( "Updating the Channel table for files to reschedule" )
      res = self.transferDB.setFileToReschedule( fileID )
      if not res["OK"]:
        log.error( "Failed to update Channel table for failed files.", res["Message"] )
        allUpdated = False
      elif res["Value"] == "max reschedule attempt reached":
        # fix: format string had no %s placeholder, so the '%' raised TypeError
        log.error( "setting Channel status to 'Failed' : %s" % res["Value"] )
        res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
        if not res["OK"]:
          log.error( "Failed to update Channel table for failed files.", res["Message"] )
          allUpdated = False

    if completedFileIDs:
      res = self.transferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( "Failed to update the Channel table for successful files.", res["Message"] )
        allUpdated = False
      res = self.transferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False

    if fileToFTSUpdates:
      res = self.transferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res["OK"]:
        log.error( "Failed to update the FileToFTS table for files.", res["Message"] )
        allUpdated = False

    return S_OK( allUpdated )

  def updateFileToCat( self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail ):
    """ update TransferDB.FileToCat table for finished request

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileIDs dictionary
    :param list completedFiles: LFNs transferred by FTS
    :param list filesToFail: FileIDs for failed files
    """
    res = oFTSRequest.getFailedRegistrations()
    # fix: guard against an error return before touching res["Value"]
    if not res["OK"]:
      res["Message"] = "Failed to get failed registrations: %s" % res["Message"]
      return res
    failedRegistrations = res["Value"]
    regFailedFileIDs = []
    regDoneFileIDs = []
    regForgetFileIDs = []
    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in failedRegistrations:
        regFailedFileIDs.append( fileID )
        # if the LFN appears more than once, FileToCat needs to be reset only once
        del failedRegistrations[lfn]
      elif lfn in completedFiles:
        regDoneFileIDs.append( fileID )
      elif fileID in filesToFail:
        regForgetFileIDs.append( fileID )

    res = self.transferDB.setRegistrationWaiting( channelID, regFailedFileIDs ) if regFailedFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
      return res

    res = self.transferDB.setRegistrationDone( channelID, regDoneFileIDs ) if regDoneFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    # This entries could also be set to Failed, but currently there is no method to do so.
    res = self.transferDB.setRegistrationDone( channelID, regForgetFileIDs ) if regForgetFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    return S_OK()

  @classmethod
  def missingSource( cls, failReason ):
    """ check if message sent by FTS server is concerning a missing source file

    :param str failReason: message sent by FTS server
    """
    # truthiness contract unchanged (callers only use this in boolean context)
    return any( error.search( failReason ) for error in cls.missingSourceErrors )
class RemovalAgent( AgentModule, RequestAgentMixIn ):
  """ This Agent takes care of executing "removal" requests from the
      RequestManagement system.

      Requests are pulled from the RequestDB and processed in a ThreadPool;
      supported sub-request operations are: physicalRemoval, removeFile,
      replicaRemoval and reTransfer.
  """

  def __init__( self, *args ):
    """ Initialize the base class and define some extra data members """
    AgentModule.__init__( self, *args )
    self.requestDBClient = None
    self.replicaManager = None
    self.maxNumberOfThreads = 4
    self.maxRequestsInQueue = 100
    self.threadPool = None
    # Counter of gfal timeouts seen during replica removal (reported in execute())
    self.timeOutCounter = 0
    # Flag reset by executeRequest() when the RequestDB runs dry
    self.pendingRequests = True

  def initialize( self ):
    """ Called by the framework upon startup, before any cycle (execute method below)

    Creates the clients, registers the monitoring activities and sets up the
    daemonized ThreadPool and the shifter proxy.
    """
    self.requestDBClient = RequestClient()
    # the RequestAgentMixIn needs the capitalized version, until it is fixed keep this.
    self.RequestDBClient = self.requestDBClient
    self.replicaManager = ReplicaManager()

    gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM )
    gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileDone", "File removal done", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads )
    self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue )
    self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue )
    # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
    self.threadPool.daemonize()

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ Fill the ThreadPool with ThreadedJobs until no more requests are pending
        or the pool refuses the job (queue full).
    """
    self.pendingRequests = True
    while self.pendingRequests:
      requestExecutor = ThreadedJob( self.executeRequest )
      ret = self.threadPool.queueJob( requestExecutor )
      if not ret['OK']:
        # Queue is full: stop submitting for this cycle
        break
      time.sleep( 0.1 )
    if self.timeOutCounter:
      gLogger.error( 'Timeouts during removal execution:', self.timeOutCounter )
    return S_OK()

  def executeRequest( self ):
    """ Do the actual work in the Thread: fetch one 'removal' request, execute
        every waiting sub-request whose execution order is due, and send the
        updated request back to the server.

        Always returns S_OK() so the ThreadPool does not treat DB hiccups as
        job failures; errors are logged instead.
    """
    ################################################
    # Get a request from request DB
    gMonitor.addMark( "Iteration", 1 )
    res = self.requestDBClient.getRequest( 'removal' )
    if not res['OK']:
      gLogger.info( "RemovalAgent.execute: Failed to get request from database." )
      return S_OK()
    elif not res['Value']:
      gLogger.info( "RemovalAgent.execute: No requests to be executed found." )
      # Tell execute() to stop queueing jobs for this cycle
      self.pendingRequests = False
      return S_OK()
    requestString = res['Value']['RequestString']
    requestName = res['Value']['RequestName']
    sourceServer = res['Value']['Server']
    try:
      jobID = int( res['Value']['JobID'] )
    except ( ValueError, TypeError, KeyError ):
      # Fixed: previously only ValueError was caught; a missing or None JobID
      # would crash the thread. Treat any unparsable JobID as "no job".
      jobID = 0
    gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName )

    try:
      result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer )
      if result['OK']:
        currentOrder = result['Value']
      else:
        gLogger.error( 'Can not get the request execution order' )
        # Return the request untouched so it can be picked up again
        self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
        return S_OK( 'Can not get the request execution order' )

      oRequest = RequestContainer( request = requestString )

      ################################################
      # Find the number of sub-requests from the request
      res = oRequest.getNumSubRequests( 'removal' )
      if not res['OK']:
        errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
        gLogger.error( errStr, res['Message'] )
        return S_OK()
      gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] )

      ################################################
      # For all the sub-requests in the request
      modified = False
      for ind in range( res['Value'] ):
        gMonitor.addMark( "Execute", 1 )
        gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind )
        subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value']
        subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] )
        subStatus = subRequestAttributes['Status']
        # Only execute waiting sub-requests whose order is due
        if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
          subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value']
          operation = subRequestAttributes['Operation']

          ################################################
          # If the sub-request is a physical removal operation
          if operation == 'physicalRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            physicalFiles = []
            pfnToLfn = {}
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                pfnToLfn[pfn] = lfn
                physicalFiles.append( pfn )
            gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) )
            # failed: pfn -> { diracSE: reason }; errMsg: diracSE -> global error
            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeStorageFile( physicalFiles, diracSE )
              if res['OK']:
                for pfn in res['Value']['Failed'].keys():
                  if pfn not in failed:
                    failed[pfn] = {}
                  failed[pfn][diracSE] = res['Value']['Failed'][pfn]
              else:
                # The whole call failed for this SE: flag every pfn
                errMsg[diracSE] = res['Message']
                for pfn in physicalFiles:
                  if pfn not in failed:
                    failed[pfn] = {}
                  failed[pfn][diracSE] = 'Completely'
            # Now analyse the results
            failedPFNs = failed.keys()
            pfnsOK = [pfn for pfn in physicalFiles if not pfn in failedPFNs]
            gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) )
            for pfn in pfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
              modified = True
            if failed:
              gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) )
              for pfn in failedPFNs:
                for diracSE in failed[pfn].keys():
                  if type( failed[pfn][diracSE] ) in StringTypes:
                    # A file that is already gone counts as successfully removed
                    if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", pfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                # Fixed: the original built a (string, diracSE) tuple here
                # instead of interpolating into the message.
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
                gLogger.error( errStr, errMsg[diracSE] )

          ################################################
          # If the sub-request is a file removal operation
          elif operation == 'removeFile':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'RemoveFileAtt', len( lfns ) )
            res = self.replicaManager.removeFile( lfns )
            if res['OK']:
              gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) )
              for lfn in res['Value']['Successful'].keys():
                gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn )
                result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                if not result['OK']:
                  gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                modified = True
              gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) )
              for lfn in res['Value']['Failed'].keys():
                if type( res['Value']['Failed'][lfn] ) in StringTypes:
                  # A file that is already gone counts as successfully removed
                  if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file:", "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) )
            else:
              gMonitor.addMark( 'RemoveFileFail', len( lfns ) )
              # Fixed log message typo ("files files")
              errStr = "RemovalAgent.execute: Completely failed to remove files."
              gLogger.error( errStr, res['Message'] )

          ################################################
          # If the sub-request is a replica removal operation
          elif operation == 'replicaRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) )
            # failed: lfn -> { diracSE: reason }; errMsg: diracSE -> global error
            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeReplica( diracSE, lfns )
              if res['OK']:
                for lfn in res['Value']['Failed'].keys():
                  errorMessage = str( res['Value']['Failed'][lfn] )
                  if errorMessage.find( 'Write access not permitted for this credential.' ) != -1:
                    # Retry the removal with the file owner's proxy
                    if self.__getProxyAndRemoveReplica( diracSE, lfn ):
                      continue
                  if errorMessage.find( 'seconds timeout for "__gfal_wrapper" call' ) != -1:
                    self.timeOutCounter += 1
                  if lfn not in failed:
                    failed[lfn] = {}
                  failed[lfn][diracSE] = res['Value']['Failed'][lfn]
              else:
                # The whole call failed for this SE: flag every lfn
                errMsg[diracSE] = res['Message']
                for lfn in lfns:
                  if lfn not in failed:
                    failed[lfn] = {}
                  failed[lfn][diracSE] = 'Completely'
            # Now analyse the results
            failedLFNs = failed.keys()
            lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs]
            gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) )
            for lfn in lfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
              modified = True
            if failed:
              # Fixed: this branch previously marked 'PhysicalRemovalFail'
              # (copy-paste from the physicalRemoval branch); the registered
              # activity for this operation is 'ReplicaRemovalFail'.
              gMonitor.addMark( 'ReplicaRemovalFail', len( failedLFNs ) )
              for lfn in failedLFNs:
                for diracSE in failed[lfn].keys():
                  if type( failed[lfn][diracSE] ) in StringTypes:
                    # A replica that is already gone counts as successfully removed
                    if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                # Fixed: the original built a (string, diracSE) tuple here
                # instead of interpolating into the message.
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
                gLogger.error( errStr, errMsg[diracSE] )

          ################################################
          # If the sub-request is a request to the online system to retransfer
          elif operation == 'reTransfer':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSE = subRequestAttributes['TargetSE']
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                res = self.replicaManager.onlineRetransfer( diracSE, pfn )
                if res['OK']:
                  if pfn in res['Value']['Successful']:
                    gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    errStr = "RemovalAgent.execute: Failed to request retransfer."
                    gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) )
                else:
                  errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                  gLogger.error( errStr, res['Message'] )
              else:
                gLogger.info( "RemovalAgent.execute: File already completed." )

          ################################################
          # If the sub-request is none of the above types
          else:
            gLogger.error( "RemovalAgent.execute: Operation not supported.", operation )

          ################################################
          # Determine whether there are any active files
          if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']:
            oRequest.setSubRequestStatus( ind, 'removal', 'Done' )
            gMonitor.addMark( "Done", 1 )

        ################################################
        # If the sub-request is already in terminal state
        else:
          gLogger.info( "RemovalAgent.execute:",
                        "Sub-request %s is status '%s' and not to be executed." % ( ind, subRequestAttributes['Status'] ) )

      ################################################
      # Generate the new request string after operation
      newrequestString = oRequest.toXML()['Value']
    except Exception:
      # Fixed: was a silent bare "except:" that swallowed every traceback
      # (including SystemExit/KeyboardInterrupt). Log the exception, then
      # return the ORIGINAL request string to the server so it is retried.
      gLogger.exception( "RemovalAgent.execute: Exception while processing request %s" % requestName )
      res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
      return S_OK()

    res = self.requestDBClient.updateRequest( requestName, newrequestString, sourceServer )
    if modified and jobID:
      result = self.finalizeRequest( requestName, jobID, sourceServer )
    return S_OK()

  def __getProxyAndRemoveReplica( self, diracSE, lfn ):
    """ Get a proxy from the owner of the file and try to remove the replica
        with it.

    :param str diracSE: DIRAC SE name to remove the replica from
    :param str lfn: LFN of the replica
    :return: True if the removal succeeds, False otherwise
    """
    result = self.replicaManager.getCatalogDirectoryMetadata( lfn, singleFile = True )
    if not result[ 'OK' ]:
      gLogger.error( "Could not get metadata info", result[ 'Message' ] )
      return False
    ownerRole = result[ 'Value' ][ 'OwnerRole' ]
    ownerDN = result[ 'Value' ][ 'OwnerDN' ]
    if ownerRole[0] != "/":
      ownerRole = "/%s" % ownerRole
    userProxy = ''
    # Try every group carrying the owner's VOMS role until a proxy is obtained
    for ownerGroup in Registry.getGroupsWithVOMSAttribute( ownerRole ):
      result = gProxyManager.downloadVOMSProxy( ownerDN, ownerGroup, limited = True,
                                                requiredVOMSAttribute = ownerRole )
      if not result[ 'OK' ]:
        gLogger.verbose( 'Failed to retrieve voms proxy for %s : %s:' % ( ownerDN, ownerRole ),
                         result[ 'Message' ] )
        continue
      userProxy = result[ 'Value' ]
      gLogger.verbose( "Got proxy for %s@%s [%s]" % ( ownerDN, ownerGroup, ownerRole ) )
      break
    if not userProxy:
      return False

    result = userProxy.dumpAllToFile()
    if not result[ 'OK' ]:
      gLogger.verbose( result[ 'Message' ] )
      return False
    upFile = result[ 'Value' ]
    # Fixed: use .get() -- a direct lookup raised KeyError when the agent ran
    # without X509_USER_PROXY in its environment; restore (or remove) it after.
    prevProxyEnv = os.environ.get( 'X509_USER_PROXY' )
    os.environ[ 'X509_USER_PROXY' ] = upFile
    try:
      res = self.replicaManager.removeReplica( diracSE, lfn )
      if res['OK'] and lfn in res[ 'Value' ]['Successful']:
        gLogger.verbose( 'Removed %s from %s' % ( lfn, diracSE ) )
        return True
    finally:
      if prevProxyEnv is None:
        os.environ.pop( 'X509_USER_PROXY', None )
      else:
        os.environ[ 'X509_USER_PROXY' ] = prevProxyEnv
      os.unlink( upFile )
    return False

  def finalize( self ):
    """ Called by the Agent framework to cleanly end execution.
        In this case this module will wait until all pending ThreadedJobs in
        the ThreadPool get executed.
    """
    self.threadPool.processAllResults()
    return S_OK()
class OutputDataExecutor:
  """ Transfers files found under configured input paths to their configured
      output SE/FileCatalog destinations, using a daemonized ThreadPool.

      Transfer paths are read from the CS under <csPath> (default
      /Operations/<VO>/OutputData); each path section must define
      InputPath, InputFC, OutputPath, OutputFC and OutputSE.
  """

  def __init__( self, csPath = "" ):
    self.log = gLogger.getSubLogger( "OutputDataExecutor" )
    if not csPath:
      vo = gConfig.getValue( "/DIRAC/VirtualOrganization", "" )
      self.__transfersCSPath = '/Operations/%s/OutputData' % vo
    else:
      self.__transfersCSPath = csPath
    self.log.verbose( "Reading transfer paths from %s" % self.__transfersCSPath )
    self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']

    self.__threadPool = ThreadPool( gConfig.getValue( "%s/MinTransfers" % self.__transfersCSPath, 1 ),
                                    gConfig.getValue( "%s/MaxTransfers" % self.__transfersCSPath, 4 ),
                                    gConfig.getValue( "%s/MaxQueuedTransfers" % self.__transfersCSPath, 100 ) )
    self.__threadPool.daemonize()
    # Basenames currently queued or being transferred (guarded by @transferSync)
    self.__processingFiles = set()
    self.__okTransferredFiles = 0
    self.__okTransferredBytes = 0
    # fileName -> consecutive failure count
    self.__failedFiles = {}

  def getNumOKTransferredFiles( self ):
    return self.__okTransferredFiles

  def getNumOKTransferredBytes( self ):
    return self.__okTransferredBytes

  def transfersPending( self ):
    return self.__threadPool.isWorking()

  def getDefinedTransferPaths( self ):
    """ Read the transfer path definitions from the CS.

    :return: S_OK( dict ) mapping path name -> options dict; sections missing
             any required option are skipped.
    """
    result = gConfig.getSections( self.__transfersCSPath )
    if not result['OK']:
      self.log.info( 'No Input/Output Pair defined in CS' )
      # Fixed: was "return S_OK()" (Value = None), which crashed both callers
      # when they iterated over the returned value.
      return S_OK( {} )
    pathList = result['Value']
    tPaths = {}
    for name in pathList:
      csPath = self.__transfersCSPath + '/%s' % name
      result = gConfig.getOptionsDict( csPath )
      if not result['OK']:
        continue
      transferDict = result['Value']
      ok = True
      for i in self.__requiredCSOptions:
        if i not in transferDict:
          self.log.error( 'Missing Option %s in %s' % ( i, csPath ) )
          ok = False
          break
      if not ok:
        continue
      tPaths[ name ] = transferDict
    return S_OK( tPaths )

  def getNumLocalOutgoingFiles( self ):
    """ Count the files waiting on LocalDisk input paths. """
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return 0
    localOutgoing = 0
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferDict = tPaths[ name ]
      if 'LocalDisk' != transferDict['InputFC']:
        continue
      localOutgoing += len( self.getOutgoingFiles( transferDict ) )
    return localOutgoing

  def getOutgoingFiles( self, transferDict ):
    """ Get list of files to be processed from InputPath

    :param dict transferDict: one transfer path definition (see class doc)
    :return: list of file names (LocalDisk) or catalog file names
    """
    inputFCName = transferDict['InputFC']
    inputPath = transferDict['InputPath']
    if inputFCName == 'LocalDisk':
      files = []
      try:
        for fileName in os.listdir( inputPath ):
          if os.path.isfile( os.path.join( inputPath, fileName ) ):
            files.append( fileName )
      except OSError:
        # Fixed: was a bare "except:"; an unreadable/missing input directory
        # is deliberately treated as "no files", but only OS errors qualify.
        pass
      return files
    inputFC = FileCatalog( [inputFCName] )
    result = inputFC.listDirectory( inputPath, True )
    if not result['OK']:
      self.log.error( result['Message'] )
      return []
    if not inputPath in result['Value']['Successful']:
      self.log.error( result['Value']['Failed'][inputPath] )
      return []
    subDirs = result['Value']['Successful'][inputPath]['SubDirs']
    files = result['Value']['Successful'][inputPath]['Files']
    for subDir in subDirs:
      self.log.info( 'Ignoring subdirectory:', subDir )
    return files.keys()

  def checkForTransfers( self ):
    """ Check for transfers to do and start them
    """
    result = self.getDefinedTransferPaths()
    if not result[ 'OK' ]:
      return result
    tPaths = result[ 'Value' ]
    for name in tPaths:
      transferPath = tPaths[ name ]
      self.log.verbose( "Checking %s transfer path" % name )
      filesToTransfer = self.getOutgoingFiles( tPaths[ name ] )
      self.log.info( "Transfer path %s has %d files" % ( name, len( filesToTransfer ) ) )
      ret = self.__addFilesToThreadPool( filesToTransfer, transferPath )
      if not ret['OK']:
        # The thread pool got full
        break
    # NOTE(review): implicitly returns None on the normal path, while the
    # error path returns an S_ERROR dict -- kept as-is for compatibility.

  def processAllPendingTransfers( self ):
    self.__threadPool.processAllResults()

  @transferSync
  def __addFilesToThreadPool( self, files, transferDict ):
    """ Queue one transfer job per file, skipping files already in flight. """
    for fileName in files:
      fileName = os.path.basename( fileName )
      if fileName in self.__processingFiles:
        continue
      self.__processingFiles.add( fileName )
      time.sleep( 1 )
      ret = self.__threadPool.generateJobAndQueueIt( self.__transferIfNotRegistered,
                                                     args = ( fileName, transferDict ),
                                                     oCallback = self.transferCallback,
                                                     blocking = False )
      if not ret['OK']:
        # The thread pool got full
        return ret
    return S_OK()

  def __transferIfNotRegistered( self, file, transferDict ):
    """ Transfer one file, unless it is already registered in the output
        catalog -- in that case delete it from the input side instead.
    """
    result = self.isRegisteredInOutputCatalog( file, transferDict )
    if not result[ 'OK' ]:
      self.log.error( result[ 'Message' ] )
      return result
    # Already registered. Need to delete
    if result[ 'Value' ]:
      self.log.info( "Transfer file %s is already registered in the output catalog" % file )
      # Delete
      filePath = os.path.join( transferDict[ 'InputPath' ], file )
      if transferDict[ 'InputFC' ] == 'LocalDisk':
        os.unlink( filePath )
      else:
        # Fixed: this branch referenced an undefined name "inFile" (flagged by
        # the original FIXME) and raised NameError; the key used with the
        # catalog/SE calls is the full input path, as in __retrieveAndUploadFile.
        inputFC = FileCatalog( [ transferDict['InputFC'] ] )
        replicaDict = inputFC.getReplicas( filePath )
        if not replicaDict['OK']:
          self.log.error( "Error deleting file", replicaDict['Message'] )
        elif not filePath in replicaDict['Value']['Successful']:
          self.log.error( "Error deleting file", replicaDict['Value']['Failed'][filePath] )
        else:
          seList = replicaDict['Value']['Successful'][filePath].keys()
          for se in seList:
            se = StorageElement( se )
            self.log.info( 'Removing from %s:' % se.name, filePath )
            se.removeFile( filePath )
          inputFC.removeFile( filePath )
      self.log.info( "File %s deleted from %s" % ( file, transferDict[ 'InputFC' ] ) )
      self.__processingFiles.discard( file )
      return S_OK( file )
    # Do the transfer
    return self.__retrieveAndUploadFile( file, transferDict )

  def isRegisteredInOutputCatalog( self, file, transferDict ):
    """ Check whether the file is already registered on one of the OutputSEs
        in the output catalog.

    :return: S_OK( bool )
    """
    fc = FileCatalog( [ transferDict[ 'OutputFC' ] ] )
    lfn = os.path.join( transferDict['OutputPath'], os.path.basename( file ) )
    result = fc.getReplicas( lfn )
    if not result[ 'OK' ]:
      return result
    if lfn not in result[ 'Value' ][ 'Successful' ]:
      return S_OK( False )
    replicas = result[ 'Value' ][ 'Successful' ][ lfn ]
    for seName in List.fromChar( transferDict[ 'OutputSE' ], "," ):
      if seName in replicas:
        self.log.verbose( "Transfer file %s is already registered in %s SE" % ( file, seName ) )
        return S_OK( True )
    return S_OK( False )

  def __retrieveAndUploadFile( self, file, outputDict ):
    """ Retrieve, Upload, and remove

    :return: S_OK( fileName ) on success, S_ERROR( fileName ) on any failure
             (the file name travels in the result for transferCallback).
    """
    fileName = file
    inputPath = outputDict['InputPath']
    inputFCName = outputDict['InputFC']
    inBytes = 0
    if inputFCName == 'LocalDisk':
      inFile = file
      file = os.path.join( inputPath, file )
    else:
      inputFC = FileCatalog( [inputFCName] )
      inFile = os.path.join( inputPath, file )
      replicaDict = inputFC.getReplicas( inFile )
      if not replicaDict['OK']:
        self.log.error( replicaDict['Message'] )
        return S_ERROR( fileName )
      if not inFile in replicaDict['Value']['Successful']:
        self.log.error( replicaDict['Value']['Failed'][inFile] )
        return S_ERROR( fileName )
      seList = replicaDict['Value']['Successful'][inFile].keys()
      inputSE = StorageElement( seList[0] )
      self.log.info( 'Retrieving from %s:' % inputSE.name, inFile )
      # ret = inputSE.getFile( inFile )
      # lcg_util binding prevent multithreading, use subprocess instead
      res = pythonCall( 2 * 3600, inputSE.getFile, inFile )
      if not res['OK']:
        self.log.error( res['Message'] )
        return S_ERROR( fileName )
      ret = res['Value']
      if not ret['OK']:
        self.log.error( ret['Message'] )
        return S_ERROR( fileName )
      if not inFile in ret['Value']['Successful']:
        self.log.error( ret['Value']['Failed'][inFile] )
        return S_ERROR( fileName )

    if os.path.isfile( file ):
      inBytes = os.stat( file )[6]

    outputPath = outputDict['OutputPath']
    outputFCName = outputDict['OutputFC']
    replicaManager = ReplicaManager()
    outFile = os.path.join( outputPath, os.path.basename( file ) )
    transferOK = False
    for outputSEName in List.fromChar( outputDict['OutputSE'], "," ):
      outputSE = StorageElement( outputSEName )
      self.log.info( 'Trying to upload to %s:' % outputSE.name, outFile )
      # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
      # lcg_util binding prevent multithreading, use subprocess instead
      result = pythonCall( 2 * 3600, replicaManager.putAndRegister, outFile,
                           os.path.realpath( file ), outputSE.name, catalog = outputFCName )
      if result['OK'] and result['Value']['OK']:
        if outFile in result['Value']['Value']['Successful']:
          transferOK = True
          break
        else:
          self.log.error( result['Value']['Value']['Failed'][outFile] )
      else:
        if result['OK']:
          self.log.error( result['Value']['Message'] )
        else:
          self.log.error( result['Message'] )

    if not transferOK:
      return S_ERROR( fileName )

    if result['OK'] or not inputFCName == 'LocalDisk':
      os.unlink( file )
    if not result['OK']:
      # Fixed: previously logged ret['Message'], but "ret" is never assigned
      # on the LocalDisk path (NameError); "result" holds the upload outcome.
      self.log.error( result['Message'] )
      return S_ERROR( fileName )
    self.log.info( "Finished transferring %s [%s bytes]" % ( inFile, inBytes ) )
    self.__okTransferredFiles += 1
    self.__okTransferredBytes += inBytes
    if inputFCName == 'LocalDisk':
      return S_OK( fileName )
    # Now the file is on final SE/FC, remove from input SE/FC
    for se in seList:
      se = StorageElement( se )
      self.log.info( 'Removing from %s:' % se.name, inFile )
      se.removeFile( inFile )
    inputFC.removeFile( inFile )
    return S_OK( fileName )

  @transferSync
  def transferCallback( self, threadedJob, submitResult ):
    """ ThreadPool callback: track per-file failure counts and release the
        file from the in-flight set.
    """
    if not submitResult['OK']:
      fileName = submitResult['Message']
      if fileName not in self.__failedFiles:
        self.__failedFiles[fileName] = 0
      self.__failedFiles[fileName] += 1
    else:
      fileName = submitResult['Value']
      if fileName in self.__failedFiles:
        del self.__failedFiles[fileName]
    # Take out from processing files
    if fileName in self.__processingFiles:
      self.__processingFiles.discard( fileName )
class Publisher: """ Class Publisher is in charge of getting dispersed information, to be published on the web. """ ############################################################################# def __init__(self, VOExtension, rsDBIn = None, commandCallerIn = None, infoGetterIn = None, WMSAdminIn = None): """ Standard constructor :params: :attr:`VOExtension`: string, VO Extension (e.g. 'LHCb') :attr:`rsDBIn`: optional ResourceStatusDB object (see :class: `DIRAC.ResourceStatusSystem.DB.ResourceStatusDB.ResourceStatusDB`) :attr:`commandCallerIn`: optional CommandCaller object (see :class: `DIRAC.ResourceStatusSystem.Command.CommandCaller.CommandCaller`) :attr:`infoGetterIn`: optional InfoGetter object (see :class: `DIRAC.ResourceStatusSystem.Utilities.InfoGetter.InfoGetter`) :attr:`WMSAdminIn`: optional RPCClient object for WMSAdmin (see :class: `DIRAC.Core.DISET.RPCClient.RPCClient`) """ self.configModule = Utils.voimport("DIRAC.ResourceStatusSystem.Policy.Configurations", VOExtension) if rsDBIn is not None: self.rsDB = rsDBIn else: from DIRAC.ResourceStatusSystem.DB.ResourceStatusDB import ResourceStatusDB self.rsDB = ResourceStatusDB() from DIRAC.ResourceStatusSystem.DB.ResourceManagementDB import ResourceManagementDB self.rmDB = ResourceManagementDB() if commandCallerIn is not None: self.cc = commandCallerIn else: from DIRAC.ResourceStatusSystem.Command.CommandCaller import CommandCaller self.cc = CommandCaller() if infoGetterIn is not None: self.ig = infoGetterIn else: from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import InfoGetter self.ig = InfoGetter(VOExtension) if WMSAdminIn is not None: self.WMSAdmin = WMSAdminIn else: from DIRAC.Core.DISET.RPCClient import RPCClient self.WMSAdmin = RPCClient("WorkloadManagement/WMSAdministrator") self.threadPool = ThreadPool( 2, 5 ) self.lockObj = threading.RLock() self.infoForPanel_res = {} ############################################################################# def getInfo(self, granularity, name, useNewRes = 
False): """ Standard method to get all the info to be published This method uses a ThreadPool (:class:`DIRAC.Core.Utilities.ThreadPool.ThreadPool`) with 2-5 threads. The threaded method is :meth:`DIRAC.ResourceStatusSystem.Utilities.Publisher.Publisher.getInfoForPanel` :params: :attr:`granularity`: string - a ValidRes :attr:`name`: string - name of the Validres :attr:`useNewRes`: boolean. When set to true, will get new results, otherwise it will get cached results (where available). """ if granularity not in ValidRes: raise InvalidRes, Utils.where(self, self.getInfo) self.infoForPanel_res = {} status = None formerStatus = None siteType = None serviceType = None resourceType = None if granularity in ('Resource', 'Resources'): try: resourceType = self.rsDB.getMonitoredsList('Resource', ['ResourceType'], resourceName = name)[0][0] except IndexError: return "%s does not exist!" %name if granularity in ('StorageElement', 'StorageElements'): try: siteType = self.rsDB.getMonitoredsList('StorageElement', ['SiteType'], storageElementName = name)[0][0] except IndexError: return "%s does not exist!" 
%name paramNames = ['Type', 'Group', 'Name', 'Policy', 'DIRAC Status', 'RSS Status', 'Reason', 'Description'] infoToGet = self.ig.getInfoToApply(('view_info', ), granularity, status = status, formerStatus = formerStatus, siteType = siteType, serviceType = serviceType, resourceType = resourceType, useNewRes = useNewRes)[0]['Panels'] infoToGet_res = {} recordsList = [] infosForPolicy = {} for panel in infoToGet.keys(): (granularityForPanel, nameForPanel) = self.__getNameForPanel(granularity, name, panel) if not self._resExist(granularityForPanel, nameForPanel): # completeInfoForPanel_res = None continue #take composite RSS result for name nameStatus_res = self._getStatus(nameForPanel, panel) recordBase = [None, None, None, None, None, None, None, None] recordBase[1] = panel.replace('_Panel', '') recordBase[2] = nameForPanel #nameForPanel try: recordBase[4] = nameStatus_res[nameForPanel]['DIRACStatus'] #DIRAC Status except: pass recordBase[5] = nameStatus_res[nameForPanel]['RSSStatus'] #RSS Status record = copy.deepcopy(recordBase) record[0] = 'ResultsForResource' recordsList.append(record) #take info that goes into the panel infoForPanel = infoToGet[panel] for info in infoForPanel: self.threadPool.generateJobAndQueueIt(self.getInfoForPanel, args = (info, granularityForPanel, nameForPanel) ) self.threadPool.processAllResults() for policy in [x.keys()[0] for x in infoForPanel]: record = copy.deepcopy(recordBase) record[0] = 'SpecificInformation' record[3] = policy #policyName record[4] = None #DIRAC Status record[5] = self.infoForPanel_res[policy]['Status'] #RSS status for the policy record[6] = self.infoForPanel_res[policy]['Reason'] #Reason record[7] = self.infoForPanel_res[policy]['desc'] #Description recordsList.append(record) infosForPolicy[policy] = self.infoForPanel_res[policy]['infos'] infoToGet_res['TotalRecords'] = len(recordsList) infoToGet_res['ParameterNames'] = paramNames infoToGet_res['Records'] = recordsList infoToGet_res['Extras'] = infosForPolicy 
        # Tail of the panel-assembly method started above: hand back the
        # collected records/parameter-names/extras structure to the caller.
        return infoToGet_res

    #############################################################################

    def getInfoForPanel(self, info, granularityForPanel, nameForPanel):
        """Collect the stored policy result, its description and any extra
        info for one panel entry, and publish them into the shared dict
        ``self.infoForPanel_res`` under the policy name.

        NOTE(review): this method is queued on ``self.threadPool`` by the
        caller, hence every write to ``self.infoForPanel_res`` is guarded by
        ``self.lockObj``.

        :param info: one-element dict mapping policy name -> info spec
        :param granularityForPanel: granularity resolved for this panel
        :param nameForPanel: resource name resolved for this panel
        """
        #get single RSS policy results (Python 2: dict.keys() is a list)
        policyResToGet = info.keys()[0]
        pol_res = self.rmDB.getPolicyRes(nameForPanel, policyResToGet)
        if pol_res != []:
            pol_res_dict = {'Status' : pol_res[0], 'Reason' : pol_res[1]}
        else:
            # no stored result for this policy: report it as Unknown
            pol_res_dict = {'Status' : 'Unknown', 'Reason' : 'Unknown'}
        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet] = pol_res_dict
        finally:
            self.lockObj.release()
        #get policy description
        desc = self._getPolicyDesc(policyResToGet)
        #get other info; the spec value may be a single item or a list of
        #{format: what} one-element dicts
        othersInfo = info.values()[0]
        if not isinstance(othersInfo, list):
            othersInfo = [othersInfo]
        info_res = {}
        for oi in othersInfo:
            format_ = oi.keys()[0]
            what = oi.values()[0]
            info_bit_got = self._getInfo(granularityForPanel, nameForPanel, format_, what)
            info_res[format_] = info_bit_got
        # second locked section: attach extras and description to the entry
        # created above
        self.lockObj.acquire()
        try:
            self.infoForPanel_res[policyResToGet]['infos'] = info_res
            self.infoForPanel_res[policyResToGet]['desc'] = desc
        finally:
            self.lockObj.release()

    #############################################################################

    def _getStatus(self, name, panel):
        """Return ``{name: {'RSSStatus': ..., ['DIRACStatus': ...]}}``.

        The RSS status always comes from the RSS DB; a DIRAC status is added
        only for the Site and SE panels (site mask logging, SE read/write
        access respectively).
        """
        #get RSS status (first row, second column of the RSS DB answer)
        RSSStatus = self._getInfoFromRSSDB(name, panel)[0][1]
        #get DIRAC status
        if panel in ('Site_Panel', 'SE_Panel'):
            if panel == 'Site_Panel':
                DIRACStatus = self.WMSAdmin.getSiteMaskLogging(name)
                if DIRACStatus['OK']:
                    # last logged entry for this site, first field
                    DIRACStatus = DIRACStatus['Value'][name].pop()[0]
                else:
                    # Python 2 raise syntax kept as-is
                    raise RSSException, Utils.where(self, self._getStatus)
            elif panel == 'SE_Panel':
                ra = getStorageElementStatus(name, 'ReadAccess')['Value']
                wa = getStorageElementStatus(name, 'WriteAccess')['Value']
                DIRACStatus = {'ReadAccess': ra, 'WriteAccess': wa}
            status = { name : { 'RSSStatus': RSSStatus, 'DIRACStatus': DIRACStatus } }
        else:
            status = { name : { 'RSSStatus': RSSStatus} }
        return status

    #############################################################################

    def _getInfo(self, granularity, name, format_, what):
        """Fetch one bit of panel info, either from the RSS DB
        (``format_ == 'RSS'``) or by invoking a client command.

        :param what: for commands, either the command name or a dict with
                     'CommandIn' and 'args'
        """
        if format_ == 'RSS':
            info_bit_got = self._getInfoFromRSSDB(name, what)
        else:
            if isinstance(what, dict):
                command = what['CommandIn']
                extraArgs = what['args']
            else:
                command = what
                extraArgs = None
            info_bit_got = self.cc.commandInvocation(granularity, name, None, None, command, extraArgs)
            # best effort: unwrap the 'Result' key if the command returned one
            try:
                info_bit_got = info_bit_got['Result']
            except:
                pass
        return info_bit_got

    #############################################################################

    def _getInfoFromRSSDB(self, name, what):
        """Query the RSS DB monitored list for the panel/selector ``what``.

        Builds the granularity, parameter list and the single selection
        argument (site / service / resource / SE / grid-site name) from the
        ``what`` keyword, then delegates to ``self.rsDB.getMonitoredsList``.
        """
        paramsL = ['Status']
        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None
        serviceType = None
        gridSiteName = None
        if what == 'ServiceOfSite':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            paramsL.append('Reason')
            siteName = name
        elif what == 'ResOfCompService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            # name is of the form '<serviceType>@<DIRAC site name>'
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorService':
            gran = 'Resources'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            serviceType = name.split('@')[0]
            gridSiteName = getGOCSiteName(name.split('@')[1])
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'ResOfStorEl':
            gran = 'StorageElements'
            paramsL.insert(0, 'ResourceName')
            paramsL.append('Reason')
            storageElementName = name
        elif what == 'StorageElementsOfSite':
            gran = 'StorageElements'
            paramsL.insert(0, 'StorageElementName')
            paramsL.append('Reason')
            if '@' in name:
                DIRACsiteName = name.split('@').pop()
            else:
                DIRACsiteName = name
            gridSiteName = getGOCSiteName(DIRACsiteName)
            if not gridSiteName['OK']:
                raise RSSException, gridSiteName['Message']
            gridSiteName = gridSiteName['Value']
        elif what == 'Site_Panel':
            gran = 'Site'
            paramsL.insert(0, 'SiteName')
            siteName = name
        elif what == 'Service_Computing_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_Storage_Panel':
            gran = 'Service'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VO-BOX_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Service_VOMS_Panel':
            gran = 'Services'
            paramsL.insert(0, 'ServiceName')
            serviceName = name
        elif what == 'Resource_Panel':
            gran = 'Resource'
            paramsL.insert(0, 'ResourceName')
            resourceName = name
        elif what == 'SE_Panel':
            gran = 'StorageElement'
            paramsL.insert(0, 'StorageElementName')
            storageElementName = name
        info_bit_got = self.rsDB.getMonitoredsList(gran, paramsList = paramsL, siteName = siteName, serviceName = serviceName, serviceType = serviceType, resourceName = resourceName, storageElementName = storageElementName, gridSiteName = gridSiteName)
        return info_bit_got

    #############################################################################

    def _getPolicyDesc(self, policyName):
        """Return the configured human-readable description of a policy."""
        return self.configModule.Policies[policyName]['Description']

    #############################################################################

    def __getNameForPanel(self, granularity, name, panel):
        """Map a site-level (granularity, name) to the service-level pair a
        given panel actually refers to; other granularities pass through
        unchanged."""
        if granularity in ('Site', 'Sites'):
            if panel == 'Service_Computing_Panel':
                granularity = 'Service'
                name = 'Computing@' + name
            elif panel == 'Service_Storage_Panel':
                granularity = 'Service'
                name = 'Storage@' + name
            elif panel == 'OtherServices_Panel':
                granularity = 'Service'
                name = 'OtherS@' + name
            elif panel == 'Service_VOMS_Panel':
                granularity = 'Service'
                name = 'VOMS@' + name
            elif panel == 'Service_VO-BOX_Panel':
                granularity = 'Service'
                name = 'VO-BOX@' + name
            # else:
            #   granularity = granularity
            #   name = name
        # else:
        #   granularity = granularity
        #   name = name
        return (granularity, name)

    #############################################################################

    def _resExist(self, granularity, name):
        """Return True when the named resource exists in the RSS DB at the
        given granularity, False otherwise."""
        siteName = None
        serviceName = None
        resourceName = None
        storageElementName = None
        if granularity in ('Site', 'Sites'):
            siteName = name
        elif granularity in ('Service', 'Services'):
            serviceName = name
        elif granularity in ('Resource', 'Resources'):
            resourceName = name
        elif granularity in ('StorageElement', 'StorageElements'):
            storageElementName = name
        res = self.rsDB.getMonitoredsList(granularity, siteName = siteName, serviceName = serviceName, resourceName = resourceName, storageElementName = storageElementName)
        if res == []:
            return False
        else:
            return True
class FTSMonitorAgent(AgentModule):
    """
    .. class:: FTSMonitorAgent

    Monitor submitted FTS jobs.
    """
    # # transfer DB handle
    transferDB = None
    # # thread pool
    threadPool = None
    # # min threads
    minThreads = 1
    # # max threads
    maxThreads = 10
    # # missing source regexp patterns
    missingSourceErrors = [
        re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
        re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
        re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
        re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
        re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
                    " Command failed. : open error: No such file or directory" ),
        re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

    def initialize(self):
        """ agent's initialisation

        Reads thread-pool limits from options (negative values are taken as
        absolute, and min/max are swapped if given in the wrong order).
        """
        self.transferDB = TransferDB()
        self.am_setOption("shifterProxy", "DataManager")
        self.minThreads = self.am_getOption("MinThreads", self.minThreads)
        self.maxThreads = self.am_getOption("MaxThreads", self.maxThreads)
        minmax = (abs(self.minThreads), abs(self.maxThreads))
        self.minThreads, self.maxThreads = min(minmax), max(minmax)
        self.log.info("ThreadPool min threads = %s" % self.minThreads)
        self.log.info("ThreadPool max threads = %s" % self.maxThreads)
        self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
        self.threadPool.daemonize()
        return S_OK()

    def execute(self):
        """ push jobs to the thread pool """
        self.log.info("Obtaining requests to monitor")
        res = self.transferDB.getFTSReq()
        if not res["OK"]:
            self.log.error("Failed to get FTS requests", res['Message'])
            return res
        if not res["Value"]:
            self.log.info("No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res["Value"]
        self.log.info("Found %s FTS jobs" % len(ftsReqs))
        i = 1
        for ftsJob in ftsReqs:
            # retry queueing until the pool accepts the job
            while True:
                self.log.debug("submitting FTS Job %s FTSReqID=%s to monitor" % (i, ftsJob["FTSReqID"]))
                ret = self.threadPool.generateJobAndQueueIt(self.monitorTransfer, args=(ftsJob, ), )
                if ret["OK"]:
                    i += 1
                    break
                # # sleep 1 second to proceed
                time.sleep(1)
        self.threadPool.processAllResults()
        return S_OK()

    def ftsJobExpired(self, ftsReqID, channelID):
        """ clean up when FTS job had expired on the server side

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        """
        log = gLogger.getSubLogger("@%s" % str(ftsReqID))
        fileIDs = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not fileIDs["OK"]:
            log.error("Unable to retrieve FileIDs associated to %s request" % ftsReqID)
            return fileIDs
        fileIDs = fileIDs["Value"]
        # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
        for fileID in fileIDs:
            fileStatus = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID, "Status", "Failed")
            if not fileStatus["OK"]:
                log.error("Unable to set FileToFTS status to 'Failed' for FileID %s: %s" % (fileID, fileStatus["Message"]))
            failReason = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID, "Reason", "FTS job expired on server")
            if not failReason["OK"]:
                log.error("Unable to set FileToFTS reason for FileID %s: %s" % (fileID, failReason["Message"]))
        # # update Channel table
        resetChannels = self.transferDB.resetFileChannelStatus(channelID, fileIDs)
        if not resetChannels["OK"]:
            log.error("Failed to reset Channel table for files to retry")
            return resetChannels
        # # update FTSReq table
        log.info("Setting FTS request status to 'Finished'")
        ftsReqStatus = self.transferDB.setFTSReqStatus(ftsReqID, "Finished")
        if not ftsReqStatus["OK"]:
            log.error("Failed update FTS Request status", ftsReqStatus["Message"])
            return ftsReqStatus
        # # if we land here, everything should be OK
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """ monitors transfer obtained from TransferDB

        :param dict ftsReqDict: FTS job dictionary
        """
        ftsReqID = ftsReqDict.get("FTSReqID")
        ftsGUID = ftsReqDict.get("FTSGuid")
        ftsServer = ftsReqDict.get("FTSServer")
        channelID = ftsReqDict.get("ChannelID")
        sourceSE = ftsReqDict.get("SourceSE")
        targetSE = ftsReqDict.get("TargetSE")

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        log = gLogger.getSubLogger("@%s" % str(ftsReqID))

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        log.info("Perform summary update of the FTS Request")
        infoStr = ["glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)]
        infoStr.append("FTS GUID: %s" % ftsGUID)
        infoStr.append("FTS Server: %s" % ftsServer)
        log.info("\n".join(infoStr))
        res = oFTSRequest.summary()
        self.transferDB.setFTSReqLastMonitor(ftsReqID)
        if not res["OK"]:
            log.error("Failed to update the FTS request summary", res["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
                # the server forgot about this job: clean up our side
                log.error("FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side")
                cleanUp = self.ftsJobExpired(ftsReqID, channelID)
                if not cleanUp["OK"]:
                    log.error(cleanUp["Message"])
                return cleanUp
            return res
        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            log.error("Failed to get FTS request summary", res["Message"])
            return res
        log.info(res['Value'])
        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            log.error("Failed to get FTS percentage complete", res["Message"])
            return res
        log.info('FTS Request found to be %.1f percent complete' % res["Value"])
        self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete", res["Value"])
        self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res["OK"]:
            log.error("Failed to determine whether FTS request terminal", res["Message"])
            return res
        if not res["Value"]:
            return S_OK()
        # # request is terminal
        return self.terminalRequest(oFTSRequest, ftsReqID, channelID, sourceSE)

    def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
        """ process terminal FTS job

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param str sourceSE: FTSReq.SourceSE
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        log.info("FTS Request found to be terminal, updating file states")

        #########################################################################
        # Get the LFNS associated to the FTS request
        log.info("Obtaining the LFNs associated to this request")
        res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
        if not res["OK"]:
            log.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res["Value"]
        if not files:
            log.error("No files present for transfer")
            return S_ERROR("No files were found in the DB")
        lfns = files.keys()
        log.debug("Obtained %s files" % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res["OK"]:
            log.error("Failed to perform detailed monitoring of FTS request", res["Message"])
            return res
        res = oFTSRequest.getFailed()
        if not res["OK"]:
            log.error("Failed to obtained failed files for FTS request", res["Message"])
            return res
        failedFiles = res["Value"]
        res = oFTSRequest.getDone()
        if not res["OK"]:
            log.error("Failed to obtained successful files for FTS request", res["Message"])
            return res
        completedFiles = res["Value"]

        # An LFN can be included more than once if it was entered into more than one Request.
        # FTS will only do the transfer once. We need to identify all FileIDs
        res = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not res["OK"]:
            log.error("Failed to get FileIDs associated to FTS Request", res["Message"])
            return res
        fileIDs = res["Value"]
        res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
        if not res["OK"]:
            log.error("Failed to get LFNs associated to FTS Request", res["Message"])
            return res
        fileIDDict = res["Value"]

        fileToFTSUpdates = []
        completedFileIDs = []
        filesToRetry = []
        filesToFail = []

        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in completedFiles:
                completedFileIDs.append(fileID)
                transferTime = 0
                res = oFTSRequest.getTransferTime(lfn)
                if res["OK"]:
                    transferTime = res["Value"]
                fileToFTSUpdates.append((fileID, "Completed", "", 0, transferTime))
            if lfn in failedFiles:
                failReason = ""
                res = oFTSRequest.getFailReason(lfn)
                if res["OK"]:
                    failReason = res["Value"]
                if "Source file/user checksum mismatch" in failReason:
                    # corrupted source: never retry
                    filesToFail.append(fileID)
                    continue
                if self.missingSource(failReason):
                    log.error("The source SURL does not exist.", "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                    filesToFail.append(fileID)
                else:
                    filesToRetry.append(fileID)
                log.error("Failed to replicate file on channel.", "%s %s" % (channelID, failReason))
                fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

        # # update TransferDB.FileToFTS table
        updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                               filesToRetry, filesToFail,
                                               completedFileIDs, fileToFTSUpdates)

        if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
            res = oFTSRequest.finalize()
            if not res["OK"]:
                log.error("Failed to perform the finalization for the FTS request", res["Message"])
                return res

            log.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to add logging event for FTS Request', res['Message'])

            # update TransferDB.FileToCat table
            updateFileToCat = self.updateFileToCat(oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail)
            if not updateFileToCat["OK"]:
                log.error(updateFileToCat["Message"])

            log.debug("Updating FTS request status")
            res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed update FTS Request status', res['Message'])
        return S_OK()

    def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates):
        """ update TransferDB.FileToFTS table for finished request

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param list filesToRetry: FileIDs to retry
        :param list filesToFail: FileIDs for failed files
        :param list completedFileIDs: files completed
        :param list fileToFTSUpdates: tuples (fileID, status, reason, retries, transferTime)

        :return: S_OK(bool) - True when every DB update succeeded
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        allUpdated = True

        res = self.transferDB.resetFileChannelStatus(channelID, filesToRetry) if filesToRetry else S_OK()
        if not res["OK"]:
            log.error("Failed to update the Channel table for file to retry.", res["Message"])
            allUpdated = False

        for fileID in filesToFail:
            log.info("Updating the Channel table for files to reschedule")
            res = self.transferDB.setFileToReschedule(fileID)
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.", res["Message"])
                allUpdated = False
            elif res["Value"] == "max reschedule attempt reached":
                # BUGFIX: the original format string had no %s placeholder,
                # so the '%' application raised TypeError at runtime
                log.error("setting Channel status to 'Failed' : %s" % res["Value"])
                res = self.transferDB.setFileChannelStatus(channelID, fileID, 'Failed')
                if not res["OK"]:
                    log.error("Failed to update Channel table for failed files.", res["Message"])
                    allUpdated = False

        if completedFileIDs:
            res = self.transferDB.updateCompletedChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error("Failed to update the Channel table for successful files.", res["Message"])
                allUpdated = False
            res = self.transferDB.updateAncestorChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error('Failed to update the Channel table for ancestors of successful files.', res['Message'])
                allUpdated = False

        if fileToFTSUpdates:
            res = self.transferDB.setFileToFTSFileAttributes(ftsReqID, channelID, fileToFTSUpdates)
            if not res["OK"]:
                log.error("Failed to update the FileToFTS table for files.", res["Message"])
                allUpdated = False

        return S_OK(allUpdated)

    def updateFileToCat(self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail):
        """ update TransferDB.FileToCat table for finished request

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param dict fileIDDict: fileIDs dictionary
        :param int channelID: FTSReq.ChannelID
        """
        res = oFTSRequest.getFailedRegistrations()
        failedRegistrations = res["Value"]
        regFailedFileIDs = []
        regDoneFileIDs = []
        regForgetFileIDs = []
        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in failedRegistrations:
                regFailedFileIDs.append(fileID)
                # if the LFN appears more than once, FileToCat needs to be reset only once
                del failedRegistrations[lfn]
            elif lfn in completedFiles:
                regDoneFileIDs.append(fileID)
            elif fileID in filesToFail:
                regForgetFileIDs.append(fileID)

        res = self.transferDB.setRegistrationWaiting(channelID, regFailedFileIDs) if regFailedFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
            return res

        res = self.transferDB.setRegistrationDone(channelID, regDoneFileIDs) if regDoneFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        # This entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone(channelID, regForgetFileIDs) if regForgetFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        return S_OK()

    @classmethod
    def missingSource(cls, failReason):
        """ check if message sent by FTS server is concering missing source file

        :param str failReason: message sent by FTS server
        """
        for error in cls.missingSourceErrors:
            if error.search(failReason):
                return 1
        return 0
class SystemAdministratorIntegrator( object ):
  """ Fans a single SystemAdministrator RPC call out to a set of hosts and
      collects the per-host results.

      Any attribute access that is not a real method is interpreted as the
      name of the remote call to perform, e.g. ``integrator.getInfo()``.
  """

  def __init__( self, **kwargs ):
    """ Constructor """
    # Determine the working set of hosts: explicit list or the Registry
    if 'hosts' in kwargs:
      self.__hosts = kwargs.pop( 'hosts' )
    else:
      hostsResult = Registry.getHosts()
      self.__hosts = hostsResult['Value'] if hostsResult['OK'] else []
      # Excluded hosts
      if 'exclude' in kwargs:
        self.__hosts = list( set( self.__hosts ) - set( kwargs[ 'exclude' ] ) )

    # Ping the hosts to remove those that don't have a SystemAdministrator service
    self.__resultDict = {}
    self.__kwargs = {}
    pingPool = ThreadPool( len( self.__hosts ) )
    for host in self.__hosts:
      pingPool.generateJobAndQueueIt( self.__executeClient,
                                      args = [ host, "ping" ],
                                      kwargs = {},
                                      oCallback = self.__processResult )
    pingPool.processAllResults()
    # Split the ping answers into responding and silent hosts
    self.silentHosts = [ host for host, result in self.__resultDict.items() if not result['OK'] ]
    respondingHosts = [ host for host, result in self.__resultDict.items() if result['OK'] ]
    del pingPool

    self.__hosts = respondingHosts
    self.__kwargs = dict( kwargs )
    self.__pool = ThreadPool( len( self.__hosts ) )
    self.__resultDict = {}

  def getSilentHosts( self ):
    """ Get a list of non-responding hosts

    :return: list of hosts
    """
    return self.silentHosts

  def getRespondingHosts( self ):
    """ Get a list of responding hosts

    :return: list of hosts
    """
    return self.__hosts

  def __getattr__( self, name ):
    # Remember the requested remote method; the actual fan-out happens
    # when the returned execute() is invoked
    self.call = name
    return self.execute

  def __executeClient( self, host, method, *parms, **kwargs ):
    """ Execute RPC method on a given host """
    hostName = Registry.getHostOption( host, 'Host', host )
    client = SystemAdministratorClient( hostName, **self.__kwargs )
    remoteMethod = getattr( client, method )
    result = remoteMethod( *parms, **kwargs )
    # tag the answer so the callback knows which host it belongs to
    result['Host'] = host
    return result

  def __processResult( self, id_, result ):
    """ Collect results in the final structure """
    self.__resultDict[ result.pop( 'Host' ) ] = result

  def execute( self, *args, **kwargs ):
    """ Main execution method """
    self.__resultDict = {}
    for host in self.__hosts:
      jobArgs = [ host, self.call ] + list( args )
      self.__pool.generateJobAndQueueIt( self.__executeClient,
                                         args = jobArgs,
                                         kwargs = kwargs,
                                         oCallback = self.__processResult )
    self.__pool.processAllResults()
    return S_OK( self.__resultDict )
class OutputDataExecutor:
    """Transfers files listed under the OutputData CS section from an input
    location (local disk or a file catalog) to an output SE/FC, using a
    thread pool; keeps counters of transferred/failed files.
    """

    def __init__(self, csPath=""):
        """
        :param str csPath: CS section with the transfer definitions; when
                           empty, /Operations/<vo>/OutputData is used
        """
        self.log = gLogger.getSubLogger("OutputDataExecutor")
        if not csPath:
            vo = gConfig.getValue("/DIRAC/VirtualOrganization", "")
            self.__transfersCSPath = '/Operations/%s/OutputData' % vo
        else:
            self.__transfersCSPath = csPath
        self.log.verbose("Reading transfer paths from %s" % self.__transfersCSPath)
        self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']
        self.__threadPool = ThreadPool(
            gConfig.getValue("%s/MinTransfers" % self.__transfersCSPath, 1),
            gConfig.getValue("%s/MaxTransfers" % self.__transfersCSPath, 4),
            gConfig.getValue("%s/MaxQueuedTransfers" % self.__transfersCSPath, 100))
        self.__threadPool.daemonize()
        # files currently queued/being transferred (basenames)
        self.__processingFiles = set()
        self.__okTransferredFiles = 0
        self.__okTransferredBytes = 0
        # basename -> number of consecutive failures
        self.__failedFiles = {}

    def getNumOKTransferredFiles(self):
        """Number of files successfully transferred so far."""
        return self.__okTransferredFiles

    def getNumOKTransferredBytes(self):
        """Total bytes successfully transferred so far."""
        return self.__okTransferredBytes

    def transfersPending(self):
        """True while the thread pool still has work in flight."""
        return self.__threadPool.isWorking()

    def getDefinedTransferPaths(self):
        """Read the transfer definitions from the CS.

        :return: S_OK({name: transferDict}) - only entries having all
                 required options are kept
        """
        result = gConfig.getSections(self.__transfersCSPath)
        if not result['OK']:
            self.log.info('No Input/Output Pair defined in CS')
            # BUGFIX: return an empty dict instead of a bare S_OK() whose
            # Value is None - callers iterate over the Value
            return S_OK({})
        pathList = result['Value']
        tPaths = {}
        for name in pathList:
            csPath = self.__transfersCSPath + '/%s' % name
            result = gConfig.getOptionsDict(csPath)
            if not result['OK']:
                continue
            transferDict = result['Value']
            ok = True
            for i in self.__requiredCSOptions:
                if i not in transferDict:
                    self.log.error('Missing Option %s in %s' % (i, csPath))
                    ok = False
                    break
            if not ok:
                continue
            tPaths[name] = transferDict
        return S_OK(tPaths)

    def getNumLocalOutgoingFiles(self):
        """Count files waiting on local disk across all LocalDisk transfer paths."""
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return 0
        localOutgoing = 0
        tPaths = result['Value']
        for name in tPaths:
            transferDict = tPaths[name]
            if 'LocalDisk' != transferDict['InputFC']:
                continue
            localOutgoing += len(self.getOutgoingFiles(transferDict))
        return localOutgoing

    def getOutgoingFiles(self, transferDict):
        """
        Get list of files to be processed from InputPath
        """
        inputFCName = transferDict['InputFC']
        inputPath = transferDict['InputPath']
        if inputFCName == 'LocalDisk':
            files = []
            try:
                for file in os.listdir(inputPath):
                    if os.path.isfile(os.path.join(inputPath, file)):
                        files.append(file)
            except OSError:
                # narrowed from a bare except: directory may be missing or
                # unreadable - treat as "nothing to transfer"
                pass
            return files
        inputFC = FileCatalog([inputFCName])
        result = inputFC.listDirectory(inputPath, True)
        if not result['OK']:
            self.log.error(result['Message'])
            return []
        if not inputPath in result['Value']['Successful']:
            self.log.error(result['Value']['Failed'][inputPath])
            return []
        subDirs = result['Value']['Successful'][inputPath]['SubDirs']
        files = result['Value']['Successful'][inputPath]['Files']
        for dir in subDirs:
            self.log.info('Ignoring subdirectory:', dir)
        return files.keys()

    def checkForTransfers(self):
        """
        Check for transfers to do and start them
        """
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return result
        tPaths = result['Value']
        for name in tPaths:
            transferPath = tPaths[name]
            self.log.verbose("Checking %s transfer path" % name)
            filesToTransfer = self.getOutgoingFiles(tPaths[name])
            self.log.info("Transfer path %s has %d files" % (name, len(filesToTransfer)))
            ret = self.__addFilesToThreadPool(filesToTransfer, transferPath)
            if not ret['OK']:
                # The thread pool got full
                break

    def processAllPendingTransfers(self):
        """Block until the thread pool has drained all queued transfers."""
        self.__threadPool.processAllResults()

    @transferSync
    def __addFilesToThreadPool(self, files, transferDict):
        """Queue each not-yet-processing file onto the thread pool."""
        for file in files:
            file = os.path.basename(file)
            if file in self.__processingFiles:
                continue
            self.__processingFiles.add(file)
            # NOTE(review): 1s pacing between submissions kept from the
            # original - presumably to throttle catalog/SE load; confirm
            time.sleep(1)
            ret = self.__threadPool.generateJobAndQueueIt(
                self.__transferIfNotRegistered,
                args=(file, transferDict),
                oCallback=self.transferCallback,
                blocking=False)
            if not ret['OK']:
                # The thread pool got full
                return ret
        return S_OK()

    def __transferIfNotRegistered(self, file, transferDict):
        """Transfer one file unless it is already registered in the output
        catalog; in that case remove the leftover input copy instead."""
        result = self.isRegisteredInOutputCatalog(file, transferDict)
        if not result['OK']:
            self.log.error(result['Message'])
            return result
        #Already registered. Need to delete
        if result['Value']:
            self.log.info(
                "Transfer file %s is already registered in the output catalog" % file)
            #Delete
            filePath = os.path.join(transferDict['InputPath'], file)
            if transferDict['InputFC'] == 'LocalDisk':
                os.unlink(filePath)
            else:
                inputFC = FileCatalog([transferDict['InputFC']])
                replicaDict = inputFC.getReplicas(filePath)
                # BUGFIX: this branch referenced an undefined name 'inFile'
                # (NameError at runtime); it now consistently uses filePath
                if not replicaDict['OK']:
                    self.log.error("Error deleting file", replicaDict['Message'])
                elif not filePath in replicaDict['Value']['Successful']:
                    self.log.error("Error deleting file",
                                   replicaDict['Value']['Failed'][filePath])
                else:
                    seList = replicaDict['Value']['Successful'][filePath].keys()
                    for se in seList:
                        se = StorageElement(se)
                        self.log.info('Removing from %s:' % se.name, filePath)
                        se.removeFile(filePath)
                    # BUGFIX: remove the catalog entry by its full path, not
                    # the bare basename
                    inputFC.removeFile(filePath)
            self.log.info("File %s deleted from %s" % (file, transferDict['InputFC']))
            self.__processingFiles.discard(file)
            return S_OK(file)
        #Do the transfer
        return self.__retrieveAndUploadFile(file, transferDict)

    def isRegisteredInOutputCatalog(self, file, transferDict):
        """S_OK(bool): is the file already registered on one of the output SEs?"""
        fc = FileCatalog([transferDict['OutputFC']])
        lfn = os.path.join(transferDict['OutputPath'], os.path.basename(file))
        result = fc.getReplicas(lfn)
        if not result['OK']:
            return result
        if lfn not in result['Value']['Successful']:
            return S_OK(False)
        replicas = result['Value']['Successful'][lfn]
        for seName in List.fromChar(transferDict['OutputSE'], ","):
            if seName in replicas:
                self.log.verbose(
                    "Transfer file %s is already registered in %s SE" % (file, seName))
                return S_OK(True)
        return S_OK(False)

    def __retrieveAndUploadFile(self, file, outputDict):
        """
        Retrieve, Upload, and remove
        """
        fileName = file
        inputPath = outputDict['InputPath']
        inputFCName = outputDict['InputFC']
        inBytes = 0
        if inputFCName == 'LocalDisk':
            inFile = file
            file = os.path.join(inputPath, file)
        else:
            inputFC = FileCatalog([inputFCName])
            inFile = os.path.join(inputPath, file)
            replicaDict = inputFC.getReplicas(inFile)
            if not replicaDict['OK']:
                self.log.error(replicaDict['Message'])
                return S_ERROR(fileName)
            if not inFile in replicaDict['Value']['Successful']:
                self.log.error(replicaDict['Value']['Failed'][inFile])
                return S_ERROR(fileName)
            seList = replicaDict['Value']['Successful'][inFile].keys()
            inputSE = StorageElement(seList[0])
            self.log.info('Retrieving from %s:' % inputSE.name, inFile)
            # ret = inputSE.getFile( inFile )
            # lcg_util binding prevent multithreading, use subprocess instead
            res = pythonCall(2 * 3600, inputSE.getFile, inFile)
            if not res['OK']:
                self.log.error(res['Message'])
                return S_ERROR(fileName)
            ret = res['Value']
            if not ret['OK']:
                self.log.error(ret['Message'])
                return S_ERROR(fileName)
            if not inFile in ret['Value']['Successful']:
                self.log.error(ret['Value']['Failed'][inFile])
                return S_ERROR(fileName)
        if os.path.isfile(file):
            # st_size of the locally retrieved copy
            inBytes = os.stat(file)[6]
        outputPath = outputDict['OutputPath']
        outputFCName = outputDict['OutputFC']
        replicaManager = ReplicaManager()
        outFile = os.path.join(outputPath, os.path.basename(file))
        transferOK = False
        for outputSEName in List.fromChar(outputDict['OutputSE'], ","):
            outputSE = StorageElement(outputSEName)
            self.log.info('Trying to upload to %s:' % outputSE.name, outFile)
            # ret = replicaManager.putAndRegister( outFile, os.path.realpath( file ), outputSE.name, catalog=outputFCName )
            # lcg_util binding prevent multithreading, use subprocess instead
            result = pythonCall(2 * 3600, replicaManager.putAndRegister, outFile,
                                os.path.realpath(file), outputSE.name,
                                catalog=outputFCName)
            if result['OK'] and result['Value']['OK']:
                if outFile in result['Value']['Value']['Successful']:
                    transferOK = True
                    break
                else:
                    self.log.error(result['Value']['Value']['Failed'][outFile])
            else:
                if result['OK']:
                    self.log.error(result['Value']['Message'])
                else:
                    self.log.error(result['Message'])
        if not transferOK:
            return S_ERROR(fileName)
        # NOTE(review): after the transferOK guard result['OK'] is always
        # true, so the second branch below is effectively dead; kept as-is
        # to preserve behavior
        if result['OK'] or not inputFCName == 'LocalDisk':
            os.unlink(file)
        if not result['OK']:
            self.log.error(ret['Message'])
            return S_ERROR(fileName)
        self.log.info("Finished transferring %s [%s bytes]" % (inFile, inBytes))
        self.__okTransferredFiles += 1
        self.__okTransferredBytes += inBytes
        if inputFCName == 'LocalDisk':
            return S_OK(fileName)
        # Now the file is on final SE/FC, remove from input SE/FC
        for se in seList:
            se = StorageElement(se)
            self.log.info('Removing from %s:' % se.name, inFile)
            se.removeFile(inFile)
        inputFC.removeFile(inFile)
        return S_OK(fileName)

    @transferSync
    def transferCallback(self, threadedJob, submitResult):
        """Thread-pool callback: book-keeping of failed/finished transfers."""
        if not submitResult['OK']:
            file = submitResult['Message']
            if file not in self.__failedFiles:
                self.__failedFiles[file] = 0
            self.__failedFiles[file] += 1
        else:
            file = submitResult['Value']
            if file in self.__failedFiles:
                del self.__failedFiles[file]
        #Take out from processing files
        if file in self.__processingFiles:
            self.__processingFiles.discard(file)