class StorageFactory:
  """ Factory building Storage (protocol plug-in) objects, either directly from a
      supplied parameter dictionary or from the Storage Element description found
      in the Configuration Service under /Resources/StorageElements.
  """

  def __init__( self, useProxy = False, vo = None ):
    """ c'tor

    :param bool useProxy: if True, every storage object is instantiated as a 'Proxy' storage
    :param str vo: VO name; when None it is resolved from the current proxy group
    :raises RuntimeError: when no VO is supplied and none can be resolved from the proxy
    """
    self.rootConfigPath = '/Resources/StorageElements'
    # Fixed: the attribute was previously assigned twice (False, then useProxy)
    self.proxy = useProxy
    self.resourceStatus = ResourceStatus()
    self.vo = vo
    if self.vo is None:
      result = getVOfromProxyGroup()
      if result['OK']:
        self.vo = result['Value']
      else:
        # Fixed: the exception object was created but never raised
        raise RuntimeError( "Can not get the current VO context" )
    self.remotePlugins = []
    self.localPlugins = []
    self.name = ''
    self.options = {}
    self.protocolDetails = []
    self.storages = []

  ###########################################################################################
  #
  # Below are public methods for obtaining storage objects
  #

  def getStorageName( self, initialName ):
    """ Resolve the real SE name, following a possible 'Alias' reference in the CS """
    return self._getConfigStorageName( initialName, 'Alias' )

  def getStorage( self, parameterDict ):
    """ This instantiates a single storage for the details provided and doesn't check the CS.

    :param dict parameterDict: must contain 'StorageName' and 'PluginName'
                               ('ProtocolName' is accepted for backward compatibility)
    :return: S_OK( storageObject ) / S_ERROR
    """
    # The storage name must be supplied.
    if 'StorageName' in parameterDict:
      storageName = parameterDict['StorageName']
    else:
      errStr = "StorageFactory.getStorage: StorageName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    # PluginName must be supplied otherwise nothing will work.
    if 'PluginName' in parameterDict:
      pluginName = parameterDict['PluginName']
    # Temporary fix for backward compatibility
    elif 'ProtocolName' in parameterDict:
      pluginName = parameterDict['ProtocolName']
    else:
      errStr = "StorageFactory.getStorage: PluginName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    return self.__generateStorageObject( storageName, pluginName, parameterDict )

  def getStorages( self, storageName, pluginList = None ):
    """ Get an instance of a Storage based on the DIRAC SE name based on the CS entries CS

    :param str storageName: the DIRAC SE name i.e. 'CERN-RAW'
    :param list pluginList: optional list of protocols if a sub-set is desired i.e ['SRM2','SRM1']
    :return: S_OK( dict ) with keys StorageName, StorageOptions, StorageObjects,
             LocalPlugins, RemotePlugins, ProtocolOptions, TurlProtocols
    """
    # Reset per-call state accumulated by previous invocations
    self.remotePlugins = []
    self.localPlugins = []
    self.name = ''
    self.options = {}
    self.protocolDetails = []
    self.storages = []
    if pluginList is None:
      pluginList = []
    if not self.vo:
      return S_ERROR( 'Mandatory vo parameter is not defined' )

    # Get the name of the storage provided
    res = self._getConfigStorageName( storageName, 'Alias' )
    if not res['OK']:
      return res
    storageName = res['Value']
    self.name = storageName

    # In case the storage is made from a base SE, get this information
    res = self._getConfigStorageName( storageName, 'BaseSE' )
    if not res['OK']:
      return res
    storageName = res['Value']

    # Get the options defined in the CS for this storage
    res = self._getConfigStorageOptions( storageName )
    if not res['OK']:
      return res
    self.options = res['Value']

    # Get the protocol specific details
    res = self._getConfigStorageProtocols( storageName )
    if not res['OK']:
      return res
    self.protocolDetails = res['Value']

    requestedLocalPlugins = []
    requestedRemotePlugins = []
    requestedProtocolDetails = []
    turlProtocols = []
    # Generate the protocol specific plug-ins
    for protocolDict in self.protocolDetails:
      pluginName = protocolDict.get( 'PluginName' )
      if pluginList and pluginName not in pluginList:
        continue
      protocol = protocolDict['Protocol']
      result = self.__generateStorageObject( storageName, pluginName, protocolDict )
      if result['OK']:
        self.storages.append( result['Value'] )
        if pluginName in self.localPlugins:
          turlProtocols.append( protocol )
          requestedLocalPlugins.append( pluginName )
        if pluginName in self.remotePlugins:
          requestedRemotePlugins.append( pluginName )
        requestedProtocolDetails.append( protocolDict )
      else:
        # A single failing plug-in is not fatal; others may still instantiate
        gLogger.info( result['Message'] )

    if len( self.storages ) > 0:
      resDict = {}
      resDict['StorageName'] = self.name
      resDict['StorageOptions'] = self.options
      resDict['StorageObjects'] = self.storages
      resDict['LocalPlugins'] = requestedLocalPlugins
      resDict['RemotePlugins'] = requestedRemotePlugins
      resDict['ProtocolOptions'] = requestedProtocolDetails
      resDict['TurlProtocols'] = turlProtocols
      return S_OK( resDict )
    else:
      errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols."
      gLogger.error( errStr, self.name )
      return S_ERROR( errStr )

  ###########################################################################################
  #
  # Below are internal methods for obtaining section/option/value configuration
  #

  def _getConfigStorageName( self, storageName, referenceType ):
    """ This gets the name of the storage the configuration service.
        If the storage is a reference to another SE the resolution is performed.

    :param str storageName: the storage section to check in the CS
    :param str referenceType: 'Alias' or 'BaseSE' reference option to follow
    """
    configPath = cfgPath( self.rootConfigPath, storageName )
    res = gConfig.getOptions( configPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageName: Failed to get storage options"
      gLogger.error( errStr, res['Message'] )
      return S_ERROR( errStr )
    if not res['Value']:
      errStr = "StorageFactory._getConfigStorageName: Supplied storage doesn't exist."
      gLogger.error( errStr, configPath )
      return S_ERROR( errStr )
    if referenceType in res['Value']:
      # Follow the reference recursively (an Alias may itself be aliased)
      configPath = cfgPath( self.rootConfigPath, storageName, referenceType )
      referenceName = gConfig.getValue( configPath )
      result = self._getConfigStorageName( referenceName, 'Alias' )
      if not result['OK']:
        return result
      resolvedName = result['Value']
    else:
      resolvedName = storageName
    return S_OK( resolvedName )

  def _getConfigStorageOptions( self, storageName ):
    """ Get the options associated to the StorageElement as defined in the CS,
        merged with the current RSS status types for the SE.
    """
    storageConfigPath = cfgPath( self.rootConfigPath, storageName )
    res = gConfig.getOptions( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage options."
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    options = res['Value']
    optionsDict = {}
    for option in options:
      # The access flags come from RSS below, not from the static CS values
      if option in [ 'ReadAccess', 'WriteAccess', 'CheckAccess', 'RemoveAccess']:
        continue
      optionConfigPath = cfgPath( storageConfigPath, option )
      if option in [ 'VO' ]:
        optionsDict[option] = gConfig.getValue( optionConfigPath, [] )
      else:
        optionsDict[option] = gConfig.getValue( optionConfigPath, '' )

    res = self.resourceStatus.getStorageElementStatus( storageName )
    if not res[ 'OK' ]:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage status"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    # For safety, we did not add the ${statusType}Access keys
    # this requires modifications in the StorageElement class
    # We add the dictionary with the statusTypes and values
    # { 'statusType1' : 'status1', 'statusType2' : 'status2' ... }
    optionsDict.update( res[ 'Value' ][ storageName ] )

    return S_OK( optionsDict )

  def _getConfigStorageProtocols( self, storageName ):
    """ Protocol specific information is present as sections in the Storage configuration """
    storageConfigPath = cfgPath( self.rootConfigPath, storageName )
    res = gConfig.getSections( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageProtocols: Failed to get storage sections"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    protocolSections = res['Value']
    # Deterministic ordering so TURL protocol precedence is stable
    sortedProtocolSections = sortList( protocolSections )
    protocolDetails = []
    for protocolSection in sortedProtocolSections:
      res = self._getConfigStorageProtocolDetails( storageName, protocolSection )
      if not res['OK']:
        return res
      protocolDetails.append( res['Value'] )
    return S_OK( protocolDetails )

  def _getConfigStorageProtocolDetails( self, storageName, protocolSection ):
    """ Parse the contents of the protocol block

    :return: S_OK( dict ) with at least the keys Access, Host, Path, Port,
             Protocol, PluginName, SpaceToken, WSUrl
    """
    # First obtain the options that are available
    protocolConfigPath = cfgPath( self.rootConfigPath, storageName, protocolSection )
    res = gConfig.getOptions( protocolConfigPath )
    if not res['OK']:
      errStr = "StorageFactory.__getProtocolDetails: Failed to get protocol options."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocolSection ) )
      return S_ERROR( errStr )
    options = res['Value']

    # We must have certain values internally even if not supplied in CS
    protocolDict = {'Access':'', 'Host':'', 'Path':'', 'Port':'', 'Protocol':'',
                    'PluginName':'', 'SpaceToken':'', 'WSUrl':''}
    for option in options:
      configPath = cfgPath( protocolConfigPath, option )
      optionValue = gConfig.getValue( configPath, '' )
      protocolDict[option] = optionValue

    # This is a temporary for backward compatibility
    if "ProtocolName" in protocolDict and not protocolDict['PluginName']:
      protocolDict['PluginName'] = protocolDict['ProtocolName']
    protocolDict.pop( 'ProtocolName', None )

    # Evaluate the base path taking into account possible VO specific setting
    if self.vo:
      result = gConfig.getOptionsDict( cfgPath( protocolConfigPath, 'VOPath' ) )
      voPath = ''
      if result['OK']:
        voPath = result['Value'].get( self.vo, '' )
      if voPath:
        protocolDict['Path'] = voPath

    # Now update the local and remote protocol lists.
    # A warning will be given if the Access option is not set.
    if protocolDict['Access'].lower() == 'remote':
      self.remotePlugins.append( protocolDict['PluginName'] )
    elif protocolDict['Access'].lower() == 'local':
      self.localPlugins.append( protocolDict['PluginName'] )
    else:
      errStr = "StorageFactory.__getProtocolDetails: The 'Access' option for %s:%s is neither 'local' or 'remote'." % ( storageName, protocolSection )
      gLogger.warn( errStr )

    # The PluginName option must be defined
    if not protocolDict['PluginName']:
      errStr = "StorageFactory.__getProtocolDetails: 'PluginName' option is not defined."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocolSection ) )
      return S_ERROR( errStr )
    return S_OK( protocolDict )

  ###########################################################################################
  #
  # Below is the method for obtaining the object instantiated for a provided storage configuration
  #

  def __generateStorageObject( self, storageName, pluginName, parameters ):
    """ Load the storage plug-in class and instantiate it.

    :param str storageName: name of the SE
    :param str pluginName: name of the plug-in (replaced by 'Proxy' when self.proxy is set)
    :param dict parameters: constructor parameters for the storage class
    """
    storageType = pluginName
    if self.proxy:
      storageType = 'Proxy'

    objectLoader = ObjectLoader()
    result = objectLoader.loadObject( 'Resources.Storage.%sStorage' % storageType, storageType + 'Storage' )
    if not result['OK']:
      gLogger.error( 'Failed to load storage object: %s' % result['Message'] )
      return result

    storageClass = result['Value']
    try:
      storage = storageClass( storageName, parameters )
    except Exception as x:
      errStr = "StorageFactory._generateStorageObject: Failed to instantiate %s: %s" % ( storageName, x )
      gLogger.exception( errStr )
      return S_ERROR( errStr )
    return S_OK( storage )
# List every Storage Element defined in the CS together with its RSS read/write status.
storageCFGBase = "/Resources/StorageElements"
res = gConfig.getSections( storageCFGBase, True )
if not res[ 'OK' ]:
  gLogger.error( 'Failed to get storage element info' )
  gLogger.error( res[ 'Message' ] )
  DIRAC.exit( -1 )

# Header row, column-aligned with the per-SE lines printed below
gLogger.info( "%s %s %s" % ( 'Storage Element'.ljust( 25 ), 'Read Status'.rjust( 15 ), 'Write Status'.rjust( 15 ) ) )

seList = sortList( res[ 'Value' ] )

resourceStatus = ResourceStatus()

res = resourceStatus.getStorageElementStatus( seList )
if not res[ 'OK' ]:
  gLogger.error( "Failed to get StorageElement status for %s" % str( seList ) )
  # Fixed: previously execution fell through and crashed on res['Value'] below
  DIRAC.exit( -1 )

for k, v in res[ 'Value' ].items():
  # Default to 'Active' when a status type is not reported for the SE
  readState, writeState = 'Active', 'Active'
  if 'Read' in v:
    readState = v[ 'Read' ]
  if 'Write' in v:
    writeState = v[ 'Write']
  gLogger.notice( "%s %s %s" % ( k.ljust( 25 ), readState.rjust( 15 ), writeState.rjust( 15 ) ) )
class FTS3Placement( FTSAbstractPlacement ):
  """
  This class manages all the FTS strategies, routes and what not
  """

  # Server selection policy used by __chooseFTS3Server: 'Random', 'Sequence' or 'Failover'
  __serverPolicy = "Random"
  # Index of the next server returned by the 'Sequence' policy
  __nextServerID = 0
  # List of FTS3 server URLs fetched from the CS in __init__
  __serverList = None
  # NOTE(review): never read — __init__ sets self.maxAttempts (unmangled) instead; confirm intent
  __maxAttempts = 0

  def __init__( self, csPath = None, ftsHistoryViews = None ):
    """ Call the init of the parent, and initialize the list of FTS3 servers

    :param csPath: CS path forwarded to FTSAbstractPlacement
    :param ftsHistoryViews: history views forwarded to FTSAbstractPlacement
    """
    self.log = gLogger.getSubLogger( "FTS3Placement" )
    super( FTS3Placement, self ).__init__( csPath = csPath, ftsHistoryViews = ftsHistoryViews )
    srvList = getFTS3Servers()
    if not srvList['OK']:
      # Best effort: log the failure and continue with an empty server list
      self.log.error( srvList['Message'] )

    self.__serverList = srvList.get( 'Value', [] )
    # One placement attempt per known server
    self.maxAttempts = len( self.__serverList )

    self.rssClient = ResourceStatus()

  def getReplicationTree( self, sourceSEs, targetSEs, size, strategy = None ):
    """ For multiple source to multiple destination, find the optimal replication
        strategy.

       :param sourceSEs : list of source SE
       :param targetSEs : list of destination SE
       :param size : size of the File
       :param strategy : which strategy to use

       :returns S_OK(dict) < route name : { dict with key Ancestor, SourceSE, TargetSEtargetSE, Strategy } >

       For the time being, we are waiting for FTS3 to provide advisory mechanisms. So we just use
       simple techniques
    """
    # NOTE(review): 'strategy' and 'size' are currently ignored — a single random
    # source is paired with every target; confirm this is the intended stop-gap.
    # We will use a single random source
    sourceSE = random.choice( sourceSEs )

    tree = {}

    for targetSE in targetSEs:
      tree["%s#%s" % ( sourceSE, targetSE )] = { "Ancestor" : False, "SourceSE" : sourceSE,
                                                 "TargetSE" : targetSE, "Strategy" : "FTS3Simple" }

    return S_OK( tree )

  def refresh( self, ftsHistoryViews ):
    """ Refresh, whatever that means... recalculate all what you need,
        fetches the latest conf and what not.
    """
    # Delegates entirely to the parent implementation
    return super( FTS3Placement, self ).refresh( ftsHistoryViews = ftsHistoryViews )

  def __failoverServerPolicy(self, attempt = 0):
    """
       Returns always the server at a given position (normally the first one)

       :param attempt: position of the server in the list
    """
    if attempt >= len( self.__serverList ):
      raise Exception( "FTS3Placement.__failoverServerPolicy: attempt to reach non existing server index" )
    return self.__serverList[attempt]

  def __sequenceServerPolicy( self ):
    """
       Every time the this policy is called, return the next server on the list
    """
    fts3server = self.__serverList[self.__nextServerID]
    # Wrap around to the start of the list once the end is reached
    self.__nextServerID = ( self.__nextServerID + 1 ) % len( self.__serverList )
    return fts3server

  def __randomServerPolicy(self):
    """
      return a random server from the list
    """
    return random.choice( self.__serverList )

  def __chooseFTS3Server( self ):
    """
        Choose the appropriate FTS3 server depending on the policy
    """
    fts3Server = None
    attempt = 0
    # FIXME : need to get real valeu from RSS
    # Placeholder: servers are always considered healthy until RSS integration lands
    ftsServerStatus = True

    # Retry until a server is accepted or every known server has been tried
    while not fts3Server and attempt < self.maxAttempts:
      if self.__serverPolicy == 'Random':
        fts3Server = self.__randomServerPolicy()
      elif self.__serverPolicy == 'Sequence':
        fts3Server = self.__sequenceServerPolicy()
      elif self.__serverPolicy == 'Failover':
        fts3Server = self.__failoverServerPolicy( attempt = attempt )
      else:
        self.log.error( 'Unknown server policy %s. Using Random instead' % self.__serverPolicy )
        fts3Server = self.__randomServerPolicy()

      if not ftsServerStatus:
        self.log.warn( 'FTS server %s is not in good shape. Choose another one' % fts3Server )
        fts3Server = None
      attempt += 1

      # FIXME : I need to get the FTS server status from RSS
      # ftsStatusFromRss = rss.ftsStatusOrSomethingLikeThat

    if fts3Server:
      return S_OK( fts3Server )

    return S_ERROR ( "Could not find an FTS3 server (max attempt reached)" )

  def findRoute( self, sourceSE, targetSE ):
    """ Find the appropriate route from point A to B

      :param sourceSE : source SE
      :param targetSE : destination SE

      :returns S_OK(FTSRoute)
    """
    fts3server = self.__chooseFTS3Server()

    if not fts3server['OK']:
      return fts3server

    fts3server = fts3server['Value']

    route = FTSRoute( sourceSE, targetSE, fts3server )

    return S_OK( route )

  def isRouteValid( self, route ):
    """
        FIXME: until RSS is ready, I check manually the status
        In FTS3, all routes are valid a priori.
        If a route was not valid for some reason, then FTS would know it
        thanks to the blacklist sent by RSS, and would deal with it itself.

       :param route : FTSRoute

       :returns S_OK or S_ERROR(reason)
    """
    # Source must be readable ('Active' or 'Degraded') according to RSS
    rAccess = self.rssClient.getStorageElementStatus( route.sourceSE, "ReadAccess" )
    self.log.debug( "se read %s %s" % ( route.sourceSE, rAccess ) )
    if not rAccess["OK"]:
      self.log.error( rAccess["Message"] )
      return rAccess

    if rAccess["Value"][route.sourceSE]["ReadAccess"] not in ( "Active", "Degraded" ):
      return S_ERROR( "Source SE is not readable" )

    # Target must be writable ('Active' or 'Degraded') according to RSS
    wAccess = self.rssClient.getStorageElementStatus( route.targetSE, "WriteAccess" )
    self.log.debug( "se write %s %s" % ( route.targetSE, wAccess ) )
    if not wAccess["OK"]:
      self.log.error( wAccess["Message"] )
      return wAccess
    if wAccess["Value"][route.targetSE]["WriteAccess"] not in ( "Active", "Degraded" ):
      return S_ERROR( "Target SE is not writable" )

    return S_OK()
class StrategyHandler( object ):
  """
  .. class:: StrategyHandler

  StrategyHandler is a helper class for determining optimal replication tree for given
  source files, their replicas and target storage elements.
  """

  def __init__( self, configSection, channels=None, bandwidths=None, failedFiles=None ):
    """c'tor

    :param self: self reference
    :param str configSection: path on CS to ReplicationScheduler agent
    :param bandwithds: observed throughput on active channels
    :param channels: active channels
    :param int failedFiles: max number of distinct failed files to allow scheduling
    :raises SHGraphCreationError: when the FTS graph cannot be built in setup()
    """
    ## save config section
    self.configSection = configSection + "/" + self.__class__.__name__
    ## sublogger
    self.log = gLogger.getSubLogger( "StrategyHandler", child=True )
    self.log.setLevel( gConfig.getValue( self.configSection + "/LogLevel", "DEBUG" ) )

    self.supportedStrategies = [ 'Simple', 'DynamicThroughput', 'Swarm', 'MinimiseTotalWait' ]
    self.log.info( "Supported strategies = %s" % ", ".join( self.supportedStrategies ) )

    ## tunables read from the CS, each logged at its configured value
    self.sigma = gConfig.getValue( self.configSection + '/HopSigma', 0.0 )
    self.log.info( "HopSigma = %s" % self.sigma )
    self.schedulingType = gConfig.getValue( self.configSection + '/SchedulingType', 'File' )
    self.log.info( "SchedulingType = %s" % self.schedulingType )
    self.activeStrategies = gConfig.getValue( self.configSection + '/ActiveStrategies', ['MinimiseTotalWait'] )
    self.log.info( "ActiveStrategies = %s" % ", ".join( self.activeStrategies ) )
    self.numberOfStrategies = len( self.activeStrategies )
    self.log.info( "Number of active strategies = %s" % self.numberOfStrategies )
    self.acceptableFailureRate = gConfig.getValue( self.configSection + '/AcceptableFailureRate', 75 )
    self.log.info( "AcceptableFailureRate = %s" % self.acceptableFailureRate )
    self.acceptableFailedFiles = gConfig.getValue( self.configSection + "/AcceptableFailedFiles", 5 )
    self.log.info( "AcceptableFailedFiles = %s" % self.acceptableFailedFiles )
    # read as seconds from the CS, then converted to a timedelta just below
    self.rwUpdatePeriod = gConfig.getValue( self.configSection + "/RssRWUpdatePeriod", 300 )
    self.log.info( "RSSUpdatePeriod = %s s" % self.rwUpdatePeriod )
    self.rwUpdatePeriod = datetime.timedelta( seconds=self.rwUpdatePeriod )
    ## bandwithds
    self.bandwidths = bandwidths if bandwidths else {}
    ## channels
    self.channels = channels if channels else {}
    ## distinct failed files per channel
    self.failedFiles = failedFiles if failedFiles else {}
    ## chosen strategy
    self.chosenStrategy = 0
    ## fts graph
    self.ftsGraph = None
    ## timestamp for last update
    self.lastRssUpdate = datetime.datetime.now()
    # dispatcher: strategy name -> bound method implementing it
    self.strategyDispatcher = { "MinimiseTotalWait" : self.minimiseTotalWait,
                                "DynamicThroughput" : self.dynamicThroughput,
                                "Simple" : self.simple,
                                "Swarm" : self.swarm }
    ## own RSS client
    self.resourceStatus = ResourceStatus()
    ## create fts graph
    ftsGraph = self.setup( self.channels, self.bandwidths, self.failedFiles )
    if not ftsGraph["OK"]:
      raise SHGraphCreationError( ftsGraph["Message"] )
    self.log.info("%s has been constructed" % self.__class__.__name__ )

  def setup( self, channels, bandwithds, failedFiles ):
    """ prepare fts graph

    :param dict channels: { channelID : { "Files" : long , Size = long, "ChannelName" : str,
                            "Source" : str, "Destination" : str , "ChannelName" : str, "Status" : str } }
    :param dict bandwidths: { channelID { "Throughput" : float, "Fileput" : float,
                              "SucessfulFiles" : long, "FailedFiles" : long } }
    :param dict failedFiles: { channelID : int }

    channelInfo { channelName : { "ChannelID" : int, "TimeToStart" : float} }
    """
    graph = FTSGraph( "sites" )

    result = getStorageElementSiteMapping()
    if not result['OK']:
      return result
    sitesDict = result['Value']

    ## create nodes
    for site, ses in sitesDict.items():
      rwDict = self.__getRWAccessForSE( ses )
      if not rwDict["OK"]:
        return rwDict
      # Use the middle token of a dotted site name (e.g. 'LCG.CERN.ch' -> 'CERN')
      siteName = site
      if '.' in site:
        siteName = site.split('.')[1]
      graph.addNode( LCGSite( siteName, { "SEs" : rwDict["Value"] } ) )
    ## channels { channelID : { "Files" : long , Size = long, "ChannelName" : str,
    ##                          "Source" : str, "Destination" : str ,
    ##                          "ChannelName" : str, "Status" : str  } }
    ## bandwidths { channelID { "Throughput" : float, "Fileput" : float,
    ##                          "SucessfulFiles" : long, "FailedFiles" : long  } }
    ## channelInfo { channelName : { "ChannelID" : int, "TimeToStart" : float} }
    for channelID, channelDict in channels.items():
      sourceName = channelDict["Source"]
      destName = channelDict["Destination"]
      fromNode = graph.getNode( sourceName )
      toNode = graph.getNode( destName )
      # Channels whose endpoints are unknown sites are silently skipped
      if fromNode and toNode:
        rwAttrs = { "status" : channels[channelID]["Status"],
                    "files" : channelDict["Files"],
                    "size" : channelDict["Size"],
                    "successfulAttempts" : bandwithds[channelID]["SuccessfulFiles"],
                    "failedAttempts" : bandwithds[channelID]["FailedFiles"],
                    "distinctFailedFiles" : failedFiles.get( channelID, 0 ),
                    "fileput" : bandwithds[channelID]["Fileput"],
                    "throughput" : bandwithds[channelID]["Throughput"] }
        roAttrs = { "channelID" : channelID,
                    "channelName" : channelDict["ChannelName"],
                    "acceptableFailureRate" : self.acceptableFailureRate,
                    "acceptableFailedFiles" : self.acceptableFailedFiles,
                    "schedulingType" : self.schedulingType }
        ftsChannel = FTSChannel( fromNode, toNode, rwAttrs, roAttrs )
        graph.addEdge( ftsChannel )
    self.ftsGraph = graph
    self.lastRssUpdate = datetime.datetime.now()
    return S_OK()

  def updateGraph( self, rwAccess=False, replicationTree=None, size=0.0 ):
    """ update rw access for nodes (sites) and size anf files for edges (channels)

    :param bool rwAccess: when True, refresh RSS read/write flags on all site nodes
    :param dict replicationTree: tree whose channels get their size/files counters bumped
    :param float size: file size added to each channel appearing in the tree
    """
    replicationTree = replicationTree if replicationTree else {}
    size = size if size else 0.0
    ## update nodes rw access for SEs
    if rwAccess:
      for lcgSite in self.ftsGraph.nodes():
        rwDict = self.__getRWAccessForSE( lcgSite.SEs.keys() )
        if not rwDict["OK"]:
          return rwDict
        lcgSite.SEs = rwDict["Value"]
    ## update channels size and files
    if replicationTree:
      for channel in self.ftsGraph.edges():
        if channel.channelID in replicationTree:
          channel.size += size
          channel.files += 1
    return S_OK()

  def simple( self, sourceSEs, targetSEs ):
    """ simple strategy - one source, many targets

    :param list sourceSEs: list with only one sourceSE name
    :param list targetSEs: list with target SE names
    :param str lfn: logical file name
    :param dict metadata: file metadata read from catalogue
    """
    ## make targetSEs list unique
    if len(sourceSEs) != 1:
      return S_ERROR( "simple: wrong argument supplied for sourceSEs, only one sourceSE allowed" )
    sourceSE = sourceSEs[0]
    tree = {}
    for targetSE in targetSEs:
      channel = self.ftsGraph.findChannel( sourceSE, targetSE )
      if not channel["OK"]:
        return S_ERROR( channel["Message"] )
      channel = channel["Value"]
      # Both endpoints must be RSS-allowed for the transfer direction
      if not channel.fromNode.SEs[sourceSE]["read"]:
        return S_ERROR( "simple: sourceSE '%s' in banned for reading rigth now" % sourceSE )
      if not channel.toNode.SEs[targetSE]["write"]:
        return S_ERROR( "simple: targetSE '%s' is banned for writing rigth now" % targetSE )
      if channel.channelID in tree:
        return S_ERROR( "simple: unable to create replication tree, channel '%s' cannot be used twice" %\
                          channel.channelName )
      tree[channel.channelID] = { "Ancestor" : False, "SourceSE" : sourceSE,
                                  "DestSE" : targetSE, "Strategy" : "Simple" }
    return S_OK(tree)

  def swarm( self, sourceSEs, targetSEs ):
    """ swarm strategy - one target, many sources, pick up the fastest

    :param list sourceSEs: list of source SE
    :param str targetSEs: on element list with name of target SE
    :param str lfn: logical file name
    :param dict metadata: file metadata read from catalogue
    """
    tree = {}
    channels = []
    if len(targetSEs) > 1:
      return S_ERROR("swarm: wrong argument supplied for targetSEs, only one targetSE allowed")
    targetSE = targetSEs[0]
    ## find channels
    for sourceSE in sourceSEs:
      channel = self.ftsGraph.findChannel( sourceSE, targetSE )
      if not channel["OK"]:
        self.log.warn( "swarm: %s" % channel["Message"] )
        continue
      channels.append( ( sourceSE, channel["Value"] ) )
    ## exit - no channels
    if not channels:
      return S_ERROR("swarm: unable to find FTS channels between '%s' and '%s'" % ( ",".join(sourceSEs), targetSE ) )
    ## filter out non active channels
    channels = [ ( sourceSE, channel ) for sourceSE, channel in channels
                 if channel.fromNode.SEs[sourceSE]["read"] and channel.toNode.SEs[targetSE]["write"]
                 and channel.status == "Active" and channel.timeToStart < float("inf") ]
    ## exit - no active channels
    if not channels:
      return S_ERROR( "swarm: no active channels found between %s and %s" % ( sourceSEs, targetSE ) )
    ## find min timeToStart
    minTimeToStart = float("inf")
    selSourceSE = selChannel = None
    for sourceSE, ftsChannel in channels:
      if ftsChannel.timeToStart < minTimeToStart:
        minTimeToStart = ftsChannel.timeToStart
        selSourceSE = sourceSE
        selChannel = ftsChannel
    if not selSourceSE:
      return S_ERROR( "swarm: no active channels found between %s and %s" % ( sourceSEs, targetSE ) )
    tree[selChannel.channelID] = { "Ancestor" : False, "SourceSE" : selSourceSE,
                                   "DestSE" : targetSE, "Strategy" : "Swarm" }
    return S_OK( tree )

  def minimiseTotalWait( self, sourceSEs, targetSEs ):
    """ find dag that minimises start time

    NOTE(review): this method mutates the caller's sourceSEs/targetSEs lists
    in place (append/remove at the bottom of the loop) — confirm callers expect that.

    :param list sourceSEs: list of avialable source SEs
    :param list targetSEs: list of target SEs
    :param str lfn: logical file name
    :param dict metadata: file metadata read from catalogue
    """
    tree = {}
    primarySources = sourceSEs
    # Each pass schedules one target; scheduled targets become new sources
    while targetSEs:
      minTimeToStart = float("inf")
      channels = []
      for targetSE in targetSEs:
        for sourceSE in sourceSEs:
          ftsChannel = self.ftsGraph.findChannel( sourceSE, targetSE )
          if not ftsChannel["OK"]:
            self.log.warn( "minimiseTotalWait: %s" % ftsChannel["Message"] )
            continue
          ftsChannel = ftsChannel["Value"]
          channels.append( ( ftsChannel, sourceSE, targetSE ) )
      if not channels:
        msg = "minimiseTotalWait: FTS channels between %s and %s not defined" % ( ",".join(sourceSEs),
                                                                                  ",".join(targetSEs) )
        self.log.error( msg )
        return S_ERROR( msg )
      ## filter out already used channels
      channels = [ (channel, sourceSE, targetSE) for channel, sourceSE, targetSE in channels
                   if channel.channelID not in tree ]
      if not channels:
        msg = "minimiseTotalWait: all FTS channels between %s and %s are already used in tree" % ( ",".join(sourceSEs),
                                                                                                   ",".join(targetSEs) )
        self.log.error( msg )
        return S_ERROR( msg )
      self.log.debug("minimiseTotalWait: found %s candiate channels, checking activity" % len( channels) )
      ## keep only RSS-allowed, active, reachable channels
      channels = [ ( channel, sourceSE, targetSE ) for channel, sourceSE, targetSE in channels
                   if channel.fromNode.SEs[sourceSE]["read"] and channel.toNode.SEs[targetSE]["write"]
                   and channel.status == "Active" and channel.timeToStart < float("inf") ]
      if not channels:
        self.log.error("minimiseTotalWait: no active FTS channels found" )
        return S_ERROR("minimiseTotalWait: no active FTS channels found" )
      candidates = []
      for channel, sourceSE, targetSE in channels:
        timeToStart = channel.timeToStart
        # Penalise hops from non-primary (already replicated) sources
        if sourceSE not in primarySources:
          timeToStart += self.sigma
        ## local found - always preferred, stop searching
        if channel.fromNode == channel.toNode:
          self.log.debug("minimiseTotalWait: found local channel '%s'" % channel.channelName )
          candidates = [ ( channel, sourceSE, targetSE ) ]
          break
        if timeToStart <= minTimeToStart:
          minTimeToStart = timeToStart
          candidates = [ ( channel, sourceSE, targetSE ) ]
        # NOTE(review): this elif is unreachable — equality already satisfies the
        # '<=' branch above, so ties never accumulate; confirm '<' was intended there.
        elif timeToStart == minTimeToStart:
          candidates.append( (channel, sourceSE, targetSE ) )
      if not candidates:
        return S_ERROR("minimiseTotalWait: unable to find candidate FTS channels minimising total wait time")
      random.shuffle( candidates )
      selChannel, selSourceSE, selTargetSE = candidates[0]
      ancestor = False
      for channelID, treeItem in tree.items():
        if selSourceSE in treeItem["DestSE"]:
          ancestor = channelID
      tree[selChannel.channelID] = { "Ancestor" : ancestor, "SourceSE" : selSourceSE,
                                     "DestSE" : selTargetSE, "Strategy" : "MinimiseTotalWait" }
      sourceSEs.append( selTargetSE )
      targetSEs.remove( selTargetSE )
    return S_OK(tree)

  def dynamicThroughput( self, sourceSEs, targetSEs ):
    """ dynamic throughput - many sources, many targets - find dag that minimises overall throughput

    NOTE(review): mutates the caller's sourceSEs/targetSEs lists in place, like
    minimiseTotalWait — confirm callers expect that.

    :param list sourceSEs: list of available source SE names
    :param list targetSE: list of target SE names
    :param str lfn: logical file name
    :param dict metadata: file metadata read from catalogue
    """
    tree = {}
    primarySources = sourceSEs
    # accumulated start time per already-scheduled site
    timeToSite = {}
    while targetSEs:
      minTimeToStart = float("inf")
      channels = []
      for targetSE in targetSEs:
        for sourceSE in sourceSEs:
          ftsChannel = self.ftsGraph.findChannel( sourceSE, targetSE )
          if not ftsChannel["OK"]:
            self.log.warn( "dynamicThroughput: %s" % ftsChannel["Message"] )
            continue
          ftsChannel = ftsChannel["Value"]
          channels.append( ( ftsChannel, sourceSE, targetSE ) )
      ## no candidate channels found
      if not channels:
        msg = "dynamicThroughput: FTS channels between %s and %s are not defined" % ( ",".join(sourceSEs),
                                                                                      ",".join(targetSEs) )
        self.log.error( msg )
        return S_ERROR( msg )
      ## filter out already used channels
      channels = [ (channel, sourceSE, targetSE) for channel, sourceSE, targetSE in channels
                   if channel.channelID not in tree ]
      if not channels:
        msg = "dynamicThroughput: all FTS channels between %s and %s are already used in tree" % ( ",".join(sourceSEs),
                                                                                                   ",".join(targetSEs) )
        self.log.error( msg )
        return S_ERROR( msg )
      ## filter out non-active channels
      self.log.debug("dynamicThroughput: found %s candidate channels, checking activity" % len(channels) )
      channels = [ ( channel, sourceSE, targetSE ) for channel, sourceSE, targetSE in channels
                   if channel.fromNode.SEs[sourceSE]["read"] and channel.toNode.SEs[targetSE]["write"]
                   and channel.status == "Active" and channel.timeToStart < float("inf") ]
      if not channels:
        self.log.info("dynamicThroughput: active candidate channels not found")
        return S_ERROR("dynamicThroughput: no active candidate FTS channels")
      candidates = []
      selTimeToStart = None
      for channel, sourceSE, targetSE in channels:
        timeToStart = channel.timeToStart
        # Penalise non-primary sources and add the waiting time already
        # accumulated at the source site
        if sourceSE not in primarySources:
          timeToStart += self.sigma
        if sourceSE in timeToSite:
          timeToStart += timeToSite[sourceSE]
        ## local found - always preferred, stop searching
        if channel.fromNode == channel.toNode:
          self.log.debug("dynamicThroughput: found local channel '%s'" % channel.channelName )
          candidates = [ ( channel, sourceSE, targetSE ) ]
          selTimeToStart = timeToStart
          break
        if timeToStart <= minTimeToStart:
          selTimeToStart = timeToStart
          minTimeToStart = timeToStart
          candidates = [ ( channel, sourceSE, targetSE ) ]
        # NOTE(review): unreachable for the same reason as in minimiseTotalWait —
        # '<= minTimeToStart' above already covers equality; confirm intent.
        elif timeToStart == minTimeToStart:
          candidates.append( (channel, sourceSE, targetSE ) )
      if not candidates:
        return S_ERROR("dynamicThroughput: unable to find candidate FTS channels")
      random.shuffle( candidates )
      selChannel, selSourceSE, selTargetSE = candidates[0]
      ancestor = False
      for channelID, treeItem in tree.items():
        if selSourceSE in treeItem["DestSE"]:
          ancestor = channelID
      tree[selChannel.channelID] = { "Ancestor" : ancestor, "SourceSE" : selSourceSE,
                                     "DestSE" : selTargetSE, "Strategy" : "DynamicThroughput" }
      timeToSite[selTargetSE] = selTimeToStart
      sourceSEs.append( selTargetSE )
      targetSEs.remove( selTargetSE )
    return S_OK( tree )

  def reset( self ):
    """ reset :chosenStrategy:

    :param self: self reference
    """
    self.chosenStrategy = 0

  def getSupportedStrategies( self ):
    """ Get supported strategies.

    :param self: self reference
    """
    return self.supportedStrategies

  def replicationTree( self, sourceSEs, targetSEs, size, strategy=None ):
    """ get replication tree

    :param str lfn: LFN
    :param list sourceSEs: list of sources SE names to use
    :param list targetSEs: liost of target SE names to use
    :param long size: file size
    :param str strategy: strategy name
    """
    ## update SEs rwAccess every rwUpdatePertion timedelta (default 300 s)
    now = datetime.datetime.now()
    if now - self.lastRssUpdate > self.rwUpdatePeriod:
      update = self.updateGraph( rwAccess=True )
      if not update["OK"]:
        self.log.warn("replicationTree: unable to update FTS graph: %s" % update["Message"] )
      else:
        self.lastRssUpdate = now
    ## get strategy
    strategy = strategy if strategy else self.__selectStrategy()
    if strategy not in self.getSupportedStrategies():
      return S_ERROR("replicationTree: unsupported strategy '%s'" % strategy )

    self.log.info( "replicationTree: strategy=%s sourceSEs=%s targetSEs=%s size=%s" %\
                     ( strategy, sourceSEs, targetSEs, size ) )
    ## fire action from dispatcher
    tree = self.strategyDispatcher[strategy]( sourceSEs, targetSEs )
    if not tree["OK"]:
      self.log.error( "replicationTree: %s" % tree["Message"] )
      return tree
    ## update graph edges with the size/files of the tree just built
    update = self.updateGraph( replicationTree=tree["Value"], size=size )
    if not update["OK"]:
      self.log.error( "replicationTree: unable to update FTS graph: %s" % update["Message"] )
      return update
    return tree

  def __selectStrategy( self ):
    """ If more than one active strategy use one after the other.

    :param self: self reference
    """
    # round-robin over activeStrategies
    chosenStrategy = self.activeStrategies[self.chosenStrategy]
    self.chosenStrategy += 1
    if self.chosenStrategy == self.numberOfStrategies:
      self.chosenStrategy = 0
    return chosenStrategy

  def __getRWAccessForSE( self, seList ):
    """ get RSS R/W for :seList:

    :param list seList: SE list
    :return: S_OK( { se : { "read" : bool, "write" : bool } } ) on success
    """
    rwDict = dict.fromkeys( seList )
    for se in rwDict:
      rwDict[se] = { "read" : False, "write" : False }

    rAccess = self.resourceStatus.getStorageElementStatus( seList, statusType = "ReadAccess", default = 'Unknown' )
    if not rAccess["OK"]:
      # NOTE(review): returns the bare message string instead of S_ERROR(...);
      # callers do rwDict["OK"] on the result and would raise TypeError — confirm and fix.
      return rAccess["Message"]
    # reduce to the list of SEs that are readable per RSS
    rAccess = [ k for k, v in rAccess["Value"].items() if "ReadAccess" in v and v["ReadAccess"] in ( "Active",
                                                                                                    "Degraded" ) ]
    wAccess = self.resourceStatus.getStorageElementStatus( seList, statusType = "WriteAccess", default = 'Unknown' )
    if not wAccess["OK"]:
      # NOTE(review): same bare-string return issue as above — confirm and fix.
      return wAccess["Message"]
    # reduce to the list of SEs that are writable per RSS
    wAccess = [ k for k, v in wAccess["Value"].items() if "WriteAccess" in v and v["WriteAccess"] in ( "Active",
                                                                                                      "Degraded" ) ]
    for se in rwDict:
      rwDict[se]["read"] = se in rAccess
      rwDict[se]["write"] = se in wAccess
    return S_OK( rwDict )
result = getVOfromProxyGroup() if not result['OK']: gLogger.notice('Error:', result['Message']) DIRAC.exit(1) vo = result['Value'] resources = Resources(vo=vo) result = resources.getEligibleStorageElements() if not result['OK']: gLogger.notice('Error:', result['Message']) DIRAC.exit(2) seList = sortList(result['Value']) resourceStatus = ResourceStatus() result = resourceStatus.getStorageElementStatus(seList) if not result['OK']: gLogger.notice('Error:', result['Message']) DIRAC.exit(3) for k, v in result['Value'].items(): readState, writeState = 'Active', 'Active' if v.has_key('ReadAccess'): readState = v['ReadAccess'] if v.has_key('WriteAccess'): writeState = v['WriteAccess'] gLogger.notice( "%s %s %s" %
gLogger.error('The provided site (%s) is not known.' % site) DIRAC.exit(-1) ses.extend(res['Value']['SE'].replace(' ', '').split(',')) if not ses: gLogger.error('There were no SEs provided') DIRAC.exit(-1) readBanned = [] writeBanned = [] checkBanned = [] removeBanned = [] resourceStatus = ResourceStatus() res = resourceStatus.getStorageElementStatus(ses) if not res['OK']: gLogger.error("Storage Element %s does not exist" % ses) DIRAC.exit(-1) reason = 'Forced with dirac-admin-ban-se by %s' % userName for se, seOptions in res['Value'].items(): resW = resC = resR = {'OK': False} # Eventually, we will get rid of the notion of InActive, as we always write Banned. if read and seOptions.has_key('ReadAccess'): if not seOptions['ReadAccess'] in ['Active', 'Degraded', 'Probing']: gLogger.notice('Read option for %s is %s, instead of %s' %
class FTS3Placement(FTSAbstractPlacement):
    """ This class manages all the FTS strategies, routes and what not """

    # Policy used to pick the next FTS3 server: 'Random', 'Sequence' or 'Failover'
    __serverPolicy = "Random"
    # Index of the next server to return when using the 'Sequence' policy
    __nextServerID = 0
    # List of FTS3 server URLs fetched from the CS at construction time
    __serverList = None
    # NOTE(review): this class attribute is never read; __chooseFTS3Server uses
    # self.maxAttempts (set in __init__) instead — confirm before removing
    __maxAttempts = 0

    def __init__(self, csPath=None, ftsHistoryViews=None):
        """
            Call the init of the parent, and initialize the list of FTS3 servers
        """

        self.log = gLogger.getSubLogger("FTS3Placement")
        super(FTS3Placement, self).__init__(csPath=csPath, ftsHistoryViews=ftsHistoryViews)
        srvList = getFTS3Servers()
        if not srvList['OK']:
            self.log.error(srvList['Message'])

        # On error we deliberately fall through with an empty server list:
        # every subsequent findRoute() will then fail with "max attempt reached"
        self.__serverList = srvList.get('Value', [])
        self.maxAttempts = len(self.__serverList)

        self.rssClient = ResourceStatus()

    def getReplicationTree(self, sourceSEs, targetSEs, size, strategy=None):
        """ For multiple source to multiple destination, find the optimal replication
            strategy.

           :param sourceSEs : list of source SE
           :param targetSEs : list of destination SE
           :param size : size of the File
           :param strategy : which strategy to use

           :returns S_OK(dict) < route name : { dict with key Ancestor, SourceSE, TargetSE, Strategy } >

           For the time being, we are waiting for FTS3 to provide advisory mechanisms.
           So we just use simple techniques
        """

        # We will use a single random source
        sourceSE = random.choice(sourceSEs)

        tree = {}

        # One independent route per target, all fanning out from the same source
        for targetSE in targetSEs:
            tree["%s#%s" % (sourceSE, targetSE)] = {
                "Ancestor": False,
                "SourceSE": sourceSE,
                "TargetSE": targetSE,
                "Strategy": "FTS3Simple"
            }

        return S_OK(tree)

    def refresh(self, ftsHistoryViews):
        """ Refresh, whatever that means... recalculate all what you need,
            fetches the latest conf and what not.
        """
        # No FTS3-specific state to rebuild: delegate entirely to the parent
        return super(FTS3Placement, self).refresh(ftsHistoryViews=ftsHistoryViews)

    def __failoverServerPolicy(self, attempt=0):
        """
           Returns always the server at a given position (normally the first one)

           :param attempt: position of the server in the list
        """
        if attempt >= len(self.__serverList):
            raise Exception(
                "FTS3Placement.__failoverServerPolicy: attempt to reach non existing server index"
            )

        return self.__serverList[attempt]

    def __sequenceServerPolicy(self):
        """
           Every time the this policy is called, return the next server on the list
        """

        fts3server = self.__serverList[self.__nextServerID]
        # Round-robin: wrap around at the end of the list
        self.__nextServerID = (self.__nextServerID + 1) % len(
            self.__serverList)
        return fts3server

    def __randomServerPolicy(self):
        """
          return a random server from the list
        """
        return random.choice(self.__serverList)

    def __chooseFTS3Server(self):
        """
            Choose the appropriate FTS3 server depending on the policy

            :returns S_OK(serverUrl) or S_ERROR when no server could be selected
        """

        fts3Server = None
        attempt = 0

        # FIXME : need to get real valeu from RSS
        ftsServerStatus = True

        # Retry until a server in good shape is found or all attempts are used up
        while not fts3Server and attempt < self.maxAttempts:
            if self.__serverPolicy == 'Random':
                fts3Server = self.__randomServerPolicy()
            elif self.__serverPolicy == 'Sequence':
                fts3Server = self.__sequenceServerPolicy()
            elif self.__serverPolicy == 'Failover':
                fts3Server = self.__failoverServerPolicy(attempt=attempt)
            else:
                self.log.error(
                    'Unknown server policy %s. Using Random instead' % self.__serverPolicy)
                fts3Server = self.__randomServerPolicy()

            # ftsServerStatus is hard-wired True until RSS provides real status,
            # so this branch is currently dead by construction
            if not ftsServerStatus:
                self.log.warn(
                    'FTS server %s is not in good shape. Choose another one' % fts3Server)
                fts3Server = None
            attempt += 1

            # FIXME : I need to get the FTS server status from RSS
            # ftsStatusFromRss = rss.ftsStatusOrSomethingLikeThat

        if fts3Server:
            return S_OK(fts3Server)

        return S_ERROR("Could not find an FTS3 server (max attempt reached)")

    def findRoute(self, sourceSE, targetSE):
        """ Find the appropriate route from point A to B

          :param sourceSE : source SE
          :param targetSE : destination SE

          :returns S_OK(FTSRoute)
        """

        fts3server = self.__chooseFTS3Server()

        if not fts3server['OK']:
            return fts3server

        fts3server = fts3server['Value']

        route = FTSRoute(sourceSE, targetSE, fts3server)

        return S_OK(route)

    def isRouteValid(self, route):
        """
            FIXME: until RSS is ready, I check manually the status
            In FTS3, all routes are valid a priori.
            If a route was not valid for some reason, then FTS would know it
            thanks to the blacklist sent by RSS, and would deal with it itself.

           :param route : FTSRoute

           :returns S_OK or S_ERROR(reason)
        """

        # Source must be readable (Active or Degraded in RSS)
        rAccess = self.rssClient.getStorageElementStatus(
            route.sourceSE, "ReadAccess")
        self.log.debug("se read %s %s" % (route.sourceSE, rAccess))
        if not rAccess["OK"]:
            self.log.error(rAccess["Message"])
            return rAccess

        if rAccess["Value"][route.sourceSE]["ReadAccess"] not in ("Active",
                                                                  "Degraded"):
            return S_ERROR("Source SE is not readable")

        # Target must be writable (Active or Degraded in RSS)
        wAccess = self.rssClient.getStorageElementStatus(
            route.targetSE, "WriteAccess")
        self.log.debug("se write %s %s" % (route.targetSE, wAccess))
        if not wAccess["OK"]:
            self.log.error(wAccess["Message"])
            return wAccess
        if wAccess["Value"][route.targetSE]["WriteAccess"] not in ("Active",
                                                                   "Degraded"):
            return S_ERROR("Target SE is not writable")

        return S_OK()
class StrategyHandler(object):
    """
    .. class:: StrategyHandler

    StrategyHandler is a helper class for determining optimal replication tree
    for given source files, their replicas and target storage elements.
    """

    def __init__(self, configSection, channels=None, bandwidths=None, failedFiles=None):
        """c'tor

        :param self: self reference
        :param str configSection: path on CS to ReplicationScheduler agent
        :param bandwidths: observed throughput on active channels
        :param channels: active channels
        :param int failedFiles: max number of distinct failed files to allow scheduling
        """
        ## save config section
        self.configSection = configSection + "/" + self.__class__.__name__
        ## sublogger
        self.log = gLogger.getSubLogger("StrategyHandler", child=True)
        self.log.setLevel(
            gConfig.getValue(self.configSection + "/LogLevel", "DEBUG"))

        self.supportedStrategies = [
            'Simple', 'DynamicThroughput', 'Swarm', 'MinimiseTotalWait'
        ]
        self.log.info("Supported strategies = %s" %
                      ", ".join(self.supportedStrategies))

        ## extra cost (seconds) added to hops whose source is not a primary source
        self.sigma = gConfig.getValue(self.configSection + '/HopSigma', 0.0)
        self.log.info("HopSigma = %s" % self.sigma)
        self.schedulingType = gConfig.getValue(
            self.configSection + '/SchedulingType', 'File')
        self.log.info("SchedulingType = %s" % self.schedulingType)
        self.activeStrategies = gConfig.getValue(
            self.configSection + '/ActiveStrategies', ['MinimiseTotalWait'])
        self.log.info("ActiveStrategies = %s" %
                      ", ".join(self.activeStrategies))
        self.numberOfStrategies = len(self.activeStrategies)
        self.log.info("Number of active strategies = %s" %
                      self.numberOfStrategies)
        self.acceptableFailureRate = gConfig.getValue(
            self.configSection + '/AcceptableFailureRate', 75)
        self.log.info("AcceptableFailureRate = %s" %
                      self.acceptableFailureRate)
        self.acceptableFailedFiles = gConfig.getValue(
            self.configSection + "/AcceptableFailedFiles", 5)
        self.log.info("AcceptableFailedFiles = %s" %
                      self.acceptableFailedFiles)
        self.rwUpdatePeriod = gConfig.getValue(
            self.configSection + "/RssRWUpdatePeriod", 600)
        self.log.info("RSSUpdatePeriod = %s s" % self.rwUpdatePeriod)
        self.rwUpdatePeriod = datetime.timedelta(seconds=self.rwUpdatePeriod)
        ## bandwidths
        self.bandwidths = bandwidths if bandwidths else {}
        ## channels
        self.channels = channels if channels else {}
        ## distinct failed files per channel
        self.failedFiles = failedFiles if failedFiles else {}
        ## chosen strategy
        self.chosenStrategy = 0
        ## fts graph
        self.ftsGraph = None
        ## timestamp for last update
        self.lastRssUpdate = datetime.datetime.now()
        # dispatcher
        self.strategyDispatcher = {
            "MinimiseTotalWait": self.minimiseTotalWait,
            "DynamicThroughput": self.dynamicThroughput,
            "Simple": self.simple,
            "Swarm": self.swarm
        }
        ## own RSS client
        self.resourceStatus = ResourceStatus()
        ## create fts graph
        ftsGraph = self.setup(self.channels, self.bandwidths, self.failedFiles)
        if not ftsGraph["OK"]:
            raise SHGraphCreationError(ftsGraph["Message"])
        self.log.info("%s has been constructed" % self.__class__.__name__)

    def setup(self, channels, bandwithds, failedFiles):
        """ prepare fts graph

        :param dict channels: { channelID : { "Files" : long, "Size" : long, "ChannelName" : str,
                                              "Source" : str, "Destination" : str, "Status" : str } }
        :param dict bandwithds: { channelID : { "Throughput" : float, "Fileput" : float,
                                                "SuccessfulFiles" : long, "FailedFiles" : long } }
        :param dict failedFiles: { channelID : int }

        channelInfo { channelName : { "ChannelID" : int, "TimeToStart" : float} }
        """
        graph = FTSGraph("sites")

        result = getStorageElementSiteMapping()
        if not result['OK']:
            return result
        sitesDict = result['Value']

        ## create nodes, one per site, holding the R/W status of its SEs
        for site, ses in sitesDict.items():
            rwDict = self.__getRWAccessForSE(ses)
            if not rwDict["OK"]:
                return rwDict
            ## strip the grid prefix from dotted site names (e.g. LCG.CERN.ch -> CERN)
            siteName = site
            if '.' in site:
                siteName = site.split('.')[1]
            graph.addNode(LCGSite(siteName, {"SEs": rwDict["Value"]}))

        ## create edges, one per channel whose both endpoints are known sites
        for channelID, channelDict in channels.items():
            sourceName = channelDict["Source"]
            destName = channelDict["Destination"]
            fromNode = graph.getNode(sourceName)
            toNode = graph.getNode(destName)
            if fromNode and toNode:
                rwAttrs = {
                    "status": channels[channelID]["Status"],
                    "files": channelDict["Files"],
                    "size": channelDict["Size"],
                    "successfulAttempts": bandwithds[channelID]["SuccessfulFiles"],
                    "failedAttempts": bandwithds[channelID]["FailedFiles"],
                    "distinctFailedFiles": failedFiles.get(channelID, 0),
                    "fileput": bandwithds[channelID]["Fileput"],
                    "throughput": bandwithds[channelID]["Throughput"]
                }
                roAttrs = {
                    "channelID": channelID,
                    "channelName": channelDict["ChannelName"],
                    "acceptableFailureRate": self.acceptableFailureRate,
                    "acceptableFailedFiles": self.acceptableFailedFiles,
                    "schedulingType": self.schedulingType
                }
                ftsChannel = FTSChannel(fromNode, toNode, rwAttrs, roAttrs)
                graph.addEdge(ftsChannel)
        self.ftsGraph = graph
        self.lastRssUpdate = datetime.datetime.now()
        return S_OK()

    def updateGraph(self, rwAccess=False, replicationTree=None, size=0.0):
        """ update rw access for nodes (sites) and size and files for edges (channels)

        :param bool rwAccess: when True, re-query RSS for every site's SE R/W status
        :param dict replicationTree: tree returned by a strategy; its channels get
                                     their queued size/files counters bumped
        :param float size: file size to add to each channel used in the tree
        """
        replicationTree = replicationTree if replicationTree else {}
        size = size if size else 0.0
        ## update nodes rw access for SEs
        if rwAccess:
            for lcgSite in self.ftsGraph.nodes():
                rwDict = self.__getRWAccessForSE(lcgSite.SEs.keys())
                if not rwDict["OK"]:
                    return rwDict
                lcgSite.SEs = rwDict["Value"]
        ## update channels size and files
        if replicationTree:
            for channel in self.ftsGraph.edges():
                if channel.channelID in replicationTree:
                    channel.size += size
                    channel.files += 1
        return S_OK()

    def simple(self, sourceSEs, targetSEs):
        """ simple strategy - one source, many targets

        :param list sourceSEs: list with only one sourceSE name
        :param list targetSEs: list with target SE names
        """
        ## exactly one source is required
        if len(sourceSEs) != 1:
            return S_ERROR(
                "simple: wrong argument supplied for sourceSEs, only one sourceSE allowed"
            )
        sourceSE = sourceSEs[0]
        tree = {}
        for targetSE in targetSEs:
            channel = self.ftsGraph.findChannel(sourceSE, targetSE)
            if not channel["OK"]:
                return S_ERROR(channel["Message"])
            channel = channel["Value"]
            if not channel.fromNode.SEs[sourceSE]["read"]:
                return S_ERROR(
                    "simple: sourceSE '%s' in banned for reading rigth now" % sourceSE)
            if not channel.toNode.SEs[targetSE]["write"]:
                return S_ERROR(
                    "simple: targetSE '%s' is banned for writing rigth now" % targetSE)
            ## each channel may carry at most one transfer in the tree
            if channel.channelID in tree:
                return S_ERROR(
                    "simple: unable to create replication tree, channel '%s' cannot be used twice" %\
                        channel.channelName )
            tree[channel.channelID] = {
                "Ancestor": False,
                "SourceSE": sourceSE,
                "DestSE": targetSE,
                "Strategy": "Simple"
            }
        return S_OK(tree)

    def swarm(self, sourceSEs, targetSEs):
        """ swarm strategy - one target, many sources, pick up the fastest

        :param list sourceSEs: list of source SE
        :param list targetSEs: one element list with name of target SE
        """
        tree = {}
        channels = []
        if len(targetSEs) > 1:
            return S_ERROR(
                "swarm: wrong argument supplied for targetSEs, only one targetSE allowed"
            )
        targetSE = targetSEs[0]
        ## find channels
        for sourceSE in sourceSEs:
            channel = self.ftsGraph.findChannel(sourceSE, targetSE)
            if not channel["OK"]:
                self.log.warn("swarm: %s" % channel["Message"])
                continue
            channels.append((sourceSE, channel["Value"]))
        ## exit - no channels
        if not channels:
            return S_ERROR(
                "swarm: unable to find FTS channels between '%s' and '%s'" %
                (",".join(sourceSEs), targetSE))
        ## filter out non active channels
        channels = [
            (sourceSE, channel) for sourceSE, channel in channels
            if channel.fromNode.SEs[sourceSE]["read"]
            and channel.toNode.SEs[targetSE]["write"]
            and channel.status == "Active"
            and channel.timeToStart < float("inf")
        ]
        ## exit - no active channels
        if not channels:
            return S_ERROR("swarm: no active channels found between %s and %s" %
                           (sourceSEs, targetSE))
        ## find min timeToStart
        minTimeToStart = float("inf")
        selSourceSE = selChannel = None
        for sourceSE, ftsChannel in channels:
            if ftsChannel.timeToStart < minTimeToStart:
                minTimeToStart = ftsChannel.timeToStart
                selSourceSE = sourceSE
                selChannel = ftsChannel
        if not selSourceSE:
            return S_ERROR("swarm: no active channels found between %s and %s" %
                           (sourceSEs, targetSE))
        tree[selChannel.channelID] = {
            "Ancestor": False,
            "SourceSE": selSourceSE,
            "DestSE": targetSE,
            "Strategy": "Swarm"
        }
        return S_OK(tree)

    def minimiseTotalWait(self, sourceSEs, targetSEs):
        """ find dag that minimises start time

        :param list sourceSEs: list of available source SEs (grows as targets are scheduled)
        :param list targetSEs: list of target SEs (shrinks as targets are scheduled)
        """
        tree = {}
        primarySources = sourceSEs
        while targetSEs:
            minTimeToStart = float("inf")
            channels = []
            for targetSE in targetSEs:
                for sourceSE in sourceSEs:
                    ftsChannel = self.ftsGraph.findChannel(sourceSE, targetSE)
                    if not ftsChannel["OK"]:
                        self.log.warn("minimiseTotalWait: %s" %
                                      ftsChannel["Message"])
                        continue
                    ftsChannel = ftsChannel["Value"]
                    channels.append((ftsChannel, sourceSE, targetSE))
            if not channels:
                msg = "minimiseTotalWait: FTS channels between %s and %s not defined" % (
                    ",".join(sourceSEs), ",".join(targetSEs))
                self.log.error(msg)
                return S_ERROR(msg)
            ## filter out already used channels
            channels = [(channel, sourceSE, targetSE)
                        for channel, sourceSE, targetSE in channels
                        if channel.channelID not in tree]
            if not channels:
                msg = "minimiseTotalWait: all FTS channels between %s and %s are already used in tree" % (
                    ",".join(sourceSEs), ",".join(targetSEs))
                self.log.error(msg)
                return S_ERROR(msg)
            self.log.debug(
                "minimiseTotalWait: found %s candiate channels, checking activity"
                % len(channels))
            ## keep only channels with readable source, writable target, Active status
            channels = [
                (channel, sourceSE, targetSE)
                for channel, sourceSE, targetSE in channels
                if channel.fromNode.SEs[sourceSE]["read"]
                and channel.toNode.SEs[targetSE]["write"]
                and channel.status == "Active"
                and channel.timeToStart < float("inf")
            ]
            if not channels:
                self.log.error(
                    "minimiseTotalWait: no active FTS channels found")
                return S_ERROR(
                    "minimiseTotalWait: no active FTS channels found")
            candidates = []
            for channel, sourceSE, targetSE in channels:
                timeToStart = channel.timeToStart
                ## penalise hops that do not start from a primary source
                if sourceSE not in primarySources:
                    timeToStart += self.sigma
                ## local found - short-circuit, nothing can beat an intra-site copy
                if channel.fromNode == channel.toNode:
                    self.log.debug(
                        "minimiseTotalWait: found local channel '%s'" %
                        channel.channelName)
                    candidates = [(channel, sourceSE, targetSE)]
                    break
                # strict '<' so the tie-collecting branch below is reachable;
                # with the previous '<=' the 'elif ... ==' could never fire and
                # equally good channels were replaced instead of pooled
                if timeToStart < minTimeToStart:
                    minTimeToStart = timeToStart
                    candidates = [(channel, sourceSE, targetSE)]
                elif timeToStart == minTimeToStart:
                    candidates.append((channel, sourceSE, targetSE))
            if not candidates:
                return S_ERROR(
                    "minimiseTotalWait: unable to find candidate FTS channels minimising total wait time"
                )
            ## break ties randomly among equally good channels
            random.shuffle(candidates)
            selChannel, selSourceSE, selTargetSE = candidates[0]
            ancestor = False
            for channelID, treeItem in tree.items():
                # exact SE-name match: a substring test ('in') would wrongly make
                # e.g. 'CERN' an ancestor of 'CERN-RAW'
                if selSourceSE == treeItem["DestSE"]:
                    ancestor = channelID
            tree[selChannel.channelID] = {
                "Ancestor": ancestor,
                "SourceSE": selSourceSE,
                "DestSE": selTargetSE,
                "Strategy": "MinimiseTotalWait"
            }
            ## a freshly served target becomes a possible source for the next hop
            sourceSEs.append(selTargetSE)
            targetSEs.remove(selTargetSE)
        return S_OK(tree)

    def dynamicThroughput(self, sourceSEs, targetSEs):
        """ dynamic throughput - many sources, many targets - find dag that minimises
        overall throughput

        :param list sourceSEs: list of available source SE names
        :param list targetSEs: list of target SE names
        """
        tree = {}
        primarySources = sourceSEs
        ## cumulative time at which each already-scheduled site gets its copy
        timeToSite = {}
        while targetSEs:
            minTimeToStart = float("inf")
            channels = []
            for targetSE in targetSEs:
                for sourceSE in sourceSEs:
                    ftsChannel = self.ftsGraph.findChannel(sourceSE, targetSE)
                    if not ftsChannel["OK"]:
                        self.log.warn("dynamicThroughput: %s" %
                                      ftsChannel["Message"])
                        continue
                    ftsChannel = ftsChannel["Value"]
                    channels.append((ftsChannel, sourceSE, targetSE))
            ## no candidate channels found
            if not channels:
                msg = "dynamicThroughput: FTS channels between %s and %s are not defined" % (
                    ",".join(sourceSEs), ",".join(targetSEs))
                self.log.error(msg)
                return S_ERROR(msg)
            ## filter out already used channels
            channels = [(channel, sourceSE, targetSE)
                        for channel, sourceSE, targetSE in channels
                        if channel.channelID not in tree]
            if not channels:
                msg = "dynamicThroughput: all FTS channels between %s and %s are already used in tree" % (
                    ",".join(sourceSEs), ",".join(targetSEs))
                self.log.error(msg)
                return S_ERROR(msg)
            ## filter out non-active channels
            self.log.debug(
                "dynamicThroughput: found %s candidate channels, checking activity"
                % len(channels))
            channels = [
                (channel, sourceSE, targetSE)
                for channel, sourceSE, targetSE in channels
                if channel.fromNode.SEs[sourceSE]["read"]
                and channel.toNode.SEs[targetSE]["write"]
                and channel.status == "Active"
                and channel.timeToStart < float("inf")
            ]
            if not channels:
                self.log.info(
                    "dynamicThroughput: active candidate channels not found")
                return S_ERROR(
                    "dynamicThroughput: no active candidate FTS channels")
            candidates = []
            selTimeToStart = None
            for channel, sourceSE, targetSE in channels:
                timeToStart = channel.timeToStart
                ## penalise non-primary sources and account for when the source
                ## itself will actually have the file
                if sourceSE not in primarySources:
                    timeToStart += self.sigma
                if sourceSE in timeToSite:
                    timeToStart += timeToSite[sourceSE]
                ## local found - short-circuit
                if channel.fromNode == channel.toNode:
                    self.log.debug(
                        "dynamicThroughput: found local channel '%s'" %
                        channel.channelName)
                    candidates = [(channel, sourceSE, targetSE)]
                    selTimeToStart = timeToStart
                    break
                # strict '<' so the tie-collecting 'elif' below is reachable
                # (was '<=', which made it dead code)
                if timeToStart < minTimeToStart:
                    selTimeToStart = timeToStart
                    minTimeToStart = timeToStart
                    candidates = [(channel, sourceSE, targetSE)]
                elif timeToStart == minTimeToStart:
                    candidates.append((channel, sourceSE, targetSE))
            if not candidates:
                return S_ERROR(
                    "dynamicThroughput: unable to find candidate FTS channels")
            ## break ties randomly among equally good channels
            random.shuffle(candidates)
            selChannel, selSourceSE, selTargetSE = candidates[0]
            ancestor = False
            for channelID, treeItem in tree.items():
                # exact SE-name match (was a substring 'in' test)
                if selSourceSE == treeItem["DestSE"]:
                    ancestor = channelID
            tree[selChannel.channelID] = {
                "Ancestor": ancestor,
                "SourceSE": selSourceSE,
                "DestSE": selTargetSE,
                "Strategy": "DynamicThroughput"
            }
            timeToSite[selTargetSE] = selTimeToStart
            sourceSEs.append(selTargetSE)
            targetSEs.remove(selTargetSE)
        return S_OK(tree)

    def reset(self):
        """ reset :chosenStrategy:

        :param self: self reference
        """
        self.chosenStrategy = 0

    def getSupportedStrategies(self):
        """ Get supported strategies.

        :param self: self reference
        """
        return self.supportedStrategies

    def replicationTree(self, sourceSEs, targetSEs, size, strategy=None):
        """ get replication tree

        :param list sourceSEs: list of sources SE names to use
        :param list targetSEs: list of target SE names to use
        :param long size: file size
        :param str strategy: strategy name
        """
        ## update SEs rwAccess every rwUpdatePeriod timedelta (default 600 s)
        now = datetime.datetime.now()
        if now - self.lastRssUpdate > self.rwUpdatePeriod:
            update = self.updateGraph(rwAccess=True)
            if not update["OK"]:
                self.log.warn(
                    "replicationTree: unable to update FTS graph: %s" %
                    update["Message"])
            else:
                self.lastRssUpdate = now
        ## get strategy
        strategy = strategy if strategy else self.__selectStrategy()
        if strategy not in self.getSupportedStrategies():
            return S_ERROR("replicationTree: unsupported strategy '%s'" %
                           strategy)
        self.log.info(
            "replicationTree: strategy=%s sourceSEs=%s targetSEs=%s size=%s" %\
                ( strategy, sourceSEs, targetSEs, size ) )
        ## fire action from dispatcher
        tree = self.strategyDispatcher[strategy](sourceSEs, targetSEs)
        if not tree["OK"]:
            self.log.error("replicationTree: %s" % tree["Message"])
            return tree
        ## update graph edges
        update = self.updateGraph(replicationTree=tree["Value"], size=size)
        if not update["OK"]:
            self.log.error("replicationTree: unable to update FTS graph: %s" %
                           update["Message"])
            return update
        return tree

    def __selectStrategy(self):
        """ If more than one active strategy use one after the other.

        :param self: self reference
        """
        chosenStrategy = self.activeStrategies[self.chosenStrategy]
        ## round-robin over the active strategies
        self.chosenStrategy += 1
        if self.chosenStrategy == self.numberOfStrategies:
            self.chosenStrategy = 0
        return chosenStrategy

    def __getRWAccessForSE(self, seList):
        """ get RSS R/W for :seList:

        :param list seList: SE list
        :return: S_OK( { se : { "read" : bool, "write" : bool } } ) or an S_ERROR dict
        """
        rwDict = dict.fromkeys(seList)
        for se in rwDict:
            rwDict[se] = {"read": False, "write": False}
        rAccess = self.resourceStatus.getStorageElementStatus(
            seList, statusType="ReadAccess", default='Unknown')
        if not rAccess["OK"]:
            return rAccess
        rAccess = [
            k for k, v in rAccess["Value"].items()
            if "ReadAccess" in v and v["ReadAccess"] in ("Active", "Degraded")
        ]
        wAccess = self.resourceStatus.getStorageElementStatus(
            seList, statusType="WriteAccess", default='Unknown')
        if not wAccess["OK"]:
            return wAccess
        wAccess = [
            k for k, v in wAccess["Value"].items()
            if "WriteAccess" in v and v["WriteAccess"] in ("Active", "Degraded")
        ]
        for se in rwDict:
            rwDict[se]["read"] = se in rAccess
            rwDict[se]["write"] = se in wAccess
        return S_OK(rwDict)
class InputDataAgent( OptimizerModule ):
  """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for JobSanityAgent.
    """
    self.failedMinorStatus = self.am_getOption( '/FailedJobStatus', 'Input Data Not Available' )
    #this will ignore failover SE files
    self.checkFileMetadata = self.am_getOption( 'CheckFileMetadata', True )

    self.dataManager = DataManager()
    self.resourceStatus = ResourceStatus()
    self.fc = FileCatalog()

    # SE -> sites cache, emptied every self.cacheLength seconds (see __getSitesForSE)
    self.seToSiteMapping = {}
    self.lastCScheck = 0
    self.cacheLength = 600

    return S_OK()

  #############################################################################
  def checkJob( self, job, classAdJob ):
    """
    This method does the optimization corresponding to this Agent,
    it is call for each job by the Optimizer framework
    """
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobdB for %s' % ( job ) )
      self.log.warn( result['Message'] )
      return result
    # No input data: nothing to resolve, pass straight to the next optimizer
    if not result['Value']:
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.setNextOptimizer( job )

    #Check if we already executed this Optimizer and the input data is resolved
    res = self.getOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ) )
    if res['OK'] and len( res['Value'] ):
      # already resolved on a previous pass - skip the (expensive) resolution
      pass
    else:
      self.log.verbose( 'Job %s has an input data requirement and will be processed' % ( job ) )
      inputData = result['Value']
      result = self.__resolveInputData( job, inputData )
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result

    return self.setNextOptimizer( job )

  #############################################################################
  def __resolveInputData( self, job, inputData ):
    """This method checks the file catalog for replica information.

       Stores the resolved replica/metadata info and the site candidates as
       optimizer job info, and returns them in an S_OK dict.
    """
    lfns = [ fname.replace( 'LFN:', '' ) for fname in inputData ]

    start = time.time()
    # In order to place jobs on Hold if a certain SE is banned we need first to check first if
    # if the replicas are really available
    replicas = self.dataManager.getActiveReplicas( lfns )
    timing = time.time() - start
    self.log.verbose( 'Catalog Replicas Lookup Time: %.2f seconds ' % ( timing ) )
    if not replicas['OK']:
      self.log.warn( replicas['Message'] )
      return replicas

    replicaDict = replicas['Value']

    siteCandidates = self.__checkReplicas( job, replicaDict )

    if not siteCandidates['OK']:
      self.log.warn( siteCandidates['Message'] )
      return siteCandidates

    if self.checkFileMetadata:
      guids = True
      start = time.time()
      guidDict = self.fc.getFileMetadata( lfns )
      timing = time.time() - start
      self.log.info( 'Catalog Metadata Lookup Time: %.2f seconds ' % ( timing ) )

      if not guidDict['OK']:
        self.log.warn( guidDict['Message'] )
        guids = False

      # NOTE(review): if guidDict is not OK, 'Value' is normally absent from an
      # S_ERROR dict, so this access may raise KeyError - confirm and guard
      failed = guidDict['Value']['Failed']
      if failed:
        self.log.warn( 'Failed to establish some GUIDs' )
        self.log.warn( failed )
        guids = False

      if guids:
        # merge the replica info into the per-LFN metadata records
        for lfn, reps in replicaDict['Successful'].items():
          guidDict['Value']['Successful'][lfn].update( reps )
        replicas = guidDict

    resolvedData = {}
    resolvedData['Value'] = replicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __checkReplicas( self, job, replicaDict ):
    """Check that all input lfns have valid replicas and can all be found at least in one single site.
    """
    badLFNs = []

    if replicaDict.has_key( 'Successful' ):
      for lfn, reps in replicaDict['Successful'].items():
        if not reps:
          badLFNs.append( 'LFN:%s Problem: No replicas available' % ( lfn ) )
    else:
      return S_ERROR( 'No replica Info available' )

    if replicaDict.has_key( 'Failed' ):
      for lfn, cause in replicaDict['Failed'].items():
        badLFNs.append( 'LFN:%s Problem: %s' % ( lfn, cause ) )

    if badLFNs:
      # record the problem list as a job parameter, then fail the check
      self.log.info( 'Found %s problematic LFN(s) for job %s' % ( len( badLFNs ), job ) )
      param = '\n'.join( badLFNs )
      self.log.info( param )
      result = self.setJobParam( job, self.am_getModuleParam( 'optimizerName' ), param )
      if not result['OK']:
        self.log.error( result['Message'] )
      return S_ERROR( 'Input Data Not Available' )

    return self.__getSiteCandidates( replicaDict['Successful'] )

  #############################################################################
  # FIXME: right now this is unused...
  def __checkActiveSEs( self, job, replicaDict ):
    """
    Check active SE and replicas and identify possible Site candidates for
    the execution of the job
    """
    # Now let's check if some replicas might not be available due to banned SE's
    activeReplicas = self.dataManager.checkActiveReplicas( replicaDict )
    if not activeReplicas['OK']:
      # due to banned SE's input data might no be available
      msg = "On Hold: Missing replicas due to banned SE"
      self.log.info( msg )
      self.log.warn( activeReplicas['Message'] )
      return S_ERROR( msg )

    activeReplicaDict = activeReplicas['Value']

    siteCandidates = self.__checkReplicas( job, activeReplicaDict )

    if not siteCandidates['OK']:
      # due to a banned SE's input data is not available at a single site
      msg = "On Hold: Input data not Available due to banned SE"
      self.log.info( msg )
      self.log.warn( siteCandidates['Message'] )
      return S_ERROR( msg )

    resolvedData = {}
    resolvedData['Value'] = activeReplicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __getSitesForSE( self, se ):
    """ Returns a list of sites having the given SE as a local one.
        Uses the local cache of the site-se information
    """
    # Empty the cache if too old
    if ( time.time() - self.lastCScheck ) > self.cacheLength:
      self.log.verbose( 'Resetting the SE to site mapping cache' )
      self.seToSiteMapping = {}
      self.lastCScheck = time.time()

    if se not in self.seToSiteMapping:
      # cache miss: ask the CS and remember the answer (only when it succeeded)
      sites = getSitesForSE( se )
      if sites['OK']:
        self.seToSiteMapping[se] = list( sites['Value'] )
      return sites
    else:
      return S_OK( self.seToSiteMapping[se] )

  #############################################################################
  def __getSiteCandidates( self, inputData ):
    """This method returns a list of possible site candidates based on the
       job input data requirement.

       For each site candidate, the number of files on disk and tape is resolved.
    """
    fileSEs = {}
    # map each LFN to the set of sites holding at least one of its replicas
    for lfn, replicas in inputData.items():
      siteList = []
      for se in replicas.keys():
        sites = self.__getSitesForSE( se )
        if sites['OK']:
          siteList += sites['Value']
      fileSEs[lfn] = uniqueElements( siteList )

    # intersect the per-file site lists: candidates must hold ALL the files
    siteCandidates = []
    i = 0
    for _fileName, sites in fileSEs.items():
      if not i:
        siteCandidates = sites
      else:
        tempSite = []
        for site in siteCandidates:
          if site in sites:
            tempSite.append( site )
        siteCandidates = tempSite
      i += 1

    if not len( siteCandidates ):
      return S_ERROR( 'No candidate sites available' )

    #In addition, check number of files on tape and disk for each site
    #for optimizations during scheduling
    siteResult = {}
    for site in siteCandidates:
      siteResult[site] = { 'disk': [], 'tape': [] }

    seDict = {}
    for lfn, replicas in inputData.items():
      for se in replicas.keys():
        if se not in seDict:
          sites = self.__getSitesForSE( se )
          if not sites['OK']:
            continue
          try:
            #storageElement = StorageElement( se )
            result = self.resourceStatus.getStorageElementStatus( se, statusType = 'ReadAccess' )
            if not result['OK']:
              continue
            seDict[se] = { 'Sites': sites['Value'], 'SEParams': result['Value'][se] }
            result = getStorageElementOptions( se )
            if not result['OK']:
              continue
            seDict[se]['SEParams'].update(result['Value'])
          except Exception:
            self.log.exception( 'Failed to instantiate StorageElement( %s )' % se )
            continue
        # tally this LFN as disk- or tape-resident at each candidate site;
        # a disk replica supersedes a tape one for the same LFN/site
        for site in seDict[se]['Sites']:
          if site in siteCandidates:
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
              if lfn not in siteResult[site]['disk']:
                siteResult[site]['disk'].append( lfn )
                if lfn in siteResult[site]['tape']:
                  siteResult[site]['tape'].remove( lfn )
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
              if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                siteResult[site]['tape'].append( lfn )

    # collapse the LFN lists into simple counts per site
    for site in siteResult:
      siteResult[site]['disk'] = len( siteResult[site]['disk'] )
      siteResult[site]['tape'] = len( siteResult[site]['tape'] )
    return S_OK( siteResult )
class StorageFactory(object):
    """Factory creating Storage plug-in objects for DIRAC Storage Elements.

    Storage parameters are either supplied explicitly (:meth:`getStorage`) or
    resolved from the Configuration Service under ``/Resources/StorageElements``
    (:meth:`getStorages`), including Alias/BaseSE indirection.
    """

    def __init__(self, useProxy=False, vo=None):
        """Constructor.

        :param bool useProxy: if True, every storage object is instantiated as
                              a 'Proxy' storage instead of its native plug-in
        :param str vo: VO name; when None it is resolved from the proxy group
        :raises RuntimeError: if no VO is supplied and none can be resolved
        """
        self.rootConfigPath = '/Resources/StorageElements'
        self.proxy = useProxy
        self.resourceStatus = ResourceStatus()
        self.vo = vo
        if self.vo is None:
            result = getVOfromProxyGroup()
            if result['OK']:
                self.vo = result['Value']
            else:
                # Fix: the exception was previously constructed but never
                # raised, silently leaving self.vo as None.
                raise RuntimeError("Can not get the current VO context")
        # Per-call state, reset at the start of getStorages()
        self.remotePlugins = []
        self.localPlugins = []
        self.name = ''
        self.options = {}
        self.protocolDetails = []
        self.storages = []

    ###########################################################################
    #
    # Below are public methods for obtaining storage objects
    #

    def getStorageName(self, initialName):
        """Resolve a possible SE alias to the real storage name.

        :param str initialName: DIRAC SE name as given by the caller
        :return: S_OK(resolvedName) / S_ERROR
        """
        return self._getConfigStorageName(initialName, 'Alias')

    def getStorage(self, parameterDict, hideExceptions=False):
        """ This instantiates a single storage for the details provided and doesn't check the CS.

        :param dict parameterDict: must contain 'StorageName' and 'PluginName'
                                   (or legacy 'ProtocolName'); the remaining
                                   entries are passed to the plug-in
        :param bool hideExceptions: forwarded to the object loader to silence
                                    import errors
        :return: S_OK(storageObject) / S_ERROR
        """
        # The storage name must be supplied.
        if 'StorageName' in parameterDict:
            storageName = parameterDict['StorageName']
        else:
            errStr = "StorageFactory.getStorage: StorageName must be supplied"
            gLogger.error(errStr)
            return S_ERROR(errStr)

        # PluginName must be supplied otherwise nothing with work.
        if 'PluginName' in parameterDict:
            pluginName = parameterDict['PluginName']
        # Temporary fix for backward compatibility
        elif 'ProtocolName' in parameterDict:
            pluginName = parameterDict['ProtocolName']
        else:
            errStr = "StorageFactory.getStorage: PluginName must be supplied"
            gLogger.error(errStr)
            return S_ERROR(errStr)

        return self.__generateStorageObject(storageName, pluginName, parameterDict,
                                            hideExceptions=hideExceptions)

    def getStorages(self, storageName, pluginList=None, hideExceptions=False):
        """ Get an instance of a Storage based on the DIRAC SE name based on the CS entries CS

        :param str storageName: the DIRAC SE name, i.e. 'CERN-RAW'
        :param pluginList: optional list (or single name) of protocols if a
                           sub-set is desired, i.e. ['SRM2','SRM1']
        :param bool hideExceptions: forwarded to the object loader
        :return: S_OK(dict) with keys StorageName, StorageOptions,
                 StorageObjects, LocalPlugins, RemotePlugins, ProtocolOptions,
                 TurlProtocols / S_ERROR
        """
        self.remotePlugins = []
        self.localPlugins = []
        self.name = ''
        self.options = {}
        self.protocolDetails = []
        self.storages = []
        if pluginList is None:
            pluginList = []
        elif isinstance(pluginList, basestring):
            # Accept a single plugin name for convenience
            pluginList = [pluginList]
        if not self.vo:
            gLogger.warn('No VO information available')

        # Get the name of the storage provided
        res = self._getConfigStorageName(storageName, 'Alias')
        if not res['OK']:
            return res
        storageName = res['Value']
        self.name = storageName

        # In case the storage is made from a base SE, get this information
        res = self._getConfigStorageName(storageName, 'BaseSE')
        if not res['OK']:
            return res
        # If the storage is derived from another one, keep the information
        if res['Value'] != storageName:
            derivedStorageName = storageName
            storageName = res['Value']
        else:
            derivedStorageName = None

        # Get the options defined in the CS for this storage
        res = self._getConfigStorageOptions(storageName, derivedStorageName=derivedStorageName)
        if not res['OK']:
            return res
        self.options = res['Value']

        # Get the protocol specific details
        res = self._getConfigStorageProtocols(storageName, derivedStorageName=derivedStorageName)
        if not res['OK']:
            return res
        self.protocolDetails = res['Value']

        requestedLocalPlugins = []
        requestedRemotePlugins = []
        requestedProtocolDetails = []
        turlProtocols = []
        # Generate the protocol specific plug-ins
        for protocolDict in self.protocolDetails:
            pluginName = protocolDict.get('PluginName')
            if pluginList and pluginName not in pluginList:
                continue
            protocol = protocolDict['Protocol']
            result = self.__generateStorageObject(storageName, pluginName, protocolDict,
                                                  hideExceptions=hideExceptions)
            if result['OK']:
                self.storages.append(result['Value'])
                if pluginName in self.localPlugins:
                    turlProtocols.append(protocol)
                    requestedLocalPlugins.append(pluginName)
                if pluginName in self.remotePlugins:
                    requestedRemotePlugins.append(pluginName)
                requestedProtocolDetails.append(protocolDict)
            else:
                gLogger.info(result['Message'])

        if self.storages:
            resDict = {}
            resDict['StorageName'] = self.name
            resDict['StorageOptions'] = self.options
            resDict['StorageObjects'] = self.storages
            resDict['LocalPlugins'] = requestedLocalPlugins
            resDict['RemotePlugins'] = requestedRemotePlugins
            resDict['ProtocolOptions'] = requestedProtocolDetails
            resDict['TurlProtocols'] = turlProtocols
            return S_OK(resDict)
        else:
            errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols."
            gLogger.error(errStr, self.name)
            return S_ERROR(errStr)

    ###########################################################################
    #
    # Below are internal methods for obtaining section/option/value configuration
    #

    def _getConfigStorageName(self, storageName, referenceType):
        """ This gets the name of the storage the configuration service.
            If the storage is a reference to another SE the resolution is performed.

        :param str storageName: the storage section to check in the CS
        :param str referenceType: 'Alias' or 'BaseSE' - the CS option that may
                                  point at another SE section
        :return: S_OK(resolvedName) / S_ERROR
        """
        configPath = '%s/%s' % (self.rootConfigPath, storageName)
        res = gConfig.getOptions(configPath)
        if not res['OK']:
            errStr = "StorageFactory._getConfigStorageName: Failed to get storage options"
            gLogger.error(errStr, res['Message'])
            return S_ERROR(errStr)
        if not res['Value']:
            errStr = "StorageFactory._getConfigStorageName: Supplied storage doesn't exist."
            gLogger.error(errStr, configPath)
            return S_ERROR(errStr)
        if referenceType in res['Value']:
            # Follow the reference and resolve any further Alias recursively
            configPath = cfgPath(self.rootConfigPath, storageName, referenceType)
            referenceName = gConfig.getValue(configPath)
            result = self._getConfigStorageName(referenceName, 'Alias')
            if not result['OK']:
                return result
            resolvedName = result['Value']
        else:
            resolvedName = storageName
        return S_OK(resolvedName)

    def _getConfigStorageOptions(self, storageName, derivedStorageName=None):
        """ Get the options associated to the StorageElement as defined in the CS

        :param str storageName: base SE section name
        :param str derivedStorageName: derived SE whose options overwrite the
                                       base ones (may be None)
        :return: S_OK(optionsDict) / S_ERROR
        """
        optionsDict = {}
        # We first get the options of the baseSE, and then overwrite with the derivedSE
        for seName in (storageName, derivedStorageName) if derivedStorageName else (storageName, ):
            storageConfigPath = cfgPath(self.rootConfigPath, seName)
            res = gConfig.getOptions(storageConfigPath)
            if not res['OK']:
                errStr = "StorageFactory._getStorageOptions: Failed to get storage options."
                gLogger.error(errStr, "%s: %s" % (seName, res['Message']))
                return S_ERROR(errStr)
            # Access status options are handled separately via ResourceStatus below
            for option in set(res['Value']) - set(('ReadAccess', 'WriteAccess', 'CheckAccess', 'RemoveAccess')):
                optionConfigPath = cfgPath(storageConfigPath, option)
                # 'VO' is a list-valued option, everything else defaults to a string
                default = [] if option in ['VO'] else ''
                optionsDict[option] = gConfig.getValue(optionConfigPath, default)

        # The status is that of the derived SE only
        seName = derivedStorageName if derivedStorageName else storageName
        res = self.resourceStatus.getStorageElementStatus(seName)
        if not res['OK']:
            errStr = "StorageFactory._getStorageOptions: Failed to get storage status"
            gLogger.error(errStr, "%s: %s" % (seName, res['Message']))
            return S_ERROR(errStr)
        # For safety, we did not add the ${statusType}Access keys
        # this requires modifications in the StorageElement class
        # We add the dictionary with the statusTypes and values
        # { 'statusType1' : 'status1', 'statusType2' : 'status2' ... }
        optionsDict.update(res['Value'][seName])
        return S_OK(optionsDict)

    def __getProtocolsSections(self, storageName):
        """Return the list of protocol sub-sections of an SE section.

        :param str storageName: SE section name
        :return: S_OK(list of section names) / S_ERROR
        """
        storageConfigPath = cfgPath(self.rootConfigPath, storageName)
        res = gConfig.getSections(storageConfigPath)
        if not res['OK']:
            errStr = "StorageFactory._getConfigStorageProtocols: Failed to get storage sections"
            gLogger.error(errStr, "%s: %s" % (storageName, res['Message']))
            return S_ERROR(errStr)
        protocolSections = res['Value']
        return S_OK(protocolSections)

    def _getConfigStorageProtocols(self, storageName, derivedStorageName=None):
        """ Protocol specific information is present as sections in the Storage configuration

        :param str storageName: base SE section name
        :param str derivedStorageName: derived SE whose protocol parameters
                                       overwrite matching base entries
        :return: S_OK(list of protocol detail dicts) / S_ERROR
        """
        res = self.__getProtocolsSections(storageName)
        if not res['OK']:
            return res
        protocolSections = res['Value']
        # Deterministic ordering of the protocol sections
        sortedProtocolSections = sorted(protocolSections)
        protocolDetails = []
        for protocolSection in sortedProtocolSections:
            res = self._getConfigStorageProtocolDetails(storageName, protocolSection)
            if not res['OK']:
                return res
            protocolDetails.append(res['Value'])
        if derivedStorageName:
            # We may have parameters overwriting the baseSE protocols
            res = self.__getProtocolsSections(derivedStorageName)
            if not res['OK']:
                return res
            for protocolSection in res['Value']:
                res = self._getConfigStorageProtocolDetails(derivedStorageName, protocolSection,
                                                            checkAccess=False)
                if not res['OK']:
                    return res
                detail = res['Value']
                # Match the derived protocol to a base one by PluginName and
                # overwrite only the non-empty values
                pluginName = detail.get('PluginName')
                if pluginName:
                    for protocolDetail in protocolDetails:
                        if protocolDetail.get('PluginName') == pluginName:
                            for key, val in detail.items():
                                if val:
                                    protocolDetail[key] = val
                            break
        return S_OK(protocolDetails)

    def _getConfigStorageProtocolDetails(self, storageName, protocolSection, checkAccess=True):
        """ Parse the contents of the protocol block

        :param str storageName: SE section name
        :param str protocolSection: protocol sub-section name
        :param bool checkAccess: when True, classify the plug-in as local or
                                 remote and require PluginName to be defined
                                 (False for derived-SE overlay sections)
        :return: S_OK(protocolDict) / S_ERROR
        """
        # First obtain the options that are available
        protocolConfigPath = cfgPath(self.rootConfigPath, storageName, protocolSection)
        res = gConfig.getOptions(protocolConfigPath)
        if not res['OK']:
            errStr = "StorageFactory.__getProtocolDetails: Failed to get protocol options."
            gLogger.error(errStr, "%s: %s" % (storageName, protocolSection))
            return S_ERROR(errStr)
        options = res['Value']

        # We must have certain values internally even if not supplied in CS
        protocolDict = {'Access': '', 'Host': '', 'Path': '', 'Port': '',
                        'Protocol': '', 'SpaceToken': '', 'WSUrl': ''}
        for option in options:
            configPath = cfgPath(protocolConfigPath, option)
            optionValue = gConfig.getValue(configPath, '')
            protocolDict[option] = optionValue

        # This is a temporary for backward compatibility: move ProtocolName to PluginName
        protocolDict.setdefault('PluginName', protocolDict.pop('ProtocolName', None))

        # Evaluate the base path taking into account possible VO specific setting
        if self.vo:
            result = gConfig.getOptionsDict(cfgPath(protocolConfigPath, 'VOPath'))
            voPath = ''
            if result['OK']:
                voPath = result['Value'].get(self.vo, '')
            if voPath:
                protocolDict['Path'] = voPath

        # Now update the local and remote protocol lists.
        # A warning will be given if the Access option is not set.
        if checkAccess:
            if protocolDict['Access'].lower() == 'remote':
                self.remotePlugins.append(protocolDict['PluginName'])
            elif protocolDict['Access'].lower() == 'local':
                self.localPlugins.append(protocolDict['PluginName'])
            else:
                errStr = "StorageFactory.__getProtocolDetails: The 'Access' option for %s:%s is neither 'local' or 'remote'." % (
                    storageName, protocolSection)
                gLogger.warn(errStr)

            # The PluginName option must be defined
            if not protocolDict['PluginName']:
                errStr = "StorageFactory.__getProtocolDetails: 'PluginName' option is not defined."
                gLogger.error(errStr, "%s: %s" % (storageName, protocolSection))
                return S_ERROR(errStr)

        return S_OK(protocolDict)

    ###########################################################################
    #
    # Below is the method for obtaining the object instantiated for a provided storage configuration
    #

    def __generateStorageObject(self, storageName, pluginName, parameters, hideExceptions=False):
        """Load and instantiate the storage plug-in class.

        :param str storageName: DIRAC SE name
        :param str pluginName: plug-in (protocol) name; replaced by 'Proxy'
                               when the factory was created with useProxy=True
        :param dict parameters: protocol parameters passed to the constructor
        :param bool hideExceptions: forwarded to ObjectLoader.loadObject
        :return: S_OK(storageObject) / S_ERROR
        """
        storageType = pluginName
        if self.proxy:
            storageType = 'Proxy'

        objectLoader = ObjectLoader()
        result = objectLoader.loadObject('Resources.Storage.%sStorage' % storageType,
                                         storageType + 'Storage',
                                         hideExceptions=hideExceptions)
        if not result['OK']:
            gLogger.error('Failed to load storage object: %s' % result['Message'])
            return result

        storageClass = result['Value']
        try:
            storage = storageClass(storageName, parameters)
        except Exception as x:
            errStr = "StorageFactory._generateStorageObject: Failed to instantiate %s: %s" % (
                storageName, x)
            gLogger.exception(errStr)
            return S_ERROR(errStr)

        return S_OK(storage)
class StrategyHandler(object):
    """ .. class:: StrategyHandler

    StrategyHandler is a helper class for determining optimal replication tree
    for given source files, their replicas and target storage elements.
    """

    def __init__(self, configSection, bandwidths=None, channels=None, failedFiles=None):
        """c'tor

        :param self: self reference
        :param str configSection: path on CS to ReplicationScheduler agent
        :param bandwithds: observed throughput on active channels
        :param channels: active channels
        :param int failedFiles: max number of distinct failed files to allow scheduling
        """
        ## save config section
        self.configSection = configSection + "/" + self.__class__.__name__
        ## sublogger
        self.log = gLogger.getSubLogger("StrategyHandler", child=True)
        self.log.setLevel(gConfig.getValue(self.configSection + "/LogLevel", "DEBUG"))
        self.supportedStrategies = ['Simple', 'DynamicThroughput', 'Swarm', 'MinimiseTotalWait']
        self.log.debug("Supported strategies = %s" % ", ".join(self.supportedStrategies))
        ## extra time penalty per additional hop
        self.sigma = gConfig.getValue(self.configSection + '/HopSigma', 0.0)
        self.log.debug("HopSigma = %s" % self.sigma)
        ## 'File' or 'Throughput' - see __getTimeToStart
        self.schedulingType = gConfig.getValue(self.configSection + '/SchedulingType', 'File')
        self.log.debug("SchedulingType = %s" % self.schedulingType)
        self.activeStrategies = gConfig.getValue(self.configSection + '/ActiveStrategies', ['MinimiseTotalWait'])
        self.log.debug("ActiveStrategies = %s" % ", ".join(self.activeStrategies))
        self.numberOfStrategies = len(self.activeStrategies)
        self.log.debug("Number of active strategies = %s" % self.numberOfStrategies)
        self.acceptableFailureRate = gConfig.getValue(self.configSection + '/AcceptableFailureRate', 75)
        self.log.debug("AcceptableFailureRate = %s" % self.acceptableFailureRate)
        self.acceptableFailedFiles = gConfig.getValue(self.configSection + "/AcceptableFailedFiles", 5)
        self.log.debug("AcceptableFailedFiles = %s" % self.acceptableFailedFiles)
        self.bandwidths = bandwidths if bandwidths else {}
        self.channels = channels if channels else {}
        self.failedFiles = failedFiles if failedFiles else {}
        ## index of the next strategy used by __selectStrategy (round-robin)
        self.chosenStrategy = 0
        # dispatcher
        self.strategyDispatcher = {
            re.compile("MinimiseTotalWait"): self.__minimiseTotalWait,
            re.compile("DynamicThroughput"): self.__dynamicThroughput,
            re.compile("Simple"): self.__simple,
            re.compile("Swarm"): self.__swarm
        }
        self.resourceStatus = ResourceStatus()
        self.log.debug("strategyDispatcher entries:")
        for key, value in self.strategyDispatcher.items():
            self.log.debug("%s : %s" % (key.pattern, value.__name__))
        self.log.debug("%s has been constructed" % self.__class__.__name__)

    def reset(self):
        """ reset :chosenStrategy:

        :param self: self reference
        """
        self.chosenStrategy = 0

    def setFailedFiles(self, failedFiles):
        """ set the failed FTS files counters

        :param self: self reference
        :param failedFiles: observed distinct failed files
        """
        self.failedFiles = failedFiles if failedFiles else {}

    def setBandwiths(self, bandwidths):
        """ set the bandwidths

        :param self: self reference
        :param bandwithds: observed througput of active FTS channels
        """
        # NOTE: method name keeps its historical spelling for callers
        self.bandwidths = bandwidths if bandwidths else {}

    def setChannels(self, channels):
        """ set the channels

        :param self: self reference
        :param channels: active channels queues
        """
        self.channels = channels if channels else {}

    def getSupportedStrategies(self):
        """ Get supported strategies.

        :param self: self reference
        """
        return self.supportedStrategies

    def determineReplicationTree(self, sourceSE, targetSEs, replicas, size, strategy=None, sigma=None):
        """ resolve and find replication tree given source and target storage elements, active replicas,
        and file size.

        :param self: self reference
        :param str sourceSE: source storage element name
        :param list targetSEs: list of target storage elements
        :param dict replicas: active replicas
        :param int size: fiel size
        :param str strategy: strategy to use
        :param float sigma: hop sigma
        """
        if not strategy:
            strategy = self.__selectStrategy()
        self.log.debug("determineReplicationTree: will use %s strategy" % strategy)

        if sigma:
            self.log.debug("determineReplicationTree: sigma = %s" % sigma)
            self.sigma = sigma

        # For each strategy implemented an 'if' must be placed here
        tree = {}
        for reStrategy in self.strategyDispatcher:
            self.log.debug(reStrategy.pattern)
            if reStrategy.search(strategy):
                # A sigma value may be encoded in the strategy name, e.g. 'MinimiseTotalWait_1.5'
                if "_" in strategy:
                    try:
                        self.sigma = float(strategy.split("_")[1])
                        self.log.debug("determineReplicationTree: new sigma %s" % self.sigma)
                    except ValueError:
                        self.log.warn("determineReplicationTree: can't set new sigma value from '%s'" % strategy)
                if reStrategy.pattern in ["MinimiseTotalWait", "DynamicThroughput"]:
                    replicasToUse = replicas.keys() if sourceSE is None else [sourceSE]
                    tree = self.strategyDispatcher[reStrategy].__call__(replicasToUse, targetSEs)
                elif reStrategy.pattern == "Simple":
                    if not sourceSE in replicas.keys():
                        return S_ERROR("File does not exist at specified source site")
                    tree = self.__simple(sourceSE, targetSEs)
                elif reStrategy.pattern == "Swarm":
                    tree = self.__swarm(targetSEs[0], replicas.keys())

        # Now update the queues to reflect the chosen strategies
        for channelID in tree:
            self.channels[channelID]["Files"] += 1
            self.channels[channelID]["Size"] += size
        return S_OK(tree)

    def __selectStrategy(self):
        """ If more than one active strategy use one after the other.

        :param self: self reference
        """
        chosenStrategy = self.activeStrategies[self.chosenStrategy]
        self.chosenStrategy += 1
        # wrap around for round-robin selection
        if self.chosenStrategy == self.numberOfStrategies:
            self.chosenStrategy = 0
        return chosenStrategy

    def __simple(self, sourceSE, destSEs):
        """ This just does a simple replication from the source to all the targets.

        :param self: self reference
        :param str sourceSE: source storage element name
        :param list destSEs: destination storage elements
        """
        tree = {}
        if not self.__getActiveSEs([sourceSE]):
            return tree
        sourceSites = self.__getChannelSitesForSE(sourceSE)
        for destSE in destSEs:
            destSites = self.__getChannelSitesForSE(destSE)
            for channelID, channelDict in self.channels.items():
                if channelID in tree:
                    continue
                if channelDict["Source"] in sourceSites and channelDict["Destination"] in destSites:
                    tree[channelID] = {"Ancestor": False,
                                       "SourceSE": sourceSE,
                                       "DestSE": destSE,
                                       "Strategy": "Simple"}
        return tree

    def __swarm(self, destSE, replicas):
        """ This strategy is to be used to get the data to the target site as quickly as possible
        from any source.

        :param self: self reference
        :param str destSE: destination storage element
        :param list replicas: replicas dictionary keys
        """
        tree = {}
        res = self.__getTimeToStart()
        if not res["OK"]:
            self.log.error(res["Message"])
            return tree
        channelInfo = res["Value"]
        minTimeToStart = float("inf")
        sourceSEs = self.__getActiveSEs(replicas)
        destSites = self.__getChannelSitesForSE(destSE)
        selectedChannelID = None
        selectedSourceSE = None
        selectedDestSE = None
        # Pick the single channel with the smallest time to start
        for destSite in destSites:
            for sourceSE in sourceSEs:
                for sourceSite in self.__getChannelSitesForSE(sourceSE):
                    channelName = "%s-%s" % (sourceSite, destSite)
                    if channelName not in channelInfo:
                        errStr = "__swarm: Channel not defined"
                        self.log.warn(errStr, channelName)
                        continue
                    channelTimeToStart = channelInfo[channelName]["TimeToStart"]
                    if channelTimeToStart <= minTimeToStart:
                        minTimeToStart = channelTimeToStart
                        selectedSourceSE = sourceSE
                        selectedDestSE = destSE
                        selectedChannelID = channelInfo[channelName]["ChannelID"]
        if selectedChannelID and selectedSourceSE and selectedDestSE:
            tree[selectedChannelID] = {"Ancestor": False,
                                       "SourceSE": selectedSourceSE,
                                       "DestSE": selectedDestSE,
                                       "Strategy": "Swarm"}
        return tree

    def __dynamicThroughput(self, sourceSEs, destSEs):
        """ This creates a replication tree based on observed throughput on the channels.

        :param self: self reference
        :param list sourceSEs: source storage elements names
        :param list destSEs: destination storage elements names
        """
        tree = {}
        res = self.__getTimeToStart()
        if not res["OK"]:
            self.log.error(res["Message"])
            return tree
        channelInfo = res["Value"]

        timeToSite = {}  # Maintains time to site including previous hops
        siteAncestor = {}  # Maintains the ancestor channel for a site

        while len(destSEs) > 0:
            try:
                minTotalTimeToStart = float("inf")
                candidateChannels = []
                sourceActiveSEs = self.__getActiveSEs(sourceSEs)
                for destSE in destSEs:
                    destSites = self.__getChannelSitesForSE(destSE)
                    for destSite in destSites:
                        for sourceSE in sourceActiveSEs:
                            sourceSites = self.__getChannelSitesForSE(sourceSE)
                            for sourceSite in sourceSites:
                                channelName = "%s-%s" % (sourceSite, destSite)
                                if channelName not in channelInfo:
                                    self.log.warn("dynamicThroughput: bailing out! channel %s not defined " % channelName)
                                    raise StrategyHandlerChannelNotDefined(channelName)
                                channelID = channelInfo[channelName]["ChannelID"]
                                if channelID in tree:
                                    continue
                                channelTimeToStart = channelInfo[channelName]["TimeToStart"]
                                totalTimeToStart = channelTimeToStart
                                # account for previous hops plus the hop penalty
                                if sourceSE in timeToSite:
                                    totalTimeToStart += timeToSite[sourceSE] + self.sigma
                                ## local transfer found, no better option possible
                                if (sourceSite == destSite):
                                    selectedPathTimeToStart = totalTimeToStart
                                    candidateChannels = [(sourceSE, destSE, channelID)]
                                    raise StrategyHandlerLocalFound(candidateChannels)
                                if totalTimeToStart < minTotalTimeToStart:
                                    minTotalTimeToStart = totalTimeToStart
                                    selectedPathTimeToStart = totalTimeToStart
                                    candidateChannels = [(sourceSE, destSE, channelID)]
                                elif totalTimeToStart == minTotalTimeToStart and totalTimeToStart < float("inf"):
                                    minTotalTimeToStart = totalTimeToStart
                                    selectedPathTimeToStart = totalTimeToStart
                                    candidateChannels.append((sourceSE, destSE, channelID))
            except StrategyHandlerLocalFound:
                pass

            # Fix: guard against an empty candidate list (consistent with
            # __minimiseTotalWait); previously candidateChannels[0] would
            # raise IndexError when no usable channel was found.
            if not candidateChannels:
                return tree

            random.shuffle(candidateChannels)
            selectedSourceSE, selectedDestSE, selectedChannelID = candidateChannels[0]
            timeToSite[selectedDestSE] = selectedPathTimeToStart
            siteAncestor[selectedDestSE] = selectedChannelID
            waitingChannel = False if selectedSourceSE not in siteAncestor else siteAncestor[selectedSourceSE]

            tree[selectedChannelID] = {"Ancestor": waitingChannel,
                                       "SourceSE": selectedSourceSE,
                                       "DestSE": selectedDestSE,
                                       "Strategy": "DynamicThroughput"}
            # The freshly scheduled destination can now act as a source
            sourceSEs.append(selectedDestSE)
            destSEs.remove(selectedDestSE)
        return tree

    def __minimiseTotalWait(self, sourceSEs, destSEs):
        """ This creates a replication tree based on observed throughput on the channels.

        :param self: self reference
        :param list sourceSEs: source storage elements names
        :param list destSEs: destination storage elements names
        """
        self.log.debug("sourceSEs = %s" % sourceSEs)
        self.log.debug("destSEs = %s" % destSEs)

        tree = {}
        res = self.__getTimeToStart()
        if not res["OK"]:
            self.log.error(res["Message"])
            return tree
        channelInfo = res["Value"]

        timeToSite = {}  # Maintains time to site including previous hops
        siteAncestor = {}  # Maintains the ancestor channel for a site
        primarySources = sourceSEs

        while destSEs:
            try:
                minTotalTimeToStart = float("inf")
                candidateChannels = []
                sourceActiveSEs = self.__getActiveSEs(sourceSEs)
                for destSE in destSEs:
                    destSites = self.__getChannelSitesForSE(destSE)
                    for destSite in destSites:
                        for sourceSE in sourceActiveSEs:
                            sourceSites = self.__getChannelSitesForSE(sourceSE)
                            for sourceSite in sourceSites:
                                channelName = "%s-%s" % (sourceSite, destSite)
                                if channelName not in channelInfo:
                                    continue
                                channelID = channelInfo[channelName]["ChannelID"]
                                # If this channel is already used, look for another sourceSE
                                if channelID in tree:
                                    continue
                                channelTimeToStart = channelInfo[channelName]["TimeToStart"]
                                # hop penalty for non-primary sources
                                if not sourceSE in primarySources:
                                    channelTimeToStart += self.sigma
                                ## local transfer found
                                if sourceSite == destSite:
                                    selectedPathTimeToStart = channelTimeToStart
                                    candidateChannels = [(sourceSE, destSE, channelID)]
                                    ## bail out to save rainforests
                                    raise StrategyHandlerLocalFound(candidateChannels)
                                if channelTimeToStart < minTotalTimeToStart:
                                    minTotalTimeToStart = channelTimeToStart
                                    selectedPathTimeToStart = channelTimeToStart
                                    candidateChannels = [(sourceSE, destSE, channelID)]
                                elif channelTimeToStart == minTotalTimeToStart and channelTimeToStart != float("inf"):
                                    minTotalTimeToStart = channelTimeToStart
                                    selectedPathTimeToStart = channelTimeToStart
                                    candidateChannels.append((sourceSE, destSE, channelID))
            except StrategyHandlerLocalFound:
                pass

            if not candidateChannels:
                return tree

            ## shuffle candidates and pick the 1st one
            random.shuffle(candidateChannels)
            selectedSourceSE, selectedDestSE, selectedChannelID = candidateChannels[0]
            timeToSite[selectedDestSE] = selectedPathTimeToStart
            siteAncestor[selectedDestSE] = selectedChannelID
            waitingChannel = False if selectedSourceSE not in siteAncestor else siteAncestor[selectedSourceSE]

            tree[selectedChannelID] = {"Ancestor": waitingChannel,
                                       "SourceSE": selectedSourceSE,
                                       "DestSE": selectedDestSE,
                                       "Strategy": "MinimiseTotalWait"}
            sourceSEs.append(selectedDestSE)
            destSEs.remove(selectedDestSE)
        return tree

    def __getTimeToStart(self):
        """ Generate the dictionary of times to start based on task queue contents and observed throughput.

        :param self: self reference
        """
        if self.schedulingType not in ("File", "Throughput"):
            errStr = "__getTimeToStart: CS SchedulingType entry must be either 'File' or 'Throughput'"
            self.log.error(errStr)
            return S_ERROR(errStr)

        channelInfo = {}
        for channelID, bandwidth in self.bandwidths.items():
            channelDict = self.channels[channelID]
            channelName = channelDict["ChannelName"]
            # initial equal 0.0
            timeToStart = 0.0
            channelStatus = channelDict["Status"]
            ## channel is active?
            if channelStatus == "Active":
                channelFileSuccess = bandwidth["SuccessfulFiles"]
                channelFileFailed = bandwidth["FailedFiles"]
                attempted = channelFileSuccess + channelFileFailed
                successRate = 100.0
                if attempted != 0:
                    successRate = 100.0 * (channelFileSuccess / float(attempted))
                ## get distinct failed files counter
                distinctFailedFiles = self.failedFiles.get(channelID, 0)
                ## success rate too low and more than acceptable distinct files are affected?, make channel unattractive
                if (successRate < self.acceptableFailureRate) and (distinctFailedFiles > self.acceptableFailedFiles):
                    timeToStart = float("inf")
                else:
                    ## scheduling type == Throughput
                    transferSpeed = bandwidth["Throughput"]
                    waitingTransfers = channelDict["Size"]
                    ## scheduling type == File, overwrite transferSpeed and waitingTransfer
                    if self.schedulingType == "File":
                        transferSpeed = bandwidth["Fileput"]
                        waitingTransfers = channelDict["Files"]
                    if transferSpeed > 0:
                        timeToStart = waitingTransfers / float(transferSpeed)
            else:
                ## channel not active, make it unattractive
                timeToStart = float("inf")
            channelInfo.setdefault(channelName, {"ChannelID": channelID, "TimeToStart": timeToStart})
        return S_OK(channelInfo)

    def __getActiveSEs(self, seList, access="Read"):
        """Get active storage elements.

        :param self: self reference
        :param list seList: stogare element list
        :param str access: storage element accesss, could be 'Read' (default) or 'Write'
        """
        res = self.resourceStatus.getStorageElementStatus(seList, statusType=access, default='Unknown')
        if not res["OK"]:
            return []
        return [k for k, v in res["Value"].items() if access in v and v[access] in ("Active", "Bad")]

    def __getChannelSitesForSE(self, storageElement):
        """Get sites for given storage element.

        :param self: self reference
        :param str storageElement: storage element name
        """
        res = getSitesForSE(storageElement)
        if not res["OK"]:
            return []
        sites = []
        for site in res["Value"]:
            # site names look like '<grid>.<site>.<country>'; keep the middle token
            siteName = site.split(".")
            if len(siteName) > 1:
                if not siteName[1] in sites:
                    sites.append(siteName[1])
        return sites
class StrategyHandler( object ):
  """ .. class:: StrategyHandler

  StrategyHandler is a helper class for determining optimal replication tree for given
  source files, their replicas and target storage elements.
  """

  def __init__( self, configSection, bandwidths = None, channels = None, failedFiles = None ):
    """c'tor

    :param self: self reference
    :param str configSection: path on CS to ReplicationScheduler agent
    :param bandwidths: observed throughput on active channels
    :param channels: active channels
    :param int failedFiles: max number of distinct failed files to allow scheduling
    """
    ## save config section
    self.configSection = configSection + "/" + self.__class__.__name__
    ## sublogger
    self.log = gLogger.getSubLogger( "StrategyHandler", child = True )
    self.log.setLevel( gConfig.getValue( self.configSection + "/LogLevel", "DEBUG" ) )

    self.supportedStrategies = [ 'Simple', 'DynamicThroughput', 'Swarm', 'MinimiseTotalWait' ]
    self.log.debug( "Supported strategies = %s" % ", ".join( self.supportedStrategies ) )

    ## CS-driven tunables (defaults used when option is absent)
    self.sigma = gConfig.getValue( self.configSection + '/HopSigma', 0.0 )
    self.log.debug( "HopSigma = %s" % self.sigma )
    self.schedulingType = gConfig.getValue( self.configSection + '/SchedulingType', 'File' )
    self.log.debug( "SchedulingType = %s" % self.schedulingType )
    self.activeStrategies = gConfig.getValue( self.configSection + '/ActiveStrategies', ['MinimiseTotalWait'] )
    self.log.debug( "ActiveStrategies = %s" % ", ".join( self.activeStrategies ) )
    self.numberOfStrategies = len( self.activeStrategies )
    self.log.debug( "Number of active strategies = %s" % self.numberOfStrategies )
    self.acceptableFailureRate = gConfig.getValue( self.configSection + '/AcceptableFailureRate', 75 )
    self.log.debug( "AcceptableFailureRate = %s" % self.acceptableFailureRate )
    self.acceptableFailedFiles = gConfig.getValue( self.configSection + "/AcceptableFailedFiles", 5 )
    self.log.debug( "AcceptableFailedFiles = %s" % self.acceptableFailedFiles )

    self.bandwidths = bandwidths if bandwidths else {}
    self.channels = channels if channels else {}
    self.failedFiles = failedFiles if failedFiles else {}
    self.chosenStrategy = 0

    # dispatcher: strategy name pattern -> implementation
    self.strategyDispatcher = { re.compile( "MinimiseTotalWait" ) : self.__minimiseTotalWait,
                                re.compile( "DynamicThroughput" ) : self.__dynamicThroughput,
                                re.compile( "Simple" ) : self.__simple,
                                re.compile( "Swarm" ) : self.__swarm }

    self.resourceStatus = ResourceStatus()

    self.log.debug( "strategyDispatcher entries:" )
    for key, value in self.strategyDispatcher.items():
      self.log.debug( "%s : %s" % ( key.pattern, value.__name__ ) )

    self.log.debug( "%s has been constructed" % self.__class__.__name__ )

  def reset( self ):
    """ Reset :chosenStrategy: counter.

    :param self: self reference
    """
    self.chosenStrategy = 0

  def setFailedFiles( self, failedFiles ):
    """ Set the failed FTS files counters.

    :param self: self reference
    :param failedFiles: observed distinct failed files
    """
    self.failedFiles = failedFiles if failedFiles else {}

  def setBandwiths( self, bandwidths ):
    """ Set the bandwidths.

    :param self: self reference
    :param bandwidths: observed throughput of active FTS channels
    """
    self.bandwidths = bandwidths if bandwidths else {}

  def setChannels( self, channels ):
    """ Set the channels.

    :param self: self reference
    :param channels: active channels queues
    """
    self.channels = channels if channels else {}

  def getSupportedStrategies( self ):
    """ Get supported strategies.

    :param self: self reference
    """
    return self.supportedStrategies

  def determineReplicationTree( self, sourceSE, targetSEs, replicas, size, strategy = None, sigma = None ):
    """ Resolve and find replication tree given source and target storage elements, active
    replicas, and file size.

    :param self: self reference
    :param str sourceSE: source storage element name
    :param list targetSEs: list of target storage elements
    :param dict replicas: active replicas
    :param int size: file size
    :param str strategy: strategy to use
    :param float sigma: hop sigma
    """
    if not strategy:
      strategy = self.__selectStrategy()
    self.log.debug( "determineReplicationTree: will use %s strategy" % strategy )

    if sigma:
      self.log.debug( "determineReplicationTree: sigma = %s" % sigma )
      self.sigma = sigma

    # For each strategy implemented an 'if' must be placed here
    tree = {}
    for reStrategy in self.strategyDispatcher:
      self.log.debug( reStrategy.pattern )
      if not reStrategy.search( strategy ):
        continue
      if "_" in strategy:
        # a sigma value may be appended to the strategy name, e.g. 'MinimiseTotalWait_0.5'
        try:
          self.sigma = float( strategy.split( "_" )[1] )
          self.log.debug( "determineReplicationTree: new sigma %s" % self.sigma )
        except ValueError:
          self.log.warn( "determineReplicationTree: can't set new sigma value from '%s'" % strategy )
      if reStrategy.pattern in [ "MinimiseTotalWait", "DynamicThroughput" ]:
        # BUGFIX idiom: 'sourceSE == None' replaced by identity test
        replicasToUse = replicas.keys() if sourceSE is None else [ sourceSE ]
        tree = self.strategyDispatcher[ reStrategy ]( replicasToUse, targetSEs )
      elif reStrategy.pattern == "Simple":
        if sourceSE not in replicas:
          return S_ERROR( "File does not exist at specified source site" )
        tree = self.__simple( sourceSE, targetSEs )
      elif reStrategy.pattern == "Swarm":
        tree = self.__swarm( targetSEs[0], replicas.keys() )

    # Now update the queues to reflect the chosen strategies
    for channelID in tree:
      self.channels[channelID]["Files"] += 1
      self.channels[channelID]["Size"] += size
    return S_OK( tree )

  def __selectStrategy( self ):
    """ If more than one active strategy use one after the other (round robin).

    :param self: self reference
    """
    chosenStrategy = self.activeStrategies[self.chosenStrategy]
    self.chosenStrategy += 1
    if self.chosenStrategy == self.numberOfStrategies:
      self.chosenStrategy = 0
    return chosenStrategy

  def __simple( self, sourceSE, destSEs ):
    """ This just does a simple replication from the source to all the targets.

    :param self: self reference
    :param str sourceSE: source storage element name
    :param list destSEs: destination storage elements
    """
    tree = {}
    if not self.__getActiveSEs( [ sourceSE ] ):
      return tree
    sourceSites = self.__getChannelSitesForSE( sourceSE )
    for destSE in destSEs:
      destSites = self.__getChannelSitesForSE( destSE )
      for channelID, channelDict in self.channels.items():
        if channelID in tree:
          continue
        if channelDict["Source"] in sourceSites and channelDict["Destination"] in destSites:
          tree[channelID] = { "Ancestor" : False,
                              "SourceSE" : sourceSE,
                              "DestSE" : destSE,
                              "Strategy" : "Simple" }
    return tree

  def __swarm( self, destSE, replicas ):
    """ This strategy is to be used to get the data to the target site as quickly as possible
    from any source.

    :param self: self reference
    :param str destSE: destination storage element
    :param list replicas: replicas dictionary keys
    """
    tree = {}
    res = self.__getTimeToStart()
    if not res["OK"]:
      self.log.error( res["Message"] )
      return tree
    channelInfo = res["Value"]
    minTimeToStart = float( "inf" )

    sourceSEs = self.__getActiveSEs( replicas )
    destSites = self.__getChannelSitesForSE( destSE )

    selectedChannelID = None
    selectedSourceSE = None
    selectedDestSE = None

    for destSite in destSites:
      for sourceSE in sourceSEs:
        for sourceSite in self.__getChannelSitesForSE( sourceSE ):
          channelName = "%s-%s" % ( sourceSite, destSite )
          if channelName not in channelInfo:
            errStr = "__swarm: Channel not defined"
            self.log.warn( errStr, channelName )
            continue
          channelTimeToStart = channelInfo[channelName]["TimeToStart"]
          if channelTimeToStart <= minTimeToStart:
            minTimeToStart = channelTimeToStart
            selectedSourceSE = sourceSE
            selectedDestSE = destSE
            selectedChannelID = channelInfo[channelName]["ChannelID"]

    if selectedChannelID and selectedSourceSE and selectedDestSE:
      tree[selectedChannelID] = { "Ancestor" : False,
                                  "SourceSE" : selectedSourceSE,
                                  "DestSE" : selectedDestSE,
                                  "Strategy" : "Swarm" }
    return tree

  def __dynamicThroughput( self, sourceSEs, destSEs ):
    """ This creates a replication tree based on observed throughput on the channels.

    :param self: self reference
    :param list sourceSEs: source storage elements names
    :param list destSEs: destination storage elements names
    """
    tree = {}
    res = self.__getTimeToStart()
    if not res["OK"]:
      self.log.error( res["Message"] )
      return tree
    channelInfo = res["Value"]

    timeToSite = {}   # Maintains time to site including previous hops
    siteAncestor = {} # Maintains the ancestor channel for a site

    while destSEs:
      try:
        minTotalTimeToStart = float( "inf" )
        candidateChannels = []
        sourceActiveSEs = self.__getActiveSEs( sourceSEs )
        for destSE in destSEs:
          for destSite in self.__getChannelSitesForSE( destSE ):
            for sourceSE in sourceActiveSEs:
              for sourceSite in self.__getChannelSitesForSE( sourceSE ):
                channelName = "%s-%s" % ( sourceSite, destSite )
                if channelName not in channelInfo:
                  self.log.warn( "dynamicThroughput: bailing out! channel %s not defined " % channelName )
                  raise StrategyHandlerChannelNotDefined( channelName )
                channelID = channelInfo[channelName]["ChannelID"]
                if channelID in tree:
                  continue
                channelTimeToStart = channelInfo[channelName]["TimeToStart"]
                totalTimeToStart = channelTimeToStart
                if sourceSE in timeToSite:
                  totalTimeToStart += timeToSite[sourceSE] + self.sigma
                if sourceSite == destSite:
                  ## local transfer found, no better candidate possible
                  selectedPathTimeToStart = totalTimeToStart
                  candidateChannels = [ ( sourceSE, destSE, channelID ) ]
                  raise StrategyHandlerLocalFound( candidateChannels )
                if totalTimeToStart < minTotalTimeToStart:
                  minTotalTimeToStart = totalTimeToStart
                  selectedPathTimeToStart = totalTimeToStart
                  candidateChannels = [ ( sourceSE, destSE, channelID ) ]
                elif totalTimeToStart == minTotalTimeToStart and totalTimeToStart < float( "inf" ):
                  minTotalTimeToStart = totalTimeToStart
                  selectedPathTimeToStart = totalTimeToStart
                  candidateChannels.append( ( sourceSE, destSE, channelID ) )
      except StrategyHandlerLocalFound:
        pass

      # BUGFIX: without this guard an empty candidate list raised IndexError at
      # candidateChannels[0]; mirror the behaviour of __minimiseTotalWait
      if not candidateChannels:
        return tree

      random.shuffle( candidateChannels )
      selectedSourceSE, selectedDestSE, selectedChannelID = candidateChannels[0]
      timeToSite[selectedDestSE] = selectedPathTimeToStart
      siteAncestor[selectedDestSE] = selectedChannelID

      waitingChannel = False if selectedSourceSE not in siteAncestor else siteAncestor[selectedSourceSE]
      tree[selectedChannelID] = { "Ancestor" : waitingChannel,
                                  "SourceSE" : selectedSourceSE,
                                  "DestSE" : selectedDestSE,
                                  "Strategy" : "DynamicThroughput" }
      sourceSEs.append( selectedDestSE )
      destSEs.remove( selectedDestSE )
    return tree

  def __minimiseTotalWait( self, sourceSEs, destSEs ):
    """ This creates a replication tree minimising the total waiting time on the channels.

    :param self: self reference
    :param list sourceSEs: source storage elements names
    :param list destSEs: destination storage elements names
    """
    self.log.debug( "sourceSEs = %s" % sourceSEs )
    self.log.debug( "destSEs = %s" % destSEs )
    tree = {}
    res = self.__getTimeToStart()
    if not res["OK"]:
      self.log.error( res["Message"] )
      return tree
    channelInfo = res["Value"]

    timeToSite = {}   # Maintains time to site including previous hops
    siteAncestor = {} # Maintains the ancestor channel for a site
    primarySources = sourceSEs

    while destSEs:
      try:
        minTotalTimeToStart = float( "inf" )
        candidateChannels = []
        sourceActiveSEs = self.__getActiveSEs( sourceSEs )
        for destSE in destSEs:
          for destSite in self.__getChannelSitesForSE( destSE ):
            for sourceSE in sourceActiveSEs:
              for sourceSite in self.__getChannelSitesForSE( sourceSE ):
                channelName = "%s-%s" % ( sourceSite, destSite )
                if channelName not in channelInfo:
                  continue
                channelID = channelInfo[channelName]["ChannelID"]
                # If this channel is already used, look for another sourceSE
                if channelID in tree:
                  continue
                channelTimeToStart = channelInfo[channelName]["TimeToStart"]
                # penalise multi-hop transfers
                if sourceSE not in primarySources:
                  channelTimeToStart += self.sigma
                ## local transfer found
                if sourceSite == destSite:
                  selectedPathTimeToStart = channelTimeToStart
                  candidateChannels = [ ( sourceSE, destSE, channelID ) ]
                  ## bail out to save rainforests
                  raise StrategyHandlerLocalFound( candidateChannels )
                if channelTimeToStart < minTotalTimeToStart:
                  minTotalTimeToStart = channelTimeToStart
                  selectedPathTimeToStart = channelTimeToStart
                  candidateChannels = [ ( sourceSE, destSE, channelID ) ]
                elif channelTimeToStart == minTotalTimeToStart and channelTimeToStart != float( "inf" ):
                  minTotalTimeToStart = channelTimeToStart
                  selectedPathTimeToStart = channelTimeToStart
                  candidateChannels.append( ( sourceSE, destSE, channelID ) )
      except StrategyHandlerLocalFound:
        pass
      if not candidateChannels:
        return tree
      ## shuffle candidates and pick the 1st one
      random.shuffle( candidateChannels )
      selectedSourceSE, selectedDestSE, selectedChannelID = candidateChannels[0]
      timeToSite[selectedDestSE] = selectedPathTimeToStart
      siteAncestor[selectedDestSE] = selectedChannelID
      waitingChannel = False if selectedSourceSE not in siteAncestor else siteAncestor[selectedSourceSE]
      tree[selectedChannelID] = { "Ancestor" : waitingChannel,
                                  "SourceSE" : selectedSourceSE,
                                  "DestSE" : selectedDestSE,
                                  "Strategy" : "MinimiseTotalWait" }
      sourceSEs.append( selectedDestSE )
      destSEs.remove( selectedDestSE )
    return tree

  def __getTimeToStart( self ):
    """ Generate the dictionary of times to start based on task queue contents and
    observed throughput.

    :param self: self reference
    """
    if self.schedulingType not in ( "File", "Throughput" ):
      errStr = "__getTimeToStart: CS SchedulingType entry must be either 'File' or 'Throughput'"
      self.log.error( errStr )
      return S_ERROR( errStr )

    channelInfo = {}
    for channelID, bandwidth in self.bandwidths.items():
      channelDict = self.channels[channelID]
      channelName = channelDict["ChannelName"]
      # initial equal 0.0
      timeToStart = 0.0
      channelStatus = channelDict["Status"]
      ## channel is active?
      if channelStatus == "Active":
        channelFileSuccess = bandwidth["SuccessfulFiles"]
        channelFileFailed = bandwidth["FailedFiles"]
        attempted = channelFileSuccess + channelFileFailed
        successRate = 100.0
        if attempted != 0:
          successRate = 100.0 * ( channelFileSuccess / float( attempted ) )
        ## get distinct failed files counter
        distinctFailedFiles = self.failedFiles.get( channelID, 0 )
        ## success rate too low and more than acceptable distinct files are affected? make channel unattractive
        if ( successRate < self.acceptableFailureRate ) and ( distinctFailedFiles > self.acceptableFailedFiles ):
          timeToStart = float( "inf" )
        else:
          ## scheduling type == Throughput
          transferSpeed = bandwidth["Throughput"]
          waitingTransfers = channelDict["Size"]
          ## scheduling type == File, overwrite transferSpeed and waitingTransfers
          if self.schedulingType == "File":
            transferSpeed = bandwidth["Fileput"]
            waitingTransfers = channelDict["Files"]
          if transferSpeed > 0:
            timeToStart = waitingTransfers / float( transferSpeed )
      else:
        ## channel not active, make it unattractive
        timeToStart = float( "inf" )
      channelInfo.setdefault( channelName, { "ChannelID" : channelID,
                                             "TimeToStart" : timeToStart } )
    return S_OK( channelInfo )

  def __getActiveSEs( self, seList, access = "Read" ):
    """Get active storage elements.

    :param self: self reference
    :param list seList: storage element list
    :param str access: storage element access, could be 'Read' (default) or 'Write'
    """
    res = self.resourceStatus.getStorageElementStatus( seList, statusType = access, default = 'Unknown' )
    if not res["OK"]:
      return []
    return [ k for k, v in res["Value"].items() if access in v and v[access] in ( "Active", "Bad" ) ]

  def __getChannelSitesForSE( self, storageElement ):
    """Get sites for given storage element.

    :param self: self reference
    :param str storageElement: storage element name
    """
    res = getSitesForSE( storageElement )
    if not res["OK"]:
      return []
    sites = []
    for site in res["Value"]:
      siteName = site.split( "." )
      # keep the middle token of 'GRID.Site.country' style names, deduplicated
      if len( siteName ) > 1:
        if siteName[1] not in sites:
          sites.append( siteName[1] )
    return sites
class StorageFactory:
  """ Factory building Storage plug-in objects from the /Resources/StorageElements
  configuration section.
  """

  def __init__( self, useProxy = False, vo = None ):
    """c'tor

    :param bool useProxy: load the 'Proxy' storage plug-in instead of the native one
    :param str vo: VO name used to resolve VO specific paths (may be None)
    """
    self.rootConfigPath = '/Resources/StorageElements'
    self.valid = True
    self.proxy = useProxy
    self.resourceStatus = ResourceStatus()
    self.vo = vo

  ###########################################################################################
  #
  # Below are public methods for obtaining storage objects
  #

  def getStorageName( self, initialName ):
    """ Resolve 'initialName' (possibly an alias) to the real storage name. """
    return self._getConfigStorageName( initialName )

  def getStorage( self, parameterDict ):
    """ This instantiates a single storage for the details provided and doesn't check the CS.

    :param dict parameterDict: must contain 'StorageName' and 'ProtocolName'; other keys optional
    """
    # The storage name must be supplied.
    if 'StorageName' in parameterDict:
      storageName = parameterDict['StorageName']
    else:
      errStr = "StorageFactory.getStorage: StorageName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    # ProtocolName must be supplied otherwise nothing with work.
    if 'ProtocolName' in parameterDict:
      protocolName = parameterDict['ProtocolName']
    else:
      errStr = "StorageFactory.getStorage: ProtocolName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    # The other options need not always be specified (defaults to '')
    protocol = parameterDict.get( 'Protocol', '' )
    port = parameterDict.get( 'Port', '' )
    host = parameterDict.get( 'Host', '' )
    path = parameterDict.get( 'Path', '' )
    spaceToken = parameterDict.get( 'SpaceToken', '' )
    wsPath = parameterDict.get( 'WSUrl', '' )

    return self.__generateStorageObject( storageName, protocolName, protocol, path, host, port,
                                         spaceToken, wsPath, parameterDict )

  def getStorages( self, storageName, protocolList = None ):
    """ Get an instance of a Storage based on the DIRAC SE name based on the CS entries CS

    :param str storageName: the DIRAC SE name i.e. 'CERN-RAW'
    :param list protocolList: optional list of protocols if a sub-set is desired i.e ['SRM2','SRM1']
    """
    # BUGFIX: mutable default argument ([]) replaced by None
    if protocolList is None:
      protocolList = []
    self.remoteProtocols = []
    self.localProtocols = []
    self.name = ''
    self.options = {}
    self.protocolDetails = []
    self.storages = []

    # Get the name of the storage provided
    res = self._getConfigStorageName( storageName )
    if not res['OK']:
      self.valid = False
      return res
    storageName = res['Value']
    self.name = storageName

    # Get the options defined in the CS for this storage
    res = self._getConfigStorageOptions( storageName )
    if not res['OK']:
      self.valid = False
      return res
    self.options = res['Value']

    # Get the protocol specific details
    res = self._getConfigStorageProtocols( storageName )
    if not res['OK']:
      self.valid = False
      return res
    self.protocolDetails = res['Value']

    requestedLocalProtocols = []
    requestedRemoteProtocols = []
    requestedProtocolDetails = []
    turlProtocols = []

    # Generate the protocol specific plug-ins
    self.storages = []
    for protocolDict in self.protocolDetails:
      protocolName = protocolDict['ProtocolName']
      # honour the optional protocol sub-set filter
      if protocolList and protocolName not in protocolList:
        continue
      protocol = protocolDict['Protocol']
      host = protocolDict['Host']
      path = protocolDict['Path']
      port = protocolDict['Port']
      spaceToken = protocolDict['SpaceToken']
      wsUrl = protocolDict['WSUrl']
      res = self.__generateStorageObject( storageName, protocolName, protocol,
                                          path = path, host = host, port = port,
                                          spaceToken = spaceToken, wsUrl = wsUrl,
                                          parameters = protocolDict )
      if res['OK']:
        self.storages.append( res['Value'] )
        if protocolName in self.localProtocols:
          turlProtocols.append( protocol )
          requestedLocalProtocols.append( protocolName )
        if protocolName in self.remoteProtocols:
          requestedRemoteProtocols.append( protocolName )
        requestedProtocolDetails.append( protocolDict )
      else:
        gLogger.info( res['Message'] )

    if len( self.storages ) > 0:
      resDict = {}
      resDict['StorageName'] = self.name
      resDict['StorageOptions'] = self.options
      resDict['StorageObjects'] = self.storages
      resDict['LocalProtocols'] = requestedLocalProtocols
      resDict['RemoteProtocols'] = requestedRemoteProtocols
      resDict['ProtocolOptions'] = requestedProtocolDetails
      resDict['TurlProtocols'] = turlProtocols
      return S_OK( resDict )
    errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols."
    gLogger.error( errStr, self.name )
    return S_ERROR( errStr )

  ###########################################################################################
  #
  # Below are internal methods for obtaining section/option/value configuration
  #

  def _getConfigStorageName( self, storageName ):
    """ This gets the name of the storage the configuration service.
    If the storage is an alias for another the resolution is performed.

    :param str storageName: the storage section to check in the CS
    """
    configPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getOptions( configPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageName: Failed to get storage options"
      gLogger.error( errStr, res['Message'] )
      return S_ERROR( errStr )
    if not res['Value']:
      errStr = "StorageFactory._getConfigStorageName: Supplied storage doesn't exist."
      gLogger.error( errStr, configPath )
      return S_ERROR( errStr )
    if 'Alias' in res['Value']:
      # resolve the alias recursively
      configPath = '%s/%s/Alias' % ( self.rootConfigPath, storageName )
      aliasName = gConfig.getValue( configPath )
      result = self._getConfigStorageName( aliasName )
      if not result['OK']:
        errStr = "StorageFactory._getConfigStorageName: Supplied storage doesn't exist."
        gLogger.error( errStr, configPath )
        return S_ERROR( errStr )
      resolvedName = result['Value']
    else:
      resolvedName = storageName
    return S_OK( resolvedName )

  def _getConfigStorageOptions( self, storageName ):
    """ Get the options associated to the StorageElement as defined in the CS """
    storageConfigPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getOptions( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage options."
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    optionsDict = {}
    for option in res['Value']:
      # access flags are resolved through the resource status system below
      if option in [ 'ReadAccess', 'WriteAccess', 'CheckAccess', 'RemoveAccess' ]:
        continue
      optionConfigPath = '%s/%s' % ( storageConfigPath, option )
      optionsDict[option] = gConfig.getValue( optionConfigPath, '' )

    res = self.resourceStatus.getStorageElementStatus( storageName )
    if not res['OK']:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage status"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    # For safety, we did not add the ${statusType}Access keys
    # this requires modifications in the StorageElement class
    # We add the dictionary with the statusTypes and values
    # { 'statusType1' : 'status1', 'statusType2' : 'status2' ... }
    optionsDict.update( res['Value'][storageName] )
    return S_OK( optionsDict )

  def _getConfigStorageProtocols( self, storageName ):
    """ Protocol specific information is present as sections in the Storage configuration """
    storageConfigPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getSections( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageProtocols: Failed to get storage sections"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    protocolDetails = []
    for protocol in sortList( res['Value'] ):
      res = self._getConfigStorageProtocolDetails( storageName, protocol )
      if not res['OK']:
        return res
      protocolDetails.append( res['Value'] )
    self.protocols = self.localProtocols + self.remoteProtocols
    return S_OK( protocolDetails )

  def _getConfigStorageProtocolDetails( self, storageName, protocol ):
    """ Parse the contents of the protocol block """
    # First obtain the options that are available
    protocolConfigPath = '%s/%s/%s' % ( self.rootConfigPath, storageName, protocol )
    res = gConfig.getOptions( protocolConfigPath )
    if not res['OK']:
      errStr = "StorageFactory.__getProtocolDetails: Failed to get protocol options."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocol ) )
      return S_ERROR( errStr )

    # We must have certain values internally even if not supplied in CS
    protocolDict = { 'Access' : '', 'Host' : '', 'Path' : '', 'Port' : '',
                     'Protocol' : '', 'ProtocolName' : '', 'SpaceToken' : '', 'WSUrl' : '' }
    for option in res['Value']:
      configPath = '%s/%s' % ( protocolConfigPath, option )
      protocolDict[option] = gConfig.getValue( configPath, '' )

    # Evaluate the base path taking into account possible VO specific setting
    if self.vo:
      result = gConfig.getOptionsDict( cfgPath( protocolConfigPath, 'VOPath' ) )
      voPath = ''
      if result['OK']:
        voPath = result['Value'].get( self.vo, '' )
      if voPath:
        protocolDict['Path'] = voPath

    # Now update the local and remote protocol lists.
    # A warning will be given if the Access option is not set.
    if protocolDict['Access'] == 'remote':
      self.remoteProtocols.append( protocolDict['ProtocolName'] )
    elif protocolDict['Access'] == 'local':
      self.localProtocols.append( protocolDict['ProtocolName'] )
    else:
      errStr = "StorageFactory.__getProtocolDetails: The 'Access' option for %s:%s is neither 'local' or 'remote'." % ( storageName, protocol )
      gLogger.warn( errStr )

    # The ProtocolName option must be defined
    if not protocolDict['ProtocolName']:
      errStr = "StorageFactory.__getProtocolDetails: 'ProtocolName' option is not defined."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocol ) )
      return S_ERROR( errStr )
    return S_OK( protocolDict )

  ###########################################################################################
  #
  # Below is the method for obtaining the object instantiated for a provided storage configuration
  #

  def __generateStorageObject( self, storageName, protocolName, protocol, path = None,
                               host = None, port = None, spaceToken = None, wsUrl = None,
                               parameters = None ):
    """ Instantiate the storage plug-in module matching 'protocolName' (or 'Proxy' when
    self.proxy is set) searching every installed extension.

    :param str storageName: DIRAC SE name
    :param str protocolName: plug-in/protocol name, selects the %sStorage module
    :param dict parameters: extra parameters passed to storage.setParameters()
    """
    storageType = protocolName
    if self.proxy:
      storageType = 'Proxy'

    moduleRootPaths = getInstalledExtensions()
    moduleLoaded = False
    # BUGFIX: the original called path.rstrip('/') unconditionally, raising
    # AttributeError when the default (None) was used
    path = path.rstrip( '/' ) if path else ''
    if not path:
      path = '/'
    for moduleRootPath in moduleRootPaths:
      if moduleLoaded:
        break
      gLogger.debug( "Trying to load from root path %s" % moduleRootPath )
      moduleFile = os.path.join( rootPath, moduleRootPath, "Resources", "Storage", "%sStorage.py" % storageType )
      gLogger.debug( "Looking for file %s" % moduleFile )
      if not os.path.isfile( moduleFile ):
        continue
      try:
        # This enforces the convention that the plug-in must be named after the protocol
        moduleName = "%sStorage" % storageType
        storageModule = __import__( '%s.Resources.Storage.%s' % ( moduleRootPath, moduleName ),
                                    globals(), locals(), [moduleName] )
      except Exception as x:
        errStr = "StorageFactory._generateStorageObject: Failed to import %s: %s" % ( storageName, x )
        gLogger.exception( errStr )
        return S_ERROR( errStr )
      try:
        # getattr instead of eval: same effect without string evaluation
        storageClass = getattr( storageModule, moduleName )
        storage = storageClass( storageName, protocol, path, host, port, spaceToken, wsUrl )
        if not storage.isOK():
          errStr = "StorageFactory._generateStorageObject: Failed to instantiate storage plug in."
          gLogger.error( errStr, "%s" % moduleName )
          return S_ERROR( errStr )
      except Exception as x:
        errStr = "StorageFactory._generateStorageObject: Failed to instantiate %s(): %s" % ( moduleName, x )
        gLogger.exception( errStr )
        return S_ERROR( errStr )
      # BUGFIX: the flag was never set, leaving the loop condition dead and
      # 'storage' unbound (UnboundLocalError) when no module file was found
      moduleLoaded = True

    if not moduleLoaded:
      errStr = "StorageFactory._generateStorageObject: Failed to find storage module %sStorage" % storageType
      gLogger.error( errStr, storageName )
      return S_ERROR( errStr )

    # Set extra parameters if any
    if parameters:
      result = storage.setParameters( parameters )
      if not result['OK']:
        return result

    # If use proxy, keep the original protocol name
    if self.proxy:
      storage.protocolName = protocolName

    return S_OK( storage )
class StorageFactory:
  """ Factory building Storage plug-in objects for DIRAC Storage Elements.

      Configuration is read from the /Resources/StorageElements CS section;
      one Storage object is produced per protocol block of an SE.
  """

  def __init__( self, useProxy = False ):
    self.rootConfigPath = '/Resources/StorageElements'
    self.valid = True
    # Dead first assignment (self.proxy = False) removed: useProxy always won
    self.proxy = useProxy
    self.resourceStatus = ResourceStatus()

  ###########################################################################################
  #
  # Below are public methods for obtaining storage objects
  #

  def getStorageName( self, initialName ):
    """ Resolve a possibly aliased SE name to the real storage name. """
    return self._getConfigStorageName( initialName )

  def getStorage( self, parameterDict ):
    """ This instantiates a single storage for the details provided and doesn't check the CS.

        :param parameterDict: dict with mandatory 'StorageName' and 'ProtocolName'
                              plus optional Protocol/Port/Host/Path/SpaceToken/WSUrl
        :return: S_OK( storage object ) / S_ERROR
    """
    # The storage name must be supplied.
    if 'StorageName' in parameterDict:
      storageName = parameterDict['StorageName']
    else:
      errStr = "StorageFactory.getStorage: StorageName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    # ProtocolName must be supplied otherwise nothing will work.
    if 'ProtocolName' in parameterDict:
      protocolName = parameterDict['ProtocolName']
    else:
      errStr = "StorageFactory.getStorage: ProtocolName must be supplied"
      gLogger.error( errStr )
      return S_ERROR( errStr )

    # The other options need not always be specified; default to empty strings
    protocol = parameterDict.get( 'Protocol', '' )
    port = parameterDict.get( 'Port', '' )
    host = parameterDict.get( 'Host', '' )
    path = parameterDict.get( 'Path', '' )
    spaceToken = parameterDict.get( 'SpaceToken', '' )
    wsPath = parameterDict.get( 'WSUrl', '' )

    return self.__generateStorageObject( storageName, protocolName, protocol, path, host, port,
                                         spaceToken, wsPath, parameterDict )

  def getStorages( self, storageName, protocolList = None ):
    """ Get an instance of a Storage based on the DIRAC SE name based on the CS entries CS

        'storageName' is the DIRAC SE name i.e. 'CERN-RAW'
        'protocolList' is an optional list of protocols if a sub-set is desired i.e ['SRM2','SRM1']
    """
    self.remoteProtocols = []
    self.localProtocols = []
    self.name = ''
    self.options = {}
    self.protocolDetails = []
    self.storages = []
    # Mutable-default fix: the list is created per call, never shared
    if protocolList is None:
      protocolList = []

    # Get the name of the storage provided
    res = self._getConfigStorageName( storageName )
    if not res['OK']:
      self.valid = False
      return res
    storageName = res['Value']
    self.name = storageName

    # Get the options defined in the CS for this storage
    res = self._getConfigStorageOptions( storageName )
    if not res['OK']:
      self.valid = False
      return res
    self.options = res['Value']

    # Get the protocol specific details
    res = self._getConfigStorageProtocols( storageName )
    if not res['OK']:
      self.valid = False
      return res
    self.protocolDetails = res['Value']

    requestedLocalProtocols = []
    requestedRemoteProtocols = []
    requestedProtocolDetails = []
    turlProtocols = []
    # Generate the protocol specific plug-ins
    self.storages = []
    for protocolDict in self.protocolDetails:
      protocolName = protocolDict['ProtocolName']
      # Skip protocols not requested by the caller (empty list means "all")
      if protocolList and protocolName not in protocolList:
        continue
      protocol = protocolDict['Protocol']
      host = protocolDict['Host']
      path = protocolDict['Path']
      port = protocolDict['Port']
      spaceToken = protocolDict['SpaceToken']
      wsUrl = protocolDict['WSUrl']
      res = self.__generateStorageObject( storageName, protocolName, protocol,
                                          path = path, host = host, port = port,
                                          spaceToken = spaceToken, wsUrl = wsUrl,
                                          parameters = protocolDict )
      if res['OK']:
        self.storages.append( res['Value'] )
        if protocolName in self.localProtocols:
          turlProtocols.append( protocol )
          requestedLocalProtocols.append( protocolName )
        if protocolName in self.remoteProtocols:
          requestedRemoteProtocols.append( protocolName )
        requestedProtocolDetails.append( protocolDict )
      else:
        gLogger.info( res['Message'] )

    if len( self.storages ) > 0:
      resDict = {}
      resDict['StorageName'] = self.name
      resDict['StorageOptions'] = self.options
      resDict['StorageObjects'] = self.storages
      resDict['LocalProtocols'] = requestedLocalProtocols
      resDict['RemoteProtocols'] = requestedRemoteProtocols
      resDict['ProtocolOptions'] = requestedProtocolDetails
      resDict['TurlProtocols'] = turlProtocols
      return S_OK( resDict )
    else:
      errStr = "StorageFactory.getStorages: Failed to instantiate any storage protocols."
      gLogger.error( errStr, self.name )
      return S_ERROR( errStr )

  ###########################################################################################
  #
  # Below are internal methods for obtaining section/option/value configuration
  #

  def _getConfigStorageName( self, storageName ):
    """ This gets the name of the storage the configuration service.
        If the storage is an alias for another the resolution is performed.

        'storageName' is the storage section to check in the CS
    """
    configPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getOptions( configPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageName: Failed to get storage options"
      gLogger.error( errStr, res['Message'] )
      return S_ERROR( errStr )
    if not res['Value']:
      errStr = "StorageFactory._getConfigStorageName: Supplied storage doesn't exist."
      gLogger.error( errStr, configPath )
      return S_ERROR( errStr )
    # Follow a single level of aliasing if present
    if 'Alias' in res['Value']:
      configPath = '%s/%s/Alias' % ( self.rootConfigPath, storageName )
      resolvedName = gConfig.getValue( configPath )
    else:
      resolvedName = storageName
    return S_OK( resolvedName )

  def _getConfigStorageOptions( self, storageName ):
    """ Get the options associated to the StorageElement as defined in the CS """
    storageConfigPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getOptions( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage options."
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    options = res['Value']
    optionsDict = {}
    for option in options:
      # Access flags are resolved through ResourceStatus below, not read raw
      if option in [ 'ReadAccess', 'WriteAccess', 'CheckAccess', 'RemoveAccess']:
        continue
      optionConfigPath = '%s/%s' % ( storageConfigPath, option )
      optionsDict[option] = gConfig.getValue( optionConfigPath, '' )

    res = self.resourceStatus.getStorageElementStatus( storageName )
    if not res[ 'OK' ]:
      errStr = "StorageFactory._getStorageOptions: Failed to get storage status"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    # For safety, we did not add the ${statusType}Access keys
    # this requires modifications in the StorageElement class
    # We add the dictionary with the statusTypes and values
    # { 'statusType1' : 'status1', 'statusType2' : 'status2' ... }
    optionsDict.update( res[ 'Value' ][ storageName ] )
    return S_OK( optionsDict )

  def _getConfigStorageProtocols( self, storageName ):
    """ Protocol specific information is present as sections in the Storage configuration """
    storageConfigPath = '%s/%s' % ( self.rootConfigPath, storageName )
    res = gConfig.getSections( storageConfigPath )
    if not res['OK']:
      errStr = "StorageFactory._getConfigStorageProtocols: Failed to get storage sections"
      gLogger.error( errStr, "%s: %s" % ( storageName, res['Message'] ) )
      return S_ERROR( errStr )
    protocolSections = res['Value']
    sortedProtocols = sortList( protocolSections )
    protocolDetails = []
    for protocol in sortedProtocols:
      res = self._getConfigStorageProtocolDetails( storageName, protocol )
      if not res['OK']:
        return res
      protocolDetails.append( res['Value'] )
    # Side effect: local/remote lists were filled by _getConfigStorageProtocolDetails
    self.protocols = self.localProtocols + self.remoteProtocols
    return S_OK( protocolDetails )

  def _getConfigStorageProtocolDetails( self, storageName, protocol ):
    """ Parse the contents of the protocol block """
    # First obtain the options that are available
    protocolConfigPath = '%s/%s/%s' % ( self.rootConfigPath, storageName, protocol )
    res = gConfig.getOptions( protocolConfigPath )
    if not res['OK']:
      errStr = "StorageFactory.__getProtocolDetails: Failed to get protocol options."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocol ) )
      return S_ERROR( errStr )
    options = res['Value']

    # We must have certain values internally even if not supplied in CS
    protocolDict = {'Access':'', 'Host':'', 'Path':'', 'Port':'', 'Protocol':'', 'ProtocolName':'', 'SpaceToken':'', 'WSUrl':''}
    for option in options:
      configPath = '%s/%s' % ( protocolConfigPath, option )
      optionValue = gConfig.getValue( configPath, '' )
      protocolDict[option] = optionValue

    # Now update the local and remote protocol lists.
    # A warning will be given if the Access option is not set.
    if protocolDict['Access'] == 'remote':
      self.remoteProtocols.append( protocolDict['ProtocolName'] )
    elif protocolDict['Access'] == 'local':
      self.localProtocols.append( protocolDict['ProtocolName'] )
    else:
      errStr = "StorageFactory.__getProtocolDetails: The 'Access' option for %s:%s is neither 'local' or 'remote'." % ( storageName, protocol )
      gLogger.warn( errStr )

    # The ProtocolName option must be defined
    if not protocolDict['ProtocolName']:
      errStr = "StorageFactory.__getProtocolDetails: 'ProtocolName' option is not defined."
      gLogger.error( errStr, "%s: %s" % ( storageName, protocol ) )
      return S_ERROR( errStr )
    return S_OK( protocolDict )

  ###########################################################################################
  #
  # Below is the method for obtaining the object instantiated for a provided storage configuration
  #

  def __generateStorageObject( self, storageName, protocolName, protocol,
                               path = None, host = None, port = None,
                               spaceToken = None, wsUrl = None, parameters = None ):
    """ Instantiate the Storage plug-in object for one protocol of a storage element.

        The plug-in module is looked up as <protocolName>Storage.py under
        Resources/Storage of each installed extension; the first file found wins.

        :param parameters: optional dict of extra parameters applied via setParameters()
        :return: S_OK( storage object ) / S_ERROR
    """
    storageType = protocolName
    # When proxying, the generic Proxy plug-in replaces the protocol-specific one
    if self.proxy:
      storageType = 'Proxy'

    moduleRootPaths = getInstalledExtensions()
    moduleLoaded = False
    # Guard against a None path (the parameter default) before normalising it
    path = path.rstrip( '/' ) if path else ''
    if not path:
      path = '/'
    for moduleRootPath in moduleRootPaths:
      if moduleLoaded:
        break
      gLogger.verbose( "Trying to load from root path %s" % moduleRootPath )
      moduleFile = os.path.join( rootPath, moduleRootPath, "Resources", "Storage", "%sStorage.py" % storageType )
      gLogger.verbose( "Looking for file %s" % moduleFile )
      if not os.path.isfile( moduleFile ):
        continue
      try:
        # This enforces the convention that the plug-in must be named after the protocol
        moduleName = "%sStorage" % ( storageType )
        storageModule = __import__( '%s.Resources.Storage.%s' % ( moduleRootPath, moduleName ),
                                    globals(), locals(), [moduleName] )
      except Exception as x:
        errStr = "StorageFactory._generateStorageObject: Failed to import %s: %s" % ( storageName, x )
        gLogger.exception( errStr )
        return S_ERROR( errStr )

      try:
        # getattr + direct call replaces the original eval() string — identical
        # behaviour without evaluating a constructed source string
        storageClass = getattr( storageModule, moduleName )
        storage = storageClass( storageName, protocol, path, host, port, spaceToken, wsUrl )
        if not storage.isOK():
          errStr = "StorageFactory._generateStorageObject: Failed to instantiate storage plug in."
          gLogger.error( errStr, "%s" % ( moduleName ) )
          return S_ERROR( errStr )
      except Exception as x:
        errStr = "StorageFactory._generateStorageObject: Failed to instantiate %s(): %s" % ( moduleName, x )
        gLogger.exception( errStr )
        return S_ERROR( errStr )

      # Set extra parameters if any
      if parameters:
        result = storage.setParameters( parameters )
        if not result['OK']:
          return result

      # If use proxy, keep the original protocol name
      if self.proxy:
        storage.protocolName = protocolName

      return S_OK( storage )

    # No plug-in module was found in any extension: return an explicit error
    # instead of falling off the end (which returned None and crashed callers)
    errStr = "StorageFactory._generateStorageObject: Failed to find %s module" % ( "%sStorage" % storageType )
    gLogger.error( errStr, storageName )
    return S_ERROR( errStr )
class InputDataAgent(OptimizerModule):
    """
        The specific Optimizer must provide the following methods:
        - initializeOptimizer() before each execution cycle
        - checkJob() - the main method called for each job
    """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for JobSanityAgent.
        """
        self.failedMinorStatus = self.am_getOption('/FailedJobStatus', 'Input Data Not Available')
        # this will ignore failover SE files
        self.checkFileMetadata = self.am_getOption('CheckFileMetadata', True)

        self.dataManager = DataManager()
        self.resourceStatus = ResourceStatus()
        self.fc = FileCatalog()

        # SE -> sites cache, refreshed every self.cacheLength seconds
        self.seToSiteMapping = {}
        self.lastCScheck = 0
        self.cacheLength = 600

        return S_OK()

    #############################################################################
    def checkJob(self, job, classAdJob):
        """ This method does the optimization corresponding to this Agent,
            it is call for each job by the Optimizer framework
        """
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobdB for %s' % (job))
            self.log.warn(result['Message'])
            return result
        if not result['Value']:
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.setNextOptimizer(job)

        # Check if we already executed this Optimizer and the input data is resolved
        res = self.getOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'))
        # Inverted the original "if ok: pass / else:" construct: resolve only
        # when no previous resolution is stored
        if not (res['OK'] and res['Value']):
            self.log.verbose('Job %s has an input data requirement and will be processed' % (job))
            inputData = result['Value']
            result = self.__resolveInputData(job, inputData)
            if not result['OK']:
                self.log.warn(result['Message'])
                return result

        return self.setNextOptimizer(job)

    #############################################################################
    def __resolveInputData(self, job, inputData):
        """This method checks the file catalog for replica information.
        """
        lfns = [fname.replace('LFN:', '') for fname in inputData]

        start = time.time()
        # In order to place jobs on Hold if a certain SE is banned we need first to check first if
        # if the replicas are really available
        replicas = self.dataManager.getActiveReplicas(lfns)
        timing = time.time() - start
        self.log.verbose('Catalog Replicas Lookup Time: %.2f seconds ' % (timing))
        if not replicas['OK']:
            self.log.warn(replicas['Message'])
            return replicas
        replicaDict = replicas['Value']

        siteCandidates = self.__checkReplicas(job, replicaDict)
        if not siteCandidates['OK']:
            self.log.warn(siteCandidates['Message'])
            return siteCandidates

        if self.checkFileMetadata:
            guids = True
            start = time.time()
            guidDict = self.fc.getFileMetadata(lfns)
            timing = time.time() - start
            self.log.info('Catalog Metadata Lookup Time: %.2f seconds ' % (timing))

            if not guidDict['OK']:
                self.log.warn(guidDict['Message'])
                guids = False
            else:
                # Only inspect 'Value' on success: the original read
                # guidDict['Value']['Failed'] unconditionally and could KeyError
                failed = guidDict['Value']['Failed']
                if failed:
                    self.log.warn('Failed to establish some GUIDs')
                    self.log.warn(failed)
                    guids = False

            if guids:
                # Merge replica info into the metadata result and use it as the payload
                for lfn, reps in replicaDict['Successful'].items():
                    guidDict['Value']['Successful'][lfn].update(reps)
                replicas = guidDict

        resolvedData = {}
        resolvedData['Value'] = replicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __checkReplicas(self, job, replicaDict):
        """Check that all input lfns have valid replicas and can all be found at least in one single site.
        """
        badLFNs = []

        if 'Successful' in replicaDict:
            for lfn, reps in replicaDict['Successful'].items():
                if not reps:
                    badLFNs.append('LFN:%s Problem: No replicas available' % (lfn))
        else:
            return S_ERROR('No replica Info available')

        if 'Failed' in replicaDict:
            for lfn, cause in replicaDict['Failed'].items():
                badLFNs.append('LFN:%s Problem: %s' % (lfn, cause))

        if badLFNs:
            self.log.info('Found %s problematic LFN(s) for job %s' % (len(badLFNs), job))
            param = '\n'.join(badLFNs)
            self.log.info(param)
            result = self.setJobParam(job, self.am_getModuleParam('optimizerName'), param)
            if not result['OK']:
                self.log.error(result['Message'])
            return S_ERROR('Input Data Not Available')

        return self.__getSiteCandidates(replicaDict['Successful'])

    #############################################################################
    # FIXME: right now this is unused...
    def __checkActiveSEs(self, job, replicaDict):
        """ Check active SE and replicas and identify possible Site candidates for
            the execution of the job
        """
        # Now let's check if some replicas might not be available due to banned SE's
        activeReplicas = self.dataManager.checkActiveReplicas(replicaDict)
        if not activeReplicas['OK']:
            # due to banned SE's input data might no be available
            msg = "On Hold: Missing replicas due to banned SE"
            self.log.info(msg)
            self.log.warn(activeReplicas['Message'])
            return S_ERROR(msg)

        activeReplicaDict = activeReplicas['Value']

        siteCandidates = self.__checkReplicas(job, activeReplicaDict)
        if not siteCandidates['OK']:
            # due to a banned SE's input data is not available at a single site
            msg = "On Hold: Input data not Available due to banned SE"
            self.log.info(msg)
            self.log.warn(siteCandidates['Message'])
            return S_ERROR(msg)

        resolvedData = {}
        resolvedData['Value'] = activeReplicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __getSitesForSE(self, se):
        """ Returns a list of sites having the given SE as a local one.
            Uses the local cache of the site-se information
        """
        # Empty the cache if too old
        if (time.time() - self.lastCScheck) > self.cacheLength:
            self.log.verbose('Resetting the SE to site mapping cache')
            self.seToSiteMapping = {}
            self.lastCScheck = time.time()

        if se not in self.seToSiteMapping:
            sites = getSitesForSE(se)
            if sites['OK']:
                self.seToSiteMapping[se] = list(sites['Value'])
            return sites
        return S_OK(self.seToSiteMapping[se])

    #############################################################################
    def __getSiteCandidates(self, inputData):
        """This method returns a list of possible site candidates based on the
           job input data requirement.  For each site candidate, the number of files
           on disk and tape is resolved.
        """
        # Map every LFN to the unique list of sites that can serve one of its replicas
        fileSEs = {}
        for lfn, replicas in inputData.items():
            siteList = []
            for se in replicas.keys():
                sites = self.__getSitesForSE(se)
                if sites['OK']:
                    siteList += sites['Value']
            fileSEs[lfn] = uniqueElements(siteList)

        # Intersect the per-file site lists: a candidate must hold every file
        siteCandidates = []
        i = 0
        for _fileName, sites in fileSEs.items():
            if not i:
                siteCandidates = sites
            else:
                tempSite = []
                for site in siteCandidates:
                    if site in sites:
                        tempSite.append(site)
                siteCandidates = tempSite
            i += 1

        if not siteCandidates:
            return S_ERROR('No candidate sites available')

        # In addition, check number of files on tape and disk for each site
        # for optimizations during scheduling
        siteResult = {}
        for site in siteCandidates:
            siteResult[site] = {'disk': [], 'tape': []}

        seDict = {}
        for lfn, replicas in inputData.items():
            for se in replicas.keys():
                if se not in seDict:
                    sites = self.__getSitesForSE(se)
                    if not sites['OK']:
                        continue
                    try:
                        # storageElement = StorageElement( se )
                        result = self.resourceStatus.getStorageElementStatus(se, statusType='ReadAccess')
                        if not result['OK']:
                            continue
                        seDict[se] = {'Sites': sites['Value'], 'SEParams': result['Value'][se]}
                        result = getStorageElementOptions(se)
                        if not result['OK']:
                            continue
                        seDict[se]['SEParams'].update(result['Value'])
                    except Exception:
                        self.log.exception('Failed to instantiate StorageElement( %s )' % se)
                        continue

                for site in seDict[se]['Sites']:
                    if site in siteCandidates:
                        # A file counts as "disk" wherever a readable disk SE holds it,
                        # and that supersedes any earlier "tape" classification
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
                            if lfn not in siteResult[site]['disk']:
                                siteResult[site]['disk'].append(lfn)
                                if lfn in siteResult[site]['tape']:
                                    siteResult[site]['tape'].remove(lfn)
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
                            if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                                siteResult[site]['tape'].append(lfn)

        # Collapse the LFN lists into counts for the scheduler
        for site in siteResult:
            siteResult[site]['disk'] = len(siteResult[site]['disk'])
            siteResult[site]['tape'] = len(siteResult[site]['tape'])
        return S_OK(siteResult)