def __init__(self):
    """Standard constructor.

    Reads the CPU scaling/normalization factors and the CPU / wall-clock
    margins from the local configuration, then loads the batch-system
    plugin used to query resource usage.
    """
    self.log = gLogger.getSubLogger('TimeLeft')
    # This is the ratio SpecInt published by the site over 250
    # (the reference used for Matching).
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 0.0)
    if not self.scaleFactor:
        self.log.warn('/LocalSite/CPUScalingFactor not defined for site %s' % DIRAC.siteName())
    self.normFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    if not self.normFactor:
        self.log.warn('/LocalSite/CPUNormalizationFactor not defined for site %s' % DIRAC.siteName())
    # CPU and wall clock margins (percent), which don't seem to be set anywhere.
    self.cpuMargin = gConfig.getValue('/LocalSite/CPUMargin', 2)
    self.wallClockMargin = gConfig.getValue('/LocalSite/wallClockMargin', 8)
    res = self.__getBatchSystemPlugin()
    self.batchPlugin = res['Value'] if res['OK'] else None
    if not res['OK']:
        # Keep the failure message so getTimeLeft() can report it later.
        self.batchError = res['Message']
def getTimeLeft( self, cpuConsumed ):
    """Returns the CPU Time Left for supported batch systems.
       The CPUConsumed is the current raw total CPU.

    :param cpuConsumed: current raw total CPU (kept for interface
        compatibility; the current formula uses the batch limits instead)
    :return: S_OK(time left in normalized units) or S_ERROR
    """
    #Quit if no scale factor available
    if not self.scaleFactor:
        return S_ERROR( '/LocalSite/CPUScalingFactor not defined for site %s' % DIRAC.siteName() )
    # No plugin means the batch system could not be determined at construction
    if not self.batchPlugin:
        return S_ERROR( self.batchError )
    resourceDict = self.batchPlugin.getResourceUsage()
    if not resourceDict['OK']:
        self.log.warn( 'Could not determine timeleft for batch system at site %s' % DIRAC.siteName() )
        return resourceDict
    resources = resourceDict['Value']
    self.log.verbose( resources )
    if not resources['CPULimit'] or not resources['WallClockLimit']:
        return S_ERROR( 'No CPU / WallClock limits obtained' )
    # Percentage of the CPU and wall-clock budgets already consumed
    cpuFactor = 100 * float( resources['CPU'] ) / float( resources['CPULimit'] )
    cpuRemaining = 100 - cpuFactor
    cpuLimit = float( resources['CPULimit'] )
    wcFactor = 100 * float( resources['WallClock'] ) / float( resources['WallClockLimit'] )
    wcRemaining = 100 - wcFactor
    wcLimit = float( resources['WallClockLimit'] )  # NOTE(review): unused below
    self.log.verbose( 'Used CPU is %.02f, Used WallClock is %.02f.' % ( cpuFactor, wcFactor ) )
    self.log.verbose( 'Remaining WallClock %.02f, Remaining CPU %.02f, margin %s' % ( wcRemaining, cpuRemaining, self.cpuMargin ) )
    timeLeft = None
    if wcRemaining > cpuRemaining and ( wcRemaining - cpuRemaining ) > self.cpuMargin:
        # In some cases cpuFactor might be 0
        # timeLeft = float(cpuConsumed*self.scaleFactor*cpuRemaining/cpuFactor)
        # We need time left in the same units used by the Matching
        timeLeft = float( cpuRemaining * cpuLimit / 100 * self.scaleFactor )
        self.log.verbose( 'Remaining WallClock %.02f > Remaining CPU %.02f and difference > margin %s' % ( wcRemaining, cpuRemaining, self.cpuMargin ) )
    else:
        if cpuRemaining > self.cpuMargin and wcRemaining > self.cpuMargin:
            self.log.verbose( 'Remaining WallClock %.02f and Remaining CPU %.02f both > margin %s' % ( wcRemaining, cpuRemaining, self.cpuMargin ) )
            # In some cases cpuFactor might be 0
            # timeLeft = float(cpuConsumed*self.scaleFactor*(wcRemaining-self.cpuMargin)/cpuFactor)
            timeLeft = float( cpuRemaining * cpuLimit / 100 * self.scaleFactor )
        else:
            self.log.verbose( 'Remaining CPU %.02f < margin %s and WallClock %.02f < margin %s so no time left' % ( cpuRemaining, self.cpuMargin, wcRemaining, self.cpuMargin ) )
    if timeLeft:
        self.log.verbose( 'Remaining CPU in normalized units is: %.02f' % timeLeft )
        return S_OK( timeLeft )
    else:
        return S_ERROR( 'No time left for slot' )
def __init__(self):
    """Define the accounting record layout for jobs.

    Declares the key fields, the numeric accounting fields, the bucket
    granularity per record age, and pre-fills the 'Site' key with the
    local DIRAC site name.
    """
    BaseAccountingType.__init__(self)
    self.definitionKeyFields = [('User', 'VARCHAR(32)'),
                                ('UserGroup', 'VARCHAR(32)'),
                                ('JobGroup', "VARCHAR(64)"),
                                ('JobType', 'VARCHAR(32)'),
                                ('JobClass', 'VARCHAR(32)'),
                                ('ProcessingType', 'VARCHAR(256)'),
                                ('Site', 'VARCHAR(32)'),
                                ('FinalMajorStatus', 'VARCHAR(32)'),
                                ('FinalMinorStatus', 'VARCHAR(256)')]
    self.definitionAccountingFields = [('CPUTime', "INT UNSIGNED"),
                                       ('NormCPUTime', "INT UNSIGNED"),
                                       ('ExecTime', "INT UNSIGNED"),
                                       ('InputDataSize', 'BIGINT UNSIGNED'),
                                       ('OutputDataSize', 'BIGINT UNSIGNED'),
                                       ('InputDataFiles', 'INT UNSIGNED'),
                                       ('OutputDataFiles', 'INT UNSIGNED'),
                                       ('DiskSpace', 'BIGINT UNSIGNED'),
                                       ('InputSandBoxSize', 'BIGINT UNSIGNED'),
                                       ('OutputSandBoxSize', 'BIGINT UNSIGNED'),
                                       ('ProcessedEvents', 'INT UNSIGNED')]
    # (max record age in seconds, bucket length in seconds)
    self.bucketsLength = [(86400 * 8, 3600),  # <1w+1d = 1h
                          (86400 * 35, 3600 * 4),  # <35d = 4h
                          (86400 * 30 * 6, 86400),  # <6m = 1d
                          (86400 * 365, 86400 * 2),  # <1y = 2d
                          (86400 * 600, 604800),  # >1y = 1w
                          ]
    self.checkType()
    # Fill the site
    self.setValueByKey("Site", DIRAC.siteName())
def __findServiceURL( self ):
    """Resolve the destination service to a concrete URL, routing the
    request through a site gateway when one is configured in the CS and
    not explicitly ignored via the KW_IGNORE_GATEWAYS keyword argument.

    :return: S_OK(url) on the paths visible here, the stored init error,
        or S_ERROR when the CS lookup of the service URL fails
    """
    if not self.__initStatus[ 'OK' ]:
        return self.__initStatus
    gatewayURL = False
    # Unless gateways are explicitly ignored, pick one at random from the
    # CS list for this site and keep only its protocol://host:port part.
    if self.KW_IGNORE_GATEWAYS not in self.kwargs or not self.kwargs[ self.KW_IGNORE_GATEWAYS ]:
        dRetVal = gConfig.getOption( "/DIRAC/Gateways/%s" % DIRAC.siteName() )
        if dRetVal[ 'OK' ]:
            rawGatewayURL = List.randomize( List.fromChar( dRetVal[ 'Value'], "," ) )[0]
            gatewayURL = "/".join( rawGatewayURL.split( "/" )[:3] )
    # If the destination is already a full URL with a known protocol, use it
    # directly (rewritten through the gateway when one was found).
    for protocol in gProtocolDict.keys():
        if self._destinationSrv.find( "%s://" % protocol ) == 0:
            gLogger.debug( "Already given a valid url", self._destinationSrv )
            if not gatewayURL:
                return S_OK( self._destinationSrv )
            gLogger.debug( "Reconstructing given URL to pass through gateway" )
            path = "/".join( self._destinationSrv.split( "/" )[3:] )
            finalURL = "%s/%s" % ( gatewayURL, path )
            gLogger.debug( "Gateway URL conversion:\n %s -> %s" % ( self._destinationSrv, finalURL ) )
            return S_OK( finalURL )
    if gatewayURL:
        gLogger.debug( "Using gateway", gatewayURL )
        return S_OK( "%s/%s" % ( gatewayURL, self._destinationSrv ) )
    # Otherwise resolve the System/Service name to a URL via the CS.
    try:
        urls = getServiceURL( self._destinationSrv, setup = self.setup )
    except Exception, e:
        return S_ERROR( "Cannot get URL for %s in setup %s: %s" % ( self._destinationSrv, self.setup, str( e ) ) )
    # NOTE(review): no return of 'urls' is visible in this excerpt; the
    # success path presumably continues beyond it -- confirm in full source.
def __init__(self):
    """Standard constructor: initialise all OverlayInput parameters to defaults."""
    super(OverlayInput, self).__init__()
    self.enable = True
    self.STEP_NUMBER = ''
    self.log = gLogger.getSubLogger( "OverlayInput" )
    self.applicationName = 'OverlayInput'
    self.curdir = os.getcwd()
    self.applicationLog = ''
    self.printoutflag = ''
    self.prodid = 0
    self.detector = '' ##needed for backward compatibility
    self.detectormodel = ""
    self.energytouse = ''
    self.energy = 0
    # Number of background events stored per overlay file
    self.nbofeventsperfile = 100
    self.lfns = []
    self.nbfilestoget = 0
    # Background event type; presumably gamma-gamma -> hadrons ('gghad') -- confirm
    self.BkgEvtType = 'gghad'
    self.metaEventType = self.BkgEvtType
    # Number of bunch crossings to overlay (0 = disabled)
    self.BXOverlay = 0
    self.ggtohadint = 3.2
    self.nbsigeventsperfile = 0
    self.nbinputsigfile = 1
    self.NbSigEvtsPerJob = 0
    self.datMan = DataManager()
    self.fcc = FileCatalogClient()
    self.site = DIRAC.siteName()
    self.useEnergyForFileLookup = True
    self.machine = 'clic_cdr'
    self.pathToOverlayFiles = ''
    self.processorName = ''
def _getSEList( self, SEType = 'ProductionOutputs', DataType = 'SimtelProd' ):
    """ Get from the CS the list of available SEs for data upload.

    Local SEs (those registered for the current site) are moved to the
    front of the randomized list so they are tried first.

    :param SEType: CS section holding the SE lists (e.g. 'ProductionOutputs')
    :param DataType: CS option within that section (e.g. 'SimtelProd')
    :return: S_OK(list of SE names) or S_ERROR when the list is empty
    """
    opsHelper = Operations()
    # NOTE(review): os.path.join is used to build a CS option path; this
    # assumes a POSIX-style separator -- confirm it is intended.
    optionName = os.path.join( SEType, DataType )
    SEList = opsHelper.getValue( optionName , [] )
    SEList = List.randomize( SEList )
    DIRAC.gLogger.notice( 'List of %s SE: %s ' % ( SEType, SEList ) )
    # Check if the local SE is in the list. If yes try it first by moving
    # local SEs to the head of the list.
    localSEList = []
    res = getSEsForSite( DIRAC.siteName() )
    if res['OK']:
        localSEList = res['Value']
    retainedlocalSEList = []
    for localSE in localSEList:
        if localSE in SEList:
            DIRAC.gLogger.notice( 'The local Storage Element is an available SE: ', localSE )
            retainedlocalSEList.append( localSE )
            SEList.remove( localSE )
    SEList = retainedlocalSEList + SEList
    # Idiomatic emptiness test (was: len(SEList) == 0)
    if not SEList:
        return DIRAC.S_ERROR( 'Error in building SEList' )
    return DIRAC.S_OK( SEList )
def jobexec(jobxml, wfParameters):
    """Load the workflow described by *jobxml*, attach the standard tools,
    propagate the command-line parameters to the workflow and its module
    instances, then execute it.
    """
    jobfile = os.path.abspath(jobxml)
    if not os.path.exists(jobfile):
        gLogger.warn('Path to specified workflow %s does not exist' % (jobfile))
        sys.exit(1)
    workflow = fromXMLFile(jobfile)
    gLogger.debug(workflow)
    generated = workflow.createCode()
    gLogger.debug(generated)
    # The JOBID variable is set by the pilot when running a real job.
    jobID = os.environ.get('JOBID', 0)
    gLogger.info('DIRAC JobID %s is running at site %s' % (jobID, DIRAC.siteName()))
    workflow.addTool('JobReport', JobReport(jobID))
    workflow.addTool('AccountingReport', DataStoreClient())
    workflow.addTool('Request', Request())
    # Propagate the command line parameters to the workflow if any
    for name, value in wfParameters.items():
        workflow.setValue(name, value)
    # ... and to the workflow module instances of each step
    for step in workflow.step_definitions.itervalues():
        for module in step.module_instances:
            for name, value in wfParameters.iteritems():
                if module.parameters.find(name):
                    module.parameters.setValue(name, value)
    return workflow.execute()
def jobexec(jobxml, wfParameters):
    """Load the workflow described by *jobxml*, attach the standard tools,
    propagate the command-line parameters to the workflow and its module
    instances, then execute it.
    """
    jobfile = os.path.abspath(jobxml)
    if not os.path.exists(jobfile):
        gLogger.warn('Path to specified workflow %s does not exist' % (jobfile))
        sys.exit(1)
    workflow = fromXMLFile(jobfile)
    gLogger.debug(workflow)
    generated = workflow.createCode()
    gLogger.debug(generated)
    # The JOBID variable is set by the pilot when running a real job.
    jobID = os.environ.get('JOBID', 0)
    gLogger.info('DIRAC JobID %s is running at site %s' % (jobID, DIRAC.siteName()))
    workflow.addTool('JobReport', JobReport(jobID))
    workflow.addTool('AccountingReport', DataStoreClient())
    workflow.addTool('Request', Request())
    # Propagate the command line parameters to the workflow if any
    for name, value in wfParameters.items():
        workflow.setValue(name, value)
    # ... and to the workflow module instances of each step
    for step in workflow.step_definitions.values():
        for module in step.module_instances:
            for name, value in wfParameters.items():
                if module.parameters.find(name):
                    module.parameters.setValue(name, value)
    return workflow.execute()
def jobexec(jobxml, wfParameters=None):
    """Load the workflow described by *jobxml*, attach the standard tools,
    propagate the command-line parameters, then execute it.

    :param jobxml: path to the workflow XML description
    :param wfParameters: optional dict of workflow parameters
    :return: result of workflow.execute()
    """
    # Avoid a mutable default argument ({}): the default dict would be
    # shared across calls.  None keeps the call signature compatible.
    if wfParameters is None:
        wfParameters = {}
    jobfile = os.path.abspath(jobxml)
    if not os.path.exists(jobfile):
        gLogger.warn('Path to specified workflow %s does not exist' % (jobfile))
        sys.exit(1)
    workflow = fromXMLFile(jobfile)
    gLogger.debug(workflow)
    code = workflow.createCode()
    gLogger.debug(code)
    jobID = 0
    if 'JOBID' in os.environ:  # 'in' replaces deprecated dict.has_key()
        jobID = os.environ['JOBID']
        gLogger.info('DIRAC JobID %s is running at site %s' % (jobID, DIRAC.siteName()))
    workflow.addTool('JobReport', JobReport(jobID))
    workflow.addTool('AccountingReport', DataStoreClient())
    workflow.addTool('Request', RequestContainer())
    # Propagate the command line parameters to the workflow if any
    for name, value in wfParameters.items():
        workflow.setValue(name, value)
    result = workflow.execute()
    return result
def __init__(self):
    """Define the accounting record layout for data operations.

    Declares the key fields, the numeric accounting fields, the bucket
    granularity per record age, and pre-fills the 'ExecutionSite' key.
    """
    BaseAccountingType.__init__(self)
    self.definitionKeyFields = [('OperationType', "VARCHAR(32)"),
                                ('User', "VARCHAR(32)"),
                                ('ExecutionSite', 'VARCHAR(32)'),
                                ('Source', 'VARCHAR(32)'),
                                ('Destination', 'VARCHAR(32)'),
                                ('Protocol', 'VARCHAR(32)'),
                                ('FinalStatus', 'VARCHAR(32)')]
    self.definitionAccountingFields = [('TransferSize', 'BIGINT UNSIGNED'),
                                       ('TransferTime', 'FLOAT'),
                                       ('RegistrationTime', 'FLOAT'),
                                       ('TransferOK', 'INT UNSIGNED'),
                                       ('TransferTotal', 'INT UNSIGNED'),
                                       ('RegistrationOK', 'INT UNSIGNED'),
                                       ('RegistrationTotal', 'INT UNSIGNED')]
    # (max record age in seconds, bucket length in seconds)
    self.bucketsLength = [(172800, 900),  #<2d = 15m
                          (604800, 3600),  #<1w = 1h
                          (15552000, 86400),  #>1w <6m = 1d
                          (31104000, 604800),  #>6m = 1w
                          ]
    self.checkType()
    self.setValueByKey('ExecutionSite', DIRAC.siteName())
def __getBatchSystemPlugin( self ):
    """Using the name of the batch system plugin, will return an instance of the plugin class.

    Detection is based on batch-system-specific environment variables.

    :return: S_ERROR on failure (see review note below about success path)
    """
    # Map of supported batch systems to the environment variable that
    # identifies them on a worker node.
    batchSystems = {'LSF':'LSB_JOBID', 'PBS':'PBS_JOBID', 'BQS':'QSUB_REQNAME'} #more to be added later
    name = None
    for batchSystem, envVar in batchSystems.items():
        if os.environ.has_key( envVar ):
            name = batchSystem
            break
    if name == None:
        self.log.warn( 'Batch system type for site %s is not currently supported' % DIRAC.siteName() )
        return S_ERROR( 'Current batch system is not supported' )
    self.log.debug( 'Creating plugin for %s batch system' % ( name ) )
    try:
        batchSystemName = "%sTimeLeft" % ( name )
        batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName] )
    except Exception, x:
        msg = 'Could not import DIRAC.Core.Utilities.TimeLeft.%s' % ( batchSystemName )
        self.log.warn( x )
        self.log.warn( msg )
        return S_ERROR( msg )
    # NOTE(review): no S_OK return after a successful import is visible in
    # this excerpt; instantiation/return presumably follows -- confirm.
def getLocationOrderedCatalogs( siteName = '' ):
    """Return the active catalog URLs ordered so the catalog of the Tier1
    associated with the (current) site comes first, the rest randomized.

    :param siteName: site to resolve; defaults to the local DIRAC site
    :return: S_OK(list of catalog URLs) or a failed result from the lookups
    """
    # First get a list of the active catalogs and their location
    res = getActiveCatalogs()
    if not res['OK']:
        gLogger.error( "Failed to get list of active catalogs", res['Message'] )
        return res
    catalogDict = res['Value']
    # Get the tier1 associated to the current location
    if not siteName:
        import DIRAC
        siteName = DIRAC.siteName()
    countryCode = siteName.split( '.' )[-1]
    res = getCountryMappingTier1( countryCode )
    if not res['OK']:
        gLogger.error( "Failed to resolve closest Tier1", res['Message'] )
        return res
    tier1 = res['Value']
    # Create a sorted list of the active readonly catalogs
    catalogList = []
    if tier1 in catalogDict:  # 'in' replaces deprecated dict.has_key()
        # pop() combines the lookup and the removal in one step
        catalogList.append( catalogDict.pop( tier1 ) )
    catalogList.extend( randomize( catalogDict.values() ) )
    return S_OK( catalogList )
def getLocationOrderedCatalogs(siteName=''):
    """Return the active catalog URLs ordered so the catalog of the Tier1
    associated with the (current) site comes first, the rest randomized.

    :param siteName: site to resolve; defaults to the local DIRAC site
    :return: S_OK(list of catalog URLs) or a failed result from the lookups
    """
    # First get a list of the active catalogs and their location
    res = getActiveCatalogs()
    if not res['OK']:
        gLogger.error("Failed to get list of active catalogs", res['Message'])
        return res
    catalogDict = res['Value']
    # Get the tier1 associated to the current location
    if not siteName:
        import DIRAC
        siteName = DIRAC.siteName()
    countryCode = siteName.split('.')[-1]
    res = getCountryMappingTier1(countryCode)
    if not res['OK']:
        gLogger.error("Failed to resolve closest Tier1", res['Message'])
        return res
    tier1 = res['Value']
    # Create a sorted list of the active readonly catalogs
    catalogList = []
    if tier1 in catalogDict:  # 'in' replaces deprecated dict.has_key()
        # pop() combines the lookup and the removal in one step
        catalogList.append(catalogDict.pop(tier1))
    catalogList.extend(randomize(catalogDict.values()))
    return S_OK(catalogList)
def __init__( self ):
    """Define the accounting record layout for data operations.

    Declares the key fields, the numeric accounting fields, the bucket
    granularity per record age, and pre-fills the 'ExecutionSite' key.
    """
    BaseAccountingType.__init__( self )
    self.definitionKeyFields = [ ( 'OperationType' , "VARCHAR(32)" ),
                                 ( 'User', "VARCHAR(32)" ),
                                 ( 'ExecutionSite', 'VARCHAR(32)' ),
                                 ( 'Source', 'VARCHAR(32)' ),
                                 ( 'Destination', 'VARCHAR(32)' ),
                                 ( 'Protocol', 'VARCHAR(32)' ),
                                 ( 'FinalStatus', 'VARCHAR(32)' ) ]
    self.definitionAccountingFields = [ ( 'TransferSize', 'BIGINT UNSIGNED' ),
                                        ( 'TransferTime', 'FLOAT' ),
                                        ( 'RegistrationTime', 'FLOAT' ),
                                        ( 'TransferOK', 'INT UNSIGNED' ),
                                        ( 'TransferTotal', 'INT UNSIGNED' ),
                                        ( 'RegistrationOK', 'INT UNSIGNED' ),
                                        ( 'RegistrationTotal', 'INT UNSIGNED' ) ]
    # (max record age in seconds, bucket length in seconds)
    self.bucketsLength = [ ( 86400 * 3, 900 ), #<3d = 15m
                           ( 86400 * 8, 3600 ), #<1w+1d = 1h
                           ( 15552000, 86400 ), #>1w+1d <6m = 1d
                           ( 31104000, 604800 ), #>6m = 1w
                           ]
    self.checkType()
    self.setValueByKey( 'ExecutionSite', DIRAC.siteName() )
def initialize(self, systemName, cfgPath):
    """Configure this logger from the CS section *cfgPath*.

    Sets up the backend options, the output backends, the verbosity
    level, the calling-frame display and the system name.  Only the
    first call has an effect.

    :param systemName: name used to tag log records
    :param cfgPath: CS path of the logging configuration section
    """
    if self.__initialized:
        return
    self.__initialized = True
    from DIRAC.ConfigurationSystem.Client.Config import gConfig
    from os import getpid
    # Get the options for the different output backends
    retDict = gConfig.getOptionsDict("%s/BackendsOptions" % cfgPath)
    if not retDict['OK']:
        # Sensible defaults when the CS section is absent
        cfgBackOptsDict = {
            'FileName': 'Dirac-log_%s.log' % getpid(),
            'Interactive': True,
            'SleepTime': 150
        }
    else:
        cfgBackOptsDict = retDict['Value']
    self.__backendOptions.update(cfgBackOptsDict)
    if 'FileName' not in self.__backendOptions:
        self.__backendOptions['FileName'] = 'Dirac-log_%s.log' % getpid()
    sleepTime = 150
    try:
        sleepTime = int(self.__backendOptions['SleepTime'])
    except (KeyError, ValueError, TypeError):
        # Missing or malformed option keeps the default.  This used to be
        # a bare 'except:', which also swallowed SystemExit/KeyboardInterrupt.
        pass
    self.__backendOptions['SleepTime'] = sleepTime
    self.__backendOptions['Interactive'] = gConfig.getValue(
        "%s/BackendsOptions/Interactive" % cfgPath, True)
    self.__backendOptions['Site'] = DIRAC.siteName()
    self.__backendOptions['Color'] = gConfig.getValue(
        "%s/LogColor" % cfgPath, False)
    # Configure outputs
    desiredBackends = gConfig.getValue("%s/LogBackends" % cfgPath, 'stdout')
    self.registerBackends(List.fromChar(desiredBackends))
    # Configure verbosity
    defaultLevel = Logger.defaultLogLevel
    if "Scripts" in cfgPath:
        defaultLevel = gConfig.getValue('/Systems/Scripts/LogLevel',
                                        Logger.defaultLogLevel)
    self.setLevel(gConfig.getValue("%s/LogLevel" % cfgPath, defaultLevel))
    # Configure framing
    self._showCallingFrame = gConfig.getValue("%s/LogShowLine" % cfgPath,
                                              self._showCallingFrame)
    # Get system name
    self._systemName = str(systemName)
    if not self.__backendOptions['Interactive']:
        ExitCallback.registerExitCallback(self.flushAllMessages)
def __getBatchSystemPlugin( self ):
    """Using the name of the batch system plugin, will return an instance of the plugin class.

    Detection is based on batch-system-specific environment variables.

    :return: S_ERROR on failure (see review note below about success path)
    """
    # Map of supported batch systems to the environment variable that
    # identifies them on a worker node.
    batchSystems = {'LSF':'LSB_JOBID', 'PBS':'PBS_JOBID', 'BQS':'QSUB_REQNAME', 'SGE':'SGE_TASK_ID'} #more to be added later
    name = None
    for batchSystem, envVar in batchSystems.items():
        if os.environ.has_key( envVar ):
            name = batchSystem
            break
    if name == None:
        self.log.warn( 'Batch system type for site %s is not currently supported' % DIRAC.siteName() )
        return S_ERROR( 'Current batch system is not supported' )
    self.log.debug( 'Creating plugin for %s batch system' % ( name ) )
    try:
        batchSystemName = "%sTimeLeft" % ( name )
        batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName] )
    except Exception, x:
        msg = 'Could not import DIRAC.Core.Utilities.TimeLeft.%s' % ( batchSystemName )
        self.log.warn( x )
        self.log.warn( msg )
        return S_ERROR( msg )
    # NOTE(review): no S_OK return after a successful import is visible in
    # this excerpt; instantiation/return presumably follows -- confirm.
def __init__(self):
    """Standard constructor: initialise all OverlayInput parameters to defaults."""
    super(OverlayInput, self).__init__()
    self.enable = True
    self.STEP_NUMBER = ''
    self.log = gLogger.getSubLogger( "OverlayInput" )
    self.applicationName = 'OverlayInput'
    self.curdir = os.getcwd()
    self.applicationLog = ''
    self.printoutflag = ''
    self.prodid = 0
    self.detector = '' ##needed for backward compatibility
    self.detectormodel = ""
    self.energytouse = ''
    self.energy = 0
    # Number of background events stored per overlay file
    self.nbofeventsperfile = 100
    self.lfns = []
    self.nbfilestoget = 0
    # Background event type; presumably gamma-gamma -> hadrons ('gghad') -- confirm
    self.BkgEvtType = 'gghad'
    # Number of bunch crossings to overlay (0 = disabled)
    self.BXOverlay = 0
    self.ggtohadint = 3.2
    self.nbsigeventsperfile = 0
    self.nbinputsigfile = 1
    self.NbSigEvtsPerJob = 0
    self.rm = ReplicaManager()
    self.fc = FileCatalogClient()
    self.site = DIRAC.siteName()
    self.machine = 'clic_cdr'
def setReplicaProblematic(self, lfn, se, pfn='', reason='Access failure'):
    """ Set replica status to Problematic in the File Catalog

    Falls back to the Data Integrity DB when the catalog update fails,
    and queues a DISET request when even that is unreachable.

    @param lfn: lfn of the problematic file
    @param se: storage element
    @param pfn: physical file name
    @param reason: as name suggests...
    @return: S_OK()
    """
    rm = ReplicaManager()
    source = "Job %d at %s" % (self.jobID, DIRAC.siteName())
    result = rm.setReplicaProblematic((lfn, pfn, se, reason), source)
    if not result['OK'] or result['Value']['Failed']:
        # We have failed the report, let's attempt the Integrity DB failover
        integrityDB = RPCClient('DataManagement/DataIntegrity', timeout=120)
        fileMetadata = {'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'StorageElement': se}
        result = integrityDB.insertProblematic(source, fileMetadata)
        if not result['OK']:
            # Add it to the request
            if 'Request' in self.workflow_commons:  # 'in' replaces deprecated has_key()
                request = self.workflow_commons['Request']
                subrequest = DISETSubRequest(result['rpcStub']).getDictionary()
                request.addSubRequest(subrequest, 'integrity')
    return S_OK()
def __init__(self):
    """Define the accounting record layout for jobs.

    Declares the key fields, the numeric accounting fields, the bucket
    granularity per record age, and pre-fills the 'Site' key with the
    local DIRAC site name.
    """
    BaseAccountingType.__init__(self)
    self.definitionKeyFields = [('User', 'VARCHAR(32)'),
                                ('UserGroup', 'VARCHAR(32)'),
                                ('JobGroup', "VARCHAR(64)"),
                                ('JobType', 'VARCHAR(32)'),
                                ('JobClass', 'VARCHAR(32)'),
                                ('ProcessingType', 'VARCHAR(256)'),
                                ('Site', 'VARCHAR(32)'),
                                ('FinalMajorStatus', 'VARCHAR(32)'),
                                ('FinalMinorStatus', 'VARCHAR(256)')
                                ]
    self.definitionAccountingFields = [('CPUTime', "INT UNSIGNED"),
                                       ('NormCPUTime', "INT UNSIGNED"),
                                       ('ExecTime', "INT UNSIGNED"),
                                       ('InputDataSize', 'BIGINT UNSIGNED'),
                                       ('OutputDataSize', 'BIGINT UNSIGNED'),
                                       ('InputDataFiles', 'INT UNSIGNED'),
                                       ('OutputDataFiles', 'INT UNSIGNED'),
                                       ('DiskSpace', 'BIGINT UNSIGNED'),
                                       ('InputSandBoxSize', 'BIGINT UNSIGNED'),
                                       ('OutputSandBoxSize', 'BIGINT UNSIGNED'),
                                       ('ProcessedEvents', 'INT UNSIGNED')
                                       ]
    # (max record age in seconds, bucket length in seconds)
    self.bucketsLength = [(86400 * 8, 3600),  # <1w+1d = 1h
                          (86400 * 35, 3600 * 4),  # <35d = 4h
                          (86400 * 30 * 6, 86400),  # <6m = 1d
                          (86400 * 365, 86400 * 2),  # <1y = 2d
                          (86400 * 600, 604800),  # >1y = 1w
                          ]
    self.checkType()
    # Fill the site
    self.setValueByKey("Site", DIRAC.siteName())
def __init__(self):
    """Define the accounting record layout for data operations.

    Declares the key fields, the numeric accounting fields, the bucket
    granularity per record age, and pre-fills the 'ExecutionSite' key.
    """
    super(DataOperation, self).__init__()
    self.definitionKeyFields = [
        ("OperationType", "VARCHAR(32)"),
        ("User", "VARCHAR(64)"),
        ("ExecutionSite", "VARCHAR(256)"),
        ("Source", "VARCHAR(32)"),
        ("Destination", "VARCHAR(32)"),
        ("Protocol", "VARCHAR(32)"),
        ("FinalStatus", "VARCHAR(32)"),
    ]
    self.definitionAccountingFields = [
        ("TransferSize", "BIGINT UNSIGNED"),
        ("TransferTime", "FLOAT"),
        ("RegistrationTime", "FLOAT"),
        ("TransferOK", "INT UNSIGNED"),
        ("TransferTotal", "INT UNSIGNED"),
        ("RegistrationOK", "INT UNSIGNED"),
        ("RegistrationTotal", "INT UNSIGNED"),
    ]
    # (max record age in seconds, bucket length in seconds)
    self.bucketsLength = [
        (86400 * 3, 900),  # <3d = 15m
        (86400 * 8, 3600),  # <1w+1d = 1h
        (15552000, 86400),  # >1w+1d <6m = 1d
        (31104000, 604800),  # >6m = 1w
    ]
    self.checkType()
    self.setValueByKey("ExecutionSite", DIRAC.siteName())
def __resolveInputData(self):
    """This method controls the execution of the DIRAC input data modules
    according to the VO policy defined in the configuration service.

    :return: S_OK({'Successful': dict, 'Failed': list}) or a failed result
    """
    site = self.arguments['Configuration'].get('SiteName', DIRAC.siteName())
    self.arguments.setdefault('Job', {})
    policy = self.arguments['Job'].get('InputDataPolicy', [])
    if policy:
        # In principle this can be a list of modules with the first taking precedence
        if isinstance(policy, six.string_types):
            policy = [policy]
        self.log.info('Job has a specific policy setting: %s' % (', '.join(policy)))
    else:
        self.log.debug(
            'Attempting to resolve input data policy for site %s' % site)
        inputDataPolicy = Operations().getOptionsDict('InputDataPolicy')
        if not inputDataPolicy['OK']:
            return S_ERROR(
                'Could not resolve InputDataPolicy from Operations InputDataPolicy'
            )
        options = inputDataPolicy['Value']
        # A site-specific policy takes precedence over 'Default'
        policy = options.get(site, options.get('Default', []))
        if policy:
            policy = [x.strip() for x in policy.split(',')]
            if site in options:
                prStr = 'Found specific'
            else:
                prStr = 'Applying default'
            self.log.info('%s input data policy for site %s:\n%s' %
                          (prStr, site, '\n'.join(policy)))
    dataToResolve = []  # if none, all supplied input data is resolved
    successful = {}
    # Run each module in turn; later modules only see the files the
    # previous ones failed to resolve.
    for modulePath in policy:
        result = self.__runModule(modulePath, dataToResolve)
        if not result['OK']:
            self.log.warn('Problem during %s execution' % modulePath)
            return result
        result = result['Value']
        successful.update(result.get('Successful', {}))
        dataToResolve = result.get('Failed', [])
        if dataToResolve:
            self.log.info('%s failed for the following files:\n%s' %
                          (modulePath, '\n'.join(dataToResolve)))
        else:
            self.log.info('All replicas resolved after %s execution' %
                          (modulePath))
            break
    if successful:
        self.log.verbose('Successfully resolved:', str(successful))
    return S_OK({'Successful': successful, 'Failed': dataToResolve})
def __resolveInputData(self):
    """This method controls the execution of the DIRAC input data modules
    according to the VO policy defined in the configuration service.

    :return: S_OK({'Successful': dict, 'Failed': list}) or a failed result
    """
    site = self.arguments["Configuration"].get("SiteName", DIRAC.siteName())
    self.arguments.setdefault("Job", {})
    policy = self.arguments["Job"].get("InputDataPolicy", [])
    if policy:
        # In principle this can be a list of modules with the first taking precedence
        if isinstance(policy, six.string_types):
            policy = [policy]
        self.log.info("Job has a specific policy setting: %s" % (", ".join(policy)))
    else:
        self.log.debug(
            "Attempting to resolve input data policy for site %s" % site)
        inputDataPolicy = Operations().getOptionsDict("InputDataPolicy")
        if not inputDataPolicy["OK"]:
            return S_ERROR(
                "Could not resolve InputDataPolicy from Operations InputDataPolicy"
            )
        options = inputDataPolicy["Value"]
        # A site-specific policy takes precedence over 'Default'
        policy = options.get(site, options.get("Default", []))
        if policy:
            policy = [x.strip() for x in policy.split(",")]
            if site in options:
                prStr = "Found specific"
            else:
                prStr = "Applying default"
            self.log.info("%s input data policy for site %s:\n%s" %
                          (prStr, site, "\n".join(policy)))
    dataToResolve = []  # if none, all supplied input data is resolved
    successful = {}
    # Run each module in turn; later modules only see the files the
    # previous ones failed to resolve.
    for modulePath in policy:
        result = self.__runModule(modulePath, dataToResolve)
        if not result["OK"]:
            self.log.warn("Problem during %s execution" % modulePath)
            return result
        result = result["Value"]
        successful.update(result.get("Successful", {}))
        dataToResolve = result.get("Failed", [])
        if dataToResolve:
            self.log.info("%s failed for the following files:\n%s" %
                          (modulePath, "\n".join(dataToResolve)))
        else:
            self.log.info("All replicas resolved after %s execution" %
                          (modulePath))
            break
    if successful:
        self.log.verbose("Successfully resolved:", str(successful))
    return S_OK({"Successful": successful, "Failed": dataToResolve})
def determineSeFromSite():
    """Return an SE name for the current site.

    The static SeSiteMap is consulted first; otherwise the first SE
    registered for the site in the CS is used.  Returns "" when neither
    lookup yields a name.
    """
    site = DIRAC.siteName()
    name = SeSiteMap.get(site, "")
    if name:
        return name
    res = getSEsForSite(site)
    if res["OK"] and res["Value"]:
        return res["Value"][0]
    return name
def determineSeFromSite():
    """Return an SE name for the current site.

    The static SeSiteMap is consulted first; otherwise the first SE
    registered for the site in the CS is used.  Returns '' when neither
    lookup yields a name.
    """
    site = DIRAC.siteName()
    name = SeSiteMap.get(site, '')
    if name:
        return name
    res = getSEsForSite(site)
    if res['OK'] and res['Value']:
        return res['Value'][0]
    return name
def am_initialize(self, *initArgs):
    """ Common initialization for all the agents.
        This is executed every time an agent (re)starts.
        This is called by the AgentReactor, should not be overridden.

    :param initArgs: forwarded verbatim to the subclass initialize()
    :return: S_OK() or S_ERROR describing the initialization failure
    """
    agentName = self.am_getModuleParam('fullName')
    result = self.initialize(*initArgs)
    # The subclass initialize() must follow the S_OK/S_ERROR convention
    if not isReturnStructure(result):
        return S_ERROR("initialize must return S_OK/S_ERROR")
    if not result['OK']:
        return S_ERROR("Error while initializing %s: %s" % (agentName, result['Message']))
    mkDir(self.am_getControlDirectory())
    workDirectory = self.am_getWorkDirectory()
    mkDir(workDirectory)
    # Set the work directory in an environment variable available to subprocesses if needed
    os.environ['AGENT_WORKDIRECTORY'] = workDirectory
    self.__moduleProperties['shifterProxy'] = self.am_getOption('shifterProxy')
    if self.am_monitoringEnabled() and not self.activityMonitoring:
        self.monitor.enable()
    if len(self.__moduleProperties['executors']) < 1:
        return S_ERROR("At least one executor method has to be defined")
    if not self.am_Enabled():
        return S_ERROR("Agent is disabled via the configuration")
    # Banner summarising the agent's runtime configuration
    self.log.notice("=" * 40)
    self.log.notice("Loaded agent module %s" % self.__moduleProperties['fullName'])
    self.log.notice(" Site: %s" % DIRAC.siteName())
    self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup"))
    self.log.notice(" Base Module version: %s " % __RCSID__)
    self.log.notice(" Agent version: %s" % self.__codeProperties['version'])
    self.log.notice(" DIRAC version: %s" % DIRAC.version)
    self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform())
    pollingTime = int(self.am_getOption('PollingTime'))
    if pollingTime > 3600:
        self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.))
    else:
        self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime'))
    self.log.notice(" Control dir: %s" % self.am_getControlDirectory())
    self.log.notice(" Work dir: %s" % self.am_getWorkDirectory())
    if self.am_getOption('MaxCycles') > 0:
        self.log.notice(" Cycles: %s" % self.am_getMaxCycles())
    else:
        self.log.notice(" Cycles: unlimited")
    if self.am_getWatchdogTime() > 0:
        self.log.notice(" Watchdog interval: %s" % self.am_getWatchdogTime())
    else:
        self.log.notice(" Watchdog interval: disabled ")
    self.log.notice("=" * 40)
    self.__initialized = True
    return S_OK()
def isLocalSE( self ):
    """ Test if the Storage Element is local in the current context

    :return: S_OK(bool) -- True when this SE is registered for the local site
    """
    import DIRAC
    gLogger.verbose( "StorageElement.isLocalSE: Determining whether %s is a local SE." % self.name )
    # .get() avoids a KeyError when getSEsForSite() fails and returns a
    # structure without a 'Value' key; the SE is then treated as non-local.
    localSEs = getSEsForSite( DIRAC.siteName() ).get( 'Value', [] )
    return S_OK( self.name in localSEs )
def isLocalSE( self ):
    """ Test if the Storage Element is local in the current context

    :return: S_OK(bool) -- True when this SE is registered for the local site
    """
    import DIRAC
    self.log.verbose( "isLocalSE: Determining whether %s is a local SE." % self.name )
    # .get() avoids a KeyError when getSEsForSite() fails and returns a
    # structure without a 'Value' key; the SE is then treated as non-local.
    localSEs = getSEsForSite( DIRAC.siteName() ).get( 'Value', [] )
    return S_OK( self.name in localSEs )
def getSiteSE(SEname):
    """Return the first SE registered for the current site, falling back to
    the supplied *SEname* when the lookup fails or yields nothing.

    :param SEname: fallback SE name
    :return: SE name (string)
    """
    sitename = DIRAC.siteName()
    # Informational message: log at notice level rather than error, and
    # drop the leftover Python-2 debug 'print' statement.
    DIRAC.gLogger.notice('Sitename: %s' % (sitename))
    res = getSEsForSite(sitename)
    if not res['OK']:
        DIRAC.gLogger.error(res['Message'])
        return SEname
    if res['Value']:
        SEname = res['Value'][0]
    return SEname
def __isLocalSE( self ):
    """ Test if the Storage Element is local in the current context

    :return: S_OK(bool) -- True when this SE is registered for the local site
    """
    self.log.getSubLogger( 'LocalSE' ).verbose( "Determining whether %s is a local SE." % self.name )
    import DIRAC
    # .get() avoids a KeyError when getSEsForSite() fails and returns a
    # structure without a 'Value' key; the SE is then treated as non-local.
    localSEs = getSEsForSite( DIRAC.siteName() ).get( 'Value', [] )
    return S_OK( self.name in localSEs )
def __init__( self ):
    """ Standard constructor

    Reads the CPU scaling/normalization factors and the CPU margin from
    the local configuration, then loads the batch-system plugin used to
    query resource usage.
    """
    self.log = gLogger.getSubLogger( 'TimeLeft' )
    # This is the ratio SpecInt published by the site over 250 (the reference used for Matching)
    self.scaleFactor = gConfig.getValue( '/LocalSite/CPUScalingFactor', 0.0 )
    if not self.scaleFactor:
        self.log.warn( '/LocalSite/CPUScalingFactor not defined for site %s' % DIRAC.siteName() )
    self.normFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    if not self.normFactor:
        self.log.warn( '/LocalSite/CPUNormalizationFactor not defined for site %s' % DIRAC.siteName() )
    # CPU margin in percent
    self.cpuMargin = gConfig.getValue( '/LocalSite/CPUMargin', 10 )
    result = self.__getBatchSystemPlugin()
    if result['OK']:
        self.batchPlugin = result['Value']
    else:
        # Keep the failure message so it can be reported later
        self.batchPlugin = None
        self.batchError = result['Message']
def __isLocalSE(self):
    """ Test if the Storage Element is local in the current context

    :return: S_OK(bool) -- True when this SE is registered for the local site
    """
    self.log.getSubLogger("LocalSE").verbose("Determining whether %s is a local SE." % self.name)
    import DIRAC
    # .get() avoids a KeyError when getSEsForSite() fails and returns a
    # structure without a 'Value' key; the SE is then treated as non-local.
    localSEs = getSEsForSite(DIRAC.siteName()).get("Value", [])
    return S_OK(self.name in localSEs)
def __getBatchSystemPlugin(self):
    """Using the name of the batch system plugin, will return an instance of the plugin class.

    Detection is based on batch-system-specific environment variables,
    with MJF (machine/job features) as a fallback.

    :return: S_OK(plugin instance) or S_ERROR
    """
    batchSystems = {
        "LSF": "LSB_JOBID",
        "PBS": "PBS_JOBID",
        "BQS": "QSUB_REQNAME",
        "SGE": "SGE_TASK_ID",
        "SLURM": "SLURM_JOB_ID",
        "HTCondor": "_CONDOR_JOB_AD",
    }  # more to be added later
    name = None
    for batchSystem, envVar in batchSystems.items():
        if envVar in os.environ:
            name = batchSystem
            break
    if name is None and "MACHINEFEATURES" in os.environ and "JOBFEATURES" in os.environ:
        # Only use MJF if legacy batch system information not available for now
        name = "MJF"
    if name is None:
        self.log.warn(
            "Batch system type for site %s is not currently supported" % DIRAC.siteName())
        return S_ERROR("Current batch system is not supported")
    self.log.debug("Creating plugin for %s batch system" % (name))
    try:
        batchSystemName = "%sResourceUsage" % (name)
        batchPlugin = __import__(
            "DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % batchSystemName,
            globals(),
            locals(),
            [batchSystemName],
        )
    except ImportError as x:
        msg = "Could not import DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % (
            batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)
    try:
        # getattr() replaces the previous eval() of a constructed string:
        # same attribute lookup and call, no dynamic code evaluation.
        batchInstance = getattr(batchPlugin, batchSystemName)()
    except Exception as x:  # pylint: disable=broad-except
        msg = "Could not instantiate %s()" % (batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)
    return S_OK(batchInstance)
def __getBatchSystemPlugin(self):
    """ Using the name of the batch system plugin, will return an instance
        of the plugin class.

    Detection is based on batch-system-specific environment variables,
    with MJF (machine/job features) as a fallback.

    :return: S_OK(plugin instance) or S_ERROR
    """
    batchSystems = {
        'LSF': 'LSB_JOBID',
        'PBS': 'PBS_JOBID',
        'BQS': 'QSUB_REQNAME',
        'SGE': 'SGE_TASK_ID',
        'SLURM': 'SLURM_JOB_ID',
        'HTCondor': '_CONDOR_JOB_AD'
    }  # more to be added later
    name = None
    for batchSystem, envVar in batchSystems.items():
        if envVar in os.environ:
            name = batchSystem
            break
    if name is None and 'MACHINEFEATURES' in os.environ and 'JOBFEATURES' in os.environ:
        # Only use MJF if legacy batch system information not available for now
        name = 'MJF'
    if name is None:
        self.log.warn(
            'Batch system type for site %s is not currently supported' % DIRAC.siteName())
        return S_ERROR('Current batch system is not supported')
    self.log.debug('Creating plugin for %s batch system' % (name))
    try:
        batchSystemName = "%sResourceUsage" % (name)
        batchPlugin = __import__(
            'DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s' %  # pylint: disable=unused-variable
            batchSystemName,
            globals(),
            locals(),
            [batchSystemName])
    except ImportError as x:
        msg = 'Could not import DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s' % (
            batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)
    try:
        # getattr() replaces the previous eval() of a constructed string:
        # same attribute lookup and call, no dynamic code evaluation.
        batchInstance = getattr(batchPlugin, batchSystemName)()
    except Exception as x:  # pylint: disable=broad-except
        msg = 'Could not instantiate %s()' % (batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)
    return S_OK(batchInstance)
def __getConfigFlags():
    """ Get the flags for dirac-configure inside the container.

    Returns a string containing the command line flags.
    """
    flags = []
    # Setup flag (defaults to 'unknown' so the branch is always taken in practice)
    setup = gConfig.getValue("/DIRAC/Setup", "unknown")
    if setup:
        flags.append("-S '%s'" % setup)
    # Configuration servers and the site name
    servers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
    flags.append("-C '%s'" % ','.join(servers))
    flags.append("-n '%s'" % DIRAC.siteName())
    return ' '.join(flags)
def __resolveInputData(self):
    """This method controls the execution of the DIRAC input data modules according
    to the VO policy defined in the configuration service.

    The policy can come from the job itself ('InputDataPolicy') or, failing
    that, from the Operations 'InputDataPolicy' section (per-site entry with a
    'Default' fallback). Each policy module is run in order until all replicas
    are resolved.

    :return: S_OK({'Successful': ..., 'Failed': ...}) or S_ERROR
    """
    site = self.arguments['Configuration'].get('SiteName', DIRAC.siteName())
    self.arguments.setdefault('Job', {})
    policy = self.arguments['Job'].get('InputDataPolicy', [])
    if policy:
        # In principle this can be a list of modules with the first taking precedence
        # NOTE(review): 'basestring' is Python-2-only — confirm a compatibility
        # alias is in scope if this runs under Python 3.
        if isinstance(policy, basestring):
            policy = [policy]
        self.log.info('Job has a specific policy setting: %s' % (', '.join(policy)))
    else:
        self.log.debug('Attempting to resolve input data policy for site %s' % site)
        inputDataPolicy = Operations().getOptionsDict('InputDataPolicy')
        if not inputDataPolicy['OK']:
            return S_ERROR('Could not resolve InputDataPolicy from Operations InputDataPolicy')
        options = inputDataPolicy['Value']
        # Site-specific policy wins over the 'Default' entry
        policy = options.get(site, options.get('Default', []))
        if policy:
            policy = [x.strip() for x in policy.split(',')]
            if site in options:
                prStr = 'Found specific'
            else:
                prStr = 'Applying default'
            self.log.info('%s input data policy for site %s:\n%s' % (prStr, site, '\n'.join(policy)))
    dataToResolve = []  # if none, all supplied input data is resolved
    successful = {}
    for modulePath in policy:
        result = self.__runModule(modulePath, dataToResolve)
        if not result['OK']:
            self.log.warn('Problem during %s execution' % modulePath)
            return result
        result = result['Value']
        successful.update(result.get('Successful', {}))
        # Whatever failed is handed to the next policy module
        dataToResolve = result.get('Failed', [])
        if dataToResolve:
            self.log.info('%s failed for the following files:\n%s' % (modulePath, '\n'.join(dataToResolve)))
        else:
            self.log.info('All replicas resolved after %s execution' % (modulePath))
            break
    if successful:
        self.log.verbose('Successfully resolved:', str(successful))
    return S_OK({'Successful': successful, 'Failed': dataToResolve})
def sendMonitoring(self):
    """Fill in site/channel information on a data-operation record and push it to monitoring.

    NOTE(review): ``baseDict`` and ``commitFlag`` are not defined in this method
    or its visible signature — presumably they are parameters lost in
    extraction. Confirm against the original class before relying on this.
    """
    # Tag the record with the site this component runs on
    baseDict["ExecutionSite"] = DIRAC.siteName()
    # Channel is "source->destination", used for per-link accounting
    baseDict["Channel"] = baseDict["Source"] + "->" + baseDict["Destination"]
    self.dataOperationReporter.addRecord(baseDict)
    if commitFlag:
        result = self.dataOperationReporter.commit()
        sLog.debug("Committing data operation to monitoring")
        if not result["OK"]:
            sLog.error("Could not commit data operation to monitoring", result["Message"])
        else:
            sLog.debug("Done committing to monitoring")
def initialize( self, systemName, cfgPath ):
  """Configure this logger from the CS options found under *cfgPath*.

  Reads backend options, log level and framing options, and registers the
  output backends. Idempotent: only the first call has an effect.

  :param systemName: name of the system the logger belongs to
  :param cfgPath: CS path holding the Log* options
  """
  if self.__initialized:
    return
  self.__initialized = True
  from DIRAC.ConfigurationSystem.Client.Config import gConfig
  from os import getpid
  # self.__printDebug( "The configuration path is %s" % cfgPath )
  # Get the options for the different output backends
  retDict = gConfig.getOptionsDict( "%s/BackendsOptions" % cfgPath )
  # self.__printDebug( retDict )
  if not retDict[ 'OK' ]:
    # No CS section: fall back to hard-coded defaults
    cfgBackOptsDict = { 'FileName': 'Dirac-log_%s.log' % getpid(), 'Interactive': True, 'SleepTime': 150 }
  else:
    cfgBackOptsDict = retDict[ 'Value' ]
  self.__backendOptions.update( cfgBackOptsDict )
  if 'FileName' not in self.__backendOptions:
    self.__backendOptions[ 'FileName' ] = 'Dirac-log_%s.log' % getpid()
  sleepTime = 150
  try:
    sleepTime = int ( self.__backendOptions[ 'SleepTime' ] )
  except:
    # Best-effort: a missing or non-numeric SleepTime keeps the 150s default
    pass
  self.__backendOptions[ 'SleepTime' ] = sleepTime
  self.__backendOptions[ 'Interactive' ] = gConfig.getValue( "%s/BackendsOptions/Interactive" % cfgPath, True )
  self.__backendOptions[ 'Site' ] = DIRAC.siteName()
  self.__backendOptions[ 'Color' ] = gConfig.getValue( "%s/LogColor" % cfgPath, False )
  # Configure outputs
  desiredBackends = gConfig.getValue( "%s/LogBackends" % cfgPath, 'stdout' )
  self.registerBackends( List.fromChar( desiredBackends ) )
  # Configure verbosity: scripts have their own default log level
  defaultLevel = Logger.defaultLogLevel
  if "Scripts" in cfgPath:
    defaultLevel = gConfig.getValue( '/Systems/Scripts/LogLevel', Logger.defaultLogLevel )
  self.setLevel( gConfig.getValue( "%s/LogLevel" % cfgPath, defaultLevel ) )
  # Configure framing
  self._showCallingFrame = gConfig.getValue( "%s/LogShowLine" % cfgPath, self._showCallingFrame )
  # Get system name
  self._systemName = str( systemName )
  if not self.__backendOptions['Interactive']:
    # Non-interactive backends buffer messages: flush them on process exit
    ExitCallback.registerExitCallback( self.flushAllMessages )
def __resolveInputData(self):
    """This method controls the execution of the DIRAC input data modules according
    to the VO policy defined in the configuration service.

    The policy comes from the job ('InputDataPolicy') or, failing that, from the
    Operations 'InputDataPolicy' section (per-site entry, 'Default' fallback).
    Policy modules are executed in order until all replicas are resolved.

    :return: S_OK({"Successful": ..., "Failed": ...}) or S_ERROR
    """
    site = self.arguments["Configuration"].get("SiteName", DIRAC.siteName())
    self.arguments.setdefault("Job", {})
    policy = self.arguments["Job"].get("InputDataPolicy", [])
    if policy:
        # In principle this can be a list of modules with the first taking precedence
        # NOTE(review): types.StringTypes is Python-2-only — confirm the runtime.
        if type(policy) in types.StringTypes:
            policy = [policy]
        self.log.info("Job has a specific policy setting: %s" % (", ".join(policy)))
    else:
        self.log.debug("Attempting to resolve input data policy for site %s" % site)
        inputDataPolicy = Operations().getOptionsDict("InputDataPolicy")
        if not inputDataPolicy["OK"]:
            return S_ERROR("Could not resolve InputDataPolicy from Operations InputDataPolicy")
        options = inputDataPolicy["Value"]
        # Site-specific policy wins over the "Default" entry
        policy = options.get(site, options.get("Default", []))
        if policy:
            policy = [x.strip() for x in policy.split(",")]
            if site in options:
                prStr = "Found specific"
            else:
                prStr = "Applying default"
            self.log.info("%s input data policy for site %s:\n%s" % (prStr, site, "\n".join(policy)))
    dataToResolve = []  # if none, all supplied input data is resolved
    successful = {}
    for modulePath in policy:
        result = self.__runModule(modulePath, dataToResolve)
        if not result["OK"]:
            self.log.warn("Problem during %s execution" % modulePath)
            return result
        result = result["Value"]
        successful.update(result.get("Successful", {}))
        # Whatever failed is handed to the next policy module
        dataToResolve = result.get("Failed", [])
        if dataToResolve:
            self.log.info("%s failed for the following files:\n%s" % (modulePath, "\n".join(dataToResolve)))
        else:
            self.log.info("All replicas resolved after %s execution" % (modulePath))
            break
    if successful:
        self.log.verbose("Successfully resolved:", str(successful))
    return S_OK({"Successful": successful, "Failed": dataToResolve})
def upload_to_seList(FileLFN, FileName):
    """Upload *FileName* as *FileLFN*, preferring SEs local to this site.

    First tries every local SE that is also in the (module-level) ``seList``;
    if none succeeds, falls back to trying every SE in ``seList`` and stops at
    the first success.

    NOTE(review): the return statements hand back the ``DIRAC.S_OK`` /
    ``DIRAC.S_ERROR`` *function objects*, not called results, and
    ``res != DIRAC.S_OK`` compares against the function object — this only
    works if ``CheckCatalogCoherence`` deliberately returns those objects as
    sentinels. Confirm before changing.
    """
    DIRAC.gLogger.notice('Put and register in LFC and DFC:', FileLFN)
    from DIRAC.Interfaces.API.Dirac import Dirac
    from DIRAC.Core.Utilities.SiteSEMapping import getSEsForSite
    result = getSEsForSite(DIRAC.siteName())
    if result['OK']:
        localSEs = result['Value']
    # NOTE(review): if getSEsForSite failed, localSEs is undefined below (NameError)
    dirac = Dirac()
    upload_result = 'NOTOK'
    failing_se = []
    # First pass: only SEs that are both local and in the configured seList
    for se in localSEs:
        if se in seList:
            DIRAC.gLogger.notice('Local SE is in the list:', se)
            ret = dirac.addFile(FileLFN, FileName, se)
            res = CheckCatalogCoherence(FileLFN)
            if res != DIRAC.S_OK:
                DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
                DIRAC.gLogger.notice('Failing SE:', se)
                failing_se.append(se)
                continue
            upload_result = 'OK'
    # Second pass: any SE in seList, stop at the first success
    if upload_result != 'OK':
        for se in seList:
            DIRAC.gLogger.notice('Try upload to:', se)
            ret = dirac.addFile(FileLFN, FileName, se)
            res = CheckCatalogCoherence(FileLFN)
            if res != DIRAC.S_OK:
                DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
                failing_se.append(se)
                DIRAC.gLogger.notice('Failing SE:', se)
                continue
            upload_result = 'OK'
            break
    DIRAC.gLogger.notice('Failing SE list:', failing_se)
    #for se in failing_se:
    #    seList.remove(se)
    #    DIRAC.gLogger.notice('Failing SE list:',failing_se)
    if upload_result != 'OK':
        return DIRAC.S_ERROR
    return DIRAC.S_OK
def am_initialize(self, *initArgs):
    """ Common initialization for all the agents. This is executed every time
    an agent (re)starts. This is called by the AgentReactor, should not be
    overridden.

    :return: S_OK() on success, S_ERROR(reason) otherwise
    """
    agentName = self.am_getModuleParam('fullName')
    # The subclass initialize() must return an S_OK/S_ERROR structure
    result = self.initialize(*initArgs)
    if not isReturnStructure(result):
        return S_ERROR("initialize must return S_OK/S_ERROR")
    if not result['OK']:
        return S_ERROR("Error while initializing %s: %s" % (agentName, result['Message']))
    mkDir(self.am_getControlDirectory())
    workDirectory = self.am_getWorkDirectory()
    mkDir(workDirectory)
    # Set the work directory in an environment variable available to subprocesses if needed
    os.environ['AGENT_WORKDIRECTORY'] = workDirectory
    self.__moduleProperties['shifterProxy'] = self.am_getOption('shifterProxy')
    if self.am_monitoringEnabled():
        self.monitor.enable()
    # At least one execute-like method is required to run cycles
    if len(self.__moduleProperties['executors']) < 1:
        return S_ERROR("At least one executor method has to be defined")
    if not self.am_Enabled():
        return S_ERROR("Agent is disabled via the configuration")
    # Startup banner summarizing the agent environment
    self.log.notice("=" * 40)
    self.log.notice("Loaded agent module %s" % self.__moduleProperties['fullName'])
    self.log.notice(" Site: %s" % DIRAC.siteName())
    self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup"))
    self.log.notice(" Base Module version: %s " % __RCSID__)
    self.log.notice(" Agent version: %s" % self.__codeProperties['version'])
    self.log.notice(" DIRAC version: %s" % DIRAC.version)
    self.log.notice(" DIRAC platform: %s" % DIRAC.getPlatform())
    pollingTime = int(self.am_getOption('PollingTime'))
    if pollingTime > 3600:
        self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.))
    else:
        self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime'))
    self.log.notice(" Control dir: %s" % self.am_getControlDirectory())
    self.log.notice(" Work dir: %s" % self.am_getWorkDirectory())
    if self.am_getOption('MaxCycles') > 0:
        self.log.notice(" Cycles: %s" % self.am_getMaxCycles())
    else:
        self.log.notice(" Cycles: unlimited")
    if self.am_getWatchdogTime() > 0:
        self.log.notice(" Watchdog interval: %s" % self.am_getWatchdogTime())
    else:
        self.log.notice(" Watchdog interval: disabled ")
    self.log.notice("=" * 40)
    self.__initialized = True
    return S_OK()
def rescheduleFailedJob(jobID, message):
    """Report a wrapper failure and ask the JobManager to reschedule *jobID*.

    Sets the application status and a 'Rescheduled' job status (flushed before
    the reschedule request), then notifies the alarm mail address. Never
    raises: any failure here is logged and swallowed.

    :param jobID: job identifier (converted with int())
    :param message: reason for the reschedule, reported in statuses and mail
    """
    try:
        import DIRAC
        global jobReport
        gLogger.warn('Failure during %s' % (message))
        # Setting a job parameter does not help since the job will be rescheduled,
        # instead set the status with the cause and then another status showing the
        # reschedule operation.
        if not jobReport:
            gLogger.info('Creating a new JobReport Object')
            jobReport = JobReport(int(jobID), 'JobWrapperTemplate')
        jobReport.setApplicationStatus('Failed %s ' % message, sendFlag=False)
        jobReport.setJobStatus('Rescheduled', message, sendFlag=False)
        # We must send Job States and Parameters before it gets reschedule
        jobReport.sendStoredStatusInfo()
        jobReport.sendStoredJobParameters()
        gLogger.info('Job will be rescheduled after exception during execution of the JobWrapper')
        jobManager = RPCClient('WorkloadManagement/JobManager')
        result = jobManager.rescheduleJob(int(jobID))
        if not result['OK']:
            gLogger.warn(result)
        # Send mail to debug errors
        mailAddress = DIRAC.alarmMail
        site = DIRAC.siteName()
        subject = 'Job rescheduled at %s' % site
        ret = systemCall(0, 'hostname')
        wn = ret['Value'][1]
        msg = 'Job %s rescheduled at %s, wn=%s\n' % (jobID, site, wn)
        msg += message
        NotificationClient().sendMail(mailAddress, subject, msg,
                                      fromAddress="*****@*****.**", localAttempt=False)
        return
    # FIX: 'except Exception, x' is Python-2-only syntax (invalid in Python 3);
    # the bound variable was unused, so drop it.
    except Exception:
        gLogger.exception('JobWrapperTemplate failed to reschedule Job')
        return
def initialize(self, systemName, cfgPath):
    """Configure this logger from the CS options found under *cfgPath*.

    Reads backend options, log level and framing options, and registers the
    output backends. Idempotent: only the first call has an effect.

    :param systemName: name of the system the logger belongs to
    :param cfgPath: CS path holding the Log* options
    """
    if self.__initialized:
        return
    self.__initialized = True
    from DIRAC.ConfigurationSystem.Client.Config import gConfig
    from os import getpid

    # Get the options for the different output backends
    retDict = gConfig.getOptionsDict("%s/BackendsOptions" % cfgPath)
    if not retDict["OK"]:
        # No CS section: fall back to hard-coded defaults
        cfgBackOptsDict = {"FileName": "Dirac-log_%s.log" % getpid(), "Interactive": True, "SleepTime": 150}
    else:
        cfgBackOptsDict = retDict["Value"]
    self.__backendOptions.update(cfgBackOptsDict)
    # BUGFIX: the key is "FileName" (was mis-cased "Filename"), so the default
    # always clobbered a configured file name. Also use 'in' instead of the
    # removed dict.has_key().
    if "FileName" not in self.__backendOptions:
        self.__backendOptions["FileName"] = "Dirac-log_%s.log" % getpid()
    sleepTime = 150
    try:
        sleepTime = int(self.__backendOptions["SleepTime"])
    except (KeyError, TypeError, ValueError):
        # Best-effort: missing or non-numeric SleepTime keeps the 150s default
        pass
    self.__backendOptions["SleepTime"] = sleepTime
    self.__backendOptions["Interactive"] = gConfig.getValue("%s/BackendsOptions/Interactive" % cfgPath, True)
    self.__backendOptions["Site"] = DIRAC.siteName()
    # Configure outputs
    desiredBackends = gConfig.getValue("%s/LogBackends" % cfgPath, "stdout")
    self.registerBackends(List.fromChar(desiredBackends))
    # Configure verbosity: scripts have their own default log level
    defaultLevel = Logger.defaultLogLevel
    if "Scripts" in cfgPath:
        defaultLevel = gConfig.getValue("/Systems/Scripts/LogLevel", Logger.defaultLogLevel)
    self.setLevel(gConfig.getValue("%s/LogLevel" % cfgPath, defaultLevel))
    # Configure framing
    self._showCallingFrame = gConfig.getValue("%s/LogShowLine" % cfgPath, self._showCallingFrame)
    # Get system name
    self._systemName = str(systemName)
    if not self.__backendOptions["Interactive"]:
        # Non-interactive backends buffer messages: flush them on process exit
        ExitCallback.registerExitCallback(self.flushAllMessages)
def createHandler(self, parameters=None):
    """ Each backend can initialize its attributes and create its handler with them.

    :params parameters: dictionary of parameters. ex: {'FileName': file.log}
    """
    if parameters is not None:
        # Override defaults only with the keys actually provided
        self.__interactive = parameters.get('Interactive', self.__interactive)
        self.__sleepTime = parameters.get('SleepTime', self.__sleepTime)
    self.__site = DIRAC.siteName()
    handler = ServerHandler(self.__sleepTime, self.__interactive, self.__site)
    # Only errors and worse are shipped to the server
    handler.setLevel(LogLevels.ERROR)
    self._handler = handler
def __init__(self):
    """Standard constructor"""
    self.log = gLogger.getSubLogger("TimeLeft")
    # Normalized CPU power of this worker node (0.0 when not configured)
    self.cpuPower = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
    if not self.cpuPower:
        self.log.warn("/LocalSite/CPUNormalizationFactor not defined for site %s" % DIRAC.siteName())
    pluginResult = self.__getBatchSystemPlugin()
    if not pluginResult["OK"]:
        # Keep the failure message so getTimeLeft() can report it later
        self.batchPlugin = None
        self.batchError = pluginResult["Message"]
    else:
        self.batchPlugin = pluginResult["Value"]
def am_initialize(self, *initArgs):
    """Common initialization for all the agents, called by the AgentReactor.

    Runs the subclass initialize(), prepares control/work directories, and
    logs a summary banner.

    :return: S_OK() on success, S_ERROR(reason) otherwise
    """
    agentName = self.am_getModuleParam('fullName')
    result = self.initialize(*initArgs)
    # FIX: identity comparison with None ('is None') per PEP 8, not '== None'
    if result is None:
        return S_ERROR(
            "Error while initializing %s module: initialize must return S_OK/S_ERROR"
            % agentName)
    if not result['OK']:
        return S_ERROR("Error while initializing %s: %s" %
                       (agentName, result['Message']))
    _checkDir(self.am_getControlDirectory())
    _checkDir(self.am_getWorkDirectory())
    self.__moduleProperties['shifterProxy'] = self.am_getOption(
        'shifterProxy')
    if self.am_monitoringEnabled():
        self.monitor.enable()
    # At least one execute-like method is required to run cycles
    if len(self.__moduleProperties['executors']) < 1:
        return S_ERROR("At least one executor method has to be defined")
    if not self.am_Enabled():
        return S_ERROR("Agent is disabled via the configuration")
    # Startup banner summarizing the agent environment
    self.log.notice("=" * 40)
    self.log.notice("Loaded agent module %s" % self.__moduleProperties['fullName'])
    self.log.notice(" Site: %s" % DIRAC.siteName())
    self.log.notice(" Setup: %s" % gConfig.getValue("/DIRAC/Setup"))
    self.log.notice(" Base Module version: %s " % __RCSID__)
    self.log.notice(" Agent version: %s" % self.__codeProperties['version'])
    self.log.notice(" DIRAC version: %s" % DIRAC.version)
    self.log.notice(" DIRAC platform: %s" % DIRAC.platform)
    pollingTime = int(self.am_getOption('PollingTime'))
    if pollingTime > 3600:
        self.log.notice(" Polling time: %s hours" % (pollingTime / 3600.))
    else:
        self.log.notice(" Polling time: %s seconds" % self.am_getOption('PollingTime'))
    self.log.notice(" Control dir: %s" % self.am_getControlDirectory())
    self.log.notice(" Work dir: %s" % self.am_getWorkDirectory())
    if self.am_getOption('MaxCycles') > 0:
        self.log.notice(" Cycles: %s" % self.am_getMaxCycles())
    else:
        self.log.notice(" Cycles: unlimited")
    self.log.notice("=" * 40)
    self.__initialized = True
    return S_OK()
def rescheduleFailedJob(jobID,message):
  """Report a wrapper failure and ask the JobManager to reschedule *jobID*.

  Sets the application status and a 'Rescheduled' job status (flushed before
  the reschedule request), then notifies the alarm mail address. Never raises:
  any failure here is logged and swallowed.

  :param jobID: job identifier (converted with int())
  :param message: reason for the reschedule, reported in statuses and mail
  """
  try:
    import DIRAC
    global jobReport
    gLogger.warn('Failure during %s' %(message))
    #Setting a job parameter does not help since the job will be rescheduled,
    #instead set the status with the cause and then another status showing the
    #reschedule operation.
    if not jobReport:
      gLogger.info('Creating a new JobReport Object')
      jobReport = JobReport(int(jobID),'JobWrapperTemplate')
    jobReport.setApplicationStatus( 'Failed %s ' % message, sendFlag = False )
    jobReport.setJobStatus( 'Rescheduled', message, sendFlag = False )
    # We must send Job States and Parameters before it gets reschedule
    jobReport.sendStoredStatusInfo()
    jobReport.sendStoredJobParameters()
    gLogger.info('Job will be rescheduled after exception during execution of the JobWrapper')
    jobManager = RPCClient('WorkloadManagement/JobManager')
    result = jobManager.rescheduleJob(int(jobID))
    if not result['OK']:
      gLogger.warn(result)
    # Send mail to debug errors
    mailAddress = DIRAC.alarmMail
    site = DIRAC.siteName()
    subject = 'Job rescheduled at %s' % site
    ret = systemCall(0,'hostname')
    wn = ret['Value'][1]
    msg = 'Job %s rescheduled at %s, wn=%s\n' % ( jobID, site, wn )
    msg += message
    NotificationClient().sendMail(mailAddress,subject,msg,fromAddress="*****@*****.**",localAttempt=False)
    return
  # FIX: 'except Exception,x' is Python-2-only syntax (invalid in Python 3);
  # the bound variable was unused, so drop it.
  except Exception:
    gLogger.exception('JobWrapperTemplate failed to reschedule Job')
    return
def __getBatchSystemPlugin(self):
    """ Using the name of the batch system plugin, will return an instance of the plugin class.

    Detection is based on characteristic environment variables; MJF
    (Machine/Job Features) is a fallback when no legacy variable is set.

    :return: S_OK(plugin instance) or S_ERROR(message)
    """
    batchSystems = {
        'LSF': 'LSB_JOBID',
        'PBS': 'PBS_JOBID',
        'BQS': 'QSUB_REQNAME',
        'SGE': 'SGE_TASK_ID'}  # more to be added later
    name = None
    for batchSystem, envVar in batchSystems.items():
        if envVar in os.environ:
            name = batchSystem
            break
    if name is None and 'MACHINEFEATURES' in os.environ and 'JOBFEATURES' in os.environ:
        # Only use MJF if legacy batch system information not available for now
        name = 'MJF'
    if name is None:
        self.log.warn('Batch system type for site %s is not currently supported' % DIRAC.siteName())
        return S_ERROR('Current batch system is not supported')

    self.log.debug('Creating plugin for %s batch system' % (name))
    batchSystemName = "%sTimeLeft" % (name)
    try:
        batchPlugin = __import__('DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName,
                                 globals(), locals(), [batchSystemName])
    except ImportError as x:
        msg = 'Could not import DIRAC.Core.Utilities.TimeLeft.%s' % (batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)

    try:
        # FIX: getattr replaces eval() of a constructed string — same result,
        # no dynamic code evaluation.
        batchInstance = getattr(batchPlugin, batchSystemName)()
    except Exception as x:  # pylint: disable=broad-except
        msg = 'Could not instantiate %s()' % (batchSystemName)
        self.log.warn(x)
        self.log.warn(msg)
        return S_ERROR(msg)

    return S_OK(batchInstance)
def __getConfigFlags(infoDict=None):
    """Get the flags for dirac-configure inside the container.

    Returns a string containing the command line flags.
    """
    infoDict = infoDict or {}
    flags = []
    # Setup: prefer the pilot info dict, fall back to the local configuration
    setup = infoDict.get("DefaultSetup") or gConfig.getValue("/DIRAC/Setup", "unknown")
    flags.append("-S '%s'" % setup)
    # Configuration servers: same precedence as the setup
    csServers = infoDict.get("ConfigurationServers") or gConfig.getValue("/DIRAC/Configuration/Servers", [])
    flags.append("-C '%s'" % ",".join(str(ce) for ce in csServers))
    flags.append("-n '%s'" % DIRAC.siteName())
    return " ".join(flags)
def initialize( self ):
  """Set up the monitoring source information for this component.

  Fills the source dictionary (setup, site, component location/name) according
  to the component type and registers this client with the global flusher.

  :raises Exception: when componentType is not one of the known constants
  """
  self.logger = gLogger.getSubLogger( "Monitoring" )
  self.logger.debug( "Initializing Monitoring Client" )
  self.sourceDict[ 'setup' ] = gConfig.getValue( "/DIRAC/Setup" )
  self.sourceDict[ 'site' ] = DIRAC.siteName()
  if self.sourceDict[ 'componentType' ] == self.COMPONENT_SERVICE:
    self.cfgSection = PathFinder.getSystemSection( self.sourceDict[ 'componentName' ] )
  elif self.sourceDict[ 'componentType' ] == self.COMPONENT_AGENT:
    self.cfgSection = PathFinder.getAgentSection( self.sourceDict[ 'componentName' ] )
    # Agents are located by the host they run on
    self.setComponentLocation( Network.getFQDN() )
  elif self.sourceDict[ 'componentType' ] == self.COMPONENT_WEB:
    self.cfgSection = "/WebApp"
    self.setComponentLocation( 'http://%s' % Network.getFQDN() )
    self.setComponentName( 'WebApp' )
  elif self.sourceDict[ 'componentType' ] == self.COMPONENT_SCRIPT:
    self.cfgSection = "/Script"
  else:
    raise Exception( "Component type has not been defined" )
  gMonitoringFlusher.registerMonitoringClient( self )
  # ExitCallback.registerExitCallback( self.forceFlush )
  self.__initialized = True
def am_initialize( self, *initArgs ):
  """Common initialization for all the agents, called by the AgentReactor.

  Runs the subclass initialize(), prepares control/work directories, and logs
  a summary banner.

  :return: S_OK() on success, S_ERROR(reason) otherwise
  """
  agentName = self.am_getModuleParam( 'fullName' )
  result = self.initialize( *initArgs )
  # FIX: identity comparison with None ('is None') per PEP 8, not '== None'
  if result is None:
    return S_ERROR( "Error while initializing %s module: initialize must return S_OK/S_ERROR" % agentName )
  if not result[ 'OK' ]:
    return S_ERROR( "Error while initializing %s: %s" % ( agentName, result[ 'Message' ] ) )
  _checkDir( self.am_getControlDirectory() )
  _checkDir( self.am_getWorkDirectory() )
  self.__moduleProperties[ 'shifterProxy' ] = self.am_getOption( 'shifterProxy' )
  if self.am_monitoringEnabled():
    self.monitor.enable()
  # At least one execute-like method is required to run cycles
  if len( self.__moduleProperties[ 'executors' ] ) < 1:
    return S_ERROR( "At least one executor method has to be defined" )
  if not self.am_Enabled():
    return S_ERROR( "Agent is disabled via the configuration" )
  # Startup banner summarizing the agent environment
  self.log.notice( "="*40 )
  self.log.notice( "Loaded agent module %s" % self.__moduleProperties[ 'fullName' ] )
  self.log.notice( " Site: %s" % DIRAC.siteName() )
  self.log.notice( " Setup: %s" % gConfig.getValue( "/DIRAC/Setup" ) )
  self.log.notice( " Base Module version: %s " % __RCSID__ )
  self.log.notice( " Agent version: %s" % self.__codeProperties[ 'version' ] )
  self.log.notice( " DIRAC version: %s" % DIRAC.version )
  self.log.notice( " DIRAC platform: %s" % DIRAC.platform )
  pollingTime = int( self.am_getOption( 'PollingTime' ) )
  if pollingTime > 3600:
    self.log.notice( " Polling time: %s hours" % ( pollingTime / 3600. ) )
  else:
    self.log.notice( " Polling time: %s seconds" % self.am_getOption( 'PollingTime' ) )
  self.log.notice( " Control dir: %s" % self.am_getControlDirectory() )
  self.log.notice( " Work dir: %s" % self.am_getWorkDirectory() )
  if self.am_getOption( 'MaxCycles' ) > 0:
    self.log.notice( " Cycles: %s" % self.am_getMaxCycles() )
  else:
    self.log.notice( " Cycles: unlimited" )
  self.log.notice( "="*40 )
  self.__initialized = True
  return S_OK()
def jobexec( jobxml, wfParameters ):
  """Load a workflow from an XML description and execute it.

  Exits the process (status 1) when the XML file does not exist.

  :param jobxml: path to the workflow XML file
  :param wfParameters: dict of workflow parameters to propagate
  :return: result of workflow.execute()
  """
  jobfile = os.path.abspath( jobxml )
  if not os.path.exists( jobfile ):
    gLogger.warn( 'Path to specified workflow %s does not exist' % ( jobfile ) )
    sys.exit( 1 )
  workflow = fromXMLFile( jobfile )
  gLogger.debug( workflow )
  code = workflow.createCode()
  gLogger.debug( code )
  # FIX: dict.has_key() was removed in Python 3; os.environ.get keeps the
  # same behavior (0 when JOBID is not set)
  jobID = os.environ.get( 'JOBID', 0 )
  gLogger.info( 'DIRAC JobID %s is running at site %s' % ( jobID, DIRAC.siteName() ) )
  workflow.addTool( 'JobReport', JobReport( jobID ) )
  workflow.addTool( 'AccountingReport', DataStoreClient() )
  workflow.addTool( 'Request', Request() )
  # Propagate the command line parameters to the workflow if any
  for pName, pValue in wfParameters.items():
    workflow.setValue( pName, pValue )
  return workflow.execute()
def __init__( self, infosys = None, master_host = None, mirrors = None ):
  """ Default constructor

  :param infosys: BDII/infosys endpoint; taken from the CS when not given
  :param master_host: read-write master LFC host; taken from the CS when not given
  :param mirrors: list of read-only mirror hosts; when empty/None they are
                  resolved from the catalog location ordering for this site

  FIX: 'mirrors' used to default to a mutable [] (shared across all calls);
  None is the safe equivalent since only its falsiness is tested below.
  """
  if not infosys:
    configPath = '/Resources/FileCatalogs/LcgFileCatalogCombined/LcgGfalInfosys'
    infosys = gConfig.getValue( configPath )
  self.valid = False
  if not master_host:
    configPath = '/Resources/FileCatalogs/LcgFileCatalogCombined/MasterHost'
    master_host = gConfig.getValue( configPath )
  if master_host:
    # Create the master LFC client first
    self.lfc = LcgFileCatalogClient( infosys, master_host )
    if self.lfc.isOK():
      self.valid = True
    if not mirrors:
      siteName = DIRAC.siteName()
      res = getLocationOrderedCatalogs( siteName = siteName )
      if not res['OK']:
        mirrors = []
      else:
        mirrors = res['Value']
    # Create the mirror LFC instances
    self.mirrors = []
    for mirror in mirrors:
      lfc = LcgFileCatalogClient( infosys, mirror )
      self.mirrors.append( lfc )
    self.nmirrors = len( self.mirrors )
    # Keep the environment for the master instance
    self.master_host = self.lfc.host
    os.environ['LFC_HOST'] = self.master_host
    os.environ['LCG_GFAL_INFOSYS'] = infosys
  self.name = 'LFC'
  self.timeout = 3000
Script.localCfg.addDefaultEntry( '/LocalSite/Site', siteName ) DIRAC.__siteName = False if ceName: DIRAC.gLogger.notice( 'Setting /LocalSite/GridCE = %s' % ceName ) Script.localCfg.addDefaultEntry( '/LocalSite/GridCE', ceName ) if not localSE and siteName in sites: localSE = getSEsForSite( siteName ) if localSE['OK'] and localSE['Value']: localSE = ','.join( localSE['Value'] ) DIRAC.gLogger.notice( 'Setting /LocalSite/LocalSE =', localSE ) Script.localCfg.addDefaultEntry( '/LocalSite/LocalSE', localSE ) break if gatewayServer: DIRAC.gLogger.verbose( '/DIRAC/Gateways/%s =' % DIRAC.siteName(), gatewayServer ) Script.localCfg.addDefaultEntry( '/DIRAC/Gateways/%s' % DIRAC.siteName(), gatewayServer ) # Create the local cfg if it is not yet there if not outputFile: outputFile = DIRAC.gConfig.diracConfigFilePath outputFile = os.path.abspath( outputFile ) if not os.path.exists( outputFile ): configDir = os.path.dirname( outputFile ) mkDir(configDir) update = True DIRAC.gConfig.dumpLocalCFGToFile( outputFile ) # We need user proxy or server certificate to continue if not useServerCert: Script.enableCS()
def getTimeLeft( self, cpuConsumed = 0.0 ):
  """Returns the CPU Time Left for supported batch systems.
  The CPUConsumed is the current raw total CPU.

  :param cpuConsumed: real CPU seconds consumed so far (0.0 when called by
                      the watchdog, non-zero from the JobAgent)
  :return: S_OK(normalized CPU seconds left) or S_ERROR when no time is left
           or the environment is not usable
  """
  # Quit if no scale factor available
  if not self.scaleFactor:
    return S_ERROR( '/LocalSite/CPUScalingFactor not defined for site %s' % DIRAC.siteName() )
  if not self.batchPlugin:
    return S_ERROR( self.batchError )
  resourceDict = self.batchPlugin.getResourceUsage()
  if not resourceDict['OK']:
    self.log.warn( 'Could not determine timeleft for batch system at site %s' % DIRAC.siteName() )
    return resourceDict
  resources = resourceDict['Value']
  if not resources['CPULimit'] or not resources['WallClockLimit']:
    # This should never happen
    return S_ERROR( 'No CPU or WallClock limit obtained' )
  timeLeft = 0.
  # Fractions of the CPU and wall-clock budgets already used / remaining
  cpu = float( resources['CPU'] )
  cpuLimit = float( resources['CPULimit'] )
  cpuUsedFraction = cpu / cpuLimit
  cpuRemainingFraction = 1. - cpuUsedFraction
  wc = float( resources['WallClock'] )
  wcLimit = float( resources['WallClockLimit'] )
  wcUsedFraction = wc / wcLimit
  wcRemainingFraction = 1. - wcUsedFraction
  marginFraction = self.cpuMargin / 100.
  fractionTuple = ( 100. * cpuRemainingFraction, 100. * wcRemainingFraction, self.cpuMargin )
  self.log.verbose( 'Used CPU is %.1f s out of %.1f, Used WallClock is %.1f s out of %.1f.' % ( cpu, cpuLimit, wc, wcLimit ) )
  self.log.verbose( 'Remaining CPU %.02f%%, Remaining WallClock %.02f%%, margin %s%%' % fractionTuple )
  validTimeLeft = False
  if wcRemainingFraction > cpuRemainingFraction and ( wcRemainingFraction - cpuRemainingFraction ) > marginFraction:
    # FIXME: I have no idea why this test is done (PhC)
    self.log.verbose( 'Remaining CPU %.02f%% < Remaining WallClock %.02f%% and difference > margin %s%%' % fractionTuple )
    validTimeLeft = True
  else:
    if cpuRemainingFraction > marginFraction and wcRemainingFraction > marginFraction:
      self.log.verbose( 'Remaining CPU %.02f%% and Remaining WallClock %.02f%% both > margin %s%%' % fractionTuple )
      validTimeLeft = True
    else:
      self.log.verbose( 'Remaining CPU %.02f%% or WallClock %.02f%% < margin %s%% so no time left' % fractionTuple )
  if validTimeLeft:
    if cpu and cpuConsumed > 3600. and self.normFactor:
      # If there has been more than 1 hour of consumed CPU and
      # there is a Normalization set for the current CPU
      # use that value to renormalize the values returned by the batch system
      # NOTE: cpuConsumed is non-zero for call by the JobAgent and 0 for call by the watchdog
      # cpuLimit and cpu may be in the units of the batch system, not real seconds... (in this case the other case won't work)
      # therefore renormalise it using cpuConsumed (which is in real seconds)
      timeLeft = ( cpuLimit - cpu ) * self.normFactor * cpuConsumed / cpu
    elif self.normFactor:
      # FIXME: this is always used by the watchdog... Also used by the JobAgent
      # if consumed less than 1 hour of CPU
      # It was using self.scaleFactor but this is inconsistent: use the same as above
      # In case the returned cpu and cpuLimit are not in real seconds, this is however rubbish
      timeLeft = ( cpuLimit - cpu ) * self.normFactor
    else:
      # Last resort recovery...
      timeLeft = ( cpuLimit - cpu ) * self.scaleFactor
    self.log.verbose( 'Remaining CPU in normalized units is: %.02f' % timeLeft )
    return S_OK( timeLeft )
  else:
    return S_ERROR( 'No time left for slot' )
def __resolveInputData(self):
    """This method controls the execution of the DIRAC input data modules according
    to the ILC VO policy defined in the configuration service.

    The policy comes from the job ('InputDataPolicy') or, failing that, from
    the '/InputDataPolicy' Operations section (per-site entry, 'Default'
    fallback). Policy modules run in order until all replicas are resolved.

    NOTE: this is Python-2 era code (has_key, string module, types.StringTypes).

    :return: S_OK-like dict with 'Successful' and 'Failed' entries, or S_ERROR
    """
    if self.arguments['Configuration'].has_key('SiteName'):
        site = self.arguments['Configuration']['SiteName']
    else:
        site = DIRAC.siteName()
    policy = []
    if not self.arguments.has_key('Job'):
        self.arguments['Job'] = {}
    if self.arguments['Job'].has_key('InputDataPolicy'):
        policy = self.arguments['Job']['InputDataPolicy']
        #In principle this can be a list of modules with the first taking precedence
        if type(policy) in types.StringTypes:
            policy = [policy]
        self.log.info('Job has a specific policy setting: %s' % (string.join(policy, ', ')))
    else:
        self.log.verbose('Attempting to resolve input data policy for site %s' % site)
        inputDataPolicy = self.ops.getOptionsDict('/InputDataPolicy')
        if not inputDataPolicy:
            return S_ERROR('Could not resolve InputDataPolicy from /InputDataPolicy')
        options = inputDataPolicy['Value']
        # Site-specific policy wins over the 'Default' entry
        if options.has_key(site):
            policy = options[site]
            policy = [x.strip() for x in string.split(policy, ',')]
            self.log.info('Found specific input data policy for site %s:\n%s' % (site, string.join(policy, ',\n')))
        elif options.has_key('Default'):
            policy = options['Default']
            policy = [x.strip() for x in string.split(policy, ',')]
            self.log.info('Applying default input data policy for site %s:\n%s' % (site, string.join(policy, ',\n')))
    dataToResolve = None  #if none, all supplied input data is resolved
    allDataResolved = False
    successful = {}
    failedReplicas = []
    for modulePath in policy:
        # Later modules only retry what earlier modules failed to resolve
        if not allDataResolved:
            result = self.__runModule(modulePath, dataToResolve)
            if not result['OK']:
                self.log.warn('Problem during %s execution' % modulePath)
                return result
            if result.has_key('Failed'):
                failedReplicas = result['Failed']
            if failedReplicas:
                self.log.info('%s failed for the following files:\n%s' % (modulePath, string.join(failedReplicas, '\n')))
                dataToResolve = failedReplicas
            else:
                self.log.info('All replicas resolved after %s execution' % (modulePath))
                allDataResolved = True
            successful.update(result['Successful'])
            self.log.verbose(successful)
    result = S_OK()
    result['Successful'] = successful
    result['Failed'] = failedReplicas
    return result
def __resolveInputData(self):
  """Control the execution of the DIRAC input data modules according
     to the ILC VO policy defined in the configuration service.

     The policy is taken from the job definition (InputDataPolicy) when
     present; otherwise it is resolved from /InputDataPolicy in the CS,
     preferring a site-specific entry over "Default".

     :return: S_OK with "Successful"/"Failed" entries describing resolved
              and unresolved replicas, or S_ERROR on failure.
  """
  if "SiteName" in self.arguments["Configuration"]:
    site = self.arguments["Configuration"]["SiteName"]
  else:
    site = DIRAC.siteName()

  policy = []
  if "Job" not in self.arguments:
    self.arguments["Job"] = {}

  if "InputDataPolicy" in self.arguments["Job"]:
    policy = self.arguments["Job"]["InputDataPolicy"]
    # In principle this can be a list of modules with the first taking precedence
    if type(policy) in types.StringTypes:
      policy = [policy]
    self.log.info("Job has a specific policy setting: %s" % (", ".join(policy)))
  else:
    self.log.verbose("Attempting to resolve input data policy for site %s" % site)
    inputDataPolicy = self.ops.getOptionsDict("/InputDataPolicy")
    # BUGFIX: getOptionsDict() returns an S_OK/S_ERROR structure which is
    # always truthy; the error flag must be checked explicitly, otherwise an
    # S_ERROR result would raise KeyError on ["Value"] below.
    if not inputDataPolicy["OK"]:
      return S_ERROR("Could not resolve InputDataPolicy from /InputDataPolicy")

    options = inputDataPolicy["Value"]
    if site in options:
      policy = [x.strip() for x in options[site].split(",")]
      self.log.info("Found specific input data policy for site %s:\n%s" % (site, ",\n".join(policy)))
    elif "Default" in options:
      policy = [x.strip() for x in options["Default"].split(",")]
      self.log.info(
          "Applying default input data policy for site %s:\n%s" % (site, ",\n".join(policy))
      )

  dataToResolve = None  # if None, all supplied input data is resolved
  allDataResolved = False
  successful = {}
  failedReplicas = []
  # Run each policy module in turn; subsequent modules only retry the
  # replicas the previous module failed to resolve.
  for modulePath in policy:
    if not allDataResolved:
      result = self.__runModule(modulePath, dataToResolve)
      if not result["OK"]:
        self.log.warn("Problem during %s execution" % modulePath)
        return result

      failedReplicas = result.get("Failed", failedReplicas)

      if failedReplicas:
        self.log.info(
            "%s failed for the following files:\n%s" % (modulePath, "\n".join(failedReplicas))
        )
        dataToResolve = failedReplicas
      else:
        self.log.info("All replicas resolved after %s execution" % (modulePath))
        allDataResolved = True

      successful.update(result["Successful"])
      self.log.verbose(successful)

  result = S_OK()
  result["Successful"] = successful
  result["Failed"] = failedReplicas
  return result
def __findServiceURL(self):
  """ Discovers the URL of a service, taking into account gateways, multiple URLs, banned URLs

      If the site on which we run is configured to use gateways (/DIRAC/Gateways/<siteName>),
      these URLs will be used. To ignore the gateway, it is possible to set KW_IGNORE_GATEWAYS
      to False in kwargs.

      If self._destinationSrv (given as constructor attribute) is a properly formed URL,
      we just return this one. If we have to use a gateway, we just replace the server name in the url.

      The list of URLs defined in the CS (<System>/URLs/<Component>) is randomized

      This method also sets some attributes:
        * self.__nbOfUrls = number of URLs
        * self.__nbOfRetry = 2 if we have more than 2 urls, otherwise 3
        * self.__bannedUrls is reinitialized if all the URLs are banned

      :return: S_OK(selected URL) or S_ERROR
  """
  # Bail out early if the constructor-time initialization failed.
  if not self.__initStatus['OK']:
    return self.__initStatus

  # Load the Gateways URLs for the current site Name
  gatewayURL = False
  if self.KW_IGNORE_GATEWAYS not in self.kwargs or not self.kwargs[self.KW_IGNORE_GATEWAYS]:
    dRetVal = gConfig.getOption("/DIRAC/Gateways/%s" % DIRAC.siteName())
    if dRetVal['OK']:
      # Pick one gateway at random and keep only its scheme://host:port part
      # (first three '/'-separated components).
      rawGatewayURL = List.randomize(List.fromChar(dRetVal['Value'], ","))[0]
      gatewayURL = "/".join(rawGatewayURL.split("/")[:3])

  # If what was given as constructor attribute is a properly formed URL,
  # we just return this one.
  # If we have to use a gateway, we just replace the server name in it
  for protocol in gProtocolDict:
    if self._destinationSrv.find("%s://" % protocol) == 0:
      gLogger.debug("Already given a valid url", self._destinationSrv)
      if not gatewayURL:
        return S_OK(self._destinationSrv)
      gLogger.debug("Reconstructing given URL to pass through gateway")
      path = "/".join(self._destinationSrv.split("/")[3:])
      finalURL = "%s/%s" % (gatewayURL, path)
      gLogger.debug("Gateway URL conversion:\n %s -> %s" % (self._destinationSrv, finalURL))
      return S_OK(finalURL)

  # Not a full URL: a gateway, if any, is prepended to the service name.
  if gatewayURL:
    gLogger.debug("Using gateway", gatewayURL)
    return S_OK("%s/%s" % (gatewayURL, self._destinationSrv))

  # We extract the list of URLs from the CS (System/URLs/Component)
  try:
    urls = getServiceURL(self._destinationSrv, setup=self.setup)
  except Exception as e:
    return S_ERROR("Cannot get URL for %s in setup %s: %s" % (self._destinationSrv, self.setup, repr(e)))
  if not urls:
    return S_ERROR("URL for service %s not found" % self._destinationSrv)

  failoverUrls = []
  # Try if there are some failover URLs to use as last resort
  try:
    failoverUrlsStr = getServiceFailoverURL(self._destinationSrv, setup=self.setup)
    if failoverUrlsStr:
      failoverUrls = failoverUrlsStr.split(',')
  except Exception as e:
    # NOTE(review): failover URLs are deliberately best-effort, but the
    # exception is swallowed without logging — consider a debug log here.
    pass

  # We randomize the list, and add at the end the failover URLs (System/FailoverURLs/Component)
  urlsList = List.randomize(List.fromChar(urls, ",")) + failoverUrls

  self.__nbOfUrls = len(urlsList)
  # we retry 2 times all services, if we run more than 2 services
  self.__nbOfRetry = 2 if self.__nbOfUrls > 2 else 3
  if self.__nbOfUrls == len(self.__bannedUrls):
    self.__bannedUrls = []  # retry all urls
    gLogger.debug("Retrying again all URLs")

  if len(self.__bannedUrls) > 0 and len(urlsList) > 1:
    # we have host which is not accessible. We remove that host from the list.
    # We only remove if we have more than one instance
    for i in self.__bannedUrls:
      gLogger.debug("Removing banned URL", "%s" % i)
      urlsList.remove(i)

  # Take the first URL from the list
  #randUrls = List.randomize( urlsList ) + failoverUrls
  sURL = urlsList[0]

  # If we have banned URLs, and several URLs at disposals, we make sure that the selected sURL
  # is not on a host which is banned. If it is, we take the next one in the list using __selectUrl
  if len(self.__bannedUrls) > 0 and self.__nbOfUrls > 2:
    # when we have multiple services then we can have a situation when two
    # services are running on the same machine with different ports...
    retVal = Network.splitURL(sURL)
    nexturl = None
    if retVal['OK']:
      nexturl = retVal['Value']
      found = False
      for i in self.__bannedUrls:
        retVal = Network.splitURL(i)
        if retVal['OK']:
          bannedurl = retVal['Value']
        else:
          # NOTE(review): an unparsable banned URL aborts the whole scan
          # (break, not continue) — presumably intentional; verify.
          break
        # We found a banned URL on the same host as the one we are running on
        # (index 1 of the split result is compared — presumably the host part).
        if nexturl[1] == bannedurl[1]:
          found = True
          break
      if found:
        nexturl = self.__selectUrl(nexturl, urlsList[1:])
        if nexturl:  # an url found which is in different host
          sURL = nexturl
  gLogger.debug("Discovering URL for service", "%s -> %s" % (self._destinationSrv, sURL))
  return S_OK(sURL)