class StorageElementCache(object):
    """Factory/cache of StorageElementItem objects, keyed per thread and VO.

    Calling the instance returns a cached StorageElementItem when one
    exists for the (thread, name, plugins, vo) combination, otherwise a
    fresh one is built and cached for 30 minutes.
    """

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, plugins=None, vo=None, hideExceptions=False):
        # Drop entries that have not been refreshed in the last minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        threadId = threading.current_thread().ident
        if not vo:
            voRes = getVOfromProxyGroup()
            if not voRes['OK']:
                # NOTE: implicitly returns None when the VO cannot be resolved
                return
            vo = voRes['Value']
        cacheKey = (threadId, name, plugins, vo)
        storageElement = self.seCache.get(cacheKey)
        if storageElement:
            return storageElement
        storageElement = StorageElementItem(name, plugins, vo, hideExceptions=hideExceptions)
        # Keep the StorageElement in the cache for half an hour
        self.seCache.add(cacheKey, 1800, storageElement)
        return storageElement
class StorageElementCache(object):
    """Factory/cache of StorageElementItem objects.

    Entries are keyed per thread, VO and proxy location; a cached item is
    reused for up to 30 minutes.
    """

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, plugins=None, vo=None, hideExceptions=False):
        # Expire entries unused for more than a minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        threadId = threading.current_thread().ident
        if not vo:
            voRes = getVOfromProxyGroup()
            if not voRes['OK']:
                # NOTE: implicitly returns None when the VO cannot be resolved
                return
            vo = voRes['Value']
        # Because the gfal2 context caches the proxy location, the proxy
        # location must be part of the cache key as well. In practice there
        # should almost always be a single one, except for the REA. If the
        # memory consumption ever explodes, this is a place to look at.
        proxyLoc = getProxyLocation()
        cacheKey = (threadId, name, plugins, vo, proxyLoc)
        storageElement = self.seCache.get(cacheKey)
        if storageElement:
            return storageElement
        storageElement = StorageElementItem(name, plugins, vo, hideExceptions=hideExceptions)
        # Keep the StorageElement in the cache for half an hour
        self.seCache.add(cacheKey, 1800, storageElement)
        return storageElement
class PlotCache(object):
    """In-memory cache of rendered plot files.

    Plots are rendered into ``plotsLocation`` and kept for
    ``__graphLifeTime`` seconds; a background thread periodically purges
    expired entries (the DictCache delete hook removes the file).
    """

    def __init__(self, plotsLocation=False):
        # Directory where the generated .png files live
        self.plotsLocation = plotsLocation
        self.alive = True
        self.__graphCache = DictCache(deleteFunction=_deleteGraph)
        # Lifetime (seconds) of a cached plot
        self.__graphLifeTime = 600
        self.purgeThread = threading.Thread(target=self.purgeExpired)
        # FIX: make the purge loop a daemon thread — purgeExpired() loops
        # forever while self.alive, and a non-daemon thread would prevent
        # interpreter shutdown (the legacy variant of this class already
        # marks it daemon).
        self.purgeThread.daemon = True
        self.purgeThread.start()

    def setPlotsLocation(self, plotsDir):
        """Set the plots directory and delete any leftover .png files in it."""
        self.plotsLocation = plotsDir
        for plot in os.listdir(self.plotsLocation):
            if plot.find(".png") > 0:
                plotLocation = "%s/%s" % (self.plotsLocation, plot)
                gLogger.verbose("Purging %s" % plotLocation)
                os.unlink(plotLocation)

    def purgeExpired(self):
        """Background loop: purge expired cache entries while alive."""
        while self.alive:
            time.sleep(self.__graphLifeTime)
            self.__graphCache.purgeExpired()

    def getPlot(self, plotHash, plotData, plotMetadata, subplotMetadata):
        """
        Get plot from the cache if exists, else generate it
        """
        plotDict = self.__graphCache.get(plotHash)
        if plotDict is None:
            basePlotFileName = "%s/%s.png" % (self.plotsLocation, plotHash)
            if subplotMetadata:
                retVal = graph(plotData, basePlotFileName, plotMetadata, metadata=subplotMetadata)
            else:
                retVal = graph(plotData, basePlotFileName, plotMetadata)
            if not retVal['OK']:
                return retVal
            plotDict = retVal['Value']
            if plotDict['plot']:
                # Expose only the file name, not the full path
                plotDict['plot'] = os.path.basename(basePlotFileName)
            self.__graphCache.add(plotHash, self.__graphLifeTime, plotDict)
        return S_OK(plotDict)

    def getPlotData(self, plotFileName):
        """Return the raw bytes of a previously generated plot file."""
        filename = "%s/%s" % (self.plotsLocation, plotFileName)
        try:
            # FIX: the py2-only file() builtin does not exist on Python 3
            # (this block otherwise uses py3 syntax); the context manager
            # also guarantees the descriptor is closed if read() raises.
            with open(filename, "rb") as fd:
                data = fd.read()
        except Exception as v:
            return S_ERROR("Can't open file %s: %s" % (plotFileName, str(v)))
        return S_OK(data)
class PlotCache:
  """Legacy (Python 2) in-memory cache of rendered plot files.

  Plots live in ``plotsLocation`` for ``__graphLifeTime`` seconds; a daemon
  thread purges expired entries.
  """

  def __init__( self, plotsLocation = False ):
    # Directory where the generated .png files live
    self.plotsLocation = plotsLocation
    self.alive = True
    self.__graphCache = DictCache( deleteFunction = _deleteGraph )
    # Lifetime (seconds) of a cached plot
    self.__graphLifeTime = 600
    self.purgeThread = threading.Thread( target = self.purgeExpired )
    # Daemon thread so the endless purge loop never blocks process exit
    self.purgeThread.setDaemon( 1 )
    self.purgeThread.start()

  def setPlotsLocation( self, plotsDir ):
    """Set the plots directory and delete any leftover .png files in it."""
    self.plotsLocation = plotsDir
    for plot in os.listdir( self.plotsLocation ):
      if plot.find( ".png" ) > 0:
        plotLocation = "%s/%s" % ( self.plotsLocation, plot )
        gLogger.verbose( "Purging %s" % plotLocation )
        os.unlink( plotLocation )

  def purgeExpired( self ):
    """Background loop: purge expired cache entries while alive."""
    while self.alive:
      time.sleep( self.__graphLifeTime )
      self.__graphCache.purgeExpired()

  def getPlot( self, plotHash, plotData, plotMetadata, subplotMetadata ):
    """
    Get plot from the cache if exists, else generate it
    """
    plotDict = self.__graphCache.get( plotHash )
    # NOTE: legacy DictCache.get() returns False on a cache miss
    if plotDict == False:
      basePlotFileName = "%s/%s.png" % ( self.plotsLocation, plotHash )
      if subplotMetadata:
        retVal = graph( plotData, basePlotFileName, plotMetadata, metadata = subplotMetadata )
      else:
        retVal = graph( plotData, basePlotFileName, plotMetadata )
      if not retVal[ 'OK' ]:
        return retVal
      plotDict = retVal[ 'Value' ]
      if plotDict[ 'plot' ]:
        # Expose only the file name, not the full path
        plotDict[ 'plot' ] = os.path.basename( basePlotFileName )
      self.__graphCache.add( plotHash, self.__graphLifeTime, plotDict )
    return S_OK( plotDict )

  def getPlotData( self, plotFileName ):
    """Return the raw bytes of a previously generated plot file."""
    filename = "%s/%s" % ( self.plotsLocation, plotFileName )
    try:
      # Python 2 only: file() builtin
      fd = file( filename, "rb" )
      data = fd.read()
      fd.close()
    except Exception, v:
      return S_ERROR( "Can't open file %s: %s" % ( plotFileName, str( v ) ) )
    return S_OK( data )
class StorageElementCache(object):
    """Cache of StorageElementItem objects keyed by (name, protocols, vo)."""

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, protocols=None, vo=None, hideExceptions=False):
        # Expire entries unused for more than a minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        cacheKey = (name, protocols, vo)
        storageElement = self.seCache.get(cacheKey)
        if storageElement:
            return storageElement
        storageElement = StorageElementItem(name, protocols, vo, hideExceptions=hideExceptions)
        # Keep the instance around for half an hour
        self.seCache.add(cacheKey, 1800, storageElement)
        return storageElement
class StorageElementCache(object):
    """Minimal cache of StorageElementItem objects keyed by (name, protocols, vo)."""

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, protocols=None, vo=None):
        # Expire entries unused for more than a minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        cacheKey = (name, protocols, vo)
        storageElement = self.seCache.get(cacheKey)
        if storageElement:
            return storageElement
        storageElement = StorageElementItem(name, protocols, vo)
        # Keep the instance around for half an hour
        self.seCache.add(cacheKey, 1800, storageElement)
        return storageElement
class StorageElementCache(object):
    """Per-thread cache of StorageElementItem objects."""

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, protocols=None, vo=None, hideExceptions=False):
        # Expire entries unused for more than a minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        threadId = threading.current_thread().ident
        cacheKey = (threadId, name, protocols, vo)
        storageElement = self.seCache.get(cacheKey)
        if storageElement:
            return storageElement
        storageElement = StorageElementItem(name, protocols, vo, hideExceptions=hideExceptions)
        # Keep the instance around for half an hour
        self.seCache.add(cacheKey, 1800, storageElement)
        return storageElement
class StorageElementCache(object):
    """Cache StorageElementItem instances, one per (thread, name, plugins, vo)."""

    def __init__(self):
        self.seCache = DictCache()

    def __call__(self, name, plugins=None, vo=None, hideExceptions=False):
        # Drop entries that were not touched within the last minute
        self.seCache.purgeExpired(expiredInSeconds=60)
        tId = threading.current_thread().ident
        if not vo:
            res = getVOfromProxyGroup()
            if not res['OK']:
                # NOTE: implicitly returns None when the VO cannot be resolved
                return
            vo = res['Value']
        key = (tId, name, plugins, vo)
        se = self.seCache.get(key)
        if not se:
            se = StorageElementItem(name, plugins, vo, hideExceptions=hideExceptions)
            # cache for 30 minutes
            self.seCache.add(key, 1800, se)
        return se
class DataCache:
  """Legacy (Python 2) cache of accounting report data and rendered graphs.

  Report data is cached for ``__dataLifeTime`` seconds and rendered graphs
  for ``__graphLifeTime`` seconds; a daemon thread purges both caches.
  """

  def __init__(self):
    # Graphs are written below the instance path, under data/accountingPlots
    self.graphsLocation = os.path.join(
        gConfig.getValue('/LocalSite/InstancePath', rootPath), 'data', 'accountingPlots')
    self.cachedGraphs = {}
    self.alive = True
    # Daemon thread so the endless purge loop never blocks process exit
    self.purgeThread = threading.Thread(target=self.purgeExpired)
    self.purgeThread.setDaemon(1)
    self.purgeThread.start()
    self.__dataCache = DictCache()
    self.__graphCache = DictCache(deleteFunction=self._deleteGraph)
    # Lifetimes in seconds
    self.__dataLifeTime = 600
    self.__graphLifeTime = 3600

  def setGraphsLocation(self, graphsDir):
    """Set the graphs directory and delete any leftover .png files in it."""
    self.graphsLocation = graphsDir
    for graphName in os.listdir(self.graphsLocation):
      if graphName.find(".png") > 0:
        graphLocation = "%s/%s" % (self.graphsLocation, graphName)
        gLogger.verbose("Purging %s" % graphLocation)
        os.unlink(graphLocation)

  def purgeExpired(self):
    """Background loop: purge both caches every 10 minutes while alive."""
    while self.alive:
      time.sleep(600)
      self.__graphCache.purgeExpired()
      self.__dataCache.purgeExpired()

  def getReportData(self, reportRequest, reportHash, dataFunc):
    """
    Get report data from cache if exists, else generate it
    """
    reportData = self.__dataCache.get(reportHash)
    # NOTE: legacy DictCache.get() returns False on a cache miss
    if reportData == False:
      retVal = dataFunc(reportRequest)
      if not retVal['OK']:
        return retVal
      reportData = retVal['Value']
      self.__dataCache.add(reportHash, self.__dataLifeTime, reportData)
    return S_OK(reportData)

  def getReportPlot(self, reportRequest, reportHash, reportData, plotFunc):
    """
    Get report data from cache if exists, else generate it
    """
    plotDict = self.__graphCache.get(reportHash)
    # NOTE: legacy DictCache.get() returns False on a cache miss
    if plotDict == False:
      basePlotFileName = "%s/%s" % (self.graphsLocation, reportHash)
      retVal = plotFunc(reportRequest, reportData, basePlotFileName)
      if not retVal['OK']:
        return retVal
      plotDict = retVal['Value']
      # Replace absolute paths with the hash-derived file names
      if plotDict['plot']:
        plotDict['plot'] = "%s.png" % reportHash
      if plotDict['thumbnail']:
        plotDict['thumbnail'] = "%s.thb.png" % reportHash
      self.__graphCache.add(reportHash, self.__graphLifeTime, plotDict)
    return S_OK(plotDict)

  def getPlotData(self, plotFileName):
    """Return the raw bytes of a previously generated plot file."""
    filename = "%s/%s" % (self.graphsLocation, plotFileName)
    try:
      # Python 2 only: file() builtin
      fd = file(filename, "rb")
      data = fd.read()
      fd.close()
    except Exception, e:
      return S_ERROR("Can't open file %s: %s" % (plotFileName, str(e)))
    return S_OK(data)
class PilotDirector(object):
  """
  Base Pilot Director class.

  Derived classes must implement:
    * __init__( self, submitPool ):
        that must call the parent class __init__ method and then do its own
        initialization
    * configure( self, csSection, submitPool ):
        that must call the parent class configure method and the do its own
        configuration
    * _submitPilot( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                    ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
    * _listMatch( self, proxy, jdl, taskQueueID, rb )
    * _getChildrenReferences( self, proxy, parentReference, taskQueueID )

  Derived classes might implement:
    * configureFromSection( self, mySection ):
        to reload from a CS section the additional datamembers they might have
        defined. If additional datamembers are defined, they must:
          - be declared in the __init__
          - be reconfigured in the configureFromSection method by executing
            self.reloadConfiguration( csSection, submitPool ) in their
            configure method
  """
  # Grid flavour handled by this Director; set by each derived class
  gridMiddleware = ''

  def __init__(self, submitPool):
    """
     Define the logger and some defaults
    """
    # One sub-logger per middleware (and per submit pool when they differ)
    if submitPool == self.gridMiddleware:
      self.log = gLogger.getSubLogger('%sPilotDirector' % self.gridMiddleware)
    else:
      self.log = gLogger.getSubLogger('%sPilotDirector/%s' % (self.gridMiddleware, submitPool))
    # Defaults come from module-level constants; most can be overridden
    # from the CS in configureFromSection
    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []
    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [self.gridMiddleware]
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    # Caches list-match results per pilot-requirements string
    self.listMatchCache = DictCache()
    self.privatePilotFraction = PRIVATE_PILOT_FRACTION
    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL
    # Fallback logger in case the assignment above was skipped by a subclass
    if not 'log' in self.__dict__:
      self.log = gLogger.getSubLogger('PilotDirector')
    self.log.info('Initialized')

  def configure(self, csSection, submitPool):
    """
     Here goes common configuration for all PilotDirectors
    """
    self.configureFromSection(csSection)
    self.reloadConfiguration(csSection, submitPool)
    # Get the defaults for the Setup where the Director is running
    opsHelper = Operations()
    self.installVersion = opsHelper.getValue(cfgPath('Pilot', 'Version'), [self.installVersion])[0]
    self.installProject = opsHelper.getValue(cfgPath('Pilot', 'Project'), self.installProject)
    self.installation = opsHelper.getValue(cfgPath('Pilot', 'Installation'), self.installation)
    self.pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", self.pilotExtensionsList)
    # Dump the effective configuration to the log
    self.log.info('===============================================')
    self.log.info('Configuration:')
    self.log.info('')
    self.log.info(' Target Grids: ', ', '.join(self.targetGrids))
    self.log.info(' Install script: ', self.install)
    self.log.info(' Pilot script: ', self.pilot)
    self.log.info(' Pilot modules', self.extraModules)
    self.log.info(' Install Ver: ', self.installVersion)
    if self.installProject:
      self.log.info(' Project: ', self.installProject)
    if self.installation:
      self.log.info(' Installation: ', self.installation)
    if self.extraPilotOptions:
      self.log.info(' Extra Options: ', ' '.join(self.extraPilotOptions))
    self.log.info(' ListMatch: ', self.enableListMatch)
    self.log.info(' Private %: ', self.privatePilotFraction * 100)
    if self.enableListMatch:
      self.log.info(' ListMatch Delay:', self.listMatchDelay)
    self.listMatchCache.purgeExpired()

  def reloadConfiguration(self, csSection, submitPool):
    """
     Common Configuration can be overwriten for each GridMiddleware
    """
    mySection = csSection + '/' + self.gridMiddleware
    self.configureFromSection(mySection)
    # And Again for each SubmitPool
    mySection = csSection + '/' + submitPool
    self.configureFromSection(mySection)

  def configureFromSection(self, mySection):
    """
      reload from CS
    """
    self.pilot = gConfig.getValue(mySection + '/PilotScript', self.pilot)
    self.installVersion = gConfig.getValue(mySection + '/Version', self.installVersion)
    self.extraPilotOptions = gConfig.getValue(mySection + '/ExtraPilotOptions', self.extraPilotOptions)
    self.install = gConfig.getValue(mySection + '/InstallScript', self.install)
    # Extra modules are additive, not a replacement
    self.extraModules = gConfig.getValue(mySection + '/ExtraPilotModules', []) + self.extraModules
    self.installProject = gConfig.getValue(mySection + '/Project', self.installProject)
    self.installation = gConfig.getValue(mySection + '/Installation', self.installation)
    self.maxJobsInFillMode = gConfig.getValue(mySection + '/MaxJobsInFillMode', self.maxJobsInFillMode)
    self.targetGrids = gConfig.getValue(mySection + '/TargetGrids', self.targetGrids)
    self.enableListMatch = gConfig.getValue(mySection + '/EnableListMatch', self.enableListMatch)
    self.listMatchDelay = gConfig.getValue(mySection + '/ListMatchDelay', self.listMatchDelay)
    self.errorClearTime = gConfig.getValue(mySection + '/ErrorClearTime', self.errorClearTime)
    self.errorTicketTime = gConfig.getValue(mySection + '/ErrorTicketTime', self.errorTicketTime)
    self.errorMailAddress = gConfig.getValue(mySection + '/ErrorMailAddress', self.errorMailAddress)
    self.alarmMailAddress = gConfig.getValue(mySection + '/AlarmMailAddress', self.alarmMailAddress)
    self.mailFromAddress = gConfig.getValue(mySection + '/MailFromAddress', self.mailFromAddress)
    self.privatePilotFraction = gConfig.getValue(mySection + '/PrivatePilotFraction', self.privatePilotFraction)
    virtualOrganization = gConfig.getValue(mySection + '/VirtualOrganization', '')
    if not virtualOrganization:
      virtualOrganization = getVOForGroup('NonExistingGroup')
      if not virtualOrganization:
        virtualOrganization = self.virtualOrganization
    self.virtualOrganization = virtualOrganization

  def _resolveCECandidates(self, taskQueueDict):
    """
      Return a list of CEs for this TaskQueue
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      self.log.info('CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                    ', '.join(taskQueueDict['GridCEs']))
      return taskQueueDict['GridCEs']
    # Get the mask
    ret = jobDB.getSiteMask()
    if not ret['OK']:
      self.log.error('Can not retrieve site Mask from DB:', ret['Message'])
      return []
    siteMask = ret['Value']
    if not siteMask:
      self.log.error('Site mask is empty')
      return []
    self.log.verbose('Site Mask: %s' % ', '.join(siteMask))
    # remove banned sites from siteMask
    if 'BannedSites' in taskQueueDict:
      for site in taskQueueDict['BannedSites']:
        if site in siteMask:
          siteMask.remove(site)
          self.log.verbose('Removing banned site %s from site Mask' % site)
    # remove from the mask if a Site is given
    siteMask = [site for site in siteMask
                if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites']]
    if not siteMask:
      # pilot can not be submitted
      self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'])
      return []
    self.log.info('Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                  ', '.join(siteMask))
    # Get CE's associates to the given site Names
    ceMask = []
    for grid in self.targetGrids:
      section = '/Resources/Sites/%s' % grid
      ret = gConfig.getSections(section)
      if not ret['OK']:
        # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
        section = '/Resources/Sites/LCG'
        ret = gConfig.getSections(section)
      if not ret['OK']:
        self.log.error('Could not obtain CEs from CS', ret['Message'])
        continue
      gridSites = ret['Value']
      for siteName in gridSites:
        if siteName in siteMask:
          ret = gConfig.getValue('%s/%s/CE' % (section, siteName), [])
          for ce in ret:
            # Only keep CEs whose submission mode matches this middleware
            submissionMode = gConfig.getValue('%s/%s/CEs/%s/SubmissionMode' % (section, siteName, ce), 'gLite')
            if submissionMode == self.gridMiddleware and ce not in ceMask:
              ceMask.append(ce)
    if not ceMask:
      self.log.info('No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                    ', '.join(siteMask))
    self.log.verbose('CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join(ceMask))
    return ceMask

  def _getPilotOptions(self, taskQueueDict, pilotsToSubmit):
    """Build the pilot command-line options for a TaskQueue.

    Returns S_OK((pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
    submitPrivatePilot, privateTQ)) or S_ERROR on credential/token failure.
    """
    # Need to limit the maximum number of pilots to submit at once
    # For generic pilots this is limited by the number of use of the tokens and the
    # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
    pilotsToSubmit = max(min(pilotsToSubmit, int(50 / self.maxJobsInFillMode)), 1)
    pilotOptions = []
    # Randomly decide whether a generic TQ gets private pilots this round
    privateIfGenericTQ = self.privatePilotFraction > random.random()
    privateTQ = ('PilotTypes' in taskQueueDict and
                 'private' in [t.lower() for t in taskQueueDict['PilotTypes']])
    forceGeneric = 'ForceGeneric' in taskQueueDict
    submitPrivatePilot = (privateIfGenericTQ or privateTQ) and not forceGeneric
    if submitPrivatePilot:
      self.log.verbose('Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'])
      ownerDN = taskQueueDict['OwnerDN']
      ownerGroup = taskQueueDict['OwnerGroup']
      # User Group requirement
      pilotOptions.append('-G %s' % taskQueueDict['OwnerGroup'])
      # check if group allows jobsharing
      ownerGroupProperties = getPropertiesForGroup(ownerGroup)
      if not 'JobSharing' in ownerGroupProperties:
        # Add Owner requirement to pilot
        pilotOptions.append("-O '%s'" % ownerDN)
      if privateTQ:
        pilotOptions.append('-o /Resources/Computing/CEDefaults/PilotType=private')
      maxJobsInFillMode = self.maxJobsInFillMode
    else:
      #For generic jobs we'll submit mixture of generic and private pilots
      self.log.verbose('Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'])
      #ADRI: Find the generic group
      result = findGenericPilotCredentials(group=taskQueueDict['OwnerGroup'])
      if not result['OK']:
        self.log.error(ERROR_GENERIC_CREDENTIALS, result['Message'])
        return S_ERROR(ERROR_GENERIC_CREDENTIALS)
      ownerDN, ownerGroup = result['Value']
      # One token use is consumed per pilot job slot
      result = gProxyManager.requestToken(ownerDN, ownerGroup,
                                          max(pilotsToSubmit, self.maxJobsInFillMode))
      if not result['OK']:
        self.log.error(ERROR_TOKEN, result['Message'])
        return S_ERROR(ERROR_TOKEN)
      (token, numberOfUses) = result['Value']
      pilotsToSubmit = min(numberOfUses, pilotsToSubmit)
      pilotOptions.append('-o /Security/ProxyToken=%s' % token)
      # Reduce pilot count since each pilot runs maxJobsInFillMode jobs
      pilotsToSubmit = max(1, (pilotsToSubmit - 1) / self.maxJobsInFillMode + 1)
      maxJobsInFillMode = int(numberOfUses / pilotsToSubmit)
    # Use Filling mode
    pilotOptions.append('-M %s' % maxJobsInFillMode)
    # Debug
    pilotOptions.append('-d')
    # Setup.
    pilotOptions.append('-S %s' % taskQueueDict['Setup'])
    # CS Servers
    csServers = gConfig.getServersList()
    if len(csServers) > 3:
      # Remove the master
      master = gConfigurationData.getMasterServer()
      if master in csServers:
        csServers.remove(master)
    pilotOptions.append('-C %s' % ",".join(csServers))
    # DIRAC Extensions to be used in pilots
    # ubeda: I'm not entirely sure if we can use here the same opsHelper as in line
    # line +352
    pilotExtensionsList = Operations().getValue("Pilot/Extensions", [])
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = getCSExtensions()
    if extensionsList:
      pilotOptions.append('-e %s' % ",".join(extensionsList))
    #Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure)
    opsHelper = Operations(group=taskQueueDict['OwnerGroup'], setup=taskQueueDict['Setup'])
    # Requested version of DIRAC (it can be a list, so we take the fist one)
    version = opsHelper.getValue(cfgPath('Pilot', 'Version'), [self.installVersion])[0]
    pilotOptions.append('-r %s' % version)
    # Requested Project to install
    installProject = opsHelper.getValue(cfgPath('Pilot', 'Project'), self.installProject)
    if installProject:
      pilotOptions.append('-l %s' % installProject)
    installation = opsHelper.getValue(cfgPath('Pilot', 'Installation'), self.installation)
    if installation:
      pilotOptions.append("-V %s" % installation)
    # Requested CPU time
    pilotOptions.append('-T %s' % taskQueueDict['CPUTime'])
    if self.submitPoolOption not in self.extraPilotOptions:
      pilotOptions.append(self.submitPoolOption)
    if self.extraPilotOptions:
      pilotOptions.extend(self.extraPilotOptions)
    return S_OK((pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ))

  def _submitPilots(self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                    ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob):
    """
      This method must be implemented on the Backend specific derived class.
      This is problem with the Director, not with the Job so we must return S_OK
      Return S_ERROR if not defined.
    """
    self.log.error('_submitPilots method not implemented')
    return S_OK()

  def _submitPilot(self, proxy, pilotsToSubmit, jdl, taskQueueID, rb):
    """
      Submit pilot and get back the reference
    """
    self.log.error('_submitPilot method not implemented')
    return S_OK()

  def _listMatch(self, proxy, jdl, taskQueueID, rb):
    """
     This method must be implemented on the Backend specific derived class.
    """
    self.log.error('_listMatch method not implemented')
    return S_OK()

  def _getChildrenReferences(self, proxy, parentReference, taskQueueID):
    """
     This method must be implemented on the Backend specific derived class.
    """
    self.log.error('_getChildrenReferences method not implemented')
    return S_OK()

  def submitPilots(self, taskQueueDict, pilotsToSubmit, workDir=None):
    """
      Submit pilot for the given TaskQueue,
      this method just insert the request in the corresponding ThreadPool,
      the submission is done from the Thread Pool job
    """
    try:
      taskQueueID = taskQueueDict['TaskQueueID']
      self.log.verbose('Submitting Pilot')
      ceMask = self._resolveCECandidates(taskQueueDict)
      if not ceMask:
        return S_ERROR('No CE available for TaskQueue %d' % int(taskQueueID))
      result = self._getPilotOptions(taskQueueDict, pilotsToSubmit)
      if not result['OK']:
        return result
      (pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ) = result['Value']
      # get a valid proxy, submit with a long proxy to avoid renewal
      ret = self._getPilotProxyFromDIRACGroup(ownerDN, ownerGroup, requiredTimeLeft=86400 * 5)
      if not ret['OK']:
        self.log.error(ret['Message'])
        self.log.error('No proxy Available', 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
        return S_ERROR(ERROR_PROXY)
      proxy = ret['Value']
      # Now call a Grid Specific method to handle the final submission of the pilots
      return self._submitPilots(workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                                ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob)
    except Exception:
      self.log.exception('Error in Pilot Submission')
    # Any failure here is a Director problem, not a Job problem
    return S_OK(0)

  def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup, requiredTimeLeft):
    """
     To be overwritten if a given Pilot does not require a full proxy
    """
    self.log.info("Downloading %s@%s proxy" % (ownerDN, ownerGroup))
    return gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup, requiredTimeLeft)

  def exceptionCallBack(self, threadedJob, exceptionInfo):
    # Callback for the ThreadPool when a submission job raises
    self.log.exception('Error in Pilot Submission')
class DIRACPilotDirector(PilotDirector):
  """
    DIRAC PilotDirector class

    Submits pilots directly to locally configured DIRAC ComputingElements
    (no grid broker involved).
  """
  def __init__( self, submitPool ):
    """
     Define some defaults and call parent __init__
    """
    self.gridMiddleware = 'DIRAC'
    PilotDirector.__init__( self, submitPool )
    self.computingElementList = COMPUTING_ELEMENTS
    self.computingElementDict = {}
    self.addComputingElement( self.computingElementList )
    self.siteName = gConfig.getValue('/LocalSite/Site','')
    if not self.siteName:
      # A local Site Name is mandatory for this Director
      self.log.error( 'Can not run a Director if Site Name is not defined' )
      sys.exit()
    # Caches of failing CEs and of already-ticketed CEs
    self.__failingCECache = DictCache()
    self.__ticketsCECache = DictCache()

  def configure(self, csSection, submitPool ):
    """
     Here goes common configuration for DIRAC PilotDirector
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )
    self.__failingCECache.purgeExpired()
    self.__ticketsCECache.purgeExpired()
    # Exclude CEs currently marked as failing
    for ce in self.__failingCECache.getKeys():
      if ce in self.computingElementDict.keys():
        try:
          del self.computingElementDict[ce]
        except:
          pass
    if self.computingElementDict:
      self.log.info( ' ComputingElements:', ', '.join(self.computingElementDict.keys()) )
    else:
      return
    # FIXME: this is to start testing
    _ceName, computingElementDict = self.computingElementDict.items()[0]
    self.computingElement = computingElementDict['CE']
    self.log.debug( self.computingElement.getCEStatus() )
    self.log.info( ' SiteName:', self.siteName )

  def configureFromSection( self, mySection ):
    """
      reload from CS
    """
    PilotDirector.configureFromSection( self, mySection )
    self.computingElementList = gConfig.getValue( mySection+'/ComputingElements' , self.computingElementList )
    self.addComputingElement( self.computingElementList )
    self.siteName = gConfig.getValue( mySection+'/SiteName' , self.siteName )

  def addComputingElement(self, ceList):
    """
      Check if a CE object for the current CE is available,
      instantiate one if necessary
    """
    for CE in ceList:
      if CE not in self.computingElementDict:
        ceFactory = ComputingElementFactory( )
        ceInstance = ceFactory.getCE( ceName = CE )
        if not ceInstance['OK']:
          self.log.error('Can not create CE object:', ceInstance['Message'])
          return
        self.computingElementDict[CE] = ceInstance['Value'].ceConfigDict
        # add the 'CE' instance at the end to avoid being overwritten
        self.computingElementDict[CE]['CE'] = ceInstance['Value']

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
     This method does the actual pilot submission to the DIRAC CE
     The logic is as follows:
     - If there are no available CE it return error
     - If there is no queue available in the CE's, it returns error
     - It creates a temp directory
     - It prepare a PilotScript
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # ownerDN = taskQueueDict['OwnerDN']
    submittedPilots = 0
    # if self.computingElement not in self.computingElementDict:
    #   # Since we can exclude CEs from the list, it may become empty
    #   return S_ERROR( ERROR_CE )
    pilotRequirements = []
    pilotRequirements.append( ( 'CPUTime', taskQueueDict['CPUTime'] ) )
    # do we need to care about anything else?
    pilotRequirementsString = str( pilotRequirements )
    # Check that there are available queues for the Jobs:
    if self.enableListMatch:
      availableQueues = []
      # now = Time.dateTime()
      cachedAvailableQueues = self.listMatchCache.get( pilotRequirementsString )
      if cachedAvailableQueues is None:
        availableQueues = self._listQueues( pilotRequirements )
        if availableQueues != False:
          # Cache the (possibly empty) answer for listMatchDelay seconds
          self.listMatchCache.add( pilotRequirementsString, self.listMatchDelay, availableQueues )
          self.log.verbose( 'Available Queues for TaskQueue ', "%s: %s" % ( taskQueueID, str(availableQueues) ) )
      else:
        availableQueues = cachedAvailableQueues
      if not availableQueues:
        return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )
    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp( prefix= 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )
    os.chdir( workingDirectory )
    # set the Site Name
    pilotOptions.append( "-n '%s'" % self.siteName)
    # submit pilots for every CE available
    for CE in self.computingElementDict.keys():
      ceName = CE
      computingElement = self.computingElementDict[CE]['CE']
      # add possible requirements from Site and CE
      for req, val in getResourceDict( ceName ).items():
        pilotOptions.append( "-o '/AgentJobRequirements/%s=%s'" % ( req, val ) )
      ceConfigDict = self.computingElementDict[CE]
      if 'ClientPlatform' in ceConfigDict:
        pilotOptions.append( "-p '%s'" % ceConfigDict['ClientPlatform'])
      if 'SharedArea' in ceConfigDict:
        pilotOptions.append( "-o '/LocalSite/SharedArea=%s'" % ceConfigDict['SharedArea'] )
      # if 'CPUScalingFactor' in ceConfigDict:
      #   pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % ceConfigDict['CPUScalingFactor'] )
      #
      # if 'CPUNormalizationFactor' in ceConfigDict:
      #   pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % ceConfigDict['CPUNormalizationFactor'] )
      self.log.info( "pilotOptions: ", ' '.join(pilotOptions))
      httpProxy = ''
      if 'HttpProxy' in ceConfigDict:
        httpProxy = ceConfigDict['HttpProxy']
      if 'JobExecDir' in ceConfigDict:
        pilotExecDir = ceConfigDict['JobExecDir']
      try:
        pilotScript = self._writePilotScript( workingDirectory, pilotOptions, proxy, httpProxy, pilotExecDir )
      except:
        self.log.exception( ERROR_SCRIPT )
        # best effort cleanup of the temp directory before bailing out
        try:
          os.chdir( baseDir )
          shutil.rmtree( workingDirectory )
        except:
          pass
        return S_ERROR( ERROR_SCRIPT )
      self.log.info("Pilots to submit: ", pilotsToSubmit)
      while submittedPilots < pilotsToSubmit:
        # Find out how many pilots can be submitted
        ret = computingElement.available( )
        if not ret['OK']:
          self.log.error('Can not determine if pilot should be submitted: ', ret['Message'])
          break
        maxPilotsToSubmit = ret['Value']
        self.log.info("Submit Pilots: ", maxPilotsToSubmit)
        if not maxPilotsToSubmit:
          break
        # submit the pilots and then check again
        for _i in range( min( maxPilotsToSubmit, pilotsToSubmit - submittedPilots ) ):
          submission = computingElement.submitJob(pilotScript, '', '')
          if not submission['OK']:
            self.log.error('Pilot submission failed: ', submission['Message'])
            # cleanup
            try:
              os.chdir( baseDir )
              shutil.rmtree( workingDirectory )
            except:
              pass
            return S_ERROR('Pilot submission failed after ' + str(submittedPilots) + ' pilots submitted successful')
          submittedPilots += 1
          # let the batch system some time to digest the submitted job
          time.sleep(1)
    #next CE
    try:
      os.chdir( baseDir )
      shutil.rmtree( workingDirectory )
    except:
      pass
    return S_OK(submittedPilots)

  def _listQueues( self, pilotRequirements ):
    """
     For each defined CE return the list of Queues with available, running and waiting slots,
     matching the requirements of the pilots.
     Currently only CPU time is considered
    """
    result = self.computingElement.available( pilotRequirements )
    if not result['OK']:
      self.log.error( 'Can not determine available queues', result['Message'] )
      return False
    return result['Value']

  def _writePilotScript( self, workingDirectory, pilotOptions, proxy, httpProxy, pilotExecDir ):
    """
     Prepare the script to execute the pilot
     For the moment it will do like Grid Pilots, a full DIRAC installation

     It assumes that the pilot script will have access to the submit working directory
    """
    try:
      # Proxy, pilot and install scripts are embedded in the wrapper
      # as bz2-compressed, base64-encoded blobs
      compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) ).replace('\n','')
      compressedAndEncodedPilot = base64.encodestring( bz2.compress( open( self.pilot, "rb" ).read(), 9 ) ).replace('\n','')
      compressedAndEncodedInstall = base64.encodestring( bz2.compress( open( self.install, "rb" ).read(), 9 ) ).replace('\n','')
    except:
      self.log.exception('Exception during file compression of proxy, dirac-pilot or dirac-install')
      return S_ERROR('Exception during file compression of proxy, dirac-pilot or dirac-install')
    # NOTE(review): the template below is a runtime string executed on the
    # worker node; its internal line structure was reconstructed from the
    # collapsed source — confirm against the original file.
    localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, stat, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy", stat.S_IRUSR | stat.S_IWUSR)
  os.chmod("%(pilotScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  os.chmod("%(installScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )
shutil.rmtree( pilotWorkingDirectory )
EOF
""" % { 'compressedAndEncodedProxy': compressedAndEncodedProxy,
        'compressedAndEncodedPilot': compressedAndEncodedPilot,
        'compressedAndEncodedInstall': compressedAndEncodedInstall,
        'httpProxy': httpProxy,
        'pilotScript': os.path.basename(self.pilot),
        'installScript': os.path.basename(self.install),
        'pilotOptions': ' '.join( pilotOptions ),
        'pilotExecDir': pilotExecDir }
    fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir=workingDirectory)
    pilotWrapper = os.fdopen(fd, 'w')
    pilotWrapper.write( localPilot )
    pilotWrapper.close()
    return name

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
     Download a limited pilot proxy with VOMS extensions depending on the group
    """
    #Assign VOMS attribute
    vomsAttr = CS.getVOMSAttributeForGroup( ownerGroup )
    if not vomsAttr:
      self.log.info( "Downloading a proxy without VOMS extensions for %s@%s" % ( ownerDN, ownerGroup ) )
      return gProxyManager.downloadProxy( ownerDN, ownerGroup, limited = True,
                                          requiredTimeLeft = requiredTimeLeft )
    else:
      self.log.info( "Downloading a proxy with '%s' VOMS extension for %s@%s" % ( vomsAttr, ownerDN, ownerGroup ) )
      return gProxyManager.downloadVOMSProxy( ownerDN, ownerGroup, limited = True,
                                              requiredTimeLeft = requiredTimeLeft,
                                              requiredVOMSAttribute = vomsAttr )
class DataCache:
  """ Cache of accounting report data and rendered plot files.

      Two DictCache instances back the cache: raw report data expires after
      ``__dataLifeTime`` seconds (10 min), rendered plots after
      ``__graphLifeTime`` seconds (1 h).  A background thread purges expired
      entries every 10 minutes.
  """

  def __init__( self ):
    # Plots live below the instance path, falling back to rootPath
    self.graphsLocation = os.path.join( gConfig.getValue( '/LocalSite/InstancePath', rootPath ), 'data', 'accountingPlots' )
    self.cachedGraphs = {}
    self.alive = True
    self.purgeThread = threading.Thread( target = self.purgeExpired )
    self.purgeThread.setDaemon( 1 )
    self.purgeThread.start()
    self.__dataCache = DictCache()
    self.__graphCache = DictCache( deleteFunction = self._deleteGraph )
    self.__dataLifeTime = 600
    self.__graphLifeTime = 3600

  def setGraphsLocation( self, graphsDir ):
    """ Set the plots directory and delete any leftover .png files in it """
    self.graphsLocation = graphsDir
    for graphName in os.listdir( self.graphsLocation ):
      if graphName.find( ".png" ) > 0:
        graphLocation = "%s/%s" % ( self.graphsLocation, graphName )
        gLogger.verbose( "Purging %s" % graphLocation )
        os.unlink( graphLocation )

  def purgeExpired( self ):
    """ Background loop purging expired cache entries every 10 minutes """
    while self.alive:
      time.sleep( 600 )
      self.__graphCache.purgeExpired()
      self.__dataCache.purgeExpired()

  def getReportData( self, reportRequest, reportHash, dataFunc ):
    """ Get report data from cache if exists, else generate it """
    reportData = self.__dataCache.get( reportHash )
    # DictCache.get returns None on a miss, so the old `== False` test never
    # matched and stale/None data was returned; test for falsy instead
    if not reportData:
      retVal = dataFunc( reportRequest )
      if not retVal[ 'OK' ]:
        return retVal
      reportData = retVal[ 'Value' ]
      self.__dataCache.add( reportHash, self.__dataLifeTime, reportData )
    return S_OK( reportData )

  def getReportPlot( self, reportRequest, reportHash, reportData, plotFunc ):
    """ Get report plot from cache if exists, else generate it """
    plotDict = self.__graphCache.get( reportHash )
    # Same miss-detection fix as getReportData: None is the miss sentinel
    if not plotDict:
      basePlotFileName = "%s/%s" % ( self.graphsLocation, reportHash )
      retVal = plotFunc( reportRequest, reportData, basePlotFileName )
      if not retVal[ 'OK' ]:
        return retVal
      plotDict = retVal[ 'Value' ]
      if plotDict[ 'plot' ]:
        plotDict[ 'plot' ] = "%s.png" % reportHash
      if plotDict[ 'thumbnail' ]:
        plotDict[ 'thumbnail' ] = "%s.thb.png" % reportHash
      self.__graphCache.add( reportHash, self.__graphLifeTime, plotDict )
    return S_OK( plotDict )

  def getPlotData( self, plotFileName ):
    """ Return the raw bytes of a cached plot file """
    filename = "%s/%s" % ( self.graphsLocation, plotFileName )
    try:
      # open() instead of the removed-in-Python-3 file() builtin
      fd = open( filename, "rb" )
      data = fd.read()
      fd.close()
    except Exception as e:
      return S_ERROR( "Can't open file %s: %s" % ( plotFileName, str( e ) ) )
    return S_OK( data )

  def _deleteGraph( self, plotDict ):
    """ DictCache delete callback: best-effort removal of an expired entry's
        plot files.  __init__ registered this callback but the class did not
        define it, so cache expiry raised AttributeError; implementation
        mirrors the sibling DataCache class in this file.
    """
    try:
      for key in plotDict:
        value = plotDict[ key ]
        if value:
          fPath = os.path.join( self.graphsLocation, str( value ) )
          if os.path.isfile( fPath ):
            gLogger.info( "Deleting plot from cache", value )
            os.unlink( fPath )
          else:
            gLogger.info( "Plot has already been deleted", value )
    except Exception:
      pass
class DIRACPilotDirector(PilotDirector):
    """ DIRAC PilotDirector class.

        Submits pilot jobs directly to DIRAC ComputingElements (no grid RB/WMS
        in between).  Keeps a dict of instantiated CE objects and per-CE config,
        and writes a self-contained wrapper script per submission.
    """

    def __init__(self, submitPool):
        """ Define some defaults and call parent __init__ """
        self.gridMiddleware = 'DIRAC'
        PilotDirector.__init__(self, submitPool)
        self.computingElementList = COMPUTING_ELEMENTS
        self.computingElementDict = {}
        self.addComputingElement(self.computingElementList)
        self.siteName = gConfig.getValue('/LocalSite/Site', '')
        if not self.siteName:
            # A Director cannot run without a site name; abort the process
            self.log.error(
                'Can not run a Director if Site Name is not defined')
            sys.exit()
        # Caches of misbehaving CEs and of already-opened trouble tickets
        self.__failingCECache = DictCache()
        self.__ticketsCECache = DictCache()

    def configure(self, csSection, submitPool):
        """ Here goes common configuration for DIRAC PilotDirector """
        PilotDirector.configure(self, csSection, submitPool)
        self.reloadConfiguration(csSection, submitPool)
        self.__failingCECache.purgeExpired()
        self.__ticketsCECache.purgeExpired()
        # Drop CEs that are currently marked as failing
        for ce in self.__failingCECache.getKeys():
            if ce in self.computingElementDict.keys():
                try:
                    del self.computingElementDict[ce]
                except:
                    pass
        if self.computingElementDict:
            self.log.info(' ComputingElements:',
                          ', '.join(self.computingElementDict.keys()))
        else:
            return
        # FIXME: this is to start testing — only the first CE is kept as
        # self.computingElement here
        _ceName, computingElementDict = self.computingElementDict.items()[0]
        self.computingElement = computingElementDict['CE']
        self.log.debug(self.computingElement.getCEStatus())
        self.log.info(' SiteName:', self.siteName)

    def configureFromSection(self, mySection):
        """ reload from CS """
        PilotDirector.configureFromSection(self, mySection)
        self.computingElementList = gConfig.getValue(
            mySection + '/ComputingElements', self.computingElementList)
        self.addComputingElement(self.computingElementList)
        self.siteName = gConfig.getValue(mySection + '/SiteName', self.siteName)

    def addComputingElement(self, ceList):
        """ Check if a CE object for the current CE is available,
            instantiate one if necessary
        """
        for CE in ceList:
            if CE not in self.computingElementDict:
                ceFactory = ComputingElementFactory()
                ceInstance = ceFactory.getCE(ceName=CE)
                if not ceInstance['OK']:
                    self.log.error('Can not create CE object:',
                                   ceInstance['Message'])
                    return
                self.computingElementDict[CE] = ceInstance[
                    'Value'].ceConfigDict
                # add the 'CE' instance at the end to avoid being overwritten
                self.computingElementDict[CE]['CE'] = ceInstance['Value']

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """ This method does the actual pilot submission to the DIRAC CE
            The logic is as follows:
            - If there are no available CE it return error
            - If there is no queue available in the CE's, it returns error
            - It creates a temp directory
            - It prepare a PilotScript
        """
        taskQueueID = taskQueueDict['TaskQueueID']
        # ownerDN = taskQueueDict['OwnerDN']
        submittedPilots = 0
        # if self.computingElement not in self.computingElementDict:
        #   # Since we can exclude CEs from the list, it may become empty
        #   return S_ERROR( ERROR_CE )
        pilotRequirements = []
        pilotRequirements.append(('CPUTime', taskQueueDict['CPUTime']))
        # do we need to care about anything else?
        pilotRequirementsString = str(pilotRequirements)
        # Check that there are available queues for the Jobs:
        if self.enableListMatch:
            availableQueues = []
            # now = Time.dateTime()
            cachedAvailableQueues = self.listMatchCache.get(
                pilotRequirementsString)
            if cachedAvailableQueues is None:
                # NOTE(review): _listQueues returns False on failure, a list
                # otherwise — hence the explicit != False comparison
                availableQueues = self._listQueues(pilotRequirements)
                if availableQueues != False:
                    self.listMatchCache.add(pilotRequirementsString,
                                            self.listMatchDelay,
                                            availableQueues)
                    self.log.verbose(
                        'Available Queues for TaskQueue ',
                        "%s: %s" % (taskQueueID, str(availableQueues)))
            else:
                availableQueues = cachedAvailableQueues
            if not availableQueues:
                return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)
        baseDir = os.getcwd()
        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)
        # NOTE: process-wide chdir; restored via os.chdir(baseDir) on all exits
        os.chdir(workingDirectory)
        # set the Site Name
        pilotOptions.append("-n '%s'" % self.siteName)
        # submit pilots for every CE available
        for CE in self.computingElementDict.keys():
            ceName = CE
            computingElement = self.computingElementDict[CE]['CE']
            # add possible requirements from Site and CE
            for req, val in getResourceDict(ceName).items():
                pilotOptions.append("-o '/AgentJobRequirements/%s=%s'" %
                                    (req, val))
            ceConfigDict = self.computingElementDict[CE]
            httpProxy = ''
            if 'HttpProxy' in ceConfigDict:
                httpProxy = ceConfigDict['HttpProxy']
            # NOTE(review): if 'JobExecDir' is absent, pilotExecDir is left
            # unset here and the _writePilotScript call below would raise
            # NameError on the first CE — presumably the key is always
            # present in the CE config; verify
            if 'JobExecDir' in ceConfigDict:
                pilotExecDir = ceConfigDict['JobExecDir']
            try:
                pilotScript = self._writePilotScript(workingDirectory,
                                                     pilotOptions, proxy,
                                                     httpProxy, pilotExecDir)
            except:
                self.log.exception(ERROR_SCRIPT)
                try:
                    os.chdir(baseDir)
                    shutil.rmtree(workingDirectory)
                except:
                    pass
                return S_ERROR(ERROR_SCRIPT)
            self.log.info("Pilots to submit: ", pilotsToSubmit)
            while submittedPilots < pilotsToSubmit:
                # Find out how many pilots can be submitted
                ret = computingElement.available()
                if not ret['OK']:
                    self.log.error(
                        'Can not determine if pilot should be submitted: ',
                        ret['Message'])
                    break
                maxPilotsToSubmit = ret['Value']
                self.log.info("Submit Pilots: ", maxPilotsToSubmit)
                if not maxPilotsToSubmit:
                    break
                # submit the pilots and then check again
                for _i in range(
                        min(maxPilotsToSubmit,
                            pilotsToSubmit - submittedPilots)):
                    submission = computingElement.submitJob(pilotScript, '', '')
                    if not submission['OK']:
                        self.log.error('Pilot submission failed: ',
                                       submission['Message'])
                        # cleanup
                        try:
                            os.chdir(baseDir)
                            shutil.rmtree(workingDirectory)
                        except:
                            pass
                        return S_ERROR('Pilot submission failed after ' +
                                       str(submittedPilots) +
                                       ' pilots submitted successful')
                    submittedPilots += 1
                    # let the batch system some time to digest the submitted job
                    time.sleep(1)
        # next CE
        try:
            os.chdir(baseDir)
            shutil.rmtree(workingDirectory)
        except:
            pass
        return S_OK(submittedPilots)

    def _listQueues(self, pilotRequirements):
        """ For each defined CE return the list of Queues with available,
            running and waiting slots, matching the requirements of the
            pilots. Currently only CPU time is considered.
            Returns False on failure (not an S_ERROR dict).
        """
        result = self.computingElement.available(pilotRequirements)
        if not result['OK']:
            self.log.error('Can not determine available queues',
                           result['Message'])
            return False
        return result['Value']

    def _writePilotScript(self, workingDirectory, pilotOptions, proxy,
                          httpProxy, pilotExecDir):
        """ Prepare the script to execute the pilot
            For the moment it will do like Grid Pilots, a full DIRAC
            installation. It assumes that the pilot script will have access
            to the submit working directory.
        """
        try:
            # Proxy, pilot and install scripts are embedded into the wrapper
            # as bz2-compressed, base64-encoded blobs (newlines stripped so
            # they fit on one template line each)
            compressedAndEncodedProxy = base64.encodestring(
                bz2.compress(proxy.dumpAllToString()['Value'])).replace(
                    '\n', '')
            compressedAndEncodedPilot = base64.encodestring(
                bz2.compress(open(self.pilot, "rb").read(), 9)).replace('\n', '')
            compressedAndEncodedInstall = base64.encodestring(
                bz2.compress(open(self.install, "rb").read(), 9)).replace('\n', '')
        except:
            self.log.exception(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )
            return S_ERROR(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )

        # Self-contained wrapper: a bash script feeding a Python 2 heredoc
        # that unpacks the blobs, sets up the environment and runs the pilot
        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, stat, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  os.chdir( pilotWorkingDirectory )
  open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedProxy)s" ) ) )
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedPilot)s" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( "%(compressedAndEncodedInstall)s" ) ) )
  os.chmod("proxy", stat.S_IRUSR | stat.S_IWUSR)
  os.chmod("%(pilotScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  os.chmod("%(installScript)s", stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % {
            'compressedAndEncodedProxy': compressedAndEncodedProxy,
            'compressedAndEncodedPilot': compressedAndEncodedPilot,
            'compressedAndEncodedInstall': compressedAndEncodedInstall,
            'httpProxy': httpProxy,
            'pilotScript': os.path.basename(self.pilot),
            'installScript': os.path.basename(self.install),
            'pilotOptions': ' '.join(pilotOptions),
            'pilotExecDir': pilotExecDir
        }

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py',
                                    prefix='DIRAC_',
                                    dir=workingDirectory)
        pilotWrapper = os.fdopen(fd, 'w')
        pilotWrapper.write(localPilot)
        pilotWrapper.close()
        return name

    def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup,
                                     requiredTimeLeft):
        """ Download a limited pilot proxy with VOMS extensions depending
            on the group
        """
        # Assign VOMS attribute
        vomsAttr = CS.getVOMSAttributeForGroup(ownerGroup)
        if not vomsAttr:
            self.log.info(
                "Downloading a proxy without VOMS extensions for %s@%s" %
                (ownerDN, ownerGroup))
            return gProxyManager.downloadProxy(
                ownerDN, ownerGroup, limited=True,
                requiredTimeLeft=requiredTimeLeft)
        else:
            self.log.info(
                "Downloading a proxy with '%s' VOMS extension for %s@%s" %
                (vomsAttr, ownerDN, ownerGroup))
            return gProxyManager.downloadVOMSProxy(
                ownerDN,
                ownerGroup,
                limited=True,
                requiredTimeLeft=requiredTimeLeft,
                requiredVOMSAttribute=vomsAttr)
class DataCache:
    """Caches accounting report data and the plot files rendered from it.

    Two DictCache instances back the cache: raw report data expires after
    10 minutes, rendered plot entries after one hour.  A background thread
    wakes up every 10 minutes and purges whatever has expired; purged plot
    entries get their files removed from disk via the _deleteGraph callback.
    """

    def __init__(self, dirName="accountingPlots"):
        self.graphsLocation = os.path.join(rootPath, "data", dirName)
        self.cachedGraphs = {}
        self.alive = True
        self.purgeThread = threading.Thread(target=self.purgeExpired)
        self.purgeThread.setDaemon(1)
        self.purgeThread.start()
        self.__dataCache = DictCache()
        self.__graphCache = DictCache(deleteFunction=self._deleteGraph)
        self.__dataLifeTime = 600
        self.__graphLifeTime = 3600

    def setGraphsLocation(self, graphsDir):
        """Point the cache at *graphsDir* and delete stale .png files there."""
        self.graphsLocation = graphsDir
        for entry in os.listdir(self.graphsLocation):
            if entry.find(".png") <= 0:
                continue
            stalePlot = "%s/%s" % (self.graphsLocation, entry)
            gLogger.verbose("Purging %s" % stalePlot)
            os.unlink(stalePlot)

    def purgeExpired(self):
        """Background loop: purge expired entries every 10 minutes while alive."""
        while self.alive:
            time.sleep(600)
            self.__graphCache.purgeExpired()
            self.__dataCache.purgeExpired()

    def getReportData(self, reportRequest, reportHash, dataFunc):
        """Return cached report data for *reportHash*, generating it on a miss."""
        cached = self.__dataCache.get(reportHash)
        if cached:
            return S_OK(cached)
        produced = dataFunc(reportRequest)
        if not produced["OK"]:
            return produced
        freshData = produced["Value"]
        self.__dataCache.add(reportHash, self.__dataLifeTime, freshData)
        return S_OK(freshData)

    def getReportPlot(self, reportRequest, reportHash, reportData, plotFunc):
        """Return cached plot info for *reportHash*, rendering it on a miss."""
        plotDict = self.__graphCache.get(reportHash)
        if plotDict:
            return S_OK(plotDict)
        baseName = "%s/%s" % (self.graphsLocation, reportHash)
        rendered = plotFunc(reportRequest, reportData, baseName)
        if not rendered["OK"]:
            return rendered
        plotDict = rendered["Value"]
        # Replace absolute paths with the basenames clients will request
        if plotDict["plot"]:
            plotDict["plot"] = "%s.png" % reportHash
        if plotDict["thumbnail"]:
            plotDict["thumbnail"] = "%s.thb.png" % reportHash
        self.__graphCache.add(reportHash, self.__graphLifeTime, plotDict)
        return S_OK(plotDict)

    def getPlotData(self, plotFileName):
        """Read and return the raw bytes of plot file *plotFileName*."""
        fullPath = "%s/%s" % (self.graphsLocation, plotFileName)
        try:
            plotFile = open(fullPath, "rb")
            contents = plotFile.read()
            plotFile.close()
        except Exception as e:
            return S_ERROR("Can't open file %s: %s" % (plotFileName, str(e)))
        return S_OK(contents)

    def _deleteGraph(self, plotDict):
        """DictCache delete callback: best-effort removal of the entry's files."""
        try:
            for value in plotDict.values():
                if not value:
                    continue
                fPath = os.path.join(self.graphsLocation, str(value))
                if os.path.isfile(fPath):
                    gLogger.info("Deleting plot from cache", value)
                    os.unlink(fPath)
                else:
                    gLogger.info("Plot has already been deleted", value)
        except Exception:
            pass
class GridPilotDirector(PilotDirector):
    """ Base Grid PilotDirector class
        Derived classes must declare:
          self.Middleware: It must correspond to the string before
            "PilotDirector". (For proper naming of the logger)
          self.ResourceBrokers: list of Brokers used by the Director.
            (For proper error reporting)
    """

    def __init__(self, submitPool):
        """ Define some defaults and call parent __init__ """
        self.gridEnv = GRIDENV
        self.cpuPowerRef = CPU_POWER_REF
        self.requirements = REQUIREMENTS
        self.rank = RANK
        self.fuzzyRank = FUZZY_RANK
        # Caches of failing WMS endpoints, opened tickets and list-match results
        self.__failingWMSCache = DictCache()
        self.__ticketsWMSCache = DictCache()
        self.__listMatchWMSCache = DictCache()
        PilotDirector.__init__(self, submitPool)

    def configure(self, csSection, submitPool):
        """ Here goes common configuration for all Grid PilotDirectors """
        PilotDirector.configure(self, csSection, submitPool)
        self.reloadConfiguration(csSection, submitPool)
        self.__failingWMSCache.purgeExpired()
        self.__ticketsWMSCache.purgeExpired()
        # Exclude brokers currently marked as failing
        for rb in self.__failingWMSCache.getKeys():
            if rb in self.resourceBrokers:
                try:
                    self.resourceBrokers.remove(rb)
                except:
                    pass
        # Randomize to spread load across the remaining brokers
        self.resourceBrokers = List.randomize(self.resourceBrokers)
        if self.gridEnv:
            self.log.info(' GridEnv: ', self.gridEnv)
        if self.resourceBrokers:
            self.log.info(' ResourceBrokers:', ', '.join(self.resourceBrokers))

    def configureFromSection(self, mySection):
        """ reload from CS """
        PilotDirector.configureFromSection(self, mySection)
        self.gridEnv = gConfig.getValue(mySection + '/GridEnv', self.gridEnv)
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance, '')
        self.resourceBrokers = gConfig.getValue(mySection + '/ResourceBrokers',
                                                self.resourceBrokers)
        self.cpuPowerRef = gConfig.getValue(mySection + '/CPUPowerRef',
                                            self.cpuPowerRef)
        self.requirements = gConfig.getValue(mySection + '/Requirements',
                                             self.requirements)
        self.rank = gConfig.getValue(mySection + '/Rank', self.rank)
        self.fuzzyRank = gConfig.getValue(mySection + '/FuzzyRank',
                                          self.fuzzyRank)

    def _submitPilots(self, workDir, taskQueueDict, pilotOptions,
                      pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ,
                      proxy, pilotsPerJob):
        """ This method does the actual pilot submission to the Grid RB
            The logic is as follows:
            - If there are no available RB it return error
            - If there is no VOMS extension in the proxy, return error
            - It creates a temp directory
            - Prepare a JDL
              it has some part common to gLite and LCG
              (the payload description)
              it has some part specific to each middleware
        """
        taskQueueID = taskQueueDict['TaskQueueID']
        # ownerDN = taskQueueDict['OwnerDN']
        credDict = proxy.getCredentials()['Value']
        ownerDN = credDict['identity']
        ownerGroup = credDict['group']
        if not self.resourceBrokers:
            # Since we can exclude RBs from the list, it may become empty
            return S_ERROR(ERROR_RB)
        # Need to get VOMS extension for the later interactions with WMS
        ret = gProxyManager.getVOMSAttributes(proxy)
        if not ret['OK']:
            self.log.error(ERROR_VOMS, ret['Message'])
            return S_ERROR(ERROR_VOMS)
        if not ret['Value']:
            return S_ERROR(ERROR_VOMS)
        vomsGroup = ret['Value'][0]
        workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID,
                                            dir=workDir)
        self.log.verbose('Using working Directory:', workingDirectory)
        # Write JDL
        retDict = self._prepareJDL(taskQueueDict, workingDirectory,
                                   pilotOptions, pilotsPerJob, ceMask,
                                   submitPrivatePilot, privateTQ)
        jdl = retDict['JDL']
        pilotRequirements = retDict['Requirements']
        rb = retDict['RB']
        if not jdl:
            try:
                shutil.rmtree(workingDirectory)
            except:
                pass
            return S_ERROR(ERROR_JDL)
        # Check that there are available queues for the Job:
        if self.enableListMatch:
            availableCEs = []
            now = Time.dateTime()
            # NOTE(review): here the cache miss sentinel is compared with
            # == False, while _listMatch returns False on failure and a list
            # on success — presumably this listMatchCache returns False on a
            # miss; verify against the DictCache implementation in use
            availableCEs = self.listMatchCache.get(pilotRequirements)
            if availableCEs == False:
                availableCEs = self._listMatch(proxy, jdl, taskQueueID, rb)
                if availableCEs != False:
                    self.log.verbose('LastListMatch', now)
                    self.log.verbose('AvailableCEs ', availableCEs)
                    self.listMatchCache.add(
                        pilotRequirements,
                        self.listMatchDelay * 60,
                        value=availableCEs)  # it is given in minutes
            if not availableCEs:
                try:
                    shutil.rmtree(workingDirectory)
                except:
                    pass
                return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)
        # Now we are ready for the actual submission, so
        self.log.verbose('Submitting Pilots for TaskQueue', taskQueueID)
        submitRet = self._submitPilot(proxy, pilotsPerJob, jdl, taskQueueID, rb)
        try:
            shutil.rmtree(workingDirectory)
        except:
            pass
        if not submitRet:
            return S_ERROR('Pilot Submission Failed for TQ %d ' % taskQueueID)
        # pilotReference, resourceBroker = submitRet
        submittedPilots = 0
        if pilotsPerJob != 1 and len(submitRet) != pilotsPerJob:
            # Parametric jobs are used: expand each parent reference into its
            # children before registering them
            for pilotReference, resourceBroker in submitRet:
                pilotReference = self._getChildrenReferences(
                    proxy, pilotReference, taskQueueID)
                submittedPilots += len(pilotReference)
                pilotAgentsDB.addPilotTQReference(pilotReference, taskQueueID,
                                                  ownerDN, ownerGroup,
                                                  resourceBroker,
                                                  self.gridMiddleware,
                                                  pilotRequirements)
        else:
            for pilotReference, resourceBroker in submitRet:
                pilotReference = [pilotReference]
                submittedPilots += len(pilotReference)
                pilotAgentsDB.addPilotTQReference(pilotReference, taskQueueID,
                                                  ownerDN, ownerGroup,
                                                  resourceBroker,
                                                  self.gridMiddleware,
                                                  pilotRequirements)
        # add some sleep here
        time.sleep(0.1 * submittedPilots)
        if pilotsToSubmit > pilotsPerJob:
            # Additional submissions are necessary, need to get a new token
            # and iterate.
            pilotsToSubmit -= pilotsPerJob
            result = gProxyManager.requestToken(
                ownerDN, ownerGroup,
                max(pilotsToSubmit, self.maxJobsInFillMode))
            if not result['OK']:
                self.log.error(ERROR_TOKEN, result['Message'])
                result = S_ERROR(ERROR_TOKEN)
                result['Value'] = submittedPilots
                return result
            (token, numberOfUses) = result['Value']
            # Replace any previous proxy-token option with the fresh token
            for option in pilotOptions:
                if option.find('-o /Security/ProxyToken=') == 0:
                    pilotOptions.remove(option)
            pilotOptions.append('-o /Security/ProxyToken=%s' % token)
            pilotsPerJob = max(
                1, min(pilotsPerJob,
                       int(numberOfUses / self.maxJobsInFillMode)))
            # Recurse for the remaining pilots
            result = self._submitPilots(workDir, taskQueueDict, pilotOptions,
                                        pilotsToSubmit, ceMask,
                                        submitPrivatePilot, privateTQ, proxy,
                                        pilotsPerJob)
            if not result['OK']:
                if 'Value' not in result:
                    result['Value'] = 0
                result['Value'] += submittedPilots
                return result
            submittedPilots += result['Value']
        return S_OK(submittedPilots)

    def _prepareJDL(self, taskQueueDict, workingDirectory, pilotOptions,
                    pilotsToSubmit, ceMask, submitPrivatePilot, privateTQ):
        """ This method should be overridden in a subclass """
        self.log.error(
            '_prepareJDL() method should be implemented in a subclass')
        sys.exit()

    def _JobJDL(self, taskQueueDict, pilotOptions, ceMask):
        """ The Job JDL is the same for LCG and GLite """
        pilotJDL = 'Executable = "%s";\n' % os.path.basename(self.pilot)
        executable = self.pilot
        pilotJDL += 'Arguments = "%s";\n' % ' '.join(pilotOptions)
        pilotJDL += 'CPUTimeRef = %s;\n' % taskQueueDict['CPUTime']
        pilotJDL += 'CPUPowerRef = %s;\n' % self.cpuPowerRef
        # Classad snippet scanning GlueCECapability entries for a
        # CPUScalingReferenceSI00 value, falling back to GlueHostBenchmarkSI00
        pilotJDL += """CPUWorkRef = real( CPUTimeRef * CPUPowerRef );
Lookup = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;
QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef = QueuePowerRef * QueueTimeRef;
"""
        requirements = list(self.requirements)
        if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
            # if there an explicit Grig CE requested by the TQ, remove the
            # Ranking requirement
            for req in self.requirements:
                if req.strip().lower()[:6] == 'rank >':
                    requirements.remove(req)
        requirements.append('QueueWorkRef > CPUWorkRef')
        siteRequirements = '\n || '.join(
            ['other.GlueCEInfoHostName == "%s"' % s for s in ceMask])
        requirements.append("( %s\n )" % siteRequirements)
        pilotRequirements = '\n && '.join(requirements)
        pilotJDL += 'pilotRequirements = %s;\n' % pilotRequirements
        pilotJDL += 'Rank = %s;\n' % self.rank
        pilotJDL += 'FuzzyRank = %s;\n' % self.fuzzyRank
        pilotJDL += 'StdOutput = "%s";\n' % outputSandboxFiles[0]
        pilotJDL += 'StdError = "%s";\n' % outputSandboxFiles[1]
        pilotJDL += 'InputSandbox = { "%s" };\n' % '", "'.join(
            [self.install, executable])
        pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join(
            ['"%s"' % f for f in outputSandboxFiles])
        self.log.verbose(pilotJDL)
        return (pilotJDL, pilotRequirements)

    def parseListMatchStdout(self, proxy, cmd, taskQueueID, rb):
        """ Parse List Match stdout to return list of matched CE's.
            Returns False on command failure.
        """
        self.log.verbose('Executing List Match for TaskQueue', taskQueueID)
        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        if not ret['OK']:
            self.log.error('Failed to execute List Match:', ret['Message'])
            self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
            return False
        if ret['Value'][0] != 0:
            self.log.error(
                'Error executing List Match:',
                str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.__sendErrorMail(rb, 'List Match', cmd, ret, proxy)
            return False
        self.log.info('List Match Execution Time: %.2f for TaskQueue %d' %
                      ((time.time() - start), taskQueueID))
        stdout = ret['Value'][1]
        stderr = ret['Value'][2]
        availableCEs = []
        # Parse std.out
        for line in List.fromChar(stdout, '\n'):
            if re.search('/jobmanager-', line) or re.search('/cream-', line):
                # TODO: the line has to be stripped from extra info
                availableCEs.append(line)
        if not availableCEs:
            self.log.info('List-Match failed to find CEs for TaskQueue',
                          taskQueueID)
            self.log.info(stdout)
            self.log.info(stderr)
        else:
            self.log.debug(
                'List-Match returns:',
                str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.log.info(
                'List-Match found %s CEs for TaskQueue' % len(availableCEs),
                taskQueueID)
            self.log.verbose(', '.join(availableCEs))
        return availableCEs

    def parseJobSubmitStdout(self, proxy, cmd, taskQueueID, rb):
        """ Parse Job Submit stdout to return pilot reference.
            Returns (glite_id, rb) on success, False on failure.
        """
        start = time.time()
        self.log.verbose('Executing Job Submit for TaskQueue', taskQueueID)
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        if not ret['OK']:
            self.log.error('Failed to execute Job Submit:', ret['Message'])
            self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
            return False
        if ret['Value'][0] != 0:
            self.log.error(
                'Error executing Job Submit:',
                str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            self.__sendErrorMail(rb, 'Job Submit', cmd, ret, proxy)
            return False
        self.log.info('Job Submit Execution Time: %.2f for TaskQueue %d' %
                      ((time.time() - start), taskQueueID))
        stdout = ret['Value'][1]
        stderr = ret['Value'][2]
        submittedPilot = None
        failed = 1
        # NOTE(review): the 'rb' parameter is deliberately overwritten here —
        # from this point on it holds the broker host parsed from the job URL
        rb = ''
        for line in List.fromChar(stdout, '\n'):
            m = re.search("(https:\S+)", line)
            if (m):
                glite_id = m.group(1)
                submittedPilot = glite_id
                if not rb:
                    m = re.search("https://(.+):.+", glite_id)
                    rb = m.group(1)
                failed = 0
        if failed:
            self.log.error(
                'Job Submit returns no Reference:',
                str(ret['Value'][0]) + '\n'.join(ret['Value'][1:3]))
            return False
        self.log.info('Reference %s for TaskQueue %s' %
                      (glite_id, taskQueueID))
        return glite_id, rb

    def _writeJDL(self, filename, jdlList):
        """ Write the JDL lines to *filename*; return the filename or ''
            on failure
        """
        try:
            f = open(filename, 'w')
            f.write('\n'.join(jdlList))
            f.close()
        except Exception, x:
            self.log.exception()
            return ''
        return filename
class PilotDirector:
  """
   Base Pilot Director class.
   Derived classes must implement:
    * __init__( self, submitPool ):
        that must call the parent class __init__ method and then do its own initialization
    * configure( self, csSection, submitPool ):
        that must call the parent class configure method and the do its own configuration
    * _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                     submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
        actual method doing the submission to the backend once the submitPilots method
        has prepared the common part

   Derived classes might implement:
    * configureFromSection( self, mySection ):
        to reload from a CS section the additional datamembers they might have defined.

   If additional datamembers are defined, they must:
    - be declared in the __init__
    - be reconfigured in the configureFromSection method by executing
      self.reloadConfiguration( csSection, submitPool ) in their configure method
  """
  # Grid middleware name (e.g. 'gLite', 'LCG'); derived classes override it and
  # it drives logger naming and the default TargetGrids list.
  gridMiddleware = ''

  def __init__( self, submitPool ):
    """
     Define the logger and some defaults.

     :param submitPool: name of the submit pool this director serves; used for
                        logger naming and passed to the pilots via a CS option.
    """
    if submitPool == self.gridMiddleware:
      self.log = gLogger.getSubLogger( '%sPilotDirector' % self.gridMiddleware )
    else:
      self.log = gLogger.getSubLogger( '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool ) )

    # Defaults, mostly module-level constants; all of them can be overridden
    # from the CS in configureFromSection().
    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []
    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [ self.gridMiddleware ]

    # list-match (resource availability check) settings; results are cached
    # per-requirement in listMatchCache for listMatchDelay minutes.
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    # fraction of generic TaskQueues that will nevertheless get private pilots
    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    # Safety net: make sure self.log exists even if a derived class bypassed
    # the assignment above (it sets self.log before this point in normal flow).
    if not 'log' in self.__dict__:
      self.log = gLogger.getSubLogger( 'PilotDirector' )
    self.log.info( 'Initialized' )

  def configure( self, csSection, submitPool ):
    """
     Here goes common configuration for all PilotDirectors.

     Loads options from the base CS section, then the middleware- and
     submit-pool-specific subsections, then applies Operations() defaults
     for the running Setup, and finally logs the resulting configuration.
    """
    self.configureFromSection( csSection )
    self.reloadConfiguration( csSection, submitPool )

    # Get the defaults for the Setup where the Director is running
    opsHelper = Operations()
    # Pilot/Version may be a list in the CS: take the first entry
    self.installVersion = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ), [ self.installVersion ] )[0]
    self.installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ), self.installProject )
    self.installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    self.pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", self.pilotExtensionsList )

    self.log.info( '===============================================' )
    self.log.info( 'Configuration:' )
    self.log.info( '' )
    self.log.info( ' Target Grids:   ', ', '.join( self.targetGrids ) )
    self.log.info( ' Install script: ', self.install )
    self.log.info( ' Pilot script:   ', self.pilot )
    self.log.info( ' Install Ver:    ', self.installVersion )
    if self.installProject:
      self.log.info( ' Project:        ', self.installProject )
    if self.installation:
      self.log.info( ' Installation:   ', self.installation )
    if self.extraPilotOptions:
      self.log.info( ' Extra Options:   ', ' '.join( self.extraPilotOptions ) )
    self.log.info( ' ListMatch:      ', self.enableListMatch )
    self.log.info( ' Private %:      ', self.privatePilotFraction * 100 )
    if self.enableListMatch:
      self.log.info( ' ListMatch Delay:', self.listMatchDelay )
    # drop stale list-match results accumulated since the last configure
    self.listMatchCache.purgeExpired()

  def reloadConfiguration( self, csSection, submitPool ):
    """
     Common Configuration can be overwriten for each GridMiddleware:
     re-read the middleware-specific and then the submit-pool-specific
     CS subsections (later reads override earlier ones).
    """
    mySection = csSection + '/' + self.gridMiddleware
    self.configureFromSection( mySection )
    # And Again for each SubmitPool
    mySection = csSection + '/' + submitPool
    self.configureFromSection( mySection )

  def configureFromSection( self, mySection ):
    """
     reload from CS: every datamember keeps its current value as the default,
     so calling this repeatedly with more specific sections layers overrides.
    """
    self.pilot = gConfig.getValue( mySection + '/PilotScript' , self.pilot )
    #TODO: Remove this DIRACVersion after 06/2012
    self.installVersion = gConfig.getValue( mySection + '/DIRACVersion' , self.installVersion )
    self.installVersion = gConfig.getValue( mySection + '/Version' , self.installVersion )
    self.extraPilotOptions = gConfig.getValue( mySection + '/ExtraPilotOptions' , self.extraPilotOptions )
    self.install = gConfig.getValue( mySection + '/InstallScript' , self.install )
    self.installProject = gConfig.getValue( mySection + '/Project' , self.installProject )
    self.installation = gConfig.getValue( mySection + '/Installation' , self.installation )
    self.maxJobsInFillMode = gConfig.getValue( mySection + '/MaxJobsInFillMode' , self.maxJobsInFillMode )
    self.targetGrids = gConfig.getValue( mySection + '/TargetGrids' , self.targetGrids )

    self.enableListMatch = gConfig.getValue( mySection + '/EnableListMatch' , self.enableListMatch )
    self.listMatchDelay = gConfig.getValue( mySection + '/ListMatchDelay' , self.listMatchDelay )
    self.errorClearTime = gConfig.getValue( mySection + '/ErrorClearTime' , self.errorClearTime )
    self.errorTicketTime = gConfig.getValue( mySection + '/ErrorTicketTime' , self.errorTicketTime )
    self.errorMailAddress = gConfig.getValue( mySection + '/ErrorMailAddress' , self.errorMailAddress )
    self.alarmMailAddress = gConfig.getValue( mySection + '/AlarmMailAddress' , self.alarmMailAddress )
    self.mailFromAddress = gConfig.getValue( mySection + '/MailFromAddress' , self.mailFromAddress )
    self.privatePilotFraction = gConfig.getValue( mySection + '/PrivatePilotFraction' , self.privatePilotFraction )

    # VO resolution: CS option first, then VO registry lookup, then keep current.
    # NOTE(review): 'NonExistingGroup' looks like a deliberate probe for a
    # default VO mapping — confirm against getVOForGroup semantics.
    virtualOrganization = gConfig.getValue( mySection + '/VirtualOrganization' , '' )
    if not virtualOrganization:
      virtualOrganization = getVOForGroup( 'NonExistingGroup' )
      if not virtualOrganization:
        virtualOrganization = self.virtualOrganization
    self.virtualOrganization = virtualOrganization

  def _resolveCECandidates( self, taskQueueDict ):
    """
      Return a list of CEs for this TaskQueue.

      Order of resolution:
       1. explicit 'GridCEs' requested by the TaskQueue (bypasses the site mask);
       2. otherwise the site mask from jobDB, minus 'BannedSites', intersected
          with 'Sites' if given, mapped to CEs through the CS Resources section,
          keeping only CEs whose SubmissionMode matches this middleware.
      Returns [] when no candidate is found (callers treat that as "no CE").
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( taskQueueDict['GridCEs'] ) )
      return taskQueueDict['GridCEs']

    # Get the mask
    ret = jobDB.getSiteMask()
    if not ret['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] )
      return []

    siteMask = ret['Value']
    if not siteMask:
      self.log.error( 'Site mask is empty' )
      return []

    self.log.verbose( 'Site Mask: %s' % ', '.join( siteMask ) )

    # remove banned sites from siteMask
    if 'BannedSites' in taskQueueDict:
      for site in taskQueueDict['BannedSites']:
        if site in siteMask:
          siteMask.remove( site )
          self.log.verbose( 'Removing banned site %s from site Mask' % site )

    # remove from the mask if a Site is given
    siteMask = [ site for site in siteMask if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites'] ]

    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                   ', '.join( siteMask ) )

    # Get CE's associates to the given site Names
    ceMask = []

    for grid in self.targetGrids:
      section = '/Resources/Sites/%s' % grid
      ret = gConfig.getSections( section )
      if not ret['OK']:
        # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
        section = '/Resources/Sites/LCG'
        ret = gConfig.getSections( section )

      if not ret['OK']:
        self.log.error( 'Could not obtain CEs from CS', ret['Message'] )
        continue

      gridSites = ret['Value']
      for siteName in gridSites:
        if siteName in siteMask:
          ret = gConfig.getValue( '%s/%s/CE' % ( section, siteName ), [] )
          for ce in ret:
            # only CEs whose configured SubmissionMode matches this middleware
            # (default 'gLite' when the option is absent)
            submissionMode = gConfig.getValue( '%s/%s/CEs/%s/SubmissionMode' % ( section, siteName, ce ), 'gLite' )
            if submissionMode == self.gridMiddleware and ce not in ceMask:
              ceMask.append( ce )

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                      ', '.join( ceMask ) )

    return ceMask

  def _getPilotOptions( self, taskQueueDict, pilotsToSubmit ):
    """
      Build the command-line options for the pilot script and decide whether
      the pilots are private (tied to one owner) or generic (token-based).

      Returns S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup,
                      submitPrivatePilot, privateTQ ) ),
      or S_ERROR when generic credentials / a proxy token can not be obtained.
      pilotsToSubmit may be reduced here by token and filling-mode limits.
    """
    # Need to limit the maximum number of pilots to submit at once
    # For generic pilots this is limited by the number of use of the tokens and the
    # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
    # NOTE: py2 integer division is intentional here (file uses py2 syntax throughout)
    pilotsToSubmit = max( min( pilotsToSubmit, int( 50 / self.maxJobsInFillMode ) ), 1 )
    pilotOptions = []
    # a random fraction of generic TQs still gets private pilots
    privateIfGenericTQ = self.privatePilotFraction > random.random()
    privateTQ = ( 'PilotTypes' in taskQueueDict and 'private' in [ t.lower() for t in taskQueueDict['PilotTypes'] ] )
    forceGeneric = 'ForceGeneric' in taskQueueDict
    submitPrivatePilot = ( privateIfGenericTQ or privateTQ ) and not forceGeneric
    if submitPrivatePilot:
      self.log.verbose( 'Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      ownerDN = taskQueueDict['OwnerDN']
      ownerGroup = taskQueueDict['OwnerGroup']
      # User Group requirement
      pilotOptions.append( '-G %s' % taskQueueDict['OwnerGroup'] )
      # check if group allows jobsharing
      ownerGroupProperties = getPropertiesForGroup( ownerGroup )
      if not 'JobSharing' in ownerGroupProperties:
        # Add Owner requirement to pilot
        pilotOptions.append( "-O '%s'" % ownerDN )
      if privateTQ:
        pilotOptions.append( '-o /Resources/Computing/CEDefaults/PilotType=private' )
      maxJobsInFillMode = self.maxJobsInFillMode
    else:
      #For generic jobs we'll submit mixture of generic and private pilots
      self.log.verbose( 'Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      #ADRI: Find the generic group
      result = findGenericPilotCredentials( group = taskQueueDict['OwnerGroup'] )
      if not result[ 'OK' ]:
        self.log.error( ERROR_GENERIC_CREDENTIALS, result[ 'Message' ] )
        return S_ERROR( ERROR_GENERIC_CREDENTIALS )
      ownerDN, ownerGroup = result[ 'Value' ]

      # a proxy token limits how many payloads the generic pilots may run
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        return S_ERROR( ERROR_TOKEN )
      ( token, numberOfUses ) = result[ 'Value' ]
      pilotsToSubmit = min( numberOfUses, pilotsToSubmit )

      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )

      # ceil( pilotsToSubmit / maxJobsInFillMode ) via py2 integer arithmetic
      pilotsToSubmit = max( 1, ( pilotsToSubmit - 1 ) / self.maxJobsInFillMode + 1 )

      maxJobsInFillMode = int( numberOfUses / pilotsToSubmit )
    # Use Filling mode
    pilotOptions.append( '-M %s' % maxJobsInFillMode )

    # Debug
    pilotOptions.append( '-d' )
    # Setup.
    pilotOptions.append( '-S %s' % taskQueueDict['Setup'] )
    # CS Servers
    csServers = gConfig.getServersList()
    if len( csServers ) > 3:
      # Remove the master
      master = gConfigurationData.getMasterServer()
      if master in csServers:
        csServers.remove( master )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )
    # DIRAC Extensions to be used in pilots
    # ubeda: I'm not entirely sure if we can use here the same opsHelper as in line
    # line +352
    pilotExtensionsList = Operations().getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      # the literal string 'None' in the CS disables extensions explicitly
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    #Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure)
    opsHelper = Operations( group = taskQueueDict['OwnerGroup'], setup = taskQueueDict['Setup'] )
    # Requested version of DIRAC (it can be a list, so we take the fist one)
    version = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ) , [ self.installVersion ] )[0]
    pilotOptions.append( '-r %s' % version )
    # Requested Project to install
    installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ) , self.installProject )
    if installProject:
      pilotOptions.append( '-l %s' % installProject )
    installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    if installation:
      pilotOptions.append( "-V %s" % installation )
    # Requested CPU time
    pilotOptions.append( '-T %s' % taskQueueDict['CPUTime'] )

    if self.submitPoolOption not in self.extraPilotOptions:
      pilotOptions.append( self.submitPoolOption )

    if self.extraPilotOptions:
      pilotOptions.extend( self.extraPilotOptions )

    return S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                     submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method must be implemented on the Backend specific derived class.
      This is problem with the Director, not with the Job so we must return S_OK
      Return S_ERROR if not defined.
    """
    self.log.error( '_submitPilots method not implemented' )
    return S_OK()

  def submitPilots( self, taskQueueDict, pilotsToSubmit, workDir = None ):
    """
      Submit pilot for the given TaskQueue, this method just insert the request
      in the corresponding ThreadPool, the submission is done from the Thread Pool job.

      Flow: resolve CE candidates -> build pilot options -> download a long
      pilot proxy -> delegate to the backend-specific _submitPilots().
      Any unexpected exception is logged and swallowed, returning S_OK( 0 )
      (i.e. "zero pilots submitted"), so a failing TQ never kills the director.
    """

    try:

      taskQueueID = taskQueueDict['TaskQueueID']

      self.log.verbose( 'Submitting Pilot' )
      ceMask = self._resolveCECandidates( taskQueueDict )
      if not ceMask:
        return S_ERROR( 'No CE available for TaskQueue %d' % int( taskQueueID ) )
      result = self._getPilotOptions( taskQueueDict, pilotsToSubmit )
      if not result['OK']:
        return result
      ( pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) = result['Value']
      # get a valid proxy, submit with a long proxy to avoid renewal
      ret = self._getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft = 86400 * 5 )
      if not ret['OK']:
        self.log.error( ret['Message'] )
        self.log.error( 'No proxy Available', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
        return S_ERROR( ERROR_PROXY )
      proxy = ret['Value']
      # Now call a Grid Specific method to handle the final submission of the pilots
      return self._submitPilots( workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                                 submitPrivatePilot, privateTQ, proxy, pilotsPerJob )

    except Exception:
      self.log.exception( 'Error in Pilot Submission' )

    return S_OK( 0 )

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
     To be overwritten if a given Pilot does not require a full proxy
    """
    self.log.info( "Downloading %s@%s proxy" % ( ownerDN, ownerGroup ) )
    return gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )

  def exceptionCallBack( self, threadedJob, exceptionInfo ):
    # ThreadPool error hook: just log; the job itself already returned.
    self.log.exception( 'Error in Pilot Submission' )
class GridPilotDirector( PilotDirector ):
  """
    Base Grid PilotDirector class
    Derived classes must declare:
      self.Middleware: It must correspond to the string before "PilotDirector".
        (For proper naming of the logger)
      self.ResourceBrokers: list of Brokers used by the Director.
        (For proper error reporting)
  """
  def __init__( self, submitPool ):
    """
     Define some defaults and call parent __init__
    """
    self.gridEnv = GRIDENV

    self.cpuPowerRef = CPU_POWER_REF
    self.requirements = REQUIREMENTS
    self.rank = RANK
    self.fuzzyRank = FUZZY_RANK

    # caches used for broker failure bookkeeping (failing RBs are excluded
    # from the broker list on configure)
    self.__failingWMSCache = DictCache()
    self.__ticketsWMSCache = DictCache()
    self.__listMatchWMSCache = DictCache()

    PilotDirector.__init__( self, submitPool )

  def configure( self, csSection, submitPool ):
    """
     Here goes common configuration for all Grid PilotDirectors:
     reload CS options, drop recently-failing brokers from the list and
     randomize the remaining ones to spread the load.
    """
    PilotDirector.configure( self, csSection, submitPool )
    self.reloadConfiguration( csSection, submitPool )

    self.__failingWMSCache.purgeExpired()
    self.__ticketsWMSCache.purgeExpired()
    for rb in self.__failingWMSCache.getKeys():
      if rb in self.resourceBrokers:
        try:
          self.resourceBrokers.remove( rb )
        except:
          # best effort: a concurrent configure may have removed it already
          pass

    self.resourceBrokers = List.randomize( self.resourceBrokers )

    if self.gridEnv:
      self.log.info( ' GridEnv:        ', self.gridEnv )
    if self.resourceBrokers:
      self.log.info( ' ResourceBrokers:', ', '.join( self.resourceBrokers ) )

  def configureFromSection( self, mySection ):
    """
      reload from CS (on top of the base-class options); GridEnv falls back to
      the WorkloadManagement instance setting of the current Setup when no
      section-specific value is found.
    """
    PilotDirector.configureFromSection( self, mySection )

    self.gridEnv = gConfig.getValue( mySection + '/GridEnv', self.gridEnv )
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if setup:
        instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
        if instance:
          self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

    self.resourceBrokers = gConfig.getValue( mySection + '/ResourceBrokers' , self.resourceBrokers )

    self.cpuPowerRef = gConfig.getValue( mySection + '/CPUPowerRef' , self.cpuPowerRef )
    self.requirements = gConfig.getValue( mySection + '/Requirements' , self.requirements )
    self.rank = gConfig.getValue( mySection + '/Rank' , self.rank )
    self.fuzzyRank = gConfig.getValue( mySection + '/FuzzyRank' , self.fuzzyRank )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                     submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method does the actual pilot submission to the Grid RB
      The logic is as follows:
        - If there are no available RB it return error
        - If there is no VOMS extension in the proxy, return error
        - It creates a temp directory
        - Prepare a JDL
          it has some part common to gLite and LCG (the payload description)
          it has some part specific to each middleware

      After submission the pilot references are registered in pilotAgentsDB;
      if more pilots remain to submit, a fresh proxy token is requested and
      the method recurses, accumulating the submitted-pilot count in the
      returned S_OK value.
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # ownerDN = taskQueueDict['OwnerDN']
    # owner identity is taken from the proxy actually used for submission
    credDict = proxy.getCredentials()['Value']
    ownerDN = credDict['identity']
    ownerGroup = credDict[ 'group' ]

    if not self.resourceBrokers:
      # Since we can exclude RBs from the list, it may become empty
      return S_ERROR( ERROR_RB )

    # Need to get VOMS extension for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes( proxy )
    if not ret['OK']:
      self.log.error( ERROR_VOMS, ret['Message'] )
      return S_ERROR( ERROR_VOMS )
    if not ret['Value']:
      return S_ERROR( ERROR_VOMS )
    vomsGroup = ret['Value'][0]

    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Write JDL
    retDict = self._prepareJDL( taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                                ceMask, submitPrivatePilot, privateTQ )
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
      try:
        shutil.rmtree( workingDirectory )
      except:
        pass
      return S_ERROR( ERROR_JDL )

    # Check that there are available queues for the Job:
    if self.enableListMatch:
      availableCEs = []
      now = Time.dateTime()
      # cached per requirement string; DictCache.get returns False on a miss
      availableCEs = self.listMatchCache.get( pilotRequirements )
      if availableCEs == False:
        availableCEs = self._listMatch( proxy, jdl, taskQueueID, rb )
        if availableCEs != False:
          self.log.verbose( 'LastListMatch', now )
          self.log.verbose( 'AvailableCEs ', availableCEs )
          self.listMatchCache.add( pilotRequirements, self.listMatchDelay * 60,
                                   value = availableCEs )                      # it is given in minutes
      if not availableCEs:
        try:
          shutil.rmtree( workingDirectory )
        except:
          pass
        return S_ERROR( ERROR_CE + ' TQ: %d' % taskQueueID )

    # Now we are ready for the actual submission, so

    self.log.verbose( 'Submitting Pilots for TaskQueue', taskQueueID )
    submitRet = self._submitPilot( proxy, pilotsPerJob, jdl, taskQueueID, rb )
    try:
      shutil.rmtree( workingDirectory )
    except:
      pass
    if not submitRet:
      return S_ERROR( 'Pilot Submission Failed for TQ %d ' % taskQueueID )
    # pilotReference, resourceBroker = submitRet

    submittedPilots = 0

    if pilotsPerJob != 1 and len( submitRet ) != pilotsPerJob:
      # Parametric jobs are used: one parent reference expands into children
      for pilotReference, resourceBroker in submitRet:
        pilotReference = self._getChildrenReferences( proxy, pilotReference, taskQueueID )
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN,
                      ownerGroup, resourceBroker, self.gridMiddleware,
                      pilotRequirements )
    else:
      for pilotReference, resourceBroker in submitRet:
        pilotReference = [pilotReference]
        submittedPilots += len( pilotReference )
        pilotAgentsDB.addPilotTQReference( pilotReference, taskQueueID, ownerDN, ownerGroup,
                      resourceBroker, self.gridMiddleware, pilotRequirements )

    # add some sleep here
    time.sleep( 0.1 * submittedPilots )

    if pilotsToSubmit > pilotsPerJob:
      # Additional submissions are necessary, need to get a new token and iterate.
      pilotsToSubmit -= pilotsPerJob
      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        result = S_ERROR( ERROR_TOKEN )
        # report what was already submitted despite the token failure
        result['Value'] = submittedPilots
        return result
      ( token, numberOfUses ) = result[ 'Value' ]
      # replace the previous (consumed) token option before recursing
      for option in pilotOptions:
        if option.find( '-o /Security/ProxyToken=' ) == 0:
          pilotOptions.remove( option )
      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
      pilotsPerJob = max( 1, min( pilotsPerJob, int( numberOfUses / self.maxJobsInFillMode ) ) )
      result = self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                   pilotsToSubmit, ceMask,
                                   submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
      if not result['OK']:
        if 'Value' not in result:
          result['Value'] = 0
        result['Value'] += submittedPilots
        return result
      submittedPilots += result['Value']

    return S_OK( submittedPilots )

  def _prepareJDL( self, taskQueueDict, workingDirectory, pilotOptions, pilotsToSubmit,
                   ceMask, submitPrivatePilot, privateTQ ):
    """
      This method should be overridden in a subclass
    """
    self.log.error( '_prepareJDL() method should be implemented in a subclass' )
    sys.exit()

  def _JobJDL( self, taskQueueDict, pilotOptions, ceMask ):
    """
     The Job JDL is the same for LCG and GLite.

     Builds the payload description (executable, arguments, CPU scaling
     expressions, requirements, rank and sandboxes) and returns the tuple
     ( pilotJDL, pilotRequirements ).
    """
    pilotJDL = 'Executable = "%s";\n' % os.path.basename( self.pilot )
    executable = self.pilot

    pilotJDL += 'Arguments = "%s";\n' % ' '.join( pilotOptions )

    pilotJDL += 'CPUTimeRef = %s;\n' % taskQueueDict['CPUTime']

    pilotJDL += 'CPUPowerRef = %s;\n' % self.cpuPowerRef

    # ClassAd fragment: derive the queue's normalized CPU power from the
    # GlueCECapability "CPUScalingReferenceSI00=<value>" entry when present,
    # falling back to GlueHostBenchmarkSI00.
    pilotJDL += """CPUWorkRef = real( CPUTimeRef * CPUPowerRef );

Lookup = "CPUScalingReferenceSI00=*";
cap = isList( other.GlueCECapability ) ? other.GlueCECapability : { "dummy" };
i0 = regexp( Lookup, cap[0] ) ? 0 : undefined;
i1 = isString( cap[1] ) && regexp( Lookup, cap[1] ) ? 1 : i0;
i2 = isString( cap[2] ) && regexp( Lookup, cap[2] ) ? 2 : i1;
i3 = isString( cap[3] ) && regexp( Lookup, cap[3] ) ? 3 : i2;
i4 = isString( cap[4] ) && regexp( Lookup, cap[4] ) ? 4 : i3;
i5 = isString( cap[5] ) && regexp( Lookup, cap[5] ) ? 5 : i4;
index = isString( cap[6] ) && regexp( Lookup, cap[6] ) ? 6 : i5;
i = isUndefined( index ) ? 0 : index;

QueuePowerRef = real( ! isUndefined( index ) ? int( substr( cap[i], size( Lookup ) - 1 ) ) : other.GlueHostBenchmarkSI00 );
QueueTimeRef = real( other.GlueCEPolicyMaxCPUTime * 60 );
QueueWorkRef = QueuePowerRef * QueueTimeRef;
"""

    requirements = list( self.requirements )
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      # if there an explicit Grig CE requested by the TQ, remove the Ranking requirement
      for req in self.requirements:
        if req.strip().lower()[:6] == 'rank >':
          requirements.remove( req )

    requirements.append( 'QueueWorkRef > CPUWorkRef' )

    siteRequirements = '\n || '.join( [ 'other.GlueCEInfoHostName == "%s"' % s for s in ceMask ] )
    requirements.append( "( %s\n )" % siteRequirements )

    pilotRequirements = '\n && '.join( requirements )

    pilotJDL += 'pilotRequirements = %s;\n' % pilotRequirements

    pilotJDL += 'Rank = %s;\n' % self.rank
    pilotJDL += 'FuzzyRank = %s;\n' % self.fuzzyRank
    pilotJDL += 'StdOutput = "%s";\n' % outputSandboxFiles[0]
    pilotJDL += 'StdError = "%s";\n' % outputSandboxFiles[1]

    pilotJDL += 'InputSandbox = { "%s" };\n' % '", "'.join( [ self.install, executable ] )

    pilotJDL += 'OutputSandbox = { %s };\n' % ', '.join( [ '"%s"' % f for f in outputSandboxFiles ] )

    self.log.verbose( pilotJDL )

    return ( pilotJDL, pilotRequirements )

  def parseListMatchStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Parse List Match stdout to return list of matched CE's.

      Returns False on command failure (and mails the error), otherwise the
      (possibly empty) list of CE endpoints found in stdout.
    """
    self.log.verbose( 'Executing List Match for TaskQueue', taskQueueID )

    start = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute List Match:', ret['Message'] )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing List Match:',
                      str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'List Match', cmd, ret, proxy )
      return False
    self.log.info( 'List Match Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]
    stderr = ret['Value'][2]
    availableCEs = []
    # Parse std.out: CE endpoints contain a jobmanager or CREAM path component
    for line in List.fromChar( stdout, '\n' ):
      if re.search( '/jobmanager-', line ) or re.search( '/cream-', line ):
        # TODO: the line has to be stripped from extra info
        availableCEs.append( line )

    if not availableCEs:
      self.log.info( 'List-Match failed to find CEs for TaskQueue', taskQueueID )
      self.log.info( stdout )
      self.log.info( stderr )
    else:
      self.log.debug( 'List-Match returns:', str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.log.info( 'List-Match found %s CEs for TaskQueue' % len( availableCEs ), taskQueueID )
      self.log.verbose( ', '.join( availableCEs ) )

    return availableCEs

  def parseJobSubmitStdout( self, proxy, cmd, taskQueueID, rb ):
    """
      Parse Job Submit stdout to return pilot reference.

      Returns ( glite_id, rb ) on success — the https job reference and the
      broker host extracted from it — or False when the command failed or no
      reference is found in stdout.
    """
    start = time.time()
    self.log.verbose( 'Executing Job Submit for TaskQueue', taskQueueID )

    ret = executeGridCommand( proxy, cmd, self.gridEnv )

    if not ret['OK']:
      self.log.error( 'Failed to execute Job Submit:', ret['Message'] )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    if ret['Value'][0] != 0:
      self.log.error( 'Error executing Job Submit:',
                      str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      self.__sendErrorMail( rb, 'Job Submit', cmd, ret, proxy )
      return False
    self.log.info( 'Job Submit Execution Time: %.2f for TaskQueue %d' % ( ( time.time() - start ), taskQueueID ) )

    stdout = ret['Value'][1]
    stderr = ret['Value'][2]

    submittedPilot = None

    failed = 1
    # note: the 'rb' parameter is deliberately clobbered here; from this point
    # it holds the broker host parsed out of the job reference
    rb = ''
    for line in List.fromChar( stdout, '\n' ):
      m = re.search( "(https:\S+)", line )
      if ( m ):
        glite_id = m.group( 1 )
        submittedPilot = glite_id
        if not rb:
          m = re.search( "https://(.+):.+", glite_id )
          rb = m.group( 1 )
        failed = 0
    if failed:
      self.log.error( 'Job Submit returns no Reference:',
                      str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
      return False

    self.log.info( 'Reference %s for TaskQueue %s' % ( glite_id, taskQueueID ) )

    return glite_id, rb

  def _writeJDL( self, filename, jdlList ):
    """
      Write the JDL lines to *filename* (newline-joined).
      Returns the filename on success, or '' when the write fails
      (the exception is logged, not raised).
    """
    try:
      f = open( filename, 'w' )
      f.write( '\n'.join( jdlList ) )
      f.close()
    except Exception, x:
      self.log.exception()
      return ''

    return filename