def getOpsSection():
    """
    Find the configuration section that holds the shifters definition.

    When a global VO is defined, the VO/setup section is tried first and the
    VO defaults second. Otherwise the setup section is tried before the global
    defaults.

    :return: S_OK( section path ) for the first candidate whose
             gConfig.getSections lookup succeeds, S_ERROR if none does
    """
    vo = CSGlobals.getVO()
    setup = CSGlobals.getSetup()

    # Candidates are listed from the most specific to the least specific
    if vo:
        candidates = ('/Operations/%s/%s/Shifter' % (vo, setup),
                      '/Operations/%s/Defaults/Shifter' % vo)
    else:
        candidates = ('/Operations/%s/Shifter' % setup,
                      '/Operations/Defaults/Shifter')

    for section in candidates:
        if gConfig.getSections(section)['OK']:
            return S_OK(section)

    return S_ERROR("No shifter section")
def __getSearchPaths(self):
    """
    Build the list of Operations sections to search.

    The list always starts with the global defaults section and the setup
    section. The two VO sections are appended when the helper already has a
    VO, or when CSGlobals reports a global VO (which is then stored as the
    helper's VO).

    :return: list of configuration paths
    """
    paths = ["/Operations/Defaults", "/Operations/%s" % self.__setup]
    if not self.__vo:
        globalVO = CSGlobals.getVO()
        if not globalVO:
            return paths
        # Store the value that was just tested rather than calling
        # CSGlobals.getVO() a second time
        self.__vo = globalVO
    paths.append("/Operations/%s/Defaults" % self.__vo)
    paths.append("/Operations/%s/%s" % (self.__vo, self.__setup))
    return paths
def _curlDownload(self, granularity, site, tests):
    """
    Download SAM status for entity using the SAM DB programmatic interface

    :param granularity: 'Site'/'Sites' selects site_status.jsp,
                        'Resource'/'Resources' selects
                        service_endpoint_status.jsp
    :param site: value sent as the Site_name query parameter
    :param tests: when None, "&only_ss" is appended to the query
    :return: body of the HTTP response as returned by read()
    :raises ValueError: if granularity is none of the supported values
    """
    samdbpi_url = "http://lcg-sam.cern.ch:8080/same-pi/"

    # Set your method
    if granularity in ('Site', 'Sites'):
        samdbpi_method = "site_status.jsp?"
    elif granularity in ('Resource', 'Resources'):
        samdbpi_method = "service_endpoint_status.jsp?"
    else:
        # Without this branch samdbpi_method would be unbound further down
        raise ValueError("Unsupported granularity: %r" % (granularity,))

    # Set your site
    samdbpi_site = site

    # set test
    samdbpi_test = ""
    if tests is None:
        samdbpi_test = "&only_ss"

    # The first CS extension is sent as the VO name
    extension = CSGlobals.getCSExtensions()[0]

    samdb_ep = samdbpi_url + samdbpi_method + "VO_name=" + extension + "&Site_name=" + samdbpi_site + samdbpi_test

    req = urllib2.Request(samdb_ep)
    samPage = urllib2.urlopen(req)
    try:
        return samPage.read()
    finally:
        # Release the connection even when read() fails
        samPage.close()
def __getInstallFlags(self):
    """
    Get the flags to pass to dirac-install.py inside the container.

    The flags are assembled from the Operations options Pilot/Installation,
    Pilot/Version and Pilot/Extensions, the version of the running
    interpreter and the optional ContainerExtraOpts CE parameter.

    :return: a string containing the command line flags
    """
    instOpts = []
    setup = gConfig.getValue("/DIRAC/Setup", "unknown")
    opsHelper = Operations.Operations(setup=setup)

    installationName = opsHelper.getValue("Pilot/Installation", "")
    if installationName:
        instOpts.append('-V %s' % installationName)

    diracVersions = opsHelper.getValue("Pilot/Version", [])
    # The option defaults to an empty list: only request a release when one
    # is configured instead of failing with an IndexError
    if diracVersions:
        instOpts.append("-r '%s'" % diracVersions[0])

    # Major and minor interpreter version glued together, e.g. "27"
    pyVer = "%u%u" % (sys.version_info.major, sys.version_info.minor)
    instOpts.append("-i %s" % pyVer)

    pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
    extensionsList = []
    if pilotExtensionsList:
        # A first entry equal to the string 'None' leaves the list empty
        if pilotExtensionsList[0] != 'None':
            extensionsList = pilotExtensionsList
    else:
        extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
        # Extensions with 'Web' in their name are not passed on
        instOpts.append("-e '%s'" % ','.join([ext for ext in extensionsList if 'Web' not in ext]))

    if 'ContainerExtraOpts' in self.ceParameters:
        instOpts.append(self.ceParameters['ContainerExtraOpts'])
    return ' '.join(instOpts)
def __discoverSettings(self):
    """
    Work out the VO and the setup used by this helper.

    VO: the global VO wins, then the VO given by the user, then the VO
    resolved from the user group; False when none of them is set.
    Setup: the setup given by the user, otherwise the one from CSGlobals.
    """
    # Set the VO: first non-empty source wins, False when all are empty
    self.__vo = (CSGlobals.getVO()
                 or self.__uVO
                 or Registry.getVOForGroup(self.__uGroup)
                 or False)

    # Set the setup
    self.__setup = False
    if not self.__uSetup:
        self.__setup = CSGlobals.getSetup()
    else:
        self.__setup = self.__uSetup
def __generateRootModules(self, baseModules):
    """
    Build the list of root modules to search.

    The list starts with the given base modules, followed by
    "<extension>DIRAC" for every CS extension (taken in reverse order) that
    is not in the list yet, and ends with the empty string.

    :param baseModules: list of root module names to start from; the list is
                        copied and never modified
    """
    # Work on a copy so the list owned by the caller is left untouched
    rootModules = list(baseModules)
    for extension in reversed(CSGlobals.getCSExtensions()):
        # NOTE(review): names already ending in "DIRAC" are skipped, exactly
        # as before - confirm this is intended
        if extension[-5:] == "DIRAC":
            continue
        qualifiedName = "%sDIRAC" % extension
        # Compare the name that is actually appended, so an extension
        # already in the list is not added a second time
        if qualifiedName not in rootModules:
            rootModules.append(qualifiedName)
    rootModules.append("")
    self.__rootModules = rootModules
def __rootModules(self):
    """
    Generate the candidate root module names.

    Yields every CS extension, suffixed with "DIRAC" when its name does not
    end with it already, then 'DIRAC' and finally the empty string.
    """
    for extension in CSGlobals.getCSExtensions():
        if extension.endswith("DIRAC"):
            yield extension
        else:
            yield "%sDIRAC" % extension
    for fixedName in ('DIRAC', ''):
        yield fixedName
def __discoverSettings(self):
    """
    Work out the VO used by this helper.

    The global VO wins, then the VO given by the user, then the VO resolved
    from the user group. The VO is set to None when none of them is set.
    """
    # Set the VO, falling through the sources in order of precedence
    vo = CSGlobals.getVO()
    if not vo:
        vo = self.__uVO
    if not vo:
        vo = Registry.getVOForGroup(self.__uGroup)
    self.__vo = vo if vo else None
def __generateRootModules(self, baseModules):
    """
    Build the list of root modules to search.

    The list is assembled from the given base modules, followed by
    "<extension>DIRAC" for every CS extension (taken in reverse order) that
    is not in the list yet, followed by the empty string. The whole list is
    then reversed.

    :param baseModules: list of root module names to start from; the list is
                        copied and never modified
    """
    # Work on a copy so the list owned by the caller is neither extended nor
    # reversed behind its back
    rootModules = list(baseModules)
    for extension in reversed(CSGlobals.getCSExtensions()):
        # NOTE(review): names already ending in "DIRAC" are skipped, exactly
        # as before - confirm this is intended
        if extension[-5:] == "DIRAC":
            continue
        qualifiedName = "%sDIRAC" % extension
        # Compare the name that is actually appended, so an extension
        # already in the list is not added a second time
        if qualifiedName not in rootModules:
            rootModules.append(qualifiedName)
    rootModules.append("")
    # Reversing the order because we want first to look in the extension(s)
    rootModules.reverse()
    self.__rootModules = rootModules
def includeExtensionErrors():
    """
    Merge all the errors of all the extensions into the errors of this module.

    Should be called only at the initialization of DIRAC, so by the
    parseCommandLine, dirac-agent.py, dirac-service.py, dirac-executor.py

    For every CS extension the module <ext>DIRAC.Core.Utilities.DErrno is
    looked up. When found, its extra_dErrName, extra_dErrorCode and
    extra_dStrError dictionaries, and the optional extra_compatErrorString,
    are merged into this module. Extensions that cannot be loaded or merged
    are skipped.
    """

    def __recurseImport(modName, parentModule=None, fullName=False):
        """
        Internal function to load modules

        :param modName: dotted module name, or the list of its components
        :param parentModule: already loaded parent package, if any
        :param fullName: full dotted name, only passed along the recursion
        :return: the loaded module, or None when an ImportError was raised
        """
        if isinstance(modName, basestring):
            modName = modName.split(".")
        if not fullName:
            fullName = ".".join(modName)
        try:
            if parentModule:
                impData = imp.find_module(modName[0], parentModule.__path__)
            else:
                impData = imp.find_module(modName[0])
            try:
                impModule = imp.load_module(modName[0], *impData)
            finally:
                # The caller of find_module owns the file object and has to
                # close it, also when load_module raises
                if impData[0]:
                    impData[0].close()
        except ImportError:
            return None
        if len(modName) == 1:
            return impModule
        return __recurseImport(modName[1:], impModule, fullName=fullName)

    from DIRAC.ConfigurationSystem.Client.Helpers import CSGlobals
    thisModule = sys.modules[__name__]
    for extension in CSGlobals.getCSExtensions():
        try:
            ext_derrno = __recurseImport('%sDIRAC.Core.Utilities.DErrno' % extension)
            if not ext_derrno:
                continue
            # The next 3 dictionary MUST be present for consistency
            # Global name of errors
            thisModule.__dict__.update(ext_derrno.extra_dErrName)
            # Dictionary with the error codes
            thisModule.dErrorCode.update(ext_derrno.extra_dErrorCode)
            # Error description string
            thisModule.dStrError.update(ext_derrno.extra_dStrError)
            # extra_compatErrorString is optional
            for err in getattr(ext_derrno, 'extra_compatErrorString', []):
                thisModule.compatErrorString.setdefault(err, []).extend(ext_derrno.extra_compatErrorString[err])
        except Exception:
            # Best effort: a broken extension must not prevent the start-up.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            pass
def __discoverSettings(self):
    """
    Work out the VO and the setup used by this helper.

    VO: the global VO wins, then the VO given by the user. Failing those,
    the VO is resolved from the user group when one was given (False when
    that yields nothing), otherwise from getVOfromProxyGroup(). When that
    last call fails the VO is left untouched.
    Setup: the setup given by the user, otherwise the one from CSGlobals.
    """
    # Set the VO
    explicitVO = CSGlobals.getVO() or self.__uVO
    if explicitVO:
        self.__vo = explicitVO
    elif self.__uGroup:
        self.__vo = Registry.getVOForGroup(self.__uGroup) or False
    else:
        proxyVO = getVOfromProxyGroup()
        if proxyVO['OK']:
            self.__vo = proxyVO['Value']

    # Set the setup
    self.__setup = False
    if not self.__uSetup:
        self.__setup = CSGlobals.getSetup()
    else:
        self.__setup = self.__uSetup
def setHandlers(cls, handlers):
    """
    Register the handlers and compute the ordered list of extensions.

    :param handlers: mapping whose values are handlers; each handler is
                     stored under its LOCATION with leading and trailing
                     "/" stripped

    The extensions list holds the installed extensions other than "DIRAC"
    and "WebAppDIRAC", followed by "DIRAC" and then "WebAppDIRAC".
    """
    cls.__handlers = {}
    for handler in handlers.values():
        cls.__handlers[handler.LOCATION.strip("/")] = handler

    # Calculate extensions: the two core packages always close the list
    corePackages = ("WebAppDIRAC", "DIRAC")
    cls.__extensions = [ext for ext in CSGlobals.getInstalledExtensions()
                        if ext not in corePackages]
    cls.__extensions.extend(["DIRAC", "WebAppDIRAC"])
def loadObjects(path, reFilter=None, parentClass=None):
    """
    Load the classes found under a given path in the extensions and in DIRAC.

    :param str path: the path to the system, for example
                     DIRAC/AccountingSystem
    :param object reFilter: compiled regular expression used to select the
                            files; by default names ending in a lower case
                            letter or a digit from 1 to 9 followed by ".py"
    :param object parentClass: when given, only its strict subclasses are
                               kept
    :return: dict mapping the class name to the class object
    """
    if not reFilter:
        reFilter = re.compile(r".*[a-z1-9]\.py$")
    pathList = List.fromChar(path, "/")

    # Extensions come first, so for a given class name they win over DIRAC
    parentModuleList = ["%sDIRAC" % ext for ext in CSGlobals.getCSExtensions()]
    parentModuleList.append('DIRAC')

    # Find which object files match
    objectsToLoad = {}
    for parentModule in parentModuleList:
        objDir = os.path.join(DIRAC.rootPath, parentModule, *pathList)
        if os.path.isdir(objDir):
            for objFile in os.listdir(objDir):
                if not reFilter.match(objFile):
                    continue
                # Strip the ".py" suffix
                pythonClassName = objFile[:-3]
                if pythonClassName in objectsToLoad:
                    continue
                gLogger.info("Adding to load queue %s/%s/%s" % (parentModule, path, pythonClassName))
                objectsToLoad[pythonClassName] = parentModule

    # Load them!
    loadedObjects = {}
    for pythonClassName, parentModule in objectsToLoad.items():
        try:
            # Where parentModule can be DIRAC, pathList is something like
            # [ "AccountingSystem", "Client", "Types" ] and the class is
            # expected to carry the same name as its module
            objPythonPath = "%s.%s.%s" % (parentModule, ".".join(pathList), pythonClassName)
            objModule = __import__(objPythonPath, globals(), locals(), pythonClassName)
            objClass = getattr(objModule, pythonClassName)
        except Exception as e:
            gLogger.error("Can't load type", "%s/%s: %s" % (parentModule, pythonClassName, str(e)))
            continue
        # The parent class itself is never returned
        if parentClass == objClass:
            continue
        if parentClass and not issubclass(objClass, parentClass):
            gLogger.warn("%s is not a subclass of %s. Skipping" % (objClass, parentClass))
            continue
        gLogger.info("Loaded %s" % objPythonPath)
        loadedObjects[pythonClassName] = objClass

    return loadedObjects
def __discoverSettings(self):
    """
    Work out the VO and the setup stored in the thread data.

    VO: the VO given by the user, otherwise the VO resolved from the user
    group. Setup: the setup given by the user, otherwise the one from
    CSGlobals.

    :raises RuntimeError: when no VO can be determined
    """
    data = self.__threadData

    # Set the VO
    data.vo = False
    data.vo = data.uVO if data.uVO else Registry.getVOForGroup(data.uGroup)
    if not data.vo:
        raise RuntimeError("Don't know how to discover VO. Please check your VO and groups configuration")

    # Set the setup
    data.setup = False
    data.setup = data.uSetup if data.uSetup else CSGlobals.getSetup()
def loadWebAppCFGFiles():
    """
    Load WebApp/web.cfg definitions

    The web.cfg files of WebAppDIRAC, DIRAC and the CS extensions are looked
    up, in that order, and merged one after the other into a single CFG.
    A section flagged with the AbsoluteDefinition option is removed from
    what was merged so far before the module's own definition is merged in.

    NOTE(review): in the visible code the merged CFG is neither returned nor
    loaded anywhere - confirm against the rest of the file.
    """
    exts = []
    for ext in CSGlobals.getCSExtensions():
        if ext == "DIRAC":
            continue
        if ext[-5:] != "DIRAC":
            ext = "%sDIRAC" % ext
        if ext != "WebAppDIRAC":
            exts.append(ext)
    exts.append("DIRAC")
    exts.append("WebAppDIRAC")

    webCFG = CFG()
    for modName in reversed(exts):
        try:
            modPath = imp.find_module(modName)[1]
        except ImportError:
            continue
        gLogger.verbose("Found module %s at %s" % (modName, modPath))
        cfgPath = os.path.join(modPath, "WebApp", "web.cfg")
        if not os.path.isfile(cfgPath):
            gLogger.verbose("Inexistant %s" % cfgPath)
            continue
        try:
            modCFG = CFG().loadFromFile(cfgPath)
        except Exception as excp:
            gLogger.error("Could not load %s: %s" % (cfgPath, excp))
            continue
        gLogger.verbose("Loaded %s" % cfgPath)

        # Breadth-first walk over the sections below BASECS
        expl = [BASECS]
        while expl:
            current = expl.pop(0)
            if not modCFG.isSection(current):
                continue
            if modCFG.getOption("%s/AbsoluteDefinition" % current, False):
                gLogger.verbose("%s:%s is an absolute definition" % (modName, current))
                try:
                    webCFG.deleteKey(current)
                except Exception:
                    # Best effort: the deletion is allowed to fail. Only
                    # Exception is caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    pass
                modCFG.deleteKey("%s/AbsoluteDefinition" % current)
            else:
                for sec in modCFG[current].listSections():
                    expl.append("%s/%s" % (current, sec))
        # Add the modCFG
        webCFG = webCFG.mergeWith(modCFG)
def getPaths(self, dirName):
    """
    Get lists of paths for all installed and enabled extensions

    :param dirName: name of the directory looked for below the "WebApp"
                    directory of every extension
    :return: list with the existing <extension>/WebApp/<dirName> directories
             followed by the WebAppDIRAC one, which is always appended
    """
    pathList = []
    for extName in CSGlobals.getCSExtensions():
        # endswith() also handles names of exactly four characters, for
        # which rfind() == len() - 5 == -1 wrongly reported a match
        if not extName.endswith("DIRAC"):
            extName = "%sDIRAC" % extName
        if extName == "WebAppDIRAC":
            continue
        try:
            modFile, modPath, desc = imp.find_module(extName)
        except ImportError:
            continue
        # find_module hands over an open file for plain modules; only the
        # path is needed here
        if modFile:
            modFile.close()
        staticPath = os.path.join(modPath, "WebApp", dirName)
        if os.path.isdir(staticPath):
            pathList.append(staticPath)
    # Add WebAppDirac to the end
    pathList.append(os.path.join(WebAppDIRAC.rootPath, "WebApp", dirName))
    return pathList
def generatePath(self, option, vo=False, setup=False):
    """
    Generate the CS path for an option

    :param option: option name, appended at the end of the path
    :param vo: VO to use when no global VO is defined; when it evaluates
               False the VO of the helper is used
    :param setup: setup to use. When it evaluates False but is not None the
                  setup of the helper is used; None selects "Defaults"
    :return: the path, /Operations[/<vo>]/<setup or Defaults>/<option>
    """
    parts = ["/Operations"]

    # The VO level only exists when there is no global VO
    if not CSGlobals.getVO():
        effectiveVO = vo or self.__vo
        if effectiveVO:
            parts.append("%s" % effectiveVO)

    # None is an explicit request for the defaults, so it is kept as is
    if setup is not None and not setup:
        setup = self.__setup
    parts.append("%s" % setup if setup else "Defaults")

    parts.append("%s" % option)
    return "/".join(parts)
def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        The Matcher is first asked whether any task queue matches the overall
        requirements of the director. Then, for every queue taken in random
        order, the eligible task queues and the waiting pilots are counted,
        a pilot proxy is obtained and pilots are submitted in chunks. The
        submitted pilots are registered in the PilotAgentsDB and attached to
        the task queues with a probability proportional to their priorities.

        :return: S_OK() at the end of the cycle or when there is no work,
                 otherwise the failed result of the call that stopped it
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup, 'CPUTime': 9999999, 'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
        return result
    if not result['Value']:
        self.log.verbose( 'No Waiting jobs suitable for the director' )
        return S_OK()

    # Collect the sites the waiting jobs are targeted to:
    #  jobSites  - sites named by any matching task queue
    #  testSites - sites named by task queues that carry a "JobTypes" entry
    #  anySite   - set when a task queue names no site or the 'any' site
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add( site )
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add( site )
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None )
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    #if totalWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    # Visit the queues in random order
    queues = self.queueDict.keys()
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        # A non-zero counter (modulo the cycle factor) means the queue is
        # still being skipped; the counter is advanced on every skipped cycle
        failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
        if failedCount != 0:
            # NOTE(review): the message uses a literal 10 while the modulo
            # uses self.failedQueueCycleFactor - confirm they agree
            self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10-failedCount ) )
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        siteMask = siteName in siteMaskList

        if not anySite and siteName not in jobSites:
            self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) )
            continue
        # A site out of the mask is only served when it is one of testSites
        if not siteMask and siteName not in testSites:
            self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) )
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
            queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
        else:
            self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
            continue
        # Cap the queue CPU time at the configured maximum
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict[ 'GridCE' ] = ceName
        #if not siteMask and 'Site' in ceDict:
        #  self.log.info( 'Site not in the mask %s' % siteName )
        #  self.log.info( 'Removing "Site" from matching Dict' )
        #  del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms( platform )
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues( ceDict )
        if not result['OK']:
            self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose( 'No matching TQs found for %s' % queue )
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = taskQueueDict.keys()
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) )

        # Get the number of already waiting pilots for these task queues
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime )
            if not result['OK']:
                # A failed count is treated as "no waiting pilots"
                self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
        if totalWaitingPilots >= totalTQJobs:
            self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
            continue

        self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) )

        # Get the working proxy
        # The proxy is requested for the queue CPU time plus 86400 seconds
        cpuTime = queueCPUTime + 86400
        self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, cpuTime - 60 )

        # Get the number of available slots on the target site/queue
        totalSlots = self.__getQueueSlots( queue )
        if totalSlots == 0:
            self.log.debug( '%s: No slots available' % queue )
            continue

        # Never more pilots than free slots, nor than jobs without a pilot
        pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
        self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                       ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

        # Submit in chunks; the chunk size is decided by __getExecutable
        while pilotsToSubmit > 0:
            self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

            bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
            # '.' is the default execution directory for CREAM, '' otherwise;
            # an explicit JobExecDir of the queue overrides both
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
            httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

            result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
            if not result['OK']:
                return result

            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob( executable, '', pilotSubmissionChunk )
            # The executable file is removed whatever the submission outcome
            os.unlink( executable )
            if not result['OK']:
                self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
                # Leave the while loop and mark the queue as failed
                pilotsToSubmit = 0
                self.failedQueues[queue] += 1
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
            totalSubmittedPilots += len( pilotList )
            self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
            stampDict = {}
            if result.has_key( 'PilotStampDict' ):
                stampDict = result['PilotStampDict']
            # Cumulative priorities: every task queue owns the interval that
            # ends at its running sum
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append( ( tq, sumPriority ) )
            # This value is overwritten in the loop below before being used
            rndm = random.random()*sumPriority
            # From here on tqDict maps a task queue ID to its list of pilots
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # NOTE(review): when no interval matches (e.g. sumPriority is
                # 0) tqID keeps its value from a previous iteration - confirm
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key( tqID ):
                    tqDict[tqID] = []
                tqDict[tqID].append( pilotID )

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict )
                if not result['OK']:
                    self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName )
                    if not result['OK']:
                        self.log.error( 'Failed to set pilot status: ', result['Message'] )
                        continue

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
def submitJobs(self):
    """
    Go through defined computing elements and submit jobs if necessary

    The Matcher is first asked whether any task queue matches the overall
    requirements of the director. Then, for every queue taken in random
    order, a pilot proxy is obtained, the free slots of the CE and the
    eligible task queues are looked up and pilots are submitted in chunks.
    The submitted pilots are registered in the PilotAgentsDB and attached to
    the task queues with a probability proportional to their priorities.

    :return: S_OK() at the end of the cycle or when there is no work,
             otherwise the failed result of the call that stopped it
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
    if self.vo:
        tqDict["Community"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites

    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result["Value"]

    # Visit the queues in random order
    queues = self.queueDict.keys()
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        ceType = self.queueDict[queue]["CEType"]
        queueName = self.queueDict[queue]["QueueName"]
        siteName = self.queueDict[queue]["Site"]
        siteMask = siteName in siteMaskList

        if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
            queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
        else:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
            continue
        # Cap the queue CPU time at the configured maximum
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        # The proxy is requested for the queue CPU time plus 86400 seconds
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        self.proxy = result["Value"]
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result["OK"]:
            self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
            continue
        ceInfoDict = result["CEInfoDict"]
        self.log.info(
            "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
            % (
                ceName,
                queueName,
                ceInfoDict["WaitingJobs"],
                ceInfoDict["RunningJobs"],
                ceInfoDict["SubmittedJobs"],
                ceInfoDict["MaxTotalJobs"],
            )
        )

        totalSlots = result["Value"]

        # Prepare the queue description used to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict["GridCE"] = ceName
        # For a site out of the mask the "Site" key is dropped from the
        # description sent to the Matcher
        if not siteMask and "Site" in ceDict:
            self.log.info("Site not in the mask %s" % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict["Site"]
        if self.vo:
            ceDict["Community"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        # This is a hack to get rid of !
        ceDict["SubmitPool"] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.info("No matching TQs found")
            continue

        totalTQJobs = 0
        tqIDList = taskQueueDict.keys()
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]["Jobs"]

        pilotsToSubmit = min(totalSlots, totalTQJobs)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots(
                {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
            )
            if not result["OK"]:
                # A failed count is treated as "no waiting pilots"
                self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result["Value"]
                self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

        # Never more pilots than free slots, nor than jobs without a pilot
        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info(
            "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
            % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
        )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        # Submit in chunks; the chunk size is decided by __getExecutable
        while pilotsToSubmit > 0:
            self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get("BundleProxy", False)
            # "." is the default execution directory for CREAM, "" otherwise;
            # an explicit JobExecDir of the queue overrides both
            jobExecDir = ""
            if ceType == "CREAM":
                jobExecDir = "."
            jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
            httpProxy = self.queueDict[queue].get("HttpProxy", "")

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result["OK"]:
                return result

            executable, pilotSubmissionChunk = result["Value"]
            result = ce.submitJob(executable, "", pilotSubmissionChunk)
            # The executable file is removed whatever the submission outcome
            os.unlink(executable)
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                # Leave the while loop for this queue
                pilotsToSubmit = 0
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result["Value"]
            self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
            stampDict = {}
            if result.has_key("PilotStampDict"):
                stampDict = result["PilotStampDict"]
            # Cumulative priorities: every task queue owns the interval that
            # ends at its running sum
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            # This value is overwritten in the loop below before being used
            rndm = random.random() * sumPriority
            # From here on tqDict maps a task queue ID to its list of pilots
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # NOTE(review): when no interval matches (e.g. sumPriority is
                # 0) tqID is unbound or stale from a previous iteration - confirm
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                )
                if not result["OK"]:
                    self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot,
                        "Submitted",
                        ceName,
                        "Successfully submitted by the SiteDirector",
                        siteName,
                        queueName,
                    )
                    if not result["OK"]:
                        self.log.error("Failed to set pilot status: ", result["Message"])
                        continue

    return S_OK()
def __getPilotOptions(self, queue, pilotsToSubmit):
    """
    Prepare pilot options

    :param queue: key of the queue in self.queueDict
    :param pilotsToSubmit: number of pilots the caller wants to submit
    :return: [ list of pilot options, number of pilots to submit ]; the
             number may differ from the requested one once the proxy token
             is known. [ None, None ] is returned when the setup or the
             pilot version is not configured, when the proxy token request
             fails or when the token allows no job at all.
    """
    queueDict = self.queueDict[queue]["ParametersDict"]
    pilotOptions = []

    setup = gConfig.getValue("/DIRAC/Setup", "unknown")
    if setup == "unknown":
        self.log.error("Setup is not defined in the configuration")
        return [None, None]
    pilotOptions.append("-S %s" % setup)
    opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

    # Installation defined?
    installationName = opsHelper.getValue("Pilot/Installation", "")
    if installationName:
        pilotOptions.append("-V %s" % installationName)

    # Project defined?
    projectName = opsHelper.getValue("Pilot/Project", "")
    if projectName:
        pilotOptions.append("-l %s" % projectName)
    else:
        self.log.info("DIRAC project will be installed by pilots")

    # Request a release
    diracVersion = opsHelper.getValue("Pilot/Version", [])
    if not diracVersion:
        self.log.error("Pilot/Version is not defined in the configuration")
        return [None, None]
    # diracVersion is a list of accepted releases. Just take the first one
    pilotOptions.append("-r %s" % diracVersion[0])

    ownerDN = self.pilotDN
    ownerGroup = self.pilotGroup
    # Request token for maximum pilot efficiency
    result = gProxyManager.requestToken(ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
    if not result["OK"]:
        self.log.error("Invalid proxy token request", result["Message"])
        return [None, None]
    (token, numberOfUses) = result["Value"]

    # Number of jobs every pilot is allowed to run
    jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
    if jobsPerPilot <= 0:
        # Nothing can be run with this token, and the division below would
        # fail with a ZeroDivisionError
        self.log.error("Proxy token does not allow any job execution")
        return [None, None]

    pilotOptions.append("-o /Security/ProxyToken=%s" % token)
    # Use Filling mode
    pilotOptions.append("-M %s" % jobsPerPilot)

    # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
    # with numberOfUses tokens we can submit at most:
    #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    # pilots. Floor division keeps the result an integer.
    newPilotsToSubmit = numberOfUses // jobsPerPilot
    if newPilotsToSubmit != pilotsToSubmit:
        self.log.info(
            "Number of pilots to submit is changed to %d after getting the proxy token" % newPilotsToSubmit
        )
        pilotsToSubmit = newPilotsToSubmit

    # Debug
    if self.pilotLogLevel.lower() == "debug":
        pilotOptions.append("-d")

    # CS Servers
    csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
    pilotOptions.append("-C %s" % ",".join(csServers))

    # DIRAC Extensions to be used in pilots
    pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
    extensionsList = []
    if pilotExtensionsList:
        # A first entry equal to the string "None" leaves the list empty
        if pilotExtensionsList[0] != "None":
            extensionsList = pilotExtensionsList
    else:
        extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
        pilotOptions.append("-e %s" % ",".join(extensionsList))

    # Requested CPU time
    pilotOptions.append("-T %s" % queueDict["CPUTime"])
    # CEName
    pilotOptions.append("-N %s" % self.queueDict[queue]["CEName"])
    # SiteName
    pilotOptions.append("-n %s" % queueDict["Site"])
    if "ClientPlatform" in queueDict:
        pilotOptions.append("-p '%s'" % queueDict["ClientPlatform"])

    if "SharedArea" in queueDict:
        pilotOptions.append("-o '/LocalSite/SharedArea=%s'" % queueDict["SharedArea"])

    if "SI00" in queueDict:
        # Both CPU factors are derived from SI00 when it is available
        factor = float(queueDict["SI00"]) / 250.0
        pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor)
        pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % factor)
    else:
        if "CPUScalingFactor" in queueDict:
            pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % queueDict["CPUScalingFactor"])
        if "CPUNormalizationFactor" in queueDict:
            pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict["CPUNormalizationFactor"])

    # Hack
    if self.defaultSubmitPools:
        pilotOptions.append("-o /Resources/Computing/CEDefaults/SubmitPool=%s" % self.defaultSubmitPools)

    if self.group:
        pilotOptions.append("-G %s" % self.group)

    self.log.verbose("pilotOptions: ", " ".join(pilotOptions))

    return [pilotOptions, pilotsToSubmit]
def __getVOPath(self):
    """
    Get the base Operations path.

    :return: "/Operations" when a global VO is defined, otherwise
             "/Operations/<vo>" with the VO stored in the thread data
    """
    # The thread data is only read when there is no global VO
    return "/Operations" if CSGlobals.getVO() else "/Operations/%s" % self.__threadData.vo
def __init__(self, name='Monitoring/MonitoringDB', readOnly=False):
    """
    Initialise the parent class, the internal state and the indexes.

    :param name: passed to the parent constructor as second argument
    :param readOnly: stored as the read-only flag of the instance
    """
    # The parent gets the lower-cased setup name as third argument
    setupName = CSGlobals.getSetup().lower()
    super(MonitoringDB, self).__init__('MonitoringDB', name, setupName)
    self.__documents = {}
    self.__readonly = readOnly
    self.__loadIndexes()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    The method works in two stages:

    1. The Matcher is asked for the task queues matching the overall
       requirements of this director (setup, submit pool, community, owner
       groups, platforms, sites and the union of all queue tags). If nothing
       matches, the cycle ends immediately.
    2. The configured queues are visited in random order. Queues that failed
       recently, sit at unusable sites or on unusable computing elements, or
       have no expected workload are skipped. For each remaining queue and
       each processor tag shared with the matching task queues, pilots are
       submitted in chunks, and each submitted pilot is registered in the
       PilotAgentsDB against a task queue drawn at random with a probability
       proportional to the task queue priority.

    :return: S_OK() at the end of the cycle (also when there is no matching
             workload), or the failed result structure of the platform
             lookup, the Matcher call, the usable sites lookup or the pilot
             proxy retrieval
    """
    # list() so that the key collections stay real lists whatever the dict
    # implementation returns from keys()
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    if self.checkPlatform:
        result = self.resourcesModule.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites requested by the waiting jobs
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', list(self.queueDict))
    # if tagWaitingPilots >= totalWaitingJobs:
    #     self.log.info( 'No more pilots to be submitted in this cycle' )
    #     return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    # random.shuffle() needs a mutable sequence, hence the explicit list
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles relative to the configured factor
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        queueTags = self.queueDict[queue]['ParametersDict']['Tag']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                # The element that was queried is the CE, so report its name
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (ceName, result['Message']))
                continue
            if result['Value']:
                # get the value of the status
                result = result['Value'][ceName]['all']

            # NOTE(review): with an empty status value `result` is still the
            # result structure here, so the CE is skipped - confirm this is
            # the intended nesting
            if result not in ('Active', 'Degraded'):
                self.log.verbose(
                    "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #     self.log.info( 'Site not in the mask %s' % siteName )
        #     self.log.info( 'Removing "Site" from matching Dict' )
        #     del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        if self.checkPlatform:
            platform = self.queueDict[queue]['Platform']
            result = self.resourcesModule.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

        ceDict['Tag'] = queueTags
        # Get the number of eligible jobs for the target site/queue
        result = matcherClient.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = list(taskQueueDict)
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Number of processors requested per pilot: N for "NProcessors",
            # -1 for "WholeNode"
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

                executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                      bundleProxy=bundleProxy,
                                                                      jobExecDir=jobExecDir,
                                                                      processors=processors)
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    # Setting the counter to zero ends the while loop
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']

                # Cumulative priorities: a uniform draw in [0, sumPriority)
                # selects the first task queue whose cumulative value exceeds it
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fall back to the last task queue: a draw that is not
                    # below any cumulative priority (e.g. all priorities are
                    # zero) must not reuse a task queue ID left over from an
                    # earlier loop
                    chosenTQ = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            chosenTQ = tq
                            break
                    pilotsByTQ.setdefault(chosenTQ, []).append(pilotID)

                for chosenTQ, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                               chosenTQ,
                                                               self.pilotDN,
                                                               self.pilotGroup,
                                                               self.localhost,
                                                               ceType,
                                                               stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    The method works in two stages:

    1. The Matcher is asked for the task queues matching the overall
       requirements of this director (setup, submit pool, community, owner
       groups, platforms, sites and the union of all queue tags). If nothing
       matches, the cycle ends immediately.
    2. The configured queues are visited in random order. Queues that failed
       recently, sit at unusable sites or on unusable computing elements, or
       have no expected workload are skipped. For each remaining queue and
       each processor tag shared with the matching task queues, pilots are
       submitted in chunks, and each submitted pilot is registered in the
       PilotAgentsDB against a task queue drawn at random with a probability
       proportional to the task queue priority.

    :return: S_OK() at the end of the cycle (also when there is no matching
             workload), or the failed result structure of the platform
             lookup, the Matcher call, the usable sites lookup or the pilot
             proxy retrieval
    """
    # list() so that the key collections stay real lists whatever the dict
    # implementation returns from keys()
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    if self.checkPlatform:
        result = self.resourcesModule.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites requested by the waiting jobs
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', list(self.queueDict))
    # if tagWaitingPilots >= totalWaitingJobs:
    #     self.log.info( 'No more pilots to be submitted in this cycle' )
    #     return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    # random.shuffle() needs a mutable sequence, hence the explicit list
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles relative to the configured factor
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        queueTags = self.queueDict[queue]['ParametersDict']['Tag']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                # The element that was queried is the CE, so report its name
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (ceName, result['Message']))
                continue
            if result['Value']:
                # get the value of the status
                result = result['Value'][ceName]['all']

            # NOTE(review): with an empty status value `result` is still the
            # result structure here, so the CE is skipped - confirm this is
            # the intended nesting
            if result not in ('Active', 'Degraded'):
                self.log.verbose(
                    "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #     self.log.info( 'Site not in the mask %s' % siteName )
        #     self.log.info( 'Removing "Site" from matching Dict' )
        #     del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        if self.checkPlatform:
            platform = self.queueDict[queue]['Platform']
            result = self.resourcesModule.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

        ceDict['Tag'] = queueTags
        # Get the number of eligible jobs for the target site/queue
        result = matcherClient.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = list(taskQueueDict)
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Number of processors requested per pilot: N for "NProcessors",
            # -1 for "WholeNode"
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

                executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                      bundleProxy=bundleProxy,
                                                                      jobExecDir=jobExecDir,
                                                                      processors=processors)
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    # Setting the counter to zero ends the while loop
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']

                # Cumulative priorities: a uniform draw in [0, sumPriority)
                # selects the first task queue whose cumulative value exceeds it
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fall back to the last task queue: a draw that is not
                    # below any cumulative priority (e.g. all priorities are
                    # zero) must not reuse a task queue ID left over from an
                    # earlier loop
                    chosenTQ = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            chosenTQ = tq
                            break
                    pilotsByTQ.setdefault(chosenTQ, []).append(pilotID)

                for chosenTQ, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                               chosenTQ,
                                                               self.pilotDN,
                                                               self.pilotGroup,
                                                               self.localhost,
                                                               ceType,
                                                               stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    The Matcher service is first asked whether any task queue matches the
    overall requirements of this director; if none does, the cycle ends.
    Otherwise the configured queues are visited in random order: for each
    one a pilot proxy is obtained, the available slots are requested from
    the CE, the matching task queues are looked up, and pilots are submitted
    in chunks. Every submitted pilot is registered in the PilotAgentsDB
    against a task queue drawn at random with a probability proportional to
    the task queue priority.

    :return: S_OK() at the end of the cycle (also when there is no matching
             workload), or the failed result structure of the platform
             lookup, the Matcher call, the pilot proxy retrieval or the
             executable preparation
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # random.shuffle() needs a mutable sequence, hence the explicit list
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = self.siteStatus.isUsableSite(siteName, 'ComputingAccess')
        platform = self.queueDict[queue]['Platform']

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400

        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, result['Message']))
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                      (ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))

        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info('No matching TQs found')
            continue

        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        pilotsToSubmit = min(totalSlots, totalTQJobs)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                      (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result

            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            os.unlink(executable)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                # Setting the counter to zero ends the while loop
                pilotsToSubmit = 0
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = {}
            if 'PilotStampDict' in result:
                stampDict = result['PilotStampDict']

            # Cumulative priorities: a uniform draw in [0, sumPriority)
            # selects the first task queue whose cumulative value exceeds it
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue: a draw that is not below
                # any cumulative priority (e.g. all priorities are zero)
                # would otherwise leave the task queue ID unbound or stale
                chosenTQ = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        chosenTQ = tq
                        break
                pilotsByTQ.setdefault(chosenTQ, []).append(pilotID)

            for chosenTQ, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                           chosenTQ,
                                                           self.pilotDN,
                                                           self.pilotGroup,
                                                           self.localhost,
                                                           ceType,
                                                           '',
                                                           stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])
                        continue

    return S_OK()
def _getPilotOptions(self, queue, pilotsToSubmit):
    """ Prepare pilot options

    Builds the list of command line options for the pilots to be submitted
    to the given queue, using the configuration, the Operations helper and
    the parameters of the queue. A proxy token is requested for
    pilotsToSubmit * self.maxJobsInFillMode uses, and the number of pilots
    is recomputed from the number of uses actually granted.

    :param str queue: key of the queue in self.queueDict
    :param int pilotsToSubmit: number of pilots intended for submission
    :return: [ pilotOptions, pilotsToSubmit ] where pilotOptions is a list of
             option strings and pilotsToSubmit the possibly adjusted number
             of pilots; [ None, None ] if the setup or the pilot version is
             not defined, the token request fails or the token grants no uses
    """
    queueDict = self.queueDict[queue]['ParametersDict']
    pilotOptions = []

    setup = gConfig.getValue("/DIRAC/Setup", "unknown")
    if setup == 'unknown':
        self.log.error('Setup is not defined in the configuration')
        return [None, None]
    pilotOptions.append('-S %s' % setup)
    opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

    # Installation defined?
    installationName = opsHelper.getValue("Pilot/Installation", "")
    if installationName:
        pilotOptions.append('-V %s' % installationName)

    # Project defined?
    projectName = opsHelper.getValue("Pilot/Project", "")
    if projectName:
        pilotOptions.append('-l %s' % projectName)
    else:
        self.log.info('DIRAC project will be installed by pilots')

    # Request a release
    diracVersion = opsHelper.getValue("Pilot/Version", [])
    if not diracVersion:
        self.log.error('Pilot/Version is not defined in the configuration')
        return [None, None]
    # diracVersion is a list of accepted releases
    pilotOptions.append('-r %s' % ','.join(str(it) for it in diracVersion))

    ownerDN = self.pilotDN
    ownerGroup = self.pilotGroup
    # Request token for maximum pilot efficiency
    result = gProxyManager.requestToken(ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
    if not result['OK']:
        self.log.error('Invalid proxy token request', result['Message'])
        return [None, None]
    (token, numberOfUses) = result['Value']
    pilotOptions.append('-o /Security/ProxyToken=%s' % token)
    # Use Filling mode
    jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
    pilotOptions.append('-M %s' % jobsPerPilot)

    # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
    # with numberOfUses tokens we can submit at most:
    #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    # pilots
    if jobsPerPilot == 0:
        # A token without uses can not serve any pilot; report it through the
        # usual error return instead of failing on a division by zero
        self.log.error('Proxy token grants no uses, no pilots can be submitted')
        return [None, None]
    # Floor division keeps the pilot count an integer on any Python version
    newPilotsToSubmit = numberOfUses // jobsPerPilot
    if newPilotsToSubmit != pilotsToSubmit:
        self.log.info('Number of pilots to submit is changed to %d after getting the proxy token' %
                      newPilotsToSubmit)
        pilotsToSubmit = newPilotsToSubmit

    # Debug
    if self.pilotLogLevel.lower() == 'debug':
        pilotOptions.append('-d')

    # CS Servers
    csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
    pilotOptions.append('-C %s' % ",".join(csServers))

    # DIRAC Extensions to be used in pilots
    # A first element equal to 'None' switches the extensions off; an empty
    # option falls back to the extensions reported by CSGlobals
    pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
    extensionsList = []
    if pilotExtensionsList:
        if pilotExtensionsList[0] != 'None':
            extensionsList = pilotExtensionsList
    else:
        extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
        pilotOptions.append('-e %s' % ",".join(extensionsList))

    # Requested CPU time
    pilotOptions.append('-T %s' % queueDict['CPUTime'])
    # CEName
    pilotOptions.append('-N %s' % self.queueDict[queue]['CEName'])
    # Queue
    pilotOptions.append('-Q %s' % self.queueDict[queue]['QueueName'])
    # SiteName
    pilotOptions.append('-n %s' % queueDict['Site'])
    if 'ClientPlatform' in queueDict:
        pilotOptions.append("-p '%s'" % queueDict['ClientPlatform'])

    if 'SharedArea' in queueDict:
        pilotOptions.append("-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea'])

    # if 'SI00' in queueDict:
    #     factor = float( queueDict['SI00'] ) / 250.
    #     pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor )
    #     pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor )
    # else:
    #     if 'CPUScalingFactor' in queueDict:
    #         pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] )
    #     if 'CPUNormalizationFactor' in queueDict:
    #         pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] )

    if "ExtraPilotOptions" in queueDict:
        pilotOptions.append(queueDict['ExtraPilotOptions'])

    # Hack
    if self.defaultSubmitPools:
        pilotOptions.append('-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools)

    if "Tag" in queueDict:
        tagString = ','.join(queueDict['Tag'])
        pilotOptions.append('-o /Resources/Computing/CEDefaults/Tag=%s' % tagString)

    if self.group:
        pilotOptions.append('-G %s' % self.group)

    return [pilotOptions, pilotsToSubmit]
def beginExecution(self):
    """
    Load the agent options and rebuild the description of the served queues.

    Every setting is read through ``am_getOption``. The community (VO), the
    eligible user groups and the generic pilot credentials are resolved first;
    the queues are then requested from ``Resources.getQueues`` and handed over
    to ``self.getQueues``.

    :return: S_OK() on success, otherwise the unsuccessful result structure
             returned by the helper call that failed
    """
    self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

    # Community served by this SiteDirector: the "VO" option wins, then
    # "Community", and finally the VO defined globally in the configuration.
    self.vo = (
        self.am_getOption("VO", "")
        or self.am_getOption("Community", "")
        or CSGlobals.getVO()
    )

    # User group served by this SiteDirector (may be empty).
    self.group = self.am_getOption("Group", "")

    # self.voGroups holds every user group eligible for the pilots submitted
    # here. This is a stop-gap until pilots can be matched to VOs directly.
    self.voGroups = []
    if self.group:
        self.voGroups = [self.group]
    elif self.vo:
        result = Registry.getGroupsForVO(self.vo)
        if not result["OK"]:
            return result
        self.voGroups.extend(
            candidate
            for candidate in result["Value"]
            if "NormalUser" in Registry.getPropertiesForGroup(candidate)
        )

    # Generic pilot credentials; each part can be overridden by an agent option.
    result = findGenericPilotCredentials(vo=self.vo)
    if not result["OK"]:
        return result
    self.pilotDN, self.pilotGroup = result["Value"]
    self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
    self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

    self.platforms = []
    self.sites = []

    # Submit pools come from the group when one is set, else from the VO.
    if self.group:
        self.defaultSubmitPools = Registry.getGroupOption(self.group, "SubmitPools", "")
    elif self.vo:
        self.defaultSubmitPools = Registry.getVOOption(self.vo, "SubmitPools", "")
    else:
        self.defaultSubmitPools = ""

    self.pilot = self.am_getOption("PilotScript", DIRAC_PILOT)
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption("ExtraPilotModules", []) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption("WorkDirectory")
    self.maxQueueLength = self.am_getOption("MaxQueueLength", 86400 * 3)
    self.pilotLogLevel = self.am_getOption("PilotLogLevel", "INFO")
    self.maxJobsInFillMode = self.am_getOption("MaxJobsInFillMode", self.maxJobsInFillMode)
    self.maxPilotsToSubmit = self.am_getOption("MaxPilotsToSubmit", self.maxPilotsToSubmit)
    self.pilotWaitingFlag = self.am_getOption("PilotWaitingFlag", True)
    self.pilotWaitingTime = self.am_getOption("MaxPilotWaitingTime", 3600)
    self.failedQueueCycleFactor = self.am_getOption("FailedQueueCycleFactor", 10)
    self.pilotStatusUpdateCycleFactor = self.am_getOption("PilotStatusUpdateCycleFactor", 10)

    # Flags
    self.updateStatus = self.am_getOption("UpdatePilotStatus", True)
    self.getOutput = self.am_getOption("GetPilotOutput", True)
    self.sendAccounting = self.am_getOption("SendPilotAccounting", True)

    # Resource selection: the value "Any" (case-insensitive) leaves a filter
    # unset (None). Empty site and CE lists are also turned into None.
    siteNames = None
    if self.am_getOption("Site", "Any").lower() != "any":
        siteNames = self.am_getOption("Site", []) or None
    ceTypes = None
    if self.am_getOption("CETypes", "Any").lower() != "any":
        ceTypes = self.am_getOption("CETypes", [])
    ces = None
    if self.am_getOption("CEs", "Any").lower() != "any":
        ces = self.am_getOption("CEs", []) or None

    result = Resources.getQueues(
        community=self.vo,
        siteList=siteNames,
        ceList=ces,
        ceTypeList=ceTypes,
        mode="Direct",
    )
    if not result["OK"]:
        return result
    resourceDict = result["Value"]
    result = self.getQueues(resourceDict)
    if not result["OK"]:
        return result

    # Report the enabled optional activities.
    for enabled, message in (
        (self.updateStatus, "Pilot status update requested"),
        (self.getOutput, "Pilot output retrieval requested"),
        (self.sendAccounting, "Pilot accounting sending requested"),
    ):
        if enabled:
            self.log.always(message)

    # Report the effective configuration.
    for label, value in (
        ("Sites:", siteNames),
        ("CETypes:", ceTypes),
        ("CEs:", ces),
        ("PilotDN:", self.pilotDN),
        ("PilotGroup:", self.pilotGroup),
        ("MaxPilotsToSubmit:", self.maxPilotsToSubmit),
        ("MaxJobsInFillMode:", self.maxJobsInFillMode),
    ):
        self.log.always(label, value)

    self.localhost = socket.getfqdn()
    self.proxy = ""

    # The list of served queues is only printed while firstPass is set.
    if self.firstPass and self.queueDict:
        self.log.always("Agent will serve queues:")
        for queue, queueInfo in self.queueDict.items():
            self.log.always(
                "Site: %s, CE: %s, Queue: %s" % (queueInfo["Site"], queueInfo["CEName"], queue)
            )
    self.firstPass = False

    return S_OK()
def beginExecution(self):
    """
    Load the agent options and rebuild the description of the served queues.

    Settings are read through ``am_getOption``; the community, the eligible
    user groups and the generic pilot credentials are resolved, and the queues
    returned by ``Resources.getQueues`` are passed to ``self.getQueues``.

    :return: S_OK() on success, otherwise the unsuccessful result structure
             returned by the helper call that failed
    """
    self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

    # Community served by this SiteDirector; falls back to the global VO.
    self.vo = self.am_getOption("Community", "") or CSGlobals.getVO()

    # User group served by this SiteDirector (may be empty).
    self.group = self.am_getOption("Group", "")

    # All user groups eligible for pilots submitted by this SiteDirector.
    # Picking them here is a stop-gap until pilots can be matched to VOs.
    self.voGroups = []
    if self.group:
        self.voGroups = [self.group]
    elif self.vo:
        groupsResult = Registry.getGroupsForVO(self.vo)
        if not groupsResult["OK"]:
            return groupsResult
        for voGroup in groupsResult["Value"]:
            groupProperties = Registry.getPropertiesForGroup(voGroup)
            if "NormalUser" in groupProperties:
                self.voGroups.append(voGroup)

    # Generic pilot credentials; each part can be overridden by an agent option.
    result = findGenericPilotCredentials(vo=self.vo)
    if not result["OK"]:
        return result
    self.pilotDN, self.pilotGroup = result["Value"]
    self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
    self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

    self.platforms = []
    self.sites = []

    # Submit pools come from the group when one is set, else from the VO.
    if self.group:
        self.defaultSubmitPools = Registry.getGroupOption(self.group, "SubmitPools", "")
    elif self.vo:
        self.defaultSubmitPools = Registry.getVOOption(self.vo, "SubmitPools", "")
    else:
        self.defaultSubmitPools = ""

    self.pilot = self.am_getOption("PilotScript", DIRAC_PILOT)
    self.install = DIRAC_INSTALL
    self.workingDirectory = self.am_getOption("WorkDirectory")
    self.maxQueueLength = self.am_getOption("MaxQueueLength", 86400 * 3)
    self.pilotLogLevel = self.am_getOption("PilotLogLevel", "INFO")
    self.maxJobsInFillMode = self.am_getOption("MaxJobsInFillMode", self.maxJobsInFillMode)
    self.maxPilotsToSubmit = self.am_getOption("MaxPilotsToSubmit", self.maxPilotsToSubmit)
    self.pilotWaitingFlag = self.am_getOption("PilotWaitingFlag", True)
    self.pilotWaitingTime = self.am_getOption("MaxPilotWaitingTime", 7200)

    # Flags
    self.updateStatus = self.am_getOption("UpdatePilotStatus", True)
    self.getOutput = self.am_getOption("GetPilotOutput", True)
    self.sendAccounting = self.am_getOption("SendPilotAccounting", True)

    # Resource selection: the value "Any" (case-insensitive) leaves the
    # corresponding filter unset (None).
    siteNames = None
    if self.am_getOption("Site", "Any").lower() != "any":
        siteNames = self.am_getOption("Site", [])
    ceTypes = None
    if self.am_getOption("CETypes", "Any").lower() != "any":
        ceTypes = self.am_getOption("CETypes", [])
    ces = None
    if self.am_getOption("CEs", "Any").lower() != "any":
        ces = self.am_getOption("CEs", [])

    result = Resources.getQueues(
        community=self.vo,
        siteList=siteNames,
        ceList=ces,
        ceTypeList=ceTypes,
        mode="Direct",
    )
    if not result["OK"]:
        return result
    resourceDict = result["Value"]
    result = self.getQueues(resourceDict)
    if not result["OK"]:
        return result

    # Report the enabled optional activities.
    requestedActivities = (
        (self.updateStatus, "Pilot status update requested"),
        (self.getOutput, "Pilot output retrieval requested"),
        (self.sendAccounting, "Pilot accounting sending requested"),
    )
    for enabled, message in requestedActivities:
        if enabled:
            self.log.always(message)

    # Report the effective configuration.
    effectiveSettings = (
        ("Sites:", siteNames),
        ("CETypes:", ceTypes),
        ("CEs:", ces),
        ("PilotDN:", self.pilotDN),
        ("PilotGroup:", self.pilotGroup),
        ("MaxPilotsToSubmit:", self.maxPilotsToSubmit),
        ("MaxJobsInFillMode:", self.maxJobsInFillMode),
    )
    for label, value in effectiveSettings:
        self.log.always(label, value)

    self.localhost = socket.getfqdn()
    self.proxy = ""

    if self.queueDict:
        self.log.always("Agent will serve queues:")
        for queue, queueInfo in self.queueDict.items():
            self.log.always(
                "Site: %s, CE: %s, Queue: %s" % (queueInfo["Site"], queueInfo["CEName"], queue)
            )

    return S_OK()
def getTicketsList(self, name, startDate=None, endDate=None):
    """
    Return the GGUS tickets affecting the entity called ``name``.

    The tickets are fetched through the GGUS SOAP service and summarised by
    ``self.globalStatistics()``. A URL pointing at the matching GGUS web
    search is built alongside the query.

    :param name: should be the name of the site
    :param startDate: starting date (optional); only tickets created after it
    :param endDate: end date (optional); only tickets created before it
    :return: S_OK((statusCount, ggusURL, shortDescription))
    """
    self.statusCount = {}
    self.shortDescription = {}

    # create client instance using GGUS wsdl:
    gclient = Client("https://prod-ars.ggus.eu/arsys/WSDL/public/prod-ars/GGUS")
    authInfo = gclient.factory.create("AuthenticationInfo")
    authInfo.userName = "******"
    authInfo.password = "******"
    gclient.set_options(soapheaders=authInfo)

    # prepare the query string; the first CS extension is used as the VO name
    extension = CSGlobals.getCSExtensions()[0].lower()
    query = "'GHD_Affected Site'=\"" + name + '" AND \'GHD_Affected VO\'="%s"' % extension
    if startDate is not None:
        query = query + " AND 'GHD_Date Of Creation'>" + str(startDate)
    if endDate is not None:
        query = query + " AND 'GHD_Date Of Creation'<" + str(endDate)

    # create the URL to get tickets relative to the site:
    # Updated from https://gus.fzk.de to https://ggus.eu
    #
    # The whole template is parenthesised and formatted once. Previously the
    # ``%`` operator applied only to the literal following ``+ name +`` (``%``
    # binds tighter than ``+``); that literal has no placeholder, so every
    # call raised TypeError and ``vo=%s`` was never filled in.
    ggusURLTemplate = (
        "https://ggus.eu/ws/ticket_search.php?show_columns_check[]=REQUEST_ID&"
        "show_columns_check[]=TICKET_TYPE&"
        "show_columns_check[]=AFFECTED_VO&"
        "show_columns_check[]=AFFECTED_SITE&"
        "show_columns_check[]=RESPONSIBLE_UNIT&"
        "show_columns_check[]=STATUS&"
        "show_columns_check[]=DATE_OF_CREATION&"
        "show_columns_check[]=LAST_UPDATE&"
        "show_columns_check[]=SHORT_DESCRIPTION&"
        "ticket=&"
        "supportunit=all&"
        "vo=%s&"
        "user=&"
        "keyword=&"
        "involvedsupporter=&"
        "assignto=&"
        "affectedsite=%s&"
        "specattrib=0&"
        "status=open&"
        "priority=all&"
        "typeofproblem=all&"
        "mouarea=&"
        "radiotf=1&"
        "timeframe=any&"
        "tf_date_day_s=&"
        "tf_date_month_s=&"
        "tf_date_year_s=&"
        "tf_date_day_e=&"
        "tf_date_month_e=&"
        "tf_date_year_e=&"
        "lm_date_day=12&"
        "lm_date_month=2&"
        "lm_date_year=2010&"
        "orderticketsby=GHD_INT_REQUEST_ID&"
        "orderhow=descending"
    )
    ggusURL = ggusURLTemplate % (extension, name)

    # the query must be into a try block. Empty queries, though formally
    # correct, raise an exception
    try:
        self.ticketList = gclient.service.TicketGetList(query)
        self.globalStatistics()
    except WebFault:
        self.statusCount["terminal"] = 0
        self.statusCount["open"] = 0

    return S_OK((self.statusCount, ggusURL, self.shortDescription))
def addShifter( self, shifters = None ):
    """
    Adds or modify one or more shifters. Also, adds the shifter section in case this is not present.
    Shifter identities are used in several places, mostly for running agents

    Roles whose current definition already equals the requested one are left
    untouched. The ``shifters`` dictionary passed by the caller is not modified.

    :param dict shifters: has to be in the form
                          {'ShifterRole': {'User': 'aUserName', 'Group': 'aDIRACGroup'}}
    :return: S_OK/S_ERROR
    """

    def getOpsSection():
        """
        Where is the shifters section?

        :return: S_OK( path ) for the first existing candidate, S_ERROR otherwise
        """
        vo = CSGlobals.getVO()
        setup = CSGlobals.getSetup()

        # Candidates in lookup order: setup-specific first, then the defaults
        if vo:
            candidates = [ '/Operations/%s/%s/Shifter' % ( vo, setup ),
                           '/Operations/%s/Defaults/Shifter' % vo ]
        else:
            candidates = [ '/Operations/%s/Shifter' % setup,
                           '/Operations/Defaults/Shifter' ]

        for candidate in candidates:
            if gConfig.getSections( candidate )['OK']:
                return S_OK( candidate )

        return S_ERROR( "No shifter section" )

    if shifters is None:
        shifters = {}

    if not self.__initialized['OK']:
        return self.__initialized

    # get current shifters
    opsH = Operations()
    currentShifterRoles = opsH.getSections( 'Shifter' )
    if not currentShifterRoles['OK']:
        # we assume the shifter section is not present
        currentShifterRoles = []
    else:
        currentShifterRoles = currentShifterRoles['Value']

    currentShiftersDict = {}
    for currentShifterRole in currentShifterRoles:
        currentShifter = opsH.getOptionsDict( 'Shifter/%s' % currentShifterRole )
        if not currentShifter['OK']:
            return currentShifter
        currentShiftersDict[currentShifterRole] = currentShifter['Value']

    # Keep only what needs to be changed. A new dictionary is built because
    # popping from `shifters` while iterating over it raises RuntimeError and
    # would also alter the caller's argument.
    shiftersToModify = {}
    for sRole, sDefinition in shifters.items():
        if sRole in currentShiftersDict and currentShiftersDict[sRole] == sDefinition:
            continue
        shiftersToModify[sRole] = sDefinition

    # get shifters section to modify
    sectionRes = getOpsSection()

    # Is this section present?
    if sectionRes['OK']:
        section = sectionRes['Value']
    elif sectionRes['Message'] == "No shifter section":
        gLogger.warn( sectionRes['Message'] )
        gLogger.info( "Adding shifter section" )
        vo = CSGlobals.getVO()
        if vo:
            section = '/Operations/%s/Defaults/Shifter' % vo
        else:
            section = '/Operations/Defaults/Shifter'
        res = self.__csMod.createSection( section )
        if not res:
            gLogger.error( "Section %s not created" % section )
            return S_ERROR( "Section %s not created" % section )
    else:
        gLogger.error( sectionRes['Message'] )
        return sectionRes

    # add or modify shifters: each role is removed and re-created from scratch
    for shifter, definition in shiftersToModify.items():
        shifterPath = section + '/' + shifter
        self.__csMod.removeSection( shifterPath )
        self.__csMod.createSection( shifterPath )
        self.__csMod.createSection( shifterPath + '/' + 'User' )
        self.__csMod.createSection( shifterPath + '/' + 'Group' )
        self.__csMod.setOptionValue( shifterPath + '/' + 'User', definition['User'] )
        self.__csMod.setOptionValue( shifterPath + '/' + 'Group', definition['Group'] )

    self.__csModified = True
    return S_OK( True )
def __getVOPath(self):
    """
    Return the root of the Operations section to use.

    When ``CSGlobals.getVO()`` gives a non-empty value the plain
    "/Operations" root is returned; otherwise the VO stored in the
    thread data is appended to it.

    :return: configuration path as a string
    """
    globalVO = CSGlobals.getVO()
    if not globalVO:
        # No global VO: use the one held in the thread data
        return "/Operations/%s" % self.__threadData.vo
    return "/Operations"
def addShifter(self, shifters=None):
    """
    Adds or modify one or more shifters. Also, adds the shifter section in case this is not present.
    Shifter identities are used in several places, mostly for running agents

    Roles whose current definition already equals the requested one are left
    untouched. The ``shifters`` dictionary passed by the caller is not modified.

    :param dict shifters: has to be in the form
                          {'ShifterRole': {'User': 'aUserName', 'Group': 'aDIRACGroup'}}
    :return: S_OK/S_ERROR
    """

    def getOpsSection():
        """
        Where is the shifters section?

        :return: S_OK(path) for the first existing candidate, S_ERROR otherwise
        """
        vo = CSGlobals.getVO()
        setup = CSGlobals.getSetup()

        # Candidates in lookup order: setup-specific first, then the defaults
        if vo:
            candidates = ['/Operations/%s/%s/Shifter' % (vo, setup),
                          '/Operations/%s/Defaults/Shifter' % vo]
        else:
            candidates = ['/Operations/%s/Shifter' % setup,
                          '/Operations/Defaults/Shifter']

        for candidate in candidates:
            if gConfig.getSections(candidate)['OK']:
                return S_OK(candidate)

        return S_ERROR("No shifter section")

    if shifters is None:
        shifters = {}

    if not self.__initialized['OK']:
        return self.__initialized

    # get current shifters
    opsH = Operations()
    currentShifterRoles = opsH.getSections('Shifter')
    if not currentShifterRoles['OK']:
        # we assume the shifter section is not present
        currentShifterRoles = []
    else:
        currentShifterRoles = currentShifterRoles['Value']

    currentShiftersDict = {}
    for currentShifterRole in currentShifterRoles:
        currentShifter = opsH.getOptionsDict('Shifter/%s' % currentShifterRole)
        if not currentShifter['OK']:
            return currentShifter
        currentShiftersDict[currentShifterRole] = currentShifter['Value']

    # Keep only what needs to be changed. A new dictionary is built because
    # popping from `shifters` while iterating over it raises RuntimeError and
    # would also alter the caller's argument.
    shiftersToModify = {}
    for sRole, sDefinition in shifters.items():
        if sRole in currentShiftersDict and currentShiftersDict[sRole] == sDefinition:
            continue
        shiftersToModify[sRole] = sDefinition

    # get shifters section to modify
    sectionRes = getOpsSection()

    # Is this section present?
    if sectionRes['OK']:
        section = sectionRes['Value']
    elif sectionRes['Message'] == "No shifter section":
        gLogger.warn(sectionRes['Message'])
        gLogger.info("Adding shifter section")
        vo = CSGlobals.getVO()
        if vo:
            section = '/Operations/%s/Defaults/Shifter' % vo
        else:
            section = '/Operations/Defaults/Shifter'
        res = self.__csMod.createSection(section)
        if not res:
            gLogger.error("Section %s not created" % section)
            return S_ERROR("Section %s not created" % section)
    else:
        gLogger.error(sectionRes['Message'])
        return sectionRes

    # add or modify shifters: each role is removed and re-created from scratch
    for shifter, definition in shiftersToModify.items():
        shifterPath = section + '/' + shifter
        self.__csMod.removeSection(shifterPath)
        self.__csMod.createSection(shifterPath)
        self.__csMod.createSection(shifterPath + '/' + 'User')
        self.__csMod.createSection(shifterPath + '/' + 'Group')
        self.__csMod.setOptionValue(shifterPath + '/' + 'User', definition['User'])
        self.__csMod.setOptionValue(shifterPath + '/' + 'Group', definition['Group'])

    self.__csModified = True
    return S_OK(True)