Exemple #1
0
    def getOpsSection():
      """
      Find the CS section that holds the shifters definition.

      With a global VO defined the VO/setup specific section is tried first and
      the VO defaults second; without a VO the setup specific section is tried
      first and the global defaults second.

      :return: S_OK( sectionPath ) for the first section that gConfig can list,
               S_ERROR if none of the candidates exists
      """
      vo = CSGlobals.getVO()
      setup = CSGlobals.getSetup()

      # Candidates are listed from the most specific to the most generic one
      if vo:
        candidates = ( '/Operations/%s/%s/Shifter' % ( vo, setup ),
                       '/Operations/%s/Defaults/Shifter' % vo )
      else:
        candidates = ( '/Operations/%s/Shifter' % setup,
                       '/Operations/Defaults/Shifter' )

      for sectionPath in candidates:
        if gConfig.getSections( sectionPath )['OK']:
          return S_OK( sectionPath )

      return S_ERROR( "No shifter section" )
Exemple #2
0
 def __getSearchPaths( self ):
   """
   Build the list of CS sections in which Operations options are looked up.

   The generic sections ( Defaults and the setup one ) always come first. The
   VO specific sections are appended only when a VO is known, either already
   stored in the helper or obtained from CSGlobals.

   Side effect: when the helper has no VO yet and a global VO exists, it is
   stored in self.__vo.

   :return: list of CS paths
   """
   paths = [ "/Operations/Defaults", "/Operations/%s" % self.__setup ]
   if not self.__vo:
     globalVO = CSGlobals.getVO()
     if not globalVO:
       return paths
     # Reuse the value just fetched instead of asking CSGlobals a second time
     self.__vo = globalVO
   paths.append( "/Operations/%s/Defaults" % self.__vo )
   paths.append( "/Operations/%s/%s" % ( self.__vo, self.__setup ) )
   return paths
  def _curlDownload( self, granularity, site, tests ):
    """ Download SAM status for entity using the SAM DB programmatic interface

        :param granularity: 'Site'/'Sites' selects the site status page,
                            'Resource'/'Resources' the service endpoint status page
        :param site: name placed in the Site_name query argument
        :param tests: when None, the "&only_ss" flag is added to the query
        :return: the body of the HTTP response as read from the server
        :raises ValueError: if granularity is not one of the supported values

        The first entry of CSGlobals.getCSExtensions() is sent as VO_name.
    """

    samdbpi_url = "http://lcg-sam.cern.ch:8080/same-pi/"
    # Set your method
    if granularity in ( 'Site', 'Sites' ):
      samdbpi_method = "site_status.jsp?"
    elif granularity in ( 'Resource', 'Resources' ):
      samdbpi_method = "service_endpoint_status.jsp?"
    else:
      # Previously this fell through and failed later with an unbound local variable
      raise ValueError( "Unsupported granularity: %s" % ( granularity, ) )
    # Set your site
    samdbpi_site = site
    # set test
    samdbpi_test = ""
    if tests is None:
      samdbpi_test = "&only_ss"

    extension = CSGlobals.getCSExtensions()[0]

    samdb_ep = samdbpi_url + samdbpi_method + "VO_name=" + extension + "&Site_name=" + samdbpi_site + samdbpi_test

    req = urllib2.Request( samdb_ep )
    samPage = urllib2.urlopen( req )
    try:
      return samPage.read()
    finally:
      # Always release the connection, even if reading fails
      samPage.close()
  def __getInstallFlags(self):
    """ Assemble the command line flags handed to dirac-install.py inside the container.

        The flags are taken from the Operations section of the current setup
        (installation name, first configured pilot version, extensions), from
        the running python version and from the optional 'ContainerExtraOpts'
        CE parameter.

        :return: the flags joined with single spaces
        :raises IndexError: if the "Pilot/Version" option is empty
    """
    opsHelper = Operations.Operations(setup=gConfig.getValue("/DIRAC/Setup", "unknown"))
    flags = []

    installationName = opsHelper.getValue("Pilot/Installation", "")
    if installationName:
      flags.append('-V %s' % installationName)

    # Only the first configured version is used
    flags.append("-r '%s'" % opsHelper.getValue("Pilot/Version", [])[0])

    pyVer = "%u%u" % (sys.version_info.major, sys.version_info.minor)
    flags.append("-i %s" % pyVer)

    # An explicit 'None' as first pilot extension disables extensions altogether,
    # an empty option falls back to the extensions known to the CS
    pilotExtensions = opsHelper.getValue("Pilot/Extensions", [])
    if not pilotExtensions:
      extensions = CSGlobals.getCSExtensions()
    elif pilotExtensions[0] == 'None':
      extensions = []
    else:
      extensions = pilotExtensions

    if extensions:
      nonWebExtensions = [name for name in extensions if 'Web' not in name]
      flags.append("-e '%s'" % ','.join(nonWebExtensions))

    if 'ContainerExtraOpts' in self.ceParameters:
      flags.append(self.ceParameters['ContainerExtraOpts'])
    return ' '.join(flags)
Exemple #5
0
 def __discoverSettings( self ):
   """ Work out the VO and the setup this helper operates with.

   VO: the global VO from CSGlobals wins, then the user supplied VO, then the
   VO registered for the user supplied group; False when none is found.
   Setup: the user supplied setup, otherwise the one from CSGlobals.
   """
   # VO: first non-empty source wins
   vo = CSGlobals.getVO() or self.__uVO
   if not vo:
     vo = Registry.getVOForGroup( self.__uGroup ) or False
   self.__vo = vo

   # Setup: explicit user value takes precedence over the global one
   self.__setup = False
   if not self.__uSetup:
     self.__setup = CSGlobals.getSetup()
   else:
     self.__setup = self.__uSetup
Exemple #6
0
 def __generateRootModules( self, baseModules ):
   """ Build the list of root modules to look into

   The list starts with the given base modules, followed by "<ext>DIRAC" for
   every extension returned by CSGlobals.getCSExtensions() ( taken in reverse
   order ) and is terminated by the empty root "".

   :param baseModules: iterable of root module names to start from ( not modified )
   """
   # Work on a copy so the caller's list is not extended behind its back
   self.__rootModules = list( baseModules )
   for rootModule in reversed( CSGlobals.getCSExtensions() ):
     if rootModule[-5:] == "DIRAC":
       continue
     extModule = "%sDIRAC" % rootModule
     # The list holds full "<ext>DIRAC" names, so the duplicate check has to look
     # at the full name as well, not only at the bare extension name
     if rootModule not in self.__rootModules and extModule not in self.__rootModules:
       self.__rootModules.append( extModule )
   self.__rootModules.append( "" )
Exemple #7
0
 def __rootModules( self ):
   """ Generate every root module name to try.

   Yields "<ext>DIRAC" for each CS extension ( names already ending in "DIRAC"
   are yielded untouched ), then 'DIRAC' and finally the empty root ''.
   """
   for extName in CSGlobals.getCSExtensions():
     yield extName if extName.endswith( "DIRAC" ) else "%sDIRAC" % extName
   for fixedRoot in ( 'DIRAC', '' ):
     yield fixedRoot
Exemple #8
0
 def __discoverSettings( self ):
   """ Work out the VO this helper operates with.

   The global VO from CSGlobals wins, then the user supplied VO, then the VO
   registered for the user supplied group; None when none is found.
   """
   vo = CSGlobals.getVO() or self.__uVO
   if not vo:
     # Last resort: derive the VO from the group, normalising "not found" to None
     vo = Registry.getVOForGroup( self.__uGroup ) or None
   self.__vo = vo
  def __generateRootModules( self, baseModules ):
    """ Build the list of root modules to look into

    The base modules are extended with "<ext>DIRAC" for every extension returned
    by CSGlobals.getCSExtensions() ( taken in reverse order ) and with the empty
    root "". The resulting list is then reversed so that the extension(s) are
    looked into first.

    :param baseModules: iterable of root module names to start from ( not modified )
    """
    # Work on a copy so the caller's list is not extended and reversed behind its back
    self.__rootModules = list( baseModules )
    for rootModule in reversed( CSGlobals.getCSExtensions() ):
      if rootModule[-5:] == "DIRAC":
        continue
      extModule = "%sDIRAC" % rootModule
      # The list holds full "<ext>DIRAC" names, so the duplicate check has to look
      # at the full name as well, not only at the bare extension name
      if rootModule not in self.__rootModules and extModule not in self.__rootModules:
        self.__rootModules.append( extModule )
    self.__rootModules.append( "" )

    # Reversing the order because we want first to look in the extension(s)
    self.__rootModules.reverse()
Exemple #10
0
def includeExtensionErrors():
  """ Merge all the errors of all the extensions into the errors of these modules
      Should be called only at the initialization of DIRAC, so by the parseCommandLine,
      dirac-agent.py, dirac-service.py, dirac-executor.py

      For every extension returned by CSGlobals.getCSExtensions() the module
      <ext>DIRAC.Core.Utilities.DErrno is imported if it exists and its extra_*
      dictionaries are merged into this module. This is best effort: an extension
      whose module is missing or malformed is silently skipped.
  """

  def __recurseImport( modName, parentModule = None, fullName = False ):
    """ Internal function to load modules

        :param modName: dotted module name, or the list of its remaining components
        :param parentModule: already imported parent package whose __path__ is searched
        :param fullName: complete dotted name, only carried along the recursion
        :return: the imported leaf module, or None if a component cannot be imported
    """
    if isinstance( modName, basestring ):
      modName = modName.split( "." )
    if not fullName:
      fullName = ".".join( modName )
    try:
      if parentModule:
        impData = imp.find_module( modName[0], parentModule.__path__ )
      else:
        impData = imp.find_module( modName[0] )
    except ImportError:
      return None
    try:
      impModule = imp.load_module( modName[0], *impData )
    except ImportError:
      return None
    finally:
      # find_module may hand back an open file: close it even when loading fails
      if impData[0]:
        impData[0].close()
    if len( modName ) == 1:
      return impModule
    return __recurseImport( modName[1:], impModule, fullName = fullName )


  from DIRAC.ConfigurationSystem.Client.Helpers import CSGlobals
  allExtensions = CSGlobals.getCSExtensions()

  thisModule = sys.modules[__name__]
  for extension in allExtensions:
    ext_derrno = None
    try:

      ext_derrno = __recurseImport( '%sDIRAC.Core.Utilities.DErrno' % extension )

      if ext_derrno:
        # The next 3 dictionary MUST be present for consistency

        # Global name of errors
        thisModule.__dict__.update( ext_derrno.extra_dErrName )
        # Dictionary with the error codes
        thisModule.dErrorCode.update( ext_derrno.extra_dErrorCode )
        # Error description string
        thisModule.dStrError.update( ext_derrno.extra_dStrError )

        # extra_compatErrorString is optional
        for err in getattr( ext_derrno, 'extra_compatErrorString', [] ) :
          thisModule.compatErrorString.setdefault( err, [] ).extend( ext_derrno.extra_compatErrorString[err] )

    except Exception:
      # Deliberate best effort: a broken extension must not prevent DIRAC from starting.
      # Only Exception is caught so that KeyboardInterrupt / SystemExit still propagate.
      pass
Exemple #11
0
 def __discoverSettings( self ):
   """ Work out the VO and the setup this helper operates with.

   VO, first match wins: the global VO from CSGlobals, the user supplied VO,
   the VO registered for the user supplied group, the VO returned by
   getVOfromProxyGroup(). When a lookup fails the VO is set to False.
   Setup: the user supplied setup, otherwise the one from CSGlobals.
   """
   #Set the VO
   globalVO = CSGlobals.getVO()
   if globalVO:
     self.__vo = globalVO
   elif self.__uVO:
     self.__vo = self.__uVO
   elif self.__uGroup:
     self.__vo = Registry.getVOForGroup( self.__uGroup ) or False
   else:
     result = getVOfromProxyGroup()
     # Like the group branch above, fall back to False on failure instead of
     # leaving the attribute unassigned ( or holding a stale value )
     self.__vo = result['Value'] if result['OK'] else False
   #Set the setup
   self.__setup = self.__uSetup or CSGlobals.getSetup()
Exemple #12
0
 def setHandlers( cls, handlers ):
   """ Register the handlers and compute the ordered list of extensions.

   Handlers are indexed by their LOCATION with leading/trailing "/" removed.
   The extension list holds every installed extension except "DIRAC" and
   "WebAppDIRAC", followed by those two in that order.

   :param handlers: mapping whose values expose a LOCATION string attribute
   """
   cls.__handlers = {}
   for handlerName in handlers:
     handlerObj = handlers[ handlerName ]
     location = handlerObj.LOCATION.strip("/")
     cls.__handlers[ location ] = handlerObj

   # The two core modules always close the list, whatever position they were reported in
   coreModules = ( "DIRAC", "WebAppDIRAC" )
   cls.__extensions = []
   cls.__extensions.extend( extName for extName in CSGlobals.getInstalledExtensions()
                            if extName not in coreModules )
   cls.__extensions.extend( coreModules )
Exemple #13
0
def loadObjects( path, reFilter = None, parentClass = None ):
  """
  Find and import the classes defined under a given path of DIRAC and its extensions.

  Each matching file <Name>.py is expected to define a class called <Name>. When
  the same file name exists in several root modules, the first one found wins;
  extensions are searched before DIRAC.

  :param str path: the path to the system, for example: DIRAC/AccountingSystem
  :param reFilter: compiled regular expression used to select the files;
                   by default ``.py`` files whose name ends with a lowercase letter or a digit 1-9
  :param parentClass: if given, only strict subclasses of it are returned
                      ( the parent class itself is skipped )
  :return: dict { class name : class object }
  """
  if not reFilter:
    # Raw string: "\." is not a valid escape sequence in a normal string literal
    reFilter = re.compile( r".*[a-z1-9]\.py$" )
  pathList = List.fromChar( path, "/" )

  parentModuleList = [ "%sDIRAC" % ext for ext in CSGlobals.getCSExtensions() ] + [ 'DIRAC' ]
  objectsToLoad = {}
  #Find which object files match
  for parentModule in parentModuleList:
    objDir = os.path.join( DIRAC.rootPath, parentModule, *pathList )
    if not os.path.isdir( objDir ):
      continue
    for objFile in os.listdir( objDir ):
      if reFilter.match( objFile ):
        # Strip the ".py" suffix
        pythonClassName = objFile[:-3]
        if pythonClassName not in objectsToLoad:
          gLogger.info( "Adding to load queue %s/%s/%s" % ( parentModule, path, pythonClassName ) )
          objectsToLoad[ pythonClassName ] = parentModule

  #Load them!
  loadedObjects = {}

  for pythonClassName, parentModule in objectsToLoad.items():
    try:
      #Where parentModule can be DIRAC, pathList is something like [ "AccountingSystem", "Client", "Types" ]
      #And the python class name is.. well, the python class name
      objPythonPath = "%s.%s.%s" % ( parentModule, ".".join( pathList ), pythonClassName )
      # A non empty fromlist makes __import__ return the leaf module instead of the top package
      objModule = __import__( objPythonPath,
                               globals(),
                               locals(), pythonClassName )
      objClass = getattr( objModule, pythonClassName )
    except Exception as e:
      gLogger.error( "Can't load type", "%s/%s: %s" % ( parentModule, pythonClassName, str( e ) ) )
      continue
    if parentClass == objClass:
      continue
    if parentClass and not issubclass( objClass, parentClass ):
      gLogger.warn( "%s is not a subclass of %s. Skipping" % ( objClass, parentClass ) )
      continue
    gLogger.info( "Loaded %s" % objPythonPath )
    loadedObjects[ pythonClassName ] = objClass

  return loadedObjects
Exemple #14
0
 def __discoverSettings( self ):
   """ Work out the VO and the setup stored in the thread data.

   VO: the user supplied VO, otherwise the VO registered for the user
   supplied group.
   Setup: the user supplied setup, otherwise the one from CSGlobals.

   :raises RuntimeError: if no VO can be determined
   """
   threadData = self.__threadData

   # VO
   threadData.vo = False
   if not threadData.uVO:
     threadData.vo = Registry.getVOForGroup( threadData.uGroup )
     if not threadData.vo:
       raise RuntimeError( "Don't know how to discover VO. Please check your VO and groups configuration" )
   else:
     threadData.vo = threadData.uVO

   # Setup
   threadData.setup = False
   if not threadData.uSetup:
     threadData.setup = CSGlobals.getSetup()
   else:
     threadData.setup = threadData.uSetup
Exemple #15
0
def loadWebAppCFGFiles():
  """
  Load WebApp/web.cfg definitions

  The web.cfg of WebAppDIRAC, DIRAC and every extension ( in that order, so
  that extensions are merged last ) is loaded and merged into a single CFG.
  A section flagged with the AbsoluteDefinition option replaces whatever was
  merged for that section so far instead of being merged with it.

  NOTE(review): in the code visible here the merged CFG is only kept in a
  local variable; confirm how the result is meant to be published.
  """
  exts = []
  for ext in CSGlobals.getCSExtensions():
    if ext == "DIRAC":
      continue
    if ext[-5:] != "DIRAC":
      ext = "%sDIRAC" % ext
    if ext != "WebAppDIRAC":
      exts.append( ext )
  exts.append( "DIRAC" )
  exts.append( "WebAppDIRAC" )
  webCFG = CFG()
  for modName in reversed( exts ):
    try:
      modPath = imp.find_module( modName )[1]
    except ImportError:
      continue
    gLogger.verbose( "Found module %s at %s" % ( modName, modPath ) )
    cfgPath = os.path.join( modPath, "WebApp", "web.cfg" )
    if not os.path.isfile( cfgPath ):
      gLogger.verbose( "Inexistant %s" % cfgPath )
      continue
    try:
      modCFG = CFG().loadFromFile( cfgPath )
    except Exception as excp:
      gLogger.error( "Could not load %s: %s" % ( cfgPath, excp ) )
      continue
    gLogger.verbose( "Loaded %s" % cfgPath )
    # Walk the sections below BASECS breadth first looking for absolute definitions
    expl = [ BASECS ]
    while expl:
      current = expl.pop( 0 )
      if not modCFG.isSection( current ):
        continue
      if modCFG.getOption( "%s/AbsoluteDefinition" % current, False ):
        gLogger.verbose( "%s:%s is an absolute definition" % ( modName, current ) )
        try:
          webCFG.deleteKey( current )
        except Exception:
          # Best effort: there may be nothing to delete yet
          pass
        modCFG.deleteKey( "%s/AbsoluteDefinition" % current )
      else:
        for sec in modCFG[ current ].listSections():
          expl.append( "%s/%s" % ( current, sec ) )
    #Add the modCFG
    webCFG = webCFG.mergeWith( modCFG )
Exemple #16
0
 def getPaths( self, dirName ):
   """
   Get lists of paths for all installed and enabled extensions

   For every extension returned by CSGlobals.getCSExtensions() ( except
   WebAppDIRAC ) the directory <module>/WebApp/<dirName> is added if it exists.
   The corresponding WebAppDIRAC directory is always appended last, without
   checking for its existence.

   :param str dirName: name of the directory below "WebApp"
   :return: list of directory paths
   """
   pathList = []
   for extName in CSGlobals.getCSExtensions():
     # endswith() instead of rfind() arithmetic: for a 4 character name without
     # the suffix ( e.g. "LHCb" ) rfind() returns -1, which equals len - 5, so the
     # suffix was wrongly taken as present and the extension was never found
     if not extName.endswith( "DIRAC" ):
       extName = "%sDIRAC" % extName
     if extName == "WebAppDIRAC":
       continue
     try:
       modFile, modPath, _desc = imp.find_module( extName )
     except ImportError:
       continue
     # find_module returns an open file for plain modules: do not leak it
     if modFile:
       modFile.close()
     staticPath = os.path.join( modPath, "WebApp", dirName )
     if os.path.isdir( staticPath ):
       pathList.append( staticPath )
   #Add WebAppDirac to the end
   pathList.append( os.path.join( WebAppDIRAC.rootPath, "WebApp", dirName ) )
   return pathList
Exemple #17
0
 def generatePath( self, option, vo = False, setup = False ):
   """
   Generate the CS path for an option

   The VO component is only added when no global VO is defined in CSGlobals
   ( multi VO installations ); in that case:
     if vo is not defined, the helper's vo will be used
   For the setup component:
     if setup evaluates False (except None) -> The helper's setup will be used
     if setup is defined -> whatever is defined will be used as setup
     if setup is None -> Defaults will be used
   If the resulting setup is still empty, Defaults is used as well.

   :param str option: option path relative to the selected section
   :param vo: VO name, or a false value to use the helper's VO
   :param setup: setup name, a false value, or None ( see above )
   :return: the full CS path as a string
   """
   path = "/Operations"
   if not CSGlobals.getVO():
     vo = vo or self.__vo
     if vo:
       path += "/%s" % vo
   # None explicitly requests the Defaults section, any other false value means "not given"
   if setup is not None and not setup:
     setup = self.__setup
   if setup:
     path += "/%s" % setup
   else:
     path += "/Defaults"
   return "%s/%s" % ( path, option )
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        The Matcher is first asked whether any task queue matches the overall
        requirements of this director. Then every configured queue is visited in
        random order: queues that failed recently, that are at sites with no
        expected workload or outside the site mask ( unless test jobs wait for
        them ), or that have no CPUTime defined are skipped. For the remaining
        ones, pilots are submitted for the eligible jobs not yet covered by
        waiting pilots, limited by the free slots and by self.maxPilotsToSubmit.
        Submitted pilots are registered in the PilotAgentsDB, spread over the
        matching task queues at random with a probability proportional to the
        task queue priority.

        :return: S_OK() at the end of the cycle or when no task queue matches;
                 otherwise the failed result of the platform resolution, of the
                 Matcher, of the proxy retrieval or of the executable preparation,
                 or S_ERROR if the site mask cannot be obtained
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()
    matchedTQs = result['Value']

    # jobSites: sites explicitly requested by some task queue
    # testSites: sites requested by task queues that define JobTypes
    # anySite: at least one task queue can run anywhere
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqInfo in matchedTQs.values():
      if "Sites" in tqInfo:
        for site in tqInfo['Sites']:
          if site.lower() != 'any':
            jobSites.add( site )
            if "JobTypes" in tqInfo:
              testSites.add( site )
          else:
            anySite = True
      else:
        anySite = True
      totalWaitingJobs += tqInfo['Jobs']

    tqIDList = list( matchedTQs )
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                           None )
    totalWaitingPilots = 0
    if result['OK']:
      totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    #if totalWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    # list() so that the shuffle works on a real list whatever keys() returns
    queues = list( self.queueDict )
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
      if failedCount != 0:
        # The queue is retried once the counter reaches a multiple of the cycle factor
        self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, self.failedQueueCycleFactor - failedCount ) )
        self.failedQueues[queue] += 1
        continue

      queueInfo = self.queueDict[queue]
      ce = queueInfo['CE']
      ceName = queueInfo['CEName']
      ceType = queueInfo['CEType']
      queueName = queueInfo['QueueName']
      siteName = queueInfo['Site']
      platform = queueInfo['Platform']
      siteMask = siteName in siteMaskList

      if not anySite and siteName not in jobSites:
        self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) )
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) )
        continue

      if 'CPUTime' in queueInfo['ParametersDict'] :
        queueCPUTime = int( queueInfo['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      #if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        # Sites outside the mask are only matched against test jobs
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found for %s' % queue )
        continue

      matchedQueues += 1
      totalTQJobs = 0
      tqIDList = list( taskQueueDict )
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) )

      # Get the number of already waiting pilots for these task queues
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                              None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
      if totalWaitingPilots >= totalTQJobs:
        self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
        continue

      self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) )

      # Get the working proxy
      cpuTime = queueCPUTime + 86400
      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      totalSlots = self.__getQueueSlots( queue )
      if totalSlots == 0:
        self.log.debug( '%s: No slots available' % queue )
        continue

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \
                              ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = queueInfo.get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = queueInfo.get( 'JobExecDir', jobExecDir )
        httpProxy = queueInfo.get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          # Give up on this queue for the current cycle and remember the failure
          pilotsToSubmit = 0
          self.failedQueues[queue] += 1
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
        totalSubmittedPilots += len( pilotList )
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = result.get( 'PilotStampDict', {} )
        # Cumulative priorities: a random number in [0, sumPriority) selects a task queue
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        pilotsPerTQ = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          else:
            # No interval matched ( e.g. all priorities are 0 ): use the last task queue
            # of this queue rather than an unrelated, previously used identifier
            tqID = tqPriorityList[-1][0]
          pilotsPerTQ.setdefault( tqID, [] ).append( pilotID )

        for tqID, tqPilots in pilotsPerTQ.items():
          result = pilotAgentsDB.addPilotTQReference( tqPilots,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in tqPilots:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
Exemple #19
0
    def submitJobs(self):
        """ Go through defined computing elements and submit jobs if necessary

        The Matcher is first asked whether any task queue matches the overall
        requirements of this director. Then every configured queue is visited in
        random order: a pilot proxy is obtained, the CE is asked for its free
        slots, and pilots are submitted for the eligible jobs not yet covered by
        waiting pilots, limited by the free slots and by self.maxPilotsToSubmit.
        Submitted pilots are registered in the PilotAgentsDB, spread over the
        matching task queues at random with a probability proportional to the
        task queue priority.

        :return: S_OK() at the end of the cycle or when no task queue matches;
                 otherwise the failed result of the platform resolution, of the
                 Matcher, of the proxy retrieval or of the executable preparation,
                 or S_ERROR if the site mask cannot be obtained
        """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
        if self.vo:
            tqDict["Community"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        # Check if the site is allowed in the mask
        result = jobDB.getSiteMask()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result["Value"]

        # list() so that the shuffle works on a real list whatever keys() returns
        queues = list(self.queueDict)
        random.shuffle(queues)
        for queue in queues:
            queueInfo = self.queueDict[queue]
            ce = queueInfo["CE"]
            ceName = queueInfo["CEName"]
            ceType = queueInfo["CEType"]
            queueName = queueInfo["QueueName"]
            siteName = queueInfo["Site"]
            siteMask = siteName in siteMaskList

            if "CPUTime" in queueInfo["ParametersDict"]:
                queueCPUTime = int(queueInfo["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result["OK"]:
                self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
                continue
            ceInfoDict = result["CEInfoDict"]
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (
                    ceName,
                    queueName,
                    ceInfoDict["WaitingJobs"],
                    ceInfoDict["RunningJobs"],
                    ceInfoDict["SubmittedJobs"],
                    ceInfoDict["MaxTotalJobs"],
                )
            )

            totalSlots = result["Value"]

            ceDict = ce.getParameterDict()
            ceDict["GridCE"] = ceName
            if not siteMask and "Site" in ceDict:
                self.log.info("Site not in the mask %s" % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict["Site"]
            if self.vo:
                ceDict["Community"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            # This is a hack to get rid of !
            ceDict["SubmitPool"] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(self.platforms)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.info("No matching TQs found")
                continue

            totalTQJobs = 0
            tqIDList = list(taskQueueDict)
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
                )
                if not result["OK"]:
                    self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result["Value"]
                    self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
                % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
            )

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

                bundleProxy = queueInfo.get("BundleProxy", False)
                jobExecDir = ""
                if ceType == "CREAM":
                    jobExecDir = "."
                jobExecDir = queueInfo.get("JobExecDir", jobExecDir)
                httpProxy = queueInfo.get("HttpProxy", "")

                result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
                if not result["OK"]:
                    return result

                executable, pilotSubmissionChunk = result["Value"]
                result = ce.submitJob(executable, "", pilotSubmissionChunk)
                os.unlink(executable)
                if not result["OK"]:
                    self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                    # Give up on this queue for the current cycle
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result["Value"]
                self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
                stampDict = result.get("PilotStampDict", {})
                # Cumulative priorities: a random number in [0, sumPriority) selects a task queue
                tqPriorityList = []
                sumPriority = 0.0
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]["Priority"]
                    tqPriorityList.append((tq, sumPriority))
                pilotsPerTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    else:
                        # No interval matched ( e.g. all priorities are 0 ): use the last task
                        # queue instead of an unbound or previously used identifier
                        tqID = tqPriorityList[-1][0]
                    pilotsPerTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsPerTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        tqPilots, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                    )
                    if not result["OK"]:
                        self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot,
                            "Submitted",
                            ceName,
                            "Successfully submitted by the SiteDirector",
                            siteName,
                            queueName,
                        )
                        if not result["OK"]:
                            self.log.error("Failed to set pilot status: ", result["Message"])

        return S_OK()
Exemple #20
0
    def __getPilotOptions(self, queue, pilotsToSubmit):
        """Prepare the command-line options passed to the pilots of a queue.

        The options are built from the configuration (setup, installation,
        project, release, extensions, CS servers), from a proxy token requested
        for ``pilotsToSubmit * self.maxJobsInFillMode`` uses, and from the
        queue parameters stored in ``self.queueDict[queue]``.

        :param queue: key of the target queue in ``self.queueDict``
        :param pilotsToSubmit: number of pilots the caller would like to submit
        :return: ``[pilotOptions, pilotsToSubmit]`` where ``pilotOptions`` is a
                 list of option strings and ``pilotsToSubmit`` is the possibly
                 adjusted number of pilots the granted token allows;
                 ``[None, None]`` if the options cannot be prepared
        """

        queueDict = self.queueDict[queue]["ParametersDict"]
        pilotOptions = []

        setup = gConfig.getValue("/DIRAC/Setup", "unknown")
        if setup == "unknown":
            self.log.error("Setup is not defined in the configuration")
            return [None, None]
        pilotOptions.append("-S %s" % setup)
        opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

        # Installation defined?
        installationName = opsHelper.getValue("Pilot/Installation", "")
        if installationName:
            pilotOptions.append("-V %s" % installationName)

        # Project defined?
        projectName = opsHelper.getValue("Pilot/Project", "")
        if projectName:
            pilotOptions.append("-l %s" % projectName)
        else:
            self.log.info("DIRAC project will be installed by pilots")

        # Request a release
        diracVersion = opsHelper.getValue("Pilot/Version", [])
        if not diracVersion:
            self.log.error("Pilot/Version is not defined in the configuration")
            return [None, None]
        # diracVersion is a list of accepted releases. Just take the first one
        pilotOptions.append("-r %s" % diracVersion[0])

        ownerDN = self.pilotDN
        ownerGroup = self.pilotGroup
        # Request token for maximum pilot efficiency
        result = gProxyManager.requestToken(ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
        if not result["OK"]:
            self.log.error("Invalid proxy token request", result["Message"])
            return [None, None]
        (token, numberOfUses) = result["Value"]

        # Each pilot will execute at most this many jobs with the granted token.
        jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
        if jobsPerPilot <= 0:
            # A token with no uses (or a zero fill-mode limit) cannot serve any
            # pilot; report it like the other failures instead of dividing by zero.
            self.log.error(
                "Proxy token does not allow any pilot to be submitted",
                "uses=%s, max jobs in fill mode=%s" % (numberOfUses, self.maxJobsInFillMode),
            )
            return [None, None]

        pilotOptions.append("-o /Security/ProxyToken=%s" % token)
        # Use Filling mode
        pilotOptions.append("-M %s" % jobsPerPilot)

        # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
        # with numberOfUses tokens we can submit at most:
        #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
        # pilots. Floor division keeps the pilot count an integer on every
        # Python version.
        newPilotsToSubmit = numberOfUses // jobsPerPilot
        if newPilotsToSubmit != pilotsToSubmit:
            self.log.info(
                "Number of pilots to submit is changed to %d after getting the proxy token" % newPilotsToSubmit
            )
            pilotsToSubmit = newPilotsToSubmit
        # Debug
        if self.pilotLogLevel.lower() == "debug":
            pilotOptions.append("-d")
        # CS Servers
        csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
        pilotOptions.append("-C %s" % ",".join(csServers))

        # DIRAC Extensions to be used in pilots.
        # An explicit Pilot/Extensions list wins; a first entry of "None" means
        # "no extensions". Without the option, the CS extensions are used.
        pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
        extensionsList = []
        if pilotExtensionsList:
            if pilotExtensionsList[0] != "None":
                extensionsList = pilotExtensionsList
        else:
            extensionsList = CSGlobals.getCSExtensions()
        if extensionsList:
            pilotOptions.append("-e %s" % ",".join(extensionsList))

        # Requested CPU time
        pilotOptions.append("-T %s" % queueDict["CPUTime"])
        # CEName
        pilotOptions.append("-N %s" % self.queueDict[queue]["CEName"])
        # SiteName
        pilotOptions.append("-n %s" % queueDict["Site"])
        if "ClientPlatform" in queueDict:
            pilotOptions.append("-p '%s'" % queueDict["ClientPlatform"])

        if "SharedArea" in queueDict:
            pilotOptions.append("-o '/LocalSite/SharedArea=%s'" % queueDict["SharedArea"])

        if "SI00" in queueDict:
            # When SI00 is given, both CPU factors are derived from it and any
            # explicit CPUScalingFactor / CPUNormalizationFactor is ignored.
            factor = float(queueDict["SI00"]) / 250.0
            pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor)
            pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % factor)
        else:
            if "CPUScalingFactor" in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % queueDict["CPUScalingFactor"])
            if "CPUNormalizationFactor" in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict["CPUNormalizationFactor"])

        # Hack
        if self.defaultSubmitPools:
            pilotOptions.append("-o /Resources/Computing/CEDefaults/SubmitPool=%s" % self.defaultSubmitPools)

        if self.group:
            pilotOptions.append("-G %s" % self.group)

        self.log.verbose("pilotOptions: ", " ".join(pilotOptions))

        return [pilotOptions, pilotsToSubmit]
Exemple #21
0
 def __getVOPath( self ):
   """ Return the root Operations path to use.

       When CSGlobals.getVO() reports a value the plain "/Operations" root is
       returned, otherwise the path is extended with the VO kept in the
       thread data.
   """
   globalVO = CSGlobals.getVO()
   if not globalVO:
     # No globally defined VO: use the one attached to the current thread data
     return "/Operations/%s" % self.__threadData.vo
   return "/Operations"
Exemple #22
0
 def __init__( self, name = 'Monitoring/MonitoringDB', readOnly = False ):
   """ Set up the MonitoringDB.

       :param name: passed to the parent constructor as the second argument
       :param readOnly: stored as the read-only flag of this instance
   """
   # The lower-cased setup name is the third argument of the parent constructor
   setupName = CSGlobals.getSetup().lower()
   super( MonitoringDB, self ).__init__( 'MonitoringDB', name, setupName )
   self.__documents = {}
   self.__readonly = readOnly
   self.__loadIndexes()
Exemple #23
0
    def submitJobs(self):
        """Go through defined computing elements and submit pilots if necessary.

        The method first asks the Matcher whether any task queue matches the
        overall requirements of this director. It then visits the configured
        queues in random order and, for every processor tag shared by the queue
        and its matching task queues, submits as many pilots as allowed by the
        available slots, the eligible jobs, the already waiting pilots and
        ``self.maxPilotsToSubmit``. Submitted pilots are registered in the
        PilotAgentsDB, distributed over the task queues proportionally to the
        task queue priorities.

        :return: ``S_OK()`` when the cycle completes (including when there is
                 nothing to do); otherwise the failed result structure of the
                 platform, Matcher, usable-sites or pilot proxy call
        """

        queues = list(self.queueDict)

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        if self.checkPlatform:
            result = self.resourcesModule.getCompatiblePlatforms(
                self.platforms)
            if not result['OK']:
                return result
            tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites
        # Collect the tags of all the queues, without duplicates
        tags = []
        for queue in queues:
            tags += self.queueDict[queue]['ParametersDict']['Tag']
        tqDict['Tag'] = list(set(tags))

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # jobSites:  sites explicitly requested by the matching task queues
        # anySite:   at least one task queue has no site restriction
        # testSites: sites requested by task queues that define "JobTypes"
        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result['Value']:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result['Value'][tqID]:
                if "Sites" in result['Value'][tqID]:
                    for site in result['Value'][tqID]['Sites']:
                        if site.lower() != 'any':
                            testSites.add(site)
            totalWaitingJobs += result['Value'][tqID]['Jobs']

        tqIDList = result['Value'].keys()
        self.log.info(tqIDList)
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': tqIDList,
                'Status': WAITING_PILOT_STATUS
            }, None)
        # Only used for the summary message below; a failed count is reported as 0
        tagWaitingPilots = 0
        if result['OK']:
            tagWaitingPilots = result['Value']
        self.log.info(
            'Total %d jobs in %d task queues with %d waiting pilots' %
            (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
        self.log.info('Queues: ', self.queueDict.keys())
        # if tagWaitingPilots >= totalWaitingJobs:
        #  self.log.info( 'No more pilots to be submitted in this cycle' )
        #  return S_OK()

        result = self.siteClient.getUsableSites()
        if not result['OK']:
            return result
        siteMaskList = result['Value']

        # random.shuffle needs a real list (a dictionary keys view cannot be shuffled)
        queues = list(self.queueDict)
        random.shuffle(queues)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for queue in queues:

            # Check if the queue failed previously: after a failure the counter is
            # non-zero and the queue is skipped until the counter reaches the next
            # multiple of failedQueueCycleFactor.
            failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
            if failedCount != 0:
                self.log.warn("%s queue failed recently, skipping %d cycles" %
                              (queue, self.failedQueueCycleFactor - failedCount))
                self.failedQueues[queue] += 1
                continue

            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            queueTags = self.queueDict[queue]['ParametersDict']['Tag']
            siteMask = siteName in siteMaskList
            processorTags = []

            # Check the status of the Site
            result = self.siteClient.getUsableSites(siteName)
            if not result['OK']:
                self.log.error("Can not get the status of site %s: %s" %
                               (siteName, result['Message']))
                continue
            if siteName not in result.get('Value', []):
                self.log.info("site %s is not active" % siteName)
                continue

            if self.rssFlag:
                # Check the status of the ComputingElement
                result = self.rssClient.getElementStatus(
                    ceName, "ComputingElement")
                if not result['OK']:
                    self.log.error(
                        "Can not get the status of computing element",
                        " %s: %s" % (ceName, result['Message']))
                    continue
                # An empty answer leaves the status unknown, which is treated
                # as not usable below
                ceStatus = None
                if result['Value']:
                    # get the value of the status
                    ceStatus = result['Value'][ceName]['all']

                if ceStatus not in ('Active', 'Degraded'):
                    self.log.verbose(
                        "Skipping computing element %s at %s: resource not usable"
                        % (ceName, siteName))
                    continue

            # Processor related tags of the queue: "<N>Processors" and "WholeNode"
            for tag in queueTags:
                if re.match(r'^[0-9]+Processors$', tag):
                    processorTags.append(tag)
            if 'WholeNode' in queueTags:
                processorTags.append('WholeNode')

            if not anySite and siteName not in jobSites:
                self.log.verbose(
                    "Skipping queue %s at %s: no workload expected" %
                    (queueName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose(
                    "Skipping queue %s at site %s not in the mask" %
                    (queueName, siteName))
                continue

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            # if not siteMask and 'Site' in ceDict:
            #  self.log.info( 'Site not in the mask %s' % siteName )
            #  self.log.info( 'Removing "Site" from matching Dict' )
            #  del ceDict[ 'Site' ]
            if not siteMask:
                ceDict['JobType'] = "Test"
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            if self.checkPlatform:
                platform = self.queueDict[queue]['Platform']
                result = self.resourcesModule.getCompatiblePlatforms(platform)
                if not result['OK']:
                    continue
                ceDict['Platform'] = result['Value']

            ceDict['Tag'] = queueTags
            # Get the number of eligible jobs for the target site/queue
            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.verbose('No matching TQs found for %s' % queue)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            totalTQJobsByProcessors = {}
            tqIDList = taskQueueDict.keys()
            tqIDListByProcessors = {}
            # Group the matching task queues and their job counts by processor tag
            for tq in taskQueueDict:
                if 'Tags' not in taskQueueDict[tq]:
                    # skip non multiprocessor tqs
                    continue
                for tag in taskQueueDict[tq]['Tags']:
                    if tag in processorTags:
                        tqIDListByProcessors.setdefault(tag, [])
                        tqIDListByProcessors[tag].append(tq)

                        totalTQJobsByProcessors.setdefault(tag, 0)
                        totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                            'Jobs']

                totalTQJobs += taskQueueDict[tq]['Jobs']

            self.log.verbose(
                '%d job(s) from %d task queue(s) are eligible for %s queue' %
                (totalTQJobs, len(tqIDList), queue))

            queueSubmittedPilots = 0
            for tag in tqIDListByProcessors:

                self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                                 (tag, tqIDListByProcessors[tag]))

                # Number of processors passed to the CE: N for "<N>Processors",
                # -1 for "WholeNode"
                processors = 1

                m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
                if m:
                    processors = int(m.group('processors'))
                if tag == 'WholeNode':
                    processors = -1

                tagTQJobs = totalTQJobsByProcessors[tag]
                tagTqIDList = tqIDListByProcessors[tag]

                # Get the number of already waiting pilots for these task queues
                tagWaitingPilots = 0
                if self.pilotWaitingFlag:
                    result = pilotAgentsDB.countPilots(
                        {
                            'TaskQueueID': tagTqIDList,
                            'Status': WAITING_PILOT_STATUS
                        }, None)
                    if not result['OK']:
                        self.log.error(
                            'Failed to get Number of Waiting pilots',
                            result['Message'])
                        tagWaitingPilots = 0
                    else:
                        tagWaitingPilots = result['Value']
                        self.log.verbose(
                            'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                            tagWaitingPilots)
                if tagWaitingPilots >= tagTQJobs:
                    self.log.verbose(
                        "%d waiting pilots already for all the available jobs"
                        % tagWaitingPilots)
                    continue

                self.log.verbose(
                    "%d waiting pilots for the total of %d eligible jobs for %s"
                    % (tagWaitingPilots, tagTQJobs, queue))

                # Get the working proxy
                cpuTime = queueCPUTime + 86400
                self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                                 (self.pilotDN, self.pilotGroup, cpuTime))
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, cpuTime)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, cpuTime - 60)

                # Get the number of available slots on the target site/queue
                totalSlots = self.getQueueSlots(queue, False)
                if totalSlots == 0:
                    self.log.debug('%s: No slots available' % queue)
                    continue

                # Note: comparing slots to job numbers is not accurate in multiprocessor case.
                #       This could lead to over submission.
                pilotsToSubmit = max(
                    0, min(totalSlots, tagTQJobs - tagWaitingPilots))
                self.log.info(
                    '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d'
                    % (queue, totalSlots, tagTQJobs, tagWaitingPilots,
                       pilotsToSubmit))

                # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
                pilotsToSubmit = min(
                    self.maxPilotsToSubmit - queueSubmittedPilots,
                    pilotsToSubmit)

                while pilotsToSubmit > 0:
                    self.log.info('Going to submit %d pilots to %s queue' %
                                  (pilotsToSubmit, queue))

                    bundleProxy = self.queueDict[queue].get(
                        'BundleProxy', False)
                    jobExecDir = ''
                    jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                        'JobExecDir', jobExecDir)

                    executable, pilotSubmissionChunk = self.getExecutable(
                        queue,
                        pilotsToSubmit,
                        bundleProxy=bundleProxy,
                        jobExecDir=jobExecDir,
                        processors=processors)
                    result = ce.submitJob(executable,
                                          '',
                                          pilotSubmissionChunk,
                                          processors=processors)
                    # ## FIXME: The condor thing only transfers the file with some
                    # ## delay, so when we unlink here the script is gone
                    # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                    if ceType != 'HTCondorCE':
                        os.unlink(executable)
                    if not result['OK']:
                        self.log.error(
                            'Failed submission to queue %s:\n' % queue,
                            result['Message'])
                        # Stop submitting for this tag and mark the queue as failed
                        pilotsToSubmit = 0
                        self.failedQueues[queue] += 1
                        continue

                    pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                    queueSubmittedPilots += pilotSubmissionChunk
                    # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                    # task queue priorities
                    pilotList = result['Value']
                    self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                    totalSubmittedPilots += len(pilotList)
                    self.log.info('Submitted %d pilots to %s@%s' %
                                  (len(pilotList), queueName, ceName))
                    stampDict = {}
                    if 'PilotStampDict' in result:
                        stampDict = result['PilotStampDict']
                    # Cumulative priorities: a random point in [0, sumPriority) falls
                    # into the interval of a task queue with a probability
                    # proportional to its priority
                    tqPriorityList = []
                    sumPriority = 0.
                    for tq in tagTqIDList:
                        sumPriority += taskQueueDict[tq]['Priority']
                        tqPriorityList.append((tq, sumPriority))
                    tqDict = {}
                    for pilotID in pilotList:
                        rndm = random.random() * sumPriority
                        # Default to the last eligible task queue so that a draw not
                        # below any cumulative priority (e.g. all priorities are 0)
                        # never reuses a task queue ID left over from earlier code
                        tqID = tqPriorityList[-1][0]
                        for tq, prio in tqPriorityList:
                            if rndm < prio:
                                tqID = tq
                                break
                        tqDict.setdefault(tqID, []).append(pilotID)

                    for tqID, tqPilots in tqDict.items():
                        result = pilotAgentsDB.addPilotTQReference(
                            tqPilots, tqID, self.pilotDN, self.pilotGroup,
                            self.localhost, ceType, stampDict)
                        if not result['OK']:
                            self.log.error(
                                'Failed add pilots to the PilotAgentsDB: ',
                                result['Message'])
                            continue
                        for pilot in tqPilots:
                            result = pilotAgentsDB.setPilotStatus(
                                pilot, 'Submitted', ceName,
                                'Successfully submitted by the SiteDirector',
                                siteName, queueName)
                            if not result['OK']:
                                self.log.error('Failed to set pilot status: ',
                                               result['Message'])

        self.log.info(
            "%d pilots submitted in total in this cycle, %d matched queues" %
            (totalSubmittedPilots, matchedQueues))
        return S_OK()
  def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary
    """

    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    if self.checkPlatform:
      result = self.resourcesModule.getCompatiblePlatforms(self.platforms)
      if not result['OK']:
        return result
      tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
      tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result['OK']:
      return result
    if not result['Value']:
      self.log.verbose('No Waiting jobs suitable for the director')
      return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
      if "Sites" in result['Value'][tqID]:
        for site in result['Value'][tqID]['Sites']:
          if site.lower() != 'any':
            jobSites.add(site)
          else:
            anySite = True
      else:
        anySite = True
      if "JobTypes" in result['Value'][tqID]:
        if "Sites" in result['Value'][tqID]:
          for site in result['Value'][tqID]['Sites']:
            if site.lower() != 'any':
              testSites.add(site)
      totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
      tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #  self.log.info( 'No more pilots to be submitted in this cycle' )
    #  return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
      return result
    siteMaskList = result['Value']

    queues = self.queueDict.keys()
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

      # Check if the queue failed previously
      failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
      if failedCount != 0:
        self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
        self.failedQueues[queue] += 1
        continue

      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      queueTags = self.queueDict[queue]['ParametersDict']['Tag']
      siteMask = siteName in siteMaskList
      processorTags = []

      # Check the status of the Site
      result = self.siteClient.getUsableSites(siteName)
      if not result['OK']:
        self.log.error("Can not get the status of site %s: %s" %
                       (siteName, result['Message']))
        continue
      if siteName not in result.get('Value', []):
        self.log.info("site %s is not active" % siteName)
        continue

      if self.rssFlag:
        # Check the status of the ComputingElement
        result = self.rssClient.getElementStatus(ceName, "ComputingElement")
        if not result['OK']:
          self.log.error("Can not get the status of computing element",
                         " %s: %s" % (siteName, result['Message']))
          continue
        if result['Value']:
          # get the value of the status
          result = result['Value'][ceName]['all']

        if result not in ('Active', 'Degraded'):
          self.log.verbose(
              "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
          continue

      for tag in queueTags:
        if re.match(r'^[0-9]+Processors$', tag):
          processorTags.append(tag)
      if 'WholeNode' in queueTags:
        processorTags.append('WholeNode')

      if not anySite and siteName not in jobSites:
        self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
        continue
      if not siteMask and siteName not in testSites:
        self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
        continue

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
      else:
        self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Prepare the queue description to look for eligible jobs
      ceDict = ce.getParameterDict()
      ceDict['GridCE'] = ceName
      # if not siteMask and 'Site' in ceDict:
      #  self.log.info( 'Site not in the mask %s' % siteName )
      #  self.log.info( 'Removing "Site" from matching Dict' )
      #  del ceDict[ 'Site' ]
      if not siteMask:
        ceDict['JobType'] = "Test"
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      if self.checkPlatform:
        platform = self.queueDict[queue]['Platform']
        result = self.resourcesModule.getCompatiblePlatforms(platform)
        if not result['OK']:
          continue
        ceDict['Platform'] = result['Value']

      ceDict['Tag'] = queueTags
      # Get the number of eligible jobs for the target site/queue
      result = matcherClient.getMatchingTaskQueues(ceDict)
      if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose('No matching TQs found for %s' % queue)
        continue

      matchedQueues += 1
      totalTQJobs = 0
      totalTQJobsByProcessors = {}
      tqIDList = taskQueueDict.keys()
      tqIDListByProcessors = {}
      for tq in taskQueueDict:
        if 'Tags' not in taskQueueDict[tq]:
          # skip non multiprocessor tqs
          continue
        for tag in taskQueueDict[tq]['Tags']:
          if tag in processorTags:
            tqIDListByProcessors.setdefault(tag, [])
            tqIDListByProcessors[tag].append(tq)

            totalTQJobsByProcessors.setdefault(tag, 0)
            totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

        totalTQJobs += taskQueueDict[tq]['Jobs']

      self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs,
                                                                                      len(tqIDList), queue))

      queueSubmittedPilots = 0
      for tag in tqIDListByProcessors:

        self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

        processors = 1

        m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
        if m:
          processors = int(m.group('processors'))
        if tag == 'WholeNode':
          processors = -1

        tagTQJobs = totalTQJobsByProcessors[tag]
        tagTqIDList = tqIDListByProcessors[tag]

        # Get the number of already waiting pilots for these task queues
        tagWaitingPilots = 0
        if self.pilotWaitingFlag:
          result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                              'Status': WAITING_PILOT_STATUS},
                                             None)
          if not result['OK']:
            self.log.error('Failed to get Number of Waiting pilots', result['Message'])
            tagWaitingPilots = 0
          else:
            tagWaitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
        if tagWaitingPilots >= tagTQJobs:
          self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
          continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots,
                                                                                         tagTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.getQueueSlots(queue, False)
        if totalSlots == 0:
          self.log.debug('%s: No slots available' % queue)
          continue

        # Note: comparing slots to job numbers is not accurate in multiprocessor case.
        #       This could lead to over submission.
        pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

        while pilotsToSubmit > 0:
          self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

          bundleProxy = self.queueDict[queue].get('BundleProxy', False)
          jobExecDir = ''
          jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)

          executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                bundleProxy=bundleProxy,
                                                                jobExecDir=jobExecDir,
                                                                processors=processors)
          result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
          # ## FIXME: The condor thing only transfers the file with some
          # ## delay, so when we unlink here the script is gone
          # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
          if ceType != 'HTCondorCE':
            os.unlink(executable)
          if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
            pilotsToSubmit = 0
            self.failedQueues[queue] += 1
            continue

          pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
          queueSubmittedPilots += pilotSubmissionChunk
          # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
          # task queue priorities
          pilotList = result['Value']
          self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
          totalSubmittedPilots += len(pilotList)
          self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
          stampDict = {}
          if 'PilotStampDict' in result:
            stampDict = result['PilotStampDict']
          tqPriorityList = []
          sumPriority = 0.
          for tq in tagTqIDList:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
          rndm = random.random() * sumPriority
          tqDict = {}
          for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
              if rndm < prio:
                tqID = tq
                break
            if tqID not in tqDict:
              tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

          for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList,
                                                       tqID,
                                                       self.pilotDN,
                                                       self.pilotGroup,
                                                       self.localhost,
                                                       ceType,
                                                       stampDict)
            if not result['OK']:
              self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
              continue
            for pilot in pilotList:
              result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                    'Successfully submitted by the SiteDirector',
                                                    siteName, queueName)
              if not result['OK']:
                self.log.error('Failed to set pilot status: ', result['Message'])
                continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
Exemple #25
0
  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        The Matcher is first asked whether there is any waiting task queue for
        this director at all. Then every served queue, taken in random order,
        is processed: the number of pilots to submit is derived from the slots
        reported by the CE, the jobs in the matching task queues and the pilots
        already waiting; the pilots are submitted in chunks and registered in
        the PilotAgentsDB, spread over the task queues proportionally to the
        task queue priorities.

        :return: S_OK() at the end of the cycle; the failed result structure
                 when the overall platform resolution, a Matcher query, the
                 pilot proxy retrieval or the executable preparation fails
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    # Materialise the keys: random.shuffle needs a mutable sequence, which a
    # dictionary keys view is not
    queues = list( self.queueDict )
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = list( taskQueueDict )
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                     ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      # Cumulative priorities of the matched task queues. They do not change
      # while submitting to this queue, so build the table once.
      tqPriorityList = []
      sumPriority = 0.
      for tq in taskQueueDict:
        sumPriority += taskQueueDict[tq]['Priority']
        tqPriorityList.append( ( tq, sumPriority ) )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        result = ce.submitJob( executable, '', pilotSubmissionChunk )
        os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          # Give up on this queue for the current cycle
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if 'PilotStampDict' in result:
          stampDict = result['PilotStampDict']

        pilotsByTQ = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          # Fall back to the last task queue: when the priorities sum up to
          # zero no cumulative value is strictly greater than rndm and the
          # search below would otherwise leave tqID undefined or stale
          tqID = tqPriorityList[-1][0]
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          pilotsByTQ.setdefault( tqID, [] ).append( pilotID )

        for tqID, tqPilots in pilotsByTQ.items():
          result = pilotAgentsDB.addPilotTQReference( tqPilots,
                                                      tqID,
                                                      self.pilotDN,
                                                      self.pilotGroup,
                                                      self.localhost,
                                                      ceType,
                                                      '',
                                                      stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in tqPilots:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                   'Successfully submitted by the SiteDirector',
                                                   siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()
Exemple #26
0
 def __init__( self, name = 'Monitoring/MonitoringDB', readOnly = False ):
   """ Set up the database object and load its indexes

       :param name: passed to the parent constructor as its second argument
       :param readOnly: stored as the private read-only flag
   """
   # The lower-cased setup name is the third argument of the parent constructor
   setupName = CSGlobals.getSetup().lower()
   super( MonitoringDB, self ).__init__( 'MonitoringDB', name, setupName )
   self.__readonly = readOnly
   self.__documents = {}
   self.__loadIndexes()
  def _getPilotOptions( self, queue, pilotsToSubmit ):
    """ Prepare pilot options
    """

    queueDict = self.queueDict[queue]['ParametersDict']
    pilotOptions = []

    setup = gConfig.getValue( "/DIRAC/Setup", "unknown" )
    if setup == 'unknown':
      self.log.error( 'Setup is not defined in the configuration' )
      return [ None, None ]
    pilotOptions.append( '-S %s' % setup )
    opsHelper = Operations.Operations( group = self.pilotGroup, setup = setup )

    #Installation defined?
    installationName = opsHelper.getValue( "Pilot/Installation", "" )
    if installationName:
      pilotOptions.append( '-V %s' % installationName )

    #Project defined?
    projectName = opsHelper.getValue( "Pilot/Project", "" )
    if projectName:
      pilotOptions.append( '-l %s' % projectName )
    else:
      self.log.info( 'DIRAC project will be installed by pilots' )

    #Request a release
    diracVersion = opsHelper.getValue( "Pilot/Version", [] )
    if not diracVersion:
      self.log.error( 'Pilot/Version is not defined in the configuration' )
      return [ None, None ]
    # diracVersion is a list of accepted releases
    pilotOptions.append( '-r %s' % ','.join( str( it ) for it in diracVersion ) )

    ownerDN = self.pilotDN
    ownerGroup = self.pilotGroup
    # Request token for maximum pilot efficiency
    result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode )
    if not result[ 'OK' ]:
      self.log.error( 'Invalid proxy token request', result['Message'] )
      return [ None, None ]
    ( token, numberOfUses ) = result[ 'Value' ]
    pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
    # Use Filling mode
    pilotOptions.append( '-M %s' % min( numberOfUses, self.maxJobsInFillMode ) )

    # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
    # with numberOfUses tokens we can submit at most:
    #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    # pilots
    newPilotsToSubmit = numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    if newPilotsToSubmit != pilotsToSubmit:
      self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit )
      pilotsToSubmit = newPilotsToSubmit
    # Debug
    if self.pilotLogLevel.lower() == 'debug':
      pilotOptions.append( '-d' )
    # CS Servers
    csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )

    # DIRAC Extensions to be used in pilots
    pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    # Requested CPU time
    pilotOptions.append( '-T %s' % queueDict['CPUTime'] )
    # CEName
    pilotOptions.append( '-N %s' % self.queueDict[queue]['CEName'] )
    # Queue
    pilotOptions.append( '-Q %s' % self.queueDict[queue]['QueueName'] )
    # SiteName
    pilotOptions.append( '-n %s' % queueDict['Site'] )
    if 'ClientPlatform' in queueDict:
      pilotOptions.append( "-p '%s'" % queueDict['ClientPlatform'] )

    if 'SharedArea' in queueDict:
      pilotOptions.append( "-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea'] )

#     if 'SI00' in queueDict:
#       factor = float( queueDict['SI00'] ) / 250.
#       pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor )
#       pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor )
#     else:
#       if 'CPUScalingFactor' in queueDict:
#         pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] )
#       if 'CPUNormalizationFactor' in queueDict:
#         pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] )

    if "ExtraPilotOptions" in queueDict:
      pilotOptions.append( queueDict['ExtraPilotOptions'] )

    # Hack
    if self.defaultSubmitPools:
      pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools )

    if "Tag" in queueDict:
      tagString = ','.join( queueDict['Tag'] )
      pilotOptions.append( '-o /Resources/Computing/CEDefaults/Tag=%s' % tagString )

    if self.group:
      pilotOptions.append( '-G %s' % self.group )

    return [ pilotOptions, pilotsToSubmit ]
  def beginExecution( self ):
    """ Read the agent options and set up the queues to be served

        Resolves the community, the eligible user groups and the pilot
        credentials, reads the tunable parameters with am_getOption, asks
        Resources.getQueues for the queues matching the Site / CETypes / CEs
        selection and passes them to self.getQueues.

        :return: S_OK() on success, otherwise the failed result of
                 Registry.getGroupsForVO, findGenericPilotCredentials,
                 Resources.getQueues or self.getQueues
    """

    self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() )
    # The SiteDirector is for a particular user community: the VO option,
    # then the Community option, then the global VO
    self.vo = ( self.am_getOption( "VO", '' )
                or self.am_getOption( "Community", '' )
                or CSGlobals.getVO() )
    # The SiteDirector is for a particular user group
    self.group = self.am_getOption( "Group", '' )

    # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector.
    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    self.voGroups = []
    if self.group:
      self.voGroups = [ self.group ]
    elif self.vo:
      result = Registry.getGroupsForVO( self.vo )
      if not result['OK']:
        return result
      # Only the groups with the NormalUser property are eligible
      for voGroup in result['Value']:
        if 'NormalUser' in Registry.getPropertiesForGroup( voGroup ):
          self.voGroups.append( voGroup )

    # Generic pilot credentials, each of them can be overridden by an agent option
    result = findGenericPilotCredentials( vo = self.vo )
    if not result[ 'OK' ]:
      return result
    genericDN, genericGroup = result[ 'Value' ]
    self.pilotDN = self.am_getOption( "PilotDN", genericDN )
    self.pilotGroup = self.am_getOption( "PilotGroup", genericGroup )

    self.platforms = []
    self.sites = []
    # The submit pools come from the group if one is set, else from the VO
    if self.group:
      self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' )
    elif self.vo:
      self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' )
    else:
      self.defaultSubmitPools = ''

    self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT )
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption( 'ExtraPilotModules', [] ) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption( 'WorkDirectory' )
    self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 )
    self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' )
    self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode )
    self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit )
    self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True )
    self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 3600 )
    self.failedQueueCycleFactor = self.am_getOption( 'FailedQueueCycleFactor', 10 )
    self.pilotStatusUpdateCycleFactor = self.am_getOption( 'PilotStatusUpdateCycleFactor', 10 )

    # Flags
    self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True )
    self.getOutput = self.am_getOption( 'GetPilotOutput', True )
    self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True )

    # Get the site description dictionary.
    # None means no restriction; empty Site and CEs lists are turned into None too
    siteNames = None
    if self.am_getOption( 'Site', 'Any' ).lower() != "any":
      siteNames = self.am_getOption( 'Site', [] ) or None
    ceTypes = None
    if self.am_getOption( 'CETypes', 'Any' ).lower() != "any":
      ceTypes = self.am_getOption( 'CETypes', [] )
    ces = None
    if self.am_getOption( 'CEs', 'Any' ).lower() != "any":
      ces = self.am_getOption( 'CEs', [] ) or None

    result = Resources.getQueues( community = self.vo,
                                  siteList = siteNames,
                                  ceList = ces,
                                  ceTypeList = ceTypes,
                                  mode = 'Direct' )
    if not result['OK']:
      return result
    result = self.getQueues( result['Value'] )
    if not result['OK']:
      return result

    #if not siteNames:
    #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
    #  if siteName == 'Unknown':
    #    return S_OK( 'No site specified for the SiteDirector' )
    #  else:
    #    siteNames = [siteName]
    #self.siteNames = siteNames

    # Report the enabled features
    for enabled, message in ( ( self.updateStatus, 'Pilot status update requested' ),
                              ( self.getOutput, 'Pilot output retrieval requested' ),
                              ( self.sendAccounting, 'Pilot accounting sending requested' ) ):
      if enabled:
        self.log.always( message )

    # Report the effective configuration
    for label, value in ( ( 'Sites:', siteNames ),
                          ( 'CETypes:', ceTypes ),
                          ( 'CEs:', ces ),
                          ( 'PilotDN:', self.pilotDN ),
                          ( 'PilotGroup:', self.pilotGroup ),
                          ( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit ),
                          ( 'MaxJobsInFillMode:', self.maxJobsInFillMode ) ):
      self.log.always( label, value )

    self.localhost = socket.getfqdn()
    self.proxy = ''

    # The list of served queues is only printed on the first pass
    if self.firstPass and self.queueDict:
      self.log.always( "Agent will serve queues:" )
      for queue in self.queueDict:
        queueInfo = self.queueDict[queue]
        self.log.always( "Site: %s, CE: %s, Queue: %s" % ( queueInfo['Site'],
                                                           queueInfo['CEName'],
                                                           queue ) )
    self.firstPass = False
    return S_OK()
Exemple #29
0
    def beginExecution(self):
        """Read the agent options and set up the queues to be served

        Resolves the community, the eligible user groups and the pilot
        credentials, reads the tunable parameters with am_getOption, asks
        Resources.getQueues for the queues matching the Site / CETypes / CEs
        selection and passes them to self.getQueues.

        :return: S_OK() on success, otherwise the failed result of
                 Registry.getGroupsForVO, findGenericPilotCredentials,
                 Resources.getQueues or self.getQueues
        """

        self.gridEnv = self.am_getOption("GridEnv", getGridEnv())
        # The SiteDirector is for a particular user community: the Community
        # option if set, the global VO otherwise
        self.vo = self.am_getOption("Community", "") or CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", "")

        # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector.
        # Choose the group for which pilots will be submitted. This is a hack until
        # we will be able to match pilots to VOs.
        self.voGroups = []
        if self.group:
            self.voGroups = [self.group]
        elif self.vo:
            result = Registry.getGroupsForVO(self.vo)
            if not result["OK"]:
                return result
            # Only the groups with the NormalUser property are eligible
            for voGroup in result["Value"]:
                if "NormalUser" in Registry.getPropertiesForGroup(voGroup):
                    self.voGroups.append(voGroup)

        # Generic pilot credentials, each of them can be overridden by an agent option
        result = findGenericPilotCredentials(vo=self.vo)
        if not result["OK"]:
            return result
        genericDN, genericGroup = result["Value"]
        self.pilotDN = self.am_getOption("PilotDN", genericDN)
        self.pilotGroup = self.am_getOption("PilotGroup", genericGroup)

        self.platforms = []
        self.sites = []
        # The submit pools come from the group if one is set, else from the VO
        if self.group:
            self.defaultSubmitPools = Registry.getGroupOption(self.group, "SubmitPools", "")
        elif self.vo:
            self.defaultSubmitPools = Registry.getVOOption(self.vo, "SubmitPools", "")
        else:
            self.defaultSubmitPools = ""

        self.pilot = self.am_getOption("PilotScript", DIRAC_PILOT)
        self.install = DIRAC_INSTALL
        self.workingDirectory = self.am_getOption("WorkDirectory")
        self.maxQueueLength = self.am_getOption("MaxQueueLength", 86400 * 3)
        self.pilotLogLevel = self.am_getOption("PilotLogLevel", "INFO")
        self.maxJobsInFillMode = self.am_getOption("MaxJobsInFillMode", self.maxJobsInFillMode)
        self.maxPilotsToSubmit = self.am_getOption("MaxPilotsToSubmit", self.maxPilotsToSubmit)
        self.pilotWaitingFlag = self.am_getOption("PilotWaitingFlag", True)
        self.pilotWaitingTime = self.am_getOption("MaxPilotWaitingTime", 7200)

        # Flags
        self.updateStatus = self.am_getOption("UpdatePilotStatus", True)
        self.getOutput = self.am_getOption("GetPilotOutput", True)
        self.sendAccounting = self.am_getOption("SendPilotAccounting", True)

        # Get the site description dictionary; None means no restriction
        siteNames = None
        if self.am_getOption("Site", "Any").lower() != "any":
            siteNames = self.am_getOption("Site", [])
        ceTypes = None
        if self.am_getOption("CETypes", "Any").lower() != "any":
            ceTypes = self.am_getOption("CETypes", [])
        ces = None
        if self.am_getOption("CEs", "Any").lower() != "any":
            ces = self.am_getOption("CEs", [])

        result = Resources.getQueues(
            community=self.vo, siteList=siteNames, ceList=ces, ceTypeList=ceTypes, mode="Direct"
        )
        if not result["OK"]:
            return result
        result = self.getQueues(result["Value"])
        if not result["OK"]:
            return result

        # if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        # self.siteNames = siteNames

        # Report the enabled features
        requestedFeatures = (
            (self.updateStatus, "Pilot status update requested"),
            (self.getOutput, "Pilot output retrieval requested"),
            (self.sendAccounting, "Pilot accounting sending requested"),
        )
        for enabled, message in requestedFeatures:
            if enabled:
                self.log.always(message)

        # Report the effective configuration
        reportedSettings = (
            ("Sites:", siteNames),
            ("CETypes:", ceTypes),
            ("CEs:", ces),
            ("PilotDN:", self.pilotDN),
            ("PilotGroup:", self.pilotGroup),
            ("MaxPilotsToSubmit:", self.maxPilotsToSubmit),
            ("MaxJobsInFillMode:", self.maxJobsInFillMode),
        )
        for label, value in reportedSettings:
            self.log.always(label, value)

        self.localhost = socket.getfqdn()
        self.proxy = ""

        if self.queueDict:
            self.log.always("Agent will serve queues:")
            for queue in self.queueDict:
                queueInfo = self.queueDict[queue]
                self.log.always(
                    "Site: %s, CE: %s, Queue: %s" % (queueInfo["Site"], queueInfo["CEName"], queue)
                )

        return S_OK()
Exemple #30
0
    def getTicketsList(self, name, startDate=None, endDate=None):
        """Return tickets of entity in name

        The tickets are retrieved from the GGUS web service and summarised by
        self.globalStatistics(); a link to the GGUS ticket search page for the
        same site is returned along with the summary.

        @param name: should be the name of the site
        @param startDate: starting date (optional)
        @param endDate: end date (optional)
        @return: S_OK( ( self.statusCount, ggusURL, self.shortDescription ) )
        """
        self.statusCount = {}
        self.shortDescription = {}

        # create client instance using GGUS wsdl:
        gclient = Client("https://prod-ars.ggus.eu/arsys/WSDL/public/prod-ars/GGUS")
        authInfo = gclient.factory.create("AuthenticationInfo")
        authInfo.userName = "******"
        authInfo.password = "******"
        gclient.set_options(soapheaders=authInfo)
        # prepare the query string:
        extension = CSGlobals.getCSExtensions()[0].lower()
        query = "'GHD_Affected Site'=\"" + name + '" AND \'GHD_Affected VO\'="%s"' % extension
        if startDate is not None:
            query = query + " AND 'GHD_Date Of Creation'>" + str(startDate)
        if endDate is not None:
            query = query + " AND 'GHD_Date Of Creation'<" + str(endDate)

        # create the URL to get tickets relative to the site:
        # Updated from https://gus.fzk.de to https://ggus.eu
        # The adjacent literals form ONE template, so that the % operator fills
        # both the vo and the affectedsite placeholders. Mixing "+" with "%"
        # would apply the formatting to the trailing literal only, as "%" binds
        # tighter than "+".
        ggusURL = (
            "https://ggus.eu/ws/ticket_search.php?show_columns_check[]=REQUEST_ID&"
            "show_columns_check[]=TICKET_TYPE&"
            "show_columns_check[]=AFFECTED_VO&"
            "show_columns_check[]=AFFECTED_SITE&"
            "show_columns_check[]=RESPONSIBLE_UNIT&"
            "show_columns_check[]=STATUS&"
            "show_columns_check[]=DATE_OF_CREATION&"
            "show_columns_check[]=LAST_UPDATE&"
            "show_columns_check[]=SHORT_DESCRIPTION&"
            "ticket=&"
            "supportunit=all&"
            "vo=%s&"
            "user=&"
            "keyword=&"
            "involvedsupporter=&"
            "assignto=&"
            "affectedsite=%s&"
            "specattrib=0&"
            "status=open&"
            "priority=all&"
            "typeofproblem=all&"
            "mouarea=&"
            "radiotf=1&"
            "timeframe=any&"
            "tf_date_day_s=&"
            "tf_date_month_s=&"
            "tf_date_year_s=&"
            "tf_date_day_e=&"
            "tf_date_month_e=&"
            "tf_date_year_e=&"
            "lm_date_day=12&"
            "lm_date_month=2&"
            "lm_date_year=2010&"
            "orderticketsby=GHD_INT_REQUEST_ID&"
            "orderhow=descending"
        ) % (extension, name)

        # the query must be into a try block. Empty queries, though formally correct, raise an exception
        try:
            self.ticketList = gclient.service.TicketGetList(query)
            self.globalStatistics()
        except WebFault:
            self.statusCount["terminal"] = 0
            self.statusCount["open"] = 0

        return S_OK((self.statusCount, ggusURL, self.shortDescription))
Exemple #31
0
  def addShifter( self, shifters = None ):
    """
    Adds or modifies one or more shifters. Also adds the shifter section in case this is not present.
    Shifter identities are used in several places, mostly for running agents.

    Roles whose definition already matches what the Operations helper reports are
    left untouched. The dictionary passed by the caller is never modified.

    :param dict shifters: roles to add or modify, in the form
                          {'ShifterRole':{'User':'aUserName', 'Group':'aDIRACGroup'}}.
                          None is treated as an empty dictionary.

    :return: S_OK( True ) on success; the failed initialization result if the object
             is not initialized; the failed getOptionsDict result if a current shifter
             cannot be read; S_ERROR if the shifter section cannot be located or created
    """

    def getOpsSection():
      """
      Where is the shifters section?

      Probes the setup-specific section first and the Defaults one second
      (both under the VO when CSGlobals reports one).

      :return: S_OK( sectionPath ) for the first existing candidate,
               S_ERROR( "No shifter section" ) when none exists
      """
      vo = CSGlobals.getVO()
      setup = CSGlobals.getSetup()

      if vo:
        candidates = ( '/Operations/%s/%s/Shifter' % ( vo, setup ),
                       '/Operations/%s/Defaults/Shifter' % vo )
      else:
        candidates = ( '/Operations/%s/Shifter' % setup,
                       '/Operations/Defaults/Shifter' )

      for candidate in candidates:
        if gConfig.getSections( candidate )['OK']:
          return S_OK( candidate )

      return S_ERROR( "No shifter section" )

    if not self.__initialized['OK']:
      return self.__initialized
    if shifters is None:
      shifters = {}

    # get current shifters
    opsH = Operations( )
    currentShifterRoles = opsH.getSections( 'Shifter' )
    if not currentShifterRoles['OK']:
      # we assume the shifter section is not present
      currentShifterRoles = []
    else:
      currentShifterRoles = currentShifterRoles['Value']
    currentShiftersDict = {}
    for currentShifterRole in currentShifterRoles:
      currentShifter = opsH.getOptionsDict( 'Shifter/%s' % currentShifterRole )
      if not currentShifter['OK']:
        return currentShifter
      currentShiftersDict[currentShifterRole] = currentShifter['Value']

    # Keep only the shifters that are new or whose definition differs.
    # A separate dictionary is built on purpose: popping entries from "shifters"
    # while iterating over it raises RuntimeError and would also alter the
    # caller's dictionary.
    shiftersToSet = {}
    for sRole in shifters:
      if sRole in currentShiftersDict and currentShiftersDict[sRole] == shifters[sRole]:
        continue
      shiftersToSet[sRole] = shifters[sRole]

    # get shifters section to modify
    section = getOpsSection()

    # Is this section present?
    if not section['OK']:
      if section['Message'] == "No shifter section":
        gLogger.warn( section['Message'] )
        gLogger.info( "Adding shifter section" )
        vo = CSGlobals.getVO()
        if vo:
          section = '/Operations/%s/Defaults/Shifter' % vo
        else:
          section = '/Operations/Defaults/Shifter'
        res = self.__csMod.createSection( section )
        if not res:
          gLogger.error( "Section %s not created" % section )
          return S_ERROR( "Section %s not created" % section )
      else:
        gLogger.error( section['Message'] )
        return section
    else:
      section = section['Value']

    # add or modify shifters: each role section is dropped and rebuilt from scratch
    for shifter in shiftersToSet:
      shifterPath = section + '/' + shifter
      self.__csMod.removeSection( shifterPath )
      self.__csMod.createSection( shifterPath )
      self.__csMod.createSection( shifterPath + '/' + 'User' )
      self.__csMod.createSection( shifterPath + '/' + 'Group' )
      self.__csMod.setOptionValue( shifterPath + '/' + 'User', shiftersToSet[shifter]['User'] )
      self.__csMod.setOptionValue( shifterPath + '/' + 'Group', shiftersToSet[shifter]['Group'] )

    self.__csModified = True
    return S_OK( True )
Exemple #32
0
 def __getVOPath(self):
     """
     Return the root path of the Operations section to use.

     When CSGlobals.getVO() returns a non-empty value the plain "/Operations"
     root is returned; otherwise the path is qualified with the VO stored in
     the thread data of this object.
     """
     globalVO = CSGlobals.getVO()
     if not globalVO:
         # No global VO: use the VO held in the thread data
         return "/Operations/%s" % self.__threadData.vo
     return "/Operations"
Exemple #33
0
    def addShifter(self, shifters=None):
        """
        Adds or modifies one or more shifters. Also adds the shifter section in case this is not present.
        Shifter identities are used in several places, mostly for running agents.

        Roles whose definition already matches what the Operations helper reports are
        left untouched. The dictionary passed by the caller is never modified.

        :param dict shifters: roles to add or modify, in the form
                              {'ShifterRole':{'User':'aUserName', 'Group':'aDIRACGroup'}}.
                              None is treated as an empty dictionary.

        :return: S_OK(True) on success; the failed initialization result if the object
                 is not initialized; the failed getOptionsDict result if a current shifter
                 cannot be read; S_ERROR if the shifter section cannot be located or created
        """
        def getOpsSection():
            """
            Where is the shifters section?

            Probes the setup-specific section first and the Defaults one second
            (both under the VO when CSGlobals reports one).

            :return: S_OK(sectionPath) for the first existing candidate,
                     S_ERROR("No shifter section") when none exists
            """
            vo = CSGlobals.getVO()
            setup = CSGlobals.getSetup()

            if vo:
                candidates = ('/Operations/%s/%s/Shifter' % (vo, setup),
                              '/Operations/%s/Defaults/Shifter' % vo)
            else:
                candidates = ('/Operations/%s/Shifter' % setup,
                              '/Operations/Defaults/Shifter')

            for candidate in candidates:
                if gConfig.getSections(candidate)['OK']:
                    return S_OK(candidate)

            return S_ERROR("No shifter section")

        if not self.__initialized['OK']:
            return self.__initialized
        if shifters is None:
            shifters = {}

        # get current shifters
        opsH = Operations()
        currentShifterRoles = opsH.getSections('Shifter')
        if not currentShifterRoles['OK']:
            # we assume the shifter section is not present
            currentShifterRoles = []
        else:
            currentShifterRoles = currentShifterRoles['Value']
        currentShiftersDict = {}
        for currentShifterRole in currentShifterRoles:
            currentShifter = opsH.getOptionsDict('Shifter/%s' %
                                                 currentShifterRole)
            if not currentShifter['OK']:
                return currentShifter
            currentShiftersDict[currentShifterRole] = currentShifter['Value']

        # Keep only the shifters that are new or whose definition differs.
        # A separate dictionary is built on purpose: popping entries from
        # "shifters" while iterating over it raises RuntimeError and would
        # also alter the caller's dictionary.
        shiftersToSet = {}
        for sRole in shifters:
            if (sRole in currentShiftersDict
                    and currentShiftersDict[sRole] == shifters[sRole]):
                continue
            shiftersToSet[sRole] = shifters[sRole]

        # get shifters section to modify
        section = getOpsSection()

        # Is this section present?
        if not section['OK']:
            if section['Message'] == "No shifter section":
                gLogger.warn(section['Message'])
                gLogger.info("Adding shifter section")
                vo = CSGlobals.getVO()
                if vo:
                    section = '/Operations/%s/Defaults/Shifter' % vo
                else:
                    section = '/Operations/Defaults/Shifter'
                res = self.__csMod.createSection(section)
                if not res:
                    gLogger.error("Section %s not created" % section)
                    return S_ERROR("Section %s not created" % section)
            else:
                gLogger.error(section['Message'])
                return section
        else:
            section = section['Value']

        # add or modify shifters: each role section is dropped and rebuilt
        for shifter in shiftersToSet:
            shifterPath = section + '/' + shifter
            self.__csMod.removeSection(shifterPath)
            self.__csMod.createSection(shifterPath)
            self.__csMod.createSection(shifterPath + '/' + 'User')
            self.__csMod.createSection(shifterPath + '/' + 'Group')
            self.__csMod.setOptionValue(shifterPath + '/' + 'User',
                                        shiftersToSet[shifter]['User'])
            self.__csMod.setOptionValue(shifterPath + '/' + 'Group',
                                        shiftersToSet[shifter]['Group'])

        self.__csModified = True
        return S_OK(True)