Example #1
0
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ Set up the databases, helpers and clients used by the matcher.

        Every argument is optional: when it is falsy (the default ``None``
        included) a fresh instance of the corresponding class is created.

        :param pilotAgentsDB: object to use instead of ``PilotAgentsDB()``
        :param jobDB: object to use instead of ``JobDB()``
        :param tqDB: object to use instead of ``TaskQueueDB()``
        :param jlDB: object to use instead of ``JobLoggingDB()``
        :param opsHelper: object to use instead of ``Operations()``
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        # The limiter works on the job DB and operations helper selected above
        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()
Example #2
0
 def initialize(self):
     """ Set the option values and the initial state of the agent.

     Calls ``am_setOption`` for "PollingTime" (60.0) and
     "maxPilotWaitingHours" (6), resets the queue dictionary, copies the
     MAX_JOBS_IN_FILLMODE / MAX_PILOTS_TO_SUBMIT constants onto the
     instance and creates the SiteStatus client.

     :returns: S_OK()
     """
     for optionName, optionValue in (("PollingTime", 60.0),
                                     ("maxPilotWaitingHours", 6)):
         self.am_setOption(optionName, optionValue)
     self.queueDict = {}
     self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
     self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
     self.siteStatus = SiteStatus()
     return S_OK()
Example #3
0
 def _updateSiteMask( self, sitesData ):
   """ Tag every site of sitesData with its 'siteMaskStatus'.

   A site is marked 'Allowed' when SiteStatus.isUsableSite() reports it
   usable for 'ComputingAccess' and 'Banned' otherwise. The per-site
   dictionaries are modified in place.

   :param dict sitesData: site name -> dictionary with the site information
   :returns: S_OK( sitesData )
   """
   statusClient = SiteStatus()
   # Shallow snapshot of the mapping: the per-site dictionaries stay shared
   snapshot = dict( sitesData )
   for siteName, siteInfo in snapshot.items():
     #
     #FIXME: we are only taking into account ComputingAccess
     #
     usable = statusClient.isUsableSite( siteName, 'ComputingAccess' )
     maskValue = 'Allowed' if usable else 'Banned'
     siteInfo[ 'siteMaskStatus' ] = maskValue
     sitesData[ siteName ][ 'siteMaskStatus' ] = maskValue
   return S_OK( sitesData )
Example #4
0
 def _updateSiteMask(self, sitesData):
     """ Tag every site of sitesData with its 'siteMaskStatus'.

     A site is marked 'Allowed' when SiteStatus.isUsableSite() reports it
     usable for 'ComputingAccess' and 'Banned' otherwise. The per-site
     dictionaries are modified in place.

     :param dict sitesData: site name -> dictionary with the site information
     :returns: S_OK(sitesData)
     """
     statusClient = SiteStatus()
     # Shallow snapshot of the mapping: the per-site dictionaries stay shared
     snapshot = dict(sitesData)
     for siteName, siteInfo in snapshot.items():
         #
         #FIXME: we are only taking into account ComputingAccess
         #
         usable = statusClient.isUsableSite(siteName, 'ComputingAccess')
         maskValue = 'Allowed' if usable else 'Banned'
         siteInfo['siteMaskStatus'] = maskValue
         sitesData[siteName]['siteMaskStatus'] = maskValue
     return S_OK(sitesData)
Example #5
0
    def initialize(self):
        """ Create the thread pool and the clients used by the agent.

        The pool size comes from the 'maxNumberOfThreads' option (falling
        back to the private class default) and is passed twice to
        ThreadPool. The SiteStatus client is kept both as ``self.siteClient``
        and in ``self.clients``.

        :returns: S_OK()
        """
        poolSize = self.am_getOption('maxNumberOfThreads',
                                     self.__maxNumberOfThreads)
        self.threadPool = ThreadPool(poolSize, poolSize)

        siteClient = SiteStatus()
        self.siteClient = siteClient

        self.clients['SiteStatus'] = siteClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()

        return S_OK()
Example #6
0
    def __init__(self):
        """Set up the DIRAC Admin API object.

        Creates the CS API handle, derives the debug flag from the
        ``<section>/LogLevel`` option, records the scratch and current
        directories and instantiates the status clients.
        """
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        # Debug mode is on only when the configured level is exactly "DEBUG"
        logLevel = gConfig.getValue(self.section + "/LogLevel", "DEBUG")
        self.dbg = logLevel == "DEBUG"

        scratchOption = self.section + "/ScratchDir"
        self.scratchDir = gConfig.getValue(scratchOption, "/tmp")
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()
Example #7
0
class InputDataValidation( OptimizerExecutor ):
  """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

  @classmethod
  def initializeOptimizer( cls ):
    """ Initialization of the Agent.

        Seeds the random generator, creates the class-level caches, the
        JobDB connection and the SiteStatus client, and sets the
        "FailedStatus" option.

        :returns: S_OK() on success, S_ERROR if JobDB cannot be imported
                  or instantiating it raises RuntimeError
    """
    random.seed()
    cls.__SEStatus = DictCache.DictCache()
    cls.__sitesForSE = DictCache.DictCache()
    # JobDB is imported here so that an import failure is reported as S_ERROR
    try:
      from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    except ImportError as excp:
      return S_ERROR( "Could not import JobDB: %s" % str( excp ) )

    try:
      cls.__jobDB = JobDB()
    except RuntimeError:
      return S_ERROR( "Cannot connect to JobDB" )

    cls.__siteStatus = SiteStatus()

    cls.ex_setOption( "FailedStatus", "Input Data Not Available" )
    return S_OK()
Example #8
0
  def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Set up the databases, helpers and clients used by the matcher.

    Every argument is optional: when it is falsy (the default ``None``
    included) a fresh instance of the corresponding class is created.

    :param pilotAgentsDB: object to use instead of ``PilotAgentsDB()``
    :param jobDB: object to use instead of ``JobDB()``
    :param tqDB: object to use instead of ``TaskQueueDB()``
    :param jlDB: object to use instead of ``JobLoggingDB()``
    :param opsHelper: object to use instead of ``Operations()``
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter works on the job DB and operations helper selected above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()
Example #9
0
    def __init__(self):
        """Set up the DIRAC Admin API object.

        Creates the CS API handle, derives the debug flag from the
        ``<section>/LogLevel`` option, records the scratch and current
        directories, instantiates the status clients and caches the set
        of sites returned by getSites().
        """
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        # Debug mode is on only when the configured level is exactly 'DEBUG'
        logLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
        self.dbg = logLevel == 'DEBUG'

        scratchOption = self.section + '/ScratchDir'
        self.scratchDir = gConfig.getValue(scratchOption, '/tmp')
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()
        # An empty set is used when the getSites() result has no 'Value'
        knownSites = getSites().get('Value', [])
        self._siteSet = set(knownSites)
Example #10
0
    def printCEInfo(voName):
        """ Print a table with the queues of every CE available to a community.

        One row is produced per queue; the site name is shown only on the
        first row of a site, the CE name and type only on the first row of
        a CE. The status column holds the site status ("Active" when the
        site is among the usable sites, "InActive" otherwise) unless the
        ResourceStatus flag is set and the CE status could be retrieved.

        :param voName: community passed to Resources.getQueues()
        :returns: S_OK(), or the failed result of SiteStatus.getUsableSites()
        """
        resultQueues = Resources.getQueues(community=voName)
        if not resultQueues["OK"]:
            gLogger.error("Failed to get CE information")
            DIRACExit(-1)

        # get list of usable sites within this cycle
        usableResult = SiteStatus().getUsableSites()
        if not usableResult["OK"]:
            return usableResult
        usableSites = usableResult.get("Value", [])

        statusClient = ResourceStatus()
        queuesBySite = resultQueues["Value"]

        rows = []
        for siteName in queuesBySite:
            siteState = "InActive"
            if siteName in usableSites:
                siteState = "Active"
            firstRowOfSite = True
            for ceName in queuesBySite[siteName]:
                ceInfo = queuesBySite[siteName][ceName]

                # The site status is the fallback for the CE status
                ceState = siteState
                if statusClient.rssFlag:
                    elementStatus = statusClient.getElementStatus(
                        ceName, "ComputingElement")
                    if elementStatus["OK"]:
                        ceState = elementStatus["Value"][ceName]["all"]

                for position, queueName in enumerate(ceInfo["Queues"]):
                    firstRowOfCE = position == 0
                    shownSite = siteName if firstRowOfSite else ""
                    shownCE = ceName if firstRowOfCE else ""
                    shownType = ceInfo["CEType"] if firstRowOfCE else ""
                    rows.append(
                        (shownSite, shownCE, shownType, queueName, ceState))
                    firstRowOfSite = False

        header = ("Site", "CE", "CEType", "Queue", "Status")
        table = printTable(header, rows, printOut=False, columnSeparator="  ")
        gLogger.notice(table)
        return S_OK()
Example #11
0
def printCEInfo(voName):
    """ Print a table with the queues of every CE available to a community.

    One row is produced per queue; the site name is shown only on the
    first row of a site, the CE name and type only on the first row of
    a CE. The status column holds the site status ("Active" when the
    site is among the usable sites, "InActive" otherwise) unless the
    ResourceStatus flag is set and the CE status could be retrieved.

    :param voName: community passed to Resources.getQueues()
    :returns: S_OK(), or the failed result of SiteStatus.getUsableSites()
    """
    resultQueues = Resources.getQueues(community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)

    # get list of usable sites within this cycle
    usableResult = SiteStatus().getUsableSites()
    if not usableResult['OK']:
        return usableResult
    usableSites = usableResult.get('Value', [])

    statusClient = ResourceStatus()
    queuesBySite = resultQueues['Value']

    rows = []
    for siteName in queuesBySite:
        siteState = "InActive"
        if siteName in usableSites:
            siteState = "Active"
        firstRowOfSite = True
        for ceName in queuesBySite[siteName]:
            ceInfo = queuesBySite[siteName][ceName]

            # The site status is the fallback for the CE status
            ceState = siteState
            if statusClient.rssFlag:
                elementStatus = statusClient.getElementStatus(
                    ceName, "ComputingElement")
                if elementStatus['OK']:
                    ceState = elementStatus['Value'][ceName]['all']

            for position, queueName in enumerate(ceInfo['Queues']):
                firstRowOfCE = position == 0
                shownSite = siteName if firstRowOfSite else ''
                shownCE = ceName if firstRowOfCE else ''
                shownType = ceInfo['CEType'] if firstRowOfCE else ''
                rows.append(
                    (shownSite, shownCE, shownType, queueName, ceState))
                firstRowOfSite = False

    header = ("Site", 'CE', 'CEType', 'Queue', 'Status')
    table = printTable(header, rows, printOut=False, columnSeparator='  ')
    gLogger.notice(table)
    return S_OK()
Example #12
0
  def __init__( self ):
    """
    Constructor: runs the parent initialization, creates the SiteStatus
    client and builds the storage and computing caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super( ResourceStatus, self ).__init__()

    self.siteStatus = SiteStatus()

    # A single lifetime, read through RssConfiguration, is shared by both caches
    lifetime = int( RssConfiguration().getConfigCache() )

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    storageCache = RSSCache( 'Storage', lifetime, self._updateSECache )
    self.seCache = storageCache
    computingCache = RSSCache( 'Computing', lifetime, self._updateCECache )
    self.ceCache = computingCache
Example #13
0
 def initialize( self ):
   """ Set the option values and the initial state of the agent.

   Calls ``am_setOption`` for "PollingTime" (60.0) and
   "maxPilotWaitingHours" (6), resets the queue dictionary, copies the
   MAX_JOBS_IN_FILLMODE / MAX_PILOTS_TO_SUBMIT constants onto the
   instance and creates the SiteStatus client.

   :returns: S_OK()
   """
   for optionName, optionValue in ( ( "PollingTime", 60.0 ),
                                    ( "maxPilotWaitingHours", 6 ) ):
     self.am_setOption( optionName, optionValue )
   self.queueDict = {}
   self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
   self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
   self.siteStatus = SiteStatus()
   return S_OK()
Example #14
0
  def __init__( self, submitPool ):
    """
     Create the logger and load the default settings of the pilot director.

     :param submitPool: name of the submit pool; it is part of the logger
                        name unless it equals ``self.gridMiddleware``, and
                        is always embedded in the SubmitPool pilot option
    """

    # Logger name: '<middleware>PilotDirector', plus '/<pool>' for other pools
    if submitPool == self.gridMiddleware:
      loggerName = '%sPilotDirector' % self.gridMiddleware
    else:
      loggerName = '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool )
    self.log = gLogger.getSubLogger( loggerName )

    # Pilot script and installation defaults
    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [ self.gridMiddleware ]

    # List-match settings
    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    # Error handling and mail settings
    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    self.siteClient = SiteStatus()

    # Fallback logger in case 'log' is not stored in the instance dictionary
    if 'log' not in self.__dict__:
      self.log = gLogger.getSubLogger( 'PilotDirector' )
    self.log.info( 'Initialized' )
Example #15
0
  def __checkSitesInMask( self, job, siteCandidates ):
    """Keep only the candidate sites that are usable for 'ComputingAccess'.

       :param job: job identifier, only used in the log messages
       :param siteCandidates: iterable of candidate site names
       :returns: S_OK( list of candidates found among the usable sites ),
                 S_ERROR if the usable sites cannot be retrieved
    """

    maskResult = SiteStatus().getUsableSites( 'ComputingAccess' )
    if not maskResult['OK']:
      return S_ERROR( 'Could not get site mask' )
    allowedSites = maskResult['Value']

    sitesInMask = []
    for candidate in siteCandidates:
      if candidate in allowedSites:
        sitesInMask.append( candidate )
        continue
      # Rejected candidates are only reported at verbose level
      self.log.verbose( '%s is a candidate site for job %s but not in mask' % ( candidate, job ) )

    self.log.info( 'Candidate sites in Mask are %s' % ( sitesInMask ) )

    return S_OK( sitesInMask )
Example #16
0
  def getSiteMask( self, printOutput = False ):
    """Retrieve the sites currently usable for 'ComputingAccess'.

       The list is obtained from SiteStatus.getUsableSites() and the result
       structure of that call is returned as is.

       Example usage:

       >>> print( diracAdmin.getSiteMask() )
       {'OK': True, 'Value': [<usable site names>]}

       :param bool printOutput: when True, sort the site list in place and
                                print one site per line
       :returns: S_OK,S_ERROR

    """

    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites( 'ComputingAccess' )
    if result['OK']:
      sites = result['Value']
      if printOutput:
        # In-place sort: the returned result['Value'] is sorted as well
        sites.sort()
        for site in sites:
          # Parenthesised form prints the same on Python 2 and Python 3
          print( site )

    return result
Example #17
0
  def initialize( self ):
    """ Create the thread pool and the clients used by the agent.

    The pool size comes from the 'maxNumberOfThreads' option (falling
    back to the private class default) and is passed twice to
    ThreadPool. The SiteStatus client is kept both as ``self.siteClient``
    and in ``self.clients``.

    :returns: S_OK()
    """

    poolSize = self.am_getOption( 'maxNumberOfThreads', self.__maxNumberOfThreads )
    self.threadPool = ThreadPool( poolSize, poolSize )

    siteClient = SiteStatus()
    self.siteClient = siteClient

    self.clients['SiteStatus'] = siteClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()
Example #18
0
    def __checkSitesInMask(self, job, siteCandidates):
        """Keep only the candidate sites that are usable for 'ComputingAccess'.

        :param job: job identifier, only used in the log messages
        :param siteCandidates: iterable of candidate site names
        :returns: S_OK(list of candidates found among the usable sites),
                  S_ERROR if the usable sites cannot be retrieved
        """

        maskResult = SiteStatus().getUsableSites('ComputingAccess')
        if not maskResult['OK']:
            return S_ERROR('Could not get site mask')
        allowedSites = maskResult['Value']

        sitesInMask = []
        for candidate in siteCandidates:
            if candidate in allowedSites:
                sitesInMask.append(candidate)
                continue
            # Rejected candidates are only reported at verbose level
            self.log.verbose(
                '%s is a candidate site for job %s but not in mask' %
                (candidate, job))

        self.log.info('Candidate sites in Mask are %s' % (sitesInMask))

        return S_OK(sitesInMask)
Example #19
0
    def getSiteMask(self, printOutput=False):
        """Retrieve the sites currently usable for 'ComputingAccess'.

        The list is obtained from SiteStatus.getUsableSites() and the result
        structure of that call is returned as is.

        Example usage:

        >>> print(diracAdmin.getSiteMask())
        {'OK': True, 'Value': [<usable site names>]}

        :param bool printOutput: when True, sort the site list in place and
                                 print one site per line
        :returns: S_OK,S_ERROR

        """

        siteStatus = SiteStatus()
        result = siteStatus.getUsableSites('ComputingAccess')
        if result['OK']:
            sites = result['Value']
            if printOutput:
                # In-place sort: the returned result['Value'] is sorted as well
                sites.sort()
                for site in sites:
                    # Parenthesised form prints the same on Python 2 and 3
                    print(site)

        return result
Example #20
0
  def getBannedSites( self, printOutput = False ):
    """Retrieve current list of banned sites.

       The banned sites are the ones SiteStatus.getUnusableSites() reports
       for 'ComputingAccess'; the list is sorted in place before being
       returned.

       Example usage:

       >>> print( diracAdmin.getBannedSites() )
       {'OK': True, 'Value': []}

       :param bool printOutput: when True, print the sorted sites one per line
       :returns: S_OK,S_ERROR

    """
    siteStatus = SiteStatus()

    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    bannedSites = result['Value']

    bannedSites.sort()
    if printOutput:
      # Parenthesised form prints the same on Python 2 and Python 3
      print( '\n'.join( bannedSites ) )
    return S_OK( bannedSites )
Example #21
0
    def getBannedSites(self, printOutput=False):
        """Retrieve current list of banned sites.

        The banned sites are the ones SiteStatus.getUnusableSites() reports
        for 'ComputingAccess'; the list is sorted in place before being
        returned.

        Example usage:

        >>> print(diracAdmin.getBannedSites())
        {'OK': True, 'Value': []}

        :param bool printOutput: when True, print the sorted sites one per line
        :returns: S_OK,S_ERROR

        """
        siteStatus = SiteStatus()

        result = siteStatus.getUnusableSites('ComputingAccess')
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        bannedSites = result['Value']

        bannedSites.sort()
        if printOutput:
            # Parenthesised form prints the same on Python 2 and Python 3
            print('\n'.join(bannedSites))
        return S_OK(bannedSites)
Example #22
0
  def __init__(self):
    """Set up the DIRAC Admin API object.

    Creates the CS API handle, derives the debug flag from the
    ``<section>/LogLevel`` option, records the scratch and current
    directories and instantiates the status clients.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    # Debug mode is on only when the configured level is exactly 'DEBUG'
    logLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
    self.dbg = logLevel == 'DEBUG'

    scratchOption = self.section + '/ScratchDir'
    self.scratchDir = gConfig.getValue(scratchOption, '/tmp')
    self.currentDir = os.getcwd()
    self.rssFlag = ResourceStatus().rssFlag
    self.sitestatus = SiteStatus()
Example #23
0
  def optimizeJob( self, jid, jobState ):
    """ Decide what to do next with a job: hold it, send it to the task
        queue, or request the staging of its input data.

        Steps, in order:
          - hold a rescheduled job until its reschedule delay has elapsed
          - hold a job whose user-selected sites are all inactive or banned
            (unless its JobType is listed in 'ExcludedOnHoldJobTypes')
          - send a job without input data straight to the task queue
          - otherwise filter the input data site candidates and either send
            the job to the task queue (no staging needed) or request staging

        :param jid: job identifier (not used in this method body)
        :param jobState: object giving access to the job attributes, input
                         data and stored optimizer parameters
        :returns: S_OK/S_ERROR structure, as returned by the helper that
                  ends the processing
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ValueError:
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      # The counter is used as index into the delays, capped at the last entry
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self._getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get active and banned sites from DIRAC
    siteStatus = SiteStatus()
    result = siteStatus.getUsableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve active sites from JobDB" )
    usableSites = result[ 'Value' ]
    result = siteStatus.getUnusableSites( 'ComputingAccess' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    unusableSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      result = jobState.getAttribute( "JobType" )
      if not result[ 'OK' ]:
        return S_ERROR( "Could not retrieve job type" )
      jobType = result[ 'Value' ]
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        sites = self._applySiteFilter( userSites, usableSites, unusableSites )
        if not sites:
          return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( userSites ) )

    # Get the Input data
    # Third, check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      # No input data? Generate requirements and next
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    self.jobLog.verbose( 'Has an input data requirement' )
    # The site candidates are read from the parameters stored under the
    # name given by the 'InputDataAgent' option
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
      return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self._applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    #Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    # Removal is done in a second pass so that idSites is not modified while iterated
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    #Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, usableSites, unusableSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      if not self.__checkStageAllowed( jobState ):
        return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self._updateSharedSESites( stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self._setJobSite( jobState, stageSites )
Example #24
0
class SiteDirector(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """
    def initialize(self):
        """ Set the option values and the initial state of the agent.

        Calls ``am_setOption`` for "PollingTime" (60.0) and
        "maxPilotWaitingHours" (6), resets the queue dictionary, copies the
        MAX_JOBS_IN_FILLMODE / MAX_PILOTS_TO_SUBMIT constants onto the
        instance and creates the SiteStatus client.

        :returns: S_OK()
        """
        for optionName, optionValue in (("PollingTime", 60.0),
                                        ("maxPilotWaitingHours", 6)):
            self.am_setOption(optionName, optionValue)
        self.queueDict = {}
        self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.siteStatus = SiteStatus()
        return S_OK()

    def beginExecution(self):
        """ Read the agent options and build the description of the queues
        to be served.

        Resolves the community and the eligible user groups, the pilot
        credentials, the submission settings and flags, then asks the
        Resources helper for the eligible queues and passes them to
        getQueues(). The resulting configuration is written to the log.

        :returns: S_OK() on success, otherwise the failed result of
                  Registry.getGroupsForVO(), findGenericPilotCredentials(),
                  getEligibleQueuesInfo() or getQueues()
        """

        self.gridEnv = self.am_getOption("GridEnv", getGridEnv())
        # The SiteDirector is for a particular user community
        self.vo = self.am_getOption("Community", '')
        if not self.vo:
            self.vo = CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", '')
        # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector
        self.voGroups = []

        # Choose the group for which pilots will be submitted. This is a hack until
        # we will be able to match pilots to VOs.
        if not self.group:
            if self.vo:
                result = Registry.getGroupsForVO(self.vo)
                if not result['OK']:
                    return result
                # Only the groups having the 'NormalUser' property are kept
                for group in result['Value']:
                    if 'NormalUser' in Registry.getPropertiesForGroup(group):
                        self.voGroups.append(group)
        else:
            self.voGroups = [self.group]

        # The generic pilot credentials are the defaults for the
        # PilotDN / PilotGroup options
        result = findGenericPilotCredentials(vo=self.vo)
        if not result['OK']:
            return result
        self.pilotDN, self.pilotGroup = result['Value']
        self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
        self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

        self.platforms = []
        self.sites = []
        # The group setting takes precedence over the VO one
        self.defaultSubmitPools = ''
        if self.group:
            self.defaultSubmitPools = Registry.getGroupOption(
                self.group, 'SubmitPools', '')
        elif self.vo:
            self.defaultSubmitPools = Registry.getVOOption(
                self.vo, 'SubmitPools', '')

        self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT)
        self.install = DIRAC_INSTALL
        self.workingDirectory = self.am_getOption('WorkDirectory')
        self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3)
        self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO')
        self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode',
                                                   self.maxJobsInFillMode)
        self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit',
                                                   self.maxPilotsToSubmit)
        self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True)
        self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 7200)

        # Flags
        self.updateStatus = self.am_getOption('UpdatePilotStatus', True)
        self.getOutput = self.am_getOption('GetPilotOutput', True)
        self.sendAccounting = self.am_getOption('SendPilotAccounting', True)

        # Get the site description dictionary
        # For each selector the value "Any" (case insensitive) leaves it as None
        siteNames = None
        if not self.am_getOption('Site', 'Any').lower() == "any":
            siteNames = self.am_getOption('Site', [])
        ceTypes = None
        if not self.am_getOption('CETypes', 'Any').lower() == "any":
            ceTypes = self.am_getOption('CETypes', [])
        ces = None
        if not self.am_getOption('CEs', 'Any').lower() == "any":
            ces = self.am_getOption('CEs', [])

        self._resources = Resources.Resources(vo=self.vo)
        result = self._resources.getEligibleQueuesInfo(siteList=siteNames,
                                                       ceList=ces,
                                                       ceTypeList=ceTypes,
                                                       mode='Direct')
        if not result['OK']:
            return result
        resourceDict = result['Value']
        result = self.getQueues(resourceDict)
        if not result['OK']:
            return result

        #if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        #self.siteNames = siteNames

        if self.updateStatus:
            self.log.always('Pilot status update requested')
        if self.getOutput:
            self.log.always('Pilot output retrieval requested')
        if self.sendAccounting:
            self.log.always('Pilot accounting sending requested')

        self.log.always('Sites:', siteNames)
        self.log.always('CETypes:', ceTypes)
        self.log.always('CEs:', ces)
        self.log.always('PilotDN:', self.pilotDN)
        self.log.always('PilotGroup:', self.pilotGroup)
        self.log.always('MaxPilotsToSubmit:', self.maxPilotsToSubmit)
        self.log.always('MaxJobsInFillMode:', self.maxJobsInFillMode)

        self.localhost = socket.getfqdn()
        self.proxy = ''

        if self.queueDict:
            self.log.always("Agent will serve queues:")
            for queue in self.queueDict:
                self.log.always("Site: %s, CE: %s, Queue: %s" %
                                (self.queueDict[queue]['Site'],
                                 self.queueDict[queue]['CEName'], queue))

        return S_OK()

    def getQueues(self, resourceDict):
        """ Build ``self.queueDict`` from the description of the eligible CEs.

        For every queue of every CE an entry keyed '<ce>_<queue>' is created,
        holding the queue parameters, the CE object built by the
        ComputingElementFactory and some identification fields. Sites whose
        full name cannot be resolved are skipped. The 'Queues' entry is
        popped from each CE dictionary of resourceDict, and the queue
        parameter dictionaries are completed in place.

        :param dict resourceDict: site -> CE -> CE description
        :returns: S_OK() on success, otherwise the failed result of the CE
                  creation or of its validity check
        """

        self.queueDict = {}
        ceFactory = ComputingElementFactory()

        for site in resourceDict:
            nameResult = self._resources.getSiteFullName(site)
            if not nameResult['OK']:
                continue
            siteFullName = nameResult['Value']
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                qDict = ceDict.pop('Queues')
                for queue in qDict:
                    queueName = '%s_%s' % (ce, queue)
                    queueEntry = {}
                    self.queueDict[queueName] = queueEntry
                    # Same object as qDict[queue]: it is completed in place
                    params = qDict[queue]
                    queueEntry['ParametersDict'] = params
                    params['Queue'] = queue
                    params['Site'] = siteFullName
                    params['GridEnv'] = self.gridEnv
                    params['Setup'] = gConfig.getValue('/DIRAC/Setup',
                                                       'unknown')
                    # Evaluate the CPU limit of the queue according to the Glue convention
                    # To Do: should be a utility
                    if "maxCPUTime" in params and "SI00" in params:
                        maxCPUTime = float(params['maxCPUTime'])
                        # For some sites there are crazy values in the CS
                        maxCPUTime = min(max(maxCPUTime, 0), 86400 * 12.5)
                        si00 = float(params['SI00'])
                        queueCPUTime = 60. / 250. * maxCPUTime * si00
                        params['CPUTime'] = int(queueCPUTime)

                    # One working directory per queue, created on demand
                    qwDir = os.path.join(self.workingDirectory, queue)
                    if not os.path.exists(qwDir):
                        os.makedirs(qwDir)
                    params['WorkingDirectory'] = qwDir

                    # Platform: queue setting first, then the CE one, then
                    # '<architecture>_<OS>' built from the CE description
                    platform = ''
                    if "Platform" in params:
                        platform = params['Platform']
                    elif "Platform" in ceDict:
                        platform = ceDict['Platform']
                    elif "OS" in ceDict:
                        architecture = ceDict.get('architecture', 'x86_64')
                        platform = '_'.join([architecture, ceDict['OS']])
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in params and platform:
                        platformResult = Resources.getDIRACPlatform(platform)
                        if platformResult['OK']:
                            params['Platform'] = platformResult['Value']

                    # The queue parameters override the CE level ones
                    ceQueueDict = dict(ceDict)
                    ceQueueDict.update(params)
                    ceResult = ceFactory.getCE(ceName=ce,
                                               ceType=ceDict['CEType'],
                                               ceParametersDict=ceQueueDict)
                    if not ceResult['OK']:
                        return ceResult
                    computingElement = ceResult['Value']
                    queueEntry['CE'] = computingElement
                    queueEntry['CEName'] = ce
                    queueEntry['CEType'] = ceDict['CEType']
                    queueEntry['Site'] = siteFullName
                    queueEntry['QueueName'] = queue
                    queueEntry['Platform'] = platform
                    validity = computingElement.isValid()
                    if not validity['OK']:
                        self.log.fatal(validity['Message'])
                        return validity
                    if 'BundleProxy' in params or 'BundleProxy' in ceDict:
                        queueEntry['BundleProxy'] = True

                    if siteFullName not in self.sites:
                        self.sites.append(siteFullName)

        return S_OK()

    def execute(self):
        """ Run one agent cycle: pilot submission, then the optional status update

        Problems reported by either step are only logged; the cycle itself
        always reports S_OK.
    """
        if self.queueDict:
            submission = self.submitJobs()
            if not submission['OK']:
                self.log.error('Errors in the job submission: ',
                               submission['Message'])

            if self.updateStatus:
                statusUpdate = self.updatePilotStatus()
                if not statusUpdate['OK']:
                    self.log.error('Errors in updating pilot status: ',
                                   statusUpdate['Message'])
        else:
            # Nothing to work on without configured queues
            self.log.warn('No site defined, exiting the cycle')

        return S_OK()

    def submitJobs(self):
        """ Go through defined computing elements and submit pilots if necessary

        The Matcher is first asked whether any task queue matches the overall
        requirements of this director; if none does, nothing is submitted.
        Otherwise the queues of self.queueDict are visited in random order.
        For each queue the number of pilots to submit is
        min(available slots, matching jobs - already waiting pilots), capped
        by self.maxPilotsToSubmit. Submitted pilots are registered in the
        PilotAgentsDB and attached to the matching task queues with a
        probability proportional to the task queue priority.

        :return: S_OK() when the cycle completes, otherwise the failed result
                 structure of the platform, proxy, Matcher or executable step
    """

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {
            'Setup': setup,
            'CPUTime': 9999999,
            'SubmitPool': self.defaultSubmitPools
        }
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # Materialise the keys so that they can be shuffled in place
        queues = list(self.queueDict)
        random.shuffle(queues)
        for queue in queues:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            ceType = self.queueDict[queue]['CEType']
            queueName = self.queueDict[queue]['QueueName']
            siteName = self.queueDict[queue]['Site']
            siteMask = self.siteStatus.isUsableSite(siteName,
                                                    'ComputingAccess')
            platform = self.queueDict[queue]['Platform']

            if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
                queueCPUTime = int(
                    self.queueDict[queue]['ParametersDict']['CPUTime'])
            else:
                self.log.warn(
                    'CPU time limit is not specified for queue %s, skipping...'
                    % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy, valid one day longer than the queue limit
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn(
                    'Failed to check the availability of queue %s: \n%s' %
                    (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info(
                "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
                % (ceName, queueName, ceInfoDict['WaitingJobs'],
                   ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'],
                   ceInfoDict['MaxTotalJobs']))

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            totalTQJobs = 0
            tqIDList = list(taskQueueDict)
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tqIDList,
                        totalWaitingPilots)

            pilotsToSubmit = max(
                0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info(
                'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d'
                % (totalSlots, totalTQJobs, totalWaitingPilots,
                   pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = self.queueDict[queue].get('JobExecDir',
                                                       jobExecDir)
                httpProxy = self.queueDict[queue].get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit,
                                              bundleProxy, httpProxy,
                                              jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                try:
                    result = ce.submitJob(executable, '',
                                          pilotSubmissionChunk)
                finally:
                    # The wrapper is a temporary file: remove it even if the
                    # submission raises
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    # Give up on this queue for the current cycle
                    break

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = result.get('PilotStampDict', {})

                # Cumulative priority thresholds used for the weighted draw
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Default to the last task queue so that tqID is always
                    # defined, even when no threshold exceeds the draw
                    # (e.g. all priorities are zero)
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    pilotsByTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        tqPilots, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])

        return S_OK()

#####################################################################################

    def __getExecutable(self,
                        queue,
                        pilotsToSubmit,
                        bundleProxy=True,
                        httpProxy='',
                        jobExecDir=''):
        """ Prepare the full executable for queue

        :param queue: key of the target queue in self.queueDict
        :param pilotsToSubmit: requested number of pilots; may be changed by
                               the pilot options preparation
        :param bundleProxy: if true, self.proxy is embedded in the wrapper
        :param httpProxy: value passed to the pilot wrapper as HTTP proxy
        :param jobExecDir: directory passed to the pilot wrapper as execution dir
        :return: S_OK( [ executable path, number of pilots to submit ] ) or
                 an error structure if the options or the wrapper script
                 could not be prepared
    """

        proxy = None
        if bundleProxy:
            proxy = self.proxy
        pilotOptions, pilotsToSubmit = self.__getPilotOptions(
            queue, pilotsToSubmit)
        if pilotOptions is None:
            return S_ERROR('Errors in compiling pilot options')
        executable = self.__writePilotScript(self.workingDirectory,
                                             pilotOptions, proxy, httpProxy,
                                             jobExecDir)
        # __writePilotScript returns an S_ERROR structure instead of a file
        # name when the compression step fails: propagate it rather than
        # handing a dictionary to the caller as if it were a path
        if isinstance(executable, dict):
            return executable
        return S_OK([executable, pilotsToSubmit])

#####################################################################################

    def __getPilotOptions(self, queue, pilotsToSubmit):
        """ Prepare pilot options

        Builds the list of command line options for the pilot script from the
        configuration (setup, installation, project, version, extensions,
        configuration servers), from a proxy token requested for the pilots
        and from the parameters of the given queue.

        :param queue: key of the target queue in self.queueDict
        :param pilotsToSubmit: requested number of pilots
        :return: [ list of option strings, number of pilots to submit ]; the
                 number may differ from the request depending on the number
                 of uses granted with the proxy token.
                 [ None, None ] is returned on any error.
    """

        queueDict = self.queueDict[queue]['ParametersDict']
        pilotOptions = []

        setup = gConfig.getValue("/DIRAC/Setup", "unknown")
        if setup == 'unknown':
            self.log.error('Setup is not defined in the configuration')
            return [None, None]
        pilotOptions.append('-S %s' % setup)
        opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

        #Installation defined?
        installationName = opsHelper.getValue("Pilot/Installation", "")
        if installationName:
            pilotOptions.append('-V %s' % installationName)

        #Project defined?
        projectName = opsHelper.getValue("Pilot/Project", "")
        if projectName:
            pilotOptions.append('-l %s' % projectName)
        else:
            self.log.info('DIRAC project will be installed by pilots')

        #Request a release
        diracVersion = opsHelper.getValue("Pilot/Version", [])
        if not diracVersion:
            self.log.error('Pilot/Version is not defined in the configuration')
            return [None, None]
        #diracVersion is a list of accepted releases. Just take the first one
        pilotOptions.append('-r %s' % diracVersion[0])

        ownerDN = self.pilotDN
        ownerGroup = self.pilotGroup
        # Request token for maximum pilot efficiency
        result = gProxyManager.requestToken(
            ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
        if not result['OK']:
            self.log.error('Invalid proxy token request', result['Message'])
            return [None, None]
        (token, numberOfUses) = result['Value']

        # Number of jobs each pilot is allowed to execute in filling mode
        jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
        if jobsPerPilot <= 0:
            # Nothing can be executed with this token, and the division
            # below would fail
            self.log.error('Proxy token does not allow any job execution')
            return [None, None]
        pilotOptions.append('-o /Security/ProxyToken=%s' % token)
        # Use Filling mode
        pilotOptions.append('-M %s' % jobsPerPilot)

        # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
        # with numberOfUses tokens we can submit at most:
        #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
        # pilots (integer division: the result is a number of pilots)
        newPilotsToSubmit = numberOfUses // jobsPerPilot
        if newPilotsToSubmit != pilotsToSubmit:
            self.log.info(
                'Number of pilots to submit is changed to %d after getting the proxy token'
                % newPilotsToSubmit)
            pilotsToSubmit = newPilotsToSubmit
        # Debug
        if self.pilotLogLevel.lower() == 'debug':
            pilotOptions.append('-d')
        # CS Servers
        csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
        pilotOptions.append('-C %s' % ",".join(csServers))

        # DIRAC Extensions to be used in pilots
        # An explicit first entry 'None' disables extensions; an empty
        # setting falls back to the extensions of this installation
        pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
        extensionsList = []
        if pilotExtensionsList:
            if pilotExtensionsList[0] != 'None':
                extensionsList = pilotExtensionsList
        else:
            extensionsList = CSGlobals.getCSExtensions()
        if extensionsList:
            pilotOptions.append('-e %s' % ",".join(extensionsList))

        # Requested CPU time
        pilotOptions.append('-T %s' % queueDict['CPUTime'])
        # CEName
        pilotOptions.append('-N %s' % self.queueDict[queue]['CEName'])
        # SiteName
        pilotOptions.append('-n %s' % queueDict['Site'])
        if 'ClientPlatform' in queueDict:
            pilotOptions.append("-p '%s'" % queueDict['ClientPlatform'])

        if 'SharedArea' in queueDict:
            pilotOptions.append("-o '/LocalSite/SharedArea=%s'" %
                                queueDict['SharedArea'])

        # SI00, when given, takes precedence over explicitly configured factors
        if 'SI00' in queueDict:
            factor = float(queueDict['SI00']) / 250.
            pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor)
            pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" %
                                factor)
        else:
            if 'CPUScalingFactor' in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" %
                                    queueDict['CPUScalingFactor'])
            if 'CPUNormalizationFactor' in queueDict:
                pilotOptions.append(
                    "-o '/LocalSite/CPUNormalizationFactor=%s'" %
                    queueDict['CPUNormalizationFactor'])

        # Hack
        if self.defaultSubmitPools:
            pilotOptions.append(
                '-o /Resources/Computing/CEDefaults/SubmitPool=%s' %
                self.defaultSubmitPools)

        if self.group:
            pilotOptions.append('-G %s' % self.group)

        self.log.verbose("pilotOptions: ", ' '.join(pilotOptions))

        return [pilotOptions, pilotsToSubmit]


#####################################################################################

    def __writePilotScript(self,
                           workingDirectory,
                           pilotOptions,
                           proxy=None,
                           httpProxy='',
                           pilotExecDir=''):
        """ Bundle together and write out the pilot executable script, admixt the proxy if given
    """

        try:
            compressedAndEncodedProxy = ''
            proxyFlag = 'False'
            if proxy is not None:
                compressedAndEncodedProxy = base64.encodestring(
                    bz2.compress(proxy.dumpAllToString()['Value']))
                proxyFlag = 'True'
            compressedAndEncodedPilot = base64.encodestring(
                bz2.compress(open(self.pilot, "rb").read(), 9))
            compressedAndEncodedInstall = base64.encodestring(
                bz2.compress(open(self.install, "rb").read(), 9))
        except:
            self.log.exception(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )
            return S_ERROR(
                'Exception during file compression of proxy, dirac-pilot or dirac-install'
            )

        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory )
  os.chdir( pilotWorkingDirectory )
  if %(proxyFlag)s:
    open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) )
    os.chmod("proxy",0600)
    os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) )
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % {
            'compressedAndEncodedProxy': compressedAndEncodedProxy,
            'compressedAndEncodedPilot': compressedAndEncodedPilot,
            'compressedAndEncodedInstall': compressedAndEncodedInstall,
            'httpProxy': httpProxy,
            'pilotExecDir': pilotExecDir,
            'pilotScript': os.path.basename(self.pilot),
            'installScript': os.path.basename(self.install),
            'pilotOptions': ' '.join(pilotOptions),
            'proxyFlag': proxyFlag
        }

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py',
                                    prefix='DIRAC_',
                                    dir=workingDirectory)
        pilotWrapper = os.fdopen(fd, 'w')
        pilotWrapper.write(localPilot)
        pilotWrapper.close()
        return name

    def updatePilotStatus(self):
        """ Update status of pilots in transient states

        Works in two passes over the queues of self.queueDict:

        1. Pilots of this director in a transient state are looked up in the
           PilotAgentsDB, their status is queried from the CE and updated in
           the DB when it has changed. The output of pilots reaching a final
           state is retrieved if self.getOutput is set.
        2. For pilots already in a final state, outputs not yet retrieved are
           fetched (if self.getOutput is set) and accounting records not yet
           sent are sent (if self.sendAccounting is set).

        :return: S_OK(), or the failed result of a pilot proxy request
    """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'Status': TRANSIENT_PILOT_STATUS,
                'OwnerDN': self.pilotDN,
                'OwnerGroup': self.pilotGroup
            })
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' %
                               result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            # Use "reference:::stamp" identifiers only if every pilot has a
            # stamp, otherwise fall back to the plain references
            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" +
                                            pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE',
                               '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if newStatus:
                    self.log.info('Updating status to %s for pilot %s' %
                                  (newStatus, pRef))
                    result = pilotAgentsDB.setPilotStatus(
                        pRef, newStatus, '', 'Updated by SiteDirector')
                    if not result['OK']:
                        # Do not silently drop a failed status update
                        self.log.error('Failed to set pilot status: ',
                                       result['Message'])
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower(
                    ) == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error(
                                'Failed to get pilot output',
                                '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(
                                    pRef, output, error)
                                if not result['OK']:
                                    self.log.error(
                                        'Failed to store pilot output',
                                        result['Message'])
                            else:
                                self.log.warn(
                                    'Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            # isProxyValid returns a result structure (see its use in the
            # first pass), so its 'OK' field has to be checked: the structure
            # itself is always true
            result = ce.isProxyValid(120)
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(
                    self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                # Store the renewed proxy before passing it to the CE
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'OutputReady': 'False',
                'Status': FINAL_PILOT_STATUS
            })

            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB',
                               result['Message'])
                continue
            pilotDict = result['Value']
            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(
                            pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output',
                                           result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({
                    'DestinationSite': ceName,
                    'Queue': queueName,
                    'GridType': ceType,
                    'GridSite': siteName,
                    'AccountingSent': 'False',
                    'Status': FINAL_PILOT_STATUS
                })

                if not result['OK']:
                    self.log.error('Failed to select pilots',
                                   result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB',
                                   result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()

    def sendPilotAccounting(self, pilotDict):
        """ Register and commit one accounting record per pilot

        :param pilotDict: pilot information keyed by pilot reference
        :return: S_OK() once the records are committed, otherwise the failed
                 result of the commit

        NOTE(review): the AccountingSent flag is set both right after a
        record is registered and again for every pilot after a successful
        commit - confirm whether the double flagging is intended.
    """
        for pRef in pilotDict:
            pilotInfo = pilotDict[pRef]
            self.log.verbose('Preparing accounting record for pilot %s' % pRef)

            record = PilotAccounting()
            record.setEndTime(pilotInfo['LastUpdateTime'])
            record.setStartTime(pilotInfo['SubmissionTime'])

            userResult = CS.getUsernameForDN(pilotInfo['OwnerDN'])
            if userResult['OK']:
                userName = userResult['Value']
            else:
                userName = '******'
                self.log.error("Can't determine username for dn:",
                               pilotInfo['OwnerDN'])
            record.setValueByKey('User', userName)
            record.setValueByKey('UserGroup', pilotInfo['OwnerGroup'])

            # Resolve the site from the CE, 'Unknown' if it cannot be found
            siteResult = getSiteForCE(pilotInfo['DestinationSite'])
            resolvedSite = ''
            if siteResult['OK']:
                resolvedSite = siteResult['Value'].strip()
            if not resolvedSite:
                resolvedSite = 'Unknown'
            record.setValueByKey('Site', resolvedSite)

            record.setValueByKey('GridCE', pilotInfo['DestinationSite'])
            record.setValueByKey('GridMiddleware', pilotInfo['GridType'])
            record.setValueByKey('GridResourceBroker', pilotInfo['Broker'])
            record.setValueByKey('GridStatus', pilotInfo['Status'])
            jobCount = len(pilotInfo['Jobs']) if 'Jobs' in pilotInfo else 0
            record.setValueByKey('Jobs', jobCount)

            self.log.info("Adding accounting record for pilot %s" %
                          pilotInfo['PilotID'])
            registered = gDataStoreClient.addRegister(record)
            if registered['OK']:
                # Set up AccountingSent flag
                flagged = pilotAgentsDB.setAccountingFlag(pRef)
                if not flagged['OK']:
                    self.log.error('Failed to set accounting flag for pilot ',
                                   pRef)
            else:
                self.log.error('Failed to send accounting info for pilot ',
                               pRef)

        self.log.info('Committing accounting records for %d pilots' %
                      len(pilotDict))
        committed = gDataStoreClient.commit()
        if not committed['OK']:
            return committed

        for pRef in pilotDict:
            self.log.verbose('Setting AccountingSent flag for pilot %s' %
                             pRef)
            flagged = pilotAgentsDB.setAccountingFlag(pRef)
            if not flagged['OK']:
                self.log.error('Failed to set accounting flag for pilot ',
                               pRef)

        return S_OK()
Example #25
0
  def _resolveCECandidates( self, taskQueueDict ):
    """
      Work out the CE hosts that pilots for this TaskQueue may be sent to.

      A non-empty 'GridCEs' entry is returned untouched. Otherwise the sites
      usable for 'ComputingAccess' are narrowed by the optional 'BannedSites'
      and 'Sites' entries of the TaskQueue and translated into CE host names.

      :param taskQueueDict: TaskQueue description; 'TaskQueueID' is read for the
                            log messages, 'GridCEs', 'BannedSites' and 'Sites'
                            are optional
      :return: list of CE host names, empty when nothing could be resolved
    """
    # An explicit CE list bypasses the site mask altogether (e.g. sam jobs)
    requestedCEs = taskQueueDict.get( 'GridCEs' )
    if requestedCEs:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( requestedCEs ) )
      return requestedCEs

    # Sites currently usable for computing
    maskResult = SiteStatus().getUsableSites( 'ComputingAccess' )
    if not maskResult['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', maskResult['Message'] )
      return []

    usableSites = maskResult['Value']
    if not usableSites:
      self.log.error( 'Site mask is empty' )
      return []

    self.log.verbose( 'Site Mask: %s' % ', '.join( usableSites ) )

    # Drop the sites banned by the TaskQueue; the list is edited in place
    for bannedSite in taskQueueDict.get( 'BannedSites', [] ):
      if bannedSite in usableSites:
        usableSites.remove( bannedSite )
        self.log.verbose( 'Removing banned site %s from site Mask' % bannedSite )

    # Keep only the requested sites when the TaskQueue names some
    if 'Sites' in taskQueueDict:
      wantedSites = taskQueueDict['Sites']
      siteMask = [ candidate for candidate in usableSites if candidate in wantedSites ]
    else:
      siteMask = list( usableSites )

    tqID = taskQueueDict['TaskQueueID']
    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % tqID )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % tqID, ', '.join( siteMask ) )

    # Translate the remaining sites into the hosts of their eligible CEs
    resources = Resources( vo = self.virtualOrganization )
    selection = { 'Site':siteMask,
                  'SubmissionMode':'gLite',
                  'CEType':['LCG','CREAM'] }
    result = resources.getEligibleResources( 'Computing', selection )
    if not result['OK']:
      self.log.error( "Failed to get eligible ce's:", result['Message'] )
      return []

    hosts = [ resources.getComputingElementValue( ce, 'Host', 'unknown' ) for ce in result['Value'] ]
    ceMask = [ host for host in hosts if host != 'unknown' ]

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( ceMask ) )

    return ceMask
Example #26
0
  def checkJob( self, job, classAdJob ):
    """Decide whether a job goes to the task queue, is put on hold, or needs staging.

       Steps, in the order they are executed:

        1. If the job has a non-zero RescheduleCounter and the delay matching that
           counter has not elapsed since RescheduleTime, the application status is
           set to 'On Hold: after rescheduling #N' (unless it already contains
           that text) and S_OK is returned.
        2. The job's own Sites / BannedSites requirement is collected and checked.
        3. Usable and unusable sites for 'ComputingAccess' are read from SiteStatus;
           a job whose requested sites are all filtered out is put on hold, except
           for the job types listed in self.excludedOnHoldJobTypes.
        4. A job without input data is sent to the task queue directly.
        5. With input data, the optimizer site candidates are filtered by the user
           requirement and by the site mask, staging is resolved and, when it is
           required, a staging request is issued for the first destination site.

       :param job: job identifier, used for every JobDB call
       :param classAdJob: job description; getAttributeString( 'JobType' ) is called
                          on it here, otherwise it is handed over to the helpers
       :return: S_OK / S_ERROR structure
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0 :
      reTime = fromString( jobDict['RescheduleTime'] )
      delta = toEpoch() - toEpoch( reTime )
      # Use the delay listed for this reschedule count; counts beyond the end of
      # the list fall back to self.maxRescheduleDelay
      delay = self.maxRescheduleDelay
      if reCounter <= len( self.rescheduleDelaysList ):
        delay = self.rescheduleDelaysList[reCounter - 1]
      if delta < delay:
        # Still inside the waiting period: flag the job once and stop here
        if jobDict['ApplicationStatus'].find( 'On Hold: after rescheduling' ) == -1:
          result = self.jobDB.setJobStatus( job, application = 'On Hold: after rescheduling #%d' % reCounter )
        return S_OK()

    # First, get Site and BannedSites from the Job

    # NOTE(review): the helper result is indexed directly, without an 'OK' check
    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
      # Requested sites minus the sites banned by the user
      userSites = applySiteRequirements( userSites, [], userBannedSites )
      if not userSites:
        msg = 'Impossible Site Requirement'
        return S_ERROR( msg )

    # Second, get the Active and Banned sites from the RSS

    siteStatus = SiteStatus()
    
    usableSites   = siteStatus.getUsableSites( 'ComputingAccess' )
    unusableSites = siteStatus.getUnusableSites( 'ComputingAccess' )
    
    # Log every failing query before giving up
    if not ( usableSites['OK'] and unusableSites['OK'] ):
      if not usableSites['OK']:
        self.log.error( usableSites['Message'] )
      if not unusableSites['OK']:
        self.log.error( unusableSites['Message'] )
      return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )

    usableSites   = usableSites['Value']
    unusableSites = unusableSites['Value']

    if userSites:
      sites = applySiteRequirements( userSites, usableSites, unusableSites )
      if not sites:
        # Put on Hold only non-excluded job types
        jobType = classAdJob.getAttributeString( 'JobType' )
        if not jobType in self.excludedOnHoldJobTypes:
          msg = 'On Hold: Requested site is Banned or not Active'
          self.log.info( msg )
          result = self.jobDB.setJobStatus( job, application = msg )
          return S_OK()
        # Excluded job types fall through and carry on with the checks below


    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.error( result['Message'] )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    # Keep only the non-empty LFN entries
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
      if lfn:
        inputData.append( lfn )
        hasInputData = True

    if not hasInputData:
      #With no input data requirement, job can proceed directly to task queue
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
      return result

    optInfo = result['Value']

    #Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
      msg = 'Impossible Site + InputData Requirement'
      return S_ERROR( msg )

    sites = applySiteRequirements( optSites, usableSites, unusableSites )
    if not sites:
      msg = 'On Hold: InputData Site is Banned or not Active'
      self.log.info( msg )
      result = self.jobDB.setJobStatus( job, application = msg )
      return S_OK()

    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
      return checkStaging

    # The helper result carries the chosen sites under 'SiteCandidates' and the
    # staging flag under 'Value'
    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
      return S_ERROR( 'No destination sites available' )

    stagingFlag = checkStaging['Value']
    if stagingFlag:
      #Single site candidate chosen and staging required
      self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
      # set all LFN to disk for the selected site
      stagingSite = destinationSites[0]
      siteDict = optInfo['SiteCandidates'][stagingSite]
      siteDict['disk'] = siteDict['disk'] + siteDict['tape']
      siteDict['tape'] = 0

      optInfo['SiteCandidates'][stagingSite] = siteDict
      self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
      result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
      if not result['OK']:
        return result

      # Site is selected for staging, report it
      self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

      # A failing lookup degrades to the single staging site
      result = self.__getStagingSites( stagingSite, destinationSites )
      if not result['OK']:
        stagingSites = [stagingSite]
      else:
        stagingSites = result['Value']

      if len( stagingSites ) == 1:
        self.jobDB.setJobAttribute( job, 'Site', stagingSite )
      else:
        # Get the name of the site group
        result = self.__getSiteGroup( stagingSites )
        if result['OK']:
          groupName = result['Value']
          if groupName:
            self.jobDB.setJobAttribute( job, 'Site', groupName )
          else:
            self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
        else:
          self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )

      stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
      if not stagerDict['OK']:
        return stagerDict
      self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
      return S_OK()
    else:
      #No staging required, can proceed to task queue agent and then waiting status
      self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
    #Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )
Example #27
0
class DiracAdmin(API):
    """ Administrative functionalities
  """

    #############################################################################
    def __init__(self):
        """Set up the administrator API on top of the base API initialisation.

        After the parent constructor has run this creates a CSAPI helper, derives
        the debug flag from the '<section>/LogLevel' option, reads the
        '<section>/ScratchDir' option, remembers the current working directory
        and prepares the resource status flag and the site status helper.
        """
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        # Debug mode is on exactly when the configured level reads 'DEBUG';
        # 'DEBUG' is also the second argument handed to getValue.
        configuredLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
        self.dbg = configuredLevel == 'DEBUG'

        scratchOption = self.section + '/ScratchDir'
        self.scratchDir = gConfig.getValue(scratchOption, '/tmp')
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()

    #############################################################################
    def uploadProxy(self, group):
        """Upload a proxy for the given DIRAC group through the proxy manager.

        Example usage:

          >>> print diracAdmin.uploadProxy('lhcb_pilot')
          {'OK': True, 'Value': 0L}

        :param group: DIRAC Group, forwarded as ``diracGroup``
        :type group: string
        :return: S_OK,S_ERROR
        """
        uploadResult = gProxyManager.uploadProxy(diracGroup=group)
        return uploadResult

    #############################################################################
    def setProxyPersistency(self, userDN, userGroup, persistent=True):
        """Set the persistency flag of a proxy in the Proxy Manager.

        Example usage:

          >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )
          {'OK': True }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param persistent: Persistent flag (default True)
        :type persistent: boolean
        :return: S_OK,S_ERROR
        """
        outcome = gProxyManager.setPersistency(userDN, userGroup, persistent)
        return outcome

    #############################################################################
    def checkProxyUploaded(self, userDN, userGroup, requiredTime):
        """Ask the Proxy Manager whether the user has a proxy uploaded.

        Example usage:

          >>> print diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 0 )
          {'OK': True, 'Value' : True/False }

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param requiredTime: Required life time of the uploaded proxy
                             (presumably seconds -- confirm against the proxy manager)
        :return: S_OK,S_ERROR
        """
        answer = gProxyManager.userHasProxy(userDN, userGroup, requiredTime)
        return answer

    #############################################################################
    def getSiteMask(self, printOutput=False, status='Active'):
        """Retrieve the sites that are in the given state from the site status helper.

        Example usage:

          >>> print diracAdmin.getSiteMask()
          {'OK': True, 'Value': 0L}

        :param printOutput: when True, sort the site list in place and print one
                            site per line
        :type printOutput: boolean
        :param status: site state handed to ``getSites`` (default 'Active')
        :type status: string
        :return: S_OK,S_ERROR -- the ``getSites`` result, returned as is
        """
        result = self.sitestatus.getSites(siteState=status)
        if not result['OK']:
            return result

        siteNames = result['Value']
        if printOutput:
            # In-place sort: the list inside the returned result is reordered too
            siteNames.sort()
            for siteName in siteNames:
                print(siteName)

        return result

    #############################################################################
    def getBannedSites(self, gridType=[], printOutput=False):
        """Retrieve the sorted list of sites that are either Banned or Probing.

        Example usage:

          >>> print diracAdmin.getBannedSites()
          {'OK': True, 'Value': []}

        :param gridType: not used by this method
        :param printOutput: when True, print the merged list one site per line
        :type printOutput: boolean
        :return: S_OK with the merged, sorted list, or the first failing
                 ``getSites`` result
        """
        collected = []
        # Banned sites first, then the probing ones; any failure is returned as is
        for state in ('Banned', 'Probing'):
            stateResult = self.sitestatus.getSites(siteState=state)
            if not stateResult['OK']:
                return stateResult
            collected = collected + stateResult['Value']

        collected.sort()
        if printOutput:
            print('\n'.join(collected))

        return S_OK(collected)

    #############################################################################
    def getSiteSection(self, site, printOutput=False):
        """Return the options of the configuration section of a DIRAC site.

        The grid type is the part of the site name before the first dot; the
        options are read from /Resources/Sites/<gridType>/<site>.

        Example usage:

          >>> print diracAdmin.getSiteSection('LCG.CERN.ch')
          {'OK': True, 'Value':}

        :param site: DIRAC site name
        :type site: string
        :param printOutput: pretty-print the options when they could be read
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        gridType = site.split('.')[0]
        gridSections = gConfig.getSections('/Resources/Sites/%s' % (gridType))
        if not gridSections['OK']:
            return S_ERROR('/Resources/Sites/%s is not a valid site section' %
                           (gridType))

        sitePath = '/Resources/Sites/%s/%s' % (gridType, site)
        result = gConfig.getOptionsDict(sitePath)
        if result['OK'] and printOutput:
            print(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def allowSite(self, site, comment, printOutput=False):
        """Set the status of a site to Active, i.e. add it to the site mask.

        Nothing is changed when the site is already listed as Active. The status
        is set through the site status helper when ``self.rssFlag`` is set, and
        through the WMS Administrator service otherwise.

        Example usage:

          >>> print diracAdmin.allowSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name
        :type site: string
        :param comment: comment passed along with the status change
        :type comment: string
        :param printOutput: print what was done
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        validity = self.__checkSiteIsValid(site)
        if not validity['OK']:
            return validity

        activeMask = self.getSiteMask(status='Active')
        if not activeMask['OK']:
            return activeMask

        if site in activeMask['Value']:
            if printOutput:
                print('Site %s is already Active' % site)
            return S_OK('Site %s is already Active' % site)

        if not self.rssFlag:
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            result = wmsAdmin.allowSite(site, comment)
        else:
            result = self.sitestatus.setSiteStatus(site, 'Active', comment)

        if result['OK'] and printOutput:
            print('Site %s status is set to Active' % site)

        return result

    #############################################################################
    def getSiteMaskLogging(self, site=None, printOutput=False):
        """Retrieve site mask logging information from the WMS Administrator service.

        Example usage:

          >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')
          {'OK': True, 'Value': }

        :param site: site name to report on; when empty or None the name is not
                     validated and the service is queried with that value
        :type site: string
        :param printOutput: print the logging entries, one line per entry
        :type printOutput: boolean
        :return: S_OK,S_ERROR -- the service result, or S_ERROR when the requested
                 site is unknown or missing from the answer
        """
        # Only a concrete site name can be validated. The default (None) is the
        # "all sites" request handled below and must not be rejected up front.
        if site:
            result = self.__checkSiteIsValid(site)
            if not result['OK']:
                return result

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getSiteMaskLogging(site)
        if not result['OK']:
            return result

        siteDict = result['Value']
        if site and site not in siteDict:
            return S_ERROR('Site mask information not available for %s' %
                           (site))

        if printOutput:
            if site:
                print('\nSite Mask Logging Info for %s\n' % site)
            else:
                print('\nAll Site Mask Logging Info\n')

            # A separate loop name keeps the ``site`` argument intact, so the
            # per-site header is shown exactly when all sites were requested.
            for siteName, tupleList in siteDict.items():
                if not site:
                    print('\n===> %s\n' % siteName)
                for tup in tupleList:
                    print(str(tup[0]).ljust(8) + str(tup[1]).ljust(20) +
                          '( ' + str(tup[2]) + ' )  "' + str(tup[3]) + '"')
                print(' ')
        return result

    #############################################################################
    def banSite(self, site, comment, printOutput=False):
        """Set the status of a site to Banned, i.e. remove it from the site mask.

        Nothing is changed when the site is already listed as Banned. The status
        is set through the site status helper when ``self.rssFlag`` is set, and
        through the WMS Administrator service otherwise.

        Example usage:

          >>> print diracAdmin.banSite()
          {'OK': True, 'Value': }

        :param site: DIRAC site name
        :type site: string
        :param comment: comment passed along with the status change
        :type comment: string
        :param printOutput: print what was done
        :type printOutput: boolean
        :return: S_OK,S_ERROR
        """
        validity = self.__checkSiteIsValid(site)
        if not validity['OK']:
            return validity

        bannedMask = self.getSiteMask(status='Banned')
        if not bannedMask['OK']:
            return bannedMask

        if site in bannedMask['Value']:
            if printOutput:
                print('Site %s is already Banned' % site)
            return S_OK('Site %s is already Banned' % site)

        if not self.rssFlag:
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            result = wmsAdmin.banSite(site, comment)
        else:
            result = self.sitestatus.setSiteStatus(site, 'Banned', comment)

        if result['OK'] and printOutput:
            print('Site %s status is set to Banned' % site)

        return result

    #############################################################################
    def __checkSiteIsValid(self, site):
        """Check that a site name is one of the keys of the site to CE mapping.

        :param site: site name to look up
        :return: S_OK with a confirmation message, or S_ERROR when the mapping
                 cannot be read or the site is not part of it
        """
        mapping = getSiteCEMapping()
        if not mapping['OK']:
            return S_ERROR('Could not get site CE mapping')

        knownSites = mapping['Value'].keys()
        if site in knownSites:
            return S_OK('%s is valid' % site)

        return S_ERROR(
            'Specified site %s is not in list of defined sites' % site)

    #############################################################################
    def clearMask(self):
        """Remove all sites from the site mask.  Should be used with care.

        The request is sent to the WMS Administrator service and its answer is
        returned unchanged.

        Example usage:

          >>> print diracAdmin.clearMask()
          {'OK': True, 'Value':''}

        :return: S_OK,S_ERROR
        """
        return RPCClient('WorkloadManagement/WMSAdministrator').clearMask()

    #############################################################################
    def getServicePorts(self, setup='', printOutput=False):
        """Collect the configured ports of the services of a setup.

        When no setup is given, the value of /DIRAC/Setup is used. For every
        system under /Systems that the setup defines, the Port option of each
        service section is looked up; services without a port are only reported
        with a warning.

        Example usage:

          >>> print diracAdmin.getServicePorts()
          {'OK': True, 'Value':''}

        :param setup: setup name (default: taken from /DIRAC/Setup)
        :type setup: string
        :param printOutput: pretty-print the resulting dictionary
        :type printOutput: boolean
        :return: S_OK with a {'<system>/<service>': port} dictionary, or S_ERROR
        """
        if not setup:
            setup = gConfig.getValue('/DIRAC/Setup', '')

        setupSections = gConfig.getSections('/DIRAC/Setups', [])
        if not setupSections['OK']:
            return S_ERROR('Could not get /DIRAC/Setups sections')
        knownSetups = setupSections['Value']
        if setup not in knownSetups:
            return S_ERROR('Setup %s is not in allowed list: %s' %
                           (setup, ', '.join(knownSetups)))

        setupOptions = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
        if not setupOptions['OK']:
            return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
        instanceOfSystem = setupOptions['Value']  # dict

        systemSections = gConfig.getSections('/Systems')
        if not systemSections['OK']:
            return S_ERROR('Could not get Systems sections')

        ports = {}
        for system in systemSections['Value']:
            if system not in instanceOfSystem:
                self.log.warn('%s is not defined in /DIRAC/Setups/%s' %
                              (system, setup))
                continue

            path = '/Systems/%s/%s/Services' % (system,
                                                instanceOfSystem[system])
            serviceSections = gConfig.getSections(path)
            if not serviceSections['OK']:
                self.log.warn('Could not get sections in %s' % path)
                continue

            # A missing or empty value is treated as "no services"
            services = serviceSections['Value'] or []
            self.log.verbose('System: %s ServicesList: %s' %
                             (system, ', '.join(services)))
            for service in services:
                spath = '%s/%s/Port' % (path, service)
                servicePort = gConfig.getValue(spath, 0)
                if not servicePort:
                    self.log.warn('No port found for %s' % spath)
                    continue
                self.log.verbose('Found port for %s/%s = %s' %
                                 (system, service, servicePort))
                ports['%s/%s' % (system, service)] = servicePort

        if printOutput:
            print(self.pPrint.pformat(ports))

        return S_OK(ports)

    #############################################################################
    def getProxy(self, userDN, userGroup, validity=43200, limited=False):
        """Download a proxy for the given user and group from the proxy manager.

        Example usage:

          >>> print diracAdmin.getProxy()
          {'OK': True, 'Value': }

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: forwarded as ``requiredTimeLeft``; the default 43200
                         corresponds to 12 hours if expressed in seconds
        :param limited: forwarded as ``limited``
        :return: S_OK,S_ERROR
        """
        downloadOptions = {'limited': limited, 'requiredTimeLeft': validity}
        return gProxyManager.downloadProxy(userDN, userGroup, **downloadOptions)

    #############################################################################
    def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200,
                     limited=False):
        """Download a proxy with VOMS extensions from the proxy manager.

        Example usage:

          >>> print diracAdmin.getVOMSProxy()
          {'OK': True, 'Value': }

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param vomsAttr: forwarded as ``requiredVOMSAttribute``
        :param validity: forwarded as ``requiredTimeLeft``; the default 43200
                         corresponds to 12 hours if expressed in seconds
        :param limited: forwarded as ``limited``
        :return: S_OK,S_ERROR
        """
        downloadOptions = {'limited': limited,
                           'requiredVOMSAttribute': vomsAttr,
                           'requiredTimeLeft': validity}
        return gProxyManager.downloadVOMSProxy(userDN, userGroup,
                                               **downloadOptions)

    #############################################################################
    def getPilotProxy(self, userDN, userGroup, validity=43200):
        """Retrieve a pilot proxy for the given user and DIRAC group.

        Example usage:

          >>> print diracAdmin.getPilotProxy()
          {'OK': True, 'Value': }

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: forwarded as ``requiredTimeLeft``; the default 43200
                         corresponds to 12 hours if expressed in seconds
        :return: S_OK,S_ERROR
        """
        pilotProxy = gProxyManager.getPilotProxyFromDIRACGroup(
            userDN, userGroup, requiredTimeLeft=validity)
        return pilotProxy

    #############################################################################
    def resetJob(self, jobID):
        """Reset a job or a list of jobs in the WMS.

        A string is converted to an integer and the entries of a list are
        converted one by one before the request is sent to the JobManager
        service; any other value is passed on as it is.

        Example::

          >>> print dirac.reset(12345)
          {'OK': True, 'Value': [12345]}

        :param jobID: JobID
        :type jobID: integer or list of integers
        :return: S_OK,S_ERROR
        """
        if isinstance(jobID, basestring):
            # Single identifier given as text
            try:
                jobID = int(jobID)
            except Exception as excp:
                problem = 'Expected integer or convertible integer for existing jobID'
                return self._errorReport(str(excp), problem)
        elif isinstance(jobID, list):
            # Several identifiers: every entry has to be convertible
            try:
                jobID = [int(entry) for entry in jobID]
            except Exception as excp:
                problem = 'Expected integer or convertible integer for existing jobIDs'
                return self._errorReport(str(excp), problem)

        jobManager = RPCClient('WorkloadManagement/JobManager',
                               useCertificates=False)
        return jobManager.resetJob(jobID)

    #############################################################################
    def getJobPilotOutput(self, jobID, directory=''):
        """Retrieve the pilot output for an existing job in the WMS.

        The output is written to ``<directory>/pilot_<jobID>/std.out`` and
        ``std.err``. The target directory must not exist yet; it is created here.

          >>> print dirac.getJobPilotOutput(12345)
          {'OK': True, StdOut:'',StdError:''}

        :param jobID: JobID
        :type jobID: integer or string
        :param directory: existing directory to create the output directory in
                          (default: the working directory recorded at construction)
        :type directory: string
        :return: S_OK,S_ERROR -- the service result on success
        """
        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getJobPilotOutput(jobID)
        if not result['OK']:
            return result

        outputPath = '%s/pilot_%s' % (directory, jobID)
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        # The path is known not to exist at this point, so no second check
        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' in outputs:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as fopen:
                fopen.write(outputs['StdOut'])
            self.log.verbose('Standard output written to %s' % (stdout))
        else:
            self.log.warn('No standard output returned')

        # 'StdError' is the key this method has always looked for, while the
        # sibling getPilotOutput reads 'StdErr'. Accept either one so that the
        # error stream is not silently dropped.
        # NOTE(review): confirm which key the service really returns.
        stderrKey = None
        for candidate in ('StdError', 'StdErr'):
            if candidate in outputs:
                stderrKey = candidate
                break

        if stderrKey is not None:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as fopen:
                fopen.write(outputs[stderrKey])
            self.log.verbose('Standard error written to %s' % (stderr))
        else:
            self.log.warn('No standard error returned')

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotOutput(self, gridReference, directory=''):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference.

        The output is written to ``<directory>/pilot_<last part of the
        reference>``; that directory must not exist yet and is created here.

          >>> print dirac.getPilotOutput('pilot reference')
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :param directory: existing directory to create the output directory in
                          (default: the working directory recorded at construction)
        :type directory: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, basestring):
            return self._errorReport('Expected string for pilot reference')

        targetDir = directory
        if not targetDir:
            targetDir = self.currentDir

        if not os.path.exists(targetDir):
            return self._errorReport('Directory %s does not exist' % targetDir)

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getPilotOutput(gridReference)
        if not result['OK']:
            return result

        # Name the directory after the last path element of the reference
        gridReferenceSmall = gridReference.split('/')[-1] or 'reference'
        outputPath = '%s/pilot_%s' % (targetDir, gridReferenceSmall)

        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        if 'StdOut' not in outputs:
            self.log.warn('No standard output returned')
        else:
            stdout = '%s/std.out' % (outputPath)
            with open(stdout, 'w') as outFile:
                outFile.write(outputs['StdOut'])
            self.log.info('Standard output written to %s' % (stdout))

        if 'StdErr' not in outputs:
            self.log.warn('No standard error returned')
        else:
            stderr = '%s/std.err' % (outputPath)
            with open(stderr, 'w') as errFile:
                errFile.write(outputs['StdErr'])
            self.log.info('Standard error written to %s' % (stderr))

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotInfo(self, gridReference):
        """Retrieve the information the WMS Administrator holds for a pilot reference.

          >>> print dirac.getPilotInfo(12345)
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, basestring):
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            return wmsAdmin.getPilotInfo(gridReference)

        return self._errorReport('Expected string for pilot reference')

    #############################################################################
    def killPilot(self, gridReference):
        """Ask the WMS Administrator service to kill the given pilot.

          >>> print dirac.killPilot('pilot reference')
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, basestring):
            wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
            return wmsAdmin.killPilot(gridReference)

        return self._errorReport('Expected string for pilot reference')

    #############################################################################
    def getPilotLoggingInfo(self, gridReference):
        """Retrieve the pilot logging info for a pilot reference from the WMS.

          >>> print dirac.getPilotLoggingInfo(12345)
          {'OK': True, 'Value': {"The output of the command"}}

        :param gridReference: Grid pilot job reference Id
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        # isinstance also accepts subclasses of str/unicode, which the former
        # exact type comparison rejected, and matches the check used by the
        # other pilot methods of this class.
        if not isinstance(gridReference, basestring):
            return self._errorReport('Expected string for pilot reference')

        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        return wmsAdmin.getPilotLoggingInfo(gridReference)

    #############################################################################
    def getJobPilots(self, jobID):
        """Fetch the pilots recorded for a job from the WMS Administrator service.

        A successful answer is pretty-printed to the screen as well as returned.

          >>> print dirac.getJobPilots()
          {'OK': True, 'Value': {PilotID:{StatusDict}}}

        :param jobID: JobID
        :type jobID: integer or string
        :return: S_OK,S_ERROR
        """
        if isinstance(jobID, basestring):
            # Identifiers given as text must be convertible to an integer
            try:
                jobID = int(jobID)
            except Exception as excp:
                return self._errorReport(
                    str(excp), 'Expected integer or string for existing jobID')

        result = RPCClient('WorkloadManagement/WMSAdministrator').getPilots(jobID)
        if not result['OK']:
            return result

        print(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def getPilotSummary(self, startDate='', endDate=''):
        """Retrieve the pilot summary per CE from the WMS Administrator service.

        A table with one line per CE is printed: the CE name followed by
        Status/Count pairs, states sorted by name. The full dictionary of
        results is returned as well.

          >>> print dirac.getPilotSummary()
          {'OK': True, 'Value': {CE:{Status:Count}}}

        :param startDate: forwarded to the service unchanged
        :param endDate: forwarded to the service unchanged
        :return: S_OK,S_ERROR
        """
        wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdmin.getPilotSummary(startDate, endDate)
        if not result['OK']:
            return result

        ceDict = result['Value']

        # The header needs as many Status/Count pairs as the busiest CE has states
        mostStates = 0
        for summary in ceDict.values():
            mostStates = max(mostStates, len(summary.keys()))
        columnPair = 'Status'.ljust(12) + 'Count'.ljust(12)
        print('CE'.ljust(28) + columnPair * mostStates)

        for ce, summary in ceDict.iteritems():
            cells = [ce.ljust(28)]
            for state in sorted(summary.keys()):
                cells.append(state.ljust(12) + str(summary[state]).ljust(12))
            print(''.join(cells))

        return result

    #############################################################################
    def selectRequests(self,
                       jobID=None,
                       requestID=None,
                       requestName=None,
                       requestType=None,
                       status=None,
                       operation=None,
                       ownerDN=None,
                       ownerGroup=None,
                       requestStart=0,
                       limit=100,
                       printOutput=False):
        """Select requests from the request management system.

        A few notes on the selection criteria:

          - jobID is the WMS JobID for the request (if applicable)
          - requestID is assigned during submission of the request
          - requestName is the corresponding XML file name
          - requestType e.g. 'transfer'
          - status e.g. Done
          - operation e.g. replicateAndRegister
          - requestStart e.g. the first request to consider (start from 0 by default)
          - limit e.g. selection limit (default 100)

        Only criteria with a true value are sent as conditions, each converted
        to a string.

          >>> dirac.selectRequests(jobID='4894')
          {'OK': True, 'Value': [[<Requests>]]}

        :param printOutput: if True, print the selected records to stdout
        :return: S_OK,S_ERROR; S_ERROR when the service answer is empty
        """
        options = {
            'RequestID': requestID,
            'RequestName': requestName,
            'JobID': jobID,
            'OwnerDN': ownerDN,
            'OwnerGroup': ownerGroup,
            'RequestType': requestType,
            'Status': status,
            'Operation': operation
        }

        conditions = {}
        for key, value in options.iteritems():
            if value:
                try:
                    conditions[key] = str(value)
                except Exception as x:
                    return self._errorReport(
                        str(x), 'Expected string for %s field' % key)

        try:
            requestStart = int(requestStart)
            limit = int(limit)
        except Exception as x:
            # Name the offending fields; either conversion may have failed.
            return self._errorReport(
                str(x), 'Expected integer for requestStart and limit fields')

        self.log.verbose('Will select requests with the following conditions')
        self.log.verbose(self.pPrint.pformat(conditions))
        requestClient = RPCClient("RequestManagement/centralURL")
        result = requestClient.getRequestSummaryWeb(conditions, [],
                                                    requestStart, limit)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result

        requestIDs = result['Value']
        # Reject an empty answer before indexing into it; checking afterwards
        # (as was done previously) could never be reached with empty data.
        if not requestIDs:
            return S_ERROR('No requests selected for conditions: %s' %
                           conditions)

        records = requestIDs['Records']
        conds = ['%s = %s' % (key, value)
                 for key, value in conditions.iteritems() if value]
        summary = '%s request(s) selected with conditions %s and limit %s' % (
            len(records), ', '.join(conds), limit)
        self.log.verbose(summary)

        if printOutput:
            print(summary)
            print(requestIDs['ParameterNames'])
            # Never show more than `limit` records
            for request in records[:limit]:
                print(request)

        return result

    #############################################################################
    def getRequestSummary(self, printOutput=False):
        """Get a summary of the requests in the request DB.

        :param printOutput: if True, pretty-print the summary to stdout
        :return: the result of the getDBSummary service call (S_OK,S_ERROR);
                 a failure message is logged as a warning
        """
        summary = RPCClient("RequestManagement/centralURL",
                            timeout=120).getDBSummary()
        if summary['OK']:
            if printOutput:
                print(self.pPrint.pformat(summary['Value']))
        else:
            self.log.warn(summary['Message'])

        return summary

    #############################################################################
    def getExternalPackageVersions(self):
        """Log the versions of DIRAC and of the external grid libraries.

        Tries to import lcg_util and gfalthr (falling back to gfal when
        gfalthr cannot be imported) and logs where each was loaded from and
        its version, then logs the default storage protocols from the
        configuration. Import failures are logged, never raised.

        :return: S_OK
        """
        versionInfo = (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel)
        gLogger.info('DIRAC version v%dr%d build %d' % versionInfo)

        try:
            import lcg_util
            gLogger.info('Using lcg_util from: \n%s' % lcg_util.__file__)
            gLogger.info("The version of lcg_utils is %s" %
                         lcg_util.lcg_util_version())
        except Exception as lcgError:
            gLogger.exception(
                "SRM2Storage.__init__: Failed to import lcg_util: %s" %
                (lcgError))

        try:
            import gfalthr as gfal
            gLogger.info("Using gfalthr from: \n%s" % gfal.__file__)
            gLogger.info("The version of gfalthr is %s" % gfal.gfal_version())
        except Exception as gfalthrError:
            gLogger.warn(
                "SRM2Storage.__init__: Failed to import gfalthr: %s." %
                (gfalthrError))
            # Fall back to the plain gfal module
            try:
                import gfal
                gLogger.info("Using gfal from: %s" % gfal.__file__)
                gLogger.info("The version of gfal is %s" % gfal.gfal_version())
            except Exception as gfalError:
                gLogger.exception(
                    "SRM2Storage.__init__: Failed to import gfal: %s" %
                    (gfalError))

        protocols = gConfig.getValue(
            '/Resources/StorageElements/DefaultProtocols', [])
        gLogger.info('Default list of protocols are: %s' %
                     (', '.join(protocols)))
        return S_OK()

    #############################################################################
    def getSiteProtocols(self, site, printOutput=False):
        """Check the protocols defined for each SE of a site.

        For every SE listed for the site, the configuration sections whose
        ProtocolName is 'SRM2' are inspected; their ProtocolsList (or the
        default protocol list when it is empty) is reported.

        :param site: DIRAC site name
        :param printOutput: if True, print a summary table to stdout
        :return: S_OK( { SE : list of protocols } ), S_ERROR
        """
        result = self.__checkSiteIsValid(site)
        if not result['OK']:
            return result

        siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
        siteSEs = gConfig.getValue(siteSection, [])
        if not siteSEs:
            return S_ERROR('No SEs found for site %s in section %s' %
                           (site, siteSection))

        defaultProtocols = gConfig.getValue(
            '/Resources/StorageElements/DefaultProtocols', [])
        # The comma between the two arguments matters: without it the adjacent
        # literals are concatenated and the message becomes the join separator.
        self.log.verbose('Default list of protocols are',
                         ', '.join(defaultProtocols))
        seInfo = {}
        siteSEs.sort()
        for se in siteSEs:
            sections = gConfig.getSections('/Resources/StorageElements/%s/' %
                                           (se))
            if not sections['OK']:
                return sections
            for section in sections['Value']:
                protocolName = gConfig.getValue(
                    '/Resources/StorageElements/%s/%s/ProtocolName' %
                    (se, section), '')
                if protocolName != 'SRM2':
                    continue
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (
                    se, section)
                # An empty ProtocolsList means the defaults apply
                seInfo[se] = gConfig.getValue(path, []) or defaultProtocols

        if printOutput:
            print('\nSummary of protocols for StorageElements at site %s' % site)
            print('\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) +
                  '\n')
            for se, protocols in seInfo.iteritems():
                print(se.ljust(30) + ', '.join(protocols).ljust(30))

        return S_OK(seInfo)

    #############################################################################
    def setSiteProtocols(self, site, protocolsList, printOutput=False):
        """Set the protocol list for the SRM2 sections of every SE of a site.

        Every requested protocol must be part of the default protocol list.
        The user is prompted for confirmation before the configuration is
        modified; the changes are committed only if at least one option was set.

        :param site: DIRAC site name
        :param protocolsList: list of protocol names to set
        :param printOutput: if True, report the commit outcome on stdout
        :return: S_OK,S_ERROR
        """
        validity = self.__checkSiteIsValid(site)
        if not validity['OK']:
            return validity

        siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
        siteSEs = gConfig.getValue(siteSection, [])
        if not siteSEs:
            return S_ERROR('No SEs found for site %s in section %s' %
                           (site, siteSection))

        defaultProtocols = gConfig.getValue(
            '/Resources/StorageElements/DefaultProtocols', [])
        self.log.verbose('Default list of protocols are',
                         ', '.join(defaultProtocols))

        for protocol in protocolsList:
            if protocol not in defaultProtocols:
                return S_ERROR(
                    'Requested to set protocol %s in list but %s is not '
                    'in default list of protocols:\n%s' %
                    (protocol, protocol, ', '.join(defaultProtocols)))

        protocolsValue = ', '.join(protocolsList)
        answer = promptUser(
            'Do you want to add the following default protocols:'
            ' %s for SE(s):\n%s' % (protocolsValue, ', '.join(siteSEs)))
        if not answer['OK']:
            return answer
        if answer['Value'].lower() != 'y':
            self.log.always('No protocols will be added')
            return S_OK()

        modifiedCS = False
        for se in siteSEs:
            sections = gConfig.getSections('/Resources/StorageElements/%s/' %
                                           (se))
            if not sections['OK']:
                return sections
            for section in sections['Value']:
                protocolName = gConfig.getValue(
                    '/Resources/StorageElements/%s/%s/ProtocolName' %
                    (se, section), '')
                if protocolName != 'SRM2':
                    continue
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (
                    se, section)
                self.log.verbose('Setting %s to %s' % (path, protocolsValue))
                setResult = self.csSetOption(path, protocolsValue)
                if not setResult['OK']:
                    return setResult
                modifiedCS = True

        if not modifiedCS:
            if printOutput:
                print('No modifications to CS required')
            return S_OK()

        commit = self.csCommitChanges(False)
        if not commit['OK']:
            return S_ERROR('CS Commit failed with message = %s' %
                           (commit['Message']))
        if printOutput:
            print('Successfully committed changes to CS')

        return S_OK()

    #############################################################################
    def csSetOption(self, optionPath, optionValue):
        """Set the value of an option in the CS.

        :param optionPath: path of the option
        :param optionValue: value to store
        :return: whatever csAPI.setOption returns
        """
        outcome = self.csAPI.setOption(optionPath, optionValue)
        return outcome

    #############################################################################
    def csSetOptionComment(self, optionPath, comment):
        """Set the comment attached to an option in the CS.

        :param optionPath: path of the option
        :param comment: comment text
        :return: whatever csAPI.setOptionComment returns
        """
        outcome = self.csAPI.setOptionComment(optionPath, comment)
        return outcome

    #############################################################################
    def csModifyValue(self, optionPath, newValue):
        """Modify an existing value in the CS.

        :param optionPath: path of the option
        :param newValue: replacement value
        :return: whatever csAPI.modifyValue returns
        """
        outcome = self.csAPI.modifyValue(optionPath, newValue)
        return outcome

    #############################################################################
    def csRegisterUser(self, username, properties):
        """Register a user in the CS.

        :param username: name of the user
        :param properties: dict containing:

            - DN
            - groups : list/tuple of groups the user belongs to
            - <others> : more properties of the user, like mail

        :return: whatever csAPI.addUser returns
        """
        outcome = self.csAPI.addUser(username, properties)
        return outcome

    #############################################################################
    def csDeleteUser(self, user):
        """Delete a user from the CS; a list of users may also be given.

        :param user: user name or list of user names
        :return: whatever csAPI.deleteUsers returns
        """
        outcome = self.csAPI.deleteUsers(user)
        return outcome

    #############################################################################
    def csModifyUser(self, username, properties, createIfNonExistant=False):
        """Modify a user in the CS.

        Takes the same parameters as csRegisterUser and applies the changes.

        :param username: name of the user
        :param properties: dict of user properties
        :param createIfNonExistant: forwarded unchanged to csAPI.modifyUser
        :return: whatever csAPI.modifyUser returns
        """
        outcome = self.csAPI.modifyUser(username, properties,
                                        createIfNonExistant)
        return outcome

    #############################################################################
    def csListUsers(self, group=False):
        """List the users in the CS.

        :param group: forwarded to csAPI.listUsers; with the default (False)
                      all users are presumably returned
        :return: whatever csAPI.listUsers returns
        """
        outcome = self.csAPI.listUsers(group)
        return outcome

    #############################################################################
    def csDescribeUsers(self, mask=False):
        """List users and their properties in the CS.

        :param mask: forwarded to csAPI.describeUsers; when given, only users
                     in the mask are presumably returned
        :return: whatever csAPI.describeUsers returns
        """
        outcome = self.csAPI.describeUsers(mask)
        return outcome

    #############################################################################
    def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
        """Modify a group in the CS.

        :param groupname: name of the group
        :param properties: dict of group properties
        :param createIfNonExistant: forwarded unchanged to csAPI.modifyGroup
        :return: whatever csAPI.modifyGroup returns
        """
        outcome = self.csAPI.modifyGroup(groupname, properties,
                                         createIfNonExistant)
        return outcome

    #############################################################################
    def csListHosts(self):
        """List the hosts in the CS.

        :return: whatever csAPI.listHosts returns
        """
        outcome = self.csAPI.listHosts()
        return outcome

    #############################################################################
    def csDescribeHosts(self, mask=False):
        """Get extended information for the hosts in the CS.

        :param mask: forwarded unchanged to csAPI.describeHosts
        :return: whatever csAPI.describeHosts returns
        """
        outcome = self.csAPI.describeHosts(mask)
        return outcome

    #############################################################################
    def csModifyHost(self, hostname, properties, createIfNonExistant=False):
        """Modify a host in the CS.

        :param hostname: name of the host
        :param properties: dict of host properties
        :param createIfNonExistant: forwarded unchanged to csAPI.modifyHost
        :return: whatever csAPI.modifyHost returns
        """
        outcome = self.csAPI.modifyHost(hostname, properties,
                                        createIfNonExistant)
        return outcome

    #############################################################################
    def csListGroups(self):
        """List the groups in the CS.

        :return: whatever csAPI.listGroups returns
        """
        outcome = self.csAPI.listGroups()
        return outcome

    #############################################################################
    def csDescribeGroups(self, mask=False):
        """List groups and their properties in the CS.

        :param mask: forwarded to csAPI.describeGroups; when given, only groups
                     in the mask are presumably returned
        :return: whatever csAPI.describeGroups returns
        """
        outcome = self.csAPI.describeGroups(mask)
        return outcome

    #############################################################################
    def csSyncUsersWithCFG(self, usersCFG):
        """Synchronize the users in the CS with the contents of a cfg.

        :param usersCFG: forwarded unchanged to csAPI.syncUsersWithCFG
        :return: whatever csAPI.syncUsersWithCFG returns
        """
        outcome = self.csAPI.syncUsersWithCFG(usersCFG)
        return outcome

    #############################################################################
    def csCommitChanges(self, sortUsers=True):
        """Commit the pending changes to the CS.

        :param sortUsers: forwarded to csAPI.commitChanges (it used to be
                          silently ignored and False was always sent)
        :type sortUsers: boolean
        :return: whatever csAPI.commitChanges returns
        """
        return self.csAPI.commitChanges(sortUsers=sortUsers)

    #############################################################################
    def sendMail(self,
                 address,
                 subject,
                 body,
                 fromAddress=None,
                 localAttempt=True,
                 html=False):
        """Send a mail with the given subject and body to an address.

        All arguments are forwarded unchanged to NotificationClient.sendMail.

        :return: whatever NotificationClient.sendMail returns
        """
        client = NotificationClient()
        outcome = client.sendMail(address, subject, body, fromAddress,
                                  localAttempt, html)
        return outcome

    #############################################################################
    def sendSMS(self, userName, body, fromAddress=None):
        """Send an SMS with the given body to a user.

        :param userName: recipient, forwarded to NotificationClient.sendSMS
        :param body: message text, at most 160 characters
        :param fromAddress: forwarded to NotificationClient.sendSMS
        :return: S_ERROR if the body is too long, otherwise whatever
                 NotificationClient.sendSMS returns
        """
        tooLong = len(body) > 160
        if tooLong:
            return S_ERROR('Exceeded maximum SMS length of 160 characters')
        return NotificationClient().sendSMS(userName, body, fromAddress)

    #############################################################################
    def getBDIISite(self, site, host=None):
        """Get information about a site from the BDII at host.

        :return: whatever ldapSite returns
        """
        siteInfo = ldapSite(site, host=host)
        return siteInfo

    #############################################################################
    def getBDIICluster(self, ce, host=None):
        """Get cluster information for a CE from the BDII at host.

        :return: whatever ldapCluster returns
        """
        clusterInfo = ldapCluster(ce, host=host)
        return clusterInfo

    #############################################################################
    def getBDIICE(self, ce, host=None):
        """Get information about a CE from the BDII at host.

        :return: whatever ldapCE returns
        """
        ceInfo = ldapCE(ce, host=host)
        return ceInfo

    #############################################################################
    def getBDIIService(self, ce, host=None):
        """Get service information for a CE from the BDII at host.

        :return: whatever ldapService returns
        """
        serviceInfo = ldapService(ce, host=host)
        return serviceInfo

    #############################################################################
    def getBDIICEState(self, ce, useVO=voName, host=None):
        """Get the state of a CE for a VO from the BDII at host.

        :param useVO: VO name, defaults to the module-level voName
        :return: whatever ldapCEState returns
        """
        stateInfo = ldapCEState(ce, useVO, host=host)
        return stateInfo

    #############################################################################
    def getBDIICEVOView(self, ce, useVO=voName, host=None):
        """Get the VO view of a CE from the BDII at host.

        :param useVO: VO name, defaults to the module-level voName
        :return: whatever ldapCEVOView returns
        """
        voViewInfo = ldapCEVOView(ce, useVO, host=host)
        return voViewInfo

    #############################################################################
    def getBDIISE(self, site, useVO=voName, host=None):
        """Get storage information for a site and VO from the BDII at host.

        :param useVO: VO name, defaults to the module-level voName
        :return: whatever ldapSE returns
        """
        seInfo = ldapSE(site, useVO, host=host)
        return seInfo
Example #28
0
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']
    result = getQueuesResolved(siteDict)
    # Check the outcome of getQueuesResolved itself; resultQueues was already
    # validated right after Resources.getQueues above.
    if not result['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queue, queueInfo in queueDict.iteritems():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"
        ceStatus = siteStatus
Example #29
0
class DiracAdmin(API):
  """ Administrative functionalities
  """

  #############################################################################
  def __init__(self):
    """Internal initialization of the DIRAC Admin API.

    Sets up the CS API helper, the debug flag (true when the configured
    LogLevel is 'DEBUG', which is also the default), the scratch and current
    directories, the RSS flag and the site status helper.
    """
    super(DiracAdmin, self).__init__()

    self.csAPI = CSAPI()

    logLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
    self.dbg = logLevel == 'DEBUG'

    self.scratchDir = gConfig.getValue(self.section + '/ScratchDir', '/tmp')
    self.currentDir = os.getcwd()
    self.rssFlag = ResourceStatus().rssFlag
    self.sitestatus = SiteStatus()

  #############################################################################
  def uploadProxy(self, group):
    """Upload a proxy to the DIRAC proxy manager for the given group.

       Example usage:

         >>> print diracAdmin.uploadProxy('lhcb_pilot')
         {'OK': True, 'Value': 0L}

       :param group: DIRAC Group
       :type group: string
       :return: S_OK,S_ERROR

    """
    uploadResult = gProxyManager.uploadProxy(diracGroup=group)
    return uploadResult

  #############################################################################
  def setProxyPersistency(self, userDN, userGroup, persistent=True):
    """Set the persistence flag of a proxy in the Proxy Manager.

       Example usage:

         >>> print diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True )
         {'OK': True }

       :param userDN: User DN
       :type userDN: string
       :param userGroup: DIRAC Group
       :type userGroup: string
       :param persistent: Persistent flag
       :type persistent: boolean
       :return: S_OK,S_ERROR
    """
    persistencyResult = gProxyManager.setPersistency(userDN, userGroup, persistent)
    return persistencyResult

  #############################################################################
  def checkProxyUploaded(self, userDN, userGroup, requiredTime):
    """Check whether the Proxy Manager holds a proxy for a user and group.

       Example usage:

         >>> print diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 3600 )
         {'OK': True, 'Value' : True/False }

       :param userDN: User DN
       :type userDN: string
       :param userGroup: DIRAC Group
       :type userGroup: string
       :param requiredTime: Required life time of the uploaded proxy
                            (presumably in seconds -- confirm against gProxyManager)
       :return: S_OK,S_ERROR
    """
    hasProxy = gProxyManager.userHasProxy(userDN, userGroup, requiredTime)
    return hasProxy

  #############################################################################
  def getSiteMask(self, printOutput=False, status='Active'):
    """Retrieve the list of sites that are in the given status.

       Example usage:

         >>> print diracAdmin.getSiteMask()
         {'OK': True, 'Value': [<site names>]}

       :param printOutput: if True, sort the site list in place and print
                           one site per line
       :param status: site state to select, 'Active' by default
       :return: S_OK,S_ERROR

    """
    result = self.sitestatus.getSites(siteState=status)
    if not result['OK']:
      return result

    sites = result['Value']
    if printOutput:
      # In-place sort: the returned list is the sorted one as well
      sites.sort()
      for siteName in sites:
        print(siteName)

    return result

  #############################################################################
  def getBannedSites(self, printOutput=False):
    """Retrieve the sorted list of sites that are Banned or Probing.

       Example usage:

         >>> print diracAdmin.getBannedSites()
         {'OK': True, 'Value': []}

       :param printOutput: if True, print one site per line
       :return: S_OK( sorted list of site names ), S_ERROR

    """
    banned = self.sitestatus.getSites(siteState='Banned')
    if not banned['OK']:
      return banned

    probing = self.sitestatus.getSites(siteState='Probing')
    if not probing['OK']:
      return probing

    unusableSites = banned['Value'] + probing['Value']
    unusableSites = sorted(unusableSites)

    if printOutput:
      print('\n'.join(unusableSites))

    return S_OK(unusableSites)

  #############################################################################
  def getSiteSection(self, site, printOutput=False):
    """Get the configuration options of a DIRAC site as a dictionary.

       The grid type is the part of the site name before the first dot; it
       must be an existing section under /Resources/Sites.

       Example usage:

         >>> print diracAdmin.getSiteSection('LCG.CERN.ch')
         {'OK': True, 'Value':}

       :param printOutput: if True, pretty-print the options on success
       :return: S_OK,S_ERROR
    """
    gridType = site.split('.')[0]
    gridSection = gConfig.getSections('/Resources/Sites/%s' % (gridType))
    if not gridSection['OK']:
      return S_ERROR('/Resources/Sites/%s is not a valid site section' % (gridType))

    siteOptions = gConfig.getOptionsDict('/Resources/Sites/%s/%s' % (gridType, site))
    if siteOptions['OK'] and printOutput:
      print(self.pPrint.pformat(siteOptions['Value']))
    return siteOptions

  #############################################################################
  def allowSite(self, site, comment, printOutput=False):
    """Set a site to Active, i.e. add it to the site mask.

       Nothing is changed when the site is already Active. The status is set
       through the site status helper when the RSS flag is on, otherwise
       through the WMSAdministrator service.

       Example usage:

         >>> print diracAdmin.allowSite('LCG.CERN.ch', 'some comment')
         {'OK': True, 'Value': }

       :param site: DIRAC site name
       :param comment: reason recorded with the status change
       :param printOutput: if True, report the outcome on stdout
       :return: S_OK,S_ERROR

    """
    validity = self.__checkSiteIsValid(site)
    if not validity['OK']:
      return validity

    activeSites = self.getSiteMask(status='Active')
    if not activeSites['OK']:
      return activeSites
    if site in activeSites['Value']:
      alreadyMessage = 'Site %s is already Active' % site
      if printOutput:
        print(alreadyMessage)
      return S_OK(alreadyMessage)

    if self.rssFlag:
      outcome = self.sitestatus.setSiteStatus(site, 'Active', comment)
    else:
      outcome = RPCClient('WorkloadManagement/WMSAdministrator').allowSite(site, comment)

    if outcome['OK'] and printOutput:
      print('Site %s status is set to Active' % site)

    return outcome

  #############################################################################
  def getSiteMaskLogging(self, site=None, printOutput=False):
    """Retrieve the site mask logging (status history) information.

       The history comes from the ResourceStatusClient when the RSS flag is
       on, otherwise from the WMSAdministrator service.

       Example usage:

         >>> print diracAdmin.getSiteMaskLogging('LCG.AUVER.fr')
         {'OK': True, 'Value': }

       :param site: DIRAC site name
       :param printOutput: if True, print the logging records to stdout
       :return: S_OK,S_ERROR -- the service result, whatever printOutput is
    """
    # NOTE(review): with the default site=None this validity check looks like
    # it always fails, so the "all sites" printing branch below appears
    # unreachable -- confirm whether None should skip the check.
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    if self.rssFlag:
      result = ResourceStatusClient().selectStatusElement('Site', 'History', name=site)
    else:
      result = RPCClient('WorkloadManagement/WMSAdministrator').getSiteMaskLogging(site)

    if not result['OK']:
      return result

    if printOutput:
      if site:
        print('\nSite Mask Logging Info for %s\n' % site)
      else:
        print('\nAll Site Mask Logging Info\n')

      sitesLogging = result['Value']
      if isinstance(sitesLogging, dict):
        for siteName, tupleList in sitesLogging.iteritems():
          # Print the per-site header only when there is a name to show
          # (the condition used to be inverted).
          if siteName:
            print('\n===> %s\n' % siteName)
          for tup in tupleList:
            print(str(tup[0]).ljust(8) + str(tup[1]).ljust(20) +
                  '( ' + str(tup[2]) + ' )  "' + str(tup[3]) + '"')
          print(' ')
      elif isinstance(sitesLogging, list):
        # Print the selected columns of each record; the S_OK structure in
        # `result` is left untouched so callers always get S_OK/S_ERROR back
        # (it used to be replaced by a bare list here).
        for sl in sitesLogging:
          print((sl[1], sl[3], sl[4]))

    return result

  #############################################################################
  def banSite(self, site, comment, printOutput=False):
    """Set a site to Banned, i.e. remove it from the site mask.

       Nothing is changed when the site is already Banned. The status is set
       through the site status helper when the RSS flag is on, otherwise
       through the WMSAdministrator service.

       Example usage:

         >>> print diracAdmin.banSite('LCG.CERN.ch', 'some comment')
         {'OK': True, 'Value': }

       :param site: DIRAC site name
       :param comment: reason recorded with the status change
       :param printOutput: if True, report the outcome on stdout
       :return: S_OK,S_ERROR

    """
    validity = self.__checkSiteIsValid(site)
    if not validity['OK']:
      return validity

    bannedSites = self.getSiteMask(status='Banned')
    if not bannedSites['OK']:
      return bannedSites
    if site in bannedSites['Value']:
      alreadyMessage = 'Site %s is already Banned' % site
      if printOutput:
        print(alreadyMessage)
      return S_OK(alreadyMessage)

    if self.rssFlag:
      outcome = self.sitestatus.setSiteStatus(site, 'Banned', comment)
    else:
      outcome = RPCClient('WorkloadManagement/WMSAdministrator').banSite(site, comment)

    if outcome['OK'] and printOutput:
      print('Site %s status is set to Banned' % site)

    return outcome

  #############################################################################
  def __checkSiteIsValid(self, site):
    """Internal check that a site name is one of the defined sites.

       :param site: DIRAC site name
       :return: S_OK with a confirmation message, or S_ERROR when the site CE
                mapping cannot be obtained or does not contain the site
    """
    mapping = getSiteCEMapping()
    if not mapping['OK']:
      return S_ERROR('Could not get site CE mapping')

    knownSites = mapping['Value'].keys()
    if site in knownSites:
      return S_OK('%s is valid' % site)

    return S_ERROR('Specified site %s is not in list of defined sites' % site)

  #############################################################################
  def clearMask(self):
    """Remove all sites from the site mask.  Should be used with care.

       Example usage:

         >>> print diracAdmin.clearMask()
         {'OK': True, 'Value':''}

       :return: S_OK,S_ERROR (the WMSAdministrator clearMask result)

    """
    return RPCClient('WorkloadManagement/WMSAdministrator').clearMask()

  #############################################################################
  def getServicePorts(self, setup='', printOutput=False):
    """Collect the configured service ports for the specified setup.

       If no setup is given it is taken from the current installation
       (/DIRAC/Setup). Systems that are not defined for the setup, and
       services without a port, are reported as warnings and skipped.

       Example usage:

         >>> print diracAdmin.getServicePorts()
         {'OK': True, 'Value':''}

       :param setup: DIRAC setup name
       :param printOutput: if True, pretty-print the resulting dictionary
       :return: S_OK( { 'System/Service' : port } ), S_ERROR

    """
    if not setup:
      setup = gConfig.getValue('/DIRAC/Setup', '')

    setupsResult = gConfig.getSections('/DIRAC/Setups', [])
    if not setupsResult['OK']:
      return S_ERROR('Could not get /DIRAC/Setups sections')
    setupList = setupsResult['Value']
    if setup not in setupList:
      return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(setupList)))

    setupOptions = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
    if not setupOptions['OK']:
      return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
    serviceSetups = setupOptions['Value']  # dict: system -> instance name

    systemsResult = gConfig.getSections('/Systems')
    if not systemsResult['OK']:
      return S_ERROR('Could not get Systems sections')

    ports = {}
    for system in systemsResult['Value']:
      if system not in serviceSetups:
        self.log.warn('%s is not defined in /DIRAC/Setups/%s' % (system, setup))
        continue

      path = '/Systems/%s/%s/Services' % (system, serviceSetups[system])
      servicesResult = gConfig.getSections(path)
      if not servicesResult['OK']:
        self.log.warn('Could not get sections in %s' % path)
        continue

      servicesList = servicesResult['Value'] or []
      self.log.verbose('System: %s ServicesList: %s' % (system, ', '.join(servicesList)))
      for service in servicesList:
        spath = '%s/%s/Port' % (path, service)
        servicePort = gConfig.getValue(spath, 0)
        if not servicePort:
          self.log.warn('No port found for %s' % spath)
          continue
        self.log.verbose('Found port for %s/%s = %s' % (system, service, servicePort))
        ports['%s/%s' % (system, service)] = servicePort

    if printOutput:
      print(self.pPrint.pformat(ports))

    return S_OK(ports)

  #############################################################################
  def getProxy(self, userDN, userGroup, validity=43200, limited=False):
    """Download a proxy for a user and group from the proxy manager.

       Example usage:

         >>> print diracAdmin.getProxy('some DN', 'dirac group')
         {'OK': True, 'Value': }

       :param userDN: User DN
       :param userGroup: DIRAC Group
       :param validity: required time left, 43200 (12 hours) by default
       :param limited: forwarded to gProxyManager.downloadProxy
       :return: S_OK,S_ERROR

    """
    proxyResult = gProxyManager.downloadProxy(userDN, userGroup,
                                              requiredTimeLeft=validity,
                                              limited=limited)
    return proxyResult

  #############################################################################
  def getVOMSProxy(self, userDN, userGroup, vomsAttr=False, validity=43200, limited=False):
    """Download a proxy with VOMS extensions for the given user DN and group.

       The request is forwarded to gProxyManager.downloadVOMSProxy and its
       result is returned unchanged. The default validity (43200)
       corresponds to 12 hours.

       Example usage:

         >>> print diracAdmin.getVOMSProxy(userDN, userGroup)
         {'OK': True, 'Value': }

       :param userDN: DN of the user the proxy is requested for
       :param userGroup: group the proxy is requested for
       :param vomsAttr: forwarded as requiredVOMSAttribute
       :param validity: forwarded as requiredTimeLeft
       :param limited: forwarded as limited
       :return: S_OK,S_ERROR

    """
    proxyResult = gProxyManager.downloadVOMSProxy(userDN, userGroup,
                                                  requiredVOMSAttribute=vomsAttr,
                                                  requiredTimeLeft=validity,
                                                  limited=limited)
    return proxyResult

  #############################################################################
  def getPilotProxy(self, userDN, userGroup, validity=43200):
    """Retrieve a pilot proxy for the given user DN and group.

       The request is forwarded to gProxyManager.getPilotProxyFromDIRACGroup
       and its result is returned unchanged. The default validity (43200)
       corresponds to 12 hours.

       Example usage:

         >>> print diracAdmin.getPilotProxy(userDN, userGroup)
         {'OK': True, 'Value': }

       :param userDN: DN of the user the proxy is requested for
       :param userGroup: group the proxy is requested for
       :param validity: forwarded as requiredTimeLeft
       :return: S_OK,S_ERROR

    """
    pilotProxy = gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup,
                                                           requiredTimeLeft=validity)
    return pilotProxy

  #############################################################################
  def resetJob(self, jobID):
    """Reset a job or list of jobs in the WMS.  This operation resets the reschedule
       counter for a job or list of jobs and allows them to run as new.

       A string, or the elements of a list, are converted to integers before
       the request is sent to the JobManager service; any other type is
       forwarded as given.

       Example::

         >>> print dirac.resetJob(12345)
         {'OK': True, 'Value': [12345]}

       :param jobID: JobID
       :type jobID: integer, string or list of them
       :return: S_OK,S_ERROR

    """
    if isinstance(jobID, list):
      try:
        jobID = [int(singleID) for singleID in jobID]
      except Exception as error:
        return self._errorReport(str(error), 'Expected integer or convertible integer for existing jobIDs')
    elif isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except Exception as error:
        return self._errorReport(str(error), 'Expected integer or convertible integer for existing jobID')

    return RPCClient('WorkloadManagement/JobManager', useCertificates=False).resetJob(jobID)

  #############################################################################
  def getJobPilotOutput(self, jobID, directory=''):
    """Retrieve the pilot output for an existing job in the WMS.
       The output is written to std.out / std.err files inside a newly
       created pilot_<jobID> directory under the given directory
       (self.currentDir when none is specified).

         >>> print dirac.getJobPilotOutput(12345)
         {'OK': True, 'Value': {}}

       :param jobID: JobID
       :type jobID: integer or string
       :param directory: existing directory in which the output directory is created
       :type directory: string
       :return: S_OK,S_ERROR
    """
    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getJobPilotOutput(jobID)
    if not result['OK']:
      return result

    outputPath = '%s/pilot_%s' % (directory, jobID)
    if os.path.exists(outputPath):
      # Never overwrite a previous retrieval: the user has to clean up first
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    # The path is known not to exist at this point, so it is always created
    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.verbose('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    # NOTE(review): getPilotOutput reads the standard error under 'StdErr' while
    # this method used to look for 'StdError' only. Both spellings are accepted
    # so the error stream is not silently dropped - confirm which key the
    # WMSAdministrator service actually returns.
    stdErrKey = 'StdError' if 'StdError' in outputs else 'StdErr'
    if stdErrKey in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs[stdErrKey])
      self.log.verbose('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotOutput(self, gridReference, directory=''):
    """Retrieve the pilot output (std.out and std.err) for an existing pilot in the WMS.
       The files are written inside a newly created pilot_<reference> directory
       under the given directory (self.currentDir when none is specified),
       where <reference> is the part of gridReference after its last '/'.

         >>> print dirac.getPilotOutput(gridReference)
         {'OK': True, 'Value': {}}

       :param gridReference: pilot job reference
       :type gridReference: string
       :param directory: existing directory in which the output directory is created
       :type directory: string
       :return: S_OK,S_ERROR
    """
    if not isinstance(gridReference, basestring):
      return self._errorReport('Expected string for pilot reference')

    if not directory:
      directory = self.currentDir

    if not os.path.exists(directory):
      return self._errorReport('Directory %s does not exist' % directory)

    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.getPilotOutput(gridReference)
    if not result['OK']:
      return result

    # Use the last path component of the reference as directory suffix,
    # with a fixed fallback when the reference ends with '/'
    gridReferenceSmall = gridReference.split('/')[-1]
    if not gridReferenceSmall:
      gridReferenceSmall = 'reference'
    outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

    if os.path.exists(outputPath):
      # Never overwrite a previous retrieval: the user has to clean up first
      self.log.info('Remove %s and retry to continue' % outputPath)
      return S_ERROR('Remove %s and retry to continue' % outputPath)

    # The path is known not to exist at this point, so it is always created
    self.log.verbose('Creating directory %s' % outputPath)
    os.mkdir(outputPath)

    outputs = result['Value']
    if 'StdOut' in outputs:
      stdout = '%s/std.out' % (outputPath)
      with open(stdout, 'w') as fopen:
        fopen.write(outputs['StdOut'])
      self.log.info('Standard output written to %s' % (stdout))
    else:
      self.log.warn('No standard output returned')

    if 'StdErr' in outputs:
      stderr = '%s/std.err' % (outputPath)
      with open(stderr, 'w') as fopen:
        fopen.write(outputs['StdErr'])
      self.log.info('Standard error written to %s' % (stderr))
    else:
      self.log.warn('No standard error returned')

    self.log.always('Outputs retrieved in %s' % outputPath)
    return result

  #############################################################################
  def getPilotInfo(self, gridReference):
    """Retrieve info relative to a pilot reference.

       The reference must be a string; the request is sent to the
       WMSAdministrator service and its result is returned unchanged.

         >>> print dirac.getPilotInfo(gridReference)
         {'OK': True, 'Value': {}}

       :param gridReference: Pilot Job Reference
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
      return RPCClient('WorkloadManagement/WMSAdministrator').getPilotInfo(gridReference)

    return self._errorReport('Expected string for pilot reference')

  #############################################################################
  def killPilot(self, gridReference):
    """Kill the pilot specified.

       The reference must be a string; the request is sent to the
       WMSAdministrator service and its result is returned unchanged.

         >>> print dirac.killPilot(gridReference)
         {'OK': True, 'Value': {}}

       :param gridReference: Pilot Job Reference
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
      return RPCClient('WorkloadManagement/WMSAdministrator').killPilot(gridReference)

    return self._errorReport('Expected string for pilot reference')

  #############################################################################
  def getPilotLoggingInfo(self, gridReference):
    """Retrieve the pilot logging info for an existing pilot in the WMS.

       The reference must be a string; the request is sent to the
       WMSAdministrator service and its result is returned unchanged.

         >>> print dirac.getPilotLoggingInfo(gridReference)
         {'OK': True, 'Value': {"The output of the command"}}

       :param gridReference: Grid pilot job reference Id
       :type gridReference: string
       :return: S_OK,S_ERROR
    """
    if isinstance(gridReference, basestring):
      loggingInfo = RPCClient('WorkloadManagement/WMSAdministrator').getPilotLoggingInfo(gridReference)
      return loggingInfo

    return self._errorReport('Expected string for pilot reference')

  #############################################################################
  def getJobPilots(self, jobID):
    """Extract the list of submitted pilots and their status for a given
       jobID from the WMS.  On success the returned value is pretty-printed
       to the screen.

         >>> print dirac.getJobPilots(12345)
         {'OK': True, 'Value': {PilotID:{StatusDict}}}

       :param jobID: JobID (a string is converted to an integer first)
       :type jobID: integer or string
       :return: S_OK,S_ERROR

    """
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except Exception as error:
        return self._errorReport(str(error), 'Expected integer or string for existing jobID')

    pilots = RPCClient('WorkloadManagement/WMSAdministrator').getPilots(jobID)
    if not pilots['OK']:
      return pilots

    print(self.pPrint.pformat(pilots['Value']))
    return pilots

  #############################################################################
  def getPilotSummary(self, startDate='', endDate=''):
    """Retrieve the pilot summary from the WMS.  A table with one line per CE
       and its Status / Count pairs is printed to the screen, the full
       dictionary of results is also returned.

         >>> print dirac.getPilotSummary()
         {'OK': True, 'Value': {CE:{Status:Count}}}

       :param startDate: forwarded to the WMSAdministrator getPilotSummary call
       :param endDate: forwarded to the WMSAdministrator getPilotSummary call
       :return: S_OK,S_ERROR
    """
    result = RPCClient('WorkloadManagement/WMSAdministrator').getPilotSummary(startDate, endDate)
    if not result['OK']:
      return result

    ceDict = result['Value']

    # The header needs one Status/Count column pair per state of the CE
    # that reports the largest number of states (none for an empty summary)
    maxStates = max([len(summary.keys()) for summary in ceDict.values()] or [0])
    print('CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates)

    for ce, summary in ceDict.items():
      cells = [state.ljust(12) + str(summary[state]).ljust(12) for state in sorted(summary)]
      print(ce.ljust(28) + ''.join(cells))

    return result

  #############################################################################
  def selectRequests(self, jobID=None, requestID=None, requestName=None,
                     requestType=None, status=None, operation=None, ownerDN=None,
                     ownerGroup=None, requestStart=0, limit=100, printOutput=False):
    """Select requests from the request management system. A few notes on the selection criteria:

         - jobID is the WMS JobID for the request (if applicable)
         - requestID is assigned during submission of the request
         - requestName is the corresponding XML file name
         - requestType e.g. 'transfer'
         - status e.g. Done
         - operation e.g. replicateAndRegister
         - requestStart e.g. the first request to consider (start from 0 by default)
         - limit e.g. selection limit (default 100)

       Only the criteria with a true value are used, each converted to a string.
       With printOutput the selection summary, the parameter names and at most
       limit records are printed to the screen.

       >>> dirac.selectRequests(jobID='4894')
       {'OK': True, 'Value': [[<Requests>]]}

       :return: S_OK,S_ERROR

    """
    options = {'RequestID': requestID, 'RequestName': requestName, 'JobID': jobID, 'OwnerDN': ownerDN,
               'OwnerGroup': ownerGroup, 'RequestType': requestType, 'Status': status, 'Operation': operation}

    conditions = {}
    for key, value in options.items():
      if value:
        try:
          conditions[key] = str(value)
        except Exception as x:
          return self._errorReport(str(x), 'Expected string for %s field' % key)

    try:
      requestStart = int(requestStart)
      limit = int(limit)
    except Exception as x:
      # Name the fields rather than interpolating the offending value of limit
      return self._errorReport(str(x), 'Expected integer for requestStart and limit fields')

    self.log.verbose('Will select requests with the following conditions')
    self.log.verbose(self.pPrint.pformat(conditions))
    requestClient = RPCClient("RequestManagement/centralURL")
    result = requestClient.getRequestSummaryWeb(conditions, [], requestStart, limit)
    if not result['OK']:
      self.log.warn(result['Message'])
      return result

    requestIDs = result['Value']
    # This has to be checked before the value is indexed below, otherwise an
    # empty answer fails on the key lookup instead of giving a proper error
    if not requestIDs:
      return S_ERROR('No requests selected for conditions: %s' % conditions)

    records = requestIDs['Records']
    conds = ['%s = %s' % (key, value) for key, value in conditions.items() if value]
    summary = '%s request(s) selected with conditions %s and limit %s' % (len(records),
                                                                          ', '.join(conds), limit)
    self.log.verbose(summary)
    if printOutput:
      print(summary)
      print(requestIDs['ParameterNames'])
      # Slicing never yields more than limit records
      for request in records[:limit]:
        print(request)

    return result

  #############################################################################
  def getRequestSummary(self, printOutput=False):
    """
    Get a summary of the requests in the request DB.

    The summary is obtained with the getDBSummary call of the request
    management service; a failure is logged as a warning and returned as is.

    :param printOutput: when true the returned value is pretty-printed
    :return: S_OK,S_ERROR
    """
    summary = RPCClient("RequestManagement/centralURL", timeout=120).getDBSummary()

    if summary['OK']:
      if printOutput:
        print(self.pPrint.pformat(summary['Value']))
    else:
      self.log.warn(summary['Message'])

    return summary

  #############################################################################
  def getExternalPackageVersions(self):
    """
    Log the versions of the external packages of the local DIRAC installation
    (frequently needed for debugging purposes).

    The DIRAC version is logged first, then the location and version of
    lcg_util and of gfalthr (falling back to gfal when gfalthr cannot be
    used), and finally the default list of storage protocols. Import
    problems are only logged, the method always returns S_OK.

    :return: S_OK
    """
    gLogger.info('DIRAC version v%dr%d build %d' % (DIRAC.majorVersion, DIRAC.minorVersion, DIRAC.patchLevel))

    try:
      import lcg_util  # pylint: disable=import-error
      gLogger.info('Using lcg_util from: \n%s' % lcg_util.__file__)
      gLogger.info("The version of lcg_utils is %s" % lcg_util.lcg_util_version())
    except Exception as error:
      gLogger.exception("SRM2Storage.__init__: Failed to import lcg_util: %s" % (error))

    try:
      import gfalthr as gfal  # pylint: disable=import-error
      gLogger.info("Using gfalthr from: \n%s" % gfal.__file__)
      gLogger.info("The version of gfalthr is %s" % gfal.gfal_version())
    except Exception as error:
      gLogger.warn("SRM2Storage.__init__: Failed to import gfalthr: %s." % (error))
      # Second chance with the plain gfal module
      try:
        import gfal  # pylint: disable=import-error
        gLogger.info("Using gfal from: %s" % gfal.__file__)
        gLogger.info("The version of gfal is %s" % gfal.gfal_version())
      except Exception as error:
        gLogger.exception("SRM2Storage.__init__: Failed to import gfal: %s" % (error))

    protocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    gLogger.info('Default list of protocols are: %s' % (', '.join(protocols)))
    return S_OK()

  #############################################################################
  def getSiteProtocols(self, site, printOutput=False):
    """
    Allows to check the defined protocols for each site SE.

    For every SE listed for the site, the ProtocolsList of its sections with
    ProtocolName 'SRM2' is looked up in the configuration; the default list of
    protocols is used when no list is defined there.

    :param site: site name, its part before the first '.' selects the
                 /Resources/Sites subsection
    :param printOutput: when true a summary table is printed to the screen
    :return: S_OK( { SE : list of protocols } ), S_ERROR
    """
    result = self.__checkSiteIsValid(site)
    if not result['OK']:
      return result

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    # The message and the list are separate arguments (as in setSiteProtocols);
    # without the comma the message text was used as the join separator
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))
    seInfo = {}
    # sorted() leaves the list obtained from the configuration untouched
    for se in sorted(siteSEs):
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') == 'SRM2':
          path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
          seProtocols = gConfig.getValue(path, [])
          if not seProtocols:
            seProtocols = defaultProtocols
          seInfo[se] = seProtocols

    if printOutput:
      print('\nSummary of protocols for StorageElements at site %s' % site)
      print('\nStorageElement'.ljust(30) + 'ProtocolsList'.ljust(30) + '\n')
      for se, protocols in seInfo.items():
        print(se.ljust(30) + ', '.join(protocols).ljust(30))

    return S_OK(seInfo)

  #############################################################################
  def setSiteProtocols(self, site, protocolsList, printOutput=False):
    """
    Allows to set the defined protocols for each SE for a given site.

    Every requested protocol has to be in the default list of protocols. After
    confirmation by the user, the ProtocolsList option of each section with
    ProtocolName 'SRM2' of the site SEs is set, and the changes are committed
    if at least one option was set.

    :param site: site name, its part before the first '.' selects the
                 /Resources/Sites subsection
    :param protocolsList: protocols to set
    :param printOutput: when true the outcome is printed to the screen
    :return: S_OK,S_ERROR
    """
    validSite = self.__checkSiteIsValid(site)
    if not validSite['OK']:
      return validSite

    siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
    siteSEs = gConfig.getValue(siteSection, [])
    if not siteSEs:
      return S_ERROR('No SEs found for site %s in section %s' % (site, siteSection))

    defaultProtocols = gConfig.getValue('/Resources/StorageElements/DefaultProtocols', [])
    self.log.verbose('Default list of protocols are', ', '.join(defaultProtocols))

    for requested in protocolsList:
      if requested not in defaultProtocols:
        return S_ERROR('Requested to set protocol %s in list but %s is not '
                       'in default list of protocols:\n%s' % (requested, requested, ', '.join(defaultProtocols)))

    protocolsStr = ', '.join(protocolsList)
    answer = promptUser('Do you want to add the following default protocols:'
                        ' %s for SE(s):\n%s' % (protocolsStr, ', '.join(siteSEs)))
    if not answer['OK']:
      return answer
    if result_is_not_yes(answer) if False else answer['Value'].lower() != 'y':
      self.log.always('No protocols will be added')
      return S_OK()

    modifiedCS = False
    for se in siteSEs:
      sections = gConfig.getSections('/Resources/StorageElements/%s/' % (se))
      if not sections['OK']:
        return sections
      for section in sections['Value']:
        if gConfig.getValue('/Resources/StorageElements/%s/%s/ProtocolName' % (se, section), '') != 'SRM2':
          continue
        path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (se, section)
        self.log.verbose('Setting %s to %s' % (path, protocolsStr))
        setResult = self.csSetOption(path, protocolsStr)
        if not setResult['OK']:
          return setResult
        modifiedCS = True

    if not modifiedCS:
      if printOutput:
        print('No modifications to CS required')
      return S_OK()

    commitResult = self.csCommitChanges(False)
    if not commitResult['OK']:
      return S_ERROR('CS Commit failed with message = %s' % (commitResult['Message']))
    if printOutput:
      print('Successfully committed changes to CS')

    return S_OK()

  #############################################################################
  def csSetOption(self, optionPath, optionValue):
    """
    Set the value of an option in the CS.

    The call is forwarded to self.csAPI.setOption and its result is returned.

    :param optionPath: path of the option
    :param optionValue: value to set
    """
    result = self.csAPI.setOption(optionPath, optionValue)
    return result

  #############################################################################
  def csSetOptionComment(self, optionPath, comment):
    """
    Set the comment of an option in the CS.

    The call is forwarded to self.csAPI.setOptionComment and its result is
    returned.

    :param optionPath: path of the option
    :param comment: comment to set
    """
    result = self.csAPI.setOptionComment(optionPath, comment)
    return result

  #############################################################################
  def csModifyValue(self, optionPath, newValue):
    """
    Function to modify an existing value in the CS.

    The call is forwarded to self.csAPI.modifyValue and its result is returned.

    :param optionPath: path of the option
    :param newValue: new value of the option
    """
    result = self.csAPI.modifyValue(optionPath, newValue)
    return result

  #############################################################################
  def csRegisterUser(self, username, properties):
    """
    Registers a user in the CS.

    The call is forwarded to self.csAPI.addUser and its result is returned.

        - username: Username of the user
        - properties: Dict containing:
            - DN
            - groups : list/tuple of groups the user belongs to
            - <others> : More properties of the user, like mail

    """
    result = self.csAPI.addUser(username, properties)
    return result

  #############################################################################
  def csDeleteUser(self, user):
    """
    Deletes a user from the CS. Can take a list of users.

    The call is forwarded to self.csAPI.deleteUsers and its result is returned.

    :param user: user (or list of users) to delete
    """
    result = self.csAPI.deleteUsers(user)
    return result

  #############################################################################
  def csModifyUser(self, username, properties, createIfNonExistant=False):
    """
    Modify a user in the CS. Takes the same params as csRegisterUser and
    applies the changes.

    The call is forwarded to self.csAPI.modifyUser and its result is returned.

    :param username: name of the user
    :param properties: properties of the user
    :param createIfNonExistant: forwarded to self.csAPI.modifyUser
    """
    result = self.csAPI.modifyUser(username, properties, createIfNonExistant)
    return result

  #############################################################################
  def csListUsers(self, group=False):
    """
    Lists the users in the CS. If no group is specified return all users.

    The call is forwarded to self.csAPI.listUsers and its result is returned.

    :param group: group the listing is restricted to
    """
    result = self.csAPI.listUsers(group)
    return result

  #############################################################################
  def csDescribeUsers(self, mask=False):
    """
    List users and their properties in the CS.
    If a mask is given, only users in the mask will be returned.

    The call is forwarded to self.csAPI.describeUsers and its result is
    returned.

    :param mask: users the description is restricted to
    """
    result = self.csAPI.describeUsers(mask)
    return result

  #############################################################################
  def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
    """
    Modify a group in the CS and apply the changes.

    The call is forwarded to self.csAPI.modifyGroup and its result is returned.

    :param groupname: name of the group
    :param properties: properties of the group
    :param createIfNonExistant: forwarded to self.csAPI.modifyGroup
    """
    result = self.csAPI.modifyGroup(groupname, properties, createIfNonExistant)
    return result

  #############################################################################
  def csListHosts(self):
    """
    Lists the hosts in the CS.

    The call is forwarded to self.csAPI.listHosts and its result is returned.
    """
    result = self.csAPI.listHosts()
    return result

  #############################################################################
  def csDescribeHosts(self, mask=False):
    """
    Gets extended info for the hosts in the CS.

    The call is forwarded to self.csAPI.describeHosts and its result is
    returned.

    :param mask: forwarded to self.csAPI.describeHosts
    """
    result = self.csAPI.describeHosts(mask)
    return result

  #############################################################################
  def csModifyHost(self, hostname, properties, createIfNonExistant=False):
    """
    Modify a host in the CS and apply the changes.

    The call is forwarded to self.csAPI.modifyHost and its result is returned.

    :param hostname: name of the host
    :param properties: properties of the host
    :param createIfNonExistant: forwarded to self.csAPI.modifyHost
    """
    result = self.csAPI.modifyHost(hostname, properties, createIfNonExistant)
    return result

  #############################################################################
  def csListGroups(self):
    """
    Lists groups in the CS.

    The call is forwarded to self.csAPI.listGroups and its result is returned.
    """
    result = self.csAPI.listGroups()
    return result

  #############################################################################
  def csDescribeGroups(self, mask=False):
    """
    List groups and their properties in the CS.
    If a mask is given, only groups in the mask will be returned.

    The call is forwarded to self.csAPI.describeGroups and its result is
    returned.

    :param mask: groups the description is restricted to
    """
    result = self.csAPI.describeGroups(mask)
    return result

  #############################################################################
  def csSyncUsersWithCFG(self, usersCFG):
    """
    Synchronize the users in the CS with the contents of the given cfg.

    The call is forwarded to self.csAPI.syncUsersWithCFG and its result is
    returned.

    :param usersCFG: cfg holding the users
    """
    result = self.csAPI.syncUsersWithCFG(usersCFG)
    return result

  #############################################################################
  def csCommitChanges(self, sortUsers=True):
    """
    Commit the changes in the CS.

    The call is forwarded to self.csAPI.commitChanges and its result is
    returned.

    :param sortUsers: forwarded as the sortUsers keyword of
                      self.csAPI.commitChanges
    """
    # The caller's choice is honoured; it used to be ignored in favour of a
    # hard-coded False
    return self.csAPI.commitChanges(sortUsers=sortUsers)

  #############################################################################
  def sendMail(self, address, subject, body, fromAddress=None, localAttempt=True, html=False):
    """
    Send mail to specified address with body.

    A NotificationClient is created for the call and all the arguments are
    forwarded, in the same order, to its sendMail method.

    :return: result of NotificationClient.sendMail
    """
    return NotificationClient().sendMail(address, subject, body, fromAddress, localAttempt, html)

  #############################################################################
  def sendSMS(self, userName, body, fromAddress=None):
    """
    Send an SMS with the given body to the specified user.

    Bodies longer than 160 characters are refused with S_ERROR; otherwise the
    arguments are forwarded to the sendSMS method of a NotificationClient.

    :return: S_ERROR or the result of NotificationClient.sendSMS
    """
    if len(body) <= 160:
      return NotificationClient().sendSMS(userName, body, fromAddress)

    return S_ERROR('Exceeded maximum SMS length of 160 characters')

  #############################################################################
  def getBDIISite(self, site, host=None):
    """
    Get information about site from BDII at host.

    :return: result of the ldapSite call
    """
    siteInfo = ldapSite(site, host=host)
    return siteInfo

  #############################################################################
  def getBDIICluster(self, ce, host=None):
    """
    Get information about the cluster of ce from BDII at host.

    :return: result of the ldapCluster call
    """
    clusterInfo = ldapCluster(ce, host=host)
    return clusterInfo

  #############################################################################
  def getBDIICE(self, ce, host=None):
    """
    Get information about ce from BDII at host.

    :return: result of the ldapCE call
    """
    ceInfo = ldapCE(ce, host=host)
    return ceInfo

  #############################################################################
  def getBDIIService(self, ce, host=None):
    """
    Get information about the service of ce from BDII at host.

    :return: result of the ldapService call
    """
    serviceInfo = ldapService(ce, host=host)
    return serviceInfo

  #############################################################################
  def getBDIICEState(self, ce, useVO=voName, host=None):
    """
    Get information about ce state from BDII at host.

    :param useVO: VO forwarded to ldapCEState (module level voName by default)
    :return: result of the ldapCEState call
    """
    ceState = ldapCEState(ce, useVO, host=host)
    return ceState

  #############################################################################
  def getBDIICEVOView(self, ce, useVO=voName, host=None):
    """
    Get information about ce voview from BDII at host.

    :param useVO: VO forwarded to ldapCEVOView (module level voName by default)
    :return: result of the ldapCEVOView call
    """
    voView = ldapCEVOView(ce, useVO, host=host)
    return voView

  #############################################################################
  def getBDIISE(self, site, useVO=voName, host=None):
    """
    Get information about the SE of site from BDII at host.

    :param useVO: VO forwarded to ldapSE (module level voName by default)
    :return: result of the ldapSE call
    """
    seInfo = ldapSE(site, useVO, host=host)
    return seInfo
Example #30
0
class Matcher(object):
    """ Logic for matching
  """
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ Wire up the databases and helpers used for matching.

        Every argument is optional: a falsy value makes the constructor
        create the corresponding default object itself.

        :param pilotAgentsDB: pilot DB object (default: PilotAgentsDB())
        :param jobDB: job DB object (default: JobDB())
        :param tqDB: task queue DB object (default: TaskQueueDB())
        :param jlDB: job logging DB object (default: JobLoggingDB())
        :param opsHelper: operations helper (default: Operations())
        """
        # Injected objects win; defaults are only built when nothing usable
        # was passed in.
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        # The limiter shares the job DB and operations helper chosen above
        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """ Main job selection function to find the highest priority job
        matching the resource capacity.

        :param resourceDescription: description of the resource asking for a
            job; turned into the matching dictionary by _getResourceDict
        :param credDict: credentials of the requester, also handed to
            _getResourceDict
        :return: an empty dict when the task queue reports no match, otherwise
            a dict holding 'JDL', 'JobID', 'DN', 'Group',
            'PilotInfoReportedFlag' and every key/value returned by
            jobDB.getJobOptParameters
        :raise RuntimeError: when a task queue / job DB call fails, when no
            attributes come back for the job, or when the matched job is not
            in 'Waiting' status
        """

        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        # (tags ending in 'GB' or 'Processors' are left out of the printout;
        # MaxRAM / NumberOfProcessors are shown as received instead)
        toPrintDict = dict(resourceDict)
        if "MaxRAM" in resourceDescription:
            toPrintDict['MaxRAM'] = resourceDescription['MaxRAM']
        if "NumberOfProcessors" in resourceDescription:
            toPrintDict['NumberOfProcessors'] = resourceDescription[
                'NumberOfProcessors']
        toPrintDict['Tag'] = []
        if "Tag" in resourceDict:
            for tag in resourceDict['Tag']:
                if not tag.endswith('GB') and not tag.endswith('Processors'):
                    toPrintDict['Tag'].append(tag)
        if not toPrintDict['Tag']:
            toPrintDict.pop('Tag')
        gLogger.info('Resource description for matching',
                     printDict(toPrintDict))

        # The limiter supplies the negative conditions for this site; they are
        # passed to the task queue together with the resource dictionary
        negativeCond = self.limiter.getNegativeCondForSite(
            resourceDict['Site'])
        result = self.tqDB.matchAndGetJob(resourceDict,
                                          negativeCond=negativeCond)

        if not result['OK']:
            raise RuntimeError(result['Message'])
        result = result['Value']
        if not result['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = result['jobId']
        resAtt = self.jobDB.getJobAttributes(
            jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        # A job handed out by the task queue must still be Waiting; otherwise
        # it is removed from the task queue and the match is aborted
        if not resAtt['Value']['Status'] == 'Waiting':
            self.log.error('Job matched by the TQ is not in Waiting state',
                           str(jobID))
            result = self.tqDB.deleteJob(jobID)
            if not result['OK']:
                raise RuntimeError(result['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        result = self.jobDB.getJobJDL(jobID)
        if not result['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {}
        resultDict['JDL'] = result['Value']
        resultDict['JobID'] = jobID

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        # (a failure of getJobOptParameters is silently ignored)
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            for key, value in resOpt['Value'].items():
                resultDict[key] = value
        # NOTE(review): OwnerDN / OwnerGroup were already fetched above
        # together with Status; this second query looks redundant — confirm
        # before removing it.
        resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not resAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not resAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        # Pilot info is only pushed when the requester has not flagged it as
        # already reported; the pilot/job mapping is updated every time
        pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag',
                                                 False)
        if not pilotInfoReportedFlag:
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = resAtt['Value']['OwnerDN']
        resultDict['Group'] = resAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """ Turn a resourceDescription into the resourceDict used for matching.

        The description is filtered, completed with the owner constraints
        derived from the credentials, checked against the pilot version rules
        and finally against the site mask.

        :param resourceDescription: description sent by the resource
        :param credDict: credentials of the requester
        :return: the resulting resourceDict
        """
        resourceDict = self._checkCredentials(
            self._processResourceDescription(resourceDescription), credDict)
        self._checkPilotVersion(resourceDict)

        siteIsUsable = self._checkMask(resourceDict)
        if not siteIsUsable:
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key, value in resourceDict.items():
            self.log.verbose("%s : %s" % (key.rjust(20), value))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in tagMatchFields:
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                maxRAM = int(maxRAM) / 1000
            except ValueError:
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except ValueError:
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                paramList = range(2, param + 1)
                paramTags = ['%d%s' % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        if 'Tag' in resourceDict:
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        if resourceDict['Site'] not in result['Value']:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """ Update pilot information - do not fail if we don't manage to do it
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose(
                'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                % (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """ Update pilot to job mapping information
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """ Restrict the owner DN / group of matchable jobs from the credentials.

        Depending on the properties found in credDict['properties'] the
        'OwnerGroup' and/or 'OwnerDN' entries of resourceDict are set.

        :param resourceDict: matching dictionary, updated in place
        :param credDict: credentials; 'properties', 'group' and 'DN' are read
        :return: the same resourceDict
        :raise RuntimeError: when a Registry lookup fails
        """
        credProperties = credDict['properties']

        if Properties.GENERIC_PILOT in credProperties:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            groupsResult = Registry.getGroupsForVO(vo)
            if not groupsResult['OK']:
                raise RuntimeError(groupsResult['Message'])
            resourceDict['OwnerGroup'] = groupsResult['Value']

        elif Properties.PILOT in credProperties:
            # If it's a private pilot, the DN has to be the same
            self.log.notice("Setting the resource DN to the credentials DN")
            resourceDict['OwnerDN'] = credDict['DN']

        elif Properties.JOB_SHARING in credProperties:
            # Job sharing: the group has to be the same, and a requested DN
            # (if any) must belong to that group
            resourceDict['OwnerGroup'] = credDict['group']
            self.log.notice(
                "Setting the resource group to the credentials group")
            if 'OwnerDN' in resourceDict and resourceDict[
                    'OwnerDN'] != credDict['DN']:
                ownerDN = resourceDict['OwnerDN']
                dnGroups = Registry.getGroupsForDN(resourceDict['OwnerDN'])
                if not dnGroups['OK']:
                    raise RuntimeError(dnGroups['Message'])
                if credDict['group'] not in dnGroups['Value']:
                    # DN is not in the same group: fall back to the
                    # requester's own DN
                    self.log.notice(
                        "You cannot request jobs from DN %s. It does not belong to your group!"
                        % ownerDN)
                    resourceDict['OwnerDN'] = credDict['DN']

        else:
            # Nothing special, group and DN have to be the same
            resourceDict['OwnerDN'] = credDict['DN']
            resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """ Check the pilot DIRAC version
    """
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' not in resourceDict:
                if 'DIRACVersion' not in resourceDict:
                    raise RuntimeError(
                        'Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            validVersions = self.opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                raise RuntimeError(
                    'Pilot version does not match the production version %s not in ( %s )'
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    raise RuntimeError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    raise RuntimeError(
                        "Version check requested \
          but expected project %s != received %s" %
                        (validProject, resourceDict['ReleaseProject']))
Example #31
0
class SiteInspectorAgent( AgentModule ):
  """ SiteInspectorAgent

  The SiteInspectorAgent agent is an agent that is used to get the all the site names
  and trigger PEP to evaluate their status.

  """

  # Max number of worker threads by default
  __maxNumberOfThreads = 15

  # Inspection freqs, defaults, the lower, the higher priority to be checked.
  # Error state usually means there is a glitch somewhere, so it has the highest
  # priority.
  __checkingFreqs = {'Active'   : 20,
                     'Degraded' : 20,
                     'Probing'  : 20,
                     'Banned'   : 15,
                     'Unknown'  : 10,
                     'Error'    : 5}


  def __init__( self, *args, **kwargs ):
    """ Forward all arguments to AgentModule and declare the agent members.

    The members only get their real values in initialize() / execute().
    """

    AgentModule.__init__( self, *args, **kwargs )

    # Clients handed to the policy enforcement, filled in initialize()
    self.clients          = {}
    self.siteClient       = None
    # Worker pool, created in initialize()
    self.threadPool       = None
    # Queue of sites for the current cycle, replaced in every execute()
    self.sitesToBeChecked = None


  def initialize( self ):
    """ Create the thread pool and the clients used by the agent.

    The pool size comes from the 'maxNumberOfThreads' option, defaulting to
    the class-level value; the same number is passed twice to ThreadPool.

    :return: S_OK()
    """

    nThreads = self.am_getOption( 'maxNumberOfThreads', self.__maxNumberOfThreads )
    self.threadPool = ThreadPool( nThreads, nThreads )

    # The very same SiteStatus object is used directly and through the
    # clients dictionary
    siteStatus = SiteStatus()
    self.siteClient = siteStatus
    self.clients['SiteStatus'] = siteStatus
    self.clients['ResourceManagementClient'] = ResourceManagementClient()

    return S_OK()

  def execute( self ):
    """ Main method of the agent, run once per cycle.

    It fetches the queue of sites, works out how many worker jobs are needed,
    queues that many _execute jobs on the thread pool and then joins the
    queue, so the cycle does not end before every site has been marked as
    done.

    :return: S_OK(), or the failed result of getSitesToBeChecked
    """

    # Gets sites to be checked ( returns a Queue )
    queueResult = self.getSitesToBeChecked()
    if not queueResult['OK']:
      self.log.error( queueResult['Message'] )
      return queueResult
    self.sitesToBeChecked = queueResult['Value']

    nElements = self.sitesToBeChecked.qsize()
    pollingTime = self.am_getPollingTime()

    # Number of threads chosen so that the PollingTime is used up without
    # spawning too many of them, assuming 10 seconds per element:
    # numberOfThreads = elements * 10(s/element) / pollingTime
    numberOfThreads = int( math.ceil( nElements * 10. / pollingTime ) )

    self.log.info( 'Needed %d threads to process %d elements' % ( numberOfThreads, nElements ) )

    spawned = 0
    while spawned < numberOfThreads:
      jobUp = self.threadPool.generateJobAndQueueIt( self._execute )
      if not jobUp['OK']:
        self.log.error( jobUp['Message'] )
      spawned += 1

    self.log.info( 'blocking until all sites have been processed' )
    # block until all tasks are done
    self.sitesToBeChecked.join()
    self.log.info( 'done')

    return S_OK()


  def getSitesToBeChecked( self ):
    """ Build the queue of sites to inspect.

    All site names are requested from the site client, then their current
    statuses; one dictionary per site is put on a new queue.

    :return: S_OK( Queue ) or the failed result of the site client
    """

    sitesResult = self.siteClient.getSites('All')
    if not sitesResult['OK']:
      return sitesResult

    # get the current status
    statusResult = self.siteClient.getSiteStatuses( sitesResult['Value'] )
    if not statusResult['OK']:
      return statusResult

    toBeChecked = Queue.Queue()

    # one entry per site, all of element type 'Site' and statusType 'all'
    for site, status in statusResult['Value'].items():
      siteEntry = { 'status': status,
                    'name': site,
                    'site' : site,
                    'element' : 'Site',
                    'statusType': 'all',
                    'elementType': 'Site' }
      toBeChecked.put( siteEntry )

    return S_OK( toBeChecked )


  # Private methods ............................................................

  def _execute( self ):
    """
      Method run by each of the threads in the ThreadPool.
      It loops until there are no sites left on the queue.

      On each iteration it takes one site from the queue and lets the PEP
      enforce the policies for it. Every site taken from the queue is marked
      as done, whatever the outcome: execute() joins the queue, and an item
      never marked as done would block that join forever.

      :return: S_OK() once the queue is empty
    """

    pep = PEP( clients = self.clients )

    while True:

      try:
        site = self.sitesToBeChecked.get_nowait()
      except Queue.Empty:
        return S_OK()

      try:
        resEnforce = pep.enforce( site )
        if not resEnforce['OK']:
          self.log.error( 'Failed policy enforcement', resEnforce['Message'] )
      except Exception as excp:
        # Worker boundary: one failing site must neither kill this worker
        # ( leaving items on the queue ) nor skip task_done below
        self.log.error( 'Exception during policy enforcement', repr( excp ) )
      finally:
        # Used together with join !
        self.sitesToBeChecked.task_done()
Example #32
0
class ResourceStatus( ElementStatus ):
  """
  ResourceStatus helper that connects to CS if RSS flag is not Active. It keeps
  the connection to the db / server as an object member, to avoid creating a new
  one massively.
  """

  __metaclass__ = DIRACSingleton
  
  def __init__( self ):
    """
    Constructor: runs the parent constructor, creates the SiteStatus helper
    and the two caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super( ResourceStatus, self ).__init__()

    self.siteStatus = SiteStatus()

    # The cache lifetime is read from RssConfiguration so that it can be tuned
    lifeTime = int( RssConfiguration().getConfigCache() )

    # One RSSCache per element type, each with its own update method.
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache( 'Storage', lifeTime, self._updateSECache )
    self.ceCache = RSSCache( 'Computing', lifeTime, self._updateCECache )

  #.............................................................................
  # ComputingElement methods

  def getComputingStatuses( self, ceNames, statusTypes = None ):
    """
    Query the computing cache for the statuses of one or more computing
    elements. The arguments are handed to the cache match as they are; the
    original notes say that None is interpreted as * ( all ).

    On a successful match the value looks like:
      {
        computingElementA : { statusType1 : status1, statusType2 : status2 },
        computingElementB : { statusType1 : status1, statusType2 : status2 },
      }
    Every computing element for which the site access check does not return
    OK gets all of its status types reported as 'Banned'.

    examples:
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', None )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getComputingStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' } } )

    :Parameters:
      **ceNames** - [ None, `string`, `list` ]
        name(s) of the computing elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK( dict ) || the failed result of the cache match
    """

    matchResult = self.ceCache.match( ceNames, statusTypes )
    if not matchResult[ 'OK' ]:
      return matchResult

    statuses = matchResult[ 'Value' ]

    for ceName in list( statuses ):

      siteAccess = self.__getSiteAccess( ceName, 'ComputingAccess' )
      if siteAccess[ 'OK' ]:
        continue

      # Same status types as before, all of them forced to Banned
      statuses[ ceName ] = dict.fromkeys( statuses[ ceName ], 'Banned' )

    return S_OK( statuses )

  def getComputingStatus( self, ceName, statusType ):
    """
    Return the status of one computing element for one statusType, by
    delegating to getElementStatus with element type 'Computing'.

    examples:
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', 'all' )
          S_OK( 'Active' )
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', None )
          S_ERROR( ... )

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    elementType = 'Computing'
    return self.getElementStatus( elementType, ceName, statusType )
  
  def isUsableComputing( self, ceName, statusType ):
    """
    Tell whether a computing element is usable for a statusType, by
    delegating to isUsableElement with element type 'Computing'.

    According to the original examples the answer is a boolean: True for a
    status of Active or Degraded, False for anything else ( banned element,
    unknown element, unknown or missing statusType ).
    NOTE(review): the original docstring also announced S_OK() || S_ERROR()
    as return value - confirm against isUsableElement.

    examples:
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          True
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', None )
          False
      >>> resourceStatus.isUsableComputing( 'RubbishCE', 'all' )
          False
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'RubbishAccess' )
          False

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: whatever isUsableElement returns
    """

    elementType = 'Computing'
    return self.isUsableElement( elementType, ceName, statusType )

  def getUsableComputings( self, statusType ):
    """
    List the computing elements that are usable for a statusType ( status
    Active or Degraded, per the original description ), by delegating to
    getUsableElements with element type 'Computing'.

    examples:
      >>> resourceStatus.getUsableComputings( 'all' )
          S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
      >>> resourceStatus.getUsableComputings( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableComputings( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    elementType = 'Computing'
    return self.getUsableElements( elementType, statusType )

  #.............................................................................
  # StorageElement methods

  def getStorageStatuses( self, seNames, statusTypes = None ):
    """
    Query the storage cache for the statuses of one or more storage
    elements. The arguments are handed to the cache match as they are; the
    original notes say that None is interpreted as * ( all ).

    On a successful match the value looks like:
    {
      storageElementA : { statusType1 : status1, statusType2 : status2 },
      storageElementB : { statusType1 : status1, statusType2 : status2 },
    }
    Every storage element for which the site access check does not return
    OK gets all of its status types reported as 'Banned'.

    examples:
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', None )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } } )
      >>> resourceStatus.getStorageStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getStorageStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER' : { 'ReadAccess' : 'Active' } } )

    :Parameters:
      **seNames** - [ None, `string`, `list` ]
        name(s) of the storage elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK( dict ) || the failed result of the cache match
    """

    matchResult = self.seCache.match( seNames, statusTypes )
    if not matchResult[ 'OK' ]:
      return matchResult

    statuses = matchResult[ 'Value' ]

    for seName in list( statuses ):

      siteAccess = self.__getSiteAccess( seName, 'StorageAccess' )
      if siteAccess[ 'OK' ]:
        continue

      # Same status types as before, all of them forced to Banned
      statuses[ seName ] = dict.fromkeys( statuses[ seName ], 'Banned' )

    return S_OK( statuses )


  def getStorageStatus( self, seName, statusType ):
    """
    Return the status of one storage element for one statusType, by
    delegating to getElementStatus with element type 'Storage'.

    examples:
      >>> resourceStatus.getStorageStatus( 'CERN-USER', 'ReadAccess' )
          S_OK( 'Active' )
      >>> resourceStatus.getStorageStatus( 'CERN-USER', None )
          S_ERROR( ... )

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    elementType = 'Storage'
    return self.getElementStatus( elementType, seName, statusType )
  
  def isUsableStorage( self, seName, statusType ):
    """
    Tell whether a storage element is usable for a statusType, by delegating
    to isUsableElement with element type 'Storage'.

    According to the original examples the answer is a boolean: True for a
    status of Active or Degraded, False for anything else ( banned element,
    unknown element, unknown or missing statusType ).
    NOTE(review): the original docstring also announced S_OK() || S_ERROR()
    as return value - confirm against isUsableElement.

    examples:
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'ReadAccess' )
          True
      >>> resourceStatus.isUsableStorage( 'CERN-ARCHIVE', 'ReadAccess' )
          False # May be banned
      >>> resourceStatus.isUsableStorage( 'CERN-USER', None )
          False
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'RubbishAccess' )
          False

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: whatever isUsableElement returns
    """

    elementType = 'Storage'
    return self.isUsableElement( elementType, seName, statusType )

  def getUsableStorages( self, statusType ):
    """
    List the storage elements that are usable for a statusType ( status
    Active or Degraded, per the original description ), by delegating to
    getUsableElements with element type 'Storage'.

    examples:
      >>> resourceStatus.getUsableStorages( 'ReadAccess' )
          S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
      >>> resourceStatus.getUsableStorages( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableStorages( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    elementType = 'Storage'
    return self.getUsableElements( elementType, statusType )
  
  
  #.............................................................................
  # update Cache methods

  def _updateCECache( self ):
    """
    Update method handed to the computing cache in the constructor: fetches
    the cache content for element type 'Computing'.
    """
    elementType = 'Computing'
    return self.__updateCache( elementType )

  
  def _updateSECache( self ):
    """
    Update method handed to the storage cache in the constructor: fetches
    the cache content for element type 'Storage'.
    """
    elementType = 'Storage'
    return self.__updateCache( elementType )
    
          
  #.............................................................................
  # Private methods
  

  def __updateCache( self, elementType ):
    """
    Fetch name, status type and status of all 'Resource' status elements of
    the given element type and convert them into the cache dictionary.

    :Parameters:
      **elementType** - `string`
        element type to select ( 'Computing' or 'Storage' in this class )

    :return: S_OK( cache dictionary ) || the failed result of the selection
    """

    columns = [ 'Name', 'StatusType', 'Status' ]
    selection = self.rssClient.selectStatusElement( 'Resource', 'Status',
                                                    elementType = elementType,
                                                    meta = { 'columns' : columns } )

    if selection[ 'OK' ]:
      return S_OK( self.getCacheDictFromRawData( selection[ 'Value' ] ) )
    return selection
  
  
  def __getSiteAccess( self, elementName, siteAccess ):
    """
    Find the site owning the resource <elementName> and check whether that
    site is usable for the access type <siteAccess>.

    :Parameters:
      **elementName** - `string`
        name of the resource
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: S_OK() when the site is usable; the failed result of the site
             lookup, or S_ERROR naming the site and the access type, otherwise
    """

    siteResult = Resources.getSiteForResource( elementName )
    if not siteResult[ 'OK' ]:
      return siteResult
    siteName = siteResult[ 'Value' ]

    if not self.siteStatus.isUsableSite( siteName, siteAccess ):
      # Report the access type actually checked: this method is called for
      # storage as well as for computing access
      return S_ERROR( 'Site %s is not usable for %s' % ( siteName, siteAccess ) )

    return S_OK()
  

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
Example #33
0
class SiteDirector( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def initialize( self ):
    """ One-off agent setup: register the default option values, reset the
        queue description, load the MAX_JOBS_IN_FILLMODE / MAX_PILOTS_TO_SUBMIT
        limits and create the site status helper.
    """
    defaultOptions = ( ( "PollingTime", 60.0 ),
                       ( "maxPilotWaitingHours", 6 ) )
    for optionName, optionValue in defaultOptions:
      self.am_setOption( optionName, optionValue )

    self.queueDict = {}
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
    self.siteStatus = SiteStatus()
    return S_OK()

  def beginExecution( self ):
    """ Read the agent options, resolve the pilot credentials and rebuild the
        description of the queues served by this director.

        Returns S_OK() on success, or the failed result structure of the first
        helper call that reports an error.
    """

    self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() )
    # Community ( VO ) served by this SiteDirector, the global one by default
    self.vo = self.am_getOption( "Community", '' )
    if not self.vo:
      self.vo = CSGlobals.getVO()
    # User group served by this SiteDirector
    self.group = self.am_getOption( "Group", '' )
    # All the user groups eligible for the pilots submitted by this SiteDirector
    self.voGroups = []

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if self.group:
      self.voGroups = [ self.group ]
    elif self.vo:
      groupsResult = Registry.getGroupsForVO( self.vo )
      if not groupsResult['OK']:
        return groupsResult
      for voGroup in groupsResult['Value']:
        if 'NormalUser' in Registry.getPropertiesForGroup( voGroup ):
          self.voGroups.append( voGroup )

    # Generic pilot credentials, each of them overridable by an agent option
    credentials = findGenericPilotCredentials( vo = self.vo )
    if not credentials[ 'OK' ]:
      return credentials
    defaultDN, defaultGroup = credentials[ 'Value' ]
    self.pilotDN = self.am_getOption( "PilotDN", defaultDN )
    self.pilotGroup = self.am_getOption( "PilotGroup", defaultGroup )

    self.platforms = []
    self.sites = []
    self.defaultSubmitPools = ''
    if self.group:
      self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' )
    elif self.vo:
      self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' )

    self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT )
    self.install = DIRAC_INSTALL
    self.workingDirectory = self.am_getOption( 'WorkDirectory' )
    self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 )
    self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' )
    self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode )
    self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit )
    self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True )
    self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 7200 )

    # Flags
    self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True )
    self.getOutput = self.am_getOption( 'GetPilotOutput', True )
    self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True )

    def restrictionFor( optionName ):
      """ None when the option is 'Any' ( in any letter case ), else the
          option read again with a list default
      """
      if self.am_getOption( optionName, 'Any' ).lower() == "any":
        return None
      return self.am_getOption( optionName, [] )

    # Get the site description dictionary
    siteNames = restrictionFor( 'Site' )
    ceTypes = restrictionFor( 'CETypes' )
    ces = restrictionFor( 'CEs' )

    self._resources = Resources.Resources( vo = self.vo )
    queuesResult = self._resources.getEligibleQueuesInfo( siteList = siteNames,
                                                          ceList = ces,
                                                          ceTypeList = ceTypes,
                                                          mode = 'Direct' )
    if not queuesResult['OK']:
      return queuesResult
    queuesResult = self.getQueues( queuesResult['Value'] )
    if not queuesResult['OK']:
      return queuesResult

    # Announce the optional activities which are switched on
    for enabled, message in ( ( self.updateStatus, 'Pilot status update requested' ),
                              ( self.getOutput, 'Pilot output retrieval requested' ),
                              ( self.sendAccounting, 'Pilot accounting sending requested' ) ):
      if enabled:
        self.log.always( message )

    # Summary of the effective configuration
    for label, value in ( ( 'Sites:', siteNames ),
                          ( 'CETypes:', ceTypes ),
                          ( 'CEs:', ces ),
                          ( 'PilotDN:', self.pilotDN ),
                          ( 'PilotGroup:', self.pilotGroup ),
                          ( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit ),
                          ( 'MaxJobsInFillMode:', self.maxJobsInFillMode ) ):
      self.log.always( label, value )

    self.localhost = socket.getfqdn()
    self.proxy = ''

    if self.queueDict:
      self.log.always( "Agent will serve queues:" )
      for queue in self.queueDict:
        queueInfo = self.queueDict[queue]
        self.log.always( "Site: %s, CE: %s, Queue: %s" % ( queueInfo['Site'],
                                                         queueInfo['CEName'],
                                                         queue ) )

    return S_OK()

  def getQueues( self, resourceDict ):
    """ Rebuild self.queueDict from the site -> CE -> description mapping in
        resourceDict.

        One entry keyed "<CE>_<queue>" is created per queue. It carries the
        queue parameters ( completed with site, setup, working directory and,
        when computable, CPUTime and Platform ) and the CE object built for it.
        Sites whose full name cannot be resolved are skipped. The 'Queues' key
        is popped from every CE description of resourceDict.

        Returns S_OK(), or the failed result of the CE creation / validation.
    """

    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
      nameResult = self._resources.getSiteFullName( site )
      if not nameResult['OK']:
        continue
      siteFullName = nameResult['Value']
      ceDescriptions = resourceDict[site]
      for ce in ceDescriptions:
        ceDict = ceDescriptions[ce]
        qDict = ceDict.pop( 'Queues' )
        for queue in qDict:
          queueName = '%s_%s' % ( ce, queue )
          # Aliases on the entry being filled and on its parameters dictionary
          queueParams = qDict[queue]
          queueEntry = { 'ParametersDict': queueParams }
          self.queueDict[queueName] = queueEntry
          queueParams['Queue'] = queue
          queueParams['Site'] = siteFullName
          queueParams['GridEnv'] = self.gridEnv
          queueParams['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' )
          # Evaluate the CPU limit of the queue according to the Glue convention
          # To Do: should be a utility
          if "maxCPUTime" in queueParams and "SI00" in queueParams:
            maxCPUTime = float( queueParams['maxCPUTime'] )
            # For some sites there are crazy values in the CS: clamp them
            maxCPUTime = min( max( maxCPUTime, 0 ), 86400 * 12.5 )
            si00 = float( queueParams['SI00'] )
            queueParams['CPUTime'] = int( 60. / 250. * maxCPUTime * si00 )
          qwDir = os.path.join( self.workingDirectory, queue )
          if not os.path.exists( qwDir ):
            os.makedirs( qwDir )
          queueParams['WorkingDirectory'] = qwDir

          # Platform: queue setting first, then CE setting, then built
          # from the CE architecture and OS
          platform = ''
          if "Platform" in queueParams:
            platform = queueParams['Platform']
          elif "Platform" in ceDict:
            platform = ceDict['Platform']
          elif "OS" in ceDict:
            platform = '_'.join( [ ceDict.get( 'architecture', 'x86_64' ), ceDict['OS'] ] )
          if platform and platform not in self.platforms:
            self.platforms.append( platform )

          if platform and "Platform" not in queueParams:
            platformResult = Resources.getDIRACPlatform( platform )
            if platformResult['OK']:
              queueParams['Platform'] = platformResult['Value']

          # The CE is configured with its own description overridden by
          # the queue parameters
          ceQueueDict = dict( ceDict )
          ceQueueDict.update( queueParams )
          ceType = ceDict['CEType']
          ceResult = ceFactory.getCE( ceName = ce,
                                      ceType = ceType,
                                      ceParametersDict = ceQueueDict )
          if not ceResult['OK']:
            return ceResult
          ceObject = ceResult['Value']
          queueEntry['CE'] = ceObject
          queueEntry['CEName'] = ce
          queueEntry['CEType'] = ceType
          queueEntry['Site'] = siteFullName
          queueEntry['QueueName'] = queue
          queueEntry['Platform'] = platform
          validity = ceObject.isValid()
          if not validity['OK']:
            self.log.fatal( validity['Message'] )
            return validity
          # The mere presence of the option, at queue or CE level, enables it
          if 'BundleProxy' in queueParams or 'BundleProxy' in ceDict:
            queueEntry['BundleProxy'] = True

          if siteFullName not in self.sites:
            self.sites.append( siteFullName )

    return S_OK()

  def execute( self ):
    """ One agent cycle: submit pilots where needed and, when the status
        update is enabled, refresh the status of the known pilots. Failures of
        either step are only logged; the cycle always returns S_OK().
    """

    if not self.queueDict:
      self.log.warn( 'No site defined, exiting the cycle' )
      return S_OK()

    submission = self.submitJobs()
    if not submission['OK']:
      self.log.error( 'Errors in the job submission: ', submission['Message'] )

    if not self.updateStatus:
      return S_OK()

    statusUpdate = self.updatePilotStatus()
    if not statusUpdate['OK']:
      self.log.error( 'Errors in updating pilot status: ', statusUpdate['Message'] )

    return S_OK()

  def submitJobs( self ):
    """ Go through defined computing elements and submit jobs if necessary

        A first query to the Matcher with the overall requirements of this
        director tells whether there is any waiting work at all. Then every
        queue of self.queueDict is visited in random order: a pilot proxy is
        set on the CE, the available slots are compared with the jobs of the
        matching task queues ( minus the pilots already waiting ) and the
        resulting number of pilots, capped by self.maxPilotsToSubmit, is
        submitted and registered in the PilotAgentsDB.

        :return: S_OK() || the failed result structure of the step that
                 aborted the submission
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup':setup,
               'CPUTime': 9999999,
               'SubmitPool' : self.defaultSubmitPools }
    if self.vo:
      tqDict['Community'] = self.vo
    if self.voGroups:
      tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
      return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result[ 'OK' ]:
      return result
    if not result['Value']:
      self.log.verbose( 'No Waiting jobs suitable for the director' )
      return S_OK()

    # Visit the queues in random order; shuffle needs a real list
    queues = list( self.queueDict )
    random.shuffle( queues )
    for queue in queues:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
      platform = self.queueDict[queue]['Platform']

      if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
        continue
      if queueCPUTime > self.maxQueueLength:
        queueCPUTime = self.maxQueueLength

      # Get the working proxy, valid one day longer than the queue CPU time
      cpuTime = queueCPUTime + 86400

      self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, cpuTime - 60 )

      # Get the number of available slots on the target site/queue
      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
        continue
      ceInfoDict = result['CEInfoDict']
      self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                     ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )

      totalSlots = result['Value']

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]
      if self.vo:
        ceDict['Community'] = self.vo
      if self.voGroups:
        ceDict['OwnerGroup'] = self.voGroups

      # This is a hack to get rid of !
      ceDict['SubmitPool'] = self.defaultSubmitPools

      result = Resources.getCompatiblePlatforms( platform )
      if not result['OK']:
        continue
      ceDict['Platform'] = result['Value']

      # Get the number of eligible jobs for the target site/queue
      result = rpcMatcher.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.info( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      tqIDList = list( taskQueueDict )
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      # Get the number of already waiting pilots for this queue
      totalWaitingPilots = 0
      if self.pilotWaitingFlag:
        lastUpdateTime = dateTime() - self.pilotWaitingTime * second
        result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                              'Status': WAITING_PILOT_STATUS },
                                            None, lastUpdateTime )
        if not result['OK']:
          self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
          totalWaitingPilots = 0
        else:
          totalWaitingPilots = result['Value']
          self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

      # Never more pilots than free slots, nor than jobs not yet covered by a
      # waiting pilot
      pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
      self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                              ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

      # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
      pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

      while pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        jobExecDir = ''
        if ceType == 'CREAM':
          jobExecDir = '.'
        jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
        httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
        if not result['OK']:
          return result

        executable, pilotSubmissionChunk = result['Value']
        try:
          result = ce.submitJob( executable, '', pilotSubmissionChunk )
        finally:
          # The wrapper script is a temporary file: remove it even if the
          # submission raises
          os.unlink( executable )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
          pilotsToSubmit = 0
          continue

        pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
        stampDict = {}
        if 'PilotStampDict' in result:
          stampDict = result['PilotStampDict']
        # Cumulative priorities: a uniform draw in [0, sumPriority) falls into
        # the interval of a task queue with a probability proportional to its
        # priority
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )
        pilotsPerTQ = {}
        for pilotID in pilotList:
          rndm = random.random() * sumPriority
          # Fall back to the last task queue when no cumulative priority
          # exceeds the draw ( e.g. all priorities are zero ) instead of
          # using an unbound or stale task queue ID
          tqID = tqPriorityList[-1][0]
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          pilotsPerTQ.setdefault( tqID, [] ).append( pilotID )

        for tqID, tqPilots in pilotsPerTQ.items():
          result = pilotAgentsDB.addPilotTQReference( tqPilots,
                                                     tqID,
                                                     self.pilotDN,
                                                     self.pilotGroup,
                                                     self.localhost,
                                                     ceType,
                                                     '',
                                                     stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
            continue
          for pilot in tqPilots:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                  'Successfully submitted by the SiteDirector',
                                                  siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: ', result['Message'] )
              continue

    return S_OK()

#####################################################################################
  def __getExecutable( self, queue, pilotsToSubmit, bundleProxy = True, httpProxy = '', jobExecDir = '' ):
    """ Prepare the full executable for queue

        The pilot options are compiled first ( this may change the number of
        pilots to submit ), then the wrapper script is written to the working
        directory, bundling self.proxy when bundleProxy is set.

        :return: S_OK( [ executable file name, pilots to submit ] ) || S_ERROR()
    """

    proxy = None
    if bundleProxy:
      proxy = self.proxy
    pilotOptions, pilotsToSubmit = self.__getPilotOptions( queue, pilotsToSubmit )
    if pilotOptions is None:
      return S_ERROR( 'Errors in compiling pilot options' )
    executable = self.__writePilotScript( self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir )
    # The script writer returns a file name on success but an error structure
    # on failure: propagate the error instead of passing it on as a file name
    if isinstance( executable, dict ) and not executable.get( 'OK', False ):
      return executable
    return S_OK( [ executable, pilotsToSubmit ] )

#####################################################################################
  def __getPilotOptions( self, queue, pilotsToSubmit ):
    """ Prepare pilot options

        Builds the list of command line options of the pilot script for the
        given queue and requests a proxy token sized for pilotsToSubmit pilots
        in filling mode. The number of pilots is recomputed from the number of
        token uses actually granted.

        :return: [ pilotOptions, pilotsToSubmit ], or [ None, None ] when the
                 options cannot be compiled ( the reason is logged )
    """

    queueDict = self.queueDict[queue]['ParametersDict']
    pilotOptions = []

    setup = gConfig.getValue( "/DIRAC/Setup", "unknown" )
    if setup == 'unknown':
      self.log.error( 'Setup is not defined in the configuration' )
      return [ None, None ]
    pilotOptions.append( '-S %s' % setup )
    opsHelper = Operations.Operations( group = self.pilotGroup, setup = setup )

    #Installation defined?
    installationName = opsHelper.getValue( "Pilot/Installation", "" )
    if installationName:
      pilotOptions.append( '-V %s' % installationName )

    #Project defined?
    projectName = opsHelper.getValue( "Pilot/Project", "" )
    if projectName:
      pilotOptions.append( '-l %s' % projectName )
    else:
      self.log.info( 'DIRAC project will be installed by pilots' )

    #Request a release
    diracVersion = opsHelper.getValue( "Pilot/Version", [] )
    if not diracVersion:
      self.log.error( 'Pilot/Version is not defined in the configuration' )
      return [ None, None ]
    #diracVersion is a list of accepted releases. Just take the first one
    pilotOptions.append( '-r %s' % diracVersion[0] )

    ownerDN = self.pilotDN
    ownerGroup = self.pilotGroup
    # Request token for maximum pilot efficiency
    result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode )
    if not result[ 'OK' ]:
      self.log.error( 'Invalid proxy token request', result['Message'] )
      return [ None, None ]
    ( token, numberOfUses ) = result[ 'Value' ]

    # Number of jobs each pilot is allowed to run. It is also the divisor used
    # below, so a non-positive value can not be turned into a pilot count
    jobsPerPilot = min( numberOfUses, self.maxJobsInFillMode )
    if jobsPerPilot < 1:
      self.log.error( 'No usable proxy token uses for the pilots',
                      'token uses: %s, MaxJobsInFillMode: %s' % ( numberOfUses, self.maxJobsInFillMode ) )
      return [ None, None ]

    pilotOptions.append( '-o /Security/ProxyToken=%s' % token )
    # Use Filling mode
    pilotOptions.append( '-M %s' % jobsPerPilot )

    # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
    # with numberOfUses tokens we can submit at most:
    #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
    # pilots ( integer division )
    newPilotsToSubmit = numberOfUses // jobsPerPilot
    if newPilotsToSubmit != pilotsToSubmit:
      self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit )
      pilotsToSubmit = newPilotsToSubmit
    # Debug
    if self.pilotLogLevel.lower() == 'debug':
      pilotOptions.append( '-d' )
    # CS Servers
    csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )

    # DIRAC Extensions to be used in pilots: the operations setting wins, with
    # a first item 'None' meaning no extension at all; the CS extensions are
    # used when nothing is set
    pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = CSGlobals.getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    # Requested CPU time
    pilotOptions.append( '-T %s' % queueDict['CPUTime'] )
    # CEName
    pilotOptions.append( '-N %s' % self.queueDict[queue]['CEName'] )
    # SiteName
    pilotOptions.append( '-n %s' % queueDict['Site'] )
    if 'ClientPlatform' in queueDict:
      pilotOptions.append( "-p '%s'" % queueDict['ClientPlatform'] )

    if 'SharedArea' in queueDict:
      pilotOptions.append( "-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea'] )

    # The SI00 rating, when present, defines both CPU factors; otherwise the
    # explicitly configured factors are passed through
    if 'SI00' in queueDict:
      factor = float( queueDict['SI00'] ) / 250.
      pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor )
      pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor )
    else:
      if 'CPUScalingFactor' in queueDict:
        pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] )
      if 'CPUNormalizationFactor' in queueDict:
        pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] )

    # Hack
    if self.defaultSubmitPools:
      pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools )

    if self.group:
      pilotOptions.append( '-G %s' % self.group )

    self.log.verbose( "pilotOptions: ", ' '.join( pilotOptions ) )

    return [ pilotOptions, pilotsToSubmit ]

#####################################################################################
  def __writePilotScript( self, workingDirectory, pilotOptions, proxy = None, httpProxy = '', pilotExecDir = '' ):
    """ Bundle together and write out the pilot executable script, admixt the proxy if given
    """

    try:
      compressedAndEncodedProxy = ''
      proxyFlag = 'False'
      if proxy is not None:
        compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) )
        proxyFlag = 'True'
      compressedAndEncodedPilot = base64.encodestring( bz2.compress( open( self.pilot, "rb" ).read(), 9 ) )
      compressedAndEncodedInstall = base64.encodestring( bz2.compress( open( self.install, "rb" ).read(), 9 ) )
    except:
      self.log.exception( 'Exception during file compression of proxy, dirac-pilot or dirac-install' )
      return S_ERROR( 'Exception during file compression of proxy, dirac-pilot or dirac-install' )

    localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory )
  os.chdir( pilotWorkingDirectory )
  if %(proxyFlag)s:
    open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) )
    os.chmod("proxy",0600)
    os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) )
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )

shutil.rmtree( pilotWorkingDirectory )

EOF
""" % { 'compressedAndEncodedProxy': compressedAndEncodedProxy,
        'compressedAndEncodedPilot': compressedAndEncodedPilot,
        'compressedAndEncodedInstall': compressedAndEncodedInstall,
        'httpProxy': httpProxy,
        'pilotExecDir': pilotExecDir,
        'pilotScript': os.path.basename( self.pilot ),
        'installScript': os.path.basename( self.install ),
        'pilotOptions': ' '.join( pilotOptions ),
        'proxyFlag': proxyFlag }

    fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir = workingDirectory )
    pilotWrapper = os.fdopen( fd, 'w' )
    pilotWrapper.write( localPilot )
    pilotWrapper.close()
    return name

  def updatePilotStatus( self ):
    """ Update status of pilots in transient states

        First pass, per queue: the pilots of this director in a transient
        status are looked up in the PilotAgentsDB, their status is asked to
        the CE and the DB is updated when it changed; the output of the pilots
        reaching a final status is retrieved if self.getOutput is set.

        Second pass, per queue: the output of the pilots already in a final
        status whose output is not marked as ready is retrieved ( if
        self.getOutput is set ) and, if self.sendAccounting is set, the
        accounting is sent for the final pilots not yet accounted.

        :return: S_OK() || the failed result of a pilot proxy retrieval
    """
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']

      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'Status':TRANSIENT_PILOT_STATUS,
                                           'OwnerDN': self.pilotDN,
                                           'OwnerGroup': self.pilotGroup } )
      if not result['OK']:
        self.log.error( 'Failed to select pilots: %s' % result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue

      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      # References are sent to the CE as "<reference>:::<stamp>"; as soon as
      # one pilot has no stamp the plain references are used for all of them
      stampedPilotRefs = []
      for pRef in pilotDict:
        if pilotDict[pRef]['PilotStamp']:
          stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
        else:
          stampedPilotRefs = list( pilotRefs )
          break

      result = ce.isProxyValid()
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 500 )

      result = ce.getJobStatus( stampedPilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
        continue
      pilotCEDict = result['Value']

      for pRef in pilotRefs:
        newStatus = ''
        oldStatus = pilotDict[pRef]['Status']
        # NOTE(review): assumes the CE reports a status for every plain
        # reference - confirm against the CE implementations
        ceStatus = pilotCEDict[pRef]
        if oldStatus == ceStatus:
          # Status did not change, continue
          continue
        elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
          # Pilot finished without reporting, consider it Aborted
          newStatus = 'Aborted'
        elif ceStatus != 'Unknown' :
          # Update the pilot status to the new value
          newStatus = ceStatus

        if newStatus:
          self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
          result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
          if not result['OK']:
            self.log.error( 'Failed to set pilot status: ', result['Message'] )
        # Retrieve the pilot output now
        if newStatus in FINAL_PILOT_STATUS:
          if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
            self.log.info( 'Retrieving output for pilot %s' % pRef )
            pilotStamp = pilotDict[pRef]['PilotStamp']
            pRefStamp = pRef
            if pilotStamp:
              pRefStamp = pRef + ':::' + pilotStamp
            result = ce.getJobOutput( pRefStamp )
            if not result['OK']:
              self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
            else:
              output, error = result['Value']
              if output:
                result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                if not result['OK']:
                  self.log.error( 'Failed to store pilot output', result['Message'] )
              else:
                self.log.warn( 'Empty pilot output not stored to PilotDB' )

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']

      # Same convention as in the first pass: the validity check returns a
      # result structure, and a renewed proxy must be stored before it is set
      # on the CE
      result = ce.isProxyValid( 120 )
      if not result['OK']:
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
        if not result['OK']:
          return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, 940 )

      ceName = self.queueDict[queue]['CEName']
      queueName = self.queueDict[queue]['QueueName']
      ceType = self.queueDict[queue]['CEType']
      siteName = self.queueDict[queue]['Site']
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                           'Queue':queueName,
                                           'GridType':ceType,
                                           'GridSite':siteName,
                                           'OutputReady':'False',
                                           'Status':FINAL_PILOT_STATUS} )

      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      if self.getOutput:
        for pRef in pilotRefs:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
            if not result['OK']:
              self.log.error( 'Failed to store pilot output', result['Message'] )

      # Check if the accounting is to be sent
      if self.sendAccounting:
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                             'Queue':queueName,
                                             'GridType':ceType,
                                             'GridSite':siteName,
                                             'AccountingSent':'False',
                                             'Status':FINAL_PILOT_STATUS} )

        if not result['OK']:
          self.log.error( 'Failed to select pilots', result['Message'] )
          continue
        pilotRefs = result['Value']
        if not pilotRefs:
          continue
        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
          self.log.error( 'Failed to get pilots info from DB', result['Message'] )
          continue
        pilotDict = result['Value']
        result = self.sendPilotAccounting( pilotDict )
        if not result['OK']:
          self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()

  def sendPilotAccounting( self, pilotDict ):
    """ Create, register and commit one PilotAccounting record per pilot

        :param dict pilotDict: pilot reference -> dictionary of pilot attributes. The keys read
                               here are LastUpdateTime, SubmissionTime, OwnerDN, OwnerGroup,
                               DestinationSite, GridType, Broker, Status, PilotID and,
                               when present, Jobs
        :return: S_OK() when the commit succeeds, otherwise the failed result of the commit
    """
    for pilotRef in pilotDict:
      self.log.verbose( 'Preparing accounting record for pilot %s' % pilotRef )
      pilotInfo = pilotDict[pilotRef]

      record = PilotAccounting()
      record.setEndTime( pilotInfo[ 'LastUpdateTime' ] )
      record.setStartTime( pilotInfo[ 'SubmissionTime' ] )

      # Owner of the pilot: a placeholder name is used when the DN can not be resolved
      userResult = CS.getUsernameForDN( pilotInfo[ 'OwnerDN' ] )
      if userResult[ 'OK' ]:
        userName = userResult[ 'Value' ]
      else:
        userName = '******'
        self.log.error( "Can't determine username for dn:", pilotInfo[ 'OwnerDN' ] )
      record.setValueByKey( 'User', userName )
      record.setValueByKey( 'UserGroup', pilotInfo[ 'OwnerGroup' ] )

      # Site of the CE: 'Unknown' when the lookup fails or gives a blank name
      siteResult = getSiteForCE( pilotInfo[ 'DestinationSite' ] )
      siteName = siteResult[ 'Value' ].strip() if siteResult[ 'OK' ] else ''
      record.setValueByKey( 'Site', siteName or 'Unknown' )

      record.setValueByKey( 'GridCE', pilotInfo[ 'DestinationSite' ] )
      record.setValueByKey( 'GridMiddleware', pilotInfo[ 'GridType' ] )
      record.setValueByKey( 'GridResourceBroker', pilotInfo[ 'Broker' ] )
      record.setValueByKey( 'GridStatus', pilotInfo[ 'Status' ] )

      # Number of jobs of the pilot, 0 when no job list is given
      jobCount = len( pilotInfo['Jobs'] ) if 'Jobs' in pilotInfo else 0
      record.setValueByKey( 'Jobs', jobCount )

      self.log.info( "Adding accounting record for pilot %s" % pilotInfo[ 'PilotID' ] )
      if gDataStoreClient.addRegister( record )[ 'OK' ]:
        # Set up AccountingSent flag
        # NOTE(review): the flag is set here, before the commit, and once more for every
        # pilot after a successful commit below - confirm that both are intended
        if not pilotAgentsDB.setAccountingFlag( pilotRef )[ 'OK' ]:
          self.log.error( 'Failed to set accounting flag for pilot ', pilotRef )
      else:
        self.log.error( 'Failed to send accounting info for pilot ', pilotRef )

    self.log.info( 'Committing accounting records for %d pilots' % len( pilotDict ) )
    commitResult = gDataStoreClient.commit()
    if not commitResult[ 'OK' ]:
      return commitResult

    for pilotRef in pilotDict:
      self.log.verbose( 'Setting AccountingSent flag for pilot %s' % pilotRef )
      if not pilotAgentsDB.setAccountingFlag( pilotRef )[ 'OK' ]:
        self.log.error( 'Failed to set accounting flag for pilot ', pilotRef )

    return S_OK()
Example #34
0
 def setUp(self):
   """ Create the ResourceStatusClient and the SiteStatus client, the latter with rssFlag set to True
   """
   self.rsClient = ResourceStatusClient()
   siteStatusClient = SiteStatus()
   siteStatusClient.rssFlag = True
   self.stClient = siteStatusClient
Example #35
0
    def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
        """Get summary of the pilot jobs status by CE/site in a standard structure

        One record is produced per (site, CE) pair. A site with several CEs is collapsed
        into a single "Multiple" record unless ``ExpandSite`` is given.

        :param dict selectDict: selection conditions. The keys ``LastUpdateTime``, ``GridSite``,
            ``Status`` (quality status of the record, e.g. "Good") and ``ExpandSite`` are
            interpreted here, all the other items are passed to ``getCounters`` as conditions.
            The dictionary given by the caller is not modified.
        :param sortList: not used by this method
        :param int startItem: index of the first record to return, only used with maxItems
        :param int maxItems: maximum number of records to return, a false value returns them all
        :return: S_OK(dict) with the keys TotalRecords, ParameterNames, Records and Extras,
            or the failed result of one of the ``getCounters`` calls
        """
        allStateNames = PilotStatus.PILOT_STATES + ["Done_Empty", "Aborted_Hour"]
        paramNames = ["Site", "CE"] + allStateNames
        # A record is laid out as paramNames + [Total, PilotsPerJob, PilotJobEff, Status],
        # the position of the Status column therefore depends on the number of pilot states
        statusIndex = len(paramNames) + 3

        def asList(value):
            """Wrap a single value into a list, lists are returned as they are"""
            return value if isinstance(value, list) else [value]

        def evaluate(done, empty, aborted, total, minPilots):
            """Return [PilotsPerJob, PilotJobEff, Status] for one set of counters

            The status is "Idle" unless more than minPilots pilots were counted.
            """
            # Pilot submission efficiency evaluation
            if done - empty > 0:
                pilotsPerJob = float(done) / (done - empty)
            elif done != 0 and empty == done:
                pilotsPerJob = 99.0
            else:
                pilotsPerJob = 0.0
            # Pilot job efficiency evaluation
            if total > 0:
                jobEff = float(total - aborted) / total * 100
            else:
                jobEff = 100.0
            # Quality status
            if total <= minPilots:
                status = "Idle"
            elif jobEff < 25.0:
                status = "Bad"
            elif jobEff < 60.0:
                status = "Poor"
            elif jobEff < 85.0:
                status = "Fair"
            else:
                status = "Good"
            return ["%.2f" % pilotsPerJob, "%.2f" % jobEff, status]

        # Work on a copy: conditions are removed from and added to the dictionary below
        selectDict = dict(selectDict)
        last_update = selectDict.pop("LastUpdateTime", None)
        site_select = asList(selectDict.pop("GridSite", []))
        status_select = asList(selectDict.pop("Status", []))
        expand_site = ""
        if "ExpandSite" in selectDict:
            expand_site = selectDict.pop("ExpandSite")
            site_select = [expand_site]

        def pilotCounters(newer):
            """Count the pilots matching selectDict by site, CE and status"""
            return self.getCounters(
                "PilotAgents",
                ["GridSite", "DestinationSite", "Status"],
                selectDict,
                newer=newer,
                timeStamp="LastUpdateTime",
            )

        # Get all the data from the database with various selections
        result = pilotCounters(last_update)
        if not result["OK"]:
            return result

        # Pilots aborted during the last hour
        selectDict["Status"] = PilotStatus.ABORTED
        resultHour = pilotCounters(Time.dateTime() - Time.hour)
        if not resultHour["OK"]:
            return resultHour

        # Pilots aborted or done during the last day
        lastDay = Time.dateTime() - Time.day
        selectDict["Status"] = [PilotStatus.ABORTED, PilotStatus.DONE]
        resultDay = pilotCounters(lastDay)
        if not resultDay["OK"]:
            return resultDay

        # Pilots done during the last day with CurrentJobID 0
        selectDict["CurrentJobID"] = 0
        selectDict["Status"] = PilotStatus.DONE
        resultDayEmpty = pilotCounters(lastDay)
        if not resultDayEmpty["OK"]:
            return resultDayEmpty

        ceMap = {}
        resMap = getCESiteMapping()
        if resMap["OK"]:
            ceMap = resMap["Value"]

        # Sort out different counters
        resultDict = {}
        resultDict["Unknown"] = {}

        def siteAndCE(attDict):
            """Return the (site, CE) pair of a counter, resolving an Unknown site from the CE"""
            site = attDict["GridSite"]
            ce = attDict["DestinationSite"]
            if site == "Unknown" and ce not in ("Unknown", "Multiple") and ce in ceMap:
                site = ceMap[ce]
            return site, ce

        def ceCounters(site, ce):
            """Return the counters of a (site, CE) pair, creating them zeroed when missing"""
            ceDict = resultDict.setdefault(site, {})
            if ce not in ceDict:
                ceDict[ce] = dict.fromkeys(allStateNames, 0)
            return ceDict[ce]

        for attDict, count in result["Value"]:
            site, ce = siteAndCE(attDict)
            ceCounters(site, ce)[attDict["Status"]] = count

        # The Done and Aborted counters are replaced by the ones of the last day
        for attDict, count in resultDay["Value"]:
            state = attDict["Status"]
            if state in (PilotStatus.DONE, PilotStatus.ABORTED):
                site, ce = siteAndCE(attDict)
                ceCounters(site, ce)[state] = count

        for attDict, count in resultDayEmpty["Value"]:
            if attDict["Status"] == PilotStatus.DONE:
                site, ce = siteAndCE(attDict)
                ceCounters(site, ce)["Done_Empty"] = count

        for attDict, count in resultHour["Value"]:
            if attDict["Status"] == PilotStatus.ABORTED:
                site, ce = siteAndCE(attDict)
                ceCounters(site, ce)["Aborted_Hour"] = count

        records = []
        siteSumDict = {}
        for site in resultDict:
            ceDict = resultDict[site]
            sumDict = dict.fromkeys(allStateNames, 0)
            sumDict["Total"] = 0
            for ce in ceDict:
                counters = ceDict[ce]
                itemList = [site, ce]
                total = 0
                for state in allStateNames:
                    itemList.append(counters[state])
                    sumDict[state] += counters[state]
                    # Done_Empty and Aborted_Hour are already counted in Done and Aborted
                    if state not in ("Aborted_Hour", "Done_Empty"):
                        total += counters[state]

                sumDict["Total"] += total
                # Add the total number of pilots seen in the last day
                itemList.append(total)
                # Add the efficiencies and the quality status of the CE
                itemList.extend(
                    evaluate(
                        counters[PilotStatus.DONE],
                        counters["Done_Empty"],
                        counters[PilotStatus.ABORTED],
                        total,
                        10,
                    )
                )

                if len(ceDict) == 1 or expand_site:
                    records.append(itemList)

            if len(ceDict) > 1 and not expand_site:
                # One summary record for all the CEs of the site
                itemList = [site, "Multiple"]
                itemList.extend(sumDict[state] for state in allStateNames + ["Total"])
                itemList.extend(
                    evaluate(
                        sumDict[PilotStatus.DONE],
                        sumDict["Done_Empty"],
                        sumDict[PilotStatus.ABORTED],
                        sumDict["Total"],
                        10,
                    )
                )
                records.append(itemList)

            for state in allStateNames + ["Total"]:
                siteSumDict[state] = siteSumDict.get(state, 0) + sumDict[state]

        # Perform site selection
        if site_select:
            records = [r for r in records if r[0] in site_select]

        # Perform status selection
        if status_select:
            records = [r for r in records if r[statusIndex] in status_select]

        # Get the Site Mask data
        result = SiteStatus().getUsableSites()
        if result["OK"]:
            siteMask = result["Value"]
            for r in records:
                r.append("Yes" if r[0] in siteMask else "No")
        else:
            for r in records:
                r.append("Unknown")

        finalDict = {}
        finalDict["TotalRecords"] = len(records)
        finalDict["ParameterNames"] = paramNames + [
            "Total", "PilotsPerJob", "PilotJobEff", "Status", "InMask"
        ]

        # Return all the records if maxItems == 0 or the specified number otherwise
        if maxItems:
            finalDict["Records"] = records[startItem:startItem + maxItems]
        else:
            finalDict["Records"] = records

        # Overall efficiencies and quality status, the latter needs more than 100 pilots
        pilotsPerJob, pilotJobEff, status = evaluate(
            siteSumDict[PilotStatus.DONE],
            siteSumDict["Done_Empty"],
            siteSumDict[PilotStatus.ABORTED],
            siteSumDict["Total"],
            100,
        )
        siteSumDict["PilotsPerJob"] = pilotsPerJob
        siteSumDict["PilotJobEff"] = pilotJobEff
        siteSumDict["Status"] = status
        finalDict["Extras"] = siteSumDict

        return S_OK(finalDict)
Example #36
0
class PilotDirector( object ):
  """
    Base Pilot Director class.
    Derived classes must implement:

      * __init__( self, submitPool ):
          that must call the parent class __init__ method and then do its own initialization
      * configure( self, csSection, submitPool ):
          that must call the parent class configure method and the do its own configuration
      * _submitPilot( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit, ceMask,
                      submitPrivatePilot, privateTQ, proxy, pilotsPerJob )
      * _listMatch( self, proxy, jdl, taskQueueID, rb )
      * _getChildrenReferences( self, proxy, parentReference, taskQueueID )


    Derived classes might implement:

      * configureFromSection( self, mySection ):
          to reload from a CS section the additional datamembers they might have defined.

    If additional datamembers are defined, they must:

      - be declared in the __init__
      - be reconfigured in the configureFromSection method by executing
        self.reloadConfiguration( csSection, submitPool ) in their configure method
  """
  gridMiddleware = ''

  def __init__( self, submitPool ):
    """
     Define the logger and some defaults
    """

    if submitPool == self.gridMiddleware:
      self.log = gLogger.getSubLogger( '%sPilotDirector' % self.gridMiddleware )
    else:
      self.log = gLogger.getSubLogger( '%sPilotDirector/%s' % ( self.gridMiddleware, submitPool ) )

    self.pilot = DIRAC_PILOT
    self.submitPoolOption = '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % submitPool
    self.extraPilotOptions = []
    self.installVersion = DIRAC_VERSION
    self.installProject = DIRAC_PROJECT
    self.installation = DIRAC_INSTALLATION
    self.pilotExtensionsList = []

    self.virtualOrganization = VIRTUAL_ORGANIZATION
    self.install = DIRAC_INSTALL
    self.extraModules = DIRAC_MODULES
    self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
    self.targetGrids = [ self.gridMiddleware ]


    self.enableListMatch = ENABLE_LISTMATCH
    self.listMatchDelay = LISTMATCH_DELAY
    self.listMatchCache = DictCache()

    self.privatePilotFraction = PRIVATE_PILOT_FRACTION

    self.errorClearTime = ERROR_CLEAR_TIME
    self.errorTicketTime = ERROR_TICKET_TIME
    self.errorMailAddress = DIRAC.errorMail
    self.alarmMailAddress = DIRAC.alarmMail
    self.mailFromAddress = FROM_MAIL

    self.siteClient = SiteStatus()

    if not  'log' in self.__dict__:
      self.log = gLogger.getSubLogger( 'PilotDirector' )
    self.log.info( 'Initialized' )

  def configure( self, csSection, submitPool ):
    """
     Here goes common configuration for all PilotDirectors
    """
    self.configureFromSection( csSection )
    self.reloadConfiguration( csSection, submitPool )

    # Get the defaults for the Setup where the Director is running
    opsHelper = Operations()
    self.installVersion = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ), [ self.installVersion ] )[0]
    self.installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ), self.installProject )
    self.installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    self.pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", self.pilotExtensionsList )

    self.log.info( '===============================================' )
    self.log.info( 'Configuration:' )
    self.log.info( '' )
    self.log.info( ' Target Grids:   ', ', '.join( self.targetGrids ) )
    self.log.info( ' Install script: ', self.install )
    self.log.info( ' Pilot script:   ', self.pilot )
    self.log.info( ' Pilot modules', self.extraModules )
    self.log.info( ' Install Ver:    ', self.installVersion )
    if self.installProject:
      self.log.info( ' Project:        ', self.installProject )
    if self.installation:
      self.log.info( ' Installation:   ', self.installation )
    if self.extraPilotOptions:
      self.log.info( ' Extra Options:   ', ' '.join( self.extraPilotOptions ) )
    self.log.info( ' ListMatch:      ', self.enableListMatch )
    self.log.info( ' Private %:      ', self.privatePilotFraction * 100 )
    if self.enableListMatch:
      self.log.info( ' ListMatch Delay:', self.listMatchDelay )
    self.listMatchCache.purgeExpired()

  def reloadConfiguration( self, csSection, submitPool ):
    """
     Common Configuration can be overwriten for each GridMiddleware
    """
    mySection = csSection + '/' + self.gridMiddleware
    self.configureFromSection( mySection )

    # And Again for each SubmitPool
    mySection = csSection + '/' + submitPool
    self.configureFromSection( mySection )

  def configureFromSection( self, mySection ):
    """
      reload from CS
    """
    self.pilot = gConfig.getValue( mySection + '/PilotScript'          , self.pilot )
    self.installVersion = gConfig.getValue( mySection + '/Version'         , self.installVersion )
    self.extraPilotOptions = gConfig.getValue( mySection + '/ExtraPilotOptions'    , self.extraPilotOptions )
    self.install = gConfig.getValue( mySection + '/InstallScript'        , self.install )
    self.extraModules = gConfig.getValue( mySection + '/ExtraPilotModules'        , [] ) + self.extraModules
    self.installProject = gConfig.getValue( mySection + '/Project'        , self.installProject )
    self.installation = gConfig.getValue( mySection + '/Installation'        , self.installation )
    self.maxJobsInFillMode = gConfig.getValue( mySection + '/MaxJobsInFillMode'    , self.maxJobsInFillMode )
    self.targetGrids = gConfig.getValue( mySection + '/TargetGrids'    , self.targetGrids )

    self.enableListMatch = gConfig.getValue( mySection + '/EnableListMatch'      , self.enableListMatch )
    self.listMatchDelay = gConfig.getValue( mySection + '/ListMatchDelay'       , self.listMatchDelay )
    self.errorClearTime = gConfig.getValue( mySection + '/ErrorClearTime'       , self.errorClearTime )
    self.errorTicketTime = gConfig.getValue( mySection + '/ErrorTicketTime'      , self.errorTicketTime )
    self.errorMailAddress = gConfig.getValue( mySection + '/ErrorMailAddress'     , self.errorMailAddress )
    self.alarmMailAddress = gConfig.getValue( mySection + '/AlarmMailAddress'     , self.alarmMailAddress )
    self.mailFromAddress = gConfig.getValue( mySection + '/MailFromAddress'      , self.mailFromAddress )
    self.privatePilotFraction = gConfig.getValue( mySection + '/PrivatePilotFraction' , self.privatePilotFraction )

    virtualOrganization = gConfig.getValue( mySection + '/VirtualOrganization' , '' )
    if not virtualOrganization:
      virtualOrganization = getVOForGroup( 'NonExistingGroup' )
      if not virtualOrganization:
        virtualOrganization = self.virtualOrganization
    self.virtualOrganization = virtualOrganization

  def _resolveCECandidates( self, taskQueueDict ):
    """
      Return a list of CEs for this TaskQueue
    """
    # assume user knows what they're doing and avoid site mask e.g. sam jobs
    if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
      self.log.info( 'CEs requested by TaskQueue %s:' % taskQueueDict['TaskQueueID'],
                     ', '.join( taskQueueDict['GridCEs'] ) )
      return taskQueueDict['GridCEs']

    # Get the mask
    ret = self.siteClient.getSites()
    if not ret['OK']:
      self.log.error( 'Can not retrieve site Mask from DB:', ret['Message'] )
      return []

    siteMask = ret['Value']
    if not siteMask:
      self.log.error( 'Site mask is empty' )
      return []

    self.log.verbose( 'Site Mask: %s' % ', '.join( siteMask ) )

    # remove banned sites from siteMask
    if 'BannedSites' in taskQueueDict:
      for site in taskQueueDict['BannedSites']:
        if site in siteMask:
          siteMask.remove( site )
          self.log.verbose( 'Removing banned site %s from site Mask' % site )

    # remove from the mask if a Site is given
    siteMask = [ site for site in siteMask if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites'] ]

    if not siteMask:
      # pilot can not be submitted
      self.log.info( 'No Valid Site Candidate in Mask for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      return []

    self.log.info( 'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    # Get CE's associates to the given site Names
    ceMask = []

    for grid in self.targetGrids:

      section = '/Resources/Sites/%s' % grid
      ret = gConfig.getSections( section )
      if not ret['OK']:
        # this is hack, maintained until LCG is added as TargetGrid for the gLite SubmitPool
        section = '/Resources/Sites/LCG'
        ret = gConfig.getSections( section )

      if not ret['OK']:
        self.log.error( 'Could not obtain CEs from CS', ret['Message'] )
        continue

      gridSites = ret['Value']
      for siteName in gridSites:
        if siteName in siteMask:
          ret = gConfig.getValue( '%s/%s/CE' % ( section, siteName ), [] )
          for ce in ret:
            submissionMode = gConfig.getValue( '%s/%s/CEs/%s/SubmissionMode' % ( section, siteName, ce ), 'gLite' )
            if submissionMode == self.gridMiddleware and ce not in ceMask:
              ceMask.append( ce )

    if not ceMask:
      self.log.info( 'No CE Candidate found for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( siteMask ) )

    self.log.verbose( 'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'], ', '.join( ceMask ) )

    return ceMask

  def _getPilotOptions( self, taskQueueDict, pilotsToSubmit ):

    # Need to limit the maximum number of pilots to submit at once
    # For generic pilots this is limited by the number of use of the tokens and the
    # maximum number of jobs in Filling mode, but for private Jobs we need an extra limitation:
    pilotsToSubmit = max( min( pilotsToSubmit, int( 50 / self.maxJobsInFillMode ) ), 1 )
    pilotOptions = []
    privateIfGenericTQ = self.privatePilotFraction > random.random()
    privateTQ = ( 'PilotTypes' in taskQueueDict and 'private' in [ t.lower() for t in taskQueueDict['PilotTypes'] ] )
    forceGeneric = 'ForceGeneric' in taskQueueDict
    submitPrivatePilot = ( privateIfGenericTQ or privateTQ ) and not forceGeneric
    if submitPrivatePilot:
      self.log.verbose( 'Submitting private pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      ownerDN = taskQueueDict['OwnerDN']
      ownerGroup = taskQueueDict['OwnerGroup']
      # User Group requirement
      pilotOptions.append( '-G %s' % taskQueueDict['OwnerGroup'] )
      # check if group allows jobsharing
      ownerGroupProperties = getPropertiesForGroup( ownerGroup )
      if not 'JobSharing' in ownerGroupProperties:
        # Add Owner requirement to pilot
        pilotOptions.append( "-O '%s'" % ownerDN )
      if privateTQ:
        pilotOptions.append( '-o /Resources/Computing/CEDefaults/PilotType=private' )
      maxJobsInFillMode = self.maxJobsInFillMode
    else:
      #For generic jobs we'll submit mixture of generic and private pilots
      self.log.verbose( 'Submitting generic pilots for TaskQueue %s' % taskQueueDict['TaskQueueID'] )
      #ADRI: Find the generic group
      result = findGenericPilotCredentials( group = taskQueueDict[ 'OwnerGroup' ] )
      if not result[ 'OK' ]:
        self.log.error( ERROR_GENERIC_CREDENTIALS, result[ 'Message' ] )
        return S_ERROR( ERROR_GENERIC_CREDENTIALS )
      ownerDN, ownerGroup = result[ 'Value' ]

      result = gProxyManager.requestToken( ownerDN, ownerGroup, max( pilotsToSubmit, self.maxJobsInFillMode ) )
      if not result[ 'OK' ]:
        self.log.error( ERROR_TOKEN, result['Message'] )
        return S_ERROR( ERROR_TOKEN )
      ( token, numberOfUses ) = result[ 'Value' ]
      pilotsToSubmit = min( numberOfUses, pilotsToSubmit )

      pilotOptions.append( '-o /Security/ProxyToken=%s' % token )

      pilotsToSubmit = max( 1, ( pilotsToSubmit - 1 ) / self.maxJobsInFillMode + 1 )

      maxJobsInFillMode = int( numberOfUses / pilotsToSubmit )
    # Use Filling mode
    pilotOptions.append( '-M %s' % maxJobsInFillMode )

    # Debug
    pilotOptions.append( '-d' )
    # Setup.
    pilotOptions.append( '-S %s' % taskQueueDict['Setup'] )
    # CS Servers
    csServers = gConfig.getServersList()
    if len( csServers ) > 3:
      # Remove the master
      master = gConfigurationData.getMasterServer()
      if master in csServers:
        csServers.remove( master )
    pilotOptions.append( '-C %s' % ",".join( csServers ) )
    # DIRAC Extensions to be used in pilots
    # ubeda: I'm not entirely sure if we can use here the same opsHelper as in line
    # line +352
    pilotExtensionsList = Operations().getValue( "Pilot/Extensions", [] )
    extensionsList = []
    if pilotExtensionsList:
      if pilotExtensionsList[0] != 'None':
        extensionsList = pilotExtensionsList
    else:
      extensionsList = getCSExtensions()
    if extensionsList:
      pilotOptions.append( '-e %s' % ",".join( extensionsList ) )

    #Get DIRAC version and project, There might be global Setup defaults and per VO/Setup defaults (from configure)
    opsHelper = Operations( group = taskQueueDict['OwnerGroup'], setup = taskQueueDict['Setup'] )
    # Requested version of DIRAC (it can be a list, so we take the fist one)
    version = opsHelper.getValue( cfgPath( 'Pilot', 'Version' ) , [ self.installVersion ] )[0]
    pilotOptions.append( '-r %s' % version )
    # Requested Project to install
    installProject = opsHelper.getValue( cfgPath( 'Pilot', 'Project' ) , self.installProject )
    if installProject:
      pilotOptions.append( '-l %s' % installProject )
    installation = opsHelper.getValue( cfgPath( 'Pilot', 'Installation' ), self.installation )
    if installation:
      pilotOptions.append( "-V %s" % installation )
    # Requested CPU time
    pilotOptions.append( '-T %s' % taskQueueDict['CPUTime'] )

    if self.submitPoolOption not in self.extraPilotOptions:
      pilotOptions.append( self.submitPoolOption )

    if self.extraPilotOptions:
      pilotOptions.extend( self.extraPilotOptions )

    return S_OK( ( pilotOptions, pilotsToSubmit, ownerDN, ownerGroup, submitPrivatePilot, privateTQ ) )

  def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                     ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """
      This method must be implemented on the Backend specific derived class.
      This is problem with the Director, not with the Job so we must return S_OK
      Return S_ERROR if not defined.
    """
    self.log.error( '_submitPilots method not implemented' )
    return S_OK()

  def _submitPilot( self, proxy, pilotsToSubmit, jdl, taskQueueID, rb ):
    """ Submit pilot and get back the reference
    """
    self.log.error( '_submitPilot method not implemented' )
    return S_OK()

  def _listMatch( self, proxy, jdl, taskQueueID, rb ):
    """ This method must be implemented on the Backend specific derived class.
    """
    self.log.error( '_listMatch method not implemented' )
    return S_OK()

  def _getChildrenReferences( self, proxy, parentReference, taskQueueID ):
    """ This method must be implemented on the Backend specific derived class.
    """
    self.log.error( '_getChildrenReferences method not implemented' )
    return S_OK()

  def submitPilots( self, taskQueueDict, pilotsToSubmit, workDir = None ):
    """
      Prepare the submission of pilots for the given TaskQueue.

      The CE candidates, the pilot options and a pilot proxy are resolved here;
      the final submission is delegated to self._submitPilots.

      :param taskQueueDict: TaskQueue description, it must contain 'TaskQueueID'
      :param pilotsToSubmit: number of pilots requested, forwarded to the helpers
      :param workDir: working directory forwarded to self._submitPilots
      :return: the result of self._submitPilots; S_ERROR if there is no CE or no
               proxy available; the failed result of self._getPilotOptions;
               S_OK( 0 ) if any exception is raised on the way
    """
    try:
      tqID = taskQueueDict['TaskQueueID']
      self.log.verbose( 'Submitting Pilot' )

      candidateCEs = self._resolveCECandidates( taskQueueDict )
      if not candidateCEs:
        return S_ERROR( 'No CE available for TaskQueue %d' % int( tqID ) )

      optionsResult = self._getPilotOptions( taskQueueDict, pilotsToSubmit )
      if not optionsResult['OK']:
        return optionsResult
      pilotOptions, pilotsPerJob, ownerDN, ownerGroup, submitPrivatePilot, privateTQ = optionsResult['Value']

      # Request a long proxy so that it does not have to be renewed
      proxyResult = self._getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft = 86400 * 5 )
      if not proxyResult['OK']:
        self.log.error( proxyResult['Message'] )
        self.log.error( 'No proxy Available', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
        return S_ERROR( ERROR_PROXY )
      pilotProxy = proxyResult['Value']

      # The Grid specific method takes care of the final submission of the pilots
      return self._submitPilots( workDir, taskQueueDict, pilotOptions,
                                 pilotsToSubmit, candidateCEs,
                                 submitPrivatePilot, privateTQ,
                                 pilotProxy, pilotsPerJob )

    except Exception:
      # Best effort: the failure is logged and reported as "0 pilots submitted"
      self.log.exception( 'Error in Pilot Submission' )
      return S_OK( 0 )

  def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """
     Get the pilot proxy of the given owner DN and group from gProxyManager.
     To be overwritten if a given Pilot does not require a full proxy.

     :return: the result of gProxyManager.getPilotProxyFromDIRACGroup
    """
    announcement = "Downloading %s@%s proxy" % ( ownerDN, ownerGroup )
    self.log.info( announcement )
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup, requiredTimeLeft )
    return proxyResult

  def exceptionCallBack( self, threadedJob, exceptionInfo ):
    """ Log that a pilot submission failed with an exception.

        Both arguments are ignored, only a fixed message is logged.
    """
    failureMessage = 'Error in Pilot Submission'
    self.log.exception( failureMessage )
Example #37
0
    def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
        """ Get summary of the pilot jobs status by CE/site in a standard structure

        One record is produced per (site, CE) pair when the site has a single CE
        or when 'ExpandSite' is requested; otherwise the CEs of a site are merged
        into one record whose CE is 'Multiple'. The fields of a record follow the
        order of 'ParameterNames'.

        :param dict selectDict: selection conditions passed to getCounters. The keys
               'LastUpdateTime', 'GridSite', 'Status' and 'ExpandSite' are
               interpreted here. The dictionary of the caller is not modified.
        :param sortList: not used by this method
        :param int startItem: index of the first returned record (only used when
               maxItems is set)
        :param int maxItems: maximum number of returned records, 0 means all
        :return: S_OK(dict) with the keys 'TotalRecords', 'ParameterNames',
                 'Records' and 'Extras' (sums over all the sites, computed before
                 the site and status selections), or the failed getCounters result
        """

        stateNames = [
            'Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done',
            'Aborted', 'Failed'
        ]
        allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
        sumNames = allStateNames + ['Total']
        paramNames = ['Site', 'CE'] + sumNames + \
            ['PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']
        # Position of the quality status in a record; derived from the names so
        # that it stays right when the list of states changes
        statusIndex = paramNames.index('Status')

        def _evaluate(done, empty, aborted, total, minTotal):
            """ Return the [PilotsPerJob, PilotJobEff, Status] fields of a record """
            # Pilot submission efficiency evaluation
            if done - empty > 0:
                pilotsPerJob = float(done) / (done - empty)
            elif done and empty == done:
                pilotsPerJob = 99.
            else:
                pilotsPerJob = 0.
            # Pilot job efficiency evaluation, as a percentage
            if total > 0:
                jobEff = float(total - aborted) / total * 100
            else:
                jobEff = 100.
            # Quality status, only meaningful above minTotal pilots
            if total <= minTotal:
                status = 'Idle'
            elif jobEff < 25.:
                status = 'Bad'
            elif jobEff < 60.:
                status = 'Poor'
            elif jobEff < 85.:
                status = 'Fair'
            else:
                status = 'Good'
            return ['%.2f' % pilotsPerJob, '%.2f' % jobEff, status]

        # Work on a copy: the selection is rewritten several times below
        selectDict = dict(selectDict)

        last_update = selectDict.pop('LastUpdateTime', None)

        site_select = selectDict.pop('GridSite', [])
        if not isinstance(site_select, list):
            site_select = [site_select]

        status_select = selectDict.pop('Status', [])
        if not isinstance(status_select, list):
            status_select = [status_select]

        expand_site = ''
        if 'ExpandSite' in selectDict:
            expand_site = selectDict.pop('ExpandSite')
            site_select = [expand_site]

        def _getCounters(newer):
            """ Count the pilots matching the current selectDict, updated after newer """
            return self.getCounters('PilotAgents',
                                    ['GridSite', 'DestinationSite', 'Status'],
                                    selectDict,
                                    newer=newer,
                                    timeStamp='LastUpdateTime')

        # Get all the data from the database with various selections
        result = _getCounters(last_update)
        if not result['OK']:
            return result

        selectDict['Status'] = 'Aborted'
        resultHour = _getCounters(Time.dateTime() - Time.hour)
        if not resultHour['OK']:
            return resultHour

        lastDay = Time.dateTime() - Time.day
        selectDict['Status'] = ['Aborted', 'Done']
        resultDay = _getCounters(lastDay)
        if not resultDay['OK']:
            return resultDay

        selectDict['CurrentJobID'] = 0
        selectDict['Status'] = 'Done'
        resultDayEmpty = _getCounters(lastDay)
        if not resultDayEmpty['OK']:
            return resultDayEmpty

        ceMap = {}
        resMap = getCESiteMapping()
        if resMap['OK']:
            ceMap = resMap['Value']

        def _siteAndCE(attDict):
            """ Return (site, ce), taking the site from the CE mapping if it is unknown """
            site = attDict['GridSite']
            ce = attDict['DestinationSite']
            if site == 'Unknown' and ce not in ('Unknown', 'Multiple') and ce in ceMap:
                site = ceMap[ce]
            return site, ce

        # Sort out different counters
        resultDict = {'Unknown': {}}
        for attDict, count in result['Value']:
            site, ce = _siteAndCE(attDict)
            ceCounters = resultDict.setdefault(site, {})
            if ce not in ceCounters:
                ceCounters[ce] = dict.fromkeys(allStateNames, 0)
            ceCounters[ce][attDict['Status']] = count

        def _override(counterResult, state, target):
            """ Store the counts of the given state under the target counter name """
            for attDict, count in counterResult['Value']:
                if attDict['Status'] != state:
                    continue
                site, ce = _siteAndCE(attDict)
                counters = resultDict.get(site, {}).get(ce)
                # Pairs that the main selection did not return are left out
                if counters is not None:
                    counters[target] = count

        # Done and Aborted are reported for the last day only
        _override(resultDay, 'Done', 'Done')
        _override(resultDay, 'Aborted', 'Aborted')
        _override(resultDayEmpty, 'Done', 'Done_Empty')
        _override(resultHour, 'Aborted', 'Aborted_Hour')

        records = []
        siteSumDict = dict.fromkeys(sumNames, 0)
        for site, ceCounters in resultDict.items():
            sumDict = dict.fromkeys(sumNames, 0)
            for ce, counters in ceCounters.items():
                itemList = [site, ce]
                for state in allStateNames:
                    itemList.append(counters[state])
                    sumDict[state] += counters[state]
                # Done_Empty and Aborted_Hour are subsets of Done and Aborted
                total = sum(counters[state] for state in stateNames)
                sumDict['Total'] += total
                itemList.append(total)
                itemList.extend(_evaluate(counters['Done'], counters['Done_Empty'],
                                          counters['Aborted'], total, 10))

                if len(ceCounters) == 1 or expand_site:
                    records.append(itemList)

            if len(ceCounters) > 1 and not expand_site:
                itemList = [site, 'Multiple']
                itemList.extend(sumDict[state] for state in sumNames)
                itemList.extend(_evaluate(sumDict['Done'], sumDict['Done_Empty'],
                                          sumDict['Aborted'], sumDict['Total'], 10))
                records.append(itemList)

            for state in sumNames:
                siteSumDict[state] += sumDict[state]

        # Perform site selection
        if site_select:
            records = [r for r in records if r[0] in site_select]

        # Perform status selection
        if status_select:
            records = [r for r in records if r[statusIndex] in status_select]

        # Get the Site Mask data
        result = SiteStatus().getUsableSites()
        if result['OK']:
            siteMask = result['Value']
            for r in records:
                r.append('Yes' if r[0] in siteMask else 'No')
        else:
            for r in records:
                r.append('Unknown')

        finalDict = {}
        finalDict['TotalRecords'] = len(records)
        finalDict['ParameterNames'] = paramNames

        # Return all the records if maxItems == 0 or the specified number otherwise
        if maxItems:
            finalDict['Records'] = records[startItem:startItem + maxItems]
        else:
            finalDict['Records'] = records

        # Overall evaluation; the quality status needs more than 100 pilots here
        pilotsPerJob, pilotJobEff, status = _evaluate(siteSumDict['Done'],
                                                      siteSumDict['Done_Empty'],
                                                      siteSumDict['Aborted'],
                                                      siteSumDict['Total'], 100)
        siteSumDict['PilotsPerJob'] = pilotsPerJob
        siteSumDict['PilotJobEff'] = pilotJobEff
        siteSumDict['Status'] = status
        finalDict['Extras'] = siteSumDict

        return S_OK(finalDict)
Example #38
0
class SiteInspectorAgent(AgentModule):
    """ SiteInspectorAgent

    The SiteInspectorAgent agent is an agent that is used to get all the site
    names and trigger PEP to evaluate their status.

    """

    # Max number of worker threads by default
    __maxNumberOfThreads = 15

    # Inspection freqs, defaults, the lower, the higher priority to be checked.
    # Error state usually means there is a glitch somewhere, so it has the highest
    # priority.
    __checkingFreqs = {
        'Active': 20,
        'Degraded': 20,
        'Probing': 20,
        'Banned': 15,
        'Unknown': 10,
        'Error': 5
    }

    def __init__(self, *args, **kwargs):
        """ Declare the attributes, they get their values in initialize / execute.
        """

        AgentModule.__init__(self, *args, **kwargs)

        # Queue of the sites to process in the current cycle, set by execute()
        self.sitesToBeChecked = None
        self.threadPool = None
        self.siteClient = None
        # Clients handed over to the PEP
        self.clients = {}

    def initialize(self):
        """ Standard initialize: creates the thread pool and the clients.

        :return: S_OK()
        """

        maxNumberOfThreads = self.am_getOption('maxNumberOfThreads',
                                               self.__maxNumberOfThreads)
        self.threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

        self.siteClient = SiteStatus()

        self.clients['SiteStatus'] = self.siteClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()

        return S_OK()

    def execute(self):
        """ execute

        This is the main method of the agent. It gets the sites to be checked,
        calculates how many threads should be started and queues that many jobs
        in the thread pool. Each job takes sites from the queue until it is empty.
        At the end, the method joins the queue such that the agent will not
        terminate a cycle until all sites have been processed.

        :return: S_OK(), or the failed result of getSitesToBeChecked
        """

        # Gets sites to be checked ( returns a Queue )
        sitesToBeChecked = self.getSitesToBeChecked()
        if not sitesToBeChecked['OK']:
            self.log.error(sitesToBeChecked['Message'])
            return sitesToBeChecked
        self.sitesToBeChecked = sitesToBeChecked['Value']

        queueSize = self.sitesToBeChecked.qsize()
        pollingTime = self.am_getPollingTime()

        # Assigns number of threads on the fly such that we exhaust the PollingTime
        # without having to spawn too many threads. We assume 10 seconds per element
        # to be processed ( actually, it takes something like 1 sec per element ):
        # numberOfThreads = elements * 10(s/element) / pollingTime
        numberOfThreads = int(math.ceil(queueSize * 10. / pollingTime))

        self.log.info('Needed %d threads to process %d elements' %
                      (numberOfThreads, queueSize))

        for _x in range(numberOfThreads):
            jobUp = self.threadPool.generateJobAndQueueIt(self._execute)
            if not jobUp['OK']:
                self.log.error(jobUp['Message'])

        self.log.info('blocking until all sites have been processed')
        # block until all tasks are done
        self.sitesToBeChecked.join()
        self.log.info('done')

        return S_OK()

    def getSitesToBeChecked(self):
        """ getSitesToBeChecked

        This method gets all the site names from the site client, then their
        current statuses, and puts one description dictionary per site in a queue.

        :return: S_OK(Queue), or the failed result of the site client
        """

        toBeChecked = Queue.Queue()

        res = self.siteClient.getSites('All')
        if not res['OK']:
            return res

        # get the current status
        res = self.siteClient.getSiteStatuses(res['Value'])
        if not res['OK']:
            return res

        # one element per site
        for site in res['Value']:
            status = res['Value'].get(site, 'Unknown')

            toBeChecked.put({
                'status': status,
                'name': site,
                'site': site,
                'element': 'Site',
                'statusType': 'all',
                'elementType': 'Site'
            })

        return S_OK(toBeChecked)

    # Private methods ............................................................

    def _execute(self):
        """
        Method run by each of the jobs queued in the ThreadPool.
        It enters a loop until there are no sites on the queue.

        On each iteration, it evaluates the policies for such site
        and enforces the necessary actions. If there are no more sites in the
        queue, the loop is finished.

        :return: S_OK() once the queue is empty
        """

        pep = PEP(clients=self.clients)

        while True:

            try:
                site = self.sitesToBeChecked.get_nowait()
            except Queue.Empty:
                return S_OK()

            try:
                resEnforce = pep.enforce(site)
                if not resEnforce['OK']:
                    self.log.error('Failed policy enforcement',
                                   resEnforce['Message'])
            except Exception:
                # A failing site must neither stop this worker nor leave the
                # element unaccounted: execute() is blocked on the queue join
                self.log.exception('Exception during policy enforcement')
            finally:
                # Used together with join !
                self.sitesToBeChecked.task_done()
Example #39
0
 def initialize(self):
     """ Create the SiteStatus client, keep it in self.siteClient and return S_OK().
     """
     siteClient = SiteStatus()
     self.siteClient = siteClient
     return S_OK()
Example #40
0
class DiracAdmin(API):
    """ Administrative functionalities
  """

    #############################################################################
    def __init__(self):
        """Internal initialization of the DIRAC Admin API.

        Creates the CSAPI and SiteStatus helpers, reads the LogLevel and
        ScratchDir options below self.section, and caches the current directory,
        the RSS flag and the set of defined site names.
        """
        super(DiracAdmin, self).__init__()

        self.csAPI = CSAPI()

        # Debug mode is on when the configured level is DEBUG, which is the default
        logLevel = gConfig.getValue(self.section + '/LogLevel', 'DEBUG')
        self.dbg = logLevel == 'DEBUG'

        scratchOption = self.section + '/ScratchDir'
        self.scratchDir = gConfig.getValue(scratchOption, '/tmp')
        self.currentDir = os.getcwd()
        self.rssFlag = ResourceStatus().rssFlag
        self.sitestatus = SiteStatus()

        # An empty set is kept if getSites() carries no 'Value'
        sitesResult = getSites()
        self._siteSet = set(sitesResult.get('Value', []))

    #############################################################################
    def uploadProxy(self):
        """Upload a proxy to the DIRAC WMS through gProxyManager.

        The method takes no argument.

        Example usage:

          >>> gLogger.notice(diracAdmin.uploadProxy())

        :return: S_OK,S_ERROR as returned by gProxyManager.uploadProxy()
        """
        uploadResult = gProxyManager.uploadProxy()
        return uploadResult

    #############################################################################
    def setProxyPersistency(self, userDN, userGroup, persistent=True):
        """Set the persistence of a proxy in the Proxy Manager

        Example usage:

          >>> gLogger.notice(diracAdmin.setProxyPersistency( 'some DN', 'dirac group', True ))

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param persistent: Persistent flag
        :type persistent: boolean
        :return: S_OK,S_ERROR as returned by gProxyManager.setPersistency
        """
        persistencyResult = gProxyManager.setPersistency(userDN, userGroup,
                                                         persistent)
        return persistencyResult

    #############################################################################
    def checkProxyUploaded(self, userDN, userGroup, requiredTime):
        """Ask the Proxy Manager whether the user has a proxy uploaded

        Example usage:

          >>> gLogger.notice(diracAdmin.checkProxyUploaded( 'some DN', 'dirac group', 0 ))

        :param userDN: User DN
        :type userDN: string
        :param userGroup: DIRAC Group
        :type userGroup: string
        :param requiredTime: Required life time of the uploaded proxy
        :return: S_OK,S_ERROR as returned by gProxyManager.userHasProxy
        """
        hasProxyResult = gProxyManager.userHasProxy(userDN, userGroup,
                                                    requiredTime)
        return hasProxyResult

    #############################################################################
    def getSiteMask(self, printOutput=False, status='Active'):
        """Retrieve the sites that are in the given state from the SiteStatus client.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteMask())

        :param printOutput: if True, the site list is sorted in place and printed
        :type printOutput: boolean
        :param status: site state to select
        :type status: string
        :return: S_OK,S_ERROR as returned by getSites

        """
        result = self.sitestatus.getSites(siteState=status)
        if not result['OK']:
            return result

        siteNames = result['Value']
        if printOutput:
            # In place: the returned list is sorted as well
            siteNames.sort()
            for siteName in siteNames:
                gLogger.notice(siteName)

        return result

    #############################################################################
    def getBannedSites(self, printOutput=False):
        """Retrieve the sorted list of the sites that are Banned or Probing.

        Example usage:

          >>> gLogger.notice(diracAdmin.getBannedSites())

        :param printOutput: if True, the sites are printed, one per line
        :type printOutput: boolean
        :return: S_OK( sorted list of sites ), or the failed getSites result

        """
        sitesPerState = []
        for siteState in ('Banned', 'Probing'):
            stateResult = self.sitestatus.getSites(siteState=siteState)
            if not stateResult['OK']:
                return stateResult
            sitesPerState.append(stateResult['Value'])

        mergedList = sorted(sitesPerState[0] + sitesPerState[1])

        if printOutput:
            gLogger.notice('\n'.join(mergedList))

        return S_OK(mergedList)

    #############################################################################
    def getSiteSection(self, site, printOutput=False):
        """Get the options of the configuration section of a DIRAC site.

        The grid type is the part of the site name before the first dot.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteSection('LCG.CERN.ch'))

        :param site: DIRAC site name
        :type site: string
        :param printOutput: if True, the options are pretty-printed
        :type printOutput: boolean
        :return: the getOptionsDict result, or S_ERROR if the grid type has no
                 section below /Resources/Sites
        """
        gridType = site.split('.')[0]

        gridSections = gConfig.getSections('/Resources/Sites/%s' % (gridType))
        if not gridSections['OK']:
            return S_ERROR('/Resources/Sites/%s is not a valid site section' %
                           (gridType))

        sitePath = '/Resources/Sites/%s/%s' % (gridType, site)
        result = gConfig.getOptionsDict(sitePath)
        if result['OK'] and printOutput:
            gLogger.notice(self.pPrint.pformat(result['Value']))
        return result

    #############################################################################
    def allowSite(self, site, comment, printOutput=False):
        """Adds the site to the site mask, i.e. sets its status to Active.

        Nothing is changed if the site is already Active. The status is set
        through the SiteStatus client when the RSS flag is on, through the
        WMSAdministratorClient otherwise.

        Example usage:

          >>> gLogger.notice(diracAdmin.allowSite('LCG.CERN.ch', 'some comment'))

        :param site: site name
        :param comment: comment stored with the change
        :param printOutput: if True, the outcome is printed
        :type printOutput: boolean
        :return: S_OK,S_ERROR

        """
        validation = self.__checkSiteIsValid(site)
        if not validation['OK']:
            return validation

        activeSites = self.getSiteMask(status='Active')
        if not activeSites['OK']:
            return activeSites
        if site in activeSites['Value']:
            alreadyActive = 'Site %s is already Active' % site
            if printOutput:
                gLogger.notice(alreadyActive)
            return S_OK(alreadyActive)

        if self.rssFlag:
            outcome = self.sitestatus.setSiteStatus(site, 'Active', comment)
        else:
            outcome = WMSAdministratorClient().allowSite(site, comment)

        if outcome['OK'] and printOutput:
            gLogger.notice('Site %s status is set to Active' % site)

        return outcome

    #############################################################################
    def getSiteMaskLogging(self, site=None, printOutput=False):
        """Retrieves site mask logging information.

        The information comes from the 'History' of the 'Site' status elements
        when the RSS flag is on, from the WMSAdministratorClient otherwise.

        Example usage:

          >>> gLogger.notice(diracAdmin.getSiteMaskLogging('LCG.AUVER.fr'))

        :param site: site name; None (the default) is passed on without a check
                     on the site name
        :param printOutput: if True, the logging information is printed
        :type printOutput: boolean
        :return: S_OK() without a value, or S_ERROR
        """
        # The default (None) is not a site name: only validate explicit requests,
        # otherwise a call without a site could never succeed
        if site is not None:
            result = self.__checkSiteIsValid(site)
            if not result['OK']:
                return result

        if self.rssFlag:
            result = ResourceStatusClient().selectStatusElement('Site',
                                                                'History',
                                                                name=site)
        else:
            result = WMSAdministratorClient().getSiteMaskLogging(site)

        if not result['OK']:
            return result

        if printOutput:
            if site:
                gLogger.notice('\nSite Mask Logging Info for %s\n' % site)
            else:
                gLogger.notice('\nAll Site Mask Logging Info\n')

            sitesLogging = result['Value']
            if isinstance(sitesLogging, dict):
                for siteName, tupleList in sitesLogging.items():
                    # Without a requested site, each group of entries gets a
                    # header naming the site it belongs to
                    if not site:
                        gLogger.notice('\n===> %s\n' % siteName)
                    for tup in tupleList:
                        stup = str(tup[0]).ljust(8) + str(tup[1]).ljust(20)
                        stup += '( ' + str(tup[2]) + ' )  "' + str(tup[3]) + '"'
                        gLogger.notice(stup)
                    gLogger.notice(' ')
            elif isinstance(sitesLogging, list):
                for sl in sitesLogging:
                    gLogger.notice((sl[1], sl[3], sl[4]))

        return S_OK()

    #############################################################################
    def banSite(self, site, comment, printOutput=False):
        """Removes the site from the site mask, i.e. sets its status to Banned.

        Nothing is changed if the site is already Banned. The status is set
        through the SiteStatus client when the RSS flag is on, through the
        WMSAdministratorClient otherwise.

        Example usage:

          >>> gLogger.notice(diracAdmin.banSite('LCG.CERN.ch', 'some comment'))

        :param site: site name
        :param comment: comment stored with the change
        :param printOutput: if True, the outcome is printed
        :type printOutput: boolean
        :return: S_OK,S_ERROR

        """
        validation = self.__checkSiteIsValid(site)
        if not validation['OK']:
            return validation

        bannedSites = self.getSiteMask(status='Banned')
        if not bannedSites['OK']:
            return bannedSites
        if site in bannedSites['Value']:
            alreadyBanned = 'Site %s is already Banned' % site
            if printOutput:
                gLogger.notice(alreadyBanned)
            return S_OK(alreadyBanned)

        if self.rssFlag:
            outcome = self.sitestatus.setSiteStatus(site, 'Banned', comment)
        else:
            outcome = WMSAdministratorClient().banSite(site, comment)

        if outcome['OK'] and printOutput:
            gLogger.notice('Site %s status is set to Banned' % site)

        return outcome

    #############################################################################
    def __checkSiteIsValid(self, site):
        """Internal function to check that a site name is valid.

        :param site: a site name, or a list, set or dict of site names
        :return: S_OK() if every given name is in self._siteSet, otherwise an
                 S_ERROR naming the site, or the set of sites, not defined
        """
        if isinstance(site, (list, set, dict)):
            # Only the names that are not defined are reported
            offending = set(site) - self._siteSet
            isValid = not offending
        else:
            offending = site
            isValid = site in self._siteSet

        if isValid:
            return S_OK()
        return S_ERROR('Specified site %s is not in list of defined sites' %
                       str(offending))

    #############################################################################
    def getServicePorts(self, setup='', printOutput=False):
        """Checks the service ports for the specified setup.  If not given this is
        taken from the current installation (/DIRAC/Setup)

        Example usage:

          >>> gLogger.notice(diracAdmin.getServicePorts())

        :param setup: name of the setup, it must be a section of /DIRAC/Setups
        :type setup: string
        :param printOutput: if True, the ports are pretty-printed
        :type printOutput: boolean
        :return: S_OK( dict of 'System/Service' : port ), or S_ERROR

        """
        if not setup:
            setup = gConfig.getValue('/DIRAC/Setup', '')

        setupsResult = gConfig.getSections('/DIRAC/Setups', [])
        if not setupsResult['OK']:
            return S_ERROR('Could not get /DIRAC/Setups sections')
        knownSetups = setupsResult['Value']
        if setup not in knownSetups:
            return S_ERROR('Setup %s is not in allowed list: %s' %
                           (setup, ', '.join(knownSetups)))

        setupOptions = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
        if not setupOptions['OK']:
            return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)
        instancePerSystem = setupOptions['Value']  # dict

        systemsResult = gConfig.getSections('/Systems')
        if not systemsResult['OK']:
            return S_ERROR('Could not get Systems sections')

        ports = {}
        for system in systemsResult['Value']:
            if system not in instancePerSystem:
                self.log.warn('%s is not defined in /DIRAC/Setups/%s' %
                              (system, setup))
                continue

            path = '/Systems/%s/%s/Services' % (system,
                                                instancePerSystem[system])
            sectionsResult = gConfig.getSections(path)
            if not sectionsResult['OK']:
                self.log.warn('Could not get sections in %s' % path)
                continue

            services = sectionsResult['Value'] or []
            self.log.verbose('System: %s ServicesList: %s' %
                             (system, ', '.join(services)))
            for service in services:
                spath = '%s/%s/Port' % (path, service)
                servicePort = gConfig.getValue(spath, 0)
                if not servicePort:
                    self.log.warn('No port found for %s' % spath)
                    continue
                self.log.verbose('Found port for %s/%s = %s' %
                                 (system, service, servicePort))
                ports['%s/%s' % (system, service)] = servicePort

        if printOutput:
            gLogger.notice(self.pPrint.pformat(ports))

        return S_OK(ports)

    #############################################################################
    def getProxy(self, userDN, userGroup, validity=43200, limited=False):
        """Retrieves a proxy through gProxyManager.downloadProxy.

        Example usage:

          >>> gLogger.notice(diracAdmin.getProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: passed as requiredTimeLeft, 43200 by default
        :param limited: passed as the limited flag
        :return: S_OK,S_ERROR

        """
        downloadResult = gProxyManager.downloadProxy(userDN,
                                                     userGroup,
                                                     limited=limited,
                                                     requiredTimeLeft=validity)
        return downloadResult

    #############################################################################
    def getVOMSProxy(self,
                     userDN,
                     userGroup,
                     vomsAttr=False,
                     validity=43200,
                     limited=False):
        """Retrieves a proxy with VOMS extensions through
        gProxyManager.downloadVOMSProxy.

        Example usage:

          >>> gLogger.notice(diracAdmin.getVOMSProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param vomsAttr: passed as requiredVOMSAttribute
        :param validity: passed as requiredTimeLeft, 43200 by default
        :param limited: passed as the limited flag
        :return: S_OK,S_ERROR

        """
        downloadResult = gProxyManager.downloadVOMSProxy(
            userDN,
            userGroup,
            limited=limited,
            requiredVOMSAttribute=vomsAttr,
            requiredTimeLeft=validity)
        return downloadResult

    #############################################################################
    def getPilotProxy(self, userDN, userGroup, validity=43200):
        """Retrieves a pilot proxy through
        gProxyManager.getPilotProxyFromDIRACGroup.

        Example usage:

          >>> gLogger.notice(diracAdmin.getPilotProxy('some DN', 'dirac group'))

        :param userDN: User DN
        :param userGroup: DIRAC Group
        :param validity: passed as requiredTimeLeft, 43200 by default
        :return: S_OK,S_ERROR

        """
        pilotProxyResult = gProxyManager.getPilotProxyFromDIRACGroup(
            userDN, userGroup, requiredTimeLeft=validity)
        return pilotProxyResult

    #############################################################################
    def resetJob(self, jobID):
        """Reset a job or a list of jobs through the JobManager client.

        String job IDs and the members of a list of job IDs are converted to
        integers before the request is sent; any other type is forwarded
        unchanged.

        Example::

          >>> gLogger.notice(dirac.resetJob(12345))
          {'OK': True, 'Value': [12345]}

        :param jobID: JobID
        :type jobID: integer, string or list of those
        :return: S_OK,S_ERROR
        """
        # Select the conversion and the matching error hint for the input type
        if isinstance(jobID, six.string_types):
            convert = int
            hint = 'Expected integer or convertible integer for existing jobID'
        elif isinstance(jobID, list):
            def convert(jobs):
                return [int(job) for job in jobs]
            hint = 'Expected integer or convertible integer for existing jobIDs'
        else:
            convert = None

        if convert is not None:
            try:
                jobID = convert(jobID)
            except Exception as x:
                return self._errorReport(str(x), hint)

        return JobManagerClient(useCertificates=False).resetJob(jobID)

    #############################################################################
    def getJobPilotOutput(self, jobID, directory=''):
        """Retrieve the pilot output of a job and write it to local files.

        The output is requested from ``WMSAdministratorClient`` and written
        to ``<directory>/pilot_<jobID>/std.out`` and ``std.err``.  When no
        directory is given ``self.currentDir`` is used.  The call fails if
        the directory does not exist or if the ``pilot_<jobID>`` directory
        is already there.

          >>> gLogger.notice(dirac.getJobPilotOutput(12345))
          {'OK': True, StdOut:'',StdError:''}

        :param jobID: JobID
        :type jobID: integer or string
        :param directory: existing directory in which the output directory is created
        :return: S_OK,S_ERROR
        """
        targetDir = directory if directory else self.currentDir
        if not os.path.exists(targetDir):
            return self._errorReport('Directory %s does not exist' % targetDir)

        result = WMSAdministratorClient().getJobPilotOutput(jobID)
        if not result['OK']:
            return result

        outputPath = '%s/pilot_%s' % (targetDir, jobID)
        # Never overwrite the output of a previous retrieval
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        if not os.path.exists(outputPath):
            self.log.verbose('Creating directory %s' % outputPath)
            os.mkdir(outputPath)

        outputs = result['Value']
        # (key in the result, target file, success message, missing message)
        streams = (
            ('StdOut', '%s/std.out' % (outputPath),
             'Standard output written to %s', 'No standard output returned'),
            ('StdError', '%s/std.err' % (outputPath),
             'Standard error written to %s', 'No standard error returned'),
        )
        for key, fileName, writtenMsg, missingMsg in streams:
            if key not in outputs:
                self.log.warn(missingMsg)
                continue
            with open(fileName, 'w') as fopen:
                fopen.write(outputs[key])
            self.log.verbose(writtenMsg % (fileName))

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotOutput(self, gridReference, directory=''):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference.

        The output is requested from ``PilotManagerClient`` and written to
        ``<directory>/pilot_<last part of the reference>/``.  When no
        directory is given ``self.currentDir`` is used.  The call fails if
        the directory does not exist or if the output directory is already
        there.

          >>> gLogger.notice(dirac.getPilotOutput(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :param directory: existing directory in which the output directory is created
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        targetDir = directory if directory else self.currentDir
        if not os.path.exists(targetDir):
            return self._errorReport('Directory %s does not exist' % targetDir)

        result = PilotManagerClient().getPilotOutput(gridReference)
        if not result['OK']:
            return result

        # The text after the last '/' names the output directory
        shortReference = gridReference.split('/')[-1] or 'reference'
        outputPath = '%s/pilot_%s' % (targetDir, shortReference)

        # Never overwrite the output of a previous retrieval
        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        if not os.path.exists(outputPath):
            self.log.verbose('Creating directory %s' % outputPath)
            os.mkdir(outputPath)

        outputs = result['Value']
        # (key in the result, target file, success message, missing message)
        streams = (
            ('StdOut', '%s/std.out' % (outputPath),
             'Standard output written to %s', 'No standard output returned'),
            ('StdErr', '%s/std.err' % (outputPath),
             'Standard error written to %s', 'No standard error returned'),
        )
        for key, fileName, writtenMsg, missingMsg in streams:
            if key not in outputs:
                self.log.warn(missingMsg)
                continue
            with open(fileName, 'w') as fopen:
                fopen.write(outputs[key])
            self.log.info(writtenMsg % (fileName))

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result

    #############################################################################
    def getPilotInfo(self, gridReference):
        """Retrieve the information stored for a pilot reference.

          >>> gLogger.notice(dirac.getPilotInfo(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            return PilotManagerClient().getPilotInfo(gridReference)
        return self._errorReport('Expected string for pilot reference')

    #############################################################################
    def killPilot(self, gridReference):
        """Ask the PilotManager client to kill the specified pilot.

          >>> gLogger.notice(dirac.killPilot(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            return PilotManagerClient().killPilot(gridReference)
        return self._errorReport('Expected string for pilot reference')

    #############################################################################
    def getPilotLoggingInfo(self, gridReference):
        """Retrieve the pilot logging info for a pilot reference.

          >>> gLogger.notice(dirac.getPilotLoggingInfo(gridReference))
          {'OK': True, 'Value': {"The output of the command"}}

        :param gridReference: Grid pilot job reference Id
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            loggingInfo = PilotManagerClient().getPilotLoggingInfo(gridReference)
            return loggingInfo
        return self._errorReport('Expected string for pilot reference')

    #############################################################################
    def getJobPilots(self, jobID):
        """Get the pilots recorded for a job and log them.

        A string jobID is converted to an integer first.  On success the
        returned value is pretty-printed through ``gLogger.notice``.

          >>> gLogger.notice(dirac.getJobPilots(12345))
          {'OK': True, 'Value': {PilotID:{StatusDict}}}

        :param jobID: JobID
        :type jobID: integer or string
        :return: S_OK,S_ERROR
        """
        jobIsString = isinstance(jobID, six.string_types)
        if jobIsString:
            try:
                jobID = int(jobID)
            except Exception as x:
                return self._errorReport(
                    str(x), 'Expected integer or string for existing jobID')

        pilots = PilotManagerClient().getPilots(jobID)
        if not pilots['OK']:
            return pilots
        gLogger.notice(self.pPrint.pformat(pilots['Value']))
        return pilots

    #############################################################################
    def getPilotSummary(self, startDate='', endDate=''):
        """Retrieve the pilot summary per CE and print it as a table.

        The summary is requested from ``PilotManagerClient`` and logged with
        ``gLogger.notice``: one header line followed by one line per CE,
        listing the states in sorted order with their counts.  The full
        result dictionary is returned as well.

          >>> gLogger.notice(dirac.getPilotSummary())
          {'OK': True, 'Value': {CE:{Status:Count}}}

        :param startDate: forwarded to ``PilotManagerClient.getPilotSummary``
        :param endDate: forwarded to ``PilotManagerClient.getPilotSummary``
        :return: S_OK,S_ERROR
        """
        result = PilotManagerClient().getPilotSummary(startDate, endDate)
        if not result['OK']:
            return result

        ceDict = result['Value']

        # The header gets one Status/Count column pair per state of the CE
        # that reports the largest number of states.
        maxStates = max([0] + [len(summary) for summary in ceDict.values()])
        columnPair = 'Status'.ljust(12) + 'Count'.ljust(12)
        gLogger.notice('CE'.ljust(28) + columnPair * maxStates)

        # items() rather than iteritems() so this runs on Python 2 and 3
        for ce, summary in ceDict.items():
            line = ce.ljust(28)
            for state in sorted(summary):
                line += state.ljust(12) + str(summary[state]).ljust(12)
            gLogger.notice(line)

        return result

    #############################################################################
    def setSiteProtocols(self, site, protocolsList, printOutput=False):
        """Set the protocols list of the SRM2 sections of every SE of a site.

        All requested protocols must be part of
        ``/Resources/StorageElements/DefaultProtocols``.  The user is
        prompted for confirmation before any option is set, and the CS
        changes are committed only if at least one option was set.

        :param site: site name; the text before the first '.' selects the
                     ``/Resources/Sites`` subsection
        :param protocolsList: protocols to set
        :param printOutput: when True, the outcome is reported with ``gLogger.notice``
        :return: S_OK,S_ERROR
        """
        siteCheck = self.__checkSiteIsValid(site)
        if not siteCheck['OK']:
            return siteCheck

        siteSection = '/Resources/Sites/%s/%s/SE' % (site.split('.')[0], site)
        siteSEs = gConfig.getValue(siteSection, [])
        if not siteSEs:
            return S_ERROR('No SEs found for site %s in section %s' %
                           (site, siteSection))

        defaultProtocols = gConfig.getValue(
            '/Resources/StorageElements/DefaultProtocols', [])
        self.log.verbose('Default list of protocols are',
                         ', '.join(defaultProtocols))

        # Refuse the whole request as soon as one protocol is not a default one
        for protocol in protocolsList:
            if protocol not in defaultProtocols:
                return S_ERROR(
                    'Requested to set protocol %s in list but %s is not '
                    'in default list of protocols:\n%s' %
                    (protocol, protocol, ', '.join(defaultProtocols)))

        protocolsValue = ', '.join(protocolsList)
        answer = promptUser(
            'Do you want to add the following default protocols:'
            ' %s for SE(s):\n%s' % (protocolsValue, ', '.join(siteSEs)))
        if not answer['OK']:
            return answer
        if answer['Value'].lower() != 'y':
            self.log.always('No protocols will be added')
            return S_OK()

        modifiedCS = False
        for se in siteSEs:
            sections = gConfig.getSections('/Resources/StorageElements/%s/' %
                                           (se))
            if not sections['OK']:
                return sections
            for section in sections['Value']:
                protocolName = gConfig.getValue(
                    '/Resources/StorageElements/%s/%s/ProtocolName' %
                    (se, section), '')
                # Only the SRM2 sections carry a protocols list
                if protocolName != 'SRM2':
                    continue
                path = '/Resources/StorageElements/%s/%s/ProtocolsList' % (
                    se, section)
                self.log.verbose('Setting %s to %s' % (path, protocolsValue))
                setResult = self.csSetOption(path, protocolsValue)
                if not setResult['OK']:
                    return setResult
                modifiedCS = True

        if not modifiedCS:
            if printOutput:
                gLogger.notice('No modifications to CS required')
            return S_OK()

        commitResult = self.csCommitChanges(False)
        if not commitResult['OK']:
            return S_ERROR('CS Commit failed with message = %s' %
                           (commitResult['Message']))
        if printOutput:
            gLogger.notice('Successfully committed changes to CS')
        return S_OK()

    #############################################################################
    def csSetOption(self, optionPath, optionValue):
        """
        Set the value of an option in the CS.

        Delegates to ``self.csAPI.setOption`` and returns its result.
        """
        outcome = self.csAPI.setOption(optionPath, optionValue)
        return outcome

    #############################################################################
    def csSetOptionComment(self, optionPath, comment):
        """
        Set the comment of an option in the CS.

        Delegates to ``self.csAPI.setOptionComment`` and returns its result.
        """
        outcome = self.csAPI.setOptionComment(optionPath, comment)
        return outcome

    #############################################################################
    def csModifyValue(self, optionPath, newValue):
        """
        Modify an existing value in the CS.

        Delegates to ``self.csAPI.modifyValue`` and returns its result.
        """
        outcome = self.csAPI.modifyValue(optionPath, newValue)
        return outcome

    #############################################################################
    def csRegisterUser(self, username, properties):
        """
        Register a user in the CS through ``self.csAPI.addUser``.

            - username: Username of the user
            - properties: Dict containing:
                - DN
                - groups : list/tuple of groups the user belongs to
                - <others> : More properties of the user, like mail

        """
        outcome = self.csAPI.addUser(username, properties)
        return outcome

    #############################################################################
    def csDeleteUser(self, user):
        """
        Delete a user from the CS. Can take a list of users.

        Delegates to ``self.csAPI.deleteUsers`` and returns its result.
        """
        outcome = self.csAPI.deleteUsers(user)
        return outcome

    #############################################################################
    def csModifyUser(self, username, properties, createIfNonExistant=False):
        """
        Modify a user in the CS. Takes the same params as csRegisterUser plus
        a flag that is forwarded as third argument to ``self.csAPI.modifyUser``.
        """
        outcome = self.csAPI.modifyUser(username, properties,
                                        createIfNonExistant)
        return outcome

    #############################################################################
    def csListUsers(self, group=False):
        """
        List the users in the CS. If no group is specified return all users.

        Delegates to ``self.csAPI.listUsers`` and returns its result.
        """
        users = self.csAPI.listUsers(group)
        return users

    #############################################################################
    def csDescribeUsers(self, mask=False):
        """
        List users and their properties in the CS.
        If a mask is given, only users in the mask will be returned.

        Delegates to ``self.csAPI.describeUsers`` and returns its result.
        """
        description = self.csAPI.describeUsers(mask)
        return description

    #############################################################################
    def csModifyGroup(self, groupname, properties, createIfNonExistant=False):
        """
        Modify a group in the CS through ``self.csAPI.modifyGroup``; the
        three arguments are forwarded unchanged.
        """
        outcome = self.csAPI.modifyGroup(groupname, properties,
                                         createIfNonExistant)
        return outcome

    #############################################################################
    def csListHosts(self):
        """
        List the hosts in the CS.

        Delegates to ``self.csAPI.listHosts`` and returns its result.
        """
        hosts = self.csAPI.listHosts()
        return hosts

    #############################################################################
    def csDescribeHosts(self, mask=False):
        """
        Get extended info for the hosts in the CS.

        The mask is forwarded to ``self.csAPI.describeHosts``.
        """
        description = self.csAPI.describeHosts(mask)
        return description

    #############################################################################
    def csModifyHost(self, hostname, properties, createIfNonExistant=False):
        """
        Modify a host in the CS through ``self.csAPI.modifyHost``; the three
        arguments are forwarded unchanged.
        """
        outcome = self.csAPI.modifyHost(hostname, properties,
                                        createIfNonExistant)
        return outcome

    #############################################################################
    def csListGroups(self):
        """
        List the groups in the CS.

        Delegates to ``self.csAPI.listGroups`` and returns its result.
        """
        groups = self.csAPI.listGroups()
        return groups

    #############################################################################
    def csDescribeGroups(self, mask=False):
        """
        List groups and their properties in the CS.
        If a mask is given, only groups in the mask will be returned.

        Delegates to ``self.csAPI.describeGroups`` and returns its result.
        """
        description = self.csAPI.describeGroups(mask)
        return description

    #############################################################################
    def csSyncUsersWithCFG(self, usersCFG):
        """
        Synchronize the users in the CS with the contents of usersCFG.

        Delegates to ``self.csAPI.syncUsersWithCFG`` and returns its result.
        """
        outcome = self.csAPI.syncUsersWithCFG(usersCFG)
        return outcome

    #############################################################################
    def csCommitChanges(self, sortUsers=True):
        """
        Commit the changes in the CS.

        :param sortUsers: forwarded to ``self.csAPI.commitChanges`` as the
                          ``sortUsers`` keyword (it used to be ignored and
                          ``False`` was always sent)
        :return: the result of ``self.csAPI.commitChanges``
        """
        return self.csAPI.commitChanges(sortUsers=sortUsers)

    #############################################################################
    def sendMail(self,
                 address,
                 subject,
                 body,
                 fromAddress=None,
                 localAttempt=True,
                 html=False):
        """
        Send a mail with the given subject and body to the specified address.

        All arguments are forwarded, in order, to
        ``NotificationClient().sendMail``.
        """
        mailResult = NotificationClient().sendMail(
            address, subject, body, fromAddress, localAttempt, html)
        return mailResult

    #############################################################################
    def sendSMS(self, userName, body, fromAddress=None):
        """
        Send an SMS with the given body to the specified user.

        Bodies longer than 160 characters are rejected with S_ERROR;
        otherwise the request goes to ``NotificationClient().sendSMS``.
        """
        if len(body) <= 160:
            return NotificationClient().sendSMS(userName, body, fromAddress)
        return S_ERROR('Exceeded maximum SMS length of 160 characters')

    #############################################################################
    def getBDIISite(self, site, host=None):
        """
        Get information about site from BDII at host (wraps ``ldapSite``).
        """
        siteInfo = ldapSite(site, host=host)
        return siteInfo

    #############################################################################
    def getBDIICluster(self, ce, host=None):
        """
        Get information about the cluster of ce from BDII at host
        (wraps ``ldapCluster``).
        """
        clusterInfo = ldapCluster(ce, host=host)
        return clusterInfo

    #############################################################################
    def getBDIICE(self, ce, host=None):
        """
        Get information about ce from BDII at host (wraps ``ldapCE``).
        """
        ceInfo = ldapCE(ce, host=host)
        return ceInfo

    #############################################################################
    def getBDIIService(self, ce, host=None):
        """
        Get information about the service of ce from BDII at host
        (wraps ``ldapService``).
        """
        serviceInfo = ldapService(ce, host=host)
        return serviceInfo

    #############################################################################
    def getBDIICEState(self, ce, useVO=voName, host=None):
        """
        Get information about ce state from BDII at host
        (wraps ``ldapCEState``; useVO defaults to the module level voName).
        """
        stateInfo = ldapCEState(ce, useVO, host=host)
        return stateInfo

    #############################################################################
    def getBDIICEVOView(self, ce, useVO=voName, host=None):
        """
        Get information about ce voview from BDII at host
        (wraps ``ldapCEVOView``; useVO defaults to the module level voName).
        """
        voViewInfo = ldapCEVOView(ce, useVO, host=host)
        return voViewInfo
Example #41
0
  def getPilotSummaryWeb( self, selectDict, sortList, startItem, maxItems ):
    """ Get summary of the pilot jobs status by CE/site in a standard structure

        Four counter queries are made on the PilotAgents table: one with the
        caller's selection, one for the pilots Aborted in the last hour, one
        for the pilots Done/Aborted in the last day and one for the pilots
        Done in the last day with CurrentJobID 0.  The counters are folded
        into one record per CE, or one 'Multiple' record per site with
        several CEs unless a site is expanded.

        :param dict selectDict: selection conditions for getCounters. The keys
               'LastUpdateTime', 'GridSite', 'Status' and 'ExpandSite' are
               interpreted here and not forwarded as they are. The dictionary
               of the caller is not modified.
        :param sortList: not used by this method
        :param int startItem: index of the first returned record, used when maxItems is set
        :param int maxItems: maximum number of returned records, 0 returns all of them
        :return: S_OK( dict ) with the keys 'TotalRecords', 'ParameterNames',
                 'Records' and 'Extras', or the failed getCounters result
    """

    stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done', 'Aborted']
    allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
    paramNames = ['Site', 'CE'] + allStateNames
    # A record is [Site, CE] + state counts + [Total, PilotsPerJob, PilotJobEff, Status]
    statusIndex = len( paramNames ) + 3

    def _asList( value ):
      """ Wrap a single selection value into a list """
      return value if isinstance( value, list ) else [value]

    def _evaluate( done, empty, aborted, total, minTotal ):
      """ Return ( PilotsPerJob, PilotJobEff, Status ) for a set of counters """
      # Pilot submission efficiency evaluation
      if ( done - empty ) > 0:
        pilotsPerJob = float( done ) / float( done - empty )
      elif done != 0 and empty == done:
        pilotsPerJob = 99.
      else:
        pilotsPerJob = 0.
      # Pilot job efficiency evaluation
      if total > 0:
        jobEff = float( total - aborted ) / float( total ) * 100.
      else:
        jobEff = 100.
      # Quality is only evaluated above a minimal number of pilots
      if total <= minTotal:
        quality = 'Idle'
      elif jobEff < 25.:
        quality = 'Bad'
      elif jobEff < 60.:
        quality = 'Poor'
      elif jobEff < 85.:
        quality = 'Fair'
      else:
        quality = 'Good'
      return '%.2f' % pilotsPerJob, '%.2f' % jobEff, quality

    # Work on a copy: the selection is edited between the queries below and
    # the dictionary of the caller must stay untouched
    selectDict = dict( selectDict )
    last_update = selectDict.pop( 'LastUpdateTime', None )
    site_select = _asList( selectDict.pop( 'GridSite', [] ) )
    status_select = _asList( selectDict.pop( 'Status', [] ) )
    expand_site = ''
    if 'ExpandSite' in selectDict:
      expand_site = selectDict.pop( 'ExpandSite' )
      site_select = [expand_site]

    def _getCounters( newer ):
      """ Count pilots per GridSite/DestinationSite/Status for the current selection """
      return self.getCounters( 'PilotAgents',
                               ['GridSite', 'DestinationSite', 'Status'],
                               selectDict, newer = newer, timeStamp = 'LastUpdateTime' )

    # Get all the data from the database with various selections
    result = _getCounters( last_update )
    if not result['OK']:
      return result

    selectDict['Status'] = 'Aborted'
    resultHour = _getCounters( Time.dateTime() - Time.hour )
    if not resultHour['OK']:
      return resultHour

    dayAgo = Time.dateTime() - Time.day
    selectDict['Status'] = ['Aborted', 'Done']
    resultDay = _getCounters( dayAgo )
    if not resultDay['OK']:
      return resultDay
    selectDict['CurrentJobID'] = 0
    selectDict['Status'] = 'Done'
    resultDayEmpty = _getCounters( dayAgo )
    if not resultDayEmpty['OK']:
      return resultDayEmpty

    ceMap = {}
    resMap = getCESiteMapping()
    if resMap['OK']:
      ceMap = resMap['Value']

    # Sort out different counters
    resultDict = { 'Unknown': {} }

    def _counts( attDict ):
      """ Return the counters of the site/CE of a row, creating them if needed.

          The same site resolution is used for the four queries, and a missing
          entry is created instead of raising KeyError: the time limited
          queries can return a site/CE that the main query did not return.
      """
      site = attDict['GridSite']
      ce = attDict['DestinationSite']
      if site == 'Unknown' and ce not in ( 'Unknown', 'Multiple' ) and ce in ceMap:
        site = ceMap[ce]
      ceCounts = resultDict.setdefault( site, {} )
      if ce not in ceCounts:
        ceCounts[ce] = dict.fromkeys( allStateNames, 0 )
      return ceCounts[ce]

    for attDict, count in result['Value']:
      _counts( attDict )[attDict['Status']] = count

    # Done and Aborted are replaced by the counts of the last day
    for attDict, count in resultDay['Value']:
      if attDict['Status'] in ( 'Done', 'Aborted' ):
        _counts( attDict )[attDict['Status']] = count

    for attDict, count in resultDayEmpty['Value']:
      if attDict['Status'] == 'Done':
        _counts( attDict )['Done_Empty'] = count

    for attDict, count in resultHour['Value']:
      if attDict['Status'] == 'Aborted':
        _counts( attDict )['Aborted_Hour'] = count

    records = []
    summedNames = allStateNames + ['Total']
    siteSumDict = dict.fromkeys( summedNames, 0 )
    for site, ceDict in resultDict.items():
      sumDict = dict.fromkeys( summedNames, 0 )
      showCEs = len( ceDict ) == 1 or expand_site
      for ce, counts in ceDict.items():
        # Done_Empty and Aborted_Hour are subsets of Done and Aborted
        total = sum( counts[state] for state in stateNames )
        for state in allStateNames:
          sumDict[state] += counts[state]
        sumDict['Total'] += total
        if showCEs:
          itemList = [site, ce] + [counts[state] for state in allStateNames]
          # Add the total number of pilots seen in the last day
          itemList.append( total )
          itemList.extend( _evaluate( counts['Done'], counts['Done_Empty'],
                                      counts['Aborted'], total, 10 ) )
          records.append( itemList )

      # A site with several CEs is shown as one summed line unless expanded
      if len( ceDict ) > 1 and not expand_site:
        itemList = [site, 'Multiple'] + [sumDict[state] for state in summedNames]
        itemList.extend( _evaluate( sumDict['Done'], sumDict['Done_Empty'],
                                    sumDict['Aborted'], sumDict['Total'], 10 ) )
        records.append( itemList )

      for state in summedNames:
        siteSumDict[state] += sumDict[state]

    # Perform site selection
    if site_select:
      records = [r for r in records if r[0] in site_select]

    # Perform status selection
    if status_select:
      records = [r for r in records if r[statusIndex] in status_select]

    # Get the Site Mask data
    siteStatus = SiteStatus()
    for r in records:
      #
      #FIXME: using only ComputingAccess
      #
      if siteStatus.isUsableSite( r[0], 'ComputingAccess' ):
        r.append( 'Yes' )
      else:
        r.append( 'No' )

    finalDict = {}
    finalDict['TotalRecords'] = len( records )
    finalDict['ParameterNames'] = paramNames + \
                                 ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']

    # Return all the records if maxItems == 0 or the specified number otherwise
    if maxItems:
      finalDict['Records'] = records[startItem:startItem + maxItems]
    else:
      finalDict['Records'] = records

    # Overall evaluation, made on the sums over all the sites (before selection)
    pilotsPerJob, pilotJobEff, status = _evaluate( siteSumDict['Done'], siteSumDict['Done_Empty'],
                                                   siteSumDict['Aborted'], siteSumDict['Total'], 100 )
    siteSumDict['PilotsPerJob'] = pilotsPerJob
    siteSumDict['PilotJobEff'] = pilotJobEff
    siteSumDict['Status'] = status
    finalDict['Extras'] = siteSumDict

    return S_OK( finalDict )
Example #42
0
    def _resolveCECandidates(self, taskQueueDict):
        """
      Return a list of CEs for this TaskQueue
    """
        # assume user knows what they're doing and avoid site mask e.g. sam jobs
        if 'GridCEs' in taskQueueDict and taskQueueDict['GridCEs']:
            self.log.info(
                'CEs requested by TaskQueue %s:' %
                taskQueueDict['TaskQueueID'],
                ', '.join(taskQueueDict['GridCEs']))
            return taskQueueDict['GridCEs']

        # Get the mask
        siteStatus = SiteStatus()
        ret = siteStatus.getUsableSites('ComputingAccess')
        if not ret['OK']:
            self.log.error('Can not retrieve site Mask from DB:',
                           ret['Message'])
            return []

        usableSites = ret['Value']
        if not usableSites:
            self.log.error('Site mask is empty')
            return []

        self.log.verbose('Site Mask: %s' % ', '.join(usableSites))

        # remove banned sites from siteMask
        if 'BannedSites' in taskQueueDict:
            for site in taskQueueDict['BannedSites']:
                if site in usableSites:
                    usableSites.remove(site)
                    self.log.verbose('Removing banned site %s from site Mask' %
                                     site)

        # remove from the mask if a Site is given
        siteMask = [
            site for site in usableSites
            if 'Sites' not in taskQueueDict or site in taskQueueDict['Sites']
        ]

        if not siteMask:
            # pilot can not be submitted
            self.log.info('No Valid Site Candidate in Mask for TaskQueue %s' %
                          taskQueueDict['TaskQueueID'])
            return []

        self.log.info(
            'Site Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(siteMask))

        # Get CE's associates to the given site Names
        ceMask = []

        resources = Resources(vo=self.virtualOrganization)
        result = resources.getEligibleResources(
            'Computing', {
                'Site': siteMask,
                'SubmissionMode': 'gLite',
                'CEType': ['LCG', 'CREAM']
            })
        if not result['OK']:
            self.log.error("Failed to get eligible ce's:", result['Message'])
            return []
        ces = result['Value']

        for ce in ces:
            ceHost = resources.getComputingElementValue(ce, 'Host', 'unknown')
            if ceHost != 'unknown':
                ceMask.append(ceHost)

        if not ceMask:
            self.log.info(
                'No CE Candidate found for TaskQueue %s:' %
                taskQueueDict['TaskQueueID'], ', '.join(siteMask))

        self.log.verbose(
            'CE Candidates for TaskQueue %s:' % taskQueueDict['TaskQueueID'],
            ', '.join(ceMask))

        return ceMask
Example #43
0
 def initializeOptimizer(cls):
     """ Initialization of the optimizer.

         Creates the SiteStatus client and the JobDB connection and stores
         them on cls.

         :return: S_OK()
     """
     siteStatusClient = SiteStatus()
     cls.siteClient = siteStatusClient
     cls.__jobDB = JobDB()
     return S_OK()
Example #44
0
class Matcher(object):
  """ Logic for matching
  """

  def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Build a Matcher, creating a default collaborator for every argument that is not supplied

        :param pilotAgentsDB: pilot DB object to use; PilotAgentsDB() is created when falsy
        :param jobDB: job DB object to use; JobDB() is created when falsy
        :param tqDB: task queue DB object to use; TaskQueueDB() is created when falsy
        :param jlDB: job logging DB object to use; JobLoggingDB() is created when falsy
        :param opsHelper: operations helper to use; Operations() is created when falsy
    """
    # Injected objects win; defaults are only instantiated when nothing usable was passed
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter shares the job DB and operations helper selected above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()

  def selectJob(self, resourceDescription, credDict):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: description of the resource asking for a job
        :param credDict: credentials dictionary of the requester
        :return: empty dictionary when no job matches, otherwise a dictionary holding 'JDL', 'JobID',
                 the job optimizer parameters (when they can be retrieved), 'DN', 'Group' and
                 'PilotInfoReportedFlag'
        :raises RuntimeError: when a DB query fails or the matched job is not in 'Waiting' state
    """
    startTime = time.time()

    resourceDict = self._getResourceDict(resourceDescription, credDict)

    # Build a compact view of the matching parameters for the log: the raw MaxRAM and
    # NumberOfProcessors values are shown instead of the tags generated out of them
    summary = dict(resourceDict)
    for name in ("MaxRAM", "NumberOfProcessors"):
      if name in resourceDescription:
        summary[name] = resourceDescription[name]
    plainTags = [tag for tag in resourceDict.get('Tag', [])
                 if not tag.endswith('GB') and not tag.endswith('Processors')]
    if plainTags:
      summary['Tag'] = plainTags
    else:
      summary.pop('Tag', None)
    gLogger.info('Resource description for matching', printDict(summary))

    negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
    matchResult = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)
    if not matchResult['OK']:
      raise RuntimeError(matchResult['Message'])
    matchInfo = matchResult['Value']
    if not matchInfo['matchFound']:
      self.log.info("No match found")
      return {}

    jobID = matchInfo['jobId']
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError("No attributes returned for job")
    if resAtt['Value']['Status'] != 'Waiting':
      # The task queue handed out a job that is not waiting: drop it from the TQ and give up
      self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
      deletion = self.tqDB.deleteJob(jobID)
      if not deletion['OK']:
        raise RuntimeError(deletion['Message'])
      raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

    self._reportStatus(resourceDict, jobID)

    jdlResult = self.jobDB.getJobJDL(jobID)
    if not jdlResult['OK']:
      raise RuntimeError("Failed to get the job JDL")

    resultDict = {}
    resultDict['JDL'] = jdlResult['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info("Match time: [%s]" % str(matchTime))
    gMonitor.addMark("matchTime", matchTime)

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters(jobID)
    if resOpt['OK']:
      resultDict.update(resOpt['Value'].items())
    ownerAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
    if not ownerAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not ownerAtt['Value']:
      raise RuntimeError('No attributes returned for job')

    if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

    # Pilot info is only reported when the pilot did not flag it as already reported
    if not resourceDict.get('PilotInfoReportedFlag', False):
      self._updatePilotInfo(resourceDict)
    self._updatePilotJobMapping(resourceDict, jobID)

    resultDict['DN'] = ownerAtt['Value']['OwnerDN']
    resultDict['Group'] = ownerAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict(self, resourceDescription, credDict):
    """ Turn a resourceDescription into the resourceDict used for the matching

        The description is processed, the credentials and the pilot version are checked,
        and the job type is forced to 'Test' when the mask check fails.

        :param resourceDescription: description of the resource asking for a job
        :param credDict: credentials dictionary of the requester
        :return: the resulting resource dictionary
    """
    processed = self._processResourceDescription(resourceDescription)
    resourceDict = self._checkCredentials(processed, credDict)
    self._checkPilotVersion(resourceDict)

    maskAllowsSite = self._checkMask(resourceDict)
    if not maskAllowsSite:
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose("Resource description:")
    for key, value in resourceDict.items():
      self.log.verbose("%s : %s" % (key.rjust(20), value))

    return resourceDict

  def _processResourceDescription(self, resourceDescription):
    """ Check and form the resource description dictionary

        Copies the fields listed in singleValueDefFields and multiValueMatchFields, the
        'Tag'/'RequiredTag' and 'JobID' entries and a fixed set of pilot/release keys.
        MaxRAM (divided by 1000) and NumberOfProcessors are expanded into the tags
        '2GB'..'<N>GB' and '2Processors'..'<N>Processors' when the value is not above 128.
        'MultiProcessor' is added for more than one processor and 'WholeNode' when that
        key is present in the description.

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
    """

    resourceDict = {}
    for name in singleValueDefFields:
      if name in resourceDescription:
        resourceDict[name] = resourceDescription[name]

    for name in multiValueMatchFields:
      if name in resourceDescription:
        resourceDict[name] = resourceDescription[name]

    if resourceDescription.get('Tag'):
      resourceDict['Tag'] = resourceDescription['Tag']
      # RequiredTag is only taken into account when tags are given
      if 'RequiredTag' in resourceDescription:
        resourceDict['RequiredTag'] = resourceDescription['RequiredTag']

    if 'JobID' in resourceDescription:
      resourceDict['JobID'] = resourceDescription['JobID']

    # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
    maxRAM = resourceDescription.get('MaxRAM')
    if maxRAM:
      try:
        # Floor division keeps an integer on Python 3 as well: range() below rejects floats
        maxRAM = int(maxRAM) // 1000
      except ValueError:
        maxRAM = None
    nProcessors = resourceDescription.get('NumberOfProcessors')
    if nProcessors:
      try:
        nProcessors = int(nProcessors)
      except ValueError:
        nProcessors = None
    for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
      if param and param <= 128:
        paramList = range(2, param + 1)
        paramTags = ['%d%s' % (par, key) for par in paramList]
        if paramTags:
          resourceDict.setdefault("Tag", []).extend(paramTags)

    # Add 'MultiProcessor' to the list of tags.
    # nProcessors may be None or an empty value here, which must not be compared with an int
    if nProcessors and nProcessors > 1:
      resourceDict.setdefault("Tag", []).append("MultiProcessor")

    # Add 'WholeNode' to the list of tags
    if "WholeNode" in resourceDescription:
      resourceDict.setdefault("Tag", []).append("WholeNode")

    # Remove duplicated tags
    if 'Tag' in resourceDict:
      resourceDict['Tag'] = list(set(resourceDict['Tag']))

    for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
              'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'):
      if k in resourceDescription:
        resourceDict[k] = resourceDescription[k]

    return resourceDict

  def _reportStatus(self, resourceDict, jobID):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Set job attributes for jobID %s" % jobID)

    result = self.jlDB.addLoggingRecord(jobID,
                                        status='Matched',
                                        minor='Assigned',
                                        source='Matcher')
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Added logging record for jobID %s" % jobID)

  def _checkMask(self, resourceDict):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if 'Site' not in resourceDict:
      self.log.error("Missing Site Name in Resource JDL")
      raise RuntimeError("Missing Site Name in Resource JDL")

    # Check if site is allowed
    result = self.siteClient.getUsableSites(resourceDict['Site'])
    if not result['OK']:
      self.log.error("Internal error",
                     "siteClient.getUsableSites: %s" % result['Message'])
      raise RuntimeError("Internal error")

    if resourceDict['Site'] not in result['Value']:
      return False

    return True

  def _updatePilotInfo(self, resourceDict):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      gridCE = resourceDict.get('GridCE', 'Unknown')
      site = resourceDict.get('Site', 'Unknown')
      benchmark = resourceDict.get('PilotBenchmark', 0.0)
      self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference,
                                                                                          gridCE,
                                                                                          site,
                                                                                          benchmark))

      result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=site,
                                                 destination=gridCE, benchmark=benchmark)
      if not result['OK']:
        self.log.warn("Problem updating pilot information",
                      "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _updatePilotJobMapping(self, resourceDict, jobID):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
      result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _checkCredentials(self, resourceDict, credDict):
    """ Restrict the resource dictionary to the jobs the passed credentials may get

        :param resourceDict: resource dictionary, updated in place
        :param credDict: credentials dictionary ('properties', 'group' and 'DN' are read)
        :return: the updated resourceDict
        :raises RuntimeError: when a Registry query fails
    """
    properties = credDict['properties']

    if Properties.GENERIC_PILOT in properties:
      # You can only match groups in the same VO
      if credDict['group'] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get('VirtualOrganization', '')
      else:
        vo = Registry.getVOForGroup(credDict['group'])
      if 'OwnerGroup' not in resourceDict:
        groupsResult = Registry.getGroupsForVO(vo)
        if not groupsResult['OK']:
          raise RuntimeError(groupsResult['Message'])
        resourceDict['OwnerGroup'] = groupsResult['Value']

    elif Properties.PILOT in properties:
      # If it's a private pilot, the DN has to be the same
      self.log.notice("Setting the resource DN to the credentials DN")
      resourceDict['OwnerDN'] = credDict['DN']

    elif Properties.JOB_SHARING in properties:
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      resourceDict['OwnerGroup'] = credDict['group']
      self.log.notice("Setting the resource group to the credentials group")
      if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
        ownerDN = resourceDict['OwnerDN']
        groupsResult = Registry.getGroupsForDN(resourceDict['OwnerDN'])
        if not groupsResult['OK']:
          raise RuntimeError(groupsResult['Message'])
        if credDict['group'] not in groupsResult['Value']:
          # The requested DN is not in the requester's group: fall back to the requester's DN
          self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN)
          resourceDict['OwnerDN'] = credDict['DN']

    else:
      # Nothing special, group and DN have to be the same
      resourceDict['OwnerDN'] = credDict['DN']
      resourceDict['OwnerGroup'] = credDict['group']

    return resourceDict

  def _checkPilotVersion(self, resourceDict):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue("Pilot/CheckVersion", True):
      if 'ReleaseVersion' not in resourceDict:
        if 'DIRACVersion' not in resourceDict:
          raise RuntimeError('Version check requested and not provided by Pilot')
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue("Pilot/Version", [])
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                           (pilotVersion, ",".join(validVersions)))
      # Check project if requested
      validProject = self.opsHelper.getValue("Pilot/Project", "")
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError("Version check requested but expected project %s not received" % validProject)
        if resourceDict['ReleaseProject'] != validProject:
          raise RuntimeError("Version check requested \
          but expected project %s != received %s" % (validProject,
                                                     resourceDict['ReleaseProject']))
Example #45
0
def main():
    """Print, for every known queue, whether the given job description matches it.

    The first positional argument is the path of a file holding the job description (JDL).
    The "full-match" and "site" switches are registered with setFullMatch and setSites;
    the module-level fullMatch and sites values are then used for the matching and the
    queue selection. The result is a table with the columns
    Site, CE, Queue, Status, Match and Reason. The script exits with -1 when no proxy is
    found or when the queue, site mask or matching information cannot be obtained.
    """
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria",
                          setFullMatch)
    Script.registerSwitch(
        "S:", "site=", "Check matching for these sites (comma separated list)",
        setSites)

    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    if len(args) == 0:
        gLogger.error("Error: No job description provided")
        Script.showHelp(exitCode=1)

    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']
    result = getQueuesResolved(siteDict)
    # Check the outcome of getQueuesResolved itself, not the earlier getQueues result again
    if not result['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queue, queueInfo in queueDict.items():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"
        # Without RSS information the CE is assumed to follow the status of its site
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result['OK']:
                ceStatus = result['Value'][ce]['all']

        result = matchQueue(jdl, queueInfo, fullMatch=fullMatch)
        if not result['OK']:
            gLogger.error('Failed in getting match data', result['Message'])
            DIRACExit(-1)
        # A queue is reported as active only when both its site and its CE are active
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result['Value']['Match']:
            records.append((site, ce, queueInfo['Queue'], status, 'Yes', ''))
        else:
            records.append((site, ce, queueInfo['Queue'], status, 'No',
                            result['Value']['Reason']))

    gLogger.notice(
        printTable(fields,
                   records,
                   sortField='Site',
                   columnSeparator='  ',
                   printOut=False))
Example #46
0
class CloudDirector(AgentModule):
    """The CloudDirector works like a SiteDirector for cloud sites:
    It looks at the queued jobs in the task queues and attempts to
    start VM instances to meet the current demand.
    """

    def __init__(self, *args, **kwargs):
        """Set every attribute of the agent to an empty default.

        All arguments are passed unchanged to the parent class constructor.
        """
        super(CloudDirector, self).__init__(*args, **kwargs)

        # Community and credentials served by this director
        self.vo = ""
        self.group = ""
        # self.voGroups contain all the eligible user groups for clouds submitted by this SiteDirector
        self.voGroups = []
        self.cloudDN = ""
        self.cloudGroup = ""
        self.proxy = None

        # VM type descriptions, rebuilt by getEndpoints() and keyed by CE name + "_" + VM type
        self.vmTypeDict = {}
        # Per VM type name: the endpoint object and the hash of the description it was built from
        self.vmTypeCECache = {}
        self.vmTypeSlots = {}
        self.failedVMTypes = defaultdict(int)

        # Platforms and sites collected by getEndpoints()
        self.platforms = []
        self.sites = []
        # Set by initialize()
        self.siteClient = None

        # Behaviour flags
        self.firstPass = True
        self.updateStatus = True
        self.getOutput = False
        self.sendAccounting = True

    def initialize(self):
        """Create the SiteStatus client stored in ``self.siteClient``.

        :return: S_OK()
        """
        self.siteClient = SiteStatus()
        return S_OK()

    def beginExecution(self):
        """Read the agent options and rebuild the VM type descriptions.

        Resolves the VO, the eligible user groups and the generic cloud credentials,
        reads the "MaxVMsToSubmit", "RunningPod", "Site" and "CEs" options, then fetches
        the VM types and hands them to getEndpoints().

        :return: S_OK(), or the first failed result of the Registry, credentials,
                 VM type or endpoint queries
        """
        # The Director is for a particular user community
        self.vo = self.am_getOption("VO", "") or CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", "")

        # Choose the group for which clouds will be submitted. This is a hack until
        # we will be able to match clouds to VOs.
        if self.group:
            self.voGroups = [self.group]
        elif self.vo:
            groupsResult = Registry.getGroupsForVO(self.vo)
            if not groupsResult["OK"]:
                return groupsResult
            # Only the groups holding the NormalUser property are eligible
            self.voGroups = [
                group for group in groupsResult["Value"] if "NormalUser" in Registry.getPropertiesForGroup(group)
            ]

        credentials = findGenericCloudCredentials(vo=self.vo)
        if not credentials["OK"]:
            return credentials
        self.cloudDN, self.cloudGroup = credentials["Value"]
        self.maxVMsToSubmit = self.am_getOption("MaxVMsToSubmit", 1)
        self.runningPod = self.am_getOption("RunningPod", self.vo)

        # Get the site description dictionary.
        # "Any" (in any letter case) or an empty value means no restriction, i.e. None
        siteNames = None
        if self.am_getOption("Site", "Any").lower() != "any":
            siteNames = self.am_getOption("Site", []) or None
        ces = None
        if self.am_getOption("CEs", "Any").lower() != "any":
            ces = self.am_getOption("CEs", []) or None

        vmTypes = getVMTypes(vo=self.vo, siteList=siteNames)
        if not vmTypes["OK"]:
            return vmTypes
        endpoints = self.getEndpoints(vmTypes["Value"])
        if not endpoints["OK"]:
            return endpoints

        # if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        # self.siteNames = siteNames

        self.log.always("Sites:", siteNames)
        self.log.always("CEs:", ces)
        self.log.always("CloudDN:", self.cloudDN)
        self.log.always("CloudGroup:", self.cloudGroup)

        self.localhost = socket.getfqdn()
        self.proxy = ""

        # The served VM types are only listed on the first call
        if self.firstPass and self.vmTypeDict:
            self.log.always("Agent will serve VM types:")
            for vmType, vmTypeInfo in self.vmTypeDict.items():
                self.log.always(
                    "Site: %s, CE: %s, VMType: %s" % (vmTypeInfo["Site"], vmTypeInfo["CEName"], vmType)
                )
        self.firstPass = False
        return S_OK()

    def __generateVMTypeHash(self, vmTypeDict):
        """Generate a hash of the queue description"""
        myMD5 = hashlib.md5()
        myMD5.update(str(sorted(vmTypeDict.items())).encode())
        hexstring = myMD5.hexdigest()
        return hexstring

    def getEndpoints(self, resourceDict):
        """Get the list of relevant CEs and their descriptions.

        Rebuilds ``self.vmTypeDict`` with one entry per (CE, VM type) pair, keyed by the CE
        name and the VM type joined with "_". Each entry holds the VM type parameters, the
        endpoint object, the CE name and type, the site, the VM type, the platform and the
        maximum number of instances. Endpoint objects are reused from ``self.vmTypeCECache``
        as long as the hash of their description does not change. ``self.platforms`` and
        ``self.sites`` are extended with the newly seen values.

        The "VMTypes" entry is popped from every CE dictionary of ``resourceDict`` and the
        VM type dictionaries it contains are updated in place.

        :param dict resourceDict: CE descriptions indexed by site, then by CE name; every CE
                                  description must hold "VMTypes", "CEType" and "MaxInstances"
        :return: S_OK(), or the failed result of getPilotBootstrapParameters() or of the
                 endpoint creation
        """

        self.vmTypeDict = {}
        ceFactory = EndpointFactory()

        result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
        if not result["OK"]:
            return result
        opParameters = result["Value"]

        for site in resourceDict:
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                ceTags = ceDict.get("Tag", [])
                if isinstance(ceTags, six.string_types):
                    ceTags = fromChar(ceTags)
                ceMaxRAM = ceDict.get("MaxRAM", None)
                qDict = ceDict.pop("VMTypes")
                for vmType in qDict:
                    vmTypeName = "%s_%s" % (ce, vmType)
                    params = qDict[vmType]
                    vmTypeInfo = {"ParametersDict": params}
                    self.vmTypeDict[vmTypeName] = vmTypeInfo
                    params["VMType"] = vmType
                    params["Site"] = site
                    params["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                    params["CPUTime"] = 99999999

                    # Merge the CE tags with the tags of the VM type
                    vmTypeTags = params.get("Tag")
                    if vmTypeTags and isinstance(vmTypeTags, six.string_types):
                        vmTypeTags = fromChar(vmTypeTags)
                        params["Tag"] = vmTypeTags
                    if ceTags:
                        if vmTypeTags:
                            allTags = list(set(ceTags + vmTypeTags))
                            params["Tag"] = allTags
                        else:
                            # Store a copy: the list may get "WholeNode" appended below, which
                            # must not leak into the CE tags shared by the other VM types
                            params["Tag"] = list(ceTags)

                    # The VM type value has precedence over the CE one
                    maxRAM = params.get("MaxRAM")
                    maxRAM = ceMaxRAM if not maxRAM else maxRAM
                    if maxRAM:
                        params["MaxRAM"] = maxRAM

                    # WholeNode defaults to "true" when neither the VM type nor the CE defines it
                    ceWholeNode = ceDict.get("WholeNode", "true")
                    wholeNode = params.get("WholeNode", ceWholeNode)
                    if wholeNode.lower() in ("yes", "true"):
                        params.setdefault("Tag", [])
                        params["Tag"].append("WholeNode")

                    platform = ""
                    if "Platform" in params:
                        platform = params["Platform"]
                    elif "Platform" in ceDict:
                        platform = ceDict["Platform"]
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in params and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result["OK"]:
                            params["Platform"] = result["Value"][0]

                    # Full endpoint description: CE level values, overridden by the VM type ones
                    ceVMTypeDict = dict(ceDict)
                    ceVMTypeDict["CEName"] = ce
                    ceVMTypeDict["VO"] = self.vo
                    ceVMTypeDict["VMType"] = vmType
                    ceVMTypeDict["RunningPod"] = self.runningPod
                    ceVMTypeDict["CSServers"] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                    ceVMTypeDict.update(params)

                    # Allow a resource-specifc CAPath to be set (as some clouds have their own CAs)
                    # Otherwise fall back to the system-wide default(s)
                    if "CAPath" not in ceVMTypeDict:
                        ceVMTypeDict["CAPath"] = gConfig.getValue(
                            "/DIRAC/Security/CAPath", "/opt/dirac/etc/grid-security/certificates/cas.pem"
                        )

                    # Generate the CE object for the vmType or pick the already existing one
                    # if the vmType definition did not change
                    vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                    if vmTypeName in self.vmTypeCECache and self.vmTypeCECache[vmTypeName]["Hash"] == vmTypeHash:
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                    else:
                        result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                        if not result["OK"]:
                            return result
                        self.vmTypeCECache.setdefault(vmTypeName, {})
                        self.vmTypeCECache[vmTypeName]["Hash"] = vmTypeHash
                        self.vmTypeCECache[vmTypeName]["CE"] = result["Value"]
                        vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                        vmTypeCE.setBootstrapParameters(opParameters)

                    vmTypeInfo["CE"] = vmTypeCE
                    vmTypeInfo["CEName"] = ce
                    vmTypeInfo["CEType"] = ceDict["CEType"]
                    vmTypeInfo["Site"] = site
                    vmTypeInfo["VMType"] = vmType
                    vmTypeInfo["Platform"] = platform
                    vmTypeInfo["MaxInstances"] = ceDict["MaxInstances"]
                    if not vmTypeInfo["CE"].isValid():
                        # The entry stays in self.vmTypeDict, only the site registration is skipped
                        self.log.error("Failed to instantiate CloudEndpoint for %s" % vmTypeName)
                        continue

                    if site not in self.sites:
                        self.sites.append(site)

        return S_OK()

    def execute(self):
        """Run one cycle: create VMs when at least one VM type is defined.

        A failure of createVMs() is logged and not propagated.

        :return: S_OK() in every case
        """
        if self.vmTypeDict:
            outcome = self.createVMs()
            if not outcome["OK"]:
                self.log.error("Errors in the job submission: ", outcome["Message"])
        else:
            self.log.warn("No site defined, exiting the cycle")

        # cyclesDone = self.am_getModuleParam( 'cyclesDone' )
        # if self.updateStatus and cyclesDone % self.cloudStatusUpdateCycleFactor == 0:
        #  result = self.updatePilotStatus()
        #  if not result['OK']:
        #    self.log.error( 'Errors in updating cloud status: ', result['Message'] )

        return S_OK()

    def createVMs(self):
        """Go through defined computing elements and submit jobs if necessary"""

        vmTypeList = list(self.vmTypeDict.keys())

        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {"Setup": setup, "CPUTime": 9999999}
        if self.vo:
            tqDict["VO"] = self.vo
        if self.voGroups:
            tqDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            return result
        tqDict["Platform"] = result["Value"]
        tqDict["Site"] = self.sites
        tags = []
        for vmType in vmTypeList:
            if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
                tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
        tqDict["Tag"] = list(set(tags))

        self.log.verbose("Checking overall TQ availability with requirements")
        self.log.verbose(tqDict)

        matcherClient = MatcherClient()
        result = matcherClient.getMatchingTaskQueues(tqDict)
        if not result["OK"]:
            return result
        if not result["Value"]:
            self.log.verbose("No Waiting jobs suitable for the director")
            return S_OK()

        jobSites = set()
        anySite = False
        testSites = set()
        totalWaitingJobs = 0
        for tqID in result["Value"]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        jobSites.add(site)
                    else:
                        anySite = True
            else:
                anySite = True
            if "JobTypes" in result["Value"][tqID]:
                if "Sites" in result["Value"][tqID]:
                    for site in result["Value"][tqID]["Sites"]:
                        if site.lower() != "any":
                            testSites.add(site)
            totalWaitingJobs += result["Value"][tqID]["Jobs"]

        tqIDList = list(result["Value"].keys())

        result = virtualMachineDB.getInstanceCounters("Status", {})
        totalVMs = 0
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted", "Running"]:
                    totalVMs += result["Value"][status]
        self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs))

        # Check if the site is allowed in the mask
        result = self.siteClient.getUsableSites()
        if not result["OK"]:
            return S_ERROR("Can not get the site mask")
        siteMaskList = result.get("Value", [])

        vmTypeList = list(self.vmTypeDict.keys())
        random.shuffle(vmTypeList)
        totalSubmittedPilots = 0
        matchedQueues = 0
        for vmType in vmTypeList:
            ce = self.vmTypeDict[vmType]["CE"]
            ceName = self.vmTypeDict[vmType]["CEName"]
            vmTypeName = self.vmTypeDict[vmType]["VMType"]
            siteName = self.vmTypeDict[vmType]["Site"]
            platform = self.vmTypeDict[vmType]["Platform"]
            vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
            siteMask = siteName in siteMaskList
            endpoint = "%s::%s" % (siteName, ceName)
            maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
            processorTags = []

            # vms support WholeNode naturally
            processorTags.append("WholeNode")

            if not anySite and siteName not in jobSites:
                self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
                continue
            if not siteMask and siteName not in testSites:
                self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
                continue

            if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]:
                vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"])
            else:
                self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
                continue

            # Prepare the queue description to look for eligible jobs
            ceDict = ce.getParameterDict()

            if not siteMask:
                ceDict["JobType"] = "Test"
            if self.vo:
                ceDict["VO"] = self.vo
            if self.voGroups:
                ceDict["OwnerGroup"] = self.voGroups

            result = Resources.getCompatiblePlatforms(platform)
            if not result["OK"]:
                continue
            ceDict["Platform"] = result["Value"]

            ceDict["Tag"] = list(set(processorTags + vmTypeTags))

            # Get the number of eligible jobs for the target site/queue

            result = matcherClient.getMatchingTaskQueues(ceDict)
            if not result["OK"]:
                self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
                return result
            taskQueueDict = result["Value"]
            if not taskQueueDict:
                self.log.verbose("No matching TQs found for %s" % vmType)
                continue

            matchedQueues += 1
            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]["Jobs"]

            self.log.verbose(
                "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType)
            )

            # Get the number of already instantiated VMs for these task queues
            totalWaitingVMs = 0
            result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
            if result["OK"]:
                for status in result["Value"]:
                    if status in ["New", "Submitted"]:
                        totalWaitingVMs += result["Value"][status]
            if totalWaitingVMs >= totalTQJobs:
                self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

            self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType))

            # Get proxy to be used to connect to the cloud endpoint
            authType = ce.parameters.get("Auth")
            if authType and authType.lower() in ["x509", "voms"]:
                self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
                result = getProxyFileForCloud(ce)
                if not result["OK"]:
                    continue
                ce.setProxy(result["Value"])

            # Get the number of available slots on the target site/endpoint
            totalSlots = self.getVMInstances(endpoint, maxInstances)
            if totalSlots == 0:
                self.log.debug("%s: No slots available" % vmType)
                continue

            vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
            self.log.info(
                "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
                % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
            )

            # Limit the number of VM instances to create to vmsToSubmit
            vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
            if vmsToSubmit == 0:
                continue

            self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
            result = ce.createInstances(vmsToSubmit)

            # result = S_OK()
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
                self.failedVMTypes.setdefault(vmType, 0)
                self.failedVMTypes[vmType] += 1
                continue

            # Add VMs to the VirtualMachineDB
            vmDict = result["Value"]
            totalSubmittedPilots += len(vmDict)
            self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

            pilotList = []
            for uuID in vmDict:
                diracUUID = vmDict[uuID]["InstanceID"]
                endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName)
                result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
                if not result["OK"]:
                    continue
                pRef = "vm://" + ceName + "/" + diracUUID + ":00"
                pilotList.append(pRef)

            stampDict = {}
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if tqID not in tqDict:
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict)
                if not result["OK"]:
                    self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

        self.log.info(
            "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
        )
        return S_OK()

    def getVMInstances(self, endpoint, maxInstances):
        """Return the number of VM slots still free on the given endpoint.

        Instances reported by the VirtualMachineDB in the "New", "Submitted"
        or "Running" status are counted as occupying a slot.

        :param str endpoint: endpoint identifier, passed as the "Endpoint"
                             selection to ``getInstanceCounters``
        :param int maxInstances: maximum number of instances allowed on the endpoint
        :return: ``maxInstances`` minus the occupied slots, never below 0;
                 0 as well when the counters can not be retrieved
        :rtype: int
        """
        result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
        if not result["OK"]:
            # The caller does integer arithmetic on the returned value, so the
            # error structure must not be handed back. Without reliable counters
            # the safe answer is "no free slots": nothing is submitted this cycle.
            self.log.error("Could not get VM instance counters for %s" % endpoint, result["Message"])
            return 0

        counters = result["Value"]
        occupiedSlots = sum(
            int(counters[status]) for status in counters if status in ("New", "Submitted", "Running")
        )

        return max(0, maxInstances - occupiedSlots)