예제 #1
0
    def getCEStatus(self):
        """Report how many pilots are waiting and running for this CE.

        The numbers come from ``PilotAgentsDB.countPilots`` with a selection on
        ``self.ceName``. A failed query is only logged as a warning, in which
        case the corresponding counter keeps its initial value of 0.

        :return: S_OK structure extended with the keys 'SubmittedJobs'
                 (left at 0 by this method), 'RunningJobs' and 'WaitingJobs'
        """
        ceStatus = S_OK()
        for counterName in ('SubmittedJobs', 'RunningJobs', 'WaitingJobs'):
            ceStatus[counterName] = 0

        # Waiting pilots are counted first, then the running ones.
        queries = (('WaitingJobs', WAITING_PILOT_STATUS),
                   ('RunningJobs', 'Running'))
        for counterName, pilotStatus in queries:
            selection = {'DestinationSite': self.ceName, 'Status': pilotStatus}
            countResult = PilotAgentsDB().countPilots(selection)
            if not countResult['OK']:
                self.log.warn("Failure getting pilot count for %s: %s " %
                              (self.ceName, countResult['Message']))
                continue
            ceStatus[counterName] = int(countResult['Value'])

        return ceStatus
예제 #2
0
    def export_getCurrentPilotCounters(cls, attrDict=None):
        """Get pilot counters per Status for the given attrDict selection.

        Statuses listed in FINAL_STATES report only the pilots whose
        LastUpdateTime falls within the last day (0 when there are none);
        every other status reports its overall count.

        :param attrDict: selection conditions handed to getCounters; an empty
                         selection is used when omitted
        :return: S_OK(dict mapping status to count), or the failed getCounters
                 result
        """
        # Use a fresh dict per call rather than a shared mutable default.
        if attrDict is None:
            attrDict = {}

        result = PilotAgentsDB().getCounters('PilotAgents', ['Status'],
                                             attrDict,
                                             timeStamp='LastUpdateTime')
        if not result['OK']:
            return result
        last_update = Time.dateTime() - Time.day
        resultDay = PilotAgentsDB().getCounters('PilotAgents', ['Status'],
                                                attrDict,
                                                newer=last_update,
                                                timeStamp='LastUpdateTime')
        if not resultDay['OK']:
            return resultDay

        # Index the last-day counters by status. The previous implementation
        # left the inner loop after its first entry whether or not it matched,
        # so most final statuses were wrongly reported as 0.
        dayCounts = {}
        for statusDayDict, dayCount in resultDay['Value']:
            dayCounts.setdefault(statusDayDict['Status'], dayCount)

        resultDict = {}
        for statusDict, count in result['Value']:
            status = statusDict['Status']
            if status in FINAL_STATES:
                resultDict[status] = dayCounts.get(status, 0)
            else:
                resultDict[status] = count

        return S_OK(resultDict)
예제 #3
0
    def export_deletePilots(cls, pilotIDs):
        """Delete pilots from the PilotAgentsDB and, when pilots logging is
        enabled, their records in the PilotsLoggingDB.

        :param pilotIDs: a string (handed as is to ``deletePilot``), a single
                         integer pilot ID, or a list of pilot IDs
        :return: S_OK() on success, otherwise the first failed result
        """

        if isinstance(pilotIDs, basestring):
            return PilotAgentsDB().deletePilot(pilotIDs)

        if isinstance(pilotIDs, (int, long)):
            pilotIDs = [pilotIDs]

        result = PilotAgentsDB().deletePilots(pilotIDs)
        if not result['OK']:
            return result
        if enablePilotsLogging:
            pilotIDs = result['Value']
            pilots = PilotAgentsDB().getPilotInfo(pilotID=pilotIDs)
            if not pilots['OK']:
                return pilots
            # Walk the payload, not the result structure itself: iterating
            # `pilots` directly yields its keys ('OK', 'Value') and fails on
            # the lookup below.
            # NOTE(review): assumes getPilotInfo returns a dict of per-pilot
            # dicts (as its other callers index it) - confirm.
            pilotRefs = [
                pilotInfo['PilotJobReference']
                for pilotInfo in pilots['Value'].values()
            ]
            result = PilotsLoggingDB().deletePilotsLogging(pilotRefs)
            if not result['OK']:
                return result

        return S_OK()
예제 #4
0
    def export_getPilots(cls, jobID):
        """Get the pilot information related to a job.

        The pilots registered for the job ID are looked up first. If there are
        none, up to 10 pilots of the task queue holding the job are used
        instead.

        :param jobID: job identifier, converted with int()
        :return: result of ``PilotAgentsDB.getPilotInfo`` for the pilots found,
                 or S_ERROR if a lookup fails or no pilot is found
        """
        jobNumber = int(jobID)
        pilotRefs = []

        jobPilots = PilotAgentsDB().getPilotsForJobID(jobNumber)
        if jobPilots['OK']:
            pilotRefs.extend(jobPilots['Value'])
        elif 'not found' not in jobPilots['Message']:
            # Any failure other than "not found" is reported to the caller.
            return S_ERROR('Failed to get pilot: ' + jobPilots['Message'])

        if not pilotRefs:
            # Pilots were not found, try to look in the Task Queue
            tqResult = TaskQueueDB().getTaskQueueForJob(jobNumber)
            if tqResult['OK'] and tqResult['Value']:
                taskQueueID = tqResult['Value']
            else:
                taskQueueID = 0
            if taskQueueID:
                tqPilots = PilotAgentsDB().getPilotsForTaskQueue(taskQueueID,
                                                                 limit=10)
                if not tqPilots['OK']:
                    return S_ERROR('Failed to get pilot: ' +
                                   tqPilots['Message'])
                pilotRefs.extend(tqPilots['Value'])

        if pilotRefs:
            return PilotAgentsDB().getPilotInfo(pilotID=pilotRefs)

        return S_ERROR('Failed to get pilot for Job %d' % jobNumber)
예제 #5
0
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """Constructor.

        Each collaborator may be injected; any argument that is missing (or
        falsy) is replaced by a newly created default object.

        :param pilotAgentsDB: PilotAgentsDB-like object
        :param jobDB: JobDB-like object
        :param tqDB: TaskQueueDB-like object
        :param jlDB: JobLoggingDB-like object
        :param opsHelper: Operations-like helper
        """
        self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
        self.jobDB = jobDB or JobDB()
        self.tqDB = tqDB or TaskQueueDB()
        self.jlDB = jlDB or JobLoggingDB()
        self.opsHelper = opsHelper or Operations()

        self.log = gLogger.getSubLogger("Matcher")

        # The limiter shares the job DB and operations helper chosen above.
        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()
예제 #6
0
    def initialize(self):
        """Set the default agent options and create the PilotAgentsDB handle.

        :return: S_OK()
        """
        defaultOptions = (
            ('PollingTime', 120),
            ('GridEnv', ''),
            ('PilotStalledDays', 3),
        )
        for optionName, optionValue in defaultOptions:
            self.am_setOption(optionName, optionValue)

        self.pilotDB = PilotAgentsDB()
        return S_OK()
예제 #7
0
    def initialize(self):
        """Set the polling time, read the clearing delays and create the
        PilotAgentsDB handle.

        :return: S_OK()
        """
        self.am_setOption('PollingTime', 120)

        # (attribute to fill, option to read, value used when it is not set)
        delayOptions = (
            ('clearPilotsDelay', 'ClearPilotsDelay', 30),
            ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7),
        )
        for attrName, optionName, fallback in delayOptions:
            setattr(self, attrName, self.am_getOption(optionName, fallback))

        self.pilotDB = PilotAgentsDB()
        return S_OK()
예제 #8
0
    def initialize(self):
        """Set the agent defaults and create the helper objects it uses.

        :return: S_OK()
        """
        self.am_setOption("GridEnv", "")

        # The DB handles and the DiracAdmin instance are created before the
        # delay options are read.
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()

        delayOptions = (
            ("clearPilotsDelay", "ClearPilotsDelay", 30),
            ("clearAbortedDelay", "ClearAbortedPilotsDelay", 7),
        )
        for attrName, optionName, fallback in delayOptions:
            setattr(self, attrName, self.am_getOption(optionName, fallback))

        self.pilots = PilotManagerClient()

        return S_OK()
예제 #9
0
  def initialize(self):
    """Set the agent defaults and create the helper objects it uses.

    :return: S_OK()
    """
    for optionName, optionValue in (('PollingTime', 120),
                                    ('GridEnv', ''),
                                    ('PilotStalledDays', 3)):
      self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # (attribute to fill, option to read, value used when it is not set)
    for attrName, optionName, fallback in (('clearPilotsDelay', 'ClearPilotsDelay', 30),
                                           ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7)):
      setattr(self, attrName, self.am_getOption(optionName, fallback))

    self.pilots = PilotManagerClient()

    return S_OK()
예제 #10
0
    def export_setJobForPilot(cls, jobID, pilotRef, destination=None):
        """Record which DIRAC job is executed by the given pilot.

        The job is attached to the pilot, then set as its current job and,
        when a destination is given, the pilot destination site is updated.
        The chain stops at the first failing step.

        :param jobID: job identifier, converted with int()
        :param pilotRef: pilot reference
        :param destination: optional destination site of the pilot
        :return: result of the last step executed
        """
        jobNumber = int(jobID)

        result = PilotAgentsDB().setJobForPilot(jobNumber, pilotRef)
        if result['OK']:
            result = PilotAgentsDB().setCurrentJobID(pilotRef, jobNumber)
        if result['OK'] and destination:
            result = PilotAgentsDB().setPilotDestinationSite(pilotRef,
                                                             destination)

        return result
예제 #11
0
    def export_getPilotStatistics(attribute, selectDict):
        """Get the pilot statistics distribution per value of an attribute.

        The time window is taken from the selection: 'FromDate' (or, when it
        is missing, 'LastUpdate') gives the lower bound and 'ToDate' the upper
        bound, both applied to LastUpdateTime. These keys are removed from the
        selection before it is used as the condition dictionary.

        :param attribute: attribute the counters are grouped by
        :param selectDict: selection conditions, optionally holding the date
                           keys described above; it is not modified
        :return: S_OK(dict mapping attribute value to count), or the failed
                 getCounters result
        """
        # Work on a copy so the caller's dictionary is left untouched.
        selection = dict(selectDict)

        # pop() drops the date keys even when their value is empty, so they
        # can never end up in the query as ordinary conditions.
        startDate = selection.pop('FromDate', None)
        if startDate is None:
            startDate = selection.pop('LastUpdate', None)
        endDate = selection.pop('ToDate', None)

        result = PilotAgentsDB().getCounters('PilotAgents', [attribute],
                                             selection,
                                             newer=startDate,
                                             older=endDate,
                                             timeStamp='LastUpdateTime')
        if not result['OK']:
            # Report the failure instead of disguising it as empty statistics.
            return result

        statistics = {}
        for status, count in result['Value']:
            if "OwnerDN" in status:
                # Show the user name when the DN can be resolved, else the DN.
                userName = getUsernameForDN(status['OwnerDN'])
                if userName['OK']:
                    status['OwnerDN'] = userName['Value']
                key = status['OwnerDN']
            else:
                key = status[attribute]
            # Sum rather than overwrite: several DNs may resolve to one user.
            statistics[key] = statistics.get(key, 0) + count

        return S_OK(statistics)
예제 #12
0
class PilotMonitorAgent( AgentModule ):
  """
      Agent that removes old pilots from the PilotAgentsDB on every cycle.

      It implements:
      - initialize() for the initial settings
      - execute() - the main method called in the agent cycle
  """

  #############################################################################
  def initialize( self ):
    """ Set the polling time, read the clearing delays and create the
        PilotAgentsDB handle.

        :return: S_OK()
    """
    self.am_setOption( 'PollingTime', 120 )
    self.clearPilotsDelay = self.am_getOption( 'ClearPilotsDelay', 30 )
    self.clearAbortedDelay = self.am_getOption( 'ClearAbortedPilotsDelay', 7 )

    self.pilotDB = PilotAgentsDB()
    return S_OK()

  #############################################################################
  def execute( self ):
    """
      Remove from PilotDB pilots that:
      - are older than self.clearPilotsDelay
      - are Aborted and older than self.clearAbortedDelay

      A failure is only logged, so the cycle itself always succeeds.

      :return: S_OK( 'Monitoring cycle complete.' )
    """
    result = self.pilotDB.clearPilots( self.clearPilotsDelay, self.clearAbortedDelay )
    if not result['OK']:
      # Include the reason so that the failure can be diagnosed from the log
      self.log.warn( 'Failed to clear old pilots in the PilotAgentsDB: %s' % result['Message'] )

    return S_OK( 'Monitoring cycle complete.' )
예제 #13
0
def initializeMatcherHandler(serviceInfo):
    """Matcher service initialization.

    Creates the module-level DB objects, registers the monitoring activities
    and schedules the periodic tasks (task queue share recalculation every
    120 and task queue count reporting every 60, as passed to the scheduler).

    :param serviceInfo: service information (not used here)
    :return: S_OK()
    """
    global gJobDB, gTaskQueueDB, jlDB, pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    # (activity name, description, unit, operation)
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for activityName, description, unit, operation in activities:
        gMonitor.registerActivity(activityName, description, 'Matching', unit,
                                  operation, 300)

    # Compute the shares once right away, then keep them refreshed.
    recalculateShares = gTaskQueueDB.recalculateTQSharesForAll
    recalculateShares()
    gThreadScheduler.addPeriodicTask(120, recalculateShares)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
예제 #14
0
  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ Constructor.

        Each collaborator may be injected; any argument that is missing (or
        falsy) is replaced by a newly created default object.
    """
    self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
    self.jobDB = jobDB or JobDB()
    self.tqDB = tqDB or TaskQueueDB()
    self.jlDB = jlDB or JobLoggingDB()
    self.opsHelper = opsHelper or Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    # The limiter shares the job DB and operations helper chosen above
    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
예제 #15
0
    def export_killPilot(cls, pilotRefList):
        """Kill the specified pilots.

        The pilots are grouped per owner, group, grid site, destination site
        and queue before being handed to ``killPilotsInQueues``.

        :param pilotRefList: a single pilot reference or an iterable of them
        :return: S_OK(), or S_ERROR if the information of a pilot cannot be
                 retrieved or some pilots could not be killed
        """
        # Make a list if it is not yet
        if isinstance(pilotRefList, basestring):
            pilotRefs = [pilotRefList]
        else:
            pilotRefs = list(pilotRefList)

        # Regroup pilots per site and per owner
        queues = {}
        for pilotReference in pilotRefs:
            infoResult = PilotAgentsDB().getPilotInfo(pilotReference)
            if not (infoResult['OK'] and infoResult['Value']):
                return S_ERROR('Failed to get info for pilot ' +
                               pilotReference)

            pilotDict = infoResult['Value'][pilotReference]
            queueKey = '@@@'.join((
                pilotDict['OwnerDN'],
                pilotDict['OwnerGroup'],
                pilotDict['GridSite'],
                pilotDict['DestinationSite'],
                pilotDict['Queue'],
            ))
            gridType = pilotDict['GridType']

            queueEntry = queues.setdefault(queueKey, {})
            queueEntry.setdefault('PilotList', []).append(pilotReference)
            queueEntry['GridType'] = gridType

        if killPilotsInQueues(queues):
            return S_ERROR('Failed to kill at least some pilots')

        return S_OK()
예제 #16
0
def initializeMatcherHandler(serviceInfo):
    """Matcher service initialization.

    Creates the module-level DB objects (checking the tables of the job,
    job logging and pilot agents databases), registers the monitoring
    activities and schedules the periodic tasks.

    :param serviceInfo: service information (not used here)
    :return: S_OK(), or the failed result of the first table check that fails
    """
    global gJobDB, gJobLoggingDB, gTaskQueueDB, gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    tableCheck = gJobDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    tableCheck = gJobLoggingDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    tableCheck = gPilotAgentsDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    # (activity name, description, unit, operation)
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for activityName, description, unit, operation in activities:
        gMonitor.registerActivity(activityName, description, 'Matching', unit,
                                  operation, 300)

    # Compute the shares once right away, then keep them refreshed.
    recalculateShares = gTaskQueueDB.recalculateTQSharesForAll
    recalculateShares()
    gThreadScheduler.addPeriodicTask(120, recalculateShares)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
예제 #17
0
    def export_countPilots(cls,
                           condDict,
                           older=None,
                           newer=None,
                           timeStamp='SubmissionTime'):
        """Count the pilots matching a selection.

        Thin wrapper that forwards all its arguments to
        ``PilotAgentsDB.countPilots``.

        :param condDict: selection conditions
        :param older: optional time limit, forwarded unchanged
        :param newer: optional time limit, forwarded unchanged
        :param timeStamp: name of the time field the limits apply to
        :return: result of ``PilotAgentsDB.countPilots``
        """
        pilotDB = PilotAgentsDB()
        return pilotDB.countPilots(condDict, older, newer, timeStamp)
예제 #18
0
  def initialize( self ):
    """ Set the polling time, read the clearing delays and create the
        PilotAgentsDB handle.

        :return: S_OK()
    """
    self.am_setOption( 'PollingTime', 120 )

    # ( attribute to fill, option to read, value used when it is not set )
    delayOptions = ( ( 'clearPilotsDelay', 'ClearPilotsDelay', 30 ),
                     ( 'clearAbortedDelay', 'ClearAbortedPilotsDelay', 7 ) )
    for attrName, optionName, fallback in delayOptions:
      setattr( self, attrName, self.am_getOption( optionName, fallback ) )

    self.pilotDB = PilotAgentsDB()
    return S_OK()
예제 #19
0
    def export_getPilotSummaryWeb(cls, selectDict, sortList, startItem,
                                  maxItems):
        """Get the pilot summary for one page of the pilot monitor.

        Thin wrapper that forwards all its arguments to
        ``PilotAgentsDB.getPilotSummaryWeb``.

        :param selectDict: selection conditions
        :param sortList: sorting specification
        :param startItem: first item of the page
        :param maxItems: maximum number of items in the page
        :return: result of ``PilotAgentsDB.getPilotSummaryWeb``
        """
        pilotDB = PilotAgentsDB()
        return pilotDB.getPilotSummaryWeb(selectDict, sortList, startItem,
                                          maxItems)
예제 #20
0
    def export_clearPilots(cls, interval=30, aborted_interval=7):
        """Clear old pilots from the PilotAgentsDB and, when pilots logging is
        enabled, their records in the PilotsLoggingDB.

        :param interval: forwarded to ``PilotAgentsDB.clearPilots``
        :param aborted_interval: forwarded to ``PilotAgentsDB.clearPilots``
        :return: S_OK() on success, otherwise the first failed result
        """

        result = PilotAgentsDB().clearPilots(interval, aborted_interval)
        if not result['OK']:
            return result
        if enablePilotsLogging:
            pilotIDs = result['Value']
            pilots = PilotAgentsDB().getPilotInfo(pilotID=pilotIDs)
            if not pilots['OK']:
                return pilots
            # Walk the payload, not the result structure itself: iterating
            # `pilots` directly yields its keys ('OK', 'Value') and fails on
            # the lookup below.
            # NOTE(review): assumes getPilotInfo returns a dict of per-pilot
            # dicts (as its other callers index it) - confirm.
            pilotRefs = [
                pilotInfo['PilotJobReference']
                for pilotInfo in pilots['Value'].values()
            ]
            result = PilotsLoggingDB().deletePilotsLogging(pilotRefs)
            if not result['OK']:
                return result

        return S_OK()
예제 #21
0
  def initialize( self ):
    """ Set the default agent options and create the PilotAgentsDB handle.

        :return: S_OK()
    """
    defaultOptions = ( ( 'PollingTime', 120 ),
                       ( 'GridEnv', '' ),
                       ( 'PilotStalledDays', 3 ) )
    for optionName, optionValue in defaultOptions:
      self.am_setOption( optionName, optionValue )

    self.pilotDB = PilotAgentsDB()
    return S_OK()
예제 #22
0
    def initialize(self):
        """Set the default agent options and create the PilotAgentsDB handle.

        :return: S_OK()
        """
        defaultOptions = (
            ("PollingTime", 120),
            ("GridEnv", ""),
            ("PilotStalledDays", 3),
        )
        for optionName, optionValue in defaultOptions:
            self.am_setOption(optionName, optionValue)

        self.pilotDB = PilotAgentsDB()
        return S_OK()
예제 #23
0
def initializeWMSAdministratorHandler(serviceInfo):
    """WMS Administrator service initialization.

    Creates the module-level job, pilot and task queue DB objects.

    :param serviceInfo: service information (not used here)
    :return: S_OK()
    """
    global jobDB, pilotDB, taskQueueDB

    jobDB = JobDB()
    pilotDB = PilotAgentsDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
예제 #24
0
 def export_addPilotTQReference(cls,
                                pilotRef,
                                taskQueueID,
                                ownerDN,
                                ownerGroup,
                                broker='Unknown',
                                gridType='DIRAC',
                                pilotStampDict=None):
     """ Add a new pilot job reference.

     Thin wrapper that forwards all its arguments to
     ``PilotAgentsDB.addPilotTQReference``.

     :param pilotRef: pilot reference(s) to register
     :param taskQueueID: task queue the pilots were submitted for
     :param ownerDN: DN of the pilot owner
     :param ownerGroup: group of the pilot owner
     :param broker: broker name
     :param gridType: grid type
     :param pilotStampDict: pilot stamps; an empty dict is used when omitted
     :return: result of ``PilotAgentsDB.addPilotTQReference``
     """
     # Use a fresh dict per call rather than a shared mutable default, so that
     # nothing stored in it can leak from one call into the next.
     if pilotStampDict is None:
         pilotStampDict = {}
     return PilotAgentsDB().addPilotTQReference(pilotRef, taskQueueID,
                                                ownerDN, ownerGroup, broker,
                                                gridType, pilotStampDict)
예제 #25
0
  def initialize( self ):
    """ Prepare the agent: shifter proxy, command definitions and shared clients.

        Sets the 'shifterProxy' option to 'DataManager', fills self.commands
        with the command definitions that are currently enabled and stores in
        self.clients the client objects to be reused by the commands.

        :return: S_OK()
    """

    self.am_setOption( 'shifterProxy', 'DataManager' )

    self.rmClient = ResourceManagementClient()

    # Enabled commands: each entry is a one-element list holding
    # { command name : command arguments }.
    # Disabled alternatives for 'Pilot':
    #   { 'Pilot' : { 'timespan' : 86400 } },
    #   { 'Pilot' : { 'timespan' : 604800 } }
    enabledCommands = ( ( 'Downtime', {} ),
                        ( 'SpaceTokenOccupancy', {} ),
                        ( 'Pilot', { 'timespan' : 1800 } ) )
    for commandName, commandArgs in enabledCommands:
      self.commands[ commandName ] = [ { commandName : commandArgs } ]

    # PilotsCommand (disabled)
    #   self.commands[ 'Pilots' ] = [
    #     { 'PilotsWMS' : { 'element' : 'Site', 'siteName' : None } },
    #     { 'PilotsWMS' : { 'element' : 'Resource', 'siteName' : None } }
    #   ]

    # FIXME: do not forget about hourly vs Always ...etc
    # AccountingCacheCommand (disabled)
    #   self.commands[ 'AccountingCache' ] = [
    #     { 'SuccessfullJobsBySiteSplitted'   : { 'hours' : 24, 'plotType' : 'Job' } },
    #     { 'FailedJobsBySiteSplitted'        : { 'hours' : 24, 'plotType' : 'Job' } },
    #     { 'SuccessfullPilotsBySiteSplitted' : { 'hours' : 24, 'plotType' : 'Pilot' } },
    #     { 'FailedPilotsBySiteSplitted'      : { 'hours' : 24, 'plotType' : 'Pilot' } },
    #     { 'SuccessfullPilotsByCESplitted'   : { 'hours' : 24, 'plotType' : 'Pilot' } },
    #     { 'FailedPilotsByCESplitted'        : { 'hours' : 24, 'plotType' : 'Pilot' } },
    #     { 'RunningJobsBySiteSplitted'       : { 'hours' : 24, 'plotType' : 'Job' } },
    #     ## { 'RunningJobsBySiteSplitted'    : { 'hours' : 168, 'plotType' : 'Job' } },
    #     ## { 'RunningJobsBySiteSplitted'    : { 'hours' : 720, 'plotType' : 'Job' } },
    #     ## { 'RunningJobsBySiteSplitted'    : { 'hours' : 8760, 'plotType' : 'Job' } },
    #   ]

    # VOBOXAvailability (disabled)
    #   self.commands[ 'VOBOXAvailability' ] = [
    #     { 'VOBOXAvailability' : {} }
    #   ]

    # Reuse clients for the commands
    self.clients[ 'GOCDBClient' ] = GOCDBClient()
    self.clients[ 'ReportGenerator' ] = RPCClient( 'Accounting/ReportGenerator' )
    self.clients[ 'ReportsClient' ] = ReportsClient()
    self.clients[ 'ResourceStatusClient' ] = ResourceStatusClient()
    self.clients[ 'ResourceManagementClient' ] = ResourceManagementClient()
    self.clients[ 'PilotsDB' ] = PilotAgentsDB()
    self.clients[ 'WMSAdministrator' ] = RPCClient( 'WorkloadManagement/WMSAdministrator' )

    self.cCaller = CommandCaller

    return S_OK()
예제 #26
0
    def initializeHandler(cls, serviceInfoDict):
        """Create the DB objects shared by the handler.

        The pilots logging DB is only created when the
        '/Services/JobMonitoring/usePilotsLoggingFlag' operations value is
        set; otherwise ``cls.gPilotsLoggingDB`` stays None.

        :param serviceInfoDict: service information (not used here)
        :return: S_OK()
        """
        cls.pilotAgentsDB = PilotAgentsDB()
        cls.gPilotsLoggingDB = None

        usePilotsLogging = Operations().getValue(
            '/Services/JobMonitoring/usePilotsLoggingFlag', False)
        if usePilotsLogging:
            cls.gPilotsLoggingDB = PilotsLoggingDB()

        return S_OK()
예제 #27
0
    def export_getCounters(cls,
                           table,
                           keys,
                           condDict,
                           newer=None,
                           timeStamp='SubmissionTime'):
        """Get counters from a table of the PilotAgentsDB.

        Thin wrapper that forwards its arguments to
        ``PilotAgentsDB.getCounters``.

        :param table: table to query
        :param keys: attributes the counters are grouped by
        :param condDict: selection conditions
        :param newer: optional time limit, forwarded unchanged
        :param timeStamp: name of the time field the limit applies to
        :return: result of ``PilotAgentsDB.getCounters``
        """
        pilotDB = PilotAgentsDB()
        return pilotDB.getCounters(table, keys, condDict,
                                   newer=newer, timeStamp=timeStamp)
예제 #28
0
  def initialize(self):
    """Set the agent defaults and create the helper objects it uses.

    :return: S_OK()
    """
    for optionName, optionValue in (('PollingTime', 120),
                                    ('GridEnv', ''),
                                    ('PilotStalledDays', 3)):
      self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # (attribute to fill, option to read, value used when it is not set)
    for attrName, optionName, fallback in (('clearPilotsDelay', 'ClearPilotsDelay', 30),
                                           ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7)):
      setattr(self, attrName, self.am_getOption(optionName, fallback))

    self.WMSAdministrator = WMSAdministratorClient()

    return S_OK()
예제 #29
0
    def __init__(self, args=None, clients=None):
        """Constructor.

        :Parameters:
          **args** - [, `dict` ]
            arguments to be used in the _prepareCommand method ( name and
            timespan are the expected ones )
          **clients** - [, `dict` ]
            clients from where information is fetched. Mainly used to avoid
            creating new connections on agents looping over clients.
            'PilotsDB' and 'ResourceManagementClient' entries are reused when
            present in self.apis; otherwise new objects are created.
        """
        super(PilotCommand, self).__init__(args, clients)

        apis = self.apis
        self.pilotsDB = (apis['PilotsDB']
                         if 'PilotsDB' in apis else PilotAgentsDB())
        self.rmClient = (apis['ResourceManagementClient']
                         if 'ResourceManagementClient' in apis else
                         ResourceManagementClient())
예제 #30
0
    def export_setPilotStatus(self,
                              pilotRef,
                              status,
                              destination=None,
                              reason=None,
                              gridSite=None,
                              queue=None):
        """Set the pilot agent status.

        Thin wrapper around ``PilotAgentsDB.setPilotStatus``; ``reason`` is
        passed on as its ``statusReason`` argument.

        :param pilotRef: pilot reference
        :param status: new status
        :param destination: optional destination
        :param reason: optional status reason
        :param gridSite: optional grid site
        :param queue: optional queue
        :return: result of ``PilotAgentsDB.setPilotStatus``
        """
        optionalFields = {
            'destination': destination,
            'statusReason': reason,
            'gridSite': gridSite,
            'queue': queue,
        }
        return PilotAgentsDB().setPilotStatus(pilotRef, status,
                                              **optionalFields)
예제 #31
0
def initializeJobManagerHandler(serviceInfo):
    """JobManager service initialization.

    Creates the module-level DB objects. The pilots logging DB is only
    created when the 'Enable' option of the PilotsLogging section reads
    'yes' or 'true' (case-insensitive).

    :param serviceInfo: service information; its 'serviceSectionPath' entry
                        is used to locate the PilotsLogging section
    :return: S_OK()
    """
    global gJobDB, gJobLoggingDB, gtaskQueueDB
    global enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    loggingSection = serviceInfo['serviceSectionPath'].replace(
        'JobManager', 'PilotsLogging')
    enableOption = gConfig.getValue(loggingSection + '/Enable', 'False')
    enablePilotsLogging = enableOption.lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()

    return S_OK()
예제 #32
0
def initializePilotManagerHandler(serviceInfo):
    """PilotManagerHandler initialization.

    Creates the module-level pilot DB object. The pilots logging DB is only
    created when the 'Enable' option of the PilotsLogging section reads
    'yes' or 'true' (case-insensitive).

    :param serviceInfo: service information; its 'serviceSectionPath' entry
                        is used to locate the PilotsLogging section
    :return: S_OK()
    """
    global pilotDB, pilotsLoggingDB, enablePilotsLogging

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    loggingSection = serviceInfo['serviceSectionPath'].replace(
        'Pilots', 'PilotsLogging')
    enableOption = gConfig.getValue(loggingSection + '/Enable', 'False')
    enablePilotsLogging = enableOption.lower() in ('yes', 'true')

    pilotDB = PilotAgentsDB()
    if enablePilotsLogging:
        pilotsLoggingDB = PilotsLoggingDB()

    return S_OK()
예제 #33
0
    def initializeHandler(cls, serviceInfoDict):
        """Create the DB objects and set up the OptimizationMind connection.

        The pilots logging DB is only created when the
        '/Services/JobMonitoring/usePilotsLoggingFlag' operations value is
        set; otherwise ``cls.pilotsLoggingDB`` stays None. The connection to
        the OptimizationMind is attempted once here and then through a
        periodic task.

        :param serviceInfoDict: service information (not used here)
        :return: S_OK()
        """
        cls.jobDB = JobDB()
        cls.jobLoggingDB = JobLoggingDB()
        cls.taskQueueDB = TaskQueueDB()
        cls.pilotAgentsDB = PilotAgentsDB()
        cls.pilotsLoggingDB = None

        usePilotsLogging = Operations().getValue(
            '/Services/JobMonitoring/usePilotsLoggingFlag', False)
        if usePilotsLogging:
            cls.pilotsLoggingDB = PilotsLoggingDB()

        cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
        cls.__connectToOptMind()
        gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)

        return S_OK()
예제 #34
0
    def export_getPilotLoggingInfo(cls, pilotReference):
        """Get the pilot logging info for the Grid job reference.

        The owner DN, owner group and grid type registered for the pilot are
        used for the ``getPilotLoggingInfo`` call.

        :param pilotReference: pilot reference
        :return: result of ``getPilotLoggingInfo``, or S_ERROR if the pilot
                 information cannot be retrieved
        """
        infoResult = PilotAgentsDB().getPilotInfo(pilotReference)
        if not (infoResult['OK'] and infoResult['Value']):
            return S_ERROR('Failed to determine owner for pilot ' +
                           pilotReference)

        pilotDict = infoResult['Value'][pilotReference]
        ownerDN = pilotDict['OwnerDN']
        ownerGroup = pilotDict['OwnerGroup']
        gridType = pilotDict['GridType']

        return getPilotLoggingInfo(
            gridType,
            pilotReference,  # pylint: disable=unexpected-keyword-arg
            proxyUserDN=ownerDN,
            proxyUserGroup=ownerGroup)
예제 #35
0
  def initializeHandler(cls, serviceInfoDict):
    """Create the DB objects, register the monitoring activities and schedule
    the periodic tasks of the handler.

    :param serviceInfoDict: service information (not used here)
    :return: S_OK()
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    # Compute the shares once right away; they are refreshed periodically below.
    cls.taskQueueDB.recalculateTQSharesForAll()

    # (activity name, description, unit, operation)
    activities = (('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
                  ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
                  ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
                  ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN))
    for activityName, description, unit, operation in activities:
      gMonitor.registerActivity(activityName, description,
                                'Matching', unit, operation, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)

    cls.sendNumTaskQueues()
    return S_OK()
예제 #36
0
    def initialize(self):
        ''' Standard initialize.

        Reads the agent options (keeping the current attribute value when an
        option is not set), creates the queue of elements to be checked and
        the thread pool, and stores in self.clients the clients to be reused.

        :return: S_OK()
        '''
        # Each option is stored in the attribute carrying the same name.
        for optionName in ('maxNumberOfThreads', 'elementType',
                           'checkingFreqs', 'limitQueueFeeder'):
            currentValue = getattr(self, optionName)
            setattr(self, optionName,
                    self.am_getOption(optionName, currentValue))

        self.elementsToBeChecked = Queue.Queue()
        self.threadPool = ThreadPool(self.maxNumberOfThreads,
                                     self.maxNumberOfThreads)

        self.rsClient = ResourceStatusClient()

        self.clients['ResourceStatusClient'] = self.rsClient
        self.clients['ResourceManagementClient'] = ResourceManagementClient()
        self.clients['PilotsDB'] = PilotAgentsDB()

        return S_OK()
예제 #37
0
  def __init__( self, args = None, clients = None ):
    """ Constructor.

    :Parameters:
      **args** - [, `dict` ]
        arguments to be used in the _prepareCommand method ( name and
        timespan are the expected ones )
      **clients** - [, `dict` ]
        clients from where information is fetched. Mainly used to avoid
        creating new connections on agents looping over clients.
        'PilotsDB' and 'ResourceManagementClient' entries are reused when
        present in self.apis; otherwise new objects are created.
    """

    super( PilotCommand, self ).__init__( args, clients )

    apis = self.apis
    self.pilotsDB = apis[ 'PilotsDB' ] if 'PilotsDB' in apis else PilotAgentsDB()
    self.rmClient = ( apis[ 'ResourceManagementClient' ]
                      if 'ResourceManagementClient' in apis
                      else ResourceManagementClient() )
예제 #38
0
class PilotCommand( Command ):
  """ Pilot 'master' Command.

  doNew reads the pilots summary from pilotsDB and stores it through rmClient,
  doCache reads the stored records back and doMaster runs doNew for all the
  computing elements.
  """

  def __init__( self, args = None, clients = None ):
    """ Constructor.

    :Parameters:
      **args** - [, `dict` ]
        arguments used by the _prepareCommand method ( name and timespan are
        the expected ones )
      **clients** - [, `dict` ]
        clients from where information is fetched. Passing them avoids
        creating new connections on agents looping over clients.
        ResourceManagementClient and PilotsDB are the two looked up here.
    """

    super( PilotCommand, self ).__init__( args, clients )

    # Reuse the clients found in self.apis; the constructors are only called
    # when the corresponding key is missing.
    self.pilotsDB = self.apis[ 'PilotsDB' ] if 'PilotsDB' in self.apis else PilotAgentsDB()

    self.rmClient = ( self.apis[ 'ResourceManagementClient' ]
                      if 'ResourceManagementClient' in self.apis
                      else ResourceManagementClient() )

  def _storeCommand( self, result ):
    """ Stores the results of doNew method on the database.

    :Parameters:
      **result** - `list( dict )`
        list of dictionaries to be inserted on the DB, one call per dictionary
        ( the original author notes there is no bulk insertion method ). The
        dictionaries are prepared in doNew.

    :return: S_OK / the first failing answer of addOrModifyPilotCache
    """

    for pilotDict in result:

      # The client is called with keyword arguments whose first letter is
      # lower case ( 'Running' -> 'running', 'CE' -> 'cE' ).
      keywordArgs = dict( ( name[ 0 ].lower() + name[ 1: ], content )
                          for name, content in pilotDict.iteritems() )

      resQuery = self.rmClient.addOrModifyPilotCache( **keywordArgs )
      if not resQuery[ 'OK' ]:
        # Stop at the first failure, remaining dictionaries are not stored
        return resQuery

    return S_OK()

  def _prepareCommand( self ):
    """ Extracts from self.args the arguments the command needs:
      name : name of the computing element
      timespan ( seconds ) : time window

    :return: S_OK( ( name, timespan ) ) / S_ERROR if one of them is missing
    """

    arguments = self.args

    if 'name' not in arguments:
      return S_ERROR( '"name" not found in self.args' )
    if 'timespan' not in arguments:
      return S_ERROR( '"timespan" not found in self.args' )

    return S_OK( ( arguments[ 'name' ], arguments[ 'timespan' ] ) )

  def doNew( self, masterParams = None ):
    """ Queries pilotsDB for the pilots summary and stores it on the database.
    On master execution the name is set to '' ( per the original note, so that
    all ce's are asked ). The entries with name Unknown, NotAssigned and Total
    are skipped.

    :Parameters:
      **masterParams** - [, bool ]
        if True, self.args[ 'name' ] is overwritten with ''

    :return: S_OK( list ( dict ) ) / S_ERROR
    """

    if masterParams is True:
      # Ask for all CEs
      self.args[ 'name' ] = ''

    params = self._prepareCommand()
    if not params[ 'OK' ]:
      return params
    computingElement, timespan = params[ 'Value' ]

    # The time window ends now ( UTC ) and spans `timespan` seconds
    endTimeWindow   = datetime.utcnow()
    startTimeWindow = endTimeWindow - timedelta( seconds = timespan )

    pilotsRes = self.pilotsDB.getPilotSummaryShort( startTimeWindow,
                                                    endTimeWindow,
                                                    computingElement )
    if not pilotsRes[ 'OK' ]:
      return pilotsRes

    # Statuses expected by the ResourceManagementDB schema ( per the original
    # note ). Each record starts with all of them at 0, so statuses without
    # pilots are still present.
    pilotStatuses = [ 'Scheduled', 'Waiting', 'Submitted', 'Running', 'Done', 'Aborted',
                      'Cancelled', 'Deleted', 'Failed', 'Held', 'Killed', 'Stalled' ]
    skippedNames = ( 'Total', 'Unknown', 'NotAssigned' )

    uniformResult = []

    for ceName, pilotDict in pilotsRes[ 'Value' ].items():

      if ceName in skippedNames:
        continue

      record = dict( ( status, 0 ) for status in pilotStatuses )
      record.update( pilotDict )
      record[ 'Timespan' ] = timespan
      record[ 'CE' ]       = ceName

      uniformResult.append( record )

    storeRes = self._storeCommand( uniformResult )
    if not storeRes[ 'OK' ]:
      return storeRes

    return S_OK( uniformResult )

  def doCache( self ):
    """ Reads the records through rmClient.selectPilotCache instead of asking
    pilotsDB. If successful, returns a list of dictionaries, one per record,
    keyed by the column names of the answer.

    :return: S_OK( list( dict ) ) / S_ERROR
    """

    params = self._prepareCommand()
    if not params[ 'OK' ]:
      return params
    computingElement, timespan = params[ 'Value' ]

    # Reference time handed over as the 'older' bound on LastCheckTime. The
    # original author intends it to keep out-of-date records away.
    # NOTE(review): confirm the direction of the 'older' comparison in the client.
    lastValidRecord = datetime.utcnow() - timedelta( seconds = timespan )

    result = self.rmClient.selectPilotCache( cE = computingElement, timespan = timespan,
                                             meta = { 'older' : ( 'LastCheckTime', lastValidRecord ) } )
    if not result[ 'OK' ]:
      return result

    # 'Columns' is only read when there is at least one row to convert
    return S_OK( [ dict( zip( result[ 'Columns' ], row ) ) for row in result[ 'Value' ] ] )

  def doMaster( self ):
    """ Master method, runs doNew with masterParams = True. A failure is not
    returned: its message is appended to self.metrics[ 'failed' ].

    :return: S_OK( self.metrics )
    """

    outcome = self.doNew( masterParams = True )
    if not outcome[ 'OK' ]:
      self.metrics[ 'failed' ].append( outcome[ 'Message' ] )

    return S_OK( self.metrics )
예제 #39
0
class Matcher( object ):
  """ Logic for matching

      Given the description of a resource it selects a job from the task
      queue DB, reports the new job status and updates the pilot bookkeeping.
      Failures are signalled by raising RuntimeError ( ValueError for an
      illegal resource JDL ).
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor

        Every collaborator that is not supplied ( or evaluates to False ) is
        replaced by a default-constructed instance.

        :param pilotAgentsDB: used to update the pilot status and the pilot/job mapping
        :param jobDB: used for job attributes, JDL, parameters and the site mask
        :param tqDB: task queue DB used to match ( and, if needed, delete ) jobs
        :param jlDB: used to add job logging records
        :param opsHelper: source of the options read through getValue
    """
    self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
    self.jobDB = jobDB or JobDB()
    self.tqDB = tqDB or TaskQueueDB()
    self.jlDB = jlDB or JobLoggingDB()
    self.opsHelper = opsHelper or Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: JDL string or dictionary describing the resource
        :param credDict: credentials dictionary ( 'properties', 'group' and 'DN' are read )
        :return: dictionary with 'JDL', 'JobID', 'DN', 'Group' and
                 'PilotInfoReportedFlag', plus whatever getJobOptParameters returns
        :raise RuntimeError: when no job is matched or any of the DB calls fails
        :raise ValueError: when the resource JDL cannot be parsed
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      # The success value of this method is a plain dictionary, so an S_ERROR
      # structure handed back here would be indistinguishable from a matched
      # job for the caller. Fail the same way as every other error below.
      raise RuntimeError( result['Message'] )
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if resAtt['Value']['Status'] != 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      # The task queue entry is stale, remove it before giving up
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        raise RuntimeError( result['Message'] )
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    # The pilot information is only reported when the resource did not flag
    # it as already reported; the pilot/job mapping is updated in any case.
    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)

        Parses the description, applies the credentials restrictions, checks
        the pilot version and forces JobType to 'Test' when the site is not
        in the active mask.

        :return: the resource dictionary
        :raise RuntimeError: propagated from the credentials, version or mask checks
        :raise ValueError: propagated from the description parsing
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
        It can be a JDL string or a dictionary; only the known fields are
        copied to the result.

        :return: the resource dictionary
        :raise ValueError: when a JDL string cannot be parsed
    """

    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      # Dictionary input: values are copied as they are. Membership is tested
      # with `in` throughout ( has_key is deprecated ).
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here: problems are only logged.
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :return: True if resourceDict['Site'] is in the 'Active' site mask, False otherwise
        :raise RuntimeError: when 'Site' is missing or the mask cannot be retrieved
    """
    if 'Site' not in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    return resourceDict['Site'] in maskList

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it

        Nothing is done when the resource carries no 'PilotReference'.
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information

        Nothing is done when the resource carries no 'PilotReference';
        failures are only logged.
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials

        Restricts the 'OwnerGroup' and/or 'OwnerDN' of the resource
        dictionary according to the properties found in credDict.

        :return: the ( modified in place ) resource dictionary
        :raise RuntimeError: when the Registry lookups fail
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version

        Skipped when the "Pilot/CheckVersion" option is False. 'ReleaseVersion'
        takes precedence over 'DIRACVersion'. The version is compared with
        "Pilot/Version" and, when "Pilot/Project" is set, the project too.

        :raise RuntimeError: when the version or the project is missing or not accepted
    """
    if not self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      return

    if 'ReleaseVersion' in resourceDict:
      pilotVersion = resourceDict['ReleaseVersion']
    elif 'DIRACVersion' in resourceDict:
      pilotVersion = resourceDict['DIRACVersion']
    else:
      raise RuntimeError( 'Version check requested and not provided by Pilot' )

    # An empty list of valid versions accepts any pilot version
    validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
    if validVersions and pilotVersion not in validVersions:
      raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                          ( pilotVersion, ",".join( validVersions ) ) )
    # Check project if requested
    validProject = self.opsHelper.getValue( "Pilot/Project", "" )
    if validProject:
      if 'ReleaseProject' not in resourceDict:
        raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
      if resourceDict[ 'ReleaseProject' ] != validProject:
        raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                 resourceDict[ 'ReleaseProject' ] ) )
예제 #40
0
class PilotStatusAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    queryStateList = ["Ready", "Submitted", "Running", "Waiting", "Scheduled"]
    finalStateList = ["Done", "Aborted", "Cleared", "Deleted", "Failed"]
    identityFieldsList = ["OwnerDN", "OwnerGroup", "GridType", "Broker"]
    eligibleGridTypes = ["gLite"]

    #############################################################################
    def initialize(self):
        """Sets the default agent options and creates the PilotAgentsDB handle.

        :return: S_OK()
        """

        # (option name, default value) pairs, set in this order
        defaultOptions = (
            ("PollingTime", 120),
            ("GridEnv", ""),
            ("PilotStalledDays", 3),
        )
        for optionName, optionValue in defaultOptions:
            self.am_setOption(optionName, optionValue)

        self.pilotDB = PilotAgentsDB()
        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Reads the agent options, resolves the grid environment (falling back
        to the WorkloadManagement GridEnv of the current setup), opens a
        connection to the pilot DB and runs one update cycle with it.

        The connection is closed on every exit path, including early returns
        and exceptions.

        :return: S_OK() / the S_ERROR structure of the first failing DB call
        """

        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        try:
            return self._executeWithConnection(connection)
        finally:
            connection.close()

    def _executeWithConnection(self, connection):
        """Run one update cycle using an already opened DB connection.

        For every identity group (OwnerDN, OwnerGroup, GridType, Broker) with
        an eligible grid type: clears the pilots stuck in Waiting, queries the
        grid status of the selected pilots in chunks of MAX_JOBS_QUERY,
        updates the non-final statuses directly and accumulates the final
        ones for accounting. Finally handles the pilots not updated for
        pilotStalledDays days.

        :param connection: DB connection; it is NOT closed here
        :return: S_OK() / the S_ERROR structure of the first failing DB call
        """

        result = self.pilotDB.getPilotGroups(self.identityFieldsList, {"Status": self.queryStateList})
        if not result["OK"]:
            self.log.error("Fail to get identities Groups", result["Message"])
            return result
        if not result["Value"]:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result["Value"]:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose("Getting pilots for %s:%s @ %s %s" % (ownerDN, ownerGroup, gridType, broker))

            # Pilots already reported as Done by the JobAgent
            condDict1 = {
                "Status": "Done",
                "StatusReason": "Report from JobAgent",
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            # Pilots still in one of the non-final states
            condDict2 = {
                "Status": self.queryStateList,
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to clear Waiting Pilot Jobs")

                result = self.pilotDB.selectPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to get the Pilot Agents")
                    return result
                if not result["Value"]:
                    continue
                refList = result["Value"]

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret["OK"]:
                    self.log.error(ret["Message"])
                    self.log.error("Could not get proxy:", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret["Value"]

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" % (len(refList), ownerDN, ownerGroup)
                )

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index : start_index + MAX_JOBS_QUERY]
                    self.log.verbose(
                        "Querying %d pilots of %s starting at %d" % (len(refsToQuery), len(refList), start_index)
                    )
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result["OK"]:
                        if result["Message"] == "Broker not Available":
                            # No point in asking the same broker for the remaining chunks
                            self.log.error("Broker %s not Available" % broker)
                            break
                        self.log.warn("Failed to get pilot status:")
                        self.log.warn("%s:%s @ %s" % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result["Value"]
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict["isParent"]:
                                self.log.verbose("Clear parametric parent %s" % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result["OK"]:
                                    self.log.warn(result["Message"])
                                else:
                                    self.log.info("Parametric parent removed: %s" % pRef)
                            if pDict["FinalStatus"]:
                                # Final statuses are written by accountPilots
                                self.log.verbose("Marking Status for %s to %s" % (pRef, pDict["Status"]))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                                result = self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict["Status"],
                                    pDict["DestinationSite"],
                                    updateTime=pDict["StatusDate"],
                                    conn=connection,
                                )

                    # Flush the accumulated final statuses in batches
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted. handleOldPilots logs its own
        # failures, which do not fail the cycle.
        self.handleOldPilots(connection)

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """Clear pilots in the faulty Waiting state.

        Selects the Waiting pilots of the identity given by condDict that are
        older than MAX_WAITING_STATE_LENGTH hours and sets them to Stalled.

        :param condDict: dictionary providing "OwnerDN", "OwnerGroup",
                         "GridType" and "Broker"; other keys are ignored
        :return: S_OK() / the S_ERROR structure of selectPilots. A failure to
                 update an individual pilot is logged and does not stop the loop.
        """

        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            "Status": "Waiting",
            "OwnerDN": condDict["OwnerDN"],
            "OwnerGroup": condDict["OwnerGroup"],
            "GridType": condDict["GridType"],
            "Broker": condDict["Broker"],
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result["OK"]:
            self.log.warn("Failed to get the Pilot Agents fpr Waiting state")
            return result
        if not result["Value"]:
            return S_OK()
        refList = result["Value"]

        for pilotRef in refList:
            # The status written is Stalled, so the log reports the same
            self.log.info("Setting Waiting pilot to Stalled: %s" % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, "Stalled", statusReason="Exceeded max waiting time")
            if not result["OK"]:
                self.log.warn("Failed to set pilot %s to Stalled: %s" % (pilotRef, result["Message"]))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """Clear the parametric parent job from the PilotAgentsDB.

        If at least one child is already in the database the parent is simply
        deleted. Otherwise the children are first added with the task queue
        and identity of the parent, their status is set, and only then the
        parent is deleted.

        :param pRef: reference of the parent pilot
        :param pDict: status dictionary of the parent; "ChildRefs" is read, and
                      "ChildDicts" when the children have to be added
        :param connection: DB connection passed to the pilot DB calls
        :return: the result of deletePilot, or an S_ERROR structure when the
                 parent information or the children could not be handled
        """

        childList = pDict["ChildRefs"]

        # One child known to the database is enough, stop at the first hit
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result["OK"] and result["Value"]:
                children_ok = True
                break

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose("Adding children for parent %s" % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result["OK"]:
            return result
        if not result["Value"] or pRef not in result["Value"]:
            return S_ERROR("No information found for parent pilot %s" % pRef)
        parentInfo = result["Value"][pRef]
        tqID = parentInfo["TaskQueueID"]
        ownerDN = parentInfo["OwnerDN"]
        ownerGroup = parentInfo["OwnerGroup"]
        broker = parentInfo["Broker"]
        gridType = parentInfo["GridType"]
        result = self.pilotDB.addPilotTQReference(
            childList, tqID, ownerDN, ownerGroup, broker=broker, gridType=gridType
        )
        if not result["OK"]:
            return result

        # Try every child even if one of them fails, then decide
        children_added = True
        for chRef, chDict in pDict["ChildDicts"].items():
            result = self.pilotDB.setPilotStatus(
                chRef, chDict["Status"], destination=chDict["DestinationSite"], conn=connection
            )
            if not result["OK"]:
                children_added = False
        if not children_added:
            return S_ERROR("Failed to add children")

        # Report the outcome of the deletion, as the branch above does
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """Declare Deleted the pilots not updated in the last N days.

        N is self.pilotStalledDays. The pilots selected are those in one of
        the queryStateList states whose LastUpdateTime is older than the
        limit; they are handed to accountPilots in batches.

        :param connection: DB connection passed on to accountPilots
        :return: S_OK() / the S_ERROR structure of the failing DB call
        """
        # The time limit is passed as a string. An earlier version of this
        # call used older=None, which the author flagged as a likely bug.
        staleBefore = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        selection = self.pilotDB.selectPilots(
            {"Status": self.queryStateList}, older=staleBefore, timeStamp="LastUpdateTime"
        )
        if not selection["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return selection

        staleRefs = selection["Value"]
        if not staleRefs:
            return S_OK()

        infoResult = self.pilotDB.getPilotInfo(staleRefs)
        if not infoResult["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return infoResult
        pilotsDict = infoResult["Value"]

        batch = {}
        for pRef in pilotsDict:
            # The DB record itself is updated and reused as accounting input
            pilotInfo = pilotsDict[pRef]
            pilotInfo["Status"] = "Deleted"
            pilotInfo["StatusDate"] = Time.dateTime()
            batch[pRef] = pilotInfo
            # Flush once the batch holds more than 100 pilots
            if len(batch) > 100:
                self.accountPilots(batch, connection)
                batch = {}

        # Whatever is left (possibly nothing) goes in a last call
        self.accountPilots(batch, connection)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """Account for pilots and write their status to the pilot DB.

        When the "PilotAccountingEnabled" option is "yes" (case insensitive,
        the default) the DB records of the pilots are completed with the new
        status and sent as accounting reports. The statuses are written to
        the DB only if accounting is disabled or the reports were committed.

        :param pilotsToAccount: dictionary pilot reference -> dictionary with
                                "Status", "DestinationSite" and "StatusDate"
        :param connection: DB connection passed to the pilot DB calls
        :return: S_OK() / the S_ERROR structure of getPilotInfo or of the
                 accounting report creation
        """
        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots", retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref in dbData:
                # Records already in a final state keep what the DB holds
                if pref in pilotsToAccount and dbData[pref]["Status"] not in self.finalStateList:
                    dbData[pref]["Status"] = pilotsToAccount[pref]["Status"]
                    dbData[pref]["DestinationSite"] = pilotsToAccount[pref]["DestinationSite"]
                    dbData[pref]["LastUpdateTime"] = pilotsToAccount[pref]["StatusDate"]

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                self.log.error("Fail to add accounting report for pilots", retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports", retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                # StatusDate goes in by keyword, as in execute(), so it cannot
                # land in a different positional parameter of setPilotStatus
                self.pilotDB.setPilotStatus(
                    pRef,
                    pDict["Status"],
                    pDict["DestinationSite"],
                    updateTime=pDict["StatusDate"],
                    conn=connection,
                )

        return S_OK()

    #############################################################################
    def getPilotStatus(self, proxy, gridType, pilotRefList):
        """Query the grid WMS for the status of the given pilot references.

        :param proxy: proxy of the pilots' owner, passed to executeGridCommand
        :param str gridType: "LCG" (edg-job-status) or "gLite" (glite-wms-job-status)
        :param list pilotRefList: grid job references appended to the command line
        :return: S_OK(dict) mapping each pilot reference found in the command
                 output to its status dictionary; S_ERROR() for an unsupported
                 grid type or a failed command; S_ERROR("Broker not Available")
                 when the EDG output reports a refused connection
        """

        if gridType == "LCG":
            cmd = ["edg-job-status"]
        elif gridType == "gLite":
            cmd = ["glite-wms-job-status"]
        else:
            return S_ERROR()
        cmd.extend(pilotRefList)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        self.log.info("%s Job Status Execution Time for %d jobs:" % (gridType, len(pilotRefList)), time.time() - start)

        if not ret["OK"]:
            self.log.error("Failed to execute %s Job Status" % gridType, ret["Message"])
            return S_ERROR()

        # ret["Value"] is indexed as (return code, stdout, stderr)
        stdout = ret["Value"][1]
        resultDict = {}

        if ret["Value"][0] != 0:
            stderr = ret["Value"][2]
            status = "Deleted"
            statusDate = Time.dateTime()
            isFinal = status in self.finalStateList

            def newDeletedDict():
                # A fresh dictionary (and a fresh ChildRefs list) per pilot, so
                # that modifying one pilot's entry can never affect the others.
                return {
                    "Status": status,
                    "DestinationSite": "Unknown",
                    "StatusDate": statusDate,
                    "isChild": False,
                    "isParent": False,
                    "ParentRef": False,
                    "FinalStatus": isFinal,
                    "ChildRefs": [],
                }

            # Glite returns this error for Deleted jobs to std.err
            for job in List.fromChar(stderr, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                resultDict[pRef] = newDeletedDict()
                self.pilotDB.setPilotStatus(pRef, "Deleted")
            # EDG returns a similar error for Deleted jobs to std.out
            for job in List.fromChar(stdout, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                if "No such file or directory: no matching jobs found" in job:
                    resultDict[pRef] = newDeletedDict()
                    self.pilotDB.setPilotStatus(pRef, "Deleted")
                # Plain substring test: the former regular expression ended in
                # "()", which a regex reads as an empty group, so it matched
                # exactly this text (without the parentheses).
                if "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect" in job:
                    # the Broker is not accessible
                    return S_ERROR("Broker not Available")
            if not resultDict:
                # Non-zero exit code and not a single "deleted job" report:
                # treat it as a genuine command failure.
                self.log.error(
                    "Error executing %s Job Status:" % gridType, str(ret["Value"][0]) + "\n".join(ret["Value"][1:3])
                )
                return S_ERROR()
            return S_OK(resultDict)

        # Successful command: one section per job, introduced by this marker
        for job in List.fromChar(stdout, "\nStatus info for the Job :")[1:]:
            pRef = List.fromChar(job, "\n")[0].strip()
            resultDict[pRef] = self.__parseJobStatus(job, gridType)

        return S_OK(resultDict)

    def __parseJobStatus(self, job, gridType):
        """Parse the section of the grid status command output describing one job.

        :param str job: text of the job section
        :param str gridType: "LCG" or "gLite"; selects which date line is parsed
        :return: dict with the keys Status, DestinationSite, StatusDate, isChild,
                 isParent, ParentRef, FinalStatus, ChildRefs and ChildDicts
                 (ChildDicts maps each child reference to its own parsed dict)
        """

        # Raw strings: "\s", "\w" and "\(" are regex escapes, not string escapes
        statusRE = r"Current Status:\s*(\w*)"
        destinationRE = r"Destination:\s*([\w\.-]*)"
        statusDateLCGRE = r"reached on:\s*....(.*)"
        submittedDateRE = r"Submitted:\s*....(.*)"
        statusFailedRE = r"Current Status:.*\(Failed\)"

        status = None
        destination = "Unknown"
        statusDate = None
        submittedDate = None

        try:
            # A missing status line raises AttributeError here; it is logged
            # below and the defaults set above are kept.
            status = re.search(statusRE, job).group(1)
            if status == "Done" and re.search(statusFailedRE, job):
                status = "Failed"
            destinationMatch = re.search(destinationRE, job)
            if destinationMatch:
                destination = destinationMatch.group(1)
            # The dates are assigned only once fully converted, so a date that
            # fails to parse stays None instead of leaking the unparsed text
            # into the date arithmetic further down.
            if gridType == "LCG":
                dateMatch = re.search(statusDateLCGRE, job)
                if dateMatch:
                    statusDate = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.strptime(dateMatch.group(1), "%b %d %H:%M:%S %Y")
                    )
            if gridType == "gLite":
                dateMatch = re.search(submittedDateRE, job)
                if dateMatch:
                    submittedDate = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.strptime(dateMatch.group(1), "%b %d %H:%M:%S %Y %Z")
                    )
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit are not swallowed.
            self.log.exception("Error parsing %s Job Status output:\n" % gridType, job)

        isParent = bool(re.search("Nodes information", job))
        isChild = bool(re.search("Parent Job", job))

        if status == "Running":
            # Pilots can be in Running state for too long, due to bugs in the WMS
            if statusDate:
                statusTime = Time.fromString(statusDate)
                delta = Time.dateTime() - statusTime
                if delta > 4 * Time.day:
                    self.log.info("Setting pilot status to Deleted after 4 days in Running")
                    status = "Deleted"
                    # From here on StatusDate is the result of date arithmetic,
                    # no longer the formatted string.
                    statusDate = statusTime + 4 * Time.day
            elif submittedDate:
                statusTime = Time.fromString(submittedDate)
                delta = Time.dateTime() - statusTime
                if delta > 7 * Time.day:
                    self.log.info("Setting pilot status to Deleted more than 7 days after submission still in Running")
                    status = "Deleted"
                    statusDate = statusTime + 7 * Time.day

        childRefs = []
        childDicts = {}
        if isParent:
            # Each child has its own sub-section, parsed recursively
            for subjob in List.fromChar(job, " Status info for the Job :")[1:]:
                chRef = List.fromChar(subjob, "\n")[0].strip()
                childRefs.append(chRef)
                childDicts[chRef] = self.__parseJobStatus(subjob, gridType)

        return {
            "Status": status,
            "DestinationSite": destination,
            "StatusDate": statusDate,
            "isChild": isChild,
            "isParent": isParent,
            "ParentRef": False,
            "FinalStatus": status in self.finalStateList,
            "ChildRefs": childRefs,
            "ChildDicts": childDicts,
        }

    def __addPilotsAccountingReport(self, pilotsData):
        """Register one PilotAccounting record per pilot with gDataStoreClient.

        :param dict pilotsData: pilot reference -> pilot information dictionary
        :return: S_OK() once every record has been registered, otherwise the
                 first failed addRegister result
        """
        for pilotInfo in pilotsData.values():
            record = PilotAccounting()
            record.setEndTime(pilotInfo["LastUpdateTime"])
            record.setStartTime(pilotInfo["SubmissionTime"])

            # Owner: fall back to a placeholder when the DN cannot be resolved
            ownerDN = pilotInfo["OwnerDN"]
            userLookup = CS.getUsernameForDN(ownerDN)
            if userLookup["OK"]:
                userName = userLookup["Value"]
            else:
                userName = "******"
                self.log.error("Can't determine username for dn:", ownerDN)
            record.setValueByKey("User", userName)
            record.setValueByKey("UserGroup", pilotInfo["OwnerGroup"])

            # Site: "Unknown" when the lookup fails or yields a blank name
            ceName = pilotInfo["DestinationSite"]
            siteLookup = getSiteForCE(ceName)
            siteName = siteLookup["Value"].strip() if siteLookup["OK"] else ""
            record.setValueByKey("Site", siteName or "Unknown")

            record.setValueByKey("GridCE", ceName)
            record.setValueByKey("GridMiddleware", pilotInfo["GridType"])
            record.setValueByKey("GridResourceBroker", pilotInfo["Broker"])
            record.setValueByKey("GridStatus", pilotInfo["Status"])
            jobCount = len(pilotInfo["Jobs"]) if "Jobs" in pilotInfo else 0
            record.setValueByKey("Jobs", jobCount)

            self.log.verbose("Added accounting record for pilot %s" % pilotInfo["PilotID"])
            registered = gDataStoreClient.addRegister(record)
            if not registered["OK"]:
                return registered
        return S_OK()
예제 #41
0
class PilotStatusAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
  finalStateList = [ 'Done', 'Aborted', 'Cleared', 'Deleted', 'Failed' ]
  identityFieldsList = [ 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker' ]
  eligibleGridTypes = [ 'gLite' ]

  #############################################################################
  def initialize( self ):
    """ Register the default agent options and create the DB / admin clients.

        :return: S_OK()
    """

    defaultOptions = ( ( 'PollingTime', 120 ),
                       ( 'GridEnv', '' ),
                       ( 'PilotStalledDays', 3 ) )
    for optionName, optionValue in defaultOptions:
      self.am_setOption( optionName, optionValue )

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    return S_OK()

  #############################################################################
  def execute( self ):
    """ The PilotAgent execution method: one polling cycle.

        Reads the PilotStalledDays and GridEnv options, opens a PilotAgentsDB
        connection and hands it to _updatePilots(), which does the actual work.
        The connection is closed on every exit path.

        :return: the failed _getConnection() result, otherwise whatever
                 _updatePilots() returns
    """

    self.pilotStalledDays = self.am_getOption( 'PilotStalledDays', 3 )
    self.gridEnv = self.am_getOption( 'GridEnv' )
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if setup:
        instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
        if instance:
          self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

    result = self.pilotDB._getConnection()
    if not result['OK']:
      return result
    connection = result['Value']

    try:
      return self._updatePilots( connection )
    finally:
      # Release the connection whatever happens, including the early returns
      # and any exception raised while updating the pilots
      connection.close()

  def _updatePilots( self, connection ):
    """ Query the grid for the pilots in a non-final state and store the outcome.

        Pilots are processed per ( OwnerDN, OwnerGroup, GridType, Broker ) group,
        in slices of MAX_JOBS_QUERY references. Pilots found in a final state
        are collected and handed to accountPilots(); the others get their
        status updated immediately. Finally the pilots that were not updated
        for a long time are handled by handleOldPilots().

        :param connection: open PilotAgentsDB connection
        :return: S_OK(), or the failed result of the group / pilot selection
    """

    result = self.pilotDB.getPilotGroups( self.identityFieldsList,
                                         {'Status': self.queryStateList } )
    if not result['OK']:
      self.log.error( 'Fail to get identities Groups', result['Message'] )
      return result
    if not result['Value']:
      return S_OK()
    pilotGroups = result['Value']

    pilotsToAccount = {}

    for ownerDN, ownerGroup, gridType, broker in pilotGroups:

      if gridType not in self.eligibleGridTypes:
        continue

      self.log.verbose( 'Getting pilots for %s:%s @ %s %s' % ( ownerDN, ownerGroup, gridType, broker ) )

      condDict1 = {'Status':'Done',
                   'StatusReason':'Report from JobAgent',
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      condDict2 = {'Status':self.queryStateList,
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      for condDict in [ condDict1, condDict2]:
        result = self.clearWaitingPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to clear Waiting Pilot Jobs' )

        result = self.pilotDB.selectPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to get the Pilot Agents' )
          return result
        if not result['Value']:
          continue
        refList = result['Value']

        ret = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup )
        if not ret['OK']:
          self.log.error( ret['Message'] )
          self.log.error( 'Could not get proxy:', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
          continue
        proxy = ret['Value']

        self.log.verbose( "Getting status for %s pilots for owner %s and group %s" % ( len( refList ),
                                                                                      ownerDN, ownerGroup ) )

        for start_index in range( 0, len( refList ), MAX_JOBS_QUERY ):
          refsToQuery = refList[ start_index : start_index + MAX_JOBS_QUERY ]
          self.log.verbose( 'Querying %d pilots of %s starting at %d' %
                            ( len( refsToQuery ), len( refList ), start_index ) )
          result = self.getPilotStatus( proxy, gridType, refsToQuery )
          if not result['OK']:
            if result['Message'] == 'Broker not Available':
              # No point in querying the remaining slices of this selection
              self.log.error( 'Broker %s not Available' % broker )
              break
            self.log.warn( 'Failed to get pilot status:' )
            self.log.warn( '%s:%s @ %s' % ( ownerDN, ownerGroup, gridType ) )
            continue

          statusDict = result[ 'Value' ]
          for pRef in statusDict:
            pDict = statusDict[ pRef ]
            if pDict:
              if pDict['isParent']:
                self.log.verbose( 'Clear parametric parent %s' % pRef )
                result = self.clearParentJob( pRef, pDict, connection )
                if not result['OK']:
                  self.log.warn( result['Message'] )
                else:
                  self.log.info( 'Parametric parent removed: %s' % pRef )
              if pDict[ 'FinalStatus' ]:
                # Final states are written by accountPilots()
                self.log.verbose( 'Marking Status for %s to %s' % ( pRef, pDict['Status'] ) )
                pilotsToAccount[ pRef ] = pDict
              else:
                self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
                self.pilotDB.setPilotStatus( pRef,
                                             pDict['Status'],
                                             pDict['DestinationSite'],
                                             updateTime = pDict['StatusDate'],
                                             conn = connection )

          # Flush in batches to keep the accounting commits small
          if len( pilotsToAccount ) > 100:
            self.accountPilots( pilotsToAccount, connection )
            pilotsToAccount = {}

    self.accountPilots( pilotsToAccount, connection )
    # Now handle pilots not updated in the last N days (most likely the Broker is no
    # longer available) and declare them Deleted.
    self.handleOldPilots( connection )

    return S_OK()

  def clearWaitingPilots( self, condDict ):
    """ Mark as Stalled the pilots stuck in the Waiting state for too long.

        Selects the Waiting pilots of the given owner / group / grid type /
        broker that are older than MAX_WAITING_STATE_LENGTH hours and sets
        their status to 'Stalled'.

        :param dict condDict: must provide OwnerDN, OwnerGroup, GridType and Broker
        :return: S_OK(), or the failed selectPilots result
    """

    last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
    clearDict = {'Status':'Waiting',
                 'OwnerDN':condDict['OwnerDN'],
                 'OwnerGroup':condDict['OwnerGroup'],
                 'GridType':condDict['GridType'],
                 'Broker':condDict['Broker']}
    result = self.pilotDB.selectPilots( clearDict, older = last_update )
    if not result['OK']:
      self.log.warn( 'Failed to get the Pilot Agents for Waiting state' )
      return result
    if not result['Value']:
      return S_OK()
    refList = result['Value']

    for pilotRef in refList:
      # The message names the status that is really written ('Stalled')
      self.log.info( 'Setting Waiting pilot to Stalled: %s' % pilotRef )
      result = self.pilotDB.setPilotStatus( pilotRef, 'Stalled', statusReason = 'Exceeded max waiting time' )
      if not result['OK']:
        # Best effort: report the failure and carry on with the other pilots
        self.log.warn( 'Failed to set pilot %s to Stalled: %s' % ( pilotRef, result['Message'] ) )

    return S_OK()

  def clearParentJob( self, pRef, pDict, connection ):
    """ Clear the parametric parent job from the PilotAgentsDB.

        When at least one child is already in the database the parent is simply
        deleted. Otherwise the children are first registered with the parent's
        task queue and identity, their status is set, and only then is the
        parent deleted.

        :param str pRef: reference of the parent pilot
        :param dict pDict: parsed status of the parent; ChildRefs and ChildDicts are used
        :param connection: PilotAgentsDB connection
        :return: result of deletePilot on success; the failed DB result or
                 S_ERROR when the children could not be added
    """

    childList = pDict['ChildRefs']

    # Check that at least one child is in the database; the first hit is enough
    children_ok = False
    for child in childList:
      result = self.pilotDB.getPilotInfo( child, conn = connection )
      if result['OK'] and result['Value']:
        children_ok = True
        break

    if children_ok:
      return self.pilotDB.deletePilot( pRef, conn = connection )

    self.log.verbose( 'Adding children for parent %s' % pRef )
    result = self.pilotDB.getPilotInfo( pRef )
    if not result['OK']:
      return result
    if not result['Value'] or pRef not in result['Value']:
      # Without the parent record there is no identity to give the children
      return S_ERROR( 'No information found for parent pilot %s' % pRef )
    parentInfo = result['Value'][pRef]
    tqID = parentInfo['TaskQueueID']
    ownerDN = parentInfo['OwnerDN']
    ownerGroup = parentInfo['OwnerGroup']
    broker = parentInfo['Broker']
    gridType = parentInfo['GridType']
    result = self.pilotDB.addPilotTQReference( childList, tqID, ownerDN, ownerGroup,
                                              broker = broker, gridType = gridType )
    if not result['OK']:
      return result

    children_added = True
    for chRef, chDict in pDict['ChildDicts'].items():
      result = self.pilotDB.setPilotStatus( chRef, chDict['Status'],
                                           destination = chDict['DestinationSite'],
                                           conn = connection )
      if not result['OK']:
        children_added = False
    if not children_added:
      return S_ERROR( 'Failed to add children' )

    # Report the outcome of the deletion, as the branch above does
    return self.pilotDB.deletePilot( pRef, conn = connection )

  def handleOldPilots( self, connection ):
    """ Declare Deleted the pilots not updated in the last pilotStalledDays days.

        Pilots that still have a job updated within that period are left alone.
        The others are accounted through accountPilots() and then killed with
        _killPilots(), in batches.

        :param connection: PilotAgentsDB connection passed on to accountPilots
        :return: S_OK(), or the failed result of the pilot selection / info query
    """
    pilotsToAccount = {}
    timeLimitToConsider = Time.toString( Time.dateTime() - Time.day * self.pilotStalledDays )
    # Only pilots whose LastUpdateTime is older than the limit are considered
    result = self.pilotDB.selectPilots( { 'Status':self.queryStateList} ,
                                        older = timeLimitToConsider,
                                        timeStamp = 'LastUpdateTime' )
    if not result['OK']:
      self.log.error( 'Failed to get the Pilot Agents' )
      return result
    if not result['Value']:
      return S_OK()

    refList = result['Value']
    result = self.pilotDB.getPilotInfo( refList )
    if not result['OK']:
      self.log.error( 'Failed to get Info for Pilot Agents' )
      return result

    pilotsDict = result['Value']

    for pRef in pilotsDict:
      pilotInfo = pilotsDict[pRef]
      pilotJobs = pilotInfo.get( 'Jobs' )
      if pilotJobs and self._checkJobLastUpdateTime( pilotJobs, self.pilotStalledDays ):
        self.log.debug( '%s should not be deleted since one job of %s is running.' % ( str( pRef ) , str( pilotJobs ) ) )
        continue
      # The DB record itself is reused as the dictionary handed to accountPilots
      pilotInfo['Status'] = 'Deleted'
      pilotInfo['StatusDate'] = Time.dateTime()
      pilotsToAccount[ pRef ] = pilotInfo
      if len( pilotsToAccount ) > 100:
        self.accountPilots( pilotsToAccount, connection )
        self._killPilots( pilotsToAccount )
        pilotsToAccount = {}

    self.accountPilots( pilotsToAccount, connection )
    self._killPilots( pilotsToAccount )

    return S_OK()

  def accountPilots( self, pilotsToAccount, connection ):
    """ Send the accounting records of the given pilots and store their status.

        When the PilotAccountingEnabled option is "yes", the pilot records are
        read from the DB, completed with the freshly obtained status and
        committed to the accounting. The new status is written to the DB only
        when accounting is disabled or the records were sent successfully.

        :param dict pilotsToAccount: pilot reference -> dict with at least
                                     Status, DestinationSite and StatusDate
        :param connection: PilotAgentsDB connection
        :return: S_OK(), or the failed result of the DB query / record registration
    """
    accountingFlag = False
    pae = self.am_getOption( 'PilotAccountingEnabled', 'yes' )
    if pae.lower() == "yes":
      accountingFlag = True

    if not pilotsToAccount:
      self.log.info( 'No pilots to Account' )
      return S_OK()

    accountingSent = False
    if accountingFlag:
      # list(): hand over a real list of references, not a dictionary view
      retVal = self.pilotDB.getPilotInfo( list( pilotsToAccount ), conn = connection )
      if not retVal['OK']:
        self.log.error( 'Fail to retrieve Info for pilots', retVal['Message'] )
        return retVal
      dbData = retVal[ 'Value' ]
      for pref in dbData:
        if pref in pilotsToAccount:
          # Records already in a final state in the DB are accounted as they are
          if dbData[pref][ 'Status' ] not in self.finalStateList:
            dbData[pref][ 'Status' ] = pilotsToAccount[pref][ 'Status' ]
            dbData[pref][ 'DestinationSite' ] = pilotsToAccount[pref][ 'DestinationSite' ]
            dbData[pref][ 'LastUpdateTime' ] = pilotsToAccount[pref][ 'StatusDate' ]

      retVal = self.__addPilotsAccountingReport( dbData )
      if not retVal['OK']:
        self.log.error( 'Fail to add accounting reports for pilots', retVal['Message'] )
        return retVal

      self.log.info( "Sending accounting records..." )
      retVal = gDataStoreClient.commit()
      if not retVal[ 'OK' ]:
        self.log.error( "Can't send accounting reports", retVal[ 'Message' ] )
      else:
        self.log.info( "Accounting sent for %s pilots" % len( pilotsToAccount ) )
        accountingSent = True

    if not accountingFlag or accountingSent:
      for pRef in pilotsToAccount:
        pDict = pilotsToAccount[pRef]
        self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
        self.pilotDB.setPilotStatus( pRef,
                                     pDict['Status'],
                                     pDict['DestinationSite'],
                                     pDict['StatusDate'],
                                     conn = connection )

    return S_OK()

  #############################################################################
  def getPilotStatus( self, proxy, gridType, pilotRefList ):
    """ Get GRID job status information using the job's owner proxy and
        GRID job IDs.

        :param proxy: proxy of the pilots' owner, passed to executeGridCommand
        :param str gridType: 'LCG' (edg-job-status) or 'gLite' (glite-wms-job-status)
        :param list pilotRefList: grid job references appended to the command line
        :return: S_OK( dict ) mapping each pilot reference found in the command
                 output to its status dictionary; S_ERROR() for an unsupported
                 grid type or a failed command; S_ERROR( 'Broker not Available' )
                 when the EDG output reports a refused connection
    """

    if gridType == 'LCG':
      cmd = [ 'edg-job-status' ]
    elif gridType == 'gLite':
      cmd = [ 'glite-wms-job-status' ]
    else:
      return S_ERROR()
    cmd.extend( pilotRefList )

    start = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )
    self.log.info( '%s Job Status Execution Time for %d jobs:' %
                   ( gridType, len( pilotRefList ) ), time.time() - start )

    if not ret['OK']:
      self.log.error( 'Failed to execute %s Job Status' % gridType, ret['Message'] )
      return S_ERROR()

    # ret['Value'] is indexed as ( return code, stdout, stderr )
    stdout = ret['Value'][1]
    resultDict = {}

    if ret['Value'][0] != 0:
      stderr = ret['Value'][2]
      status = 'Deleted'
      statusDate = Time.dateTime()
      isFinal = status in self.finalStateList

      def newDeletedDict():
        # A fresh dictionary (and a fresh ChildRefs list) per pilot, so that
        # modifying one pilot's entry can never affect the others.
        return { 'Status': status,
                 'DestinationSite': 'Unknown',
                 'StatusDate': statusDate,
                 'isChild': False,
                 'isParent': False,
                 'ParentRef': False,
                 'FinalStatus' : isFinal,
                 'ChildRefs' : [] }

      # Glite returns this error for Deleted jobs to std.err
      for job in List.fromChar( stderr, '\nUnable to retrieve the status for:' )[1:]:
        pRef = List.fromChar( job, '\n' )[0].strip()
        resultDict[pRef] = newDeletedDict()
        self.pilotDB.setPilotStatus( pRef, "Deleted" )
      # EDG returns a similar error for Deleted jobs to std.out
      for job in List.fromChar( stdout, '\nUnable to retrieve the status for:' )[1:]:
        pRef = List.fromChar( job, '\n' )[0].strip()
        if "No such file or directory: no matching jobs found" in job:
          resultDict[pRef] = newDeletedDict()
          self.pilotDB.setPilotStatus( pRef, "Deleted" )
        # Plain substring test: the former regular expression ended in "()",
        # which a regex reads as an empty group, so it matched exactly this
        # text (without the parentheses).
        if "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect" in job:
          # the Broker is not accessible
          return S_ERROR( 'Broker not Available' )
      if not resultDict:
        # Non-zero exit code and not a single "deleted job" report: treat it
        # as a genuine command failure.
        self.log.error( 'Error executing %s Job Status:' %
                        gridType, str( ret['Value'][0] ) + '\n'.join( ret['Value'][1:3] ) )
        return S_ERROR()
      return S_OK( resultDict )

    # Successful command: one section per job, introduced by this marker
    for job in List.fromChar( stdout, '\nStatus info for the Job :' )[1:]:
      pRef = List.fromChar( job, '\n' )[0].strip()
      resultDict[pRef] = self.__parseJobStatus( job, gridType )

    return S_OK( resultDict )

  def __parseJobStatus( self, job, gridType ):
    """ Parse the section of the grid status command output describing one job.

        :param str job: text of the job section
        :param str gridType: 'LCG' or 'gLite'; selects which date line is parsed
        :return: dict with the keys Status, DestinationSite, StatusDate, isChild,
                 isParent, ParentRef, FinalStatus, ChildRefs and ChildDicts
                 (ChildDicts maps each child reference to its own parsed dict)
    """

    # Raw strings: "\s", "\w" and "\(" are regex escapes, not string escapes
    statusRE = r'Current Status:\s*(\w*)'
    destinationRE = r'Destination:\s*([\w\.-]*)'
    statusDateLCGRE = r'reached on:\s*....(.*)'
    submittedDateRE = r'Submitted:\s*....(.*)'
    statusFailedRE = r'Current Status:.*\(Failed\)'

    status = None
    destination = 'Unknown'
    statusDate = None
    submittedDate = None

    try:
      # A missing status line raises AttributeError here; it is logged below
      # and the defaults set above are kept.
      status = re.search( statusRE, job ).group( 1 )
      if status == 'Done' and re.search( statusFailedRE, job ):
        status = 'Failed'
      destinationMatch = re.search( destinationRE, job )
      if destinationMatch:
        destination = destinationMatch.group( 1 )
      # The dates are assigned only once fully converted, so a date that fails
      # to parse stays None instead of leaking the unparsed text into the date
      # arithmetic further down.
      if gridType == 'LCG':
        dateMatch = re.search( statusDateLCGRE, job )
        if dateMatch:
          statusDate = time.strftime( '%Y-%m-%d %H:%M:%S',
                                      time.strptime( dateMatch.group( 1 ), '%b %d %H:%M:%S %Y' ) )
      if gridType == 'gLite':
        dateMatch = re.search( submittedDateRE, job )
        if dateMatch:
          submittedDate = time.strftime( '%Y-%m-%d %H:%M:%S',
                                         time.strptime( dateMatch.group( 1 ), '%b %d %H:%M:%S %Y %Z' ) )
    except Exception:
      # Exception rather than a bare except, so that KeyboardInterrupt and
      # SystemExit are not swallowed.
      self.log.exception( 'Error parsing %s Job Status output:\n' % gridType, job )

    isParent = bool( re.search( 'Nodes information', job ) )
    isChild = bool( re.search( 'Parent Job', job ) )

    if status == "Running":
      # Pilots can be in Running state for too long, due to bugs in the WMS
      if statusDate:
        statusTime = Time.fromString( statusDate )
        delta = Time.dateTime() - statusTime
        if delta > 4 * Time.day:
          self.log.info( 'Setting pilot status to Deleted after 4 days in Running' )
          status = "Deleted"
          # From here on StatusDate is the result of date arithmetic, no
          # longer the formatted string.
          statusDate = statusTime + 4 * Time.day
      elif submittedDate:
        statusTime = Time.fromString( submittedDate )
        delta = Time.dateTime() - statusTime
        if delta > 7 * Time.day:
          self.log.info( 'Setting pilot status to Deleted more than 7 days after submission still in Running' )
          status = "Deleted"
          statusDate = statusTime + 7 * Time.day

    childRefs = []
    childDicts = {}
    if isParent:
      # Each child has its own sub-section, parsed recursively
      for subjob in List.fromChar( job, ' Status info for the Job :' )[1:]:
        chRef = List.fromChar( subjob, '\n' )[0].strip()
        childRefs.append( chRef )
        childDicts[chRef] = self.__parseJobStatus( subjob, gridType )

    return { 'Status': status,
             'DestinationSite': destination,
             'StatusDate': statusDate,
             'isChild': isChild,
             'isParent': isParent,
             'ParentRef': False,
             'FinalStatus' : status in self.finalStateList,
             'ChildRefs' : childRefs,
             'ChildDicts' : childDicts }

  def __addPilotsAccountingReport( self, pilotsData ):
    """ Register one PilotAccounting record per pilot with gDataStoreClient.

        :param dict pilotsData: pilot reference -> pilot information dictionary
        :return: S_OK() once every record has been registered, otherwise the
                 first failed addRegister result
    """
    for pilotInfo in pilotsData.values():
      record = PilotAccounting()
      record.setEndTime( pilotInfo[ 'LastUpdateTime' ] )
      record.setStartTime( pilotInfo[ 'SubmissionTime' ] )

      # Owner: fall back to a placeholder when the DN cannot be resolved
      ownerDN = pilotInfo[ 'OwnerDN' ]
      userLookup = CS.getUsernameForDN( ownerDN )
      if userLookup[ 'OK' ]:
        userName = userLookup[ 'Value' ]
      else:
        userName = '******'
        self.log.error( "Can't determine username for dn:", ownerDN )
      record.setValueByKey( 'User', userName )
      record.setValueByKey( 'UserGroup', pilotInfo[ 'OwnerGroup' ] )

      # Site: 'Unknown' when the lookup fails or yields a blank name
      ceName = pilotInfo[ 'DestinationSite' ]
      siteLookup = getSiteForCE( ceName )
      siteName = ''
      if siteLookup[ 'OK' ]:
        siteName = siteLookup[ 'Value' ].strip()
      record.setValueByKey( 'Site', siteName or 'Unknown' )

      record.setValueByKey( 'GridCE', ceName )
      record.setValueByKey( 'GridMiddleware', pilotInfo[ 'GridType' ] )
      record.setValueByKey( 'GridResourceBroker', pilotInfo[ 'Broker' ] )
      record.setValueByKey( 'GridStatus', pilotInfo[ 'Status' ] )
      jobCount = 0
      if 'Jobs' in pilotInfo:
        jobCount = len( pilotInfo[ 'Jobs' ] )
      record.setValueByKey( 'Jobs', jobCount )

      self.log.verbose( "Added accounting record for pilot %s" % pilotInfo[ 'PilotID' ] )
      registered = gDataStoreClient.addRegister( record )
      if not registered[ 'OK' ]:
        return registered
    return S_OK()

  def _killPilots( self, acc ):
    """ Kill, through DiracAdmin, every pilot whose reference is a key of acc.

        Pilots are processed in sorted reference order. A pilot is killed only
        when its information, including a Status, can be retrieved; every
        outcome is logged and nothing is returned.

        :param dict acc: pilot reference -> pilot dictionary (only the keys are used)
    """
    for pilotRef in sorted( acc ):
      result = self.diracadmin.getPilotInfo( pilotRef )
      # "in" instead of dict.has_key(), which no longer exists in Python 3
      if result['OK'] and pilotRef in result['Value'] and 'Status' in result['Value'][pilotRef]:
        ret = self.diracadmin.killPilot( str( pilotRef ) )
        if ret['OK']:
          self.log.info( "Successfully deleted: %s (Status : %s)" % ( pilotRef, result['Value'][pilotRef]['Status'] ) )
        else:
          self.log.error( "Failed to delete %s : %s" % ( pilotRef, ret['Message'] ) )
      else:
        self.log.error( "Failed to get info. of %s : %s" % ( pilotRef, str( result ) ) )

  def _checkJobLastUpdateTime( self, joblist , StalledDays ):
    """ Tell whether at least one of the jobs was updated recently.

        :param joblist: job IDs (anything int() accepts)
        :param StalledDays: number of days defining "recently"
        :return: True as soon as one job has a LastUpdateTime newer than
                 now - StalledDays days, False otherwise (including when the
                 job attributes cannot be read)
    """
    timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
    for JobID in joblist:
      result = self.jobDB.getJobAttributes( int( JobID ) )
      if not result['OK']:
        self.log.error( "Error taking job info. from DB:%s" % str( result['Message'] ) )
        continue
      # "in" instead of dict.has_key(), which no longer exists in Python 3
      if 'LastUpdateTime' in result['Value']:
        LastUpdateTime = result['Value']['LastUpdateTime']
        if Time.fromString( LastUpdateTime ) > timeLimitToConsider:
          self.log.debug('Since '+str(JobID)+' updates LastUpdateTime on '+str(LastUpdateTime)+', this does not to need to be deleted.')
          return True
    return False
  labels = ['pilotUUID', 'timestamp', 'source', 'phase', 'status', 'messageContent']
  for log in logs:
    content.append([log[label] for label in labels])
  printTable(labels, content, numbering=False, columnSeparator=' | ')


# Script body: print the pilot logging records either for one pilot UUID or for
# every pilot that ran the given job. print( ... ) with a single argument gives
# the same output under Python 2 and Python 3.
if uuid:
  pilotsLogging = PilotsLoggingClient()
  result = pilotsLogging.getPilotsLogging(uuid)
  if not result['OK']:
    print('ERROR: %s' % result['Message'])
    DIRAC.exit(1)
  printPilotsLogging(result['Value'])
  DIRAC.exit(0)
else:
  pilotDB = PilotAgentsDB()
  pilotsLogging = PilotsLoggingClient()
  pilots = pilotDB.getPilotsForJobID(jobid)
  # The key is 'OK'; the former 'OK ' (trailing blank) raised a KeyError
  if not pilots['OK']:
    print(pilots['Message'])
    DIRAC.exit(1)
  # Iterate over the payload of each result, not over the result dictionary
  for pilotID in pilots['Value']:
    info = pilotDB.getPilotInfo(pilotID=pilotID)
    if not info['OK']:
      print(info['Message'])
      continue
    # NOTE(review): assumes the getPilotInfo payload is a dictionary keyed by
    # pilot reference whose values carry 'PilotJobReference' - confirm
    for pilot in info['Value'].values():
      pilotLog = pilotsLogging.getPilotsLogging(pilot['PilotJobReference'])
      if not pilotLog['OK']:
        print(pilotLog['Message'])
        continue
      printPilotsLogging(pilotLog['Value'])
  DIRAC.exit(0)
예제 #43
0
class PilotStatusAgent(AgentModule):
  """
      Agent that looks after the pilots registered in the PilotAgentsDB: pilots
      that have not been updated for 'PilotStalledDays' days are accounted for,
      declared Deleted and killed, and old pilot records are cleared through the
      WMSAdministrator client.

      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used
                   for the agent restart
  """

  # Non-final states: pilots in these states are candidates for the clean-up
  queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
  # Final states: the status of such pilots is not overwritten when accounting
  finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

  def __init__(self, *args, **kwargs):
    """ c'tor

        Only declares the attributes; the real objects are created in
        initialize() and the options are read in initialize() / execute().
    """
    AgentModule.__init__(self, *args, **kwargs)

    self.jobDB = None
    self.pilotDB = None
    self.diracadmin = None
    self.WMSAdministrator = None
    self.clearPilotsDelay = None
    self.clearAbortedDelay = None
    self.pilotStalledDays = None
    self.gridEnv = None

  #############################################################################
  def initialize(self):
    """Sets defaults

       Sets the default agent options, instantiates the DB and client objects
       and reads the pilot clearing delays.

       :return: S_OK()
    """

    self.am_setOption('PollingTime', 120)
    self.am_setOption('GridEnv', '')
    self.am_setOption('PilotStalledDays', 3)
    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()
    self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)
    self.WMSAdministrator = WMSAdministratorClient()

    return S_OK()

  #############################################################################
  def execute(self):
    """The PilotAgent execution method.

       Handles the pilots that were not updated in the last 'PilotStalledDays'
       days, then asks the WMSAdministrator to clear the old pilots.

       :return: the error structure if no DB connection can be obtained,
                S_OK() otherwise (later failures are only logged)
    """

    self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
    self.gridEnv = self.am_getOption('GridEnv')
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue('/DIRAC/Setup', '')
      if setup:
        instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
        if instance:
          self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')
    result = self.pilotDB._getConnection()
    if not result['OK']:
      return result
    connection = result['Value']

    # Now handle pilots not updated in the last N days (most likely the Broker is no
    # longer available) and declare them Deleted.
    # The connection is closed in all cases, even if the handling raises.
    try:
      result = self.handleOldPilots(connection)
      if not result['OK']:
        self.log.warn('Failed to handle old pilots: %s' % result['Message'])
    finally:
      connection.close()

    result = self.WMSAdministrator.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
    if not result['OK']:
      self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

    return S_OK()

  def clearWaitingPilots(self, condDict):
    """ Clear pilots in the faulty Waiting state

        Pilots matching the owner / grid type / broker of condDict that are in
        the Waiting state and are selected with "older" set to
        now - MAX_WAITING_STATE_LENGTH hours are set to Stalled.

        :param condDict: dictionary with the keys 'OwnerDN', 'OwnerGroup',
                         'GridType' and 'Broker'
        :return: the error structure if the selection fails, S_OK() otherwise
    """

    last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
    clearDict = {'Status': 'Waiting',
                 'OwnerDN': condDict['OwnerDN'],
                 'OwnerGroup': condDict['OwnerGroup'],
                 'GridType': condDict['GridType'],
                 'Broker': condDict['Broker']}
    result = self.pilotDB.selectPilots(clearDict, older=last_update)
    if not result['OK']:
      self.log.warn('Failed to get the Pilot Agents for Waiting state')
      return result
    if not result['Value']:
      return S_OK()
    refList = result['Value']

    for pilotRef in refList:
      self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
      result = self.pilotDB.setPilotStatus(pilotRef, 'Stalled', statusReason='Exceeded max waiting time')
      if not result['OK']:
        # Best effort: report the failure and go on with the other pilots
        self.log.warn('Failed to set pilot %s to Stalled: %s' % (pilotRef, result['Message']))

    return S_OK()

  def clearParentJob(self, pRef, pDict, connection):
    """ Clear the parameteric parent job from the PilotAgentsDB

        If at least one child is already in the database the parent is simply
        deleted. Otherwise the children are first registered with the task
        queue, owner, broker and grid type of the parent and given their
        status, and the parent is deleted only if all of that succeeded.

        :param pRef: reference of the parent pilot
        :param pDict: dictionary with the keys 'ChildRefs' (child references)
                      and 'ChildDicts' (child reference -> dictionary with
                      'Status' and 'DestinationSite')
        :param connection: DB connection passed on to the PilotAgentsDB calls
        :return: result of the deletion of the parent, or an error structure
                 if the parent info or the children could not be handled
    """

    childList = pDict['ChildRefs']

    # Check that at least one child is in the database
    children_ok = False
    for child in childList:
      result = self.pilotDB.getPilotInfo(child, conn=connection)
      if result['OK'] and result['Value']:
        children_ok = True
        break

    if children_ok:
      return self.pilotDB.deletePilot(pRef, conn=connection)

    self.log.verbose('Adding children for parent %s' % pRef)
    result = self.pilotDB.getPilotInfo(pRef)
    if not result['OK']:
      return result
    if pRef not in result['Value']:
      return S_ERROR('No info found for parent pilot %s' % pRef)
    parentInfo = result['Value'][pRef]
    tqID = parentInfo['TaskQueueID']
    ownerDN = parentInfo['OwnerDN']
    ownerGroup = parentInfo['OwnerGroup']
    broker = parentInfo['Broker']
    gridType = parentInfo['GridType']
    result = self.pilotDB.addPilotTQReference(childList, tqID, ownerDN, ownerGroup,
                                              broker=broker, gridType=gridType)
    if not result['OK']:
      return result

    # Every child gets its status; a single failure prevents the parent deletion
    children_added = True
    for chRef, chDict in pDict['ChildDicts'].items():
      result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                           destination=chDict['DestinationSite'],
                                           conn=connection)
      if not result['OK']:
        children_added = False
    if not children_added:
      return S_ERROR('Failed to add children')

    # Report the outcome of the deletion, as done when the children already exist
    return self.pilotDB.deletePilot(pRef, conn=connection)

  def handleOldPilots(self, connection):
    """
      select all pilots that have not been updated in the last N days and declared them
      Deleted, accounting for them.

      Pilots with at least one job updated within the same period are left
      untouched. The others are accounted for and killed in batches.

      :param connection: DB connection passed on to accountPilots()
      :return: the error structure if the pilots or their info cannot be
               retrieved, S_OK() otherwise
    """
    pilotsToAccount = {}
    timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
    result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                       older=timeLimitToConsider,
                                       timeStamp='LastUpdateTime')
    if not result['OK']:
      self.log.error('Failed to get the Pilot Agents')
      return result
    if not result['Value']:
      return S_OK()

    refList = result['Value']
    result = self.pilotDB.getPilotInfo(refList)
    if not result['OK']:
      self.log.error('Failed to get Info for Pilot Agents')
      return result

    pilotsDict = result['Value']

    for pRef in pilotsDict:
      if pilotsDict[pRef].get('Jobs') and self._checkJobLastUpdateTime(pilotsDict[pRef]['Jobs'], self.pilotStalledDays):
        self.log.debug('%s should not be deleted since one job of %s is running.' %
                       (str(pRef), str(pilotsDict[pRef]['Jobs'])))
        continue
      deletedJobDict = pilotsDict[pRef]
      deletedJobDict['Status'] = 'Deleted'
      deletedJobDict['StatusDate'] = Time.dateTime()
      pilotsToAccount[pRef] = deletedJobDict
      # Flush the batch once it holds more than 100 pilots
      if len(pilotsToAccount) > 100:
        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)
        pilotsToAccount = {}

    # Last (possibly empty) batch
    self.accountPilots(pilotsToAccount, connection)
    self._killPilots(pilotsToAccount)

    return S_OK()

  def accountPilots(self, pilotsToAccount, connection):
    """ account for pilots

        When the 'PilotAccountingEnabled' option is "yes", accounting records
        are created and committed for the given pilots. The new status is
        written to the PilotAgentsDB if accounting is disabled or if the
        records were sent successfully.

        :param pilotsToAccount: dictionary pilot reference -> dictionary with
                                'Status', 'DestinationSite' and 'StatusDate'
        :param connection: DB connection passed on to the PilotAgentsDB calls
        :return: the error structure if the pilot info cannot be retrieved or
                 the records cannot be created, S_OK() otherwise
    """
    accountingFlag = False
    pae = self.am_getOption('PilotAccountingEnabled', 'yes')
    if pae.lower() == "yes":
      accountingFlag = True

    if not pilotsToAccount:
      self.log.info('No pilots to Account')
      return S_OK()

    accountingSent = False
    if accountingFlag:
      retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
      if not retVal['OK']:
        self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
        return retVal
      dbData = retVal['Value']
      for pref in dbData:
        if pref in pilotsToAccount:
          # Pilots already in a final state keep the values stored in the DB
          if dbData[pref]['Status'] not in self.finalStateList:
            dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
            dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
            dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

      retVal = self.__addPilotsAccountingReport(dbData)
      if not retVal['OK']:
        self.log.error('Fail to add accounting records for pilots', retVal['Message'])
        return retVal

      self.log.info("Sending accounting records...")
      retVal = gDataStoreClient.commit()
      if not retVal['OK']:
        self.log.error("Can't send accounting reports", retVal['Message'])
      else:
        self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
        accountingSent = True

    if not accountingFlag or accountingSent:
      for pRef in pilotsToAccount:
        pDict = pilotsToAccount[pRef]
        self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
        self.pilotDB.setPilotStatus(pRef,
                                    pDict['Status'],
                                    pDict['DestinationSite'],
                                    pDict['StatusDate'],
                                    conn=connection)

    return S_OK()

  def __addPilotsAccountingReport(self, pilotsData):
    """ fill accounting data

        Creates one PilotAccounting record per pilot and registers it with
        gDataStoreClient (the records are not committed here).

        :param pilotsData: dictionary pilot reference -> pilot info dictionary
        :return: the error structure of the first failing registration,
                 S_OK() otherwise
    """
    for pRef in pilotsData:
      pData = pilotsData[pRef]
      pA = PilotAccounting()
      pA.setEndTime(pData['LastUpdateTime'])
      pA.setStartTime(pData['SubmissionTime'])
      retVal = CS.getUsernameForDN(pData['OwnerDN'])
      if not retVal['OK']:
        userName = '******'
        self.log.error("Can't determine username for dn:", pData['OwnerDN'])
      else:
        userName = retVal['Value']
      pA.setValueByKey('User', userName)
      pA.setValueByKey('UserGroup', pData['OwnerGroup'])
      result = getSiteForCE(pData['DestinationSite'])
      if result['OK'] and result['Value'].strip():
        pA.setValueByKey('Site', result['Value'].strip())
      else:
        pA.setValueByKey('Site', 'Unknown')
      pA.setValueByKey('GridCE', pData['DestinationSite'])
      pA.setValueByKey('GridMiddleware', pData['GridType'])
      pA.setValueByKey('GridResourceBroker', pData['Broker'])
      pA.setValueByKey('GridStatus', pData['Status'])
      if 'Jobs' not in pData:
        pA.setValueByKey('Jobs', 0)
      else:
        pA.setValueByKey('Jobs', len(pData['Jobs']))
      self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
      retVal = gDataStoreClient.addRegister(pA)
      if not retVal['OK']:
        return retVal
    return S_OK()

  def _killPilots(self, acc):
    """ Kill the given pilots through DiracAdmin.

        A pilot is killed only if its info can be retrieved and contains a
        'Status'; failures are logged and do not stop the loop.

        :param acc: dictionary whose keys are the pilot references to kill
    """
    for i in sorted(acc.keys()):
      result = self.diracadmin.getPilotInfo(i)
      if result['OK'] and i in result['Value'] and 'Status' in result['Value'][i]:
        ret = self.diracadmin.killPilot(str(i))
        if ret['OK']:
          self.log.info("Successfully deleted: %s (Status : %s)" % (i, result['Value'][i]['Status']))
        else:
          self.log.error("Failed to delete pilot: ", "%s : %s" % (i, ret['Message']))
      else:
        self.log.error("Failed to get pilot info", "%s : %s" % (i, str(result)))

  def _checkJobLastUpdateTime(self, joblist, StalledDays):
    """ Tell whether at least one job of the list was updated recently.

        :param joblist: iterable of job IDs (anything accepted by int())
        :param StalledDays: number of days defining the time limit
        :return: True if the 'LastUpdateTime' of one job is more recent than
                 now - StalledDays days, False otherwise
    """
    timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
    ret = False
    for jobID in joblist:
      result = self.jobDB.getJobAttributes(int(jobID))
      if result['OK']:
        if 'LastUpdateTime' in result['Value']:
          lastUpdateTime = result['Value']['LastUpdateTime']
          if Time.fromString(lastUpdateTime) > timeLimitToConsider:
            ret = True
            self.log.debug(
                'Since %s updates LastUpdateTime on %s this does not to need to be deleted.' %
                (str(jobID), str(lastUpdateTime)))
            # One recently updated job is enough
            break
      else:
        self.log.error("Error taking job info from DB", result['Message'])
    return ret
예제 #44
0
  def test_PilotsDB( self ):
    """ Exercise the pilot related calls of the WMSAdministrator service.

        Pilots are added through the service, inspected and modified with the
        various pilot calls, and finally removed directly from the
        PilotAgentsDB so that the counters are empty again at the end.
    """

    wmsAdministrator = RPCClient( 'WorkloadManagement/WMSAdministrator' )
    pilotAgentDB = PilotAgentsDB()

    # Add a first pilot, check the counters, then remove it
    res = wmsAdministrator.addPilotTQReference( ['aPilot'], 1, '/a/ownerDN', 'a/owner/Group' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.getCurrentPilotCounters( {} )
    self.assertTrue( res['OK'] )
    # plain int instead of the "1L" long literal, which is a syntax error in Python 3
    self.assertEqual( res['Value'], {'Submitted': 1} )
    res = pilotAgentDB.deletePilot( 'aPilot' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.getCurrentPilotCounters( {} )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value'], {} )

    # Add a second pilot and store / retrieve its output
    res = wmsAdministrator.addPilotTQReference( ['anotherPilot'], 1, '/a/ownerDN', 'a/owner/Group' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.storePilotOutput( 'anotherPilot', 'This is an output', 'this is an error' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.getPilotOutput( 'anotherPilot' )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value'], {'OwnerDN': '/a/ownerDN',
                                     'OwnerGroup': 'a/owner/Group',
                                     'StdErr': 'this is an error',
                                     'FileList': [],
                                     'StdOut': 'This is an output'} )
    # need a job for the following
    # res = wmsAdministrator.getJobPilotOutput( 1 )
    # self.assertEqual( res['Value'], {'OwnerDN': '/a/ownerDN', 'OwnerGroup': 'a/owner/Group',
    #                                  'StdErr': 'this is an error', 'FileList': [], 'StdOut': 'This is an output'} )
    # self.assertTrue( res['OK'] )
    res = wmsAdministrator.getPilotInfo( 'anotherPilot' )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['anotherPilot']['AccountingSent'], 'False' )
    self.assertEqual( res['Value']['anotherPilot']['PilotJobReference'], 'anotherPilot' )

    # Selection, summary and monitoring calls
    res = wmsAdministrator.selectPilots( {} )
    self.assertTrue( res['OK'] )
    # res = wmsAdministrator.getPilotLoggingInfo( 'anotherPilot' )
    # self.assertTrue( res['OK'] )
    res = wmsAdministrator.getPilotSummary( '', '' )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['Total']['Submitted'], 1 )
    res = wmsAdministrator.getPilotMonitorWeb( {}, [], 0, 100 )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['TotalRecords'], 1 )
    res = wmsAdministrator.getPilotMonitorSelectors()
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value'], {'GridType': ['DIRAC'],
                                     'OwnerGroup': ['a/owner/Group'],
                                     'DestinationSite': ['NotAssigned'],
                                     'Broker': ['Unknown'], 'Status': ['Submitted'],
                                     'OwnerDN': ['/a/ownerDN'],
                                     'GridSite': ['Unknown'],
                                     'Owner': []} )
    res = wmsAdministrator.getPilotSummaryWeb( {}, [], 0, 100 )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['TotalRecords'], 1 )

    # Modify the pilot and check that the changes are visible
    res = wmsAdministrator.setAccountingFlag( 'anotherPilot', 'True' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.setPilotStatus( 'anotherPilot', 'Running' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.getPilotInfo( 'anotherPilot' )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value']['anotherPilot']['AccountingSent'], 'True' )
    self.assertEqual( res['Value']['anotherPilot']['Status'], 'Running' )

    res = wmsAdministrator.setJobForPilot( 123, 'anotherPilot' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.setPilotBenchmark( 'anotherPilot', 12.3 )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.countPilots( {} )
    self.assertTrue( res['OK'] )
    # res = wmsAdministrator.getCounters()
    # # getPilotStatistics

    # Clean up: no pilot must be left behind
    res = pilotAgentDB.deletePilot( 'anotherPilot' )
    self.assertTrue( res['OK'] )
    res = wmsAdministrator.getCurrentPilotCounters( {} )
    self.assertTrue( res['OK'] )
    self.assertEqual( res['Value'], {} )