def main():
    """Print the pilot logging info for a pilot UUID or for the pilot of a job.

    Registers the ``-u/--uuid`` and ``-j/--jobid`` switches, parses the command
    line and, when a job ID was given, takes the first pilot returned for that
    job as the UUID to query. The logging info is printed with
    ``gLogger.notice``. Exits with status 1 on any service error, 0 otherwise.
    """
    global uuid
    global jobid
    Script.registerSwitch('u:', 'uuid=',
                          'get PilotsLogging for given Pilot UUID', setUUID)
    Script.registerSwitch('j:', 'jobid=', 'get PilotsLogging for given Job ID',
                          setJobID)
    Script.parseCommandLine()

    from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

    def _valueOrExit(reply):
        """Return reply['Value'], logging the message and exiting with 1 on error."""
        if not reply['OK']:
            gLogger.error(reply['Message'])
            DIRAC.exit(1)
        return reply['Value']

    if jobid:
        pilotsForJob = _valueOrExit(PilotManagerClient().getPilots(jobid))
        gLogger.debug(pilotsForJob)
        # the first key of the returned collection is used as the pilot UUID
        uuid = list(pilotsForJob)[0]

    gLogger.notice(_valueOrExit(PilotManagerClient().getPilotLoggingInfo(uuid)))

    DIRAC.exit(0)
예제 #2
0
def main():
    """Read the command-line options and delete the matching pilots.

    The selection is built from the ``--status``, ``--site``, ``--ce`` and
    ``--vo`` options. When no status is given, pilots in the Submitted,
    Scheduled, Waiting and Unknown states are selected. With ``dryrun`` set the
    matching pilots are only listed.

    Exits with status 1 when no selection option was given or when the pilot
    selection call fails, and with status 0 when nothing matches or on a dry
    run.
    """
    options = Params()
    options.registerCLISwitches()
    Script.parseCommandLine(ignoreErrors=True)
    # make sure *something* is set: without any option only the default status
    # filter would apply, and every pilot in those states would be deleted
    if not (options.site or options.ce or options.vo or options.status):
        print(
            "You must chose at least one of the following options: --vo, --ce --site"
        )
        sys.exit(1)

    # occasionally the same job might appear twice, but that shouldn't matter
    conditions = {}
    if options.status:
        conditions["Status"] = options.status[0]
    else:
        conditions["Status"] = ["Submitted", "Scheduled", "Waiting", "Unknown"]

    if options.site:
        conditions["GridSite"] = options.site

    if options.ce:
        conditions["DestinationSite"] = options.ce

    if options.vo:
        conditions["OwnerGroup"] = options.vo + "_pilot"

    # conditions = {"Status":"Submitted", "GridSite":"LCG.UKI-LT2-IC-HEP.uk",
    #               "OwnerGroup":["lz_pilot", "gridpp_pilot"], "DestinationSite":"ceprod00.grid.hep.ph.ic.ac.uk"}
    print("Selecting pilots fulfulling the following conditions: %s" %
          conditions)

    pilotmanager = PilotManagerClient()
    result = pilotmanager.selectPilots(conditions)
    # a failed call carries no 'Value'; report it instead of raising KeyError
    if not result['OK']:
        print("Error encountered when selecting pilots")
        print(result)
        sys.exit(1)

    pilotRefs = result['Value']
    if not pilotRefs:
        print("No pilots matching these criteria were found.")
        sys.exit(0)

    print("Found the following matching pilots:")
    for pilotRef in pilotRefs:
        print(pilotRef)

    if options.dryrun:
        print("Dry run only. No pilots will be deleted")
        sys.exit(0)

    # now get the pilot references and delete them

    from DIRAC.Interfaces.API.DiracAdmin import DiracAdmin
    diracAdmin = DiracAdmin()

    for pilotRef in pilotRefs:
        # separate name so the selection result is not overwritten mid-loop
        killResult = diracAdmin.killPilot(pilotRef)
        if not killResult['OK']:
            print("Error encountered when deleting pilot %s" % pilotRef)
            print(killResult)
예제 #3
0
    def __init__(self, args=None, clients=None):
        """Set up the command and pick the clients it talks to.

        Clients found in ``self.apis`` under 'Pilots' and
        'ResourceManagementClient' are reused; a new PilotManagerClient /
        ResourceManagementClient is created only for a missing entry.
        ``self.apis`` is presumably filled by the parent constructor from
        ``clients`` -- confirm against the base class.

        :param args: forwarded unchanged to the parent constructor
        :param clients: forwarded unchanged to the parent constructor
        """
        super(PilotCommand, self).__init__(args, clients)

        # default clients are only instantiated when nothing was supplied
        self.pilots = (self.apis['Pilots']
                       if 'Pilots' in self.apis else PilotManagerClient())

        self.rmClient = (self.apis['ResourceManagementClient']
                         if 'ResourceManagementClient' in self.apis
                         else ResourceManagementClient())
예제 #4
0
    def __init__(self, args=None, clients=None):
        """Initialise the command and resolve the clients it uses.

        An entry in ``self.apis`` takes precedence for both the "Pilots" and
        the "ResourceManagementClient" client; otherwise a fresh default
        client is built. ``self.apis`` is presumably populated by the parent
        constructor -- confirm against the base class.

        :param args: passed straight through to the parent constructor
        :param clients: passed straight through to the parent constructor
        """
        super(PilotCommand, self).__init__(args, clients)

        availableApis = self.apis

        # fall back to a newly created client only when none was provided
        if "Pilots" not in availableApis:
            self.pilots = PilotManagerClient()
        else:
            self.pilots = availableApis["Pilots"]

        if "ResourceManagementClient" not in availableApis:
            self.rmClient = ResourceManagementClient()
        else:
            self.rmClient = availableApis["ResourceManagementClient"]
예제 #5
0
    def initialize(self):
        """Set the agent defaults and create the DB, admin and client handles.

        :return: S_OK()
        """
        self.am_setOption("GridEnv", "")

        # database and administration handles
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()

        # configurable delays, with the fallbacks used when the option is unset
        self.clearPilotsDelay = self.am_getOption("ClearPilotsDelay", 30)
        self.clearAbortedDelay = self.am_getOption(
            "ClearAbortedPilotsDelay", 7)

        self.pilots = PilotManagerClient()

        return S_OK()
예제 #6
0
  def initialize(self):
    """ Set the agent option defaults and create the DB, admin and client handles.

    :return: S_OK()
    """
    # option defaults, applied in the same order as before
    self.am_setOption('PollingTime', 120)
    self.am_setOption('GridEnv', '')
    self.am_setOption('PilotStalledDays', 3)

    # database and administration handles
    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # configurable delays, with the fallbacks used when the option is unset
    self.clearPilotsDelay = self.am_getOption(
        'ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption(
        'ClearAbortedPilotsDelay', 7)

    self.pilots = PilotManagerClient()

    return S_OK()
예제 #7
0
    def initializeHandler(cls, svcInfoDict):
        """Initialise the WMS AdministratorService class-level resources.

        Loads and instantiates JobDB (with ``cls.log`` as parent logger),
        optionally ElasticJobParametersDB when the
        ``/Services/JobMonitoring/useESForJobParametersFlag`` operations flag
        is set, and creates the PilotManagerClient.

        :param svcInfoDict: service information; not read by this method
        :return: S_OK() on success, the failing loader result, or S_ERROR when
                 a DB constructor raises RuntimeError
        """
        try:
            loaded = ObjectLoader().loadObject(
                "WorkloadManagementSystem.DB.JobDB", "JobDB")
            if not loaded["OK"]:
                return loaded
            jobDBClass = loaded["Value"]
            cls.jobDB = jobDBClass(parentLogger=cls.log)
        except RuntimeError as excp:
            return S_ERROR(f"Can't connect to DB: {excp!r}")

        # stays None unless the operations flag enables the ES backend
        cls.elasticJobParametersDB = None
        if Operations().getValue(
                "/Services/JobMonitoring/useESForJobParametersFlag", False):
            try:
                loaded = ObjectLoader().loadObject(
                    "WorkloadManagementSystem.DB.ElasticJobParametersDB",
                    "ElasticJobParametersDB")
                if not loaded["OK"]:
                    return loaded
                elasticDBClass = loaded["Value"]
                cls.elasticJobParametersDB = elasticDBClass()
            except RuntimeError as excp:
                return S_ERROR(f"Can't connect to DB: {excp!r}")

        cls.pilotManager = PilotManagerClient()

        return S_OK()
예제 #8
0
  def initialize(self):
    """ Define the commands to be executed, and instantiate the clients that will be used.
    """

    res = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceStatusClient',
                                    'ResourceStatusClient')
    if not res['OK']:
      self.log.error('Failed to load ResourceStatusClient class: %s' % res['Message'])
      return res
    rsClass = res['Value']

    res = ObjectLoader().loadObject('DIRAC.ResourceStatusSystem.Client.ResourceManagementClient',
                                    'ResourceManagementClient')
    if not res['OK']:
      self.log.error('Failed to load ResourceManagementClient class: %s' % res['Message'])
      return res
    rmClass = res['Value']

    self.commands['Downtime'] = [{'Downtime': {}}]
    self.commands['GOCDBSync'] = [{'GOCDBSync': {}}]
    self.commands['FreeDiskSpace'] = [{'FreeDiskSpace': {}}]

    # PilotsCommand
#    self.commands[ 'Pilots' ] = [
#                                 { 'PilotsWMS' : { 'element' : 'Site', 'siteName' : None } },
#                                 { 'PilotsWMS' : { 'element' : 'Resource', 'siteName' : None } }
#                                 ]

    # FIXME: do not forget about hourly vs Always ...etc
    # AccountingCacheCommand
#    self.commands[ 'AccountingCache' ] = [
#                                          {'SuccessfullJobsBySiteSplitted'    :{'hours' :24, 'plotType' :'Job' }},
#                                          {'FailedJobsBySiteSplitted'         :{'hours' :24, 'plotType' :'Job' }},
#                                          {'SuccessfullPilotsBySiteSplitted'  :{'hours' :24, 'plotType' :'Pilot' }},
#                                          {'FailedPilotsBySiteSplitted'       :{'hours' :24, 'plotType' :'Pilot' }},
#                                          {'SuccessfullPilotsByCESplitted'    :{'hours' :24, 'plotType' :'Pilot' }},
#                                          {'FailedPilotsByCESplitted'         :{'hours' :24, 'plotType' :'Pilot' }},
#                                          {'RunningJobsBySiteSplitted'        :{'hours' :24, 'plotType' :'Job' }},
# #                                          {'RunningJobsBySiteSplitted'        :{'hours' :168, 'plotType' :'Job' }},
# #                                          {'RunningJobsBySiteSplitted'        :{'hours' :720, 'plotType' :'Job' }},
# #                                          {'RunningJobsBySiteSplitted'        :{'hours' :8760, 'plotType' :'Job' }},
#                                          ]

    # VOBOXAvailability
#    self.commands[ 'VOBOXAvailability' ] = [
#                                            { 'VOBOXAvailability' : {} }
#

    # Reuse clients for the commands
    self.clients['GOCDBClient'] = GOCDBClient()
    self.clients['ReportsClient'] = ReportsClient()
    self.clients['ResourceStatusClient'] = rsClass()
    self.clients['ResourceManagementClient'] = rmClass()
    self.clients['WMSAdministrator'] = WMSAdministratorClient()
    self.clients['Pilots'] = PilotManagerClient()

    self.cCaller = CommandCaller

    return S_OK()
예제 #9
0
    def getPilotOutput(self, gridReference, directory=""):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference
        and write it to disk.

        The files are written into a new ``pilot_<last path component of the
        reference>`` directory created inside ``directory``. If that directory
        already exists nothing is fetched to disk and an error is returned.

          >>> gLogger.notice(dirac.getPilotOutput(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: pilot reference
        :type gridReference: string
        :param directory: existing directory to create the output directory in;
                          ``self.currentDir`` is used when empty
        :type directory: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, str):
            return self._errorReport("Expected string for pilot reference")

        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport("Directory %s does not exist" % directory)

        result = PilotManagerClient().getPilotOutput(gridReference)
        if not result["OK"]:
            return result

        # a reference ending in "/" has an empty last component
        gridReferenceSmall = gridReference.split("/")[-1] or "reference"
        outputPath = "%s/pilot_%s" % (directory, gridReferenceSmall)

        if os.path.exists(outputPath):
            self.log.info("Remove %s and retry to continue" % outputPath)
            return S_ERROR("Remove %s and retry to continue" % outputPath)

        # the path was just checked, so no second existence test is needed
        self.log.verbose("Creating directory %s" % outputPath)
        os.mkdir(outputPath)

        outputs = result["Value"]
        # (key in the service reply, file name, wording used in the log lines)
        streams = (("StdOut", "std.out", "output"), ("StdErr", "std.err", "error"))
        for key, fileName, label in streams:
            if key not in outputs:
                self.log.warn("No standard %s returned" % label)
                continue
            filePath = "%s/%s" % (outputPath, fileName)
            with open(filePath, "w") as fopen:
                fopen.write(outputs[key])
            self.log.info("Standard %s written to %s" % (label, filePath))

        self.log.always("Outputs retrieved in %s" % outputPath)
        return result
예제 #10
0
    def getPilotOutput(self, gridReference, directory=''):
        """Retrieve the pilot output (std.out and std.err) for a pilot reference
        and write it to disk.

        The files are written into a new ``pilot_<last path component of the
        reference>`` directory created inside ``directory``. If that directory
        already exists nothing is fetched to disk and an error is returned.

          >>> gLogger.notice(dirac.getPilotOutput(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: pilot reference
        :type gridReference: string
        :param directory: existing directory to create the output directory in;
                          ``self.currentDir`` is used when empty
        :type directory: string
        :return: S_OK,S_ERROR
        """
        if not isinstance(gridReference, six.string_types):
            return self._errorReport('Expected string for pilot reference')

        if not directory:
            directory = self.currentDir

        if not os.path.exists(directory):
            return self._errorReport('Directory %s does not exist' % directory)

        result = PilotManagerClient().getPilotOutput(gridReference)
        if not result['OK']:
            return result

        # a reference ending in '/' has an empty last component
        gridReferenceSmall = gridReference.split('/')[-1] or 'reference'
        outputPath = '%s/pilot_%s' % (directory, gridReferenceSmall)

        if os.path.exists(outputPath):
            self.log.info('Remove %s and retry to continue' % outputPath)
            return S_ERROR('Remove %s and retry to continue' % outputPath)

        # the path was just checked, so no second existence test is needed
        self.log.verbose('Creating directory %s' % outputPath)
        os.mkdir(outputPath)

        outputs = result['Value']
        # (key in the service reply, file name, wording used in the log lines)
        streams = (('StdOut', 'std.out', 'output'), ('StdErr', 'std.err', 'error'))
        for key, fileName, label in streams:
            if key not in outputs:
                self.log.warn('No standard %s returned' % label)
                continue
            filePath = '%s/%s' % (outputPath, fileName)
            with open(filePath, 'w') as fopen:
                fopen.write(outputs[key])
            self.log.info('Standard %s written to %s' % (label, filePath))

        self.log.always('Outputs retrieved in %s' % outputPath)
        return result
예제 #11
0
    def killPilot(self, gridReference):
        """Kill the pilot identified by the given reference.

          >>> gLogger.notice(dirac.killPilot(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            # the PilotManager reply is handed back unchanged
            return PilotManagerClient().killPilot(gridReference)

        return self._errorReport('Expected string for pilot reference')
예제 #12
0
    def getPilotLoggingInfo(self, gridReference):
        """Retrieve the pilot logging info for a pilot reference.

          >>> gLogger.notice(dirac.getPilotLoggingInfo(gridReference))
          {'OK': True, 'Value': {"The output of the command"}}

        :param gridReference: Grid pilot job reference Id
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            # the PilotManager reply is handed back unchanged
            return PilotManagerClient().getPilotLoggingInfo(gridReference)

        return self._errorReport('Expected string for pilot reference')
예제 #13
0
    def finalize(self):
        """Job Agent finalization method.

        Reports the pilot as 'Done' to the PilotManager, together with the
        local GridCE and CEQueue configuration values. A failed report is only
        logged as a warning.

        :return: S_OK() in all cases
        """
        localGridCE = gConfig.getValue('/LocalSite/GridCE', '')
        localQueue = gConfig.getValue('/LocalSite/CEQueue', '')

        pilotClient = PilotManagerClient()
        report = pilotClient.setPilotStatus(
            str(self.pilotReference),
            'Done',
            localGridCE,
            'Report from JobAgent',
            self.siteName,
            localQueue,
        )
        if not report['OK']:
            self.log.warn('Issue setting the pilot status', report['Message'])

        return S_OK()
예제 #14
0
def main():
    """Print the pilot logging info for a pilot UUID or for the pilot of a job.

    Registers the ``-u/--uuid`` and ``-j/--jobid`` switches and parses the
    command line. When a job ID was given, the first pilot returned for that
    job is used as the UUID. Exits with status 1 on any service error and with
    0 after printing the logging info.
    """
    global uuid
    global jobid
    Script.registerSwitch(
        "u:", "uuid=", "get PilotsLogging for given Pilot UUID", setUUID)
    Script.registerSwitch(
        "j:", "jobid=", "get PilotsLogging for given Job ID", setJobID)
    Script.parseCommandLine()

    from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

    if jobid:
        pilotsReply = PilotManagerClient().getPilots(jobid)
        if not pilotsReply["OK"]:
            gLogger.error(pilotsReply["Message"])
            DIRAC.exit(1)
        jobPilots = pilotsReply["Value"]
        gLogger.debug(jobPilots)
        # the first key of the returned collection is used as the pilot UUID
        uuid = list(jobPilots)[0]

    loggingReply = PilotManagerClient().getPilotLoggingInfo(uuid)
    if not loggingReply["OK"]:
        gLogger.error(loggingReply["Message"])
        DIRAC.exit(1)
    gLogger.notice(loggingReply["Value"])

    DIRAC.exit(0)
예제 #15
0
    def getCEStatus(self):
        """Return the number of waiting and running pilots for this CE.

        Warning: information currently returned depends on the PilotManager and not HTCondor.
        Results might be wrong if pilots or jobs are submitted manually via the CE.

        :return: S_OK structure with the extra keys "SubmittedJobs" (always 0),
                 "RunningJobs" and "WaitingJobs"; a count that could not be
                 obtained stays 0 and a warning is logged
        """
        result = S_OK()
        result["SubmittedJobs"] = 0
        result["RunningJobs"] = 0
        result["WaitingJobs"] = 0

        # (key filled in the result, pilot status filter); waiting pilots first
        countedStates = (
            ("WaitingJobs", PilotStatus.PILOT_WAITING_STATES),
            ("RunningJobs", PilotStatus.RUNNING),
        )
        for resultKey, statusFilter in countedStates:
            condDict = {"DestinationSite": self.ceName, "Status": statusFilter}
            res = PilotManagerClient().countPilots(condDict)
            if not res["OK"]:
                self.log.warn("Failure getting pilot count for %s: %s " %
                              (self.ceName, res["Message"]))
                continue
            result[resultKey] = int(res["Value"])

        return result
예제 #16
0
    def getPilotInfo(self, gridReference):
        """Retrieve the information stored for a pilot reference.

          >>> gLogger.notice(dirac.getPilotInfo(gridReference))
          {'OK': True, 'Value': {}}

        :param gridReference: Pilot Job Reference
        :type gridReference: string
        :return: S_OK,S_ERROR
        """
        if isinstance(gridReference, six.string_types):
            # the PilotManager reply is handed back unchanged
            return PilotManagerClient().getPilotInfo(gridReference)

        return self._errorReport('Expected string for pilot reference')
예제 #17
0
파일: JobAgent.py 프로젝트: TaykYoku/DIRAC
    def finalize(self):
        """Job Agent finalization method.

        Shuts the computing element down, logging either the failure or any
        returned submission results, then reports the pilot as DONE to the
        PilotManager. A failed report is only logged as a warning.

        :return: S_OK() in all cases
        """

        # wait for all jobs to be completed
        shutdownResult = self.computingElement.shutdown()
        if shutdownResult["OK"]:
            if shutdownResult["Value"]:
                self.log.info("Job submission(s) result",
                              shutdownResult["Value"])
        else:
            self.log.error("CE could not be properly shut down",
                           shutdownResult["Message"])

        localGridCE = gConfig.getValue("/LocalSite/GridCE", "")
        localQueue = gConfig.getValue("/LocalSite/CEQueue", "")

        pilotClient = PilotManagerClient()
        report = pilotClient.setPilotStatus(
            str(self.pilotReference),
            PilotStatus.DONE,
            localGridCE,
            "Report from JobAgent",
            self.siteName,
            localQueue,
        )
        if not report["OK"]:
            self.log.warn("Issue setting the pilot status", report["Message"])

        return S_OK()
예제 #18
0
    def initializeHandler(cls, svcInfoDict):
        """Initialise the databases and the pilot manager client used by the handler.

        Loads and instantiates JobDB, JobLoggingDB and TaskQueueDB (in that
        order), optionally ElasticJobParametersDB when the
        ``/Services/JobMonitoring/useESForJobParametersFlag`` operations flag
        is set, and creates the PilotManagerClient.

        :param svcInfoDict: service information; not read by this method
        :return: S_OK() on success, the failing loader result, or S_ERROR when
                 a DB constructor raises RuntimeError
        """
        # (class attribute to set, module path, class name), in loading order
        dbSpecs = (
            ("jobDB", "WorkloadManagementSystem.DB.JobDB", "JobDB"),
            ("jobLoggingDB", "WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB"),
            ("taskQueueDB", "WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB"),
        )
        try:
            for attrName, modulePath, className in dbSpecs:
                loaded = ObjectLoader().loadObject(modulePath, className)
                if not loaded["OK"]:
                    return loaded
                setattr(cls, attrName, loaded["Value"]())
        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

        # stays None unless the operations flag enables the ES backend
        cls.elasticJobParametersDB = None
        if Operations().getValue(
                "/Services/JobMonitoring/useESForJobParametersFlag", False):
            try:
                loaded = ObjectLoader().loadObject(
                    "WorkloadManagementSystem.DB.ElasticJobParametersDB",
                    "ElasticJobParametersDB")
                if not loaded["OK"]:
                    return loaded
                cls.elasticJobParametersDB = loaded["Value"]()
            except RuntimeError as excp:
                return S_ERROR("Can't connect to DB: %s" % excp)

        cls.pilotManager = PilotManagerClient()
        return S_OK()
예제 #19
0
  def __getJobPilotStatus(self, jobID):
    """ Get the status of the pilot that ran the given job.

    :param jobID: job whose 'Pilot_Reference' parameter is looked up
    :return: S_OK(status), S_OK('NoPilot') when the job has no pilot reference
             or the pilot is not known, otherwise the failing result
    """
    paramResult = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference')
    if not paramResult['OK']:
      return paramResult

    pilotReference = paramResult['Value'].get('Pilot_Reference', 'Unknown')
    if pilotReference == 'Unknown':
      # There is no pilot reference, hence its status is unknown
      return S_OK('NoPilot')

    pilotResult = PilotManagerClient().getPilotInfo(pilotReference)
    if pilotResult['OK']:
      return S_OK(pilotResult['Value'][pilotReference]['Status'])

    # a missing pilot is reported as 'NoPilot' instead of an error
    if DErrno.cmpError(pilotResult, DErrno.EWMSNOPILOT):
      self.log.warn("No pilot found", "for job %d: %s" % (jobID, pilotResult['Message']))
      return S_OK('NoPilot')

    self.log.error('Failed to get pilot information',
                   'for job %d: %s' % (jobID, pilotResult['Message']))
    return pilotResult
예제 #20
0
  def getJobPilots(self, jobID):
    """Fetch the pilots associated with a job from the WMS and print them.

         >>> gLogger.notice(dirac.getJobPilots(jobID))
         {'OK': True, 'Value': {PilotID:{StatusDict}}}

       A string jobID is converted to an integer first; on success the
       returned value is pretty-printed with gLogger.notice.

       :param jobID: JobID
       :type jobID: integer or string
       :return: S_OK,S_ERROR

    """
    if isinstance(jobID, basestring):
      try:
        jobID = int(jobID)
      except Exception as conversionError:
        return self._errorReport(str(conversionError),
                                 'Expected integer or string for existing jobID')

    pilotsResult = PilotManagerClient().getPilots(jobID)
    if not pilotsResult['OK']:
      return pilotsResult

    gLogger.notice(self.pPrint.pformat(pilotsResult['Value']))
    return pilotsResult
예제 #21
0
    def __getJobPilotStatus(self, jobID):
        """Get the status of the pilot that ran the given job.

        :param jobID: job whose 'Pilot_Reference' parameter is looked up
        :return: S_OK(status), S_OK('NoPilot') when the job has no pilot
                 reference or no pilot is found, the failing job-parameter
                 result, or S_ERROR when the pilot lookup fails otherwise
        """
        paramResult = JobMonitoringClient().getJobParameter(
            jobID, 'Pilot_Reference')
        if not paramResult['OK']:
            return paramResult

        pilotReference = paramResult['Value'].get('Pilot_Reference')
        if not pilotReference:
            # There is no pilot reference, hence its status is unknown
            return S_OK('NoPilot')

        pilotResult = PilotManagerClient().getPilotInfo(pilotReference)
        if pilotResult['OK']:
            return S_OK(pilotResult['Value'][pilotReference]['Status'])

        # the "not found" case is recognised from the message text
        if "No pilots found" in pilotResult['Message']:
            self.log.warn(pilotResult['Message'])
            return S_OK('NoPilot')

        self.log.error('Failed to get pilot information',
                       'for job %d: ' % jobID + pilotResult['Message'])
        return S_ERROR('Failed to get the pilot status')
예제 #22
0
    def _getJobPilotStatus(self, jobID):
        """Get the status of the pilot that ran the given job.

        :param jobID: job whose "Pilot_Reference" parameter is looked up
        :return: S_OK(status), S_OK("NoPilot") when the job has no pilot
                 reference or the pilot is not known, otherwise the failing
                 result
        """
        paramResult = JobMonitoringClient().getJobParameter(
            jobID, "Pilot_Reference")
        if not paramResult["OK"]:
            return paramResult

        pilotReference = paramResult["Value"].get("Pilot_Reference", "Unknown")
        if pilotReference == "Unknown":
            # There is no pilot reference, hence its status is unknown
            return S_OK("NoPilot")

        pilotResult = PilotManagerClient().getPilotInfo(pilotReference)
        if pilotResult["OK"]:
            return S_OK(pilotResult["Value"][pilotReference]["Status"])

        # a missing pilot is reported as "NoPilot" instead of an error
        if DErrno.cmpError(pilotResult, DErrno.EWMSNOPILOT):
            self.log.warn("No pilot found",
                          "for job %d: %s" % (jobID, pilotResult["Message"]))
            return S_OK("NoPilot")

        self.log.error("Failed to get pilot information",
                       "for job %d: %s" % (jobID, pilotResult["Message"]))
        return pilotResult
예제 #23
0
    def getPilotSummary(self, startDate='', endDate=''):
        """Retrieve the pilot summary per CE from the WMS. A table is printed
        with gLogger.notice, and the full dictionary of results is returned.

          >>> gLogger.notice(dirac.getPilotSummary())
          {'OK': True, 'Value': {CE:{Status:Count}}}

        :param startDate: passed unchanged to PilotManagerClient.getPilotSummary
        :param endDate: passed unchanged to PilotManagerClient.getPilotSummary
        :return: S_OK,S_ERROR
        """
        result = PilotManagerClient().getPilotSummary(startDate, endDate)
        if not result['OK']:
            return result

        ceDict = result['Value']

        # One Status/Count column pair per state of the CE with the most states.
        # items()/values() are used instead of iteritems()/xrange so the method
        # runs on both Python 2 and Python 3.
        maxStates = 0
        for summary in ceDict.values():
            maxStates = max(maxStates, len(summary))

        headers = 'CE'.ljust(28) + ('Status'.ljust(12) + 'Count'.ljust(12)) * maxStates
        gLogger.notice(headers)

        for ce, summary in ceDict.items():
            line = ce.ljust(28)
            for state in sorted(summary):
                line += state.ljust(12) + str(summary[state]).ljust(12)
            gLogger.notice(line)

        return result
예제 #24
0
class PilotStatusAgent(AgentModule):
    """Agent that retires stalled pilots and clears old ones.

    On every cycle it selects the pilots in a transient state whose
    LastUpdateTime is older than ``PilotStalledDays``, declares them Deleted
    (sending accounting records when enabled), kills them through DiracAdmin
    and finally asks the PilotManager to clear old pilots.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    def __init__(self, *args, **kwargs):
        """c'tor"""
        super().__init__(*args, **kwargs)

        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None

        # Assigned in initialize() / execute(); declared here so that every
        # instance attribute exists from construction. The values mirror the
        # fallbacks used by the am_getOption calls below.
        self.pilots = None
        self.clearPilotsDelay = 30
        self.clearAbortedDelay = 7
        self.pilotStalledDays = 3
        self.gridEnv = ""

    #############################################################################
    def initialize(self):
        """Sets defaults and creates the DB, admin and client handles.

        :return: S_OK()
        """

        self.am_setOption("GridEnv", "")
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption("ClearPilotsDelay", 30)
        self.clearAbortedDelay = self.am_getOption("ClearAbortedPilotsDelay",
                                                   7)
        self.pilots = PilotManagerClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        :return: the failing result when no DB connection can be obtained,
                 S_OK() otherwise (later failures are only logged)
        """

        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue(
                    "/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue(
                        "/Systems/WorkloadManagement/%s/GridEnv" % instance,
                        "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        # Now handle pilots not updated in the last N days and declare them Deleted.
        # The connection is closed even if the handling raises.
        try:
            result = self.handleOldPilots(connection)
        finally:
            connection.close()
        if not result["OK"]:
            # the cycle continues; the failure is made visible in the log
            self.log.warn("Failed to handle old pilots", result["Message"])

        result = self.pilots.clearPilots(self.clearPilotsDelay,
                                         self.clearAbortedDelay)
        if not result["OK"]:
            self.log.warn("Failed to clear old pilots in the PilotAgentsDB")

        return S_OK()

    def handleOldPilots(self, connection):
        """
        select all pilots that have not been updated in the last N days and declared them
        Deleted, accounting for them.

        Pilots that still have a recently updated job are left alone. The
        remaining ones are accounted and killed in batches.

        :param connection: DB connection passed on to the PilotAgentsDB calls
        :return: S_OK() or the failing PilotAgentsDB result
        """
        pilotsToAccount = {}
        timeLimitToConsider = TimeUtilities.toString(
            datetime.datetime.utcnow() -
            TimeUtilities.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots(
            {"Status": PilotStatus.PILOT_TRANSIENT_STATES},
            older=timeLimitToConsider,
            timeStamp="LastUpdateTime")
        if not result["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return result
        if not result["Value"]:
            return S_OK()

        refList = result["Value"]
        result = self.pilotDB.getPilotInfo(refList)
        if not result["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return result

        pilotsDict = result["Value"]

        for pRef, pilotInfo in pilotsDict.items():
            if pilotInfo.get("Jobs") and self._checkJobLastUpdateTime(
                    pilotInfo["Jobs"], self.pilotStalledDays):
                self.log.debug(
                    "%s should not be deleted since one job of %s is running."
                    % (str(pRef), str(pilotInfo["Jobs"])))
                continue
            # the info dict itself is updated and queued for accounting
            pilotInfo["Status"] = PilotStatus.DELETED
            pilotInfo["StatusDate"] = datetime.datetime.utcnow()
            pilotsToAccount[pRef] = pilotInfo
            # flush in batches once more than 100 pilots are queued
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        # flush whatever is left of the last batch
        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """account for pilots

        When the ``PilotAccountingEnabled`` option is "yes" accounting records
        are sent first; the new status is written to the PilotAgentsDB only if
        accounting is disabled or the records were committed successfully.

        :param pilotsToAccount: dict of pilot reference -> dict with at least
                                "Status", "DestinationSite" and "StatusDate"
        :param connection: DB connection passed on to the PilotAgentsDB calls
        :return: S_OK() or the failing result
        """
        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount),
                                               conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots",
                               retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref in dbData:
                if pref not in pilotsToAccount:
                    continue
                # pilots already in a final state keep the values from the DB
                if dbData[pref]["Status"] not in PilotStatus.PILOT_FINAL_STATES:
                    dbData[pref]["Status"] = pilotsToAccount[pref]["Status"]
                    dbData[pref]["DestinationSite"] = pilotsToAccount[pref][
                        "DestinationSite"]
                    dbData[pref]["LastUpdateTime"] = pilotsToAccount[pref][
                        "StatusDate"]

            retVal = self._addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                # this failure comes from registering the records, not from
                # retrieving the pilot info
                self.log.error("Failed to add accounting records for pilots",
                               retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports",
                               retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" %
                              len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef, pDict in pilotsToAccount.items():
                self.log.verbose("Setting Status for %s to %s" %
                                 (pRef, pDict["Status"]))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict["Status"],
                                            pDict["DestinationSite"],
                                            pDict["StatusDate"],
                                            conn=connection)

        return S_OK()

    def _addPilotsAccountingReport(self, pilotsData):
        """fill accounting data

        Builds one PilotAccounting record per pilot and registers it with
        gDataStoreClient.

        :param pilotsData: dict of pilot reference -> pilot info dict
        :return: S_OK() or the first failing addRegister result
        """
        for pData in pilotsData.values():
            pA = PilotAccounting()
            pA.setEndTime(pData["LastUpdateTime"])
            pA.setStartTime(pData["SubmissionTime"])
            retVal = Registry.getUsernameForDN(pData["OwnerDN"])
            if not retVal["OK"]:
                # placeholder user name when the DN cannot be resolved
                userName = "******"
                self.log.error(
                    "Can't determine username for dn",
                    ": %s : %s" % (pData["OwnerDN"], retVal["Message"]),
                )
            else:
                userName = retVal["Value"]
            pA.setValueByKey("User", userName)
            pA.setValueByKey("UserGroup", pData["OwnerGroup"])
            result = getCESiteMapping(pData["DestinationSite"])
            if result["OK"] and pData["DestinationSite"] in result["Value"]:
                pA.setValueByKey(
                    "Site", result["Value"][pData["DestinationSite"]].strip())
            else:
                pA.setValueByKey("Site", "Unknown")
            pA.setValueByKey("GridCE", pData["DestinationSite"])
            pA.setValueByKey("GridMiddleware", pData["GridType"])
            pA.setValueByKey("GridResourceBroker", pData["Broker"])
            pA.setValueByKey("GridStatus", pData["Status"])
            if "Jobs" not in pData:
                pA.setValueByKey("Jobs", 0)
            else:
                pA.setValueByKey("Jobs", len(pData["Jobs"]))
            self.log.verbose("Added accounting record for pilot %s" %
                             pData["PilotID"])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal["OK"]:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """Kill every pilot of ``acc`` for which pilot info with a Status is found.

        :param acc: dict keyed by pilot reference; only the keys are used
        """
        for i in sorted(acc):
            result = self.diracadmin.getPilotInfo(i)
            if result["OK"] and i in result["Value"] and "Status" in result[
                    "Value"][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret["OK"]:
                    self.log.info(
                        "Successfully deleted", ": %s (Status : %s)" %
                        (i, result["Value"][i]["Status"]))
                else:
                    self.log.error("Failed to delete pilot: ",
                                   "%s : %s" % (i, ret["Message"]))
            else:
                self.log.error("Failed to get pilot info",
                               "%s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """Tell whether any job of ``joblist`` was updated within ``StalledDays``.

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as one job has a LastUpdateTime newer than the
                 limit, False otherwise (jobs whose attributes cannot be read
                 are logged and skipped)
        """
        timeLimitToConsider = datetime.datetime.utcnow(
        ) - TimeUtilities.day * StalledDays
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if not result["OK"]:
                self.log.error("Error taking job info from DB",
                               result["Message"])
                continue
            if "LastUpdateTime" not in result["Value"]:
                continue
            lastUpdateTime = result["Value"]["LastUpdateTime"]
            if TimeUtilities.fromString(lastUpdateTime) > timeLimitToConsider:
                self.log.debug(
                    "Since %s updates LastUpdateTime on %s this does not to need to be deleted."
                    % (str(jobID), str(lastUpdateTime)))
                return True
        return False
예제 #25
0
def test_PilotsDB():
  """ Exercise the PilotManagerClient calls: pilots are added, queried,
      updated and finally deleted, checking the reply of every call.
  """

  pilots = PilotManagerClient()

  # Drop leftovers of a previous (possibly aborted) run so that the test can be
  # executed again; the outcome of this cleanup is deliberately not checked
  for pilotRef in ['aPilot', 'anotherPilot']:
    pilots.deletePilots(pilotRef)

  res = pilots.addPilotTQReference(['aPilot'], 1, '/a/ownerDN', 'a/owner/Group')
  assert res['OK'] is True, res['Message']
  res = pilots.getCurrentPilotCounters({})
  assert res['OK'] is True, res['Message']
  assert res['Value'] == {'Submitted': 1}, res['Value']
  res = pilots.deletePilots('aPilot')
  assert res['OK'] is True, res['Message']
  res = pilots.getCurrentPilotCounters({})
  assert res['OK'] is True, res['Message']
  assert res['Value'] == {}, res['Value']

  res = pilots.addPilotTQReference(['anotherPilot'], 1, '/a/ownerDN', 'a/owner/Group')
  assert res['OK'] is True, res['Message']
  res = pilots.storePilotOutput('anotherPilot', 'This is an output', 'this is an error')
  assert res['OK'] is True, res['Message']
  res = pilots.getPilotOutput('anotherPilot')
  assert res['OK'] is True, res['Message']
  assert res['Value'] == {'OwnerDN': '/a/ownerDN',
                          'OwnerGroup': 'a/owner/Group',
                          'StdErr': 'this is an error',
                          'FileList': [],
                          'StdOut': 'This is an output'}
  res = pilots.getPilotInfo('anotherPilot')
  assert res['OK'] is True, res['Message']
  assert res['Value']['anotherPilot']['AccountingSent'] == 'False', res['Value']
  assert res['Value']['anotherPilot']['PilotJobReference'] == 'anotherPilot', res['Value']

  res = pilots.selectPilots({})
  assert res['OK'] is True, res['Message']
  res = pilots.getPilotSummary('', '')
  assert res['OK'] is True, res['Message']
  assert res['Value']['Total']['Submitted'] == 1
  res = pilots.getPilotMonitorWeb({}, [], 0, 100)
  assert res['OK'] is True, res['Message']
  assert res['Value']['TotalRecords'] == 1
  res = pilots.getPilotMonitorSelectors()
  assert res['OK'] is True, res['Message']
  assert res['Value'] == {'GridType': ['DIRAC'],
                          'OwnerGroup': ['a/owner/Group'],
                          'DestinationSite': ['NotAssigned'],
                          'Broker': ['Unknown'], 'Status': ['Submitted'],
                          'OwnerDN': ['/a/ownerDN'],
                          'GridSite': ['Unknown'],
                          'Owner': []}, res['Value']
  res = pilots.getPilotSummaryWeb({}, [], 0, 100)
  assert res['OK'] is True, res['Message']
  assert res['Value']['TotalRecords'] == 1, res['Value']

  res = pilots.setAccountingFlag('anotherPilot', 'True')
  assert res['OK'] is True, res['Message']
  res = pilots.setPilotStatus('anotherPilot', 'Running')
  assert res['OK'] is True, res['Message']
  res = pilots.getPilotInfo('anotherPilot')
  assert res['OK'] is True, res['Message']
  assert res['Value']['anotherPilot']['AccountingSent'] == 'True', res['Value']
  assert res['Value']['anotherPilot']['Status'] == 'Running', res['Value']

  res = pilots.setJobForPilot(123, 'anotherPilot')
  assert res['OK'] is True, res['Message']
  res = pilots.setPilotBenchmark('anotherPilot', 12.3)
  assert res['OK'] is True, res['Message']
  res = pilots.countPilots({})
  assert res['OK'] is True, res['Message']
#     res = pilots.getCounters()
#     # getPilotStatistics

  # Leave the DB as it was found: no pilot must be counted any more
  res = pilots.deletePilots('anotherPilot')
  assert res['OK'] is True, res['Message']
  res = pilots.getCurrentPilotCounters({})
  assert res['OK'] is True, res['Message']
  assert res['Value'] == {}, res['Value']
예제 #26
0
# Command line switches: the pilot can be given directly (uuid) or through a job ID
Script.registerSwitch('u:', 'uuid=', 'get PilotsLogging for given Pilot UUID',
                      setUUID)
Script.registerSwitch('j:', 'jobid=', 'get PilotsLogging for given Job ID',
                      setJobID)

Script.setUsageMessage('\n'.join([
    __doc__.split('\n')[1], 'Usage:',
    '  %s option value ' % Script.scriptName,
    'Only one option (either uuid or jobid) should be used.'
]))

Script.parseCommandLine()

# Kept after parseCommandLine(), exactly where the script had it
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

if jobid:
    # Look up the pilot(s) bound to the job and use the first one
    result = PilotManagerClient().getPilots(jobid)
    if not result['OK']:
        gLogger.error(result['Message'])
        DIRAC.exit(1)
    gLogger.debug(result['Value'])
    if not result['Value']:
        # Nothing to index: report it instead of failing with an IndexError
        gLogger.error('No pilot found for job %s' % jobid)
        DIRAC.exit(1)
    # list(...) rather than .keys()[0]: dict views cannot be indexed on Python 3
    uuid = list(result['Value'])[0]

result = PilotManagerClient().getPilotLoggingInfo(uuid)
if not result['OK']:
    gLogger.error(result['Message'])
    DIRAC.exit(1)
gLogger.notice(result['Value'])

DIRAC.exit(0)
예제 #27
0
class PilotCommand(Command):
    """
    Pilot "master" Command.

    Reads the pilot summary from the pilot manager client and stores / reads
    it back through the pilot cache calls of the resource management client.
    """

    def __init__(self, args=None, clients=None):
        """Pick the clients from ``self.apis`` when provided, otherwise create them.

        :param args: forwarded to the base ``Command`` constructor
        :param clients: forwarded to the base ``Command`` constructor
        """
        super(PilotCommand, self).__init__(args, clients)

        self.pilots = (self.apis['Pilots']
                       if 'Pilots' in self.apis else PilotManagerClient())
        self.rmClient = (self.apis['ResourceManagementClient']
                         if 'ResourceManagementClient' in self.apis else
                         ResourceManagementClient())

    def _storeCommand(self, result):
        """Store the dictionaries produced by doNew through addOrModifyPilotCache.

        :param result: iterable of dictionaries providing the keys Site, CE,
                       PilotsPerJob, PilotJobEff and Status
        :return: S_OK(), or the first failing reply of addOrModifyPilotCache
        """
        cacheFields = ('Site', 'CE', 'PilotsPerJob', 'PilotJobEff', 'Status')
        for entry in result:
            stored = self.rmClient.addOrModifyPilotCache(
                *[entry[field] for field in cacheFields])
            if not stored['OK']:
                return stored

        return S_OK()

    def _prepareCommand(self):
        """Validate ``self.args`` and extract what the command works on.

        Two arguments are required:
        - name : taken as is
        - element : 'Site' or 'Resource'

        :return: S_OK((element, name)) or S_ERROR when an argument is missing
                 or element has another value
        """
        cmdArgs = self.args

        if 'name' not in cmdArgs:
            return S_ERROR('"name" not found in self.args')
        if 'element' not in cmdArgs:
            return S_ERROR('element is missing')

        element = cmdArgs['element']
        if element not in ['Site', 'Resource']:
            return S_ERROR('"%s" is not Site nor Resource' % element)

        return S_OK((element, cmdArgs['name']))

    def doNew(self, masterParams=None):
        """Query the pilot summary, normalise it and store it.

        :param masterParams: optional (element, name) pair; when None the pair
                             comes from _prepareCommand
        :return: S_OK(list of dictionaries, one per record) or S_ERROR
        """
        if masterParams is None:
            prepared = self._prepareCommand()
            if not prepared['OK']:
                return prepared
            masterParams = prepared['Value']
        element, name = masterParams

        if element == 'Site':
            selector = 'GridSite'
        elif element == 'Resource':
            selector = 'ExpandSite'
        else:
            # You should never see this error
            return S_ERROR('"%s" is not  Site nor Resource' % element)
        wmsDict = {selector: name}

        summary = self.pilots.getPilotSummaryWeb(wmsDict, [], 0, 0)
        if not summary['OK']:
            return summary
        summary = summary['Value']

        if 'ParameterNames' not in summary:
            return S_ERROR('Wrong result dictionary, missing "ParameterNames"')
        columns = summary['ParameterNames']

        if 'Records' not in summary:
            return S_ERROR('Wrong formed result dictionary, missing "Records"')

        uniformResult = []
        for record in summary['Records']:
            # One dictionary per record, keyed by the names listed in
            # "ParameterNames"; the keys used afterwards are 'Site', 'CE',
            # 'PilotsPerJob', 'PilotJobEff' and 'Status'
            pilotDict = dict(zip(columns, record))
            for floatField in ('PilotsPerJob', 'PilotJobEff'):
                pilotDict[floatField] = float(pilotDict[floatField])
            uniformResult.append(pilotDict)

        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """Read the stored records of the element named in ``self.args``.

        :return: S_OK(list of dictionaries built from the 'Columns' and 'Value'
                 of selectPilotCache), or the failing reply
        """
        prepared = self._prepareCommand()
        if not prepared['OK']:
            return prepared
        element, name = prepared['Value']

        if element == 'Site':
            # WMS returns Site entries with CE = 'Multiple'
            site, ce = name, 'Multiple'
        elif element == 'Resource':
            site, ce = None, name
        else:
            # You should never see this error
            return S_ERROR('"%s" is not  Site nor Resource' % element)

        cached = self.rmClient.selectPilotCache(site, ce)
        if not cached['OK']:
            return cached

        return S_OK(
            [dict(zip(cached['Columns'], row)) for row in cached['Value']])

    def doMaster(self):
        """Run doNew for all sites and then for all CEs.

        Failures of doNew are not returned; their messages are appended to
        ``self.metrics['failed']``.

        :return: S_OK(self.metrics), or the failing reply of getSites /
                 getCESiteMapping
        """
        sitesRes = getSites()
        if not sitesRes['OK']:
            return sitesRes
        siteNames = sitesRes['Value']

        mappingRes = getCESiteMapping()
        if not mappingRes['OK']:
            return mappingRes
        ces = list(mappingRes['Value'])

        for masterParams in (('Site', siteNames), ('Resource', ces)):
            outcome = self.doNew(masterParams)
            if not outcome['OK']:
                self.metrics['failed'].append(outcome['Message'])

        return S_OK(self.metrics)
예제 #28
0
def test_PilotsDB():
    """Exercise the PilotManagerClient calls: pilots are added, queried,
    updated and finally deleted, checking the reply of every call.
    """
    client = PilotManagerClient()

    firstRef = "aPilot"
    secondRef = "anotherPilot"
    ownerDN = "/a/ownerDN"
    ownerGroup = "a/owner/Group"

    # This will allow you to run the test again if necessary
    for staleRef in (firstRef, secondRef):
        client.deletePilots(staleRef)

    # First pilot: add it, count it, delete it
    reply = client.addPilotTQReference([firstRef], 1, ownerDN, ownerGroup)
    assert reply["OK"], reply["Message"]
    reply = client.getCurrentPilotCounters({})
    assert reply["OK"], reply["Message"]
    assert reply["Value"] == {"Submitted": 1}
    reply = client.deletePilots(firstRef)
    assert reply["OK"], reply["Message"]
    reply = client.getCurrentPilotCounters({})
    assert reply["OK"], reply["Message"]
    assert reply["Value"] == {}

    # Second pilot: output storage and info retrieval
    reply = client.addPilotTQReference([secondRef], 1, ownerDN, ownerGroup)
    assert reply["OK"], reply["Message"]
    reply = client.storePilotOutput(secondRef, "This is an output", "this is an error")
    assert reply["OK"], reply["Message"]
    reply = client.getPilotOutput(secondRef)
    assert reply["OK"], reply["Message"]
    expectedOutput = {
        "OwnerDN": ownerDN,
        "OwnerGroup": ownerGroup,
        "StdErr": "this is an error",
        "FileList": [],
        "StdOut": "This is an output",
    }
    assert reply["Value"] == expectedOutput
    reply = client.getPilotInfo(secondRef)
    assert reply["OK"], reply["Message"]
    assert reply["Value"][secondRef]["AccountingSent"] == "False"
    assert reply["Value"][secondRef]["PilotJobReference"] == secondRef

    # Selections, summaries and monitoring views
    reply = client.selectPilots({})
    assert reply["OK"], reply["Message"]
    reply = client.getPilotSummary("", "")
    assert reply["OK"], reply["Message"]
    assert reply["Value"]["Total"]["Submitted"] == 1
    reply = client.getPilotMonitorWeb({}, [], 0, 100)
    assert reply["OK"], reply["Message"]
    assert reply["Value"]["TotalRecords"] == 1
    reply = client.getPilotMonitorSelectors()
    assert reply["OK"], reply["Message"]
    expectedSelectors = {
        "GridType": ["DIRAC"],
        "OwnerGroup": [ownerGroup],
        "DestinationSite": ["NotAssigned"],
        "Broker": ["Unknown"],
        "Status": ["Submitted"],
        "OwnerDN": [ownerDN],
        "GridSite": ["Unknown"],
        "Owner": [],
    }
    assert reply["Value"] == expectedSelectors
    reply = client.getPilotSummaryWeb({}, [], 0, 100)
    assert reply["OK"], reply["Message"]
    assert reply["Value"]["TotalRecords"] == 1

    # Updates: accounting flag and status
    reply = client.setAccountingFlag(secondRef, "True")
    assert reply["OK"], reply["Message"]
    reply = client.setPilotStatus(secondRef, "Running")
    assert reply["OK"], reply["Message"]
    reply = client.getPilotInfo(secondRef)
    assert reply["OK"], reply["Message"]
    assert reply["Value"][secondRef]["AccountingSent"] == "True"
    assert reply["Value"][secondRef]["Status"] == "Running"

    reply = client.setJobForPilot(123, secondRef)
    assert reply["OK"], reply["Message"]
    reply = client.setPilotBenchmark(secondRef, 12.3)
    assert reply["OK"], reply["Message"]
    reply = client.countPilots({})
    assert reply["OK"], reply["Message"]
    #     res = pilots.getCounters()
    #     # getPilotStatistics

    # Final cleanup: nothing must be counted any more
    reply = client.deletePilots(secondRef)
    assert reply["OK"], reply["Message"]
    reply = client.getCurrentPilotCounters({})
    assert reply["OK"], reply["Message"]
    assert reply["Value"] == {}
예제 #29
0
  def export_getJobPageSummaryWeb(self, selectDict, sortList, startItem, maxItems, selectJobs=True):
    """ Get the summary of the job information for a given page in the
        job monitor in a generic format

        :param dict selectDict: selection conditions. The keys FromDate, LastUpdate, ToDate and
                                PilotJobReference are interpreted here and, when set, removed
                                from the dictionary (which is therefore modified in place)
        :param list sortList: sorting instructions, only the first (attribute, order) pair is used
        :param int startItem: index of the first job of the page
        :param int maxItems: maximum number of jobs of the page
        :param bool selectJobs: when False only TotalRecords and Extras are filled
        :return: S_OK(dict) with TotalRecords and, when jobs match, Extras (count per Status),
                 ParameterNames and Records; S_ERROR when rights, range or DB queries fail
    """
    resultDict = {}
    startDate = selectDict.get('FromDate', None)
    if startDate:
      del selectDict['FromDate']
    # For backward compatibility
    if startDate is None:
      startDate = selectDict.get('LastUpdate', None)
      if startDate:
        del selectDict['LastUpdate']
    endDate = selectDict.get('ToDate', None)
    if endDate:
      del selectDict['ToDate']

    # Provide JobID bound to a specific PilotJobReference
    # There is no reason to have both PilotJobReference and JobID in selectDict
    # If that occurs, use the JobID instead of the PilotJobReference
    pilotJobRefs = selectDict.get('PilotJobReference')
    if pilotJobRefs:
      del selectDict['PilotJobReference']
      if not selectDict.get('JobID'):
        if not isinstance(pilotJobRefs, list):
          pilotJobRefs = [pilotJobRefs]
        selectDict['JobID'] = []
        # One client is enough for all the references
        pilotClient = PilotManagerClient()
        for pilotJobRef in pilotJobRefs:
          res = pilotClient.getPilotInfo(pilotJobRef)
          # A reference missing from the reply contributes no job instead of raising KeyError
          if res['OK'] and 'Jobs' in res['Value'].get(pilotJobRef, {}):
            selectDict['JobID'].extend(res['Value'][pilotJobRef]['Jobs'])

    result = self.jobPolicy.getControlledUsers(RIGHT_GET_INFO)
    if not result['OK']:
      return S_ERROR('Failed to evaluate user rights')
    if result['Value'] != 'ALL':
      selectDict[('Owner', 'OwnerGroup')] = result['Value']

    # Sorting instructions. Only one for the moment.
    if sortList:
      orderAttribute = sortList[0][0] + ":" + sortList[0][1]
    else:
      orderAttribute = None

    statusDict = {}
    result = self.gJobDB.getCounters('Jobs', ['Status'], selectDict,
                                     newer=startDate,
                                     older=endDate,
                                     timeStamp='LastUpdateTime')

    nJobs = 0
    if result['OK']:
      for stDict, count in result['Value']:
        nJobs += count
        statusDict[stDict['Status']] = count

    resultDict['TotalRecords'] = nJobs
    if nJobs == 0:
      return S_OK(resultDict)

    resultDict['Extras'] = statusDict

    if selectJobs:
      iniJob = startItem
      if iniJob >= nJobs:
        return S_ERROR('Item number out of range')

      result = self.gJobDB.selectJobs(selectDict, orderAttribute=orderAttribute,
                                      newer=startDate, older=endDate, limit=(maxItems, iniJob))
      if not result['OK']:
        return S_ERROR('Failed to select jobs: ' + result['Message'])

      summaryJobList = result['Value']
      if not self.globalJobsInfo:
        validJobs, _invalidJobs, _nonauthJobs, _ownJobs = self.jobPolicy.evaluateJobRights(summaryJobList,
                                                                                           RIGHT_GET_INFO)
        summaryJobList = validJobs

      result = self.getAttributesForJobList(summaryJobList, SUMMARY)
      if not result['OK']:
        return S_ERROR('Failed to get job summary: ' + result['Message'])

      summaryDict = result['Value']

      # Evaluate last sign of life time
      for jobDict in summaryDict.values():
        if jobDict['HeartBeatTime'] == 'None':
          jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']
        else:
          lastTime = Time.fromString(jobDict['LastUpdateTime'])
          hbTime = Time.fromString(jobDict['HeartBeatTime'])
          # Not only Stalled jobs but also Failed jobs because Stalled
          if ((hbTime - lastTime) > timedelta(0) or
                  jobDict['Status'] == "Stalled" or
                  jobDict['MinorStatus'].startswith('Job stalled') or
                  jobDict['MinorStatus'].startswith('Stalling')):
            jobDict['LastSignOfLife'] = jobDict['HeartBeatTime']
          else:
            jobDict['LastSignOfLife'] = jobDict['LastUpdateTime']

      tqDict = {}
      result = self.gTaskQueueDB.getTaskQueueForJobs(summaryJobList)
      if result['OK']:
        tqDict = result['Value']

      # If no jobs can be selected after the properties check
      if not summaryDict:
        return S_OK(resultDict)

      # prepare the standard structure now
      # The parameter names are those of the first job. They are materialised as a list
      # because dict views can neither be indexed nor concatenated on Python 3
      firstJobDict = summaryDict[next(iter(summaryDict))]
      paramNames = list(firstJobDict)

      records = []
      for jobID, jobDict in summaryDict.items():
        jParList = [jobDict[pname] for pname in paramNames]
        jParList.append(tqDict.get(jobID, 0))
        records.append(jParList)

      resultDict['ParameterNames'] = paramNames + ['TaskQueueID']
      resultDict['Records'] = records

    return S_OK(resultDict)
예제 #30
0
class PilotStatusAgent(AgentModule):
    """
      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used
                   for the agent restart
  """

    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

    def __init__(self, *args, **kwargs):
        """ c'tor

        All arguments are handed over to AgentModule. The DB and admin handles
        only get their None placeholders here.
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = self.pilotDB = self.diracadmin = None

    #############################################################################
    def initialize(self):
        """Set the default options and create the DB, admin and client objects.

        :return: S_OK()
        """
        defaultOptions = (('PollingTime', 120), ('GridEnv', ''),
                          ('PilotStalledDays', 3))
        for optionName, defaultValue in defaultOptions:
            self.am_setOption(optionName, defaultValue)

        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()

        # Delays handed to clearPilots() in execute()
        getOption = self.am_getOption
        self.clearPilotsDelay = getOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = getOption('ClearAbortedPilotsDelay', 7)

        self.pilots = PilotManagerClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Handles the pilots not updated for PilotStalledDays days through
        handleOldPilots() and then asks the pilot manager client to clear
        the old pilots.

        :return: S_OK(), or the failing reply of the DB connection request
        """

        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance,
                        '')
        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        try:
            result = self.handleOldPilots(connection)
        finally:
            # Close the connection even when handleOldPilots raises
            connection.close()
        if not result['OK']:
            # Not fatal for the cycle: report it and go on with the clearing
            self.log.error('Failed to handle old pilots', result['Message'])

        result = self.pilots.clearPilots(self.clearPilotsDelay,
                                         self.clearAbortedDelay)
        if not result['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """ Clear pilots in the faulty Waiting state

        Waiting pilots matching ``condDict`` and selected with ``older`` set to
        now minus MAX_WAITING_STATE_LENGTH * Time.hour are set to Stalled.

        :param dict condDict: must provide OwnerDN, OwnerGroup, GridType and Broker
        :return: S_OK(), or the failing reply of the pilot selection. A pilot
                 whose status cannot be changed is only reported in the log
        """

        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            'Status': 'Waiting',
            'OwnerDN': condDict['OwnerDN'],
            'OwnerGroup': condDict['OwnerGroup'],
            'GridType': condDict['GridType'],
            'Broker': condDict['Broker']
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()
        refList = result['Value']

        for pilotRef in refList:
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(
                pilotRef, 'Stalled', statusReason='Exceeded max waiting time')
            if not result['OK']:
                # Keep going with the other pilots, but do not hide the failure
                self.log.warn('Failed to set pilot %s to Stalled: %s' %
                              (pilotRef, result['Message']))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parametric parent job from the PilotAgentsDB

        When at least one child is already in the DB the parent is simply
        deleted. Otherwise the children are first added with the task queue,
        owner, broker and grid type of the parent, their status is set, and
        only then the parent is deleted.

        :param pRef: reference of the parent pilot
        :param dict pDict: must provide 'ChildRefs' and, when the children have to be added,
                           'ChildDicts' (child reference -> dict with 'Status' and
                           'DestinationSite')
        :param connection: DB connection passed as ``conn`` to the PilotAgentsDB calls
        :return: S_OK() or the S_ERROR of the failing step
        """

        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True
                # One child is enough, no need to query the others
                break

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No info found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        tqID = parentInfo['TaskQueueID']
        ownerDN = parentInfo['OwnerDN']
        ownerGroup = parentInfo['OwnerGroup']
        broker = parentInfo['Broker']
        gridType = parentInfo['GridType']
        result = self.pilotDB.addPilotTQReference(childList,
                                                  tqID,
                                                  ownerDN,
                                                  ownerGroup,
                                                  broker=broker,
                                                  gridType=gridType)
        if not result['OK']:
            return result

        # Every child gets its status set, a single failure keeps the parent in the DB
        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(
                chRef,
                chDict['Status'],
                destination=chDict['DestinationSite'],
                conn=connection)
            if not result['OK']:
                children_added = False
        if not children_added:
            return S_ERROR('Failed to add children')

        # Same as in the branch above: a failed deletion is reported to the caller
        result = self.pilotDB.deletePilot(pRef, conn=connection)
        if not result['OK']:
            return result
        return S_OK()

    def handleOldPilots(self, connection):
        """
        Select all pilots in one of the queryStateList states that have not been
        updated in the last pilotStalledDays days and declare them Deleted,
        accounting for them.

        Pilots with a job updated within the same period are left alone. The
        others are accounted and killed in batches.

        :param connection: DB connection handed over to accountPilots
        :return: S_OK(), or the failing reply of the PilotAgentsDB queries
        """
        staleBefore = Time.toString(Time.dateTime() -
                                    Time.day * self.pilotStalledDays)
        selection = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                              older=staleBefore,
                                              timeStamp='LastUpdateTime')
        if not selection['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return selection
        refList = selection['Value']
        if not refList:
            return S_OK()

        infoRes = self.pilotDB.getPilotInfo(refList)
        if not infoRes['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return infoRes

        def accountAndKill(batch):
            # Account first, then kill, as one unit of work per batch
            self.accountPilots(batch, connection)
            self._killPilots(batch)

        pilotsToAccount = {}
        for pRef, pilotInfo in infoRes['Value'].items():
            jobs = pilotInfo.get('Jobs')
            if jobs and self._checkJobLastUpdateTime(jobs,
                                                     self.pilotStalledDays):
                self.log.debug(
                    '%s should not be deleted since one job of %s is running.'
                    % (str(pRef), str(jobs)))
                continue

            pilotInfo['Status'] = 'Deleted'
            pilotInfo['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = pilotInfo
            if len(pilotsToAccount) > 100:
                accountAndKill(pilotsToAccount)
                pilotsToAccount = {}

        # Whatever is left after the last full batch
        accountAndKill(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ account for pilots

        Unless the PilotAccountingEnabled option is different from "yes", an
        accounting record is built for each pilot and committed. The new status
        of the pilots is then written to the PilotAgentsDB; this is skipped
        when the accounting is enabled but the commit failed.

        :param dict pilotsToAccount: pilot reference -> dict providing Status,
                                     DestinationSite and StatusDate
        :param connection: DB connection passed as ``conn`` to the PilotAgentsDB calls
        :return: S_OK(), or the failing reply of the info retrieval or of the
                 record creation
        """
        pae = self.am_getOption('PilotAccountingEnabled', 'yes')
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount.keys()),
                                               conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots',
                               retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Pilots already in a final state keep what the DB says
                    if dbData[pref]['Status'] not in self.finalStateList:
                        dbData[pref]['Status'] = pilotsToAccount[pref][
                            'Status']
                        dbData[pref]['DestinationSite'] = pilotsToAccount[
                            pref]['DestinationSite']
                        dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref][
                            'StatusDate']

            retVal = self._addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                # This step builds the records, it does not retrieve pilot info
                self.log.error('Fail to add accounting records for pilots',
                               retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports",
                               retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" %
                              len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' %
                                 (pRef, pDict['Status']))
                retVal = self.pilotDB.setPilotStatus(pRef,
                                                     pDict['Status'],
                                                     pDict['DestinationSite'],
                                                     pDict['StatusDate'],
                                                     conn=connection)
                if not retVal['OK']:
                    # Go on with the other pilots, but leave a trace of the failure
                    self.log.error('Failed to set Status for pilot',
                                   '%s : %s' % (pRef, retVal['Message']))

        return S_OK()

    def _addPilotsAccountingReport(self, pilotsData):
        """ fill accounting data

        Builds one PilotAccounting record per pilot and registers it with
        gDataStoreClient (the commit is left to the caller).

        :param dict pilotsData: pilot reference -> dict providing LastUpdateTime, SubmissionTime,
                                OwnerDN, OwnerGroup, DestinationSite, GridType, Broker, Status,
                                PilotID and optionally Jobs
        :return: S_OK(), or the first failing reply of addRegister
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = Registry.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:",
                               pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])

            # The reply value is indexed by CE name, so the site has to be looked
            # up in it; calling strip() on the mapping itself cannot work
            ceName = pData['DestinationSite']
            siteName = ''
            result = getCESiteMapping(ceName)
            if result['OK'] and ceName in result['Value']:
                siteName = result['Value'][ceName].strip()
            pA.setValueByKey('Site', siteName or 'Unknown')

            pA.setValueByKey('GridCE', pData['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            if 'Jobs' not in pData:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pData['Jobs']))
            self.log.verbose("Added accounting record for pilot %s" %
                             pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """Kill every pilot whose reference is a key of ``acc`` and log the outcome.

        The keys are handled in sorted order. A pilot is only killed when
        ``self.diracadmin.getPilotInfo`` succeeds and reports a 'Status' for it.

        :param acc: dictionary keyed by pilot reference (the values are not used)
        """
        for pilotRef in sorted(acc):
            infoRes = self.diracadmin.getPilotInfo(pilotRef)
            statusKnown = (infoRes['OK'] and pilotRef in infoRes['Value']
                           and 'Status' in infoRes['Value'][pilotRef])
            if not statusKnown:
                self.log.error("Failed to get pilot info",
                               "%s : %s" % (pilotRef, str(infoRes)))
                continue

            killRes = self.diracadmin.killPilot(str(pilotRef))
            if not killRes['OK']:
                self.log.error("Failed to delete pilot: ",
                               "%s : %s" % (pilotRef, killRes['Message']))
                continue

            self.log.info("Successfully deleted: %s (Status : %s)" %
                          (pilotRef, infoRes['Value'][pilotRef]['Status']))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """Tell whether at least one job of ``joblist`` was updated recently.

        :param joblist: iterable of job IDs; each one is converted with ``int``
        :param StalledDays: multiplier of ``Time.day`` giving the look-back period
        :return: True as soon as a job has a LastUpdateTime newer than the
                 limit, False otherwise
        """
        timeLimit = Time.dateTime() - Time.day * StalledDays
        for jobID in joblist:
            attrRes = self.jobDB.getJobAttributes(int(jobID))
            if not attrRes['OK']:
                self.log.error("Error taking job info from DB",
                               attrRes['Message'])
                continue
            if 'LastUpdateTime' not in attrRes['Value']:
                continue
            lastUpdateTime = attrRes['Value']['LastUpdateTime']
            if Time.fromString(lastUpdateTime) > timeLimit:
                self.log.debug(
                    'Since %s updates LastUpdateTime on %s this does not to need to be deleted.'
                    % (str(jobID), str(lastUpdateTime)))
                return True
        return False