コード例 #1
0
    def initialize(self):
        """Set the agent defaults: polling time, DB clients and cleaning options."""
        self.am_setOption("PollingTime", 120)

        # Database clients used during the cleaning cycle
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )

        # Production job types are never cleaned; take them from the agent
        # options, falling back to the Operations helper.
        configuredTypes = self.am_getOption('ProductionTypes', [])
        if configuredTypes:
            self.prod_types = configuredTypes
        else:
            self.prod_types = Operations().getValue(
                'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" %
            (', '.join(self.prod_types)))

        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
        self.jobByJob = self.am_getOption('JobByJob', False)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

        # Removal delay (days) per final status; 'Any' is disabled by default
        for status, default in (('Done', 7), ('Killed', 7),
                                ('Failed', 7), ('Any', -1)):
            self.removeStatusDelay[status] = self.am_getOption(
                'RemoveStatusDelay/%s' % status, default)

        return S_OK()
コード例 #2
0
ファイル: Matcher.py プロジェクト: marianne013/DIRAC
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """Constructor: wire up the DB clients and helpers.

        Any client passed in is used as-is; a falsy argument is replaced by a
        freshly constructed default instance.
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()
コード例 #3
0
def cleanTaskQueues():
  """Re-enable all task queues and clean up orphan jobs.

  Orphan jobs are removed from the task queues, rescheduled in the JobDB and
  given a RECEIVED logging record; per-job failures are logged and skipped.
  """
  tqDB = TaskQueueDB()
  jobDB = JobDB()
  logDB = JobLoggingDB()

  enableResult = tqDB.enableAllTaskQueues()
  if not enableResult['OK']:
    return enableResult
  orphansResult = tqDB.findOrphanJobs()
  if not orphansResult['OK']:
    return orphansResult

  for jobID in orphansResult['Value']:
    res = tqDB.deleteJob(jobID)
    if not res['OK']:
      gLogger.error("Cannot delete from TQ job %s" % jobID, res['Message'])
      continue
    res = jobDB.rescheduleJob(jobID)
    if not res['OK']:
      gLogger.error("Cannot reschedule in JobDB job %s" % jobID, res['Message'])
      continue
    res = logDB.addLoggingRecord(jobID, JobStatus.RECEIVED, "", "", source="JobState")
    if not res['OK']:
      gLogger.error("Cannot add logging record in JobLoggingDB %s" % jobID, res['Message'])
      continue
  return S_OK()
コード例 #4
0
ファイル: OptimizerModule.py プロジェクト: vingar/DIRAC
    def initialize(self, jobDB=False, logDB=False):
        """Initialize the Optimizer Agent.

        :param jobDB: optional JobDB client; a fresh one is created when falsy
        :param logDB: optional JobLoggingDB client; a fresh one is created when falsy
        """
        self.jobDB = jobDB if jobDB else JobDB()
        self.logDB = logDB if logDB else JobLoggingDB()

        # The optimizer name is the agent module name minus a trailing "Agent"
        trailing = "Agent"
        optimizerName = self.am_getModuleParam('agentName')
        if optimizerName.endswith(trailing):
            optimizerName = optimizerName[:-len(trailing)]
        self.am_setModuleParam('optimizerName', optimizerName)

        self.startingMinorStatus = self.am_getModuleParam('optimizerName')
        self.startingMajorStatus = "Checking"
        self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
        self.requiredJobInfo = 'jdl'
        self.am_setOption("PollingTime", 30)

        return self.initializeOptimizer()
コード例 #5
0
ファイル: OptimizerModule.py プロジェクト: DIRACGrid/DIRAC
    def initialize(self, jobDB=None, logDB=None):
        """Initialization of the Optimizer Agent.

        :param jobDB: optional JobDB client (created when None)
        :param logDB: optional JobLoggingDB client (created when None)
        """
        if jobDB is None:
            jobDB = JobDB()
        self.jobDB = jobDB
        # A non-functional JobDB is fatal for the optimizer
        if not self.jobDB.isValid():
            dExit(1)

        # Optionally switch job parameters to the ElasticSearch backend
        useESForJobParametersFlag = Operations().getValue(
            "/Services/JobMonitoring/useESForJobParametersFlag", False)
        if useESForJobParametersFlag:
            try:
                result = ObjectLoader().loadObject(
                    "WorkloadManagementSystem.DB.ElasticJobParametersDB",
                    "ElasticJobParametersDB")
                if not result["OK"]:
                    return result
                self.elasticJobParametersDB = result["Value"]()
            except RuntimeError as excp:
                return S_ERROR("Can't connect to DB: %s" % excp)

        if logDB is None:
            logDB = JobLoggingDB()
        self.logDB = logDB

        # The optimizer name is the agent module name minus a trailing "Agent"
        optimizerName = self.am_getModuleParam("agentName")
        if optimizerName.endswith("Agent"):
            optimizerName = optimizerName[:-len("Agent")]
        self.am_setModuleParam("optimizerName", optimizerName)

        self.startingMinorStatus = self.am_getModuleParam("optimizerName")
        self.failedStatus = self.am_getOption("FailedJobStatus",
                                              JobStatus.FAILED)
        self.am_setOption("PollingTime", 30)

        return self.initializeOptimizer()
コード例 #6
0
    def initialize(self):
        """Set the default parameters and prepare the thread pool."""
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()

        if not self.am_getOption("Enable", True):
            self.log.info("Stalled Job Agent running in disabled mode")

        wms_instance = getSystemInstance("WorkloadManagement")
        if not wms_instance:
            return S_ERROR(
                "Can not get the WorkloadManagement system instance")

        self.stalledJobsTolerantSites = self.am_getOption(
            "StalledJobsTolerantSites", [])
        self.stalledJobsToleranceTime = self.am_getOption(
            "StalledJobsToleranceTime", 0)
        self.stalledJobsToRescheduleSites = self.am_getOption(
            "StalledJobsToRescheduleSites", [])

        self.submittingTime = self.am_getOption("SubmittingTime",
                                                self.submittingTime)
        self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
        self.rescheduledTime = self.am_getOption("RescheduledTime",
                                                 self.rescheduledTime)

        wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance,
                                 "JobWrapper")

        # The watchdog heartbeat cycle drives the stalled/failed time windows;
        # never go below the configured minimum checking time.
        failedTime = self.am_getOption("FailedTimeHours", 6)
        watchdogCycle = gConfig.getValue(
            cfgPath(wrapperSection, "CheckingTime"), 30 * 60)
        minCheckingTime = gConfig.getValue(
            cfgPath(wrapperSection, "MinCheckingTime"), 20 * 60)
        if watchdogCycle < minCheckingTime:
            watchdogCycle = minCheckingTime
        stalledTime = self.am_getOption("StalledTimeHours", 2)
        self.log.verbose("", "StalledTime = %s cycles" % (stalledTime))
        # Add half cycle to avoid race conditions
        self.stalledTime = int(watchdogCycle * (stalledTime + 0.5))
        self.log.verbose("", "FailedTime = %s cycles" % (failedTime))
        self.failedTime = int(watchdogCycle * (failedTime + 0.5))

        self.minorStalledStatuses = (
            JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
            "Stalling for more than %d sec" % self.failedTime,
        )

        # Thread pool used to process jobs concurrently
        maxNumberOfThreads = self.am_getOption("MaxNumberOfThreads", 15)
        self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads)
        self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(
            max_workers=maxNumberOfThreads)

        return S_OK()
コード例 #7
0
ファイル: StalledJobAgent.py プロジェクト: acasajus/DIRAC
    def initialize(self):
        """Set the default agent parameters."""
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption('PollingTime', 60 * 60)
        enabled = self.am_getOption('Enable', True)
        if not enabled:
            self.log.info('Stalled Job Agent running in disabled mode')
        return S_OK()
コード例 #8
0
ファイル: Matcher.py プロジェクト: Andrew-McNab-UK/DIRAC
  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """Constructor: use the supplied DB clients, or build fresh defaults."""
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
コード例 #9
0
def initializeMatcherHandler(serviceInfo):
    """Matcher Service initialization: create the DB clients, register the
    monitoring activities and schedule the periodic maintenance tasks."""
    global gJobDB
    global gTaskQueueDB
    global jlDB
    global pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    # Monitoring activities of the matching service
    for name, description, unit, opType in (
            ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
            ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
            ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
            ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN)):
        gMonitor.registerActivity(name, description, 'Matching', unit,
                                  opType, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120,
                                     gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
コード例 #10
0
ファイル: JobCleaningAgent.py プロジェクト: ltomassetti/DIRAC
  def initialize( self ):
    """Set the agent defaults: polling time, DB clients and cleaning options."""
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    # Production job types are excluded from cleaning
    configuredTypes = self.am_getOption('ProductionTypes', [])
    if configuredTypes:
      self.prod_types = configuredTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )

    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    # Removal delay (days) per final status; 'Any' is disabled by default
    for status, default in ( ( 'Done', 7 ), ( 'Killed', 7 ), ( 'Failed', 7 ), ( 'Any', -1 ) ):
      self.removeStatusDelay[status] = self.am_getOption( 'RemoveStatusDelay/%s' % status, default )

    return S_OK()
コード例 #11
0
ファイル: JobMonitoringHandler.py プロジェクト: sposs/DIRAC
def initializeJobMonitoringHandler(serviceInfo):
    """Instantiate the module-level DB clients used by the JobMonitoring service."""
    global jobDB
    global jobLoggingDB
    global taskQueueDB

    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()
コード例 #12
0
def initializeJobMonitoringHandler(serviceInfo):
    """Instantiate the module-level DB clients used by the JobMonitoring service."""
    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gTaskQueueDB = TaskQueueDB()
    return S_OK()
コード例 #13
0
  def initialize( self, jobDB = False, logDB = False ):
    """Initialization of the Optimizer Agent.

    :param jobDB: optional JobDB client; a fresh one is created when falsy
    :param logDB: optional JobLoggingDB client; a fresh one is created when falsy
    """
    self.jobDB = jobDB if jobDB else JobDB()
    self.logDB = logDB if logDB else JobLoggingDB()

    # The optimizer name is the agent module name minus a trailing "Agent"
    trailing = "Agent"
    optimizerName = self.am_getModuleParam( 'agentName' )
    if optimizerName.endswith( trailing ):
      optimizerName = optimizerName[ :-len( trailing ) ]
    self.am_setModuleParam( 'optimizerName', optimizerName )

    self.startingMinorStatus = self.am_getModuleParam( 'optimizerName' )
    self.startingMajorStatus = "Checking"
    self.failedStatus = self.am_getOption( "FailedJobStatus" , 'Failed' )
    self.requiredJobInfo = 'jdl'
    self.am_setOption( "PollingTime", 30 )

    return self.initializeOptimizer()
コード例 #14
0
ファイル: JobState.py プロジェクト: DIRACGrid/DIRAC
 def checkDBAccess(cls):
     """Lazily create the shared JobState DB clients on first access."""
     if JobState.__db.checked:
         return
     JobState.__db.jobDB = JobDB()
     JobState.__db.logDB = JobLoggingDB()
     JobState.__db.tqDB = TaskQueueDB()
     JobState.__db.checked = True
コード例 #15
0
def initializeJobStateUpdateHandler(serviceInfo):
  """Create the module-level JobDB and JobLoggingDB clients for the service."""
  global jobDB, logDB

  jobDB = JobDB()
  logDB = JobLoggingDB()
  return S_OK()
コード例 #16
0
ファイル: JobManagerHandler.py プロジェクト: mesmith75/DIRAC
def initializeJobManagerHandler(serviceInfo):
    """Create the module-level DB clients used by the JobManager service."""
    global gJobDB
    global gJobLoggingDB
    global gtaskQueueDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    return S_OK()
コード例 #17
0
ファイル: MightyOptimizer.py プロジェクト: acasajus/DIRAC
 def initialize(self):
     """Standard initialization: create the DB clients and set the polling time."""
     self.jobDB = JobDB()
     self.jobLoggingDB = JobLoggingDB()
     # Optimizer instances are loaded on demand
     self._optimizers = {}
     self.am_setOption("PollingTime", 30)
     return S_OK()
コード例 #18
0
ファイル: MatcherHandler.py プロジェクト: graciani/DIRAC
def initializeMatcherHandler(serviceInfo):
    """Matcher Service initialization: create the DB clients, verify their
    tables, register monitoring activities and schedule periodic tasks."""
    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB
    global gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    checkResult = gJobDB._checkTable()
    if not checkResult['OK']:
        return checkResult

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    checkResult = gJobLoggingDB._checkTable()
    if not checkResult['OK']:
        return checkResult

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    checkResult = gPilotAgentsDB._checkTable()
    if not checkResult['OK']:
        return checkResult

    # Monitoring activities of the matching service
    for name, description, unit, opType in (
            ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
            ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
            ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
            ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN)):
        gMonitor.registerActivity(name, description, 'Matching', unit,
                                  opType, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120,
                                     gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
コード例 #19
0
  def initialize(self):
    """Set default parameters and start the worker threads."""
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 60 * 60)
    if not self.am_getOption('Enable', True):
      self.log.info('Stalled Job Agent running in disabled mode')

    # Spawn the worker threads that process jobs concurrently
    maxNumberOfThreads = self.am_getOption('MaxNumberOfThreads', 15)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
    self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads)
    for _ in range(maxNumberOfThreads):
      threadPool.generateJobAndQueueIt(self._execute)

    return S_OK()
コード例 #20
0
  def initialize( self ):
    """Set the default agent parameters."""
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption( 'PollingTime', 60 * 60 )
    enabled = self.am_getOption( 'Enable', True )
    if not enabled:
      self.log.info( 'Stalled Job Agent running in disabled mode' )
    return S_OK()
コード例 #21
0
  def initialize( self ):
    """Sets defaults.

    Creates the DB clients and reads the cleaning options; production job
    types are excluded from cleaning.
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # string.join() does not exist in Python 3; use str.join instead
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
コード例 #22
0
  def initialize(self, jobDB=None, logDB=None):
    """Initialization of the Optimizer Agent.

    :param jobDB: optional JobDB client (created when None)
    :param logDB: optional JobLoggingDB client (created when None)
    """
    if jobDB is None:
      jobDB = JobDB()
    self.jobDB = jobDB
    # A non-functional JobDB is fatal for the optimizer
    if not self.jobDB.isValid():
      dExit(1)

    if logDB is None:
      logDB = JobLoggingDB()
    self.logDB = logDB

    # The optimizer name is the agent module name minus a trailing 'Agent'
    suffix = 'Agent'
    optimizerName = self.am_getModuleParam('agentName')
    if optimizerName.endswith(suffix):
      optimizerName = optimizerName[:-len(suffix)]
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
コード例 #23
0
  def initializeHandler(cls, svcInfoDict):
    """Create the DB clients; optionally enable the ElasticSearch
    job-parameters backend."""
    cls.gJobDB = JobDB()
    cls.gJobLoggingDB = JobLoggingDB()
    cls.gTaskQueueDB = TaskQueueDB()

    cls.gElasticJobParametersDB = None
    if Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False):
      cls.gElasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()
コード例 #24
0
    def initializeHandler(cls, svcInfoDict):
        """Determine the switching of ElasticSearch and MySQL backends."""
        cls.jobDB = JobDB()
        cls.jobLoggingDB = JobLoggingDB()

        cls.elasticJobParametersDB = None
        if Operations().getValue(
                '/Services/JobMonitoring/useESForJobParametersFlag', False):
            cls.elasticJobParametersDB = ElasticJobParametersDB()
        return S_OK()
コード例 #25
0
    def __updateJobStatus(self, jobID, status, minorstatus=None):
        """Update the job's major (and optionally minor) status in the JobDB.

        :param jobID: ID of the job to update
        :param status: new major Status value
        :param minorstatus: new MinorStatus; when None the job's current
            MinorStatus is re-read from the JobDB and kept
        :returns: result structure of the logging-record insertion, or
            S_OK('DisabledMode') when the agent is disabled
        :raises RuntimeError: if the JobDB update or attribute read fails

        FIXME: Use the JobStateUpdate service instead of the JobDB
        """
        self.log.verbose(
            "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" %
            (jobID, status))
        # Local import: avoids a hard JobDB dependency at module import time
        from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
        jobDB = JobDB()
        if self.enabled:
            result = jobDB.setJobAttribute(jobID,
                                           'Status',
                                           status,
                                           update=True)
        else:
            # Dry-run mode: report success without touching the DB
            return S_OK('DisabledMode')

        if not result['OK']:
            self.log.error("Failed to update job status", result['Message'])
            raise RuntimeError("Failed to update job status")

        if minorstatus is None:  #Retain last minor status for stalled jobs
            result = jobDB.getJobAttributes(jobID, ['MinorStatus'])
            if result['OK']:
                minorstatus = result['Value']['MinorStatus']
            else:
                self.log.error("Failed to get Minor Status", result['Message'])
                raise RuntimeError("Failed to get Minorstatus")
        else:
            self.log.verbose(
                "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)"
                % (jobID, minorstatus))
            result = jobDB.setJobAttribute(jobID,
                                           'MinorStatus',
                                           minorstatus,
                                           update=True)

        logStatus = status
        # Local import for the same reason as JobDB above
        from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB

        result = JobLoggingDB().addLoggingRecord(jobID,
                                                 status=logStatus,
                                                 minor=minorstatus,
                                                 source='DataRecoveryAgent')
        if not result['OK']:
            ## just the logging entry, no big loss so no exception
            self.log.warn(result)

        return result
コード例 #26
0
ファイル: JobCleaningAgent.py プロジェクト: hanyl/DIRAC
  def initialize( self ):
    """Sets defaults.

    Creates the DB clients and reads the cleaning options; production job
    types are excluded from cleaning.
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption('ProductionTypes',['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    # string.join() does not exist in Python 3; use str.join instead
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
コード例 #27
0
def initializeJobManagerHandler(serviceInfo):
    """Create the module-level DB clients; optionally enable pilots logging."""
    global gJobDB, gJobLoggingDB, gtaskQueueDB, enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    pilotsLoggingPath = serviceInfo['serviceSectionPath'].replace(
        'JobManager', 'PilotsLogging') + '/Enable'
    enablePilotsLogging = gConfig.getValue(
        pilotsLoggingPath, 'False').lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()
    return S_OK()
コード例 #28
0
ファイル: JobManagerHandler.py プロジェクト: pmusset/DIRAC
    def initializeHandler(cls, serviceInfoDict):
        """Initialization of DB objects and OptimizationMind."""
        cls.jobDB = JobDB()
        cls.jobLoggingDB = JobLoggingDB()
        cls.taskQueueDB = TaskQueueDB()
        cls.pilotAgentsDB = PilotAgentsDB()

        cls.pilotsLoggingDB = None
        if Operations().getValue(
                '/Services/JobMonitoring/usePilotsLoggingFlag', False):
            cls.pilotsLoggingDB = PilotsLoggingDB()

        # Connect to the OptimizationMind and keep the connection alive
        cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
        cls.__connectToOptMind()
        gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)
        return S_OK()
コード例 #29
0
ファイル: OptimizerModule.py プロジェクト: DIRACGrid/DIRAC
  def initialize(self, jobDB=None, logDB=None):
    """Initialization of the Optimizer Agent.

    :param jobDB: optional JobDB client (created when None)
    :param logDB: optional JobLoggingDB client (created when None)
    """
    self.jobDB = JobDB() if jobDB is None else jobDB
    # A non-functional JobDB is fatal for the optimizer
    if not self.jobDB.isValid():
      dExit(1)

    self.logDB = JobLoggingDB() if logDB is None else logDB

    # The optimizer name is the agent module name minus a trailing 'Agent'
    agentName = self.am_getModuleParam('agentName')
    optimizerName = agentName[:-len('Agent')] if agentName.endswith('Agent') else agentName
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
コード例 #30
0
ファイル: JobCleaningAgent.py プロジェクト: afalabel/DIRAC
  def initialize( self ):
    """Sets defaults.

    Creates the DB clients and reads the cleaning options; production job
    types are excluded from cleaning.
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # string.join() does not exist in Python 3; use str.join instead
    gLogger.info('Will exclude the following Production types from cleaning %s' % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',100)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
コード例 #31
0
ファイル: MatcherHandler.py プロジェクト: pmusset/DIRAC
  def initializeHandler(cls, serviceInfoDict):
    """Create the DB clients, the matching limiter and the monitoring
    activities; schedule the periodic maintenance tasks."""
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    # Monitoring activities of the matching service
    for name, description, unit, opType in (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN)):
      gMonitor.registerActivity(name, description, 'Matching', unit, opType, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)

    cls.sendNumTaskQueues()
    return S_OK()
コード例 #32
0
ファイル: MatcherHandler.py プロジェクト: vfalbor/DIRAC
def initializeMatcherHandler( serviceInfo ):
  """Matcher Service initialization: create the DB clients, register the
  monitoring activities and schedule the periodic maintenance tasks."""
  global jobDB
  global jobLoggingDB
  global taskQueueDB

  jobDB = JobDB()
  jobLoggingDB = JobLoggingDB()
  taskQueueDB = TaskQueueDB()

  # Monitoring activities of the matching service (all mean-valued)
  for name, description, unit in (
      ( 'matchTime', "Job matching time", "secs" ),
      ( 'matchTaskQueues', "Task queues checked per job", "task queues" ),
      ( 'matchesDone', "Job Matches", "matches" ),
      ( 'numTQs', "Number of Task Queues", "tqsk queues" ) ):
    gMonitor.registerActivity( name, description, 'Matching', unit, gMonitor.OP_MEAN, 300 )

  taskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, taskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 120, sendNumTaskQueues )

  sendNumTaskQueues()

  return S_OK()
コード例 #33
0
class JobCleaningAgent( AgentModule ):
  """
      Cleans up jobs in final states from the WMS databases.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    # Default removal delays (days) per final status; overridden in initialize()
    self.removeStatusDelay = {'Done':7,
                              'Killed':1,
                              'Failed':7 }

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes (all distinct types except the production ones)
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

    :param condDict: job selection condition dictionary
    :param delay: if set, only jobs older than this time stamp are removed
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    # Jobs whose oversized sandbox could not be scheduled for removal are kept
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        # BUGFIX: report the message of the operation that actually failed
        # (previously the stale sandbox-removal 'result' was logged)
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements

    :returns: S_OK with {'Successful': {jobID: lfn-or-note},
                         'Failed': {jobID: lfn}}
    """

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, with the job owner's credentials

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
コード例 #34
0
ファイル: JobCleaningAgent.py プロジェクト: afalabel/DIRAC
class JobCleaningAgent( AgentModule ):
  """ Agent removing jobs in terminal states from the WMS databases.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Sets defaults: DB helpers and the cleaning options.
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    # Production-type jobs are cleaned by the Transformation System, not here
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the job types eligible for cleaning: all distinct JobType values
        currently in the JobDB, minus the excluded production types.
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ The main execution method: remove 'Deleted' jobs immediately, then
        remove jobs in final states older than the configured per-status delay.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status older than the per-status delay (in days)
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove jobs matching *condDict*, optionally only those whose last
        update is older than *delay*.

        :param condDict: JobDB selection dictionary (Status, JobType, ...)
        :param delay: if set, a time string; only older jobs are selected
        :return: S_OK() or an error structure from the underlying calls
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    # Defensive truncation in case the backend does not honour the limit
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    # Detach the jobs from their sandboxes first (best effort)
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    # Keep only the jobs whose oversized sandbox removal could be scheduled
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.remove( job )

    if self.jobByJob:
      # Remove one job at a time from the three DBs, optionally throttled
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      # Bulk removal path: one call per DB with the full job list
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Schedule removal of the oversized output sandbox files of the given jobs.

        :param jobIDList: list of job IDs to process
        :return: S_OK( {'Successful': {jobID: lfn_or_msg}, 'Failed': {jobID: lfn}} )
    """

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, using the job owner's credentials
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Build and submit a RemoveFile request for *lfn* with the given credentials.
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
コード例 #35
0
ファイル: OptimizerModule.py プロジェクト: sparsh35/DIRAC
class OptimizerModule(AgentModule):
    """ Base class for WMS optimizer agents.

      The specific agents must provide the following methods:

        *  initialize() for initial settings
        *  beginExecution()
        *  execute() - the main method called in the agent cycle
        *  endExecution()
        *  finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def __init__(self, *args, **kwargs):
        """ c'tor
    """
        AgentModule.__init__(self, *args, **kwargs)
        self.jobDB = None
        self.logDB = None
        self.startingMinorStatus = None
        self.startingMajorStatus = JobStatus.CHECKING
        self.failedStatus = None
        self.requiredJobInfo = 'jdl'
        self._initResult = None

    def initialize(self, jobDB=None, logDB=None):
        """ Initialization of the Optimizer Agent.

            :param jobDB: JobDB instance; created here if None
            :param logDB: JobLoggingDB instance; created here if None
    """
        self.jobDB = JobDB() if jobDB is None else jobDB
        if not self.jobDB.isValid():
            dExit(1)

        self.logDB = JobLoggingDB() if logDB is None else logDB

        # The optimizer name is the agent name without the trailing 'Agent'
        optimizerName = self.am_getModuleParam('agentName')
        if optimizerName.endswith('Agent'):
            optimizerName = optimizerName[:-len('Agent')]
        self.am_setModuleParam('optimizerName', optimizerName)

        self.startingMinorStatus = self.am_getModuleParam('optimizerName')
        self.failedStatus = self.am_getOption("FailedJobStatus",
                                              JobStatus.FAILED)
        self.am_setOption("PollingTime", 30)

        return self.initializeOptimizer()

    def initializeOptimizer(self):
        """ To be overwritten by inheriting class
    """
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main agent execution method: select jobs in the starting
            state and pass each one to optimizeJob().
    """

        result = self.initializeOptimizer()
        if not result['OK']:
            return result
        self._initResult = result['Value']

        condition = {'Status': self.startingMajorStatus}
        if self.startingMinorStatus:
            condition['MinorStatus'] = self.startingMinorStatus

        result = self.jobDB.selectJobs(condition)
        if not result['OK']:
            self.log.warn('Failed to get a job list from the JobDB')
            return S_ERROR('Failed to get a job list from the JobDB')

        if not result['Value']:
            self.log.verbose('No pending jobs to process')
            return S_OK('No work to do')

        for job in result['Value']:
            result = self.getJobDefinition(job)
            if not result['OK']:
                self.setFailedJob(job, result['Message'], '')
                continue
            jobDef = result['Value']
            result = self.optimizeJob(job, jobDef['classad'])

        return S_OK()

    #############################################################################
    def optimizeJob(self, job, classAdJob):
        """ Call the corresponding Optimizer checkJob method
    """
        self.log.verbose(
            'Job will be processed', '%s by %sExecutor' %
            (job, self.am_getModuleParam('optimizerName')))
        result = self.checkJob(job, classAdJob)
        if not result['OK']:
            self.setFailedJob(job, result['Message'], classAdJob)
        return result

    #############################################################################
    def getJobDefinition(self, job, jobDef=False):
        """ Retrieve JDL of the Job and return jobDef dictionary
    """
        if not jobDef:
            jobDef = {}
        # If not jdl in jobinfo load it
        if 'jdl' not in jobDef:
            if self.requiredJobInfo == 'jdlOriginal':
                result = self.jobDB.getJobJDL(job, original=True)
                if not result['OK']:
                    self.log.error("No JDL for job", "%s" % job)
                    return S_ERROR("No JDL for job")
                jobDef['jdl'] = result['Value']
            if self.requiredJobInfo == 'jdl':
                result = self.jobDB.getJobJDL(job)
                if not result['OK']:
                    self.log.error("No JDL for job", "%s" % job)
                    return S_ERROR("No JDL for job")
                jobDef['jdl'] = result['Value']
        # Load the classad if needed
        if 'jdl' in jobDef and 'classad' not in jobDef:
            try:
                classad = ClassAd(jobDef['jdl'])
            except Exception:  # narrow: never swallow SystemExit/KeyboardInterrupt
                self.log.debug("Cannot load JDL")
                return S_ERROR('Illegal Job JDL')
            if not classad.isOK():
                self.log.debug(
                    "Warning: illegal JDL for job %s, will be marked problematic"
                    % (job))
                return S_ERROR('Illegal Job JDL')
            jobDef['classad'] = classad
        return S_OK(jobDef)

    #############################################################################
    def getOptimizerJobInfo(self, job, reportName):
        """This method gets job optimizer information that will
       be used for
    """
        self.log.debug("self.jobDB.getJobOptParameter(%s,'%s')" %
                       (job, reportName))
        result = self.jobDB.getJobOptParameter(job, reportName)
        if result['OK']:
            value = result['Value']
            if not value:
                self.log.warn('JobDB returned null value',
                              'for %s %s' % (job, reportName))
                return S_ERROR('No optimizer info returned')
            else:
                try:
                    # NOTE(review): eval() on a DB-stored string executes
                    # arbitrary Python; values are written by setOptimizerJobInfo
                    # below, but a safer literal parser would be preferable.
                    return S_OK(eval(value))
                except Exception as x:  # narrow: keep SystemExit/KeyboardInterrupt
                    return S_ERROR(
                        'Could not evaluate optimizer parameters: %s' %
                        repr(x))

        return result

    #############################################################################
    def setOptimizerJobInfo(self, job, reportName, value):
        """This method sets the job optimizer information that will subsequently
       be used for job scheduling and TURL queries on the WN.
    """
        self.log.debug("self.jobDB.setJobOptParameter(%s,'%s','%s')" %
                       (job, reportName, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setJobOptParameter(job, reportName, str(value))

    #############################################################################
    def setOptimizerChain(self, job, value):
        """This method sets the job optimizer chain, in principle only needed by
       one of the optimizers.
    """
        self.log.debug("self.jobDB.setOptimizerChain(%s,%s)" % (job, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setOptimizerChain(job, value)

    #############################################################################
    def setNextOptimizer(self, job):
        """This method is executed when the optimizer instance has successfully
       processed the job.  The next optimizer in the chain will subsequently
       start to work on the job.
    """

        result = self.logDB.addLoggingRecord(
            job,
            status=self.startingMajorStatus,
            minor=self.startingMinorStatus,
            source=self.am_getModuleParam("optimizerName"))
        if not result['OK']:
            self.log.warn("Can't add logging record", result['Message'])

        self.log.debug("self.jobDB.setNextOptimizer(%s,'%s')" %
                       (job, self.am_getModuleParam("optimizerName")))
        return self.jobDB.setNextOptimizer(
            job, self.am_getModuleParam("optimizerName"))

    #############################################################################
    def updateJobStatus(self, job, status, minorStatus=None, appStatus=None):
        """This method updates the job status in the JobDB, this should only be
       used to fail jobs due to the optimizer chain.
    """
        self.log.debug(
            "self.jobDB.setJobStatus(%s,'Status/Minor/Application','%s'/'%s'/'%s',update=True)"
            % (job, status, str(minorStatus), str(appStatus)))
        if not self.am_Enabled():
            return S_OK()

        result = self.jobDB.setJobStatus(job, status, minorStatus, appStatus)
        if not result['OK']:
            return result

        result = self.logDB.addLoggingRecord(
            job,
            status=status,
            minor=minorStatus,
            application=appStatus,
            source=self.am_getModuleParam('optimizerName'))
        if not result['OK']:
            self.log.warn("Can't add logging record", result['Message'])

        return S_OK()

    #############################################################################
    def setJobParam(self, job, reportName, value):
        """This method updates a job parameter in the JobDB.
    """
        self.log.debug("self.jobDB.setJobParameter(%s,'%s','%s')" %
                       (job, reportName, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setJobParameter(job, reportName, value)

    #############################################################################
    def setFailedJob(self, job, msg, classAdJob=None):
        """This method moves the job to the failed status
    """
        self.log.debug("self.updateJobStatus(%s,'%s','%s')" %
                       (job, self.failedStatus, msg))
        if not self.am_Enabled():
            return S_OK()
        self.updateJobStatus(job, self.failedStatus, msg)
        if classAdJob:
            self.sendAccountingRecord(job, msg, classAdJob)

    #############################################################################
    def checkJob(self, job, classad):
        """This method controls the checking of the job, should be overridden in a subclass
    """
        self.log.warn(
            'Optimizer: checkJob method should be implemented in a subclass')
        return S_ERROR(
            'Optimizer: checkJob method should be implemented in a subclass')

    #############################################################################
    def sendAccountingRecord(self, job, msg, classAdJob):
        """
      Send an accounting record for the failed job
    """
        accountingReport = AccountingJob()
        accountingReport.setStartTime()
        accountingReport.setEndTime()

        owner = classAdJob.getAttributeString('Owner')
        userGroup = classAdJob.getAttributeString('OwnerGroup')
        jobGroup = classAdJob.getAttributeString('JobGroup')
        jobType = classAdJob.getAttributeString('JobType')
        jobClass = 'unknown'
        if classAdJob.lookupAttribute('JobSplitType'):
            jobClass = classAdJob.getAttributeString('JobSplitType')
        inputData = []
        processingType = 'unknown'
        if classAdJob.lookupAttribute('ProcessingType'):
            processingType = classAdJob.getAttributeString('ProcessingType')
        if classAdJob.lookupAttribute('InputData'):
            inputData = classAdJob.getListFromExpression('InputData')
        inputDataFiles = len(inputData)
        outputData = []
        if classAdJob.lookupAttribute('OutputData'):
            outputData = classAdJob.getListFromExpression('OutputData')
        outputDataFiles = len(outputData)

        # The job never ran, so all resource-usage figures are zero
        acData = {
            'User': owner,
            'UserGroup': userGroup,
            'JobGroup': jobGroup,
            'JobType': jobType,
            'JobClass': jobClass,
            'ProcessingType': processingType,
            'FinalMajorStatus': self.failedStatus,
            'FinalMinorStatus': msg,
            'CPUTime': 0.0,
            'NormCPUTime': 0.0,
            'ExecTime': 0.0,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': inputDataFiles,
            'OutputDataFiles': outputDataFiles,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0.0
        }

        self.log.debug('Accounting Report is:')
        self.log.debug(acData)
        accountingReport.setValuesFromDict(acData)
        return accountingReport.commit()
コード例 #36
0
class StalledJobAgent( AgentModule ):
  """
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
  jobDB = None
  logDB = None
  matchedTime = 7200
  rescheduledTime = 600
  completedTime = 86400

  #############################################################################
  def initialize( self ):
    """ Set up the agent defaults: polling period, DB helpers and the
        enable/disable flag.
    """
    self.am_setOption( 'PollingTime', 60 * 60 )
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    enabled = self.am_getOption( 'Enable', True )
    if not enabled:
      self.log.info( 'Stalled Job Agent running in disabled mode' )
    return S_OK()

  #############################################################################
  def execute( self ):
    """ The main agent execution method: mark stalled jobs, fail jobs stalled
        too long, fail stuck 'Completed' jobs and kick jobs stuck in
        intermediate states.
    """
    self.log.verbose( 'Waking up Stalled Job Agent' )

    wms_instance = getSystemInstance( 'WorkloadManagement' )
    if not wms_instance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    # CS section of the JobWrapper, used below to read the watchdog cycle length
    wrapperSection = cfgPath( 'Systems', 'WorkloadManagement', wms_instance, 'JobWrapper' )

    # NOTE(review): despite the 'Hours' suffix these options are used as a
    # number of watchdog cycles (see the multiplication below) — confirm
    stalledTime = self.am_getOption( 'StalledTimeHours', 2 )
    failedTime = self.am_getOption( 'FailedTimeHours', 6 )

    self.matchedTime = self.am_getOption( 'MatchedTime', self.matchedTime )
    self.rescheduledTime = self.am_getOption( 'RescheduledTime', self.rescheduledTime )
    self.completedTime = self.am_getOption( 'CompletedTime', self.completedTime )

    self.log.verbose( 'StalledTime = %s cycles' % ( stalledTime ) )
    self.log.verbose( 'FailedTime = %s cycles' % ( failedTime ) )

    # Watchdog heartbeat period in seconds, bounded below by MinCheckingTime
    watchdogCycle = gConfig.getValue( cfgPath( wrapperSection , 'CheckingTime' ), 30 * 60 )
    watchdogCycle = max( watchdogCycle, gConfig.getValue( cfgPath( wrapperSection , 'MinCheckingTime' ), 20 * 60 ) )

    # Add half cycle to avoid race conditions
    stalledTime = watchdogCycle * ( stalledTime + 0.5 )
    failedTime = watchdogCycle * ( failedTime + 0.5 )

    result = self.__markStalledJobs( stalledTime )
    if not result['OK']:
      self.log.error( 'Failed to detect stalled jobs', result['Message'] )

    #Note, jobs will be revived automatically during the heartbeat signal phase and
    #subsequent status changes will result in jobs not being selected by the
    #stalled job agent.

    result = self.__failStalledJobs( failedTime )
    if not result['OK']:
      self.log.error( 'Failed to process stalled jobs', result['Message'] )

    result = self.__failCompletedJobs()
    if not result['OK']:
      self.log.error( 'Failed to process completed jobs', result['Message'] )

    result = self.__kickStuckJobs()
    if not result['OK']:
      self.log.error( 'Failed to kick stuck jobs', result['Message'] )

    return S_OK( 'Stalled Job Agent cycle complete' )

  #############################################################################
  def __markStalledJobs( self, stalledTime ):
    """ Flag as 'Stalled' every Running job without an update for more than
        *stalledTime* seconds.
    """
    selection = self.jobDB.selectJobs( {'Status':'Running'} )
    if not selection['OK']:
      return selection
    if not selection['Value']:
      return S_OK()

    jobs = sorted( selection['Value'] )
    self.log.info( '%s Running jobs will be checked for being stalled' % ( len( jobs ) ) )
    # jobs = jobs[:10] #for debugging

    nStalled = 0
    nRunning = 0
    for job in jobs:
      checkResult = self.__getStalledJob( job, stalledTime )
      if not checkResult['OK']:
        self.log.verbose( checkResult['Message'] )
        nRunning += 1
        continue
      self.log.verbose( 'Updating status to Stalled for job %s' % ( job ) )
      self.__updateJobStatus( job, 'Stalled' )
      nStalled += 1

    self.log.info( 'Total jobs: %s, Stalled job count: %s, Running job count: %s' %
                   ( len( jobs ), nStalled, nRunning ) )
    return S_OK()

  #############################################################################
  def __failStalledJobs( self, failedTime ):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status.

        :param failedTime: seconds a job may remain Stalled before being Failed
        :return: S_OK( failedCounter ) or an error from the DB selection
    """

    result = self.jobDB.selectJobs( {'Status':'Stalled'} )
    if not result['OK']:
      return result

    failedCounter = 0

    if result['Value']:
      jobs = result['Value']
      self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) )

      for job in jobs:

        # Check if the job pilot is lost: a Stalled job whose pilot is no
        # longer Running is failed immediately, without waiting for failedTime
        result = self.__getJobPilotStatus( job )
        if result['OK']:
          pilotStatus = result['Value']
          if pilotStatus != "Running":
            result = self.__updateJobStatus( job, 'Failed',
                                             "Job stalled: pilot not running" )
            failedCounter += 1
            result = self.__sendAccounting( job )
            if not result['OK']:
              self.log.error( 'Failed to send accounting', result['Message'] )
              # An accounting failure aborts the whole pass
              break
            continue

        # Pilot still running: fail only if silent for longer than failedTime
        result = self.__getLatestUpdateTime( job )
        if not result['OK']:
          return result
        currentTime = toEpoch()
        lastUpdate = result['Value']
        elapsedTime = currentTime - lastUpdate
        if elapsedTime > failedTime:
          self.__updateJobStatus( job, 'Failed', 'Stalling for more than %d sec' % failedTime )
          failedCounter += 1
          result = self.__sendAccounting( job )
          if not result['OK']:
            self.log.error( 'Failed to send accounting', result['Message'] )
            break

    recoverCounter = 0

    # Second pass: account any previously-failed stalled jobs not yet accounted
    for minor in ["Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime]:
      result = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } )
      if not result['OK']:
        return result
      if result['Value']:
        jobs = result['Value']
        self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) )
        for job in jobs:
          result = self.__sendAccounting( job )
          if not result['OK']:
            self.log.error( 'Failed to send accounting', result['Message'] )
            continue

          recoverCounter += 1
      # NOTE(review): at this point 'result' is the last __sendAccounting (or
      # selectJobs) result, whose failure was already handled above — verify
      # this break condition is intentional
      if not result['OK']:
        break

    if failedCounter:
      self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
      self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )

  #############################################################################
  def __getJobPilotStatus( self, jobID ):
    """ Return the status of the pilot that ran *jobID*, or 'NoPilot' when
        no pilot record can be found.
    """
    paramResult = self.jobDB.getJobParameter( jobID, 'Pilot_Reference' )
    if not paramResult['OK']:
      return paramResult
    pilotReference = paramResult['Value']
    if not pilotReference:
      return S_ERROR( 'Failed to get the pilot reference' )

    wmsAdminClient = RPCClient( 'WorkloadManagement/WMSAdministrator' )
    infoResult = wmsAdminClient.getPilotInfo( pilotReference )
    if not infoResult['OK']:
      if "No pilots found" in infoResult['Message']:
        self.log.warn( infoResult['Message'] )
        return S_OK( 'NoPilot' )
      self.log.error( 'Failed to get pilot information', infoResult['Message'] )
      return S_ERROR( 'Failed to get the pilot status' )

    return S_OK( infoResult['Value'][pilotReference]['Status'] )


  #############################################################################
  def __getStalledJob( self, job, stalledTime ):
    """ Compare the most recent of LastUpdateTime and HeartBeatTime against
        the *stalledTime* limit; S_OK('Stalled') when exceeded, S_ERROR
        otherwise.
    """
    updateResult = self.__getLatestUpdateTime( job )
    if not updateResult['OK']:
      return updateResult

    elapsedTime = toEpoch() - updateResult['Value']
    self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % ( elapsedTime ) )
    if elapsedTime <= stalledTime:
      return S_ERROR( 'Job %s is running and will be ignored' % job )

    self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, elapsedTime ) )
    return S_OK( 'Stalled' )

  #############################################################################
  def __getLatestUpdateTime( self, job ):
    """ Return, as seconds from epoch, the most recent of the job's
        HeartBeatTime and LastUpdateTime attributes.
    """
    result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] )
    if not result['OK']:
      self.log.error( 'Failed to get job attributes', result['Message'] )
    if not result['OK'] or not result['Value']:
      self.log.error( 'Could not get attributes for job', '%s' % job )
      return S_ERROR( 'Could not get attributes for job' )

    self.log.verbose( result )
    attributes = result['Value']

    latestUpdate = 0
    # The DB may return an empty value or the literal string 'None'
    heartBeat = attributes['HeartBeatTime']
    if not heartBeat or heartBeat == 'None':
      self.log.verbose( 'HeartBeatTime is null for job %s' % job )
    else:
      latestUpdate = toEpoch( fromString( heartBeat ) )

    lastUpdateValue = attributes['LastUpdateTime']
    if not lastUpdateValue or lastUpdateValue == 'None':
      self.log.verbose( 'LastUpdateTime is null for job %s' % job )
    else:
      latestUpdate = max( latestUpdate, toEpoch( fromString( lastUpdateValue ) ) )

    if not latestUpdate:
      return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job )

    self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) )
    return S_OK( latestUpdate )

  #############################################################################
  def __updateJobStatus( self, job, status, minorstatus = None ):
    """ This method updates the job status in the JobDB; this should only be
        used to fail jobs due to the optimizer chain.

        :param job: job ID
        :param status: new major status
        :param minorstatus: new minor status; if None the current minor status
                            is kept and re-used for the logging record
        :return: result of the last DB/logging call made
    """
    self.log.verbose( "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % ( job, status ) )

    # Attribute writes are skipped entirely when the agent is disabled
    if self.am_getOption( 'Enable', True ):
      result = self.jobDB.setJobAttribute( job, 'Status', status, update = True )
    else:
      result = S_OK( 'DisabledMode' )

    if result['OK']:
      if minorstatus:
        self.log.verbose( "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % ( job, minorstatus ) )
        result = self.jobDB.setJobAttribute( job, 'MinorStatus', minorstatus, update = True )

    if not minorstatus: #Retain last minor status for stalled jobs
      result = self.jobDB.getJobAttributes( job, ['MinorStatus'] )
      if result['OK']:
        minorstatus = result['Value']['MinorStatus']

    # The logging record is written even in disabled mode
    logStatus = status
    result = self.logDB.addLoggingRecord( job, status = logStatus, minor = minorstatus, source = 'StalledJobAgent' )
    if not result['OK']:
      self.log.warn( result )

    return result

  def __getProcessingType( self, jobID ):
    """ Get the Processing Type from the JDL, until it is promoted to a real
        Attribute; 'unknown' when it cannot be determined.
    """
    result = self.jobDB.getJobJDL( jobID, original = True )
    if not result['OK']:
      return 'unknown'

    jobAd = ClassAd( result['Value'] )
    if not jobAd.lookupAttribute( 'ProcessingType' ):
      return 'unknown'
    return jobAd.getAttributeString( 'ProcessingType' )


  #############################################################################
  def __sendAccounting( self, jobID ):
    """ Send WMS accounting data for the given job

        Builds an accounting Job record from the job attributes, the
        heartbeat data and the job logging info, commits it to accounting
        and, on success, flags the job as accounted in the JobDB.

        :param jobID: id of the job to account
        :return: S_OK/S_ERROR structure from the accounting commit
    """
    try:
      accountingReport = Job()
      # Initialised up-front so the except clause below can always log them
      endTime = 'Unknown'
      lastHeartBeatTime = 'Unknown'

      result = self.jobDB.getJobAttributes( jobID )
      if not result['OK']:
        return result
      jobDict = result['Value']

      startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )
      lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )
      lastHeartBeatTime = fromString( lastHeartBeatTime )
      # The last heartbeat may be more recent than the end time taken from
      # the logging records
      if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
        endTime = lastHeartBeatTime

      # Missing or empty normalization factor defaults to 0.0 (NormCPUTime = 0)
      cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
      if not cpuNormalization['OK'] or not cpuNormalization['Value']:
        cpuNormalization = 0.0
      else:
        cpuNormalization = float( cpuNormalization['Value'] )
    except Exception:
      self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" % ( str( jobID ), str( endTime ), str( lastHeartBeatTime ) ), '' , False )
      return S_ERROR( "Exception" )
    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    #Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : processingType,
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : lastCPUTime * cpuNormalization,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }

    # For accidentally stopped jobs ExecTime can be not set
    if not acData['ExecTime']:
      acData['ExecTime'] = acData['CPUTime']
    elif acData['ExecTime'] < acData['CPUTime']:
      acData['ExecTime'] = acData['CPUTime']

    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
      # Mark the job accounted only after a successful commit
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % ( int( jobID ), result['Message'] ) )
    return result

  def __checkHeartBeat( self, jobID, jobDict ):
    """ Extract job timing information from the HeartBeat data.

        :return: ( lastCPUTime, lastWallTime, lastHeartBeatTime ) where the
                 first two are the maxima of the reported values (0 if none)
                 and the last one is the most recent heartbeat timestamp,
                 seeded from jobDict['StartExecTime'].
    """
    lastCPUTime = 0
    lastWallTime = 0
    lastHeartBeatTime = jobDict['StartExecTime']
    if lastHeartBeatTime == "None":
      lastHeartBeatTime = 0

    result = self.jobDB.getHeartBeatData( jobID )
    if result['OK']:
      for name, value, heartBeatTime in result['Value']:
        if name == 'CPUConsumed':
          try:
            # Keep the largest value reported so far; ignore unparsable ones
            lastCPUTime = max( lastCPUTime, int( float( value ) ) )
          except ValueError:
            pass
        if name == 'WallClockTime':
          try:
            lastWallTime = max( lastWallTime, int( float( value ) ) )
          except ValueError:
            pass
        if heartBeatTime > lastHeartBeatTime:
          lastHeartBeatTime = heartBeatTime

    return lastCPUTime, lastWallTime, lastHeartBeatTime

  def __checkLoggingInfo( self, jobID, jobDict ):
    """ Get the job start and end times from JobLogging.

        Start time is StartExecTime, falling back to the first 'Running'
        logging record and then to SubmissionTime. End time is the time of
        the last 'Stalled' record, defaulting to the current time.

        :param jobID: id of the job
        :param jobDict: job attributes, must contain StartExecTime and SubmissionTime
        :return: ( startTime, endTime ) tuple
    """
    logList = []
    result = self.logDB.getJobLoggingInfo( jobID )
    if result['OK']:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
      # Logging records are ( status, minor, app, stime, source ) tuples
      for items in logList:
        if items[0] == 'Running':
          startTime = items[3]
          break
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      # Keep the raw string: it is the value that failed to parse, and the
      # previous code logged 'items[3]' here, which is unbound when the
      # logging info is empty (NameError) and may be unrelated to startTime
      rawStartTime = startTime
      startTime = fromString( startTime )
      if startTime is None:
        self.log.error( 'Wrong timestamp in DB', rawStartTime )
        startTime = dateTime()

    # End time: timestamp of the last 'Stalled' record, if any
    endTime = dateTime()
    lastStalledTime = None
    for items in logList:
      if items[0] == 'Stalled':
        lastStalledTime = items[3]
    if lastStalledTime is not None:
      endTime = fromString( lastStalledTime )
      if endTime is None:
        self.log.error( 'Wrong timestamp in DB', lastStalledTime )
        endTime = dateTime()

    return startTime, endTime

  def __kickStuckJobs( self ):
    """ Reschedule jobs stuck in an initialization status.

        Jobs sitting in 'Matched' or 'Rescheduled' state for longer than the
        configured thresholds are rescheduled; reschedule failures are
        accumulated and returned as a single S_ERROR.
    """
    messages = []

    # Same treatment for both stuck states, each with its own timeout
    for status, timeout in ( ( 'Matched', self.matchedTime ),
                             ( 'Rescheduled', self.rescheduledTime ) ):
      checkTime = str( dateTime() - timeout * second )
      result = self.jobDB.selectJobs( {'Status':status}, older = checkTime )
      if not result['OK']:
        self.log.error( 'Failed to select jobs', result['Message'] )
        return result

      stuckJobs = result['Value']
      if stuckJobs:
        self.log.info( 'Rescheduling %d jobs stuck in %s status' % ( len( stuckJobs ), status ) )
        result = self.jobDB.rescheduleJobs( stuckJobs )
        if 'FailedJobs' in result:
          messages.append( 'Failed to reschedule %d jobs stuck in %s status' % ( len( result['FailedJobs'] ), status ) )

    if messages:
      return S_ERROR( '\n'.join( messages ) )
    return S_OK()

  def __failCompletedJobs( self ):
    """ Fail jobs stuck in the Completed status for too long.

        Such jobs are due to pilots being killed during the finalization of
        the job execution. Jobs whose minor status is "Pending Requests" are
        left alone.
    """
    # Select jobs Completed since longer than the configured threshold
    checkTime = str( dateTime() - self.completedTime * second )
    result = self.jobDB.selectJobs( {'Status':'Completed'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    for jobID in result['Value']:
      attrResult = self.jobDB.getJobAttributes( jobID, ['Status','MinorStatus'] )
      if not attrResult['OK']:
        self.log.error( 'Failed to get job attributes', attrResult['Message'] )
        continue
      attributes = attrResult['Value']
      # The job may have moved on since the selection: re-check its status,
      # and skip jobs still waiting on pending requests
      if attributes['Status'] != "Completed":
        continue
      if attributes['MinorStatus'] == "Pending Requests":
        continue

      self.__updateJobStatus( jobID, 'Failed',
                              "Job died during finalization" )
      accountingResult = self.__sendAccounting( jobID )
      if not accountingResult['OK']:
        self.log.error( 'Failed to send accounting', accountingResult['Message'] )

    return S_OK()
コード例 #37
0
ファイル: StalledJobAgent.py プロジェクト: sparsh35/DIRAC
class StalledJobAgent(AgentModule):
    """ The StalledJobAgent hunts for stalled jobs in the Job database.

    Running/Completing jobs without a recent heartbeat are set Stalled and,
    after a further delay, Failed (with accounting sent); jobs stuck in
    Matched/Rescheduled are rescheduled and stuck Submitting jobs are failed.

    The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used
      for the agent restart
    """
    def __init__(self, *args, **kwargs):
        """ c'tor
    """
        AgentModule.__init__(self, *args, **kwargs)

        # DB accessors, created in initialize()
        self.jobDB = None
        self.logDB = None
        # Default thresholds in seconds; overridden by CS options in execute()
        self.matchedTime = 7200
        self.rescheduledTime = 600
        self.submittingTime = 300
        # Sites granted an extra tolerance before declaring jobs stalled
        self.stalledJobsTolerantSites = []

    #############################################################################
    def initialize(self):
        """Sets default parameters
    """
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption('PollingTime', 60 * 60)
        # In disabled mode __updateJobStatus does not write to the JobDB
        if not self.am_getOption('Enable', True):
            self.log.info('Stalled Job Agent running in disabled mode')
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main agent execution method
    """

        self.log.verbose('Waking up Stalled Job Agent')

        wms_instance = getSystemInstance('WorkloadManagement')
        if not wms_instance:
            return S_ERROR(
                'Can not get the WorkloadManagement system instance')
        wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance,
                                 'JobWrapper')

        # StalledTimeHours/FailedTimeHours are expressed in watchdog cycles,
        # converted to seconds below
        stalledTime = self.am_getOption('StalledTimeHours', 2)
        failedTime = self.am_getOption('FailedTimeHours', 6)
        self.stalledJobsTolerantSites = self.am_getOption(
            'StalledJobsTolerantSites', [])
        self.stalledJobsToleranceTime = self.am_getOption(
            'StalledJobsToleranceTime', 0)

        self.submittingTime = self.am_getOption('SubmittingTime',
                                                self.submittingTime)
        self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
        self.rescheduledTime = self.am_getOption('RescheduledTime',
                                                 self.rescheduledTime)

        self.log.verbose('StalledTime = %s cycles' % (stalledTime))
        self.log.verbose('FailedTime = %s cycles' % (failedTime))

        # The effective watchdog cycle is bounded below by MinCheckingTime
        watchdogCycle = gConfig.getValue(
            cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
        watchdogCycle = max(
            watchdogCycle,
            gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'),
                             20 * 60))

        # Add half cycle to avoid race conditions
        stalledTime = int(watchdogCycle * (stalledTime + 0.5))
        failedTime = int(watchdogCycle * (failedTime + 0.5))

        result = self._markStalledJobs(stalledTime)
        if not result['OK']:
            self.log.error('Failed to detect stalled jobs', result['Message'])

        # Note, jobs will be revived automatically during the heartbeat signal phase and
        # subsequent status changes will result in jobs not being selected by the
        # stalled job agent.

        result = self._failStalledJobs(failedTime)
        if not result['OK']:
            self.log.error('Failed to process stalled jobs', result['Message'])

        result = self._failSubmittingJobs()
        if not result['OK']:
            self.log.error('Failed to process jobs being submitted',
                           result['Message'])

        result = self._kickStuckJobs()
        if not result['OK']:
            self.log.error('Failed to kick stuck jobs', result['Message'])

        return S_OK('Stalled Job Agent cycle complete')

    #############################################################################
    def _markStalledJobs(self, stalledTime):
        """ Identifies stalled jobs running or completing without update longer than stalledTime.
    """
        stalledCounter = 0
        aliveCounter = 0
        # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
        checkTime = dateTime() - stalledTime * second
        checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
        # Only get jobs whose HeartBeat is older than the stalledTime
        result = self.jobDB.selectJobs({'Status': checkedStatuses},
                                       older=checkTime,
                                       timeStamp='HeartBeatTime')
        if not result['OK']:
            return result
        if not result['Value']:
            return S_OK()
        jobs = sorted(result['Value'])
        self.log.info(
            '%d %s jobs will be checked for being stalled, heartbeat before %s'
            % (len(jobs), ' & '.join(checkedStatuses), str(checkTime)))

        for job in jobs:
            delayTime = stalledTime
            # Add a tolerance time for some sites if required
            # NOTE(review): attribute name is lowercase 'site' here —
            # presumably matched case-insensitively by the DB; confirm
            # against the JobDB schema
            site = self.jobDB.getJobAttribute(job, 'site')['Value']
            if site in self.stalledJobsTolerantSites:
                delayTime += self.stalledJobsToleranceTime
            # Check if the job is really stalled
            result = self.__checkJobStalled(job, delayTime)
            if result['OK']:
                self.log.verbose('Updating status to Stalled for job %s' %
                                 (job))
                self.__updateJobStatus(job, 'Stalled')
                stalledCounter += 1
            else:
                self.log.verbose(result['Message'])
                aliveCounter += 1

        self.log.info('Total jobs: %d, Stalled jobs: %d, %s jobs: %d' %
                      (len(jobs), stalledCounter, '+'.join(checkedStatuses),
                       aliveCounter))
        return S_OK()

    #############################################################################
    def _failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status
    """
        # Only get jobs that have been Stalled for long enough
        checkTime = dateTime() - failedTime * second
        result = self.jobDB.selectJobs({'Status': JobStatus.STALLED},
                                       older=checkTime)
        if not result['OK']:
            return result
        jobs = result['Value']

        failedCounter = 0
        # These minor statuses are also used below to select jobs whose
        # accounting still needs to be sent
        minorStalledStatuses = ("Job stalled: pilot not running",
                                'Stalling for more than %d sec' % failedTime)

        if jobs:
            self.log.info(
                '%d jobs Stalled before %s will be checked for failure' %
                (len(jobs), str(checkTime)))

            for job in jobs:
                setFailed = False
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result['OK']:
                    self.log.error('Failed to get pilot status',
                                   result['Message'])
                    continue
                pilotStatus = result['Value']
                if pilotStatus != "Running":
                    setFailed = minorStalledStatuses[0]
                else:
                    # Verify that there was no sign of life for long enough
                    result = self.__getLatestUpdateTime(job)
                    if not result['OK']:
                        self.log.error('Failed to get job update time',
                                       result['Message'])
                        continue
                    elapsedTime = toEpoch() - result['Value']
                    if elapsedTime > failedTime:
                        setFailed = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
                if setFailed:
                    self.__sendKillCommand(job)
                    self.__updateJobStatus(job, JobStatus.FAILED, setFailed)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting',
                                       result['Message'])

        recoverCounter = 0

        # Send accounting for any Failed+Stalled jobs not yet accounted
        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({
                'Status': JobStatus.FAILED,
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not result['OK']:
                return result
            if result['Value']:
                jobs = result['Value']
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error('Failed to send accounting',
                                       result['Message'])
                        continue

                    recoverCounter += 1
            # NOTE(review): at this point 'result' is the last inner call's
            # result, not the selection — this break looks redundant; confirm
            if not result['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """ Get the job pilot status

        Returns S_OK('NoPilot') when no pilot reference is recorded or the
        pilot is unknown to the PilotManager.
    """
        result = JobMonitoringClient().getJobParameter(jobID,
                                                       'Pilot_Reference')
        if not result['OK']:
            return result
        pilotReference = result['Value'].get('Pilot_Reference')
        if not pilotReference:
            # There is no pilot reference, hence its status is unknown
            return S_OK('NoPilot')

        result = PilotManagerClient().getPilotInfo(pilotReference)
        if not result['OK']:
            if "No pilots found" in result['Message']:
                self.log.warn(result['Message'])
                return S_OK('NoPilot')
            self.log.error('Failed to get pilot information',
                           'for job %d: ' % jobID + result['Message'])
            return S_ERROR('Failed to get the pilot status')
        pilotStatus = result['Value'][pilotReference]['Status']

        return S_OK(pilotStatus)

    #############################################################################
    def __checkJobStalled(self, job, stalledTime):
        """ Compares the most recent of LastUpdateTime and HeartBeatTime against
    the stalledTime limit.

        Returns S_OK when the job is stalled, S_ERROR when it is still alive.
    """
        result = self.__getLatestUpdateTime(job)
        if not result['OK']:
            return result

        elapsedTime = toEpoch() - result['Value']
        self.log.verbose('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime))
        if elapsedTime > stalledTime:
            self.log.info(
                'Job %s is identified as stalled with last update > %s secs ago'
                % (job, elapsedTime))
            return S_OK('Stalled')

        return S_ERROR('Job %s is running and will be ignored' % job)

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime

        The value is returned as seconds from epoch; S_ERROR when both
        timestamps are null.
    """
        result = self.jobDB.getJobAttributes(
            job, ['HeartBeatTime', 'LastUpdateTime'])
        if not result['OK']:
            self.log.error('Failed to get job attributes', result['Message'])
        if not result['OK'] or not result['Value']:
            self.log.error('Could not get attributes for job', '%s' % job)
            return S_ERROR('Could not get attributes for job')

        latestUpdate = 0
        # The DB may return the literal string 'None' for unset timestamps
        if not result['Value']['HeartBeatTime'] or result['Value'][
                'HeartBeatTime'] == 'None':
            self.log.verbose('HeartBeatTime is null for job %s' % job)
        else:
            latestUpdate = toEpoch(fromString(
                result['Value']['HeartBeatTime']))

        if not result['Value']['LastUpdateTime'] or result['Value'][
                'LastUpdateTime'] == 'None':
            self.log.verbose('LastUpdateTime is null for job %s' % job)
        else:
            latestUpdate = max(
                latestUpdate,
                toEpoch(fromString(result['Value']['LastUpdateTime'])))

        if not latestUpdate:
            return S_ERROR(
                'LastUpdate and HeartBeat times are null for job %s' % job)
        else:
            self.log.verbose('Latest update time from epoch for job %s is %s' %
                             (job, latestUpdate))
            return S_OK(latestUpdate)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """ This method updates the job status in the JobDB, this should only be
        used to fail jobs due to the optimizer chain.

        In disabled mode ('Enable' option False) the JobDB is not modified,
        but a logging record is still added.
    """
        self.log.verbose(
            "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" %
            (job, status))

        if self.am_getOption('Enable', True):
            result = self.jobDB.setJobAttribute(job,
                                                'Status',
                                                status,
                                                update=True)
        else:
            result = S_OK('DisabledMode')

        if result['OK']:
            if minorstatus:
                self.log.verbose(
                    "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)"
                    % (job, minorstatus))
                result = self.jobDB.setJobAttribute(job,
                                                    'MinorStatus',
                                                    minorstatus,
                                                    update=True)

        if not minorstatus:  # Retain last minor status for stalled jobs
            result = self.jobDB.getJobAttributes(job, ['MinorStatus'])
            if result['OK']:
                minorstatus = result['Value']['MinorStatus']

        logStatus = status
        result = self.logDB.addLoggingRecord(job,
                                             status=logStatus,
                                             minor=minorstatus,
                                             source='StalledJobAgent')
        if not result['OK']:
            self.log.warn(result)

        return result

    def __getProcessingType(self, jobID):
        """ Get the Processing Type from the JDL, until it is promoted to a real Attribute

        Returns 'unknown' when the JDL cannot be retrieved or the attribute
        is absent.
    """
        processingType = 'unknown'
        result = self.jobDB.getJobJDL(jobID, original=True)
        if not result['OK']:
            return processingType
        classAdJob = ClassAd(result['Value'])
        if classAdJob.lookupAttribute('ProcessingType'):
            processingType = classAdJob.getAttributeString('ProcessingType')
        return processingType

    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job

        Builds an accounting Job record from the job attributes, heartbeat
        and logging info, commits it, and flags the job as accounted on
        success.
    """
        try:
            accountingReport = Job()
            # Initialised up-front so the except clause can always log them
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # The last heartbeat may be more recent than the logged end time
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, 'CPUNormalizationFactor')
            if not result['OK'] or not result['Value']:
                self.log.error(
                    'Error getting Job Parameter CPUNormalizationFactor, setting 0',
                    result.get('Message', 'No such value'))
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(
                    result['Value'].get('CPUNormalizationFactor'))

        except Exception as e:
            self.log.exception(
                "Exception in __sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e)
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData['ExecTime']:
            acData['ExecTime'] = acData['CPUTime']
        elif acData['ExecTime'] < acData['CPUTime']:
            acData['ExecTime'] = acData['CPUTime']

        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            # Mark accounted only after a successful commit
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result

    def __checkHeartBeat(self, jobID, jobDict):
        """ Get info from HeartBeat

        Returns ( lastCPUTime, lastWallTime, lastHeartBeatTime ): the maxima
        of the reported values and the most recent heartbeat timestamp.
    """
        result = self.jobDB.getHeartBeatData(jobID)
        lastCPUTime = 0
        lastWallTime = 0
        lastHeartBeatTime = jobDict['StartExecTime']
        if lastHeartBeatTime == "None":
            lastHeartBeatTime = 0

        if result['OK']:
            for name, value, heartBeatTime in result['Value']:
                if name == 'CPUConsumed':
                    try:
                        value = int(float(value))
                        if value > lastCPUTime:
                            lastCPUTime = value
                    except ValueError:
                        # Ignore unparsable heartbeat values
                        pass
                if name == 'WallClockTime':
                    try:
                        value = int(float(value))
                        if value > lastWallTime:
                            lastWallTime = value
                    except ValueError:
                        pass
                if heartBeatTime > lastHeartBeatTime:
                    lastHeartBeatTime = heartBeatTime

        return lastCPUTime, lastWallTime, lastHeartBeatTime

    def __checkLoggingInfo(self, jobID, jobDict):
        """ Get info from JobLogging

        Returns ( startTime, endTime ): start from StartExecTime with
        fallbacks to the first 'Running' record and SubmissionTime; end from
        the last 'Stalled' record, defaulting to now.
    """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result['OK']:
            logList = result['Value']

        startTime = jobDict['StartExecTime']
        if not startTime or startTime == 'None':
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == 'Running':
                    startTime = items[3]
                    break
            if not startTime or startTime == 'None':
                startTime = jobDict['SubmissionTime']

        if isinstance(startTime, six.string_types):
            startTime = fromString(startTime)
            if startTime is None:
                # NOTE(review): 'items' may be unbound here if logList was
                # empty (NameError); the bad value is startTime, not items[3]
                self.log.error('Wrong timestamp in DB', items[3])
                startTime = dateTime()

        endTime = dateTime()
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == 'Stalled':
                endTime = fromString(items[3])
        if endTime is None:
            self.log.error('Wrong timestamp in DB', items[3])
            endTime = dateTime()

        return startTime, endTime

    def _kickStuckJobs(self):
        """ Reschedule jobs stuck in initialization status Rescheduled, Matched
    """

        message = ''

        checkTime = dateTime() - self.matchedTime * second
        result = self.jobDB.selectJobs({'Status': JobStatus.MATCHED},
                                       older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Matched status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                message = 'Failed to reschedule %d jobs stuck in Matched status' % len(
                    result['FailedJobs'])

        checkTime = dateTime() - self.rescheduledTime * second
        result = self.jobDB.selectJobs({'Status': 'Rescheduled'},
                                       older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Rescheduled status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                if message:
                    message += '\n'
                message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len(
                    result['FailedJobs'])

        if message:
            return S_ERROR(message)
        return S_OK()

    def _failSubmittingJobs(self):
        """ Failed Jobs stuck in Submitting Status for a long time.
        They are due to a failed bulk submission transaction.
    """

        # Get old Submitting Jobs
        checkTime = dateTime() - self.submittingTime * second
        result = self.jobDB.selectJobs({'Status': JobStatus.SUBMITTING},
                                       older=checkTime)
        if not result['OK']:
            self.log.error('Failed to select jobs', result['Message'])
            return result

        for jobID in result['Value']:
            result = self.__updateJobStatus(jobID, JobStatus.FAILED)
            if not result['OK']:
                self.log.error('Failed to update job status',
                               result['Message'])
                continue

        return S_OK()

    def __sendKillCommand(self, job):
        """Send a kill signal to the job such that it cannot continue running.

    :param int job: ID of job to send kill command
    """
        ownerDN = self.jobDB.getJobAttribute(job, 'OwnerDN')
        ownerGroup = self.jobDB.getJobAttribute(job, 'OwnerGroup')
        if ownerDN['OK'] and ownerGroup['OK']:
            # Kill with the owner's credentials so the WMS authorizes it
            wmsClient = WMSClient(useCertificates=True,
                                  delegatedDN=ownerDN['Value'],
                                  delegatedGroup=ownerGroup['Value'])
            resKill = wmsClient.killJob(job)
            if not resKill['OK']:
                self.log.error("Failed to send kill command to job",
                               "%s: %s" % (job, resKill['Message']))
        else:
            self.log.error(
                "Failed to get ownerDN or Group for job:",
                "%s: %s, %s" % (job, ownerDN.get(
                    'Message', ''), ownerGroup.get('Message', '')))
コード例 #38
0
ファイル: OptimizerModule.py プロジェクト: DIRACGrid/DIRAC
class OptimizerModule(AgentModule):
  """
      Base class for job Optimizer agents.

      The specific agents must provide the following methods:

        *  initialize() for initial settings
        *  beginExecution()
        *  execute() - the main method called in the agent cycle
        *  endExecution()
        *  finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def __init__(self, *args, **kwargs):
    """ c'tor
    """
    AgentModule.__init__(self, *args, **kwargs)
    # DB handles, created in initialize()
    self.jobDB = None
    self.logDB = None
    self.startingMinorStatus = None
    self.startingMajorStatus = "Checking"
    self.failedStatus = None
    self.requiredJobInfo = 'jdl'
    self._initResult = None

  def initialize(self, jobDB=None, logDB=None):
    """ Initialization of the Optimizer Agent.

        :param jobDB: JobDB instance (a fresh one is created if not supplied)
        :param logDB: JobLoggingDB instance (a fresh one is created if not supplied)
    """
    self.jobDB = JobDB() if jobDB is None else jobDB
    if not self.jobDB.isValid():
      dExit(1)

    self.logDB = JobLoggingDB() if logDB is None else logDB

    # Derive the optimizer name from the agent name ("FooAgent" -> "Foo")
    optimizerName = self.am_getModuleParam('agentName')
    if optimizerName.endswith('Agent'):
      optimizerName = optimizerName[:-len('Agent')]
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()

  def initializeOptimizer(self):
    """ To be overwritten by inheriting class
    """
    return S_OK()

  #############################################################################
  def execute(self):
    """ The main agent execution method: select all jobs in the starting
        status and pass each one to the optimizer.
    """

    result = self.initializeOptimizer()
    if not result['OK']:
      return result
    self._initResult = result['Value']

    condition = {'Status': self.startingMajorStatus}
    if self.startingMinorStatus:
      condition['MinorStatus'] = self.startingMinorStatus

    result = self.jobDB.selectJobs(condition)
    if not result['OK']:
      self.log.warn('Failed to get a job list from the JobDB')
      return S_ERROR('Failed to get a job list from the JobDB')

    if not result['Value']:
      self.log.verbose('No pending jobs to process')
      return S_OK('No work to do')

    for job in result['Value']:
      result = self.getJobDefinition(job)
      if not result['OK']:
        self.setFailedJob(job, result['Message'], '')
        continue
      jobDef = result['Value']
      result = self.optimizeJob(job, jobDef['classad'])

    return S_OK()

  #############################################################################
  def optimizeJob(self, job, classAdJob):
    """ Call the corresponding Optimizer checkJob method
    """
    self.log.info('Job %s will be processed by %sAgent' % (job, self.am_getModuleParam('optimizerName')))
    result = self.checkJob(job, classAdJob)
    if not result['OK']:
      self.setFailedJob(job, result['Message'], classAdJob)
    return result

  #############################################################################
  def getJobDefinition(self, job, jobDef=False):
    """ Retrieve JDL of the Job and return jobDef dictionary

        :param job: job ID
        :param jobDef: optional dict to be completed with 'jdl'/'classad' keys
    """
    if not jobDef:
      jobDef = {}
    # If not jdl in jobinfo load it
    if 'jdl' not in jobDef:
      if self.requiredJobInfo == 'jdlOriginal':
        result = self.jobDB.getJobJDL(job, original=True)
        if not result['OK']:
          self.log.error("No JDL for job", "%s" % job)
          return S_ERROR("No JDL for job")
        jobDef['jdl'] = result['Value']
      if self.requiredJobInfo == 'jdl':
        result = self.jobDB.getJobJDL(job)
        if not result['OK']:
          self.log.error("No JDL for job", "%s" % job)
          return S_ERROR("No JDL for job")
        jobDef['jdl'] = result['Value']
    # Load the classad if needed
    if 'jdl' in jobDef and 'classad' not in jobDef:
      try:
        classad = ClassAd(jobDef['jdl'])
      # Was "except BaseException": that would also swallow SystemExit and
      # KeyboardInterrupt; Exception is the correct bound for a parse failure
      except Exception:
        self.log.debug("Cannot load JDL")
        return S_ERROR('Illegal Job JDL')
      if not classad.isOK():
        self.log.debug("Warning: illegal JDL for job %s, will be marked problematic" % (job))
        return S_ERROR('Illegal Job JDL')
      jobDef['classad'] = classad
    return S_OK(jobDef)

  #############################################################################
  def getOptimizerJobInfo(self, job, reportName):
    """This method gets job optimizer information that will
       be used for
    """
    self.log.verbose("self.jobDB.getJobOptParameter(%s,'%s')" % (job, reportName))
    result = self.jobDB.getJobOptParameter(job, reportName)
    if result['OK']:
      value = result['Value']
      if not value:
        self.log.warn('JobDB returned null value for %s %s' % (job, reportName))
        return S_ERROR('No optimizer info returned')
      else:
        try:
          # SECURITY NOTE: eval() of a DB-stored string; this assumes the
          # parameters were written by setOptimizerJobInfo() with repr-able
          # values only, and that the DB content is trusted
          return S_OK(eval(value))
        # Was "except BaseException": narrowed to Exception so that
        # SystemExit/KeyboardInterrupt propagate
        except Exception as x:
          return S_ERROR('Could not evaluate optimizer parameters: %s' % repr(x))

    return result

  #############################################################################
  def setOptimizerJobInfo(self, job, reportName, value):
    """This method sets the job optimizer information that will subsequently
       be used for job scheduling and TURL queries on the WN.
    """
    self.log.verbose("self.jobDB.setJobOptParameter(%s,'%s','%s')" % (job, reportName, value))
    if not self.am_Enabled():
      return S_OK()
    return self.jobDB.setJobOptParameter(job, reportName, str(value))

  #############################################################################
  def setOptimizerChain(self, job, value):
    """This method sets the job optimizer chain, in principle only needed by
       one of the optimizers.
    """
    self.log.verbose("self.jobDB.setOptimizerChain(%s,%s)" % (job, value))
    if not self.am_Enabled():
      return S_OK()
    return self.jobDB.setOptimizerChain(job, value)

  #############################################################################
  def setNextOptimizer(self, job):
    """This method is executed when the optimizer instance has successfully
       processed the job.  The next optimizer in the chain will subsequently
       start to work on the job.
    """

    # The logging failure is deliberately non-fatal: the chain must advance
    result = self.logDB.addLoggingRecord(job, status=self.startingMajorStatus,
                                         minor=self.startingMinorStatus,
                                         source=self.am_getModuleParam("optimizerName"))
    if not result['OK']:
      self.log.warn(result['Message'])

    self.log.verbose("self.jobDB.setNextOptimizer(%s,'%s')" % (job, self.am_getModuleParam("optimizerName")))
    return self.jobDB.setNextOptimizer(job, self.am_getModuleParam("optimizerName"))

  #############################################################################
  def updateJobStatus(self, job, status, minorStatus=None, appStatus=None):
    """This method updates the job status in the JobDB, this should only be
       used to fail jobs due to the optimizer chain.
    """
    self.log.verbose("self.jobDB.setJobStatus(%s,'Status/Minor/Application','%s'/'%s'/'%s',update=True)" %
                     (job, status, str(minorStatus), str(appStatus)))
    if not self.am_Enabled():
      return S_OK()

    result = self.jobDB.setJobStatus(job, status, minorStatus, appStatus)
    if not result['OK']:
      return result

    # A failed logging record is only warned about, not propagated
    result = self.logDB.addLoggingRecord(job, status=status, minor=minorStatus, application=appStatus,
                                         source=self.am_getModuleParam('optimizerName'))
    if not result['OK']:
      self.log.warn(result['Message'])

    return S_OK()

  #############################################################################
  def setJobParam(self, job, reportName, value):
    """This method updates a job parameter in the JobDB.
    """
    self.log.verbose("self.jobDB.setJobParameter(%s,'%s','%s')" % (job, reportName, value))
    if not self.am_Enabled():
      return S_OK()
    return self.jobDB.setJobParameter(job, reportName, value)

  #############################################################################
  def setFailedJob(self, job, msg, classAdJob=None):
    """This method moves the job to the failed status
    """
    self.log.verbose("self.updateJobStatus(%s,'%s','%s')" % (job, self.failedStatus, msg))
    if not self.am_Enabled():
      return S_OK()
    self.updateJobStatus(job, self.failedStatus, msg)
    if classAdJob:
      self.sendAccountingRecord(job, msg, classAdJob)

  #############################################################################
  def checkJob(self, job, classad):
    """This method controls the checking of the job, should be overridden in a subclass
    """
    self.log.warn('Optimizer: checkJob method should be implemented in a subclass')
    return S_ERROR('Optimizer: checkJob method should be implemented in a subclass')

  #############################################################################
  def sendAccountingRecord(self, job, msg, classAdJob):
    """
      Send and accounting record for the failed job
    """
    accountingReport = AccountingJob()
    accountingReport.setStartTime()
    accountingReport.setEndTime()

    owner = classAdJob.getAttributeString('Owner')
    userGroup = classAdJob.getAttributeString('OwnerGroup')
    jobGroup = classAdJob.getAttributeString('JobGroup')
    jobType = classAdJob.getAttributeString('JobType')
    jobClass = 'unknown'
    if classAdJob.lookupAttribute('JobSplitType'):
      jobClass = classAdJob.getAttributeString('JobSplitType')
    inputData = []
    processingType = 'unknown'
    if classAdJob.lookupAttribute('ProcessingType'):
      processingType = classAdJob.getAttributeString('ProcessingType')
    if classAdJob.lookupAttribute('InputData'):
      inputData = classAdJob.getListFromExpression('InputData')
    inputDataFiles = len(inputData)
    outputData = []
    if classAdJob.lookupAttribute('OutputData'):
      outputData = classAdJob.getListFromExpression('OutputData')
    outputDataFiles = len(outputData)

    # All resource-usage figures are zero: the job failed before running
    acData = {
        'User': owner,
        'UserGroup': userGroup,
        'JobGroup': jobGroup,
        'JobType': jobType,
        'JobClass': jobClass,
        'ProcessingType': processingType,
        'FinalMajorStatus': self.failedStatus,
        'FinalMinorStatus': msg,
        'CPUTime': 0.0,
        'NormCPUTime': 0.0,
        'ExecTime': 0.0,
        'InputDataSize': 0.0,
        'OutputDataSize': 0.0,
        'InputDataFiles': inputDataFiles,
        'OutputDataFiles': outputDataFiles,
        'DiskSpace': 0.0,
        'InputSandBoxSize': 0.0,
        'OutputSandBoxSize': 0.0,
        'ProcessedEvents': 0.0
    }

    self.log.verbose('Accounting Report is:')
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)
    return accountingReport.commit()
コード例 #39
0
ファイル: Test_JobLoggingDB.py プロジェクト: DIRACGrid/DIRAC
 def setUp(self):
     """Create a fresh JobLoggingDB instance before each test case."""
     self.jlogDB = JobLoggingDB()
コード例 #40
0
ファイル: Matcher.py プロジェクト: Andrew-McNab-UK/DIRAC
class Matcher( object ):
  """ Logic for matching a resource (pilot) description against the Task Queues.

      All DB handles can be injected through the constructor; fresh instances
      are created otherwise.
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor

        :param pilotAgentsDB: PilotAgentsDB instance (created if not given)
        :param jobDB: JobDB instance (created if not given)
        :param tqDB: TaskQueueDB instance (created if not given)
        :param jlDB: JobLoggingDB instance (created if not given)
        :param opsHelper: Operations helper (created if not given)
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: resource (pilot) capacity description, JDL string or dict
        :param credDict: credentials of the requester
        :return: dict with at least 'JDL', 'JobID', 'DN', 'Group' keys
        :raises RuntimeError: when no match is found or job/pilot info is inconsistent
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      # Stale TQ entry: drop it and refuse the match
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      # dict.has_key() is deprecated (and gone in Python 3); "in" is equivalent
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN'  in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                   resourceDict[ 'ReleaseProject' ] ) )
コード例 #41
0
ファイル: JobCleaningAgent.py プロジェクト: marianne013/DIRAC
class JobCleaningAgent( AgentModule ):
  """
      Agent removing finished/deleted jobs from the WMS databases.

      The specific agents must provide the following methods:

         *  initialize() for initial settings
         *  beginExecution()
         *  execute() - the main method called in the agent cycle
         *  endExecution()
         *  finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    # Production job types excluded from cleaning
    self.prodTypes = []

    # Status -> removal delay (days); filled in initialize()
    self.removeStatusDelay = {}

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prodTypes = agentTSTypes
    else:
      self.prodTypes = Operations().getValue(
          'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (
        ', '.join(self.prodTypes)))
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes, i.e. all types present in the JobDB except the
        excluded production types
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prodTypes:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      if delay < 0:
        # Negative delay means don't delete anything...
        continue
      condDict = dict( baseCond )
      if status != 'Any':
        condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

        :param condDict: selection condition dictionary for JobDB.selectJobs
        :param delay: if set, only jobs older than this time are removed
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
      return result

    result = self.deleteJobOversizedSandbox(jobList)
    if not result[ 'OK' ]:
      gLogger.error(
          "Cannot schedule removal of oversized sandboxes", result['Message'])
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop(jobList.index(job))

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        # BUGFIX: previously logged result['Message'] where result was the
        # stale deleteJobOversizedSandbox() return (no 'Message' key) - use
        # the per-DB results instead
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements

        :return: S_OK with {'Successful': {jobID: lfn-or-msg}, 'Failed': {jobID: lfn}}
    """

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK(result)

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
コード例 #42
0
ファイル: StalledJobAgent.py プロジェクト: acasajus/DIRAC
class StalledJobAgent(AgentModule):
    """Agent that hunts for stalled jobs in the Job database.

    Running jobs whose heartbeat/last-update is older than a configurable
    window are marked Stalled; jobs Stalled for too long are Failed and
    accounted; jobs stuck in Matched/Rescheduled are rescheduled; old
    Completed jobs are Failed.

    The specific agents must provide the following methods:
    - initialize() for initial settings
    - beginExecution()
    - execute() - the main method called in the agent cycle
    - endExecution()
    - finalize() - the graceful exit of the method, this one is usually used
      for the agent restart
    """
    # DB clients, created in initialize()
    jobDB = None
    logDB = None
    # Default thresholds in seconds; overridable via the MatchedTime,
    # RescheduledTime and CompletedTime agent options (see execute()).
    matchedTime = 7200
    rescheduledTime = 600
    completedTime = 86400

    #############################################################################
    def initialize(self):
        """Set default parameters and create the DB clients."""
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption('PollingTime', 60 * 60)
        # In disabled mode status updates are skipped (see __updateJobStatus)
        if not self.am_getOption('Enable', True):
            self.log.info('Stalled Job Agent running in disabled mode')
        return S_OK()

    #############################################################################
    def execute(self):
        """The main agent execution method.

        Derives the stalled/failed time windows from the JobWrapper watchdog
        cycle, then runs the four clean-up passes in sequence.
        """
        self.log.verbose('Waking up Stalled Job Agent')

        wms_instance = getSystemInstance('WorkloadManagement')
        if not wms_instance:
            return S_ERROR(
                'Can not get the WorkloadManagement system instance')
        wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance,
                                 'JobWrapper')

        # NOTE: despite the option names, these values are used as a number of
        # watchdog cycles, not hours (see the conversion below)
        stalledTime = self.am_getOption('StalledTimeHours', 2)
        failedTime = self.am_getOption('FailedTimeHours', 6)

        self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
        self.rescheduledTime = self.am_getOption('RescheduledTime',
                                                 self.rescheduledTime)
        self.completedTime = self.am_getOption('CompletedTime',
                                               self.completedTime)

        self.log.verbose('StalledTime = %s cycles' % (stalledTime))
        self.log.verbose('FailedTime = %s cycles' % (failedTime))

        # The watchdog cycle is the heartbeat period of the job wrapper;
        # clamp it to at least the configured minimum checking time
        watchdogCycle = gConfig.getValue(
            cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
        watchdogCycle = max(
            watchdogCycle,
            gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'),
                             20 * 60))

        # Add half cycle to avoid race conditions
        stalledTime = watchdogCycle * (stalledTime + 0.5)
        failedTime = watchdogCycle * (failedTime + 0.5)

        result = self.__markStalledJobs(stalledTime)
        if not result['OK']:
            self.log.error(result['Message'])

        #Note, jobs will be revived automatically during the heartbeat signal phase and
        #subsequent status changes will result in jobs not being selected by the
        #stalled job agent.

        result = self.__failStalledJobs(failedTime)
        if not result['OK']:
            self.log.error(result['Message'])

        result = self.__failCompletedJobs()

        if not result['OK']:
            self.log.error(result['Message'])

        result = self.__kickStuckJobs()
        if not result['OK']:
            self.log.error(result['Message'])

        return S_OK('Stalled Job Agent cycle complete')

    #############################################################################
    def __markStalledJobs(self, stalledTime):
        """Mark as Stalled all Running jobs without an update for longer than
        *stalledTime* seconds.
        """
        stalledCounter = 0
        runningCounter = 0
        result = self.jobDB.selectJobs({'Status': 'Running'})
        if not result['OK']:
            return result
        if not result['Value']:
            return S_OK()
        jobs = result['Value']
        self.log.info('%s Running jobs will be checked for being stalled' %
                      (len(jobs)))
        jobs.sort()
        # jobs = jobs[:10] #for debugging
        for job in jobs:
            # __getStalledJob returns S_OK only when the job qualifies as stalled
            result = self.__getStalledJob(job, stalledTime)
            if result['OK']:
                self.log.verbose('Updating status to Stalled for job %s' %
                                 (job))
                self.__updateJobStatus(job, 'Stalled')
                stalledCounter += 1
            else:
                self.log.verbose(result['Message'])
                runningCounter += 1

        self.log.info(
            'Total jobs: %s, Stalled job count: %s, Running job count: %s' %
            (len(jobs), stalledCounter, runningCounter))
        return S_OK()

    #############################################################################
    def __failStalledJobs(self, failedTime):
        """Move long-Stalled jobs to Failed and send accounting for them.

        A Stalled job is failed immediately when its pilot is no longer
        Running, or after being stalled for more than *failedTime* seconds.
        A second pass accounts any such Failed jobs still flagged unaccounted.
        """

        result = self.jobDB.selectJobs({'Status': 'Stalled'})
        if not result['OK']:
            return result

        failedCounter = 0

        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be checked for failure' %
                          (len(jobs)))

            for job in jobs:

                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if result['OK']:
                    pilotStatus = result['Value']
                    if pilotStatus != "Running":
                        result = self.__updateJobStatus(
                            job, 'Failed', "Job stalled: pilot not running")
                        failedCounter += 1
                        result = self.__sendAccounting(job)
                        if not result['OK']:
                            self.log.error(result['Message'])
                            # Accounting failure aborts the whole pass
                            break
                        continue

                result = self.__getLatestUpdateTime(job)
                if not result['OK']:
                    return result
                currentTime = toEpoch()
                lastUpdate = result['Value']
                elapsedTime = currentTime - lastUpdate
                if elapsedTime > failedTime:
                    self.__updateJobStatus(
                        job, 'Failed',
                        'Stalling for more than %d sec' % failedTime)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error(result['Message'])
                        break

        recoverCounter = 0

        # Second pass: account Failed jobs (by the two minor statuses set
        # above) that are still flagged as not accounted
        for minor in [
                "Job stalled: pilot not running",
                'Stalling for more than %d sec' % failedTime
        ]:
            result = self.jobDB.selectJobs({
                'Status': 'Failed',
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not result['OK']:
                return result
            if result['Value']:
                jobs = result['Value']
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result['OK']:
                        self.log.error(result['Message'])
                        continue

                    recoverCounter += 1
            if not result['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """Return the status of the pilot that ran the job, looked up via the
        WMSAdministrator service from the job's Pilot_Reference parameter.
        """
        result = self.jobDB.getJobParameter(jobID, 'Pilot_Reference')
        if not result['OK']:
            return result
        if not result['Value']:
            return S_ERROR('Failed to get the pilot reference')

        pilotReference = result['Value']
        wmsAdminClient = RPCClient('WorkloadManagement/WMSAdministrator')
        result = wmsAdminClient.getPilotInfo(pilotReference)
        if not result['OK']:
            self.log.error(result['Message'])
            return S_ERROR('Failed to get the pilot status')
        pilotStatus = result['Value'][pilotReference]['Status']

        return S_OK(pilotStatus)

    #############################################################################
    def __getStalledJob(self, job, stalledTime):
        """Compare the most recent of LastUpdateTime and HeartBeatTime against
        the *stalledTime* limit.

        Returns S_OK when the job counts as stalled, S_ERROR otherwise
        (the error is used as control flow by __markStalledJobs).
        """
        result = self.__getLatestUpdateTime(job)
        if not result['OK']:
            return result

        currentTime = toEpoch()
        lastUpdate = result['Value']

        elapsedTime = currentTime - lastUpdate
        self.log.verbose('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime))
        if elapsedTime > stalledTime:
            self.log.info(
                'Job %s is identified as stalled with last update > %s secs ago'
                % (job, elapsedTime))
            return S_OK('Stalled')

        return S_ERROR('Job %s is running and will be ignored' % job)

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """Return (as epoch seconds) the most recent of the job's HeartBeatTime
        and LastUpdateTime attributes; S_ERROR when both are null.
        """
        result = self.jobDB.getJobAttributes(
            job, ['HeartBeatTime', 'LastUpdateTime'])
        if not result['OK']:
            self.log.error(result['Message'])
        if not result['OK'] or not result['Value']:
            return S_ERROR('Could not get attributes for job %s' % job)

        self.log.verbose(result)
        latestUpdate = 0
        # The DB may return the string 'None' for a null timestamp
        if not result['Value']['HeartBeatTime'] or result['Value'][
                'HeartBeatTime'] == 'None':
            self.log.verbose('HeartBeatTime is null for job %s' % job)
        else:
            latestUpdate = toEpoch(fromString(
                result['Value']['HeartBeatTime']))

        if not result['Value']['LastUpdateTime'] or result['Value'][
                'LastUpdateTime'] == 'None':
            self.log.verbose('LastUpdateTime is null for job %s' % job)
        else:
            lastUpdate = toEpoch(fromString(result['Value']['LastUpdateTime']))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate

        if not latestUpdate:
            return S_ERROR(
                'LastUpdate and HeartBeat times are null for job %s' % job)
        else:
            self.log.verbose('Latest update time from epoch for job %s is %s' %
                             (job, latestUpdate))
            return S_OK(latestUpdate)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """Update the job status in the JobDB and add a logging record.

        This should only be used to fail jobs due to the optimizer chain.
        In disabled mode ('Enable' option False) the JobDB is not modified,
        but the logging record is still added.
        """
        self.log.verbose(
            "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" %
            (job, status))

        if self.am_getOption('Enable', True):
            result = self.jobDB.setJobAttribute(job,
                                                'Status',
                                                status,
                                                update=True)
        else:
            result = S_OK('DisabledMode')

        if result['OK']:
            if minorstatus:
                self.log.verbose(
                    "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)"
                    % (job, minorstatus))
                result = self.jobDB.setJobAttribute(job,
                                                    'MinorStatus',
                                                    minorstatus,
                                                    update=True)

        if not minorstatus:  #Retain last minor status for stalled jobs
            result = self.jobDB.getJobAttributes(job, ['MinorStatus'])
            if result['OK']:
                minorstatus = result['Value']['MinorStatus']

        logStatus = status
        result = self.logDB.addLoggingRecord(job,
                                             status=logStatus,
                                             minor=minorstatus,
                                             source='StalledJobAgent')
        if not result['OK']:
            self.log.warn(result)

        return result

    def __getProcessingType(self, jobID):
        """Get the Processing Type from the JDL, until it is promoted to a real
        Attribute. Returns 'unknown' when the JDL cannot be read or lacks it.
        """
        processingType = 'unknown'
        result = self.jobDB.getJobJDL(jobID, original=True)
        if not result['OK']:
            return processingType
        classAdJob = ClassAd(result['Value'])
        if classAdJob.lookupAttribute('ProcessingType'):
            processingType = classAdJob.getAttributeString('ProcessingType')
        return processingType

    #############################################################################
    def __sendAccounting(self, jobID):
        """Send a WMS accounting record for the given job and, on success,
        set its AccountedFlag attribute to 'True'.
        """
        try:
            accountingReport = Job()
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # Prefer the last heartbeat as end time if it is more recent
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            cpuNormalization = self.jobDB.getJobParameter(
                jobID, 'CPUNormalizationFactor')
            if not cpuNormalization['OK'] or not cpuNormalization['Value']:
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(cpuNormalization['Value'])
        except Exception:
            self.log.exception(
                "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
                % (str(jobID), str(endTime), str(lastHeartBeatTime)), '',
                False)
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        #Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }
        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result

    def __checkHeartBeat(self, jobID, jobDict):
        """Extract the maximal CPU time, wall time and latest heartbeat
        timestamp from the job's heartbeat data.

        Returns (lastCPUTime, lastWallTime, lastHeartBeatTime); the latter
        starts from the job's StartExecTime attribute (0 when null).
        """
        result = self.jobDB.getHeartBeatData(jobID)
        lastCPUTime = 0
        lastWallTime = 0
        lastHeartBeatTime = jobDict['StartExecTime']
        if lastHeartBeatTime == "None":
            lastHeartBeatTime = 0

        if result['OK']:
            for name, value, heartBeatTime in result['Value']:
                if 'CPUConsumed' == name:
                    try:
                        value = int(float(value))
                        if value > lastCPUTime:
                            lastCPUTime = value
                    except ValueError:
                        pass
                if 'WallClockTime' == name:
                    try:
                        value = int(float(value))
                        if value > lastWallTime:
                            lastWallTime = value
                    except ValueError:
                        pass
                if heartBeatTime > lastHeartBeatTime:
                    lastHeartBeatTime = heartBeatTime

        return lastCPUTime, lastWallTime, lastHeartBeatTime

    def __checkLoggingInfo(self, jobID, jobDict):
        """Derive (startTime, endTime) for a stalled job from the JobLogging
        records, falling back to the job attributes / current time.
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result['OK']:
            logList = result['Value']

        startTime = jobDict['StartExecTime']
        if not startTime or startTime == 'None':
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == 'Running':
                    startTime = items[3]
                    break
            if not startTime or startTime == 'None':
                startTime = jobDict['SubmissionTime']

        # NOTE(review): types.StringTypes is Python 2 only — this module
        # predates the Python 3 port
        if type(startTime) in types.StringTypes:
            startTime = fromString(startTime)
            if startTime == None:
                # NOTE(review): 'items' may be unbound here if logList was
                # empty and startTime came from SubmissionTime — confirm
                self.log.error('Wrong timestamp in DB', items[3])
                startTime = dateTime()

        endTime = dateTime()
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == 'Stalled':
                endTime = fromString(items[3])
        if endTime == None:
            self.log.error('Wrong timestamp in DB', items[3])
            endTime = dateTime()

        return startTime, endTime

    def __kickStuckJobs(self):
        """Reschedule jobs stuck in the initialization statuses Matched or
        Rescheduled for longer than the configured thresholds.
        """

        message = ''

        checkTime = str(dateTime() - self.matchedTime * second)
        result = self.jobDB.selectJobs({'Status': 'Matched'}, older=checkTime)
        if not result['OK']:
            self.log.error(result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Matched status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                message = 'Failed to reschedule %d jobs stuck in Matched status' % len(
                    result['FailedJobs'])

        checkTime = str(dateTime() - self.rescheduledTime * second)
        result = self.jobDB.selectJobs({'Status': 'Rescheduled'},
                                       older=checkTime)
        if not result['OK']:
            self.log.error(result['Message'])
            return result

        jobIDs = result['Value']
        if jobIDs:
            self.log.info('Rescheduling %d jobs stuck in Rescheduled status' %
                          len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in result:
                if message:
                    message += '\n'
                message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len(
                    result['FailedJobs'])

        if message:
            return S_ERROR(message)
        else:
            return S_OK()

    def __failCompletedJobs(self):
        """ Failed Jobs stuck in Completed Status for a long time.
      They are due to pilots being killed during the 
      finalization of the job execution.
    """

        # Get old Completed Jobs
        checkTime = str(dateTime() - self.completedTime * second)
        result = self.jobDB.selectJobs({'Status': 'Completed'},
                                       older=checkTime)
        if not result['OK']:
            self.log.error(result['Message'])
            return result

        jobIDs = result['Value']
        if not jobIDs:
            return S_OK()

        # Remove those with Minor Status "Pending Requests"
        for jobID in jobIDs:
            result = self.jobDB.getJobAttribute(jobID, 'MinorStatus')
            if not result['OK']:
                self.log.error(result['Message'])
                continue
            if result['Value'] == "Pending Requests":
                continue

            result = self.__updateJobStatus(jobID, 'Failed',
                                            "Job died during finalization")
            result = self.__sendAccounting(jobID)
            if not result['OK']:
                self.log.error(result['Message'])
                continue

        return S_OK()
コード例 #43
0
ファイル: JobCleaningAgent.py プロジェクト: hanyl/DIRAC
class JobCleaningAgent( AgentModule ):
  """
      Agent removing terminal-state jobs from the WMS databases
      (JobDB, TaskQueueDB, JobLoggingDB), excluding production job types.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults and creates the DB clients.
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    # Production job types are never cleaned by this agent
    self.prod_types = self.am_getOption( 'ProductionTypes', ['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'] )
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    # Optional pause (seconds) between per-job deletions to limit DB load
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """Get the list of JobTypes eligible for cleaning: all distinct JobTypes
       present in the JobDB except the configured production types.

       :return: S_OK( list of job type strings ) or the JobDB error
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = [ jobType for jobType in result[ 'Value' ] if jobType not in self.prod_types ]
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The main execution method: remove 'Deleted' jobs immediately, then
       remove jobs in final statuses older than the configured delays.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status, each after its configured delay (days)
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """Remove jobs matching *condDict* (optionally only those older than
       *delay*) from the JobDB, TaskQueueDB and JobLoggingDB, after
       unassigning their sandboxes.

       :param condDict: JobDB selection conditions
       :param delay: cut-off time string; False to ignore job age
       :return: S_OK() or the selection error
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    # Defensive cap in case the DB ignores the limit parameter
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    # Detach the jobs from their sandboxes first; best effort only
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      # Remove jobs one at a time so a single failure does not block the rest
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        # BUGFIX: report each DB's own error message, not the sandbox result
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      # Bulk removal path
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      # TaskQueueDB only supports per-job deletion
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
コード例 #44
0
ファイル: StalledJobAgent.py プロジェクト: DIRACGrid/DIRAC
class StalledJobAgent(AgentModule):
    """Agent that watches for jobs stuck in transient states and moves them on:
    Running jobs without recent updates become Stalled, long-Stalled jobs are
    Failed and accounted, old Completed jobs are Failed, and jobs stuck in
    Matched/Rescheduled are rescheduled.

The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""

    # DB clients, created in initialize()
    jobDB = None
    logDB = None
    # Default age thresholds in seconds; may be overridden by CS options
    # MatchedTime / RescheduledTime / CompletedTime in execute()
    matchedTime = 7200
    rescheduledTime = 600
    completedTime = 86400
    #############################################################################
    def initialize(self):
        """Create the DB clients and apply the default agent settings."""
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        # Poll once per hour by default
        self.am_setOption("PollingTime", 60 * 60)
        enabled = self.am_getOption("Enable", True)
        if not enabled:
            self.log.info("Stalled Job Agent running in disabled mode")
        return S_OK()

    #############################################################################
    def execute(self):
        """Run one agent cycle: mark stalled jobs, fail old stalled and
        completed jobs, and kick jobs stuck in initialization states."""
        self.log.verbose("Waking up Stalled Job Agent")

        systemInstance = getSystemInstance("WorkloadManagement")
        if not systemInstance:
            return S_ERROR("Can not get the WorkloadManagement system instance")
        wrapperPath = cfgPath("Systems", "WorkloadManagement", systemInstance, "JobWrapper")

        # Thresholds are expressed in watchdog cycles at this point
        stalledTime = self.am_getOption("StalledTimeHours", 2)
        failedTime = self.am_getOption("FailedTimeHours", 6)

        self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
        self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
        self.completedTime = self.am_getOption("CompletedTime", self.completedTime)

        self.log.verbose("StalledTime = %s cycles" % (stalledTime))
        self.log.verbose("FailedTime = %s cycles" % (failedTime))

        # One cycle is the watchdog checking period, bounded from below
        checkingTime = gConfig.getValue(cfgPath(wrapperPath, "CheckingTime"), 30 * 60)
        minCheckingTime = gConfig.getValue(cfgPath(wrapperPath, "MinCheckingTime"), 20 * 60)
        watchdogCycle = max(checkingTime, minCheckingTime)

        # Add half cycle to avoid race conditions
        stalledTime = watchdogCycle * (stalledTime + 0.5)
        failedTime = watchdogCycle * (failedTime + 0.5)

        # Note, jobs will be revived automatically during the heartbeat signal phase and
        # subsequent status changes will result in jobs not being selected by the
        # stalled job agent.
        steps = (
            (self.__markStalledJobs, (stalledTime,), "Failed to detect stalled jobs"),
            (self.__failStalledJobs, (failedTime,), "Failed to process stalled jobs"),
            (self.__failCompletedJobs, (), "Failed to process completed jobs"),
            (self.__kickStuckJobs, (), "Failed to kick stuck jobs"),
        )
        for step, arguments, errorMessage in steps:
            result = step(*arguments)
            if not result["OK"]:
                self.log.error(errorMessage, result["Message"])

        return S_OK("Stalled Job Agent cycle complete")

    #############################################################################
    def __markStalledJobs(self, stalledTime):
        """Flag Running jobs without an update for more than stalledTime
        seconds as Stalled."""
        result = self.jobDB.selectJobs({"Status": "Running"})
        if not result["OK"]:
            return result
        jobs = result["Value"]
        if not jobs:
            return S_OK()

        self.log.info("%s Running jobs will be checked for being stalled" % (len(jobs)))
        jobs.sort()

        stalledCounter = 0
        runningCounter = 0
        for job in jobs:
            checkResult = self.__getStalledJob(job, stalledTime)
            if checkResult["OK"]:
                self.log.verbose("Updating status to Stalled for job %s" % (job))
                self.__updateJobStatus(job, "Stalled")
                stalledCounter += 1
            else:
                self.log.verbose(checkResult["Message"])
                runningCounter += 1

        self.log.info(
            "Total jobs: %s, Stalled job count: %s, Running job count: %s" % (len(jobs), stalledCounter, runningCounter)
        )
        return S_OK()

    #############################################################################
    def __failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status.

    A Stalled job is failed either when its pilot is no longer Running, or when
    its latest update is older than failedTime seconds.  Failed jobs are also
    sent a kill signal and an accounting record.  A second pass re-sends
    accounting for previously failed-stalled jobs still flagged as unaccounted.
    Returns S_OK(failedCounter).
    """

        result = self.jobDB.selectJobs({"Status": "Stalled"})
        if not result["OK"]:
            return result
        jobs = result["Value"]

        failedCounter = 0
        # Minor statuses used to tag the two failure causes; also reused below
        # to find previously failed jobs that were never accounted
        minorStalledStatuses = ("Job stalled: pilot not running", "Stalling for more than %d sec" % failedTime)

        if jobs:
            self.log.info("%s Stalled jobs will be checked for failure" % (len(jobs)))

            for job in jobs:
                setFailed = False
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result["OK"]:
                    self.log.error("Failed to get pilot status", result["Message"])
                    continue
                pilotStatus = result["Value"]
                if pilotStatus != "Running":
                    setFailed = minorStalledStatuses[0]
                else:

                    result = self.__getLatestUpdateTime(job)
                    if not result["OK"]:
                        self.log.error("Failed to get job update time", result["Message"])
                        continue
                    elapsedTime = toEpoch() - result["Value"]
                    if elapsedTime > failedTime:
                        setFailed = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
                if setFailed:
                    # Send a kill signal to the job such that it cannot continue running
                    WMSClient().killJob(job)
                    self.__updateJobStatus(job, "Failed", setFailed)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result["OK"]:
                        self.log.error("Failed to send accounting", result["Message"])

        recoverCounter = 0

        # Second pass: account failed-stalled jobs that slipped through earlier
        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({"Status": "Failed", "MinorStatus": minor, "AccountedFlag": "False"})
            if not result["OK"]:
                return result
            if result["Value"]:
                jobs = result["Value"]
                self.log.info("%s Stalled jobs will be Accounted" % (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result["OK"]:
                        self.log.error("Failed to send accounting", result["Message"])
                        continue

                    recoverCounter += 1
            # NOTE(review): at this point `result` holds the outcome of the LAST
            # __sendAccounting call above, not the selectJobs call (which already
            # returned on failure) — so one accounting failure aborts the loop
            # over the remaining minor statuses.  Looks unintended; confirm.
            if not result["OK"]:
                break

        if failedCounter:
            self.log.info("%d jobs set to Failed" % failedCounter)
        if recoverCounter:
            self.log.info("%d jobs properly Accounted" % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """Return S_OK(status) of the pilot running the given job, or
        S_OK("NoPilot") when the pilot is unknown to the WMSAdministrator."""
        refResult = self.jobDB.getJobParameter(jobID, "Pilot_Reference")
        if not refResult["OK"]:
            return refResult
        pilotReference = refResult["Value"]
        if not pilotReference:
            return S_ERROR("Failed to get the pilot reference")

        infoResult = RPCClient("WorkloadManagement/WMSAdministrator").getPilotInfo(pilotReference)
        if not infoResult["OK"]:
            if "No pilots found" in infoResult["Message"]:
                self.log.warn(infoResult["Message"])
                return S_OK("NoPilot")
            self.log.error("Failed to get pilot information", infoResult["Message"])
            return S_ERROR("Failed to get the pilot status")

        return S_OK(infoResult["Value"][pilotReference]["Status"])

    #############################################################################
    def __getStalledJob(self, job, stalledTime):
        """Return S_OK("Stalled") when the job's latest update (the most recent
        of LastUpdateTime and HeartBeatTime) is older than stalledTime seconds,
        S_ERROR otherwise."""
        updateResult = self.__getLatestUpdateTime(job)
        if not updateResult["OK"]:
            return updateResult

        elapsedTime = toEpoch() - updateResult["Value"]
        self.log.verbose("(CurrentTime-LastUpdate) = %s secs" % (elapsedTime))
        if elapsedTime > stalledTime:
            self.log.info("Job %s is identified as stalled with last update > %s secs ago" % (job, elapsedTime))
            return S_OK("Stalled")
        return S_ERROR("Job %s is running and will be ignored" % job)

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """Return S_OK(epoch seconds) of the most recent of HeartBeatTime and
        LastUpdateTime, or S_ERROR when neither attribute is set."""
        result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
        if not result["OK"]:
            self.log.error("Failed to get job attributes", result["Message"])
        if not result["OK"] or not result["Value"]:
            self.log.error("Could not get attributes for job", "%s" % job)
            return S_ERROR("Could not get attributes for job")

        self.log.verbose(result)
        attributes = result["Value"]
        latestUpdate = 0

        # Attribute values may be missing or the literal string "None"
        heartBeat = attributes["HeartBeatTime"]
        if heartBeat and heartBeat != "None":
            latestUpdate = toEpoch(fromString(heartBeat))
        else:
            self.log.verbose("HeartBeatTime is null for job %s" % job)

        lastUpdateString = attributes["LastUpdateTime"]
        if lastUpdateString and lastUpdateString != "None":
            latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateString)))
        else:
            self.log.verbose("LastUpdateTime is null for job %s" % job)

        if not latestUpdate:
            return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
        self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate))
        return S_OK(latestUpdate)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """Set the job Status (and optionally MinorStatus) in the JobDB and add
        a JobLoggingDB record; intended only for failing jobs from this agent.
        In disabled mode ("Enable" option False) the JobDB is not touched."""
        self.log.verbose("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status))

        if self.am_getOption("Enable", True):
            result = self.jobDB.setJobAttribute(job, "Status", status, update=True)
        else:
            result = S_OK("DisabledMode")

        if result["OK"] and minorstatus:
            self.log.verbose("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus))
            result = self.jobDB.setJobAttribute(job, "MinorStatus", minorstatus, update=True)

        if not minorstatus:  # Retain last minor status for stalled jobs
            attrResult = self.jobDB.getJobAttributes(job, ["MinorStatus"])
            if attrResult["OK"]:
                minorstatus = attrResult["Value"]["MinorStatus"]

        logResult = self.logDB.addLoggingRecord(job, status=status, minor=minorstatus, source="StalledJobAgent")
        if not logResult["OK"]:
            self.log.warn(logResult)

        return logResult

    def __getProcessingType(self, jobID):
        """Extract the ProcessingType from the original job JDL, returning
        "unknown" when it cannot be determined."""
        result = self.jobDB.getJobJDL(jobID, original=True)
        if not result["OK"]:
            return "unknown"
        classAdJob = ClassAd(result["Value"])
        if not classAdJob.lookupAttribute("ProcessingType"):
            return "unknown"
        return classAdJob.getAttributeString("ProcessingType")

    #############################################################################
    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job.

    Collects start/end times from the JobLoggingDB, CPU/wall-clock figures from
    the heartbeat data and the CPU normalization factor, then commits a Job
    accounting record with FinalMajorStatus "Failed" / FinalMinorStatus
    "Stalled".  On successful commit the job's AccountedFlag is set to "True".
    Returns the commit result (or S_ERROR on exception).
    """
        try:
            accountingReport = Job()
            # Defaults only used by the exception log message below
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"

            result = self.jobDB.getJobAttributes(jobID)
            if not result["OK"]:
                return result
            jobDict = result["Value"]

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # The last heartbeat may postdate the end time from the logging info
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            # Treat a missing/empty normalization factor as 0.0
            cpuNormalization = self.jobDB.getJobParameter(jobID, "CPUNormalizationFactor")
            if not cpuNormalization["OK"] or not cpuNormalization["Value"]:
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(cpuNormalization["Value"])
        except Exception:
            self.log.exception(
                "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
                % (str(jobID), str(endTime), str(lastHeartBeatTime)),
                "",
                False,
            )
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # Fill the accounting data
        acData = {
            "Site": jobDict["Site"],
            "User": jobDict["Owner"],
            "UserGroup": jobDict["OwnerGroup"],
            "JobGroup": jobDict["JobGroup"],
            "JobType": jobDict["JobType"],
            "JobClass": jobDict["JobSplitType"],
            "ProcessingType": processingType,
            "FinalMajorStatus": "Failed",
            "FinalMinorStatus": "Stalled",
            "CPUTime": lastCPUTime,
            "NormCPUTime": lastCPUTime * cpuNormalization,
            "ExecTime": lastWallTime,
            "InputDataSize": 0.0,
            "OutputDataSize": 0.0,
            "InputDataFiles": 0,
            "OutputDataFiles": 0,
            "DiskSpace": 0.0,
            "InputSandBoxSize": 0.0,
            "OutputSandBoxSize": 0.0,
            "ProcessedEvents": 0,
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData["ExecTime"]:
            acData["ExecTime"] = acData["CPUTime"]
        elif acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result["OK"]:
            self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        else:
            self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"]))
        return result

    def __checkHeartBeat(self, jobID, jobDict):
        """Return (lastCPUTime, lastWallTime, lastHeartBeatTime) extracted from
        the job's heartbeat data; times default to 0 when unavailable."""
        lastCPUTime = 0
        lastWallTime = 0
        # Fall back to the recorded execution start time when no heartbeat came
        lastHeartBeatTime = jobDict["StartExecTime"]
        if lastHeartBeatTime == "None":
            lastHeartBeatTime = 0

        result = self.jobDB.getHeartBeatData(jobID)
        if result["OK"]:
            for name, value, heartBeatTime in result["Value"]:
                if name == "CPUConsumed":
                    try:
                        lastCPUTime = max(lastCPUTime, int(float(value)))
                    except ValueError:
                        # Ignore unparsable heartbeat values
                        pass
                elif name == "WallClockTime":
                    try:
                        lastWallTime = max(lastWallTime, int(float(value)))
                    except ValueError:
                        pass
                if heartBeatTime > lastHeartBeatTime:
                    lastHeartBeatTime = heartBeatTime

        return lastCPUTime, lastWallTime, lastHeartBeatTime

    def __checkLoggingInfo(self, jobID, jobDict):
        """ Get the job start and end times from the JobLoggingDB records.

    The start time is taken from StartExecTime, falling back to the first
    "Running" logging record and finally to SubmissionTime; the end time is the
    last "Stalled" logging record, defaulting to now.

    :param jobID: job identifier
    :param jobDict: job attributes (at least StartExecTime and SubmissionTime)
    :return: tuple (startTime, endTime)
    """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result["OK"]:
            logList = result["Value"]

        startTime = jobDict["StartExecTime"]
        if not startTime or startTime == "None":
            # logging record fields: status, minor, app, stime, source
            for items in logList:
                if items[0] == "Running":
                    startTime = items[3]
                    break
            if not startTime or startTime == "None":
                startTime = jobDict["SubmissionTime"]

        # FIX: was `type(startTime) in types.StringTypes` (Python 2 only) and
        # logged `items[3]`, which could be unbound or unrelated to startTime
        if isinstance(startTime, str):
            startTimeString = startTime
            startTime = fromString(startTimeString)
            if startTime is None:
                self.log.error("Wrong timestamp in DB", startTimeString)
                startTime = dateTime()

        endTime = dateTime()
        lastStalledTime = None
        # logging record fields: status, minor, app, stime, source
        for items in logList:
            if items[0] == "Stalled":
                lastStalledTime = items[3]
                endTime = fromString(lastStalledTime)
        if endTime is None:
            self.log.error("Wrong timestamp in DB", lastStalledTime)
            endTime = dateTime()

        return startTime, endTime

    def __kickStuckJobs(self):
        """Reschedule jobs stuck too long in the initialization statuses
        Matched or Rescheduled; returns S_ERROR listing any reschedule failures."""
        message = ""

        # Same treatment for both stuck statuses, each with its own age limit
        for status, delay in (("Matched", self.matchedTime), ("Rescheduled", self.rescheduledTime)):
            checkTime = str(dateTime() - delay * second)
            result = self.jobDB.selectJobs({"Status": status}, older=checkTime)
            if not result["OK"]:
                self.log.error("Failed to select jobs", result["Message"])
                return result

            jobIDs = result["Value"]
            if jobIDs:
                self.log.info("Rescheduling %d jobs stuck in %s status" % (len(jobIDs), status))
                result = self.jobDB.rescheduleJobs(jobIDs)
                if "FailedJobs" in result:
                    if message:
                        message += "\n"
                    message += "Failed to reschedule %d jobs stuck in %s status" % (len(result["FailedJobs"]), status)

        if message:
            return S_ERROR(message)
        return S_OK()

    def __failCompletedJobs(self):
        """Fail jobs stuck in Completed status for longer than completedTime.

        Such jobs are due to pilots being killed during the finalization of the
        job execution; jobs with MinorStatus "Pending Requests" are left alone.
        """
        checkTime = str(dateTime() - self.completedTime * second)
        result = self.jobDB.selectJobs({"Status": "Completed"}, older=checkTime)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result

        for jobID in result["Value"]:
            attrResult = self.jobDB.getJobAttributes(jobID, ["Status", "MinorStatus"])
            if not attrResult["OK"]:
                self.log.error("Failed to get job attributes", attrResult["Message"])
                continue
            # Skip jobs that changed status meanwhile or still have pending requests
            if attrResult["Value"]["Status"] != "Completed":
                continue
            if attrResult["Value"]["MinorStatus"] == "Pending Requests":
                continue

            self.__updateJobStatus(jobID, "Failed", "Job died during finalization")
            accountingResult = self.__sendAccounting(jobID)
            if not accountingResult["OK"]:
                self.log.error("Failed to send accounting", accountingResult["Message"])

        return S_OK()