Example #1
0
def cleanTaskQueues():
  tqDB = TaskQueueDB()
  jobDB = JobDB()
  logDB = JobLoggingDB()

  result = tqDB.enableAllTaskQueues()
  if not result['OK']:
    return result
  result = tqDB.findOrphanJobs()
  if not result['OK']:
    return result
  for jid in result['Value']:
    result = tqDB.deleteJob(jid)
    if not result['OK']:
      gLogger.error("Cannot delete from TQ job %s" % jid, result['Message'])
      continue
    result = jobDB.rescheduleJob(jid)
    if not result['OK']:
      gLogger.error("Cannot reschedule in JobDB job %s" % jid, result['Message'])
      continue
    result = logDB.addLoggingRecord(jid, JobStatus.RECEIVED, "", "", source="JobState")
    if not result['OK']:
      gLogger.error("Cannot add logging record in JobLoggingDB %s" % jid, result['Message'])
      continue
  return S_OK()
Example #2
0
def initializeMatcherHandler(serviceInfo):
    """  Matcher Service initialization
  """

    global gJobDB
    global gTaskQueueDB
    global jlDB
    global pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching',
                              "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching',
                              "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching',
                              "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching',
                              "tqsk queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120,
                                     gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
Example #3
0
def initializeMatcherHandler(serviceInfo):
    """  Matcher Service initialization
  """

    global jobDB
    global jobLoggingDB
    global taskQueueDB

    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()

    gMonitor.registerActivity("matchTime", "Job matching time", "Matching", "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity(
        "matchTaskQueues", "Task queues checked per job", "Matching", "task queues", gMonitor.OP_MEAN, 300
    )
    gMonitor.registerActivity("matchesDone", "Job Matches", "Matching", "matches", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity("numTQs", "Number of Task Queues", "Matching", "tqsk queues", gMonitor.OP_MEAN, 300)

    taskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(120, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
Example #4
0
def initializeMatcherHandler( serviceInfo ):
  """  Matcher Service initialization
  """

  global gJobDB
  global gJobLoggingDB
  global gTaskQueueDB
  global gPilotAgentsDB

  gJobDB = JobDB()
  gJobLoggingDB = JobLoggingDB()
  gTaskQueueDB = TaskQueueDB()
  gPilotAgentsDB = PilotAgentsDB()

  gMonitor.registerActivity( 'matchTime', "Job matching time",
                             'Matching', "secs" , gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchesDone', "Job Match Request",
                             'Matching', "matches" , gMonitor.OP_RATE, 300 )
  gMonitor.registerActivity( 'matchesOK', "Matched jobs",
                             'Matching', "matches" , gMonitor.OP_RATE, 300 )
  gMonitor.registerActivity( 'numTQs', "Number of Task Queues",
                             'Matching', "tqsk queues" , gMonitor.OP_MEAN, 300 )

  gTaskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, gTaskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 60, sendNumTaskQueues )

  sendNumTaskQueues()

  return S_OK()
Example #5
0
  def test_matcher( self ):
    # insert a proper DN to run the test
    resourceDescription = {'OwnerGroup': 'prod', 'OwnerDN':'/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo', 'ReleaseVersion':'blabla', 'VirtualOrganization':'LHCB',
                           'PilotInfoReportedFlag':'True', 'PilotBenchmark':'anotherPilot', 'LHCbPlatform':'CERTO',
                           'Site':'DIRAC.Jenkins.org', 'CPUTime' : 86400 }
    matcher = RPCClient( 'WorkloadManagement/Matcher' )
    JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination( 'DIRAC.Jenkins.org' )
    job.setInputData( '/a/bbb' )
    job.setType( 'User' )
    jobDescription = createFile( job )
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )

    jobID = res['Value']

    res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
    self.assert_( res['OK'] )


    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup':'prod', 'Setup':'JenkinsSetup', 'CPUTime':86400}
    res = tqDB.insertJob( jobID, tqDefDict, 10 )
    self.assert_( res['OK'] )

    res = matcher.requestJob( resourceDescription )
    print res
    self.assert_( res['OK'] )
    wmsClient.deleteJob( jobID )
Example #6
0
    def initialize(self):
        """ Sets defaults
    """

        self.am_setOption("PollingTime", 120)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        agentTSTypes = self.am_getOption('ProductionTypes', [])
        if agentTSTypes:
            self.prod_types = agentTSTypes
        else:
            self.prod_types = Operations().getValue(
                'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" %
            (', '.join(self.prod_types)))
        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
        self.jobByJob = self.am_getOption('JobByJob', False)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

        self.removeStatusDelay['Done'] = self.am_getOption(
            'RemoveStatusDelay/Done', 7)
        self.removeStatusDelay['Killed'] = self.am_getOption(
            'RemoveStatusDelay/Killed', 7)
        self.removeStatusDelay['Failed'] = self.am_getOption(
            'RemoveStatusDelay/Failed', 7)
        self.removeStatusDelay['Any'] = self.am_getOption(
            'RemoveStatusDelay/Any', -1)

        return S_OK()
Example #7
0
  def test_matcher( self ):
    # insert a proper DN to run the test
    resourceDescription = {'OwnerGroup': 'prod', 'OwnerDN':'/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo', 'ReleaseVersion':'blabla', 'VirtualOrganization':'LHCB',
                           'PilotInfoReportedFlag':'True', 'PilotBenchmark':'anotherPilot', 'LHCbPlatform':'CERTO',
                           'Site':'DIRAC.Jenkins.org', 'CPUTime' : 86400 }
    matcher = RPCClient( 'WorkloadManagement/Matcher' )
    JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination( 'DIRAC.Jenkins.org' )
    job.setInputData( '/a/bbb' )
    job.setType( 'User' )
    jobDescription = createFile( job )
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )

    jobID = res['Value']

    res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
    self.assert_( res['OK'] )


    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup':'prod', 'Setup':'dirac-JenkinsSetup', 'CPUTime':86400}
    res = tqDB.insertJob( jobID, tqDefDict, 10 )
    self.assert_( res['OK'] )

    res = matcher.requestJob( resourceDescription )
    print res
    self.assert_( res['OK'] )
    wmsClient.deleteJob( jobID )
Example #8
0
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ c'tor
    """
        if pilotAgentsDB:
            self.pilotAgentsDB = pilotAgentsDB
        else:
            self.pilotAgentsDB = PilotAgentsDB()
        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()
        if tqDB:
            self.tqDB = tqDB
        else:
            self.tqDB = TaskQueueDB()
        if jlDB:
            self.jlDB = jlDB
        else:
            self.jlDB = JobLoggingDB()

        if opsHelper:
            self.opsHelper = opsHelper
        else:
            self.opsHelper = Operations()

        self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()
Example #9
0
  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
Example #10
0
    def initializeOptimizer(self):
        """Initialize specific parameters for TaskQueueAgent.
    """
        self.waitingStatus = self.am_getOption('WaitingStatus', 'Waiting')
        self.waitingMinorStatus = self.am_getOption('WaitingMinorStatus',
                                                    'Pilot Agent Submission')
        try:
            self.taskQueueDB = TaskQueueDB()
            result = self.taskQueueDB.enableAllTaskQueues()
            if not result['OK']:
                raise Exception("Can't enable TaskQueues: %s" %
                                result['Message'])

        except Exception, e:
            self.log.exception()
            return S_ERROR("Cannot initialize taskqueueDB: %s" % str(e))
Example #11
0
def initializeJobMonitoringHandler(serviceInfo):

    global jobDB, jobLoggingDB, taskQueueDB
    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()
Example #12
0
 def checkDBAccess(cls):
     # Init DB if there
     if not JobState.__db.checked:
         JobState.__db.jobDB = JobDB()
         JobState.__db.logDB = JobLoggingDB()
         JobState.__db.tqDB = TaskQueueDB()
         JobState.__db.checked = True
Example #13
0
  def export_getPilots(cls, jobID):
    """ Get pilot references and their states for :
      - those pilots submitted for the TQ where job is sitting
      - (or) the pilots executing/having executed the Job
    """

    pilots = []
    result = pilotDB.getPilotsForJobID(int(jobID))
    if not result['OK']:
      if result['Message'].find('not found') == -1:
        return S_ERROR('Failed to get pilot: ' + result['Message'])
    else:
      pilots += result['Value']
    if not pilots:
      # Pilots were not found try to look in the Task Queue
      taskQueueID = 0
      result = TaskQueueDB().getTaskQueueForJob(int(jobID))
      if result['OK'] and result['Value']:
        taskQueueID = result['Value']
      if taskQueueID:
        result = pilotDB.getPilotsForTaskQueue(taskQueueID, limit=10)
        if not result['OK']:
          return S_ERROR('Failed to get pilot: ' + result['Message'])
        pilots += result['Value']

    if not pilots:
      return S_ERROR('Failed to get pilot for Job %d' % int(jobID))

    return pilotDB.getPilotInfo(pilotID=pilots)
Example #14
0
def initializeJobMonitoringHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gTaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gTaskQueueDB = TaskQueueDB()
    return S_OK()
Example #15
0
  def initialize( self ):
    """ Sets defaults
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    
    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

    return S_OK()
Example #16
0
def initializeJobManagerHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gtaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    return S_OK()
Example #17
0
class TaskQueueAgent(OptimizerModule):
    """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for TaskQueueAgent.
    """
        self.waitingStatus = self.am_getOption('WaitingStatus', 'Waiting')
        self.waitingMinorStatus = self.am_getOption('WaitingMinorStatus',
                                                    'Pilot Agent Submission')
        try:
            self.taskQueueDB = TaskQueueDB()
            result = self.taskQueueDB.enableAllTaskQueues()
            if not result['OK']:
                raise Exception("Can't enable TaskQueues: %s" %
                                result['Message'])

        except Exception, e:
            self.log.exception()
            return S_ERROR("Cannot initialize taskqueueDB: %s" % str(e))
        return S_OK()
Example #18
0
    def test_matcher(self):
        # insert a proper DN to run the test
        resourceDescription = {
            "OwnerGroup": "prod",
            "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
            "DIRACVersion": "pippo",
            "GridCE": "some.grid.ce.org",
            "ReleaseVersion": "blabla",
            "VirtualOrganization": "LHCb",
            "PilotInfoReportedFlag": "True",
            "PilotBenchmark": "anotherPilot",
            "Site": "DIRAC.Jenkins.ch",
            "CPUTime": 86400,
        }
        wmsClient = WMSClient()

        job = helloWorldJob()
        job.setDestination("DIRAC.Jenkins.ch")
        job.setInputData("/a/bbb")
        job.setType("User")
        jobDescription = createFile(job)
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))

        jobID = res["Value"]

        # forcing the update
        res = JobStateUpdateClient().setJobStatus(jobID, JobStatus.WAITING,
                                                  "matching", "source", None,
                                                  True)
        self.assertTrue(res["OK"], res.get("Message"))

        tqDB = TaskQueueDB()
        tqDefDict = {
            "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
            "OwnerGroup": "prod",
            "Setup": "dirac-JenkinsSetup",
            "CPUTime": 86400,
        }
        res = tqDB.insertJob(jobID, tqDefDict, 10)
        self.assertTrue(res["OK"], res.get("Message"))

        res = MatcherClient().requestJob(resourceDescription)
        print(res)
        self.assertTrue(res["OK"], res.get("Message"))
        wmsClient.deleteJob(jobID)
Example #19
0
def initializeMatcherHandler(serviceInfo):
    """  Matcher Service initialization
  """

    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB
    global gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    res = gJobDB._checkTable()
    if not res['OK']:
        return res

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    res = gJobLoggingDB._checkTable()
    if not res['OK']:
        return res

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    res = gPilotAgentsDB._checkTable()
    if not res['OK']:
        return res

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching',
                              "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching',
                              "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching',
                              "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching',
                              "tqsk queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120,
                                     gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)

    sendNumTaskQueues()

    return S_OK()
Example #20
0
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
Example #21
0
    def initialize(self):
        """Sets defaults
    """

        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption('ProductionTypes', [
            'DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge',
            'production'
        ])
        gLogger.info(
            'Will exclude the following Production types from cleaning %s' %
            (string.join(self.prod_types, ', ')))
        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
        self.jobByJob = self.am_getOption('JobByJob', True)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
        return S_OK()
Example #22
0
  def initializeHandler(cls, svcInfoDict):
    cls.gJobDB = JobDB()
    cls.gJobLoggingDB = JobLoggingDB()
    cls.gTaskQueueDB = TaskQueueDB()

    cls.gElasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue(
        '/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
      cls.gElasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()
Example #23
0
def initializeWMSAdministratorHandler(serviceInfo):
    """  WMS AdministratorService initialization
  """

    global jobDB
    global taskQueueDB

    jobDB = JobDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
Example #24
0
  def initializeHandler(cls, svcInfoDict):
    """ WMS AdministratorService initialization
    """
    cls.jobDB = JobDB()
    cls.taskQueueDB = TaskQueueDB()

    cls.elasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue(
        '/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
      cls.elasticJobParametersDB = ElasticJobParametersDB()

    return S_OK()
Example #25
0
  def initializeOptimizer( self ):
    """Initialize specific parameters for TaskQueueAgent.
    """
    self.waitingStatus = self.am_getOption( 'WaitingStatus', 'Waiting' )
    self.waitingMinorStatus = self.am_getOption( 'WaitingMinorStatus', 'Pilot Agent Submission' )
    try:
      self.taskQueueDB = TaskQueueDB()
      result = self.taskQueueDB.enableAllTaskQueues()
      if not result[ 'OK' ]:
        raise Exception( "Can't enable TaskQueues: %s" % result[ 'Message' ] )

    except Exception, e:
      self.log.exception()
      return S_ERROR( "Cannot initialize taskqueueDB: %s" % str( e ) )
Example #26
0
def initializeWMSAdministratorHandler(serviceInfo):
    """ WMS AdministratorService initialization

      :param dict serviceInfo: service information dictionary

      :return: S_OK()/S_ERROR()
  """
    global jobDB
    global taskQueueDB

    jobDB = JobDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
Example #27
0
def initializeMatcherHandler( serviceInfo ):
  """  Matcher Service initialization
  """

  global jobDB
  global jobLoggingDB
  global taskQueueDB

  jobDB = JobDB()
  jobLoggingDB = JobLoggingDB()
  taskQueueDB = TaskQueueDB()

  gMonitor.registerActivity( 'matchTime', "Job matching time", 'Matching', "secs" , gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchTaskQueues', "Task queues checked per job", 'Matching', "task queues" , gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchesDone', "Job Matches", 'Matching', "matches" , gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'numTQs', "Number of Task Queues", 'Matching', "tqsk queues" , gMonitor.OP_MEAN, 300 )

  taskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, taskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 120, sendNumTaskQueues )

  sendNumTaskQueues()

  return S_OK()
Example #28
0
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption('ProductionTypes',['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
Example #29
0
def initializeJobManagerHandler(serviceInfo):

    global gJobDB, gJobLoggingDB, gtaskQueueDB, enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    enablePilotsLogging = gConfig.getValue(
        serviceInfo['serviceSectionPath'].replace('JobManager',
                                                  'PilotsLogging') + '/Enable',
        'False').lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()
    return S_OK()
Example #30
0
    def initializeHandler(cls, serviceInfoDict):
        """ Initialization of DB objects and OptimizationMind
    """
        cls.jobDB = JobDB()
        cls.jobLoggingDB = JobLoggingDB()
        cls.taskQueueDB = TaskQueueDB()
        cls.pilotAgentsDB = PilotAgentsDB()

        cls.pilotsLoggingDB = None
        enablePilotsLogging = Operations().getValue(
            '/Services/JobMonitoring/usePilotsLoggingFlag', False)
        if enablePilotsLogging:
            cls.pilotsLoggingDB = PilotsLoggingDB()

        cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
        cls.__connectToOptMind()
        gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)
        return S_OK()
Example #31
0
    def initialize(self):
        """Sets defaults
    """

        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" % (string.join(self.prod_types, ", "))
        )
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()
Example #32
0
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',100)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
Example #33
0
class TaskQueuesAgent(AgentModule):
    """Agent for recalculating TQ shares"""
    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        # clients
        self.tqDB = None

    def initialize(self):
        """just initialize TQDB"""
        self.tqDB = TaskQueueDB()
        return S_OK()

    def execute(self):
        """calls TQDB.recalculateTQSharesForAll"""
        res = self.tqDB.recalculateTQSharesForAll()
        if not res["OK"]:
            self.log.error("Error recalculating TQ shares", res["Message"])

        return S_OK()
Example #34
0
class TaskQueueAgent( OptimizerModule ):
  """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for TaskQueueAgent.
    """
    self.waitingStatus = self.am_getOption( 'WaitingStatus', 'Waiting' )
    self.waitingMinorStatus = self.am_getOption( 'WaitingMinorStatus', 'Pilot Agent Submission' )
    try:
      self.taskQueueDB = TaskQueueDB()
      result = self.taskQueueDB.enableAllTaskQueues()
      if not result[ 'OK' ]:
        raise Exception( "Can't enable TaskQueues: %s" % result[ 'Message' ] )

    except Exception, e:
      self.log.exception()
      return S_ERROR( "Cannot initialize taskqueueDB: %s" % str( e ) )
    return S_OK()
Example #35
0
  def initializeHandler(cls, serviceInfoDict):
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    gMonitor.registerActivity('matchTime', "Job matching time",
                              'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request",
                              'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs",
                              'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues",
                              'Matching', "tqsk queues", gMonitor.OP_MEAN, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)

    cls.sendNumTaskQueues()
    return S_OK()
Example #36
0
def initializeWMSAdministratorHandler(serviceInfo):
  """  WMS AdministratorService initialization
  """

  global jobDB
  global pilotDB
  global taskQueueDB
  global enablePilotsLogging

  # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
  enablePilotsLogging = gConfig.getValue(
      serviceInfo['serviceSectionPath'].replace(
          'WMSAdministrator',
          'PilotsLogging') + '/Enable',
      'False').lower() in (
      'yes',
      'true')

  jobDB = JobDB()
  pilotDB = PilotAgentsDB()
  taskQueueDB = TaskQueueDB()
  if enablePilotsLogging:
    pilotsLoggingDB = PilotsLoggingDB()
  return S_OK()
# File :    dirac-admin-submit-pilot-for-job
# Author :  Ricardo Graciani
########################################################################
__RCSID__ = "cd6b25c (2010-12-04 11:45:50 +0000) Ricardo Graciani <*****@*****.**>"
import sys
import DIRAC
from DIRAC.Core.Base import Script

Script.parseCommandLine( ignoreErrors = True )
args = Script.getPositionalArgs()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB

jobdb = JobDB()
tqdb = TaskQueueDB()

result = jobdb.selectJobs( { 'Status' : [ 'Received', 'Checking', 'Waiting' ] } )
if not result[ 'OK' ]:
  print result[ 'Message' ]
  sys.exit( 1 )
jobList = result[ 'Value' ]
print tqdb.forceRecreationOfTables()
for job in jobList:
  result = jobdb.getJobAttribute( job, 'RescheduleCounter' )
  if not result[ 'OK' ]:
    print "Cannot get reschedule counter for job %s" % job
    rC = 0
  rC = result[ 'Value' ]
  if rC >= jobdb.maxRescheduling:
    jobdb.setJobAttribute( job, "RescheduleCounter", "0" )
Example #38
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',100)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()

  def __getAllowedJobTypes( self ):
    #Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

      
    result = self.deleteJobOversizedSandbox( jobList ) 
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedle removal of oversized sandboxes", result[ 'Message' ] )
      return result 
    
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) ) 

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:  
          error_count += 1  
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)  
    else:    
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )
  
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1    

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """ 

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogting JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )   

    # Schedule removal of the LFNs now

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn
           
    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )   
    
  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Example #39
0

    Suggestion: for local testing, run this with::
        python -m pytest -c ../pytest.ini  -vv tests/Integration/WorkloadManagementSystem/Test_TaskQueueDB.py
"""

from DIRAC import gLogger

from DIRAC.Core.Base.Script import parseCommandLine

parseCommandLine()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB

gLogger.setLevel('DEBUG')
tqDB = TaskQueueDB()


def test_basicChain():
    """ a basic put - remove
  """
    tqDefDict = {
        'OwnerDN': '/my/DN',
        'OwnerGroup': 'myGroup',
        'Setup': 'aSetup',
        'CPUTime': 50000
    }
    result = tqDB.insertJob(123, tqDefDict, 10)
    assert result['OK'] is True
    result = tqDB.getTaskQueueForJobs([123])
    assert result['OK'] is True
Example #40
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.removeStatusDelay = {'Done':7,
                              'Killed':1,
                              'Failed':7 }

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    
    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    
    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()
    
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

      
    result = self.deleteJobOversizedSandbox( jobList ) 
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result 
    
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) ) 

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:  
          error_count += 1  
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)  
    else:    
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )
  
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1    

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """ 

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )   

    # Schedule removal of the LFNs now

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn
           
    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )   
    
  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Example #41
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()

  def __getAllowedJobTypes( self ):
    #Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:  
          error_count += 1  
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)  
    else:    
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )
  
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1    

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
Example #42
0
 def setUp(self):
     gLogger.setLevel('DEBUG')
     self.tqDB = TaskQueueDB()
Example #43
0
class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor
    """
    if pilotAgentsDB:
      self.pilotAgentsDB = pilotAgentsDB
    else:
      self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
      self.jobDB = jobDB
    else:
      self.jobDB = JobDB()
    if tqDB:
      self.tqDB = tqDB
    else:
      self.tqDB = TaskQueueDB()
    if jlDB:
      self.jlDB = jlDB
    else:
      self.jlDB = JobLoggingDB()

    if opsHelper:
      self.opsHelper = opsHelper
    else:
      self.opsHelper = Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
    """

    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      for name in singleValueDefFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if resourceDescription.has_key( name ):
          resourceDict[name] = resourceDescription[name]

      if resourceDescription.has_key( 'JobID' ):
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN'  in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                   resourceDict[ 'ReleaseProject' ] ) )
Example #44
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', ')))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()

  def __getAllowedJobTypes( self ):
    #Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:  
          error_count += 1  
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)  
    else:    
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )
  
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1    

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
Example #45
0
class JobCleaningAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self):
        """Sets defaults
    """

        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" % (string.join(self.prod_types, ", "))
        )
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()

    def __getAllowedJobTypes(self):
        # Get valid jobTypes
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = []
        for jobType in result["Value"]:
            if jobType not in self.prod_types:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.
    """
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({"Status": "Deleted"})
        if not result["OK"]:
            return result
        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result["OK"]:
            return result
        baseCond = {"JobType": result["Value"]}
        # Remove jobs with final status
        for status in REMOVE_STATUS_DELAY:
            delay = REMOVE_STATUS_DELAY[status]
            condDict = dict(baseCond)
            condDict["Status"] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result["OK"]:
                gLogger.warn("Failed to remove jobs in status %s" % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """ Remove deleted jobs
    """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict)

        if not result["OK"]:
            return result

        jobList = result["Value"]
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[: self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"])

        if self.jobByJob:
            for jobID in jobList:
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultJobDB["OK"]:
                    gLogger.warn("Failed to remove job %d from JobDB" % jobID, result["Message"])
                    error_count += 1
                elif not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, result["Message"])
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result["OK"]:
                gLogger.error("Failed to delete %d jobs from JobDB" % len(jobList))
            else:
                gLogger.info("Deleted %d jobs from JobDB" % len(jobList))

            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1

        if count > 0 or error_count > 0:
            gLogger.info("Deleted %d jobs from JobDB, %d errors" % (count, error_count))
        return S_OK()
Example #46
0
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:

         *  initialize() for initial settings
         *  beginExecution()
         *  execute() - the main method called in the agent cycle
         *  endExecution()
         *  finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.prodTypes = []

    self.removeStatusDelay = {}

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prodTypes = agentTSTypes
    else:
      self.prodTypes = Operations().getValue(
          'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (
        ', '.join(self.prodTypes)))
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prodTypes:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result

    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()

    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      if delay < 0:
        # Negative delay means don't delete anything...
        continue
      condDict = dict( baseCond )
      if status != 'Any':
        condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )

    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
      return result

    result = self.deleteJobOversizedSandbox(jobList)
    if not result[ 'OK' ]:
      gLogger.error(
          "Cannot schedule removal of oversized sandboxes", result['Message'])
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop(jobList.index(job))

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """

    failed = {}
    successful = {}

    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK({'Successful': successful, 'Failed': failed})

    # Schedule removal of the LFNs now

    for lfn,jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue

      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK(result)

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn

    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
Example #47
0
# print db.checkImageStatus( 'name', 'flavor'*10, 'requirements' )

ret = db.insertInstance( 'Image3', 'instance' )
print "insertInstance          ", ret
ret = db.insertInstance( 'Image2', 'instance' )
print "insertInstance          ", ret

if not ret['OK']:
  DIRAC.exit()

print type( ret['Value'] )
print "declareInstanceSubmitted", db.declareInstanceSubmitted( ret['Value'] )
id1 = DIRAC.Time.toString()
print "declareInstanceRunning  ", db.declareInstanceRunning( 'Image3', id1, 'IP', 'ip' )
id2 = DIRAC.Time.toString()
print "declareInstanceRunning  ", db.declareInstanceRunning( 'Image2', id2, 'IP', 'ip' )
print "declareInstanceRunning  ", db.instanceIDHeartBeat( id2, 1.0 )

for status in validStates:
  print "get%10sInstances  " % status, db.getInstancesByStatus( status )

print "declareInstanceHalting  ", db.declareInstanceHalting( id1, 0.0 )
print "declareInstanceHalting  ", db.declareInstanceHalting( id2, 0.0 )

print "declareStalledInstances ", db.declareStalledInstances()
print "declareStalledInstances ", db.declareStalledInstances()

from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
tq = TaskQueueDB()
print tq.retrieveTaskQueues()