Beispiel #1
0
def initSites():
  '''
    Copy the site statuses of the JobDB site mask into the "SiteStatus" table.

    Every site returned by JobDB.getAllSiteMaskStatus() is handed to
    ResourceStatusDB.addIfNotThere() with the reason 'Synchronized'.
    If either DIRAC call reports a failure, the message is logged and the
    process is terminated through DIRACExit( 1 ).

    :return: S_OK() once all sites have been processed
  '''

  jobClient = JobDB()
  rssClient = ResourceStatusDB()

  sites = jobClient.getAllSiteMaskStatus()

  if not sites[ 'OK' ]:
    subLogger.error( sites[ 'Message' ] )
    DIRACExit( 1 )

  # items() rather than the Python 2 only iteritems(): works on both interpreters.
  for site, elements in sites[ 'Value' ].items():
    table  = { 'table': 'SiteStatus' }
    # elements is indexed positionally: [0] status, [1] effective date, [2] token owner.
    # NOTE(review): Datetime is not defined in this block - presumably a module-level
    # value used as the token expiration; confirm against the file header.
    insert = { 'Status': elements[0], 'Reason': 'Synchronized', 'Name': site, 'DateEffective': elements[1], 'TokenExpiration': Datetime,
             'ElementType': 'Site', 'StatusType': 'all', 'LastCheckTime': None, 'TokenOwner': elements[2] }

    result = rssClient.addIfNotThere( insert, table )

    if not result[ 'OK' ]:
      subLogger.error( result[ 'Message' ] )
      DIRACExit( 1 )

  return S_OK()
    def doNew(self, masterParams=None):

        hosts = masterParams

        sql = """
select JP.Value, J.Status, J.Site, count(*) from Jobs J, JobParameters JP 
where J.JobID = JP.JobID and JP.Name = 'HostName' 
and J.EndExecTime >= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) 
group by JP.Value, J.Status
"""

        jobDB = JobDB()
        queryRes = jobDB._query(sql)
        if not queryRes["OK"]:
            return queryRes
        records = queryRes["Value"]

        hostJobs = {}

        for record in records:
            hostName = record[0]
            status = record[1]
            if status != "Done" and status != "Failed":
                continue
            if hostName not in hostJobs:
                hostJobs[hostName] = {"Site": record[2], "Done": 0, "Failed": 0}
            hostJobs[hostName][record[1]] = record[3]

        uniformResult = []

        for host, hostDict in hostJobs.items():
            hostDict["Host"] = host
            try:
                hosts.remove(host)
            except ValueError:
                pass
            if hostDict["Done"] == 0 and hostDict["Failed"] == 0:
                hostDict["Efficiency"] = 0.0
            else:
                hostDict["Efficiency"] = (
                    math.floor(float(hostDict["Done"]) / (hostDict["Done"] + hostDict["Failed"]) * 1000) / 10
                )
            uniformResult.append(hostDict)

        if len(hosts) != 0:
            deleteRes = self.rmClient.deleteWorkNodeCache(host=hosts)
            if not deleteRes["OK"]:
                return deleteRes

        storeRes = self._storeCommand(uniformResult)
        if not storeRes["OK"]:
            return storeRes

        return S_OK(uniformResult)
def mysql_querry(querry):
    """Execute a raw SQL statement against the JobDB database and return its rows.

    When the module-level flag USE_PURE_MYSQL is true, a direct MySQLdb
    connection is opened for this single statement and closed afterwards;
    otherwise the statement goes through ``JobDB._query``.

    :param querry: SQL text, executed as given (no parameter binding is done
        here, so it must not contain untrusted input)
    :return: the fetched rows
    :raises RuntimeError: if ``JobDB._query`` reports a failure
    """
    if USE_PURE_MYSQL:
        import MySQLdb#@UnresolvedImport
        # NOTE(review): host and credentials are hard-coded here; they look like
        # placeholders and should come from configuration.
        db = MySQLdb.connect(host="diracdb2.ihep.ac.cn", # your host, usually localhost
                         user="******", # your username
                         passwd="###DIRAC_DB_PASS###", # your password
                         db="JobDB")
        try:
            cur = db.cursor()
            try:
                cur.execute(querry)
                return cur.fetchall()
            finally:
                cur.close()
        finally:
            # Previously the connection was never closed, leaking one per call.
            db.close()

    result = JobDB()._query( querry )
    if not result['OK']:
        # Surface the real reason instead of a bare KeyError on 'Value'.
        raise RuntimeError( result['Message'] )
    return result['Value']
Beispiel #4
0
class JobDBTestCase( unittest.TestCase ):
  """ Base class for the JobDB test cases

      setUp creates a JobDB handle with DEBUG logging; tearDown removes every
      job returned by selectJobs( {} ) so that tests start from an empty table.
  """

  def setUp( self ):
    """ Raise the log level and open the database handle used by the tests.
    """
    gLogger.setLevel( 'DEBUG' )
    self.jobDB = JobDB()

  def tearDown( self ):
    """ Delete all jobs found in the database, failing on any DB error.
    """
    result = self.jobDB.selectJobs( {} )
    # assertTrue replaces the deprecated alias assert_ (removed in Python 3.12).
    self.assertTrue( result['OK'], 'Status after selectJobs' )
    for job in result['Value']:
      result = self.jobDB.removeJobFromDB( job )
      self.assertTrue( result['OK'] )
Beispiel #5
0
  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ Store the collaborators needed for matching.

        Each argument may be omitted (or falsy); the constructor then builds
        the default object itself: PilotAgentsDB, JobDB, TaskQueueDB,
        JobLoggingDB and Operations respectively. A Limiter sharing the same
        JobDB and Operations helper is created last.
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    # The limiter reuses the DB handle and helper chosen above.
    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
Beispiel #6
0
class JobDBTest( unittest.TestCase ):
  """ Unit tests for JobDB that run without a database: the constructor and
      the low-level _query / _escapeString helpers are replaced by mocks.
  """

  def setUp( self ):
    """ Create a JobDB instance whose real __init__ is never executed.
    """

    def mockInit(self ):
      # Stand-in for JobDB.__init__: only sets the attributes listed here.
      self.log = MagicMock()
      self.logger = MagicMock()
      self._connected = True

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    with patch( MODULE_NAME+".JobDB.__init__", new=mockInit):
      self.jobDB = JobDB()
    self.jobDB._query = MagicMock( name="Query" )
    self.jobDB._escapeString = MagicMock( return_value=S_OK() )

  def tearDown( self ):
    pass


  def test_getInputData( self ):
    """ getInputData must return the LFNs without the optional 'LFN:' prefix.
    """
    self.jobDB._query.return_value = S_OK( (( '/vo/user/lfn1',), ('LFN:/vo/user/lfn2',)) )
    result = self.jobDB.getInputData( 1234 )
    # Function-call form: same output on Python 2, and valid syntax on Python 3.
    print( result )
    self.assertTrue( result['OK'] )
    self.assertEqual( result['Value'], [ '/vo/user/lfn1', '/vo/user/lfn2' ] )
Beispiel #7
0
  def initialize( self ):
    """ Set the agent defaults: polling time, DB handles, the production types
        excluded from cleaning, batch settings and per-status removal delays.

        :return: S_OK()
    """

    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )

    # The agent option wins; the Operations value is only consulted when it is empty.
    configuredTypes = self.am_getOption('ProductionTypes', [])
    self.prod_types = configuredTypes if configuredTypes else \
                      Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    excluded = ', '.join( self.prod_types )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % excluded )

    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

    # Default delay per status, each overridable through RemoveStatusDelay/<status>.
    for status, defaultDelay in ( ( 'Done', 7 ), ( 'Killed', 7 ), ( 'Failed', 7 ), ( 'Any', -1 ) ):
      self.removeStatusDelay[status] = self.am_getOption( 'RemoveStatusDelay/%s' % status, defaultDelay )

    return S_OK()
Beispiel #8
0
  def initialize( self, jobDB = False, logDB = False ):
    """ Prepare the optimizer: DB handles, optimizer name and status defaults.

        :param jobDB: JobDB-like object to use; a falsy value creates a JobDB
        :param logDB: JobLoggingDB-like object to use; a falsy value creates one
        :return: whatever initializeOptimizer() returns
    """
    self.jobDB = jobDB if jobDB else JobDB()
    self.logDB = logDB if logDB else JobLoggingDB()

    # The optimizer name is the agent name without a trailing "Agent".
    suffix = "Agent"
    agentName = self.am_getModuleParam( 'agentName' )
    if agentName.endswith( suffix ):
      agentName = agentName[ :-len( suffix ) ]
    self.am_setModuleParam( 'optimizerName', agentName )

    self.startingMinorStatus = self.am_getModuleParam( 'optimizerName' )
    self.startingMajorStatus = "Checking"
    self.failedStatus = self.am_getOption( "FailedJobStatus" , 'Failed' )
    self.requiredJobInfo = 'jdl'
    self.am_setOption( "PollingTime", 30 )

    return self.initializeOptimizer()
Beispiel #9
0
class JobDBTestCase(unittest.TestCase):
  """ Base class for the JobDB test cases
  """

  def setUp(self):
    """ Print a separator line and open the test database handle.
    """
    # print('') emits the same empty line on Python 2 and, unlike a bare
    # `print`, still does so on Python 3.
    print('')
    self.jobDB = JobDB('Test',20)

  def createJob(self):
    """ Insert the job described by the local file test.jdl.

        :return: the job ID obtained from JobDB.getJobID()
    """
    result = self.jobDB.getJobID()
    self.assertTrue(result['OK'], 'Failed to get a new job ID')
    jobID = result['Value']
    # Context manager: the file is closed even if read() raises.
    with open("test.jdl","r") as jdlfile:
      jdl = jdlfile.read()
    # The outcome of the insertion is left unchecked, as before.
    self.jobDB.insertJobIntoDB(jobID,jdl)
    return jobID
Beispiel #10
0
 def initialize( self ):
   """ Create the DB handles, reset the optimizer registry and set the
       default polling time.

       :return: S_OK()
   """
   self.jobDB = JobDB()
   self.jobLoggingDB = JobLoggingDB()

   # Starts empty; nothing in this method fills it.
   self._optimizers = {}

   self.am_setOption( "PollingTime", 30 )
   return S_OK()
Beispiel #11
0
  def initialize( self ):
    """ Create the DB handles and set the default polling time to one hour.

        Logs a notice when the 'Enable' option is switched off.

        :return: S_OK()
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    oneHour = 60 * 60
    self.am_setOption( 'PollingTime', oneHour )

    enabled = self.am_getOption( 'Enable', True )
    if not enabled:
      self.log.info( 'Stalled Job Agent running in disabled mode' )
    return S_OK()
Beispiel #12
0
class JobHistoryAgent( AgentModule ):
  """
      Agent publishing job counters to gMonitor.

      For every status in MONITOR_STATUS it reports the number of jobs at each
      site of MONITOR_SITES plus the total over all sites. The counters are
      read from JobDB at most once per reportPeriod seconds and cached in
      between.
  """

  def initialize( self ):
    """ Open the JobDB handle and register one activity per status/site pair.

        :return: S_OK()
    """
    self.jobDB = JobDB()

    for status in MONITOR_STATUS:
      for site in MONITOR_SITES:
        activity = "%s-%s" % ( status, site )
        description = "Jobs in %s state at %s" % ( status, site )
        gLogger.verbose( "Registering activity " + activity )
        gLogger.verbose( description )
        gMonitor.registerActivity( activity, description,
                                   "JobHistoryAgent", "Jobs/minute", gMonitor.OP_MEAN )

    # last_update = 0 guarantees that the first cycle queries the database.
    self.last_update = 0
    self.resultDB = None
    self.reportPeriod = 60
    return S_OK()

  def execute( self ):
    """ Refresh the cached counters if they are stale and publish the marks.

        :return: S_OK(), or S_ERROR if the counters cannot be read
    """
    elapsed = time.time() - self.last_update
    if elapsed > self.reportPeriod:
      result = self.jobDB.getCounters( 'Jobs', ['Status', 'Site'], {}, '' )
      if not result['OK']:
        return S_ERROR( 'Failed to get data from the Job Database' )
      self.resultDB = result['Value']
      self.last_update = time.time()

    totalDict = dict.fromkeys( MONITOR_STATUS, 0 )

    for row in self.resultDB:
      attributes, count = row[0], row[1]
      site = attributes['Site']
      status = attributes['Status']
      if site in MONITOR_SITES and status in MONITOR_STATUS:
        markName = "%s-%s" % ( status, site )
        gLogger.verbose( "Adding mark " + markName + ": " + str( count ) )
        gMonitor.addMark( markName, count )
      # Totals include sites that are not monitored individually.
      if status in totalDict:
        totalDict[status] += count

    for status in MONITOR_STATUS:
      total = totalDict[status]
      gLogger.verbose( "Adding mark %s-All sites: " % status + str( total ) )
      gMonitor.addMark( "%s-All sites" % status, total )

    return S_OK()
Beispiel #13
0
  def initialize( self ):
    """ Set the default agent options and create the DB and admin handles.

        :return: S_OK()
    """
    defaults = ( ( 'PollingTime', 120 ), ( 'GridEnv', '' ), ( 'PilotStalledDays', 3 ) )
    for optionName, optionValue in defaults:
      self.am_setOption( optionName, optionValue )

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()
    return S_OK()
Beispiel #14
0
  def setUp( self ):
    """ Build a JobDB whose constructor is replaced by a stub and whose
        _query and _escapeString helpers are mocks.
    """

    def fakeInit( instance ):
      # Replacement for JobDB.__init__: sets only these three attributes.
      instance.log = MagicMock()
      instance.logger = MagicMock()
      instance._connected = True

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    initTarget = MODULE_NAME+".JobDB.__init__"
    with patch( initTarget, new=fakeInit ):
      self.jobDB = JobDB()

    self.jobDB._query = MagicMock( name="Query" )
    self.jobDB._escapeString = MagicMock( return_value=S_OK() )
    def getDataFromDB(self):
        """Collect, per (site, host), the "Done" and total job counts for three windows.

        Three queries are run through ``mysql_querry``: one without a time
        filter (labelled "week data" by the original author, although the SQL
        itself is not limited to a week), one for jobs submitted in the last
        48 hours and one for the last 24 hours.

        :return: a list of 8-element lists
            ``[site, host, doneAll, totalAll, done48h, total48h, done24h, total24h]``;
            windows without data for a host are filled with zeros.
        """
        # The unused "SELECT * FROM Jobs" that used to run first has been
        # dropped: it fetched the whole table only to discard the result.
        queries = (
            # All jobs ("week data")
            "select J.Site, JP.Value, sum(1),  S.total from Jobs J, JobParameters JP, (select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP where J.JobID=JP.JobID and JP.Name='HostName' group by JP.Value) S where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value group by JP.Value order by J.Site asc",
            # Last 48 hours
            "select J.Site, JP.Value, sum(1), S.total from Jobs J, JobParameters JP, (select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP where J.JobID=JP.JobID and JP.Name='HostName' and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 48 HOUR) group by JP.Value) S where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 48 HOUR) group by JP.Value order by J.Site asc",
            # Last 24 hours
            "select J.Site, JP.Value, sum(1),  S.total from Jobs J, JobParameters JP, (select JP.Value as host, sum(1) as total from Jobs J, JobParameters JP where J.JobID=JP.JobID and JP.Name='HostName' and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) group by JP.Value) S where J.JobID=JP.JobID and JP.Name='HostName' and J.Status='Done' and S.host=JP.Value and J.SubmissionTime>= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR) group by JP.Value order by J.Site asc",
        )

        data = {}
        for window, cmd in enumerate(queries):
            for row in mysql_querry(cmd):
                key = (row[0], row[1])
                counts = [int(row[2]), int(row[3])]
                if window == 0:
                    data[key] = counts
                    continue
                if key not in data:
                    data[key] = [0, 0]
                    # Single pre-formatted string: same text under Python 2 and 3.
                    print("Strange behavior: for  %s  was no week data" % (key,))
                slot = data[key]
                # Zero-fill any window this host was missing from, so that the
                # counts below always land in the columns of the current window.
                slot.extend([0] * (2 * window - len(slot)))
                slot.extend(counts)

        prejson = [[key[0], key[1]] + counts for key, counts in data.items()]
        for row in prejson:
            # Pad to site + host + three (done, total) pairs.
            row.extend([0] * (8 - len(row)))

        return prejson
Beispiel #16
0
  def __init__( self, args = None, clients = None ):
    """ Initialise the command and pick the JobDB / ResourceManagementClient.

        An object registered in self.apis under 'JobDB' or
        'ResourceManagementClient' is used as is; otherwise a new instance of
        the corresponding class is created.
    """
    super( JobCommand, self ).__init__( args, clients )

    self.jobDB = self.apis[ 'JobDB' ] if 'JobDB' in self.apis else JobDB()

    self.rmClient = ( self.apis[ 'ResourceManagementClient' ]
                      if 'ResourceManagementClient' in self.apis
                      else ResourceManagementClient() )
Beispiel #17
0
  def doNew( self, masterParams = None ):

    sql = """
select JP.Value, J.Status, J.Site, count(*) from Jobs J, JobParameters JP
where J.JobID = JP.JobID and JP.Name = 'HostName'
and J.LastUpdateTime >= DATE_SUB(UTC_TIMESTAMP(),INTERVAL 24 HOUR)
group by JP.Value, J.Status
"""

    jobDB = JobDB()
    queryRes = jobDB._query(sql)
    if not queryRes[ 'OK' ]:
      return queryRes
    records = queryRes[ 'Value' ]

    hostJobs = {}

    for record in records:
      hostName = record[ 0 ]
      if hostName not in hostJobs:
        hostJobs[ hostName ] = { 'Site' : record[ 2 ], 'Running' : 0, 'Done' : 0, 'Failed' : 0 }
      hostJobs[ hostName ][ record[ 1 ] ] = record[ 3 ]

    uniformResult = []

    for host, hostDict in hostJobs.items():
      hostDict[ 'Host' ] = host
      if hostDict[ 'Done' ] == 0 and hostDict[ 'Failed' ] == 0:
        hostDict[ 'Efficiency' ] = 0
      else:
        hostDict[ 'Efficiency' ] = float( hostDict[ 'Done' ] ) / ( hostDict[ 'Done' ] + hostDict[ 'Failed' ] )
      uniformResult.append( hostDict )

    storeRes = self._storeCommand( uniformResult )
    if not storeRes[ 'OK' ]:
      return storeRes

    return S_OK( uniformResult )
Beispiel #18
0
  def initialize(self):
    """ Set the default agent options, create the DB / client handles and read
        the pilot clean-up delays from the agent options.

        :return: S_OK()
    """
    for optionName, optionValue in (('PollingTime', 120), ('GridEnv', ''), ('PilotStalledDays', 3)):
      self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # Defaults: 30 for ClearPilotsDelay, 7 for ClearAbortedPilotsDelay.
    self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)

    self.WMSAdministrator = WMSAdministratorClient()

    return S_OK()
Beispiel #19
0
  def initialize( self ):
    """ Set the agent defaults: polling time, DB handles, the production types
        excluded from cleaning and the batch / throttling settings.

        :return: S_OK()
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption('ProductionTypes',['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    # str.join replaces string.join(), which no longer exists in Python 3;
    # the resulting text is identical.
    gLogger.info('Will exclude the following Production types from cleaning %s'%(', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
Beispiel #20
0
  def initialize( self ):
    """ Open the JobDB handle, register one monitoring activity per
        status/site pair and reset the counter cache.

        :return: S_OK()
    """
    self.jobDB = JobDB()

    for status in MONITOR_STATUS:
      for site in MONITOR_SITES:
        activity = "%s-%s" % ( status, site )
        description = "Jobs in %s state at %s" % ( status, site )
        gLogger.verbose( "Registering activity " + activity )
        gLogger.verbose( description )
        gMonitor.registerActivity( activity, description,
                                   "JobHistoryAgent", "Jobs/minute", gMonitor.OP_MEAN )

    self.last_update = 0
    self.resultDB = None
    self.reportPeriod = 60
    return S_OK()
  def initialize( self ):
    """ Create the JobDB handle, set the polling time to the report period and
        derive the JobDB field names from the summary key fields.

        :return: S_OK()
    """
    self.dsClients = {}
    self.jobDB = JobDB()

    self.reportPeriod = 850
    self.am_setOption( "PollingTime", self.reportPeriod )

    # 'User' and 'UserGroup' are stored as 'Owner' and 'OwnerGroup' in JobDB;
    # every other field keeps its name.
    self.__jobDBFields = [ 'Owner' if field == 'User' else
                           'OwnerGroup' if field == 'UserGroup' else field
                           for field in self.__summaryKeyFieldsMapping ]
    return S_OK()
Beispiel #22
0
class SPTCorrector(BaseCorrector):
    """Share corrector that hands the whole share to a single owner.

    ``applyCorrection`` gives share 1 to the owner with the fewest waiting
    jobs in the corrector's group and share 0 to every other owner.
    """

    _GLOBAL_MAX_CORRECTION = "MaxGlobalCorrection"
    _SLICE_TIME_SPAN = "TimeSpan"
    _SLICE_WEIGHT = "Weight"
    _SLICE_MAX_CORRECTION = "MaxCorrection"

    def initialize(self):
        """Create the JobDB handle used by ``applyCorrection``.

        :return: S_OK()
        """
        self.__jobDB = JobDB()

        return S_OK()

    def applyCorrection(self, entitiesExpectedShare):
        """Replace the expected shares by a winner-takes-all distribution.

        :param entitiesExpectedShare: dict keyed by owner DN
        :return: dict with the same keys; the owner with the fewest "Waiting"
            jobs gets 1, all others 0. The input is returned unchanged when
            the job counters cannot be read, and an empty dict is returned
            for empty input.
        """
        # Debug traces kept from the original; written as single-argument
        # print() calls so the output is the same on Python 2 and 3.
        print("AT >>> entitiesExpectedShare %s" % (entitiesExpectedShare,))

        ownerDNs = list(entitiesExpectedShare)

        group = self.getGroup()
        result = self.__jobDB.getCounters("Jobs", ["OwnerDN"], {"OwnerGroup": group, "Status": "Waiting"})
        if not result["OK"]:
            print("AT >>> result %s" % (result,))
            return entitiesExpectedShare

        ownerDict = {}
        for row in result["Value"]:
            ownerDict[row[0]["OwnerDN"]] = row[1]
        print("AT >>> ownerDict %s" % (ownerDict,))

        resultShare = dict.fromkeys(ownerDNs, 0)
        if ownerDNs:
            # Owners without a counter entry used to raise KeyError.
            # NOTE(review): they are counted as having 0 waiting jobs here -
            # confirm this is the intended ranking.
            # min() keeps the first owner on ties, like the original loop.
            minOwnerDN = min(ownerDNs, key=lambda ownerDN: ownerDict.get(ownerDN, 0))
            resultShare[minOwnerDN] = 1

        print("AT >>> resultShare %s" % (resultShare,))

        return resultShare

    def updateHistoryKnowledge(self):
        """No history is kept by this corrector.

        :return: S_OK()
        """
        return S_OK()
Beispiel #23
0
    def initialize(self):
        """Set the agent defaults: polling time, DB handles, the production
        types excluded from cleaning and the batch / throttling settings.

        :return: S_OK()
        """

        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        # str.join replaces string.join(), which no longer exists in Python 3;
        # the resulting text is identical.
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" % (", ".join(self.prod_types))
        )
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()
Beispiel #24
0
  def initialize(self, jobDB=None, logDB=None):
    """ Prepare the optimizer: DB handles, optimizer name and option defaults.

        :param jobDB: JobDB-like object to use; None creates a JobDB
        :param logDB: JobLoggingDB-like object to use; None creates one
        :return: whatever initializeOptimizer() returns
    """
    if jobDB is None:
      jobDB = JobDB()
    self.jobDB = jobDB
    # An unusable job database ends the process.
    if not self.jobDB.isValid():
      dExit(1)

    if logDB is None:
      logDB = JobLoggingDB()
    self.logDB = logDB

    # The optimizer name is the agent name without a trailing 'Agent'.
    suffix = 'Agent'
    agentName = self.am_getModuleParam('agentName')
    if agentName.endswith(suffix):
      agentName = agentName[:-len(suffix)]
    self.am_setModuleParam('optimizerName', agentName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
Beispiel #25
0
  def initialize( self ):
    """ Set the agent defaults: polling time, DB handles, the production types
        excluded from cleaning and the batch / throttling settings.

        The 'ProductionTypes' agent option takes precedence; when it is empty
        the Operations value 'Transformations/DataProcessing' is used.

        :return: S_OK()
    """

    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join replaces string.join(), which no longer exists in Python 3;
    # the resulting text is identical.
    gLogger.info('Will exclude the following Production types from cleaning %s'%(', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',100)
    self.jobByJob = self.am_getOption('JobByJob',True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.)
    return S_OK()
  def initialize(self):
    """ Create the JobDB handle and the monitoring reporter, set the polling
        time and derive the JobDB field names from the summary key fields.

        :return: S_OK()
    """

    self.jobDB = JobDB()

    self.am_setOption("PollingTime", 900)
    self.messageQueue = self.am_getOption('MessageQueue', 'dirac.wmshistory')

    self.monitoringReporter = MonitoringReporter(monitoringType="WMSHistory", failoverQueueName=self.messageQueue)

    # Start from an empty list: appending to whatever list the attribute
    # already referred to made the fields pile up on every initialize() call.
    self.__jobDBFields = []
    for field in self.__summaryKeyFieldsMapping:
      # 'User' / 'UserGroup' are looked up as 'Owner' / 'OwnerGroup'.
      if field == 'User':
        field = 'Owner'
      elif field == 'UserGroup':
        field = 'OwnerGroup'
      self.__jobDBFields.append(field)

    return S_OK()
Beispiel #27
0
 def initialize( self ):
   """ Create the JobDB handle and the monitoring reporter, set the polling
       time to the report period and derive the JobDB field names from the
       summary key fields.

       :return: S_OK()
   """

   self.jobDB = JobDB()

   self.reportPeriod = 120
   self.am_setOption( "PollingTime", self.reportPeriod )

   self.monitoringReporter = MonitoringReporter( monitoringType = "WMSHistory" )

   # Start from an empty list: appending to whatever list the attribute
   # already referred to made the fields pile up on every initialize() call.
   self.__jobDBFields = []
   for field in self.__summaryKeyFieldsMapping:
     # 'User' / 'UserGroup' are looked up as 'Owner' / 'OwnerGroup'.
     if field == 'User':
       field = 'Owner'
     elif field == 'UserGroup':
       field = 'OwnerGroup'
     self.__jobDBFields.append( field )

   return S_OK()
Beispiel #28
0
  def __init__(self, jobDB=None, opsHelper=None):
    """ Set up the configuration section names, the caches and the helpers.

        :param jobDB: JobDB-like object to use; a falsy value creates a JobDB
        :param opsHelper: Operations-like helper; a falsy value creates one
    """
    self.__runningLimitSection = "JobScheduling/RunningLimit"
    self.__matchingDelaySection = "JobScheduling/MatchingDelay"
    self.csDictCache = DictCache()
    self.condCache = DictCache()
    self.delayMem = {}

    self.jobDB = jobDB if jobDB else JobDB()

    self.log = gLogger.getSubLogger("Limiter")

    self.__opsHelper = opsHelper if opsHelper else Operations()
  def initialize( self ):
    """ Create the JobDB handle, derive the JobDB field names and open a
        RabbitMQ channel with the fanout exchange 'mdatabase'.

        The RabbitMQ user, password and host are read from
        /Systems/RabbitMQ/{User,Password,Host}.

        :return: S_OK()
        :raises RuntimeError: if one of the three options cannot be read
    """

    self.jobDB = JobDB()

    self.am_setOption( "PollingTime", 120 )
    # 'User' / 'UserGroup' are looked up as 'Owner' / 'OwnerGroup' in JobDB.
    self.__jobDBFields = [ 'Owner' if field == 'User' else
                           'OwnerGroup' if field == 'UserGroup' else field
                           for field in self.__summaryKeyFieldsMapping ]

    # Options are read in this order; the first failure aborts initialisation.
    wanted = ( ( "/Systems/RabbitMQ/User", 'Failed to get the configuration parameters: User' ),
               ( "/Systems/RabbitMQ/Password", 'Failed to get the configuration parameters: Password' ),
               ( "/Systems/RabbitMQ/Host", 'Failed to get the configuration parameters: Host' ) )
    values = []
    for optionPath, failure in wanted:
      result = gConfig.getOption( optionPath )
      if not result['OK']:
        raise RuntimeError( failure )
      values.append( result['Value'] )
    user, password, self.host = values

    self.credentials = pika.PlainCredentials( user, password )

    parameters = pika.ConnectionParameters( host = self.host, credentials = self.credentials )
    self.connection = pika.BlockingConnection( parameters )
    self.channel = self.connection.channel()

    self.channel.exchange_declare( exchange = 'mdatabase',
                         exchange_type = 'fanout' )

    return S_OK()
Beispiel #30
0
class Matcher:
    """Logic for matching"""
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None,
                 pilotRef=None):
        """Store the collaborators needed for matching.

        Each DB / helper argument may be omitted (or falsy); the default
        object (PilotAgentsDB, JobDB, TaskQueueDB, JobLoggingDB, Operations)
        is then created here.

        :param pilotRef: optional pilot reference; when given, the loggers of
            this object and of the four DB objects are replaced by sub-loggers
            named "[<pilotRef>]Matcher"
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        if pilotRef:
            loggerName = "[%s]Matcher" % pilotRef
            # One getSubLogger call per object, in the original order.
            for owner in (self, self.pilotAgentsDB, self.jobDB, self.tqDB, self.jlDB):
                owner.log = gLogger.getSubLogger(loggerName)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB,
                               opsHelper=self.opsHelper,
                               pilotRef=pilotRef)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Find a job in the task queues that matches the described resource.

        :param resourceDescription: resource parameters sent by the requester
        :param credDict: credentials of the requester
        :return: empty dict when nothing matches; otherwise a dict with JDL,
            JobID, the job's optimizer parameters, DN, Group and
            PilotInfoReportedFlag
        :raises RuntimeError: when the task queue or job DB calls fail, or
            when the matched job is not in "Waiting" state
        """
        started = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Log a trimmed copy: the generated "<n>GB" / "<n>Processors" tags are
        # left out, the raw MaxRAM / NumberOfProcessors values are shown instead.
        printable = dict(resourceDict)
        for rawKey in ("MaxRAM", "NumberOfProcessors"):
            if rawKey in resourceDescription:
                printable[rawKey] = resourceDescription[rawKey]
        printable["Tag"] = [
            tag for tag in resourceDict.get("Tag", [])
            if not tag.endswith(("GB", "Processors"))
        ]
        if not printable["Tag"]:
            printable.pop("Tag")
        self.log.info("Resource description for matching", printDict(printable))

        negativeCond = self.limiter.getNegativeCondForSite(
            resourceDict["Site"], resourceDict.get("GridCE"))
        matchRes = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)
        if not matchRes["OK"]:
            raise RuntimeError(matchRes["Message"])
        match = matchRes["Value"]
        if not match["matchFound"]:
            self.log.info("No match found")
            return {}

        jobID = match["jobId"]
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")
        if resAtt["Value"]["Status"] != "Waiting":
            # The matched job is removed from the task queue before giving up.
            self.log.error("Job matched by the TQ is not in Waiting state", str(jobID))
            deletion = self.tqDB.deleteJob(jobID)
            if not deletion["OK"]:
                raise RuntimeError(deletion["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        jdlRes = self.jobDB.getJobJDL(jobID)
        if not jdlRes["OK"]:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {"JDL": jdlRes["Value"], "JobID": jobID}

        elapsed = time.time() - started
        self.log.verbose("Match time", "[%s]" % str(elapsed))

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt["OK"]:
            resultDict.update(resOpt["Value"])
        resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
        if not resAtt["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not resAtt["Value"]:
            raise RuntimeError("No attributes returned for job")

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        if not resourceDict.get("PilotInfoReportedFlag", False):
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        owner = resAtt["Value"]
        resultDict["DN"] = owner["OwnerDN"]
        resultDict["Group"] = owner["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """Turn a resource description into the dict used for matching.

        The description is normalised, completed by the credential check and
        the pilot version is checked. If ``_checkMask`` rejects the resource,
        the job type is forced to "Test".

        :return: the resulting resource dictionary
        """
        processed = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(processed, credDict)
        self._checkPilotVersion(resourceDict)

        maskAllows = self._checkMask(resourceDict)
        if not maskAllows:
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key, value in resourceDict.items():
            self.log.debug("%s : %s" % (key.rjust(20), value))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        Copies the known single-value and multi-value fields, normalises
        "Tag" / "RequiredTag" into lists and expands "MaxRAM" and
        "NumberOfProcessors" into the tags "2GB".."<n>GB" and
        "2Processors".."<n>Processors".

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
        """

        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            # A string such as '["a", "b"]' is split into its de-duplicated items.
            resourceDict["Tag"] = (tags if isinstance(tags, list) else list(
                {tag.strip("\"' ")
                 for tag in tags.strip("[]").split(",")}))
            # NOTE(review): RequiredTag is only honoured when Tag is present -
            # kept as is; confirm that this nesting is intended.
            if "RequiredTag" in resourceDescription:
                requiredTagsList = (list({
                    tag.strip("\"' ")
                    for tag in resourceDescription["RequiredTag"].strip(
                        "[]").split(",")
                }) if isinstance(resourceDescription["RequiredTag"], str) else
                                    resourceDescription["RequiredTag"])
                resourceDict["RequiredTag"] = requiredTagsList

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                # Convert before dividing: a string value such as "4000" used
                # to fail with an uncaught TypeError on the division.
                maxRAM = int(float(maxRAM) / 1000)
            except (ValueError, TypeError, OverflowError):
                maxRAM = None
        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (ValueError, TypeError):
                nProcessors = None
        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            # Values above 1024 are ignored so the tag list stays bounded.
            if param and param <= 1024:
                paramList = list(range(2, param + 1))
                paramTags = ["%d%s" % (par, key) for par in paramList]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        # De-duplicate after all tags have been added.
        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(
                resourceDict["RequiredTag"]))

        for k in (
                "DIRACVersion",
                "ReleaseVersion",
                "ReleaseProject",
                "VirtualOrganization",
                "PilotReference",
                "PilotBenchmark",
                "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Record the match of a job in the JobDB and in the JobLoggingDB.

        A not-OK result from either database call is only logged; it is
        neither raised nor returned, so the caller carries on regardless.

        :param dict resourceDict: resource description; its "Site" entry is
            stored as the job site
        :param jobID: identifier of the matched job
        """
        names = ["Status", "MinorStatus", "ApplicationStatus", "Site"]
        values = ["Matched", "Assigned", "Unknown", resourceDict["Site"]]

        # Step 1: flag the job as matched in the JobDB
        attrResult = self.jobDB.setJobAttributes(jobID, names, values)
        if attrResult["OK"]:
            self.log.verbose("Set job attributes for jobID", jobID)
        else:
            attrDetail = "setJobAttributes, jobID = %s: %s" % (
                jobID, attrResult["Message"])
            self.log.error("Problem reporting job status", attrDetail)

        # Step 2: add the corresponding entry to the job logging DB
        recordResult = self.jlDB.addLoggingRecord(
            jobID,
            status=JobStatus.MATCHED,
            minorStatus="Assigned",
            source="Matcher",
        )
        if recordResult["OK"]:
            self.log.verbose("Added logging record for jobID", jobID)
        else:
            recordDetail = "addLoggingRecord, jobID = %s: %s" % (
                jobID, recordResult["Message"])
            self.log.error("Problem reporting job status", recordDetail)

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")

        if resourceDict["Site"] not in result["Value"]:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """Report the pilot as running in the pilot agents DB.

        Nothing is done when the resource carries no pilot reference or when
        that reference is "Unknown". A not-OK result from the DB is only
        logged as a warning.

        :param dict resourceDict: resource description; "PilotReference",
            "GridCE", "Site" and "PilotBenchmark" are read from it
        """
        pilotReference = resourceDict.get("PilotReference", "")
        if not pilotReference or pilotReference == "Unknown":
            return

        # Missing entries fall back to "Unknown" / 0.0
        gridCE = resourceDict.get("GridCE", "Unknown")
        site = resourceDict.get("Site", "Unknown")
        benchmark = resourceDict.get("PilotBenchmark", 0.0)

        summary = "for %s: gridCE=%s, site=%s, benchmark=%f" % (
            pilotReference, gridCE, site, benchmark)
        self.log.verbose("Reporting pilot info", summary)

        statusResult = self.pilotAgentsDB.setPilotStatus(
            pilotReference,
            status=PilotStatus.RUNNING,
            gridSite=site,
            destination=gridCE,
            benchmark=benchmark,
        )
        if statusResult["OK"]:
            return

        self.log.warn(
            "Problem updating pilot information",
            "; setPilotStatus. pilotReference: %s; %s" %
            (pilotReference, statusResult["Message"]),
        )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Restrict the resource description according to the credentials.

        Depending on the properties found in the credentials, the "OwnerDN"
        and/or "OwnerGroup" entries of resourceDict are set in place.

        :param dict resourceDict: resource description, modified in place
        :param dict credDict: credentials; "properties" is always read,
            "group" and "DN" are read depending on the branch taken
        :return: the same resourceDict object
        :raises RuntimeError: when a Registry lookup returns a not-OK result
        """
        properties = credDict["properties"]

        if Properties.GENERIC_PILOT in properties:
            # You can only match groups in the same VO
            if credDict["group"] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get("VirtualOrganization", "")
            else:
                vo = Registry.getVOForGroup(credDict["group"])
            # An OwnerGroup already present in the resource is left untouched
            if "OwnerGroup" not in resourceDict:
                groupsResult = Registry.getGroupsForVO(vo)
                if not groupsResult["OK"]:
                    raise RuntimeError(groupsResult["Message"])
                resourceDict["OwnerGroup"] = groupsResult["Value"]

        elif Properties.PILOT in properties:
            # Private pilot: the DN has to be the same
            self.log.notice("Setting the resource DN to the credentials DN")
            resourceDict["OwnerDN"] = credDict["DN"]

        elif Properties.JOB_SHARING in properties:
            # Job sharing: the group has to be the same, and the DN (if any)
            # must belong to that group
            resourceDict["OwnerGroup"] = credDict["group"]
            self.log.notice(
                "Setting the resource group to the credentials group")
            if ("OwnerDN" in resourceDict
                    and resourceDict["OwnerDN"] != credDict["DN"]):
                ownerDN = resourceDict["OwnerDN"]
                dnGroupsResult = Registry.getGroupsForDN(
                    resourceDict["OwnerDN"])
                if not dnGroupsResult["OK"]:
                    raise RuntimeError(dnGroupsResult["Message"])
                if credDict["group"] not in dnGroupsResult["Value"]:
                    # The requested DN is not in the requester's group:
                    # fall back to the DN of the credentials
                    self.log.warn(
                        "You cannot request jobs from this DN, as it does not belong to your group!",
                        "(%s)" % ownerDN,
                    )
                    resourceDict["OwnerDN"] = credDict["DN"]

        else:
            # Nothing special, group and DN have to be the same
            resourceDict["OwnerDN"] = credDict["DN"]
            resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version"""
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if "ReleaseVersion" not in resourceDict:
                if "DIRACVersion" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested and not provided by Pilot")
                else:
                    pilotVersion = resourceDict["DIRACVersion"]
            else:
                pilotVersion = resourceDict["ReleaseVersion"]

            validVersions = [
                convertToPy3VersionNumber(newStyleVersion)
                for newStyleVersion in self.opsHelper.getValue(
                    "Pilot/Version", [])
            ]
            if validVersions and convertToPy3VersionNumber(
                    pilotVersion) not in validVersions:
                raise PilotVersionError(
                    "Pilot version does not match the production version: %s not in ( %s )"
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if "ReleaseProject" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict["ReleaseProject"] != validProject:
                    raise PilotVersionError(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict["ReleaseProject"]))
Beispiel #31
0
    def __init__(self):
        """Start with an empty ``custom_timers`` dict and a new JobDB() handle."""
        self.custom_timers = {}
        self.JobDB = JobDB()
Beispiel #32
0
 def __init__(self):
     """ Standard constructor.

     Takes no arguments: it runs the DIRACJobDB constructor and then adds
     'runNumber' to the jdl2DBParameters list.
     """
     DIRACJobDB.__init__(self)
     # NOTE(review): '+=' extends the existing list object in place; if
     # jdl2DBParameters is shared at class level it would grow on every
     # instantiation - confirm it is set per instance by DIRACJobDB.__init__
     self.jdl2DBParameters += ['runNumber']
Beispiel #33
0
class Limiter(object):
  """ Builds the "negative" matching conditions of sites, i.e. the job attribute
      values that must not be matched.

      Two sources, both read through the Operations helper, are combined:
      the running limits (JobScheduling/RunningLimit/<site>) and the
      matching delays (JobScheduling/MatchingDelay/<site>).
  """

  def __init__(self, jobDB=None, opsHelper=None):
    """ Constructor

        :param jobDB: job DB object; a new JobDB() is created when falsy
        :param opsHelper: operations helper; a new Operations() is created when falsy
    """
    self.__runningLimitSection = "JobScheduling/RunningLimit"
    self.__matchingDelaySection = "JobScheduling/MatchingDelay"
    self.csDictCache = DictCache()
    self.condCache = DictCache()
    # site name -> DictCache holding the (attribute, value) pairs currently delayed
    self.delayMem = {}

    self.jobDB = jobDB if jobDB else JobDB()
    self.log = gLogger.getSubLogger("Limiter")
    self.__opsHelper = opsHelper if opsHelper else Operations()

  def __listSubSections(self, section):
    """ Return the sub-sections of a section, or [] when they cannot be retrieved
    """
    sectionsRes = self.__opsHelper.getSections(section)
    if sectionsRes['OK']:
      return sectionsRes['Value']
    return []

  def getNegativeCond(self):
    """ Get negative condition for ALL sites

        :return: list with one condition dict per limited site, each carrying
                 the site name under the 'Site' key
    """
    cachedCond = self.condCache.get("GLOBAL")
    if cachedCond:
      return cachedCond

    condBySite = {}

    # Run Limit
    for siteName in self.__listSubSections(self.__runningLimitSection):
      runningRes = self.__getRunningCondition(siteName)
      if runningRes['OK'] and runningRes['Value']:
        condBySite[siteName] = runningRes['Value']

    # Delay limit
    for siteName in self.__listSubSections(self.__matchingDelaySection):
      delayRes = self.__getDelayCondition(siteName)
      if not (delayRes['OK'] and delayRes['Value']):
        continue
      if siteName in condBySite:
        condBySite[siteName] = self.__mergeCond(condBySite[siteName], delayRes['Value'])
      else:
        condBySite[siteName] = delayRes['Value']

    orCond = []
    for siteName, siteCond in condBySite.items():
      siteCond['Site'] = siteName
      orCond.append(siteCond)
    # The global condition is kept in the cache with a lifetime argument of 10
    self.condCache.add("GLOBAL", 10, orCond)
    return orCond

  def getNegativeCondForSite(self, siteName):
    """ Generate a negative query based on the limits set on the site

        :param str siteName: name of the site
        :return: dict of attribute name -> list of values not to be matched
    """
    negativeCond = {}

    # Running limits, unless switched off in the configuration
    if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
      runningRes = self.__getRunningCondition(siteName)
      if runningRes['OK']:
        negativeCond = runningRes['Value']
      self.log.verbose('Negative conditions for site %s after checking limits are: %s' % (siteName,
                                                                                          str(negativeCond)))

    # Matching delays, unless switched off in the configuration
    if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      delayRes = self.__getDelayCondition(siteName)
      if delayRes['OK']:
        delayCond = delayRes['Value']
        self.log.verbose('Negative conditions for site %s after delay checking are: %s' % (siteName,
                                                                                           str(delayCond)))
        negativeCond = self.__mergeCond(negativeCond, delayCond)

    if negativeCond:
      self.log.info('Negative conditions for site %s are: %s' % (siteName, str(negativeCond)))

    return negativeCond

  def __mergeCond(self, negCond, addCond):
    """ Merge two negative dicts

        The values of addCond are added to negCond (which is modified in place
        and returned), skipping the values already present.
    """
    for attName, extraValues in addCond.items():
      knownValues = negCond.setdefault(attName, [])
      for candidate in extraValues:
        if candidate not in knownValues:
          knownValues.append(candidate)
    return negCond

  def __extractCSData(self, section):
    """ Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }

        :return: S_OK( dict ) / the failed result of the configuration lookup /
                 S_ERROR when an option cannot be converted to int
    """
    cachedData = self.csDictCache.get(section)
    if cachedData:
      return S_OK(cachedData)

    sectionsRes = self.__opsHelper.getSections(section)
    if not sectionsRes['OK']:
      return sectionsRes

    limitsByAttribute = {}
    for attName in sectionsRes['Value']:
      optionsRes = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
      if not optionsRes['OK']:
        return optionsRes
      rawLimits = optionsRes['Value']
      try:
        limitsByAttribute[attName] = {option: int(rawLimits[option]) for option in rawLimits}
      except Exception as excp:
        errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
        self.log.error(errMsg)
        return S_ERROR(errMsg)

    self.csDictCache.add(section, 300, limitsByAttribute)
    return S_OK(limitsByAttribute)

  def __getRunningCondition(self, siteName):
    """ Get extra conditions allowing site throttling

        :return: S_OK( dict ) with something like {'JobType': ['Merge']},
                 or the failed result of a lookup
    """
    siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
    csRes = self.__extractCSData(siteSection)
    if not csRes['OK']:
      return csRes
    # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    limitsDict = csRes['Value']
    if not limitsDict:
      return S_OK({})

    # Check if the site exceeding the given limits
    negCond = {}
    for attName in limitsDict:
      if attName not in self.jobDB.jobAttributeNames:
        self.log.error("Attribute %s does not exist. Check the job limits" % attName)
        continue
      cacheKey = "Running:%s:%s" % (siteName, attName)
      counters = self.condCache.get(cacheKey)
      if not counters:
        selection = {'Site': siteName, 'Status': ['Running', 'Matched', 'Stalled']}
        countRes = self.jobDB.getCounters('Jobs', [attName], selection)
        if not countRes['OK']:
          return countRes
        # every entry pairs a dict of attributes with the number of jobs
        counters = {entry[0][attName]: entry[1] for entry in countRes['Value']}
        self.condCache.add(cacheKey, 10, counters)
      for attValue, limit in limitsDict[attName].items():
        running = counters.get(attValue, 0)
        if running < limit:
          continue
        self.log.verbose('Job Limit imposed at %s on %s/%s=%d, %d jobs already deployed' % (siteName,
                                                                                           attName,
                                                                                           attValue,
                                                                                           limit,
                                                                                           running))
        negCond.setdefault(attName, []).append(attValue)
    # negCond is something like : {'JobType': ['Merge']}
    return S_OK(negCond)

  def updateDelayCounters(self, siteName, jid):
    """ Register the matching delays triggered by a job for a site

        :param str siteName: name of the site
        :param jid: job identifier
        :return: S_OK() or the failed result of a lookup
    """
    # Get the info from the CS
    siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
    csRes = self.__extractCSData(siteSection)
    if not csRes['OK']:
      return csRes
    # delayDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
    delayDict = csRes['Value']
    if not delayDict:
      return S_OK()

    # Only the attributes known to the JobDB can be looked up
    attNames = []
    for attName in delayDict:
      if attName in self.jobDB.jobAttributeNames:
        attNames.append(attName)
      else:
        self.log.error("Attribute %s does not exist in the JobDB. Please fix it!" % attName)

    attsRes = self.jobDB.getJobAttributes(jid, attNames)
    if not attsRes['OK']:
      self.log.error("While retrieving attributes coming from %s: %s" % (siteSection, attsRes['Message']))
      return attsRes

    # Create the DictCache if not there
    if siteName not in self.delayMem:
      self.delayMem[siteName] = DictCache()
    delayCounter = self.delayMem[siteName]

    # Update the counters
    for attName, attValue in attsRes['Value'].items():
      delays = delayDict[attName]
      if attValue in delays:
        delayTime = delays[attValue]
        self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName, attValue, delayTime))
        delayCounter.add((attName, attValue), delayTime)
    return S_OK()

  def __getDelayCondition(self, siteName):
    """ Get extra conditions allowing matching delay

        :return: S_OK( dict ) of attribute name -> list of values currently delayed
    """
    if siteName not in self.delayMem:
      return S_OK({})
    negCond = {}
    for attName, attValue in self.delayMem[siteName].getKeys():
      negCond.setdefault(attName, []).append(attValue)
    return S_OK(negCond)
Beispiel #34
0
class Matcher(object):
  """ Logic for matching

      Turns the description of a resource plus the credentials of the requester
      into a resource dictionary, asks the task queue DB for a job and reports
      the match in the job, job logging and pilot agents DBs.
  """

  def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ c'tor

        Every argument is optional: when falsy, the corresponding default object
        (PilotAgentsDB, JobDB, TaskQueueDB, JobLoggingDB, Operations) is created.
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter shares the job DB and the operations helper of the matcher
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

  def selectJob(self, resourceDescription, credDict):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: JDL string or dictionary describing the resource
        :param dict credDict: credentials of the requester
        :return: dictionary with 'JDL', 'JobID', the optional job parameters, 'DN', 'Group'
                 and 'PilotInfoReportedFlag'; when tqDB.matchAndGetJob or tqDB.deleteJob
                 fail, their not-OK result is returned instead
        :raises RuntimeError: when no match is found, when the job attributes or the JDL
                              cannot be retrieved, or when the matched job is not 'Waiting'
    """
    startTime = time.time()

    resourceDict = self._getResourceDict(resourceDescription, credDict)

    negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
    result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond)

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info("No match found")
      raise RuntimeError("No match found")

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError("No attributes returned for job")
    if resAtt['Value']['Status'] != 'Waiting':
      # The task queue and the job DB disagree: drop the job from the task queue
      self.log.error('Job matched by the TQ is not in Waiting state', str(jobID))
      result = self.tqDB.deleteJob(jobID)
      if not result['OK']:
        return result
      raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

    self._reportStatus(resourceDict, jobID)

    result = self.jobDB.getJobJDL(jobID)
    if not result['OK']:
      raise RuntimeError("Failed to get the job JDL")

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info("Match time: [%s]" % str(matchTime))
    gMonitor.addMark("matchTime", matchTime)

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters(jobID)
    if resOpt['OK']:
      resultDict.update(resOpt['Value'])
    resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
    if not resAtt['OK']:
      raise RuntimeError('Could not retrieve job attributes')
    if not resAtt['Value']:
      raise RuntimeError('No attributes returned for job')

    if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
      self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

    # The pilot info is reported only if the pilot did not flag it as already done
    pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False)
    if not pilotInfoReportedFlag:
      self._updatePilotInfo(resourceDict)
    self._updatePilotJobMapping(resourceDict, jobID)

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict(self, resourceDescription, credDict):
    """ from resourceDescription to resourceDict (just various mods)

        :return: the resource dictionary, after credentials, pilot version and site mask checks
    """
    resourceDict = self._processResourceDescription(resourceDescription)
    resourceDict = self._checkCredentials(resourceDict, credDict)
    self._checkPilotVersion(resourceDict)
    if not self._checkMask(resourceDict):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose("Resource description:")
    for key in resourceDict:
      self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

    return resourceDict

  def _processResourceDescription(self, resourceDescription):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.

        :param resourceDescription: either a JDL string or a dictionary
        :return: dictionary restricted to the fields used for the matching
        :raises ValueError: when the JDL string cannot be parsed
    """
    resourceDict = {}
    # NOTE(review): basestring is a Python 2 builtin name
    if isinstance(resourceDescription, basestring):
      classAdAgent = ClassAd(resourceDescription)
      if not classAdAgent.isOK():
        raise ValueError('Illegal Resource JDL')
      self.log.verbose(classAdAgent.asJDL())

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute(name):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt(name)
          else:
            resourceDict[name] = classAdAgent.getAttributeString(name)

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute(name):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression(name)
          else:
            resourceDict[name] = classAdAgent.getAttributeString(name)

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute('JobID'):
        resourceDict['JobID'] = classAdAgent.getAttributeInt('JobID')

      for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization'):
        if classAdAgent.lookupAttribute(k):
          resourceDict[k] = classAdAgent.getAttributeString(k)

    else:
      # Dictionary input: membership is tested with 'in' throughout
      # (dict.has_key is deprecated and does not exist in Python 3)
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'):
        if k in resourceDescription:
          resourceDict[k] = resourceDescription[k]

    return resourceDict

  def _reportStatus(self, resourceDict, jobID):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here: not-OK results are only logged
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "setJobAttributes, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Set job attributes for jobID %s" % jobID)

    result = self.jlDB.addLoggingRecord(jobID,
                                        status='Matched',
                                        minor='Assigned',
                                        source='Matcher')
    if not result['OK']:
      self.log.error("Problem reporting job status",
                     "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message']))
    else:
      self.log.verbose("Added logging record for jobID %s" % jobID)

  def _checkMask(self, resourceDict):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :return: True when the site is in the 'Active' site mask, False otherwise
        :raises RuntimeError: when 'Site' is missing or the site mask cannot be retrieved
    """
    if 'Site' not in resourceDict:
      self.log.error("Missing Site Name in Resource JDL")
      raise RuntimeError("Missing Site Name in Resource JDL")

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask(siteState='Active')
    if not result['OK']:
      self.log.error("Internal error", "getSiteMask: %s" % result['Message'])
      raise RuntimeError("Internal error")
    maskList = result['Value']

    return resourceDict['Site'] in maskList

  def _updatePilotInfo(self, resourceDict):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      gridCE = resourceDict.get('GridCE', 'Unknown')
      site = resourceDict.get('Site', 'Unknown')
      benchmark = resourceDict.get('PilotBenchmark', 0.0)
      self.log.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference,
                                                                                          gridCE,
                                                                                          site,
                                                                                          benchmark))

      result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=site,
                                                 destination=gridCE, benchmark=benchmark)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _updatePilotJobMapping(self, resourceDict, jobID):
    """ Update pilot to job mapping information

        Both DB updates are always attempted; not-OK results are only logged
    """
    pilotReference = resourceDict.get('PilotReference', '')
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message']))
      result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False)
      if not result['OK']:
        self.log.error("Problem updating pilot information",
                       "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message']))

  def _checkCredentials(self, resourceDict, credDict):
    """ Check if we can get a job given the passed credentials

        :param dict resourceDict: resource dictionary, modified in place
        :param dict credDict: credentials of the requester
        :return: the same resourceDict object, with 'OwnerDN' and/or 'OwnerGroup' set
        :raises RuntimeError: when a Registry lookup returns a not-OK result
    """
    if Properties.GENERIC_PILOT in credDict['properties']:
      # You can only match groups in the same VO
      if credDict['group'] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get('VirtualOrganization', '')
      else:
        vo = Registry.getVOForGroup(credDict['group'])
      result = Registry.getGroupsForVO(vo)
      if result['OK']:
        resourceDict['OwnerGroup'] = result['Value']
      else:
        raise RuntimeError(result['Message'])
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict['properties']:
        self.log.notice("Setting the resource DN to the credentials DN")
        resourceDict['OwnerDN'] = credDict['DN']
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict['properties']:
        resourceDict['OwnerGroup'] = credDict['group']
        self.log.notice("Setting the resource group to the credentials group")
        if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
          ownerDN = resourceDict['OwnerDN']
          result = Registry.getGroupsForDN(resourceDict['OwnerDN'])
          if not result['OK']:
            raise RuntimeError(result['Message'])
          if credDict['group'] not in result['Value']:
            # DN is not in the same group! bad boy.
            self.log.notice("You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN)
            resourceDict['OwnerDN'] = credDict['DN']
      # Nothing special, group and DN have to be the same
      else:
        resourceDict['OwnerDN'] = credDict['DN']
        resourceDict['OwnerGroup'] = credDict['group']

    return resourceDict

  def _checkPilotVersion(self, resourceDict):
    """ Check the pilot DIRAC version

        Nothing is checked when the "Pilot/CheckVersion" option is false.

        :raises RuntimeError: when the version is not provided or not among the
                              "Pilot/Version" values, or when "Pilot/Project" is set
                              and the received project is missing or different
    """
    if self.opsHelper.getValue("Pilot/CheckVersion", True):
      if 'ReleaseVersion' not in resourceDict:
        if 'DIRACVersion' not in resourceDict:
          raise RuntimeError('Version check requested and not provided by Pilot')
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue("Pilot/Version", [])
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError('Pilot version does not match the production version %s not in ( %s )' %
                           (pilotVersion, ",".join(validVersions)))
      # Check project if requested
      validProject = self.opsHelper.getValue("Pilot/Project", "")
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError("Version check requested but expected project %s not received" % validProject)
        if resourceDict['ReleaseProject'] != validProject:
          raise RuntimeError("Version check requested but expected project %s != received %s" %
                             (validProject, resourceDict['ReleaseProject']))
Beispiel #35
0
class StalledJobAgent(AgentModule):
    """
The specific agents must provide the following methods:
- initialize() for initial settings
- beginExecution()
- execute() - the main method called in the agent cycle
- endExecution()
- finalize() - the graceful exit of the method, this one is usually used
for the agent restart
"""
    def __init__(self, *args, **kwargs):
        """ c'tor

        Only sets default values: the DB handles are instantiated in
        initialize() and the time thresholds are re-read from the agent
        options at every execute() cycle.
        """
        AgentModule.__init__(self, *args, **kwargs)

        # DB handles, created in initialize()
        self.jobDB = None
        self.logDB = None

        # Default thresholds; each one is multiplied by `second` when the
        # selection cut-off time is computed
        self.matchedTime = 7200
        self.rescheduledTime = 600
        self.completedTime = 86400
        self.submittingTime = 300

        # Declared here (with the same defaults used when the options are
        # read) so that the attributes exist on every instance, even before
        # initialize()/execute() have run
        self.stalledJobsTolerantSites = []
        self.stalledJobsToleranceTime = 0

    #############################################################################
    def initialize(self):
        """ Create the DB handles and read the start-up options.

        Sets the 'PollingTime' option, caches the 'StalledJobsTolerantSites'
        option and reports when the agent runs with 'Enable' switched off.

        :return: S_OK()
        """
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption('PollingTime', 60 * 60)
        tolerantSites = self.am_getOption('StalledJobsTolerantSites', [])
        self.stalledJobsTolerantSites = tolerantSites
        enabled = self.am_getOption('Enable', True)
        if not enabled:
            self.log.info('Stalled Job Agent running in disabled mode')
        return S_OK()

    #############################################################################
    def execute(self):
        """ Run one agent cycle.

        Refreshes the thresholds from the agent options, converts the
        stalled/failed thresholds (numbers of watchdog cycles) into times and
        then runs every treatment in turn. A failing treatment is only logged;
        the following ones are still executed.

        :return: S_ERROR if the WorkloadManagement instance is unknown,
                 S_OK otherwise
        """
        self.log.verbose('Waking up Stalled Job Agent')

        wmsInstance = getSystemInstance('WorkloadManagement')
        if not wmsInstance:
            return S_ERROR(
                'Can not get the WorkloadManagement system instance')
        wrapperSection = cfgPath('Systems', 'WorkloadManagement', wmsInstance,
                                 'JobWrapper')

        stalledCycles = self.am_getOption('StalledTimeHours', 2)
        failedCycles = self.am_getOption('FailedTimeHours', 6)
        self.stalledJobsToleranceTime = self.am_getOption(
            'StalledJobsToleranceTime', 0)

        # The current values act as defaults for the refreshed ones
        self.submittingTime = self.am_getOption('SubmittingTime',
                                                self.submittingTime)
        self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime)
        self.rescheduledTime = self.am_getOption('RescheduledTime',
                                                 self.rescheduledTime)
        self.completedTime = self.am_getOption('CompletedTime',
                                               self.completedTime)

        self.log.verbose('StalledTime = %s cycles' % (stalledCycles))
        self.log.verbose('FailedTime = %s cycles' % (failedCycles))

        # The watchdog cycle is never taken shorter than 'MinCheckingTime'
        configuredCycle = gConfig.getValue(
            cfgPath(wrapperSection, 'CheckingTime'), 30 * 60)
        minimalCycle = gConfig.getValue(
            cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)
        watchdogCycle = max(configuredCycle, minimalCycle)

        # Add half cycle to avoid race conditions
        stalledTime = watchdogCycle * (stalledCycles + 0.5)
        failedTime = watchdogCycle * (failedCycles + 0.5)

        # Note, jobs will be revived automatically during the heartbeat signal phase and
        # subsequent status changes will result in jobs not being selected by the
        # stalled job agent.
        treatments = (
            (self.__markStalledJobs, (stalledTime, ),
             'Failed to detect stalled jobs'),
            (self.__failStalledJobs, (failedTime, ),
             'Failed to process stalled jobs'),
            (self.__failCompletedJobs, (),
             'Failed to process completed jobs'),
            (self.__failSubmittingJobs, (),
             'Failed to process jobs being submitted'),
            (self.__kickStuckJobs, (), 'Failed to kick stuck jobs'),
        )
        for treatment, treatmentArgs, errorMessage in treatments:
            outcome = treatment(*treatmentArgs)
            if not outcome['OK']:
                self.log.error(errorMessage, outcome['Message'])

        return S_OK('Stalled Job Agent cycle complete')

    #############################################################################
    def __markStalledJobs(self, stalledTime):
        """ Identifies stalled jobs running without update longer than stalledTime.

        Jobs running at one of the sites listed in
        self.stalledJobsTolerantSites get self.stalledJobsToleranceTime added
        to the limit.

        :param stalledTime: maximum time without update before a job is
                            declared Stalled
        :return: S_OK(), or the failed result of the job selection
        """
        stalledCounter = 0
        runningCounter = 0
        result = self.jobDB.selectJobs({'Status': 'Running'})
        if not result['OK']:
            return result
        if not result['Value']:
            return S_OK()
        jobs = result['Value']
        self.log.info('%s Running jobs will be checked for being stalled' %
                      (len(jobs)))
        jobs.sort()
        # jobs = jobs[:10] #for debugging
        for job in jobs:
            result = self.jobDB.getJobAttribute(job, 'site')
            if not result['OK']:
                # Without the site the right limit is unknown: leave the job
                # for the next cycle instead of aborting the whole cycle on a
                # missing 'Value' key
                self.log.error('Failed to get the site of job',
                               '%s: %s' % (job, result['Message']))
                continue

            delayTime = stalledTime
            if result['Value'] in self.stalledJobsTolerantSites:
                delayTime = stalledTime + self.stalledJobsToleranceTime

            result = self.__getStalledJob(job, delayTime)
            if result['OK']:
                self.log.verbose('Updating status to Stalled for job %s' %
                                 (job))
                self.__updateJobStatus(job, 'Stalled')
                stalledCounter += 1
            else:
                # A non-OK result means "not stalled" (or not checkable)
                self.log.verbose(result['Message'])
                runningCounter += 1

        self.log.info(
            'Total jobs: %s, Stalled job count: %s, Running job count: %s' %
            (len(jobs), stalledCounter, runningCounter))
        return S_OK()

    #############################################################################
    def __failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status

        A Stalled job is failed when its pilot is not "Running" or when its
        latest update is older than failedTime. Failed jobs get a kill
        command and an accounting record. Afterwards, accounting is sent again
        for Failed jobs carrying one of the two stalled minor statuses and
        whose AccountedFlag is still 'False'.

        :param failedTime: maximum time without update for a Stalled job
        :return: S_OK(number of jobs set to Failed), or the failed result of
                 a job selection
        """
        selection = self.jobDB.selectJobs({'Status': 'Stalled'})
        if not selection['OK']:
            return selection
        stalledJobs = selection['Value']

        failedCounter = 0
        minorStalledStatuses = ("Job stalled: pilot not running",
                                'Stalling for more than %d sec' % failedTime)
        pilotLostReason, timeoutReason = minorStalledStatuses

        if stalledJobs:
            self.log.info('%s Stalled jobs will be checked for failure' %
                          (len(stalledJobs)))

        for job in stalledJobs or ():
            # Check if the job pilot is lost
            pilotCheck = self.__getJobPilotStatus(job)
            if not pilotCheck['OK']:
                self.log.error('Failed to get pilot status',
                               pilotCheck['Message'])
                continue

            if pilotCheck['Value'] != "Running":
                failReason = pilotLostReason
            else:
                updateCheck = self.__getLatestUpdateTime(job)
                if not updateCheck['OK']:
                    self.log.error('Failed to get job update time',
                                   updateCheck['Message'])
                    continue
                elapsedTime = toEpoch() - updateCheck['Value']
                failReason = timeoutReason if elapsedTime > failedTime else False

            if not failReason:
                continue

            # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info
            self.__sendKillCommand(job)
            self.__updateJobStatus(job, 'Failed', failReason)
            failedCounter += 1
            accounting = self.__sendAccounting(job)
            if not accounting['OK']:
                self.log.error('Failed to send accounting',
                               accounting['Message'])

        recoverCounter = 0

        for minor in minorStalledStatuses:
            lastResult = self.jobDB.selectJobs({
                'Status': 'Failed',
                'MinorStatus': minor,
                'AccountedFlag': 'False'
            })
            if not lastResult['OK']:
                return lastResult
            unaccounted = lastResult['Value']
            if unaccounted:
                self.log.info('%s Stalled jobs will be Accounted' %
                              (len(unaccounted)))
                for job in unaccounted:
                    lastResult = self.__sendAccounting(job)
                    if lastResult['OK']:
                        recoverCounter += 1
                    else:
                        self.log.error('Failed to send accounting',
                                       lastResult['Message'])
            # lastResult is the outcome of the last accounting attempt (if
            # any): when that one failed the next minor status is not tried
            if not lastResult['OK']:
                break

        if failedCounter:
            self.log.info('%d jobs set to Failed' % failedCounter)
        if recoverCounter:
            self.log.info('%d jobs properly Accounted' % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """ Get the status of the pilot that ran the given job.

        :param int jobID: job ID
        :return: S_OK(status), where status is 'NoPilot' when the job has no
                 'Pilot_Reference' parameter or no pilot is found for it;
                 S_ERROR / failed result otherwise
        """
        paramResult = JobMonitoringClient().getJobParameter(jobID,
                                                            'Pilot_Reference')
        if not paramResult['OK']:
            return paramResult

        pilotReference = paramResult['Value'].get('Pilot_Reference')
        if not pilotReference:
            # There is no pilot reference, hence its status is unknown
            return S_OK('NoPilot')

        pilotResult = WMSAdministratorClient().getPilotInfo(pilotReference)
        if pilotResult['OK']:
            return S_OK(pilotResult['Value'][pilotReference]['Status'])

        errorMessage = pilotResult['Message']
        if "No pilots found" in errorMessage:
            self.log.warn(errorMessage)
            return S_OK('NoPilot')

        self.log.error('Failed to get pilot information',
                       'for job %d: ' % jobID + errorMessage)
        return S_ERROR('Failed to get the pilot status')

    #############################################################################
    def __getStalledJob(self, job, stalledTime):
        """ Tell whether the latest update of a job is older than stalledTime.

        The latest update is the most recent of LastUpdateTime and
        HeartBeatTime, as given by __getLatestUpdateTime().

        :param job: job ID
        :param stalledTime: maximum accepted time since the latest update
        :return: S_OK('Stalled') if the limit is exceeded; S_ERROR when the
                 job is still considered running or its update time cannot
                 be obtained
        """
        updateResult = self.__getLatestUpdateTime(job)
        if not updateResult['OK']:
            return updateResult

        elapsedTime = toEpoch() - updateResult['Value']
        self.log.verbose('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime))

        if not elapsedTime > stalledTime:
            return S_ERROR('Job %s is running and will be ignored' % job)

        self.log.info(
            'Job %s is identified as stalled with last update > %s secs ago'
            % (job, elapsedTime))
        return S_OK('Stalled')

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime

        Both attributes are read from the JobDB and converted with
        toEpoch(fromString(...)); empty values and the string 'None' are
        treated as null.

        :param job: job ID
        :return: S_OK(latest update time) or S_ERROR when the attributes
                 cannot be obtained or are both null
        """
        result = self.jobDB.getJobAttributes(
            job, ['HeartBeatTime', 'LastUpdateTime'])
        if not result['OK']:
            self.log.error('Failed to get job attributes', result['Message'])
        if not (result['OK'] and result['Value']):
            self.log.error('Could not get attributes for job', '%s' % job)
            return S_ERROR('Could not get attributes for job')

        self.log.verbose(result)
        attributes = result['Value']
        latestUpdate = 0

        heartBeatTime = attributes['HeartBeatTime']
        if heartBeatTime and heartBeatTime != 'None':
            latestUpdate = toEpoch(fromString(heartBeatTime))
        else:
            self.log.verbose('HeartBeatTime is null for job %s' % job)

        lastUpdateTime = attributes['LastUpdateTime']
        if lastUpdateTime and lastUpdateTime != 'None':
            lastUpdate = toEpoch(fromString(lastUpdateTime))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate
        else:
            self.log.verbose('LastUpdateTime is null for job %s' % job)

        if latestUpdate:
            self.log.verbose('Latest update time from epoch for job %s is %s' %
                             (job, latestUpdate))
            return S_OK(latestUpdate)

        return S_ERROR(
            'LastUpdate and HeartBeat times are null for job %s' % job)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """ This method updates the job status in the JobDB, this should only be
        used to fail jobs due to the optimizer chain.

        When the 'Enable' option is off nothing is written at all (neither the
        status, nor the minor status, nor the logging record). A failure to
        set the major status is returned to the caller immediately instead of
        being hidden by the result of the following operations.

        :param job: job ID
        :param str status: new major status
        :param str minorstatus: new minor status; when not given the current
                                minor status is kept and used for the logging
                                record
        :return: S_OK('DisabledMode') in disabled mode, the failed result of
                 the status update, or the result of adding the logging record
        """
        self.log.verbose(
            "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" %
            (job, status))

        if not self.am_getOption('Enable', True):
            # Disabled mode: report what would be done, change nothing
            return S_OK('DisabledMode')

        result = self.jobDB.setJobAttribute(job,
                                            'Status',
                                            status,
                                            update=True)
        if not result['OK']:
            return result

        if minorstatus:
            self.log.verbose(
                "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)"
                % (job, minorstatus))
            result = self.jobDB.setJobAttribute(job,
                                                'MinorStatus',
                                                minorstatus,
                                                update=True)
            if not result['OK']:
                # The major status is already changed: report the problem but
                # still record the status change below
                self.log.error('Failed to set MinorStatus',
                               '%s: %s' % (job, result['Message']))
        else:
            # Retain last minor status for stalled jobs
            result = self.jobDB.getJobAttributes(job, ['MinorStatus'])
            if result['OK']:
                minorstatus = result['Value'].get('MinorStatus')

        result = self.logDB.addLoggingRecord(job,
                                             status=status,
                                             minor=minorstatus,
                                             source='StalledJobAgent')
        if not result['OK']:
            self.log.warn(result)

        return result

    def __getProcessingType(self, jobID):
        """ Get the Processing Type from the JDL, until it is promoted to a real Attribute

        :param jobID: job ID
        :return: the 'ProcessingType' attribute of the original job JDL, or
                 'unknown' when the JDL cannot be read or has no such attribute
        """
        jdlResult = self.jobDB.getJobJDL(jobID, original=True)
        if not jdlResult['OK']:
            return 'unknown'

        jobClassAd = ClassAd(jdlResult['Value'])
        if not jobClassAd.lookupAttribute('ProcessingType'):
            return 'unknown'
        return jobClassAd.getAttributeString('ProcessingType')

    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job

        The start/end times come from __checkLoggingInfo(), the CPU and wall
        clock times from __checkHeartBeat(). The report is always sent with
        'Failed' / 'Stalled' as final major / minor status. On a successful
        commit the job 'AccountedFlag' is set to 'True'.

        A missing 'CPUNormalizationFactor' job parameter is not an error: the
        factor is then taken as 0 and the report is still sent.

        :param jobID: job ID
        :return: result of the accounting commit, the failed result of the
                 job attributes query, or S_ERROR("Exception") if collecting
                 the data raised
        """
        try:
            accountingReport = Job()
            # Placeholders used by the exception report below
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            # A heart beat received after the recorded end extends the job
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, 'CPUNormalizationFactor')
            cpuNormalization = 0.0
            if not result['OK']:
                self.log.error(
                    'Error getting Job Parameter CPUNormalizationFactor, setting 0',
                    result['Message'])
            else:
                # A successful result has no 'Message' key and may not
                # contain the parameter at all
                factor = (result['Value'] or {}).get('CPUNormalizationFactor')
                if factor is None:
                    self.log.error(
                        'Error getting Job Parameter CPUNormalizationFactor, setting 0',
                        'parameter not set for job %s' % jobID)
                else:
                    cpuNormalization = float(factor)

        except Exception as e:
            self.log.exception(
                "Exception in __sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e)
            return S_ERROR("Exception")

        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }

        # For accidentally stopped jobs ExecTime can be not set; in any case
        # it is never reported smaller than the CPU time
        if not acData['ExecTime'] or acData['ExecTime'] < acData['CPUTime']:
            acData['ExecTime'] = acData['CPUTime']

        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result

    def __checkHeartBeat(self, jobID, jobDict):
        """ Get info from HeartBeat

        Scans the (name, value, time) heart beat records of the job and keeps
        the largest 'CPUConsumed' and 'WallClockTime' values together with the
        latest heart beat time; the latter starts from the job
        'StartExecTime' (0 when that is the string "None").

        :param jobID: job ID
        :param dict jobDict: job attributes, must contain 'StartExecTime'
        :return: tuple (lastCPUTime, lastWallTime, lastHeartBeatTime)
        """
        heartBeatResult = self.jobDB.getHeartBeatData(jobID)

        # Largest value seen so far for each of the tracked quantities
        maxima = {'CPUConsumed': 0, 'WallClockTime': 0}

        latestHeartBeat = jobDict['StartExecTime']
        if latestHeartBeat == "None":
            latestHeartBeat = 0

        records = heartBeatResult['Value'] if heartBeatResult['OK'] else ()
        for name, rawValue, heartBeatTime in records:
            if name in maxima:
                try:
                    maxima[name] = max(maxima[name], int(float(rawValue)))
                except ValueError:
                    # Not a number: ignore this record value
                    pass
            if heartBeatTime > latestHeartBeat:
                latestHeartBeat = heartBeatTime

        return maxima['CPUConsumed'], maxima['WallClockTime'], latestHeartBeat

    def __checkLoggingInfo(self, jobID, jobDict):
        """ Get info from JobLogging

        The start time is the job 'StartExecTime' or, when that is not set,
        the time of the first 'Running' logging record or, as a last resort,
        the 'SubmissionTime'. The end time is the time of the last 'Stalled'
        logging record, or the current time when there is none. Timestamps
        that cannot be parsed are reported and replaced by the current time.

        :param jobID: job ID
        :param dict jobDict: job attributes ('StartExecTime', 'SubmissionTime')
        :return: tuple (startTime, endTime)
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result['OK']:
            logList = result['Value']

        startTime = jobDict['StartExecTime']
        if not startTime or startTime == 'None':
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == 'Running':
                    startTime = items[3]
                    break
            if not startTime or startTime == 'None':
                startTime = jobDict['SubmissionTime']

        if isinstance(startTime, basestring):
            # Keep the raw text: it is the value to report if parsing fails
            # (the loop variable above may be unset or unrelated here)
            rawStartTime = startTime
            startTime = fromString(rawStartTime)
            if startTime is None:
                self.log.error('Wrong timestamp in DB', rawStartTime)
                startTime = dateTime()

        endTime = dateTime()
        rawEndTime = None
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == 'Stalled':
                rawEndTime = items[3]
                endTime = fromString(rawEndTime)
        if endTime is None:
            # Only the last 'Stalled' record can leave endTime unset
            self.log.error('Wrong timestamp in DB', rawEndTime)
            endTime = dateTime()

        return startTime, endTime

    def __kickStuckJobs(self):
        """ Reschedule jobs stuck in initialization status Rescheduled, Matched

        Jobs in 'Matched' status older than self.matchedTime are treated
        first, then jobs in 'Rescheduled' status older than
        self.rescheduledTime.

        :return: the failed result of a job selection, S_ERROR listing the
                 rescheduling failures, or S_OK()
        """
        # (status, age limit, info message, failure message)
        stuckStates = (
            ('Matched', self.matchedTime,
             'Rescheduling %d jobs stuck in Matched status',
             'Failed to reschedule %d jobs stuck in Matched status'),
            ('Rescheduled', self.rescheduledTime,
             'Rescheduling %d jobs stuck in Rescheduled status',
             'Failed to reschedule %d jobs stuck in Rescheduled status'),
        )

        failures = []
        for status, delay, infoTemplate, failureTemplate in stuckStates:
            checkTime = str(dateTime() - delay * second)
            selection = self.jobDB.selectJobs({'Status': status},
                                              older=checkTime)
            if not selection['OK']:
                self.log.error('Failed to select jobs', selection['Message'])
                return selection

            jobIDs = selection['Value']
            if not jobIDs:
                continue

            self.log.info(infoTemplate % len(jobIDs))
            rescheduling = self.jobDB.rescheduleJobs(jobIDs)
            if 'FailedJobs' in rescheduling:
                failures.append(failureTemplate %
                                len(rescheduling['FailedJobs']))

        if failures:
            return S_ERROR('\n'.join(failures))
        return S_OK()

    def __failCompletedJobs(self):
        """ Failed Jobs stuck in Completed Status for a long time.

        They are due to pilots being killed during the finalization of the
        job execution. Jobs whose minor status is "Pending Requests", or whose
        status is no longer "Completed", are left untouched.

        :return: the failed result of the job selection, S_OK() otherwise
        """
        # Get old Completed Jobs
        checkTime = str(dateTime() - self.completedTime * second)
        selection = self.jobDB.selectJobs({'Status': 'Completed'},
                                          older=checkTime)
        if not selection['OK']:
            self.log.error('Failed to select jobs', selection['Message'])
            return selection

        for jobID in selection['Value'] or []:
            attrResult = self.jobDB.getJobAttributes(jobID,
                                                     ['Status', 'MinorStatus'])
            if not attrResult['OK']:
                self.log.error('Failed to get job attributes',
                               attrResult['Message'])
                continue

            attributes = attrResult['Value']
            # Skip jobs that moved on meanwhile and those with Minor Status
            # "Pending Requests"
            if (attributes['Status'] != "Completed"
                    or attributes['MinorStatus'] == "Pending Requests"):
                continue

            self.__updateJobStatus(jobID, 'Failed',
                                   "Job died during finalization")
            accounting = self.__sendAccounting(jobID)
            if not accounting['OK']:
                self.log.error('Failed to send accounting',
                               accounting['Message'])

        return S_OK()

    def __failSubmittingJobs(self):
        """ Failed Jobs stuck in Submitting Status for a long time.

        They are due to a failed bulk submission transaction.

        :return: the failed result of the job selection, S_OK() otherwise
        """
        # Get old Submitting Jobs
        checkTime = str(dateTime() - self.submittingTime * second)
        selection = self.jobDB.selectJobs({'Status': 'Submitting'},
                                          older=checkTime)
        if not selection['OK']:
            self.log.error('Failed to select jobs', selection['Message'])
            return selection

        for jobID in selection['Value'] or []:
            update = self.__updateJobStatus(jobID, 'Failed')
            if not update['OK']:
                self.log.error('Failed to update job status',
                               update['Message'])

        return S_OK()

    def __sendKillCommand(self, job):
        """Send a kill signal to the job such that it cannot continue running.

        The kill is issued through a WMSClient created with the owner DN and
        group of the job as delegated credentials. Failures are only logged.

        :param int job: ID of job to send kill command
        """
        dnResult = self.jobDB.getJobAttribute(job, 'OwnerDN')
        groupResult = self.jobDB.getJobAttribute(job, 'OwnerGroup')

        if not (dnResult['OK'] and groupResult['OK']):
            self.log.error(
                "Failed to get ownerDN or Group for job:",
                "%s: %s, %s" % (job, dnResult.get(
                    'Message', ''), groupResult.get('Message', '')))
            return

        killResult = WMSClient(useCertificates=True,
                               delegatedDN=dnResult['Value'],
                               delegatedGroup=groupResult['Value']).killJob(job)
        if not killResult['OK']:
            self.log.error("Failed to send kill command to job",
                           "%s: %s" % (job, killResult['Message']))
Beispiel #36
0
class JobCleaningAgent(AgentModule):
    """
      The specific agents must provide the following methods:

         *  initialize() for initial settings
         *  beginExecution()
         *  execute() - the main method called in the agent cycle
         *  endExecution()
         *  finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """
    def __init__(self, *args, **kwargs):
        """ c'tor

        Only declares the attributes; the real values are set in initialize().
        """
        AgentModule.__init__(self, *args, **kwargs)

        # DB clients, instantiated in initialize()
        # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
        self.jobDB = None
        self.taskQueueDB = None
        self.jobLoggingDB = None

        # Removal tuning: size of a batch, one-by-one removal switch and
        # pause between two single removals
        self.maxJobsAtOnce = 100
        self.jobByJob = False
        self.throttlingPeriod = 0.0

        # Job types that must not be cleaned
        self.prodTypes = []

        # Removal delay for each job status
        self.removeStatusDelay = {}

    #############################################################################
    def initialize(self):
        """ Sets defaults

        Creates the DB clients and reads the agent options: the production
        types excluded from cleaning (falling back to the
        'Transformations/DataProcessing' operations option), the removal
        tuning parameters and the removal delay of each status.

        :return: S_OK()
        """
        self.am_setOption("PollingTime", 120)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )

        agentTSTypes = self.am_getOption('ProductionTypes', [])
        self.prodTypes = agentTSTypes if agentTSTypes else Operations(
        ).getValue('Transformations/DataProcessing',
                   ['MCSimulation', 'Merge'])
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" %
            (', '.join(self.prodTypes)))

        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
        self.jobByJob = self.am_getOption('JobByJob', False)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)

        # (status, option name, default delay)
        delayOptions = (
            ('Done', 'RemoveStatusDelay/Done', 7),
            ('Killed', 'RemoveStatusDelay/Killed', 7),
            ('Failed', 'RemoveStatusDelay/Failed', 7),
            ('Any', 'RemoveStatusDelay/Any', -1),
        )
        for status, optionName, defaultDelay in delayOptions:
            self.removeStatusDelay[status] = self.am_getOption(
                optionName, defaultDelay)

        return S_OK()

    def __getAllowedJobTypes(self):
        """ Get valid jobTypes

        :return: S_OK(list of the job types found in the JobDB that are not
                 in self.prodTypes), or the failed JobDB result
        """
        typesResult = self.jobDB.getDistinctJobAttributes('JobType')
        if not typesResult['OK']:
            return typesResult

        cleanJobTypes = [
            jobType for jobType in typesResult['Value']
            if jobType not in self.prodTypes
        ]
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """ Remove jobs in various status

        First removes the jobs in 'Deleted' status, then, for every status in
        self.removeStatusDelay with a non-negative delay, the jobs of the
        cleanable types that are older than that delay (in days).

        :return: a failed result if the 'Deleted' jobs cannot be removed or
                 the job types cannot be obtained, S_OK() otherwise
        """
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({'Status': 'Deleted'})
        if not result['OK']:
            return result

        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result['OK']:
            return result
        allowedJobTypes = result['Value']

        # No jobs in the system subject to removal
        if not allowedJobTypes:
            return S_OK()

        # Remove jobs with final status
        for status, delay in self.removeStatusDelay.items():
            if delay < 0:
                # Negative delay means don't delete anything...
                continue
            condDict = {'JobType': allowedJobTypes}
            if status != 'Any':
                condDict['Status'] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            removal = self.removeJobsByStatus(condDict, delTime)
            if not removal['OK']:
                gLogger.warn('Failed to remove jobs in status %s' % status)

        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """ Remove the jobs matching the given conditions from the WMS DBs.

        At most self.maxJobsAtOnce jobs are treated per call. The jobs are
        first unassigned from their sandboxes and the removal of their
        oversized sandboxes is scheduled; jobs for which that scheduling
        failed are kept. The remaining jobs are removed from the JobDB, the
        TaskQueueDB and the JobLoggingDB, either one by one (self.jobByJob)
        or in bulk.

        :param dict condDict: job selection conditions
        :param delay: if set, passed as the `older` cut-off of the selection
        :return: a failed result if the selection or the sandbox treatment
                 fails, S_OK() otherwise
        """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s day(s)" %
                            (condDict, delay))
            result = self.jobDB.selectJobs(condDict,
                                           older=delay,
                                           limit=self.maxJobsAtOnce)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)

        if not result['OK']:
            return result

        jobList = result['Value']
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[:self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result['OK']:
            gLogger.error("Cannot unassign jobs to sandboxes",
                          result['Message'])
            return result

        result = self.deleteJobOversizedSandbox(jobList)
        if not result['OK']:
            gLogger.error("Cannot schedule removal of oversized sandboxes",
                          result['Message'])
            return result

        # Keep the jobs whose sandbox removal could not be scheduled
        failedJobs = result['Value']['Failed']
        jobList = [job for job in jobList if job not in failedJobs]
        if not jobList:
            # Nothing left to delete in this cycle
            return S_OK()

        # TODO: we should not remove a job if it still has requests in the RequestManager.
        # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

        if self.jobByJob:
            for jobID in jobList:
                removals = (
                    ('JobDB', self.jobDB.removeJobFromDB(jobID)),
                    ('TaskQueueDB', self.taskQueueDB.deleteJob(jobID)),
                    ('JobLoggingDB', self.jobLoggingDB.deleteJob(jobID)),
                )
                errorFlag = False
                for dbName, removal in removals:
                    if not removal['OK']:
                        # Report the message of the operation that failed
                        gLogger.warn(
                            'Failed to remove job %d from %s' %
                            (jobID, dbName), removal['Message'])
                        errorFlag = True
                if errorFlag:
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobDB' %
                              len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobDB' % len(jobList))

            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ['OK']:
                    gLogger.warn(
                        'Failed to remove job %d from TaskQueueDB' % jobID,
                        resultTQ['Message'])
                    error_count += 1
                else:
                    count += 1

            result = self.jobLoggingDB.deleteJob(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobLoggingDB' %
                              len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobLoggingDB' %
                             len(jobList))

        if count > 0 or error_count > 0:
            gLogger.info('Deleted %d jobs from JobDB, %d errors' %
                         (count, error_count))
        return S_OK()

    def deleteJobOversizedSandbox(self, jobIDList):
        """ Delete the job oversized sandbox files from storage elements

        For every job having an 'OutputSandboxLFN' parameter a removal request
        is set for that LFN with the credentials of the job owner.

        :param list jobIDList: job IDs
        :return: S_OK({'Successful': {jobID: lfn or note},
                       'Failed': {jobID: lfn}})
        """
        failed = {}
        successful = {}

        # Collect the oversized sandbox LFNs, keyed by LFN
        lfnDict = {}
        for jobID in jobIDList:
            paramResult = JobMonitoringClient().getJobParameter(
                jobID, 'OutputSandboxLFN')
            if not paramResult['OK']:
                gLogger.error('Error interrogating JobDB: %s' %
                              paramResult['Message'])
                continue
            lfn = paramResult['Value'].get('OutputSandboxLFN')
            if lfn:
                lfnDict[lfn] = jobID
            else:
                successful[jobID] = 'No oversized sandbox found'

        # Schedule removal of the LFNs now
        for lfn, jobID in lfnDict.items():
            ownerResult = self.jobDB.getJobAttributes(
                jobID, ['OwnerDN', 'OwnerGroup'])
            if not ownerResult['OK'] or not ownerResult['Value']:
                failed[jobID] = lfn
                continue

            owner = ownerResult['Value']
            requestResult = self.__setRemovalRequest(lfn, owner['OwnerDN'],
                                                     owner['OwnerGroup'])
            outcome = successful if requestResult['OK'] else failed
            outcome[jobID] = lfn

        return S_OK({'Successful': successful, 'Failed': failed})

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """ Set removal request with the given credentials

        Builds a Request holding a single 'RemoveFile' operation for the
        given LFN and submits it through ReqClient.

        :param str lfn: LFN of the file to remove
        :param str ownerDN: DN set as owner of the request
        :param str ownerGroup: group set as owner of the request
        :return: result of ReqClient().putRequest()
        """
        requestName = os.path.basename(lfn).strip() + '_removal_request.xml'

        removalRequest = Request()
        removalRequest.OwnerDN = ownerDN
        removalRequest.OwnerGroup = ownerGroup
        removalRequest.RequestName = requestName
        removalRequest.SourceComponent = 'JobCleaningAgent'

        removeOperation = Operation()
        removeOperation.Type = 'RemoveFile'

        targetFile = File()
        targetFile.LFN = lfn

        removeOperation.addFile(targetFile)
        removalRequest.addOperation(removeOperation)

        return ReqClient().putRequest(removalRequest)
Beispiel #37
0
class StatesAccountingAgent(AgentModule):
    """
      Agent that, on every cycle, takes a summary snapshot from the JobDB,
      turns each snapshot row into a WMSHistory accounting record and commits
      the records through one DataStoreClient per setup.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    # Key fields of a snapshot row, in the order they follow the setup column
    __summaryKeyFieldsMapping = [
        'Status',
        'Site',
        'User',
        'UserGroup',
        'JobGroup',
        'JobType',
    ]
    # Fields that always get a fixed value in the accounting record
    __summaryDefinedFields = [('ApplicationStatus', 'unset'),
                              ('MinorStatus', 'unset')]
    # Integer value fields that follow the key fields in a snapshot row
    __summaryValueFieldsMapping = [
        'Jobs',
        'Reschedules',
    ]
    # Snapshot field name -> name used in the accounting record
    __renameFieldsMapping = {'JobType': 'JobSplitType'}

    def initialize(self):
        """ Set up the agent state.

        Creates the JobDB handle, resets the retry bookkeeping, sets the
        "PollingTime" option to ``self.reportPeriod`` and derives the list of
        JobDB fields to request ('User' -> 'Owner', 'UserGroup' ->
        'OwnerGroup', the others unchanged).

        :return: S_OK()
        """
        self.dsClients = {}
        self.jobDB = JobDB()
        self.retryOnce = False
        # List of (setup, WMSHistory) pairs that could not be committed in
        # the previous cycle and should be sent once more
        self.retryValues = []

        self.reportPeriod = 850
        self.am_setOption("PollingTime", self.reportPeriod)
        jobDBNames = {'User': 'Owner', 'UserGroup': 'OwnerGroup'}
        self.__jobDBFields = [
            jobDBNames.get(field, field)
            for field in self.__summaryKeyFieldsMapping
        ]
        return S_OK()

    def __newDataStoreClient(self, setup):
        """ Return a fresh DataStoreClient for the given setup. """
        return DataStoreClient(setup=setup, retryGraceTime=900)

    def __getDataStoreClient(self, setup):
        """ Return the cached DataStoreClient of a setup, creating it if needed. """
        if setup not in self.dsClients:
            self.log.info("Creating DataStore client for %s" % setup)
            self.dsClients[setup] = self.__newDataStoreClient(setup)
        return self.dsClients[setup]

    def execute(self):
        """ Main execution method

        Reads the valid setups from /DIRAC/Setups, takes the JobDB summary
        snapshot, registers one WMSHistory record per valid snapshot row
        (plus the records kept for retry from the previous cycle) and commits
        every DataStoreClient.

        :return: the failed result if the setups cannot be read, S_OK()
                 otherwise (also when the snapshot or a commit fails)
        """
        result = gConfig.getSections("/DIRAC/Setups")
        if not result['OK']:
            return result
        validSetups = result['Value']
        self.log.info("Valid setups for this cycle are %s" %
                      ", ".join(validSetups))
        # Get the WMS Snapshot!
        result = self.jobDB.getSummarySnapshot(self.__jobDBFields)
        now = Time.dateTime()
        if not result['OK']:
            self.log.error(
                "Can't get the JobDB summary",
                "%s: won't commit at this cycle" % result['Message'])
            return S_OK()

        # The rows are taken from the second element of the returned value
        values = result['Value'][1]

        if self.retryOnce:
            self.log.verbose(
                "Adding to records to commit those not committed within the previous cycle"
            )
        # (setup, record) pairs registered in this cycle
        acWMSListAdded = []
        nKeyFields = len(self.__summaryKeyFieldsMapping)

        for record in values:
            # First column of each row is the setup the row belongs to
            recordSetup = record[0]
            if recordSetup not in validSetups:
                self.log.error("Setup %s is not valid" % recordSetup)
                continue
            dsClient = self.__getDataStoreClient(recordSetup)

            keyValues = record[1:]
            rD = dict(self.__summaryDefinedFields)
            for iP, fieldName in enumerate(self.__summaryKeyFieldsMapping):
                rD[self.__renameFieldsMapping.get(fieldName,
                                                  fieldName)] = keyValues[iP]
            counters = keyValues[nKeyFields:]
            for iP, fieldName in enumerate(self.__summaryValueFieldsMapping):
                rD[fieldName] = int(counters[iP])

            acWMS = WMSHistory()
            acWMS.setStartTime(now)
            acWMS.setEndTime(now)
            acWMS.setValuesFromDict(rD)
            retVal = acWMS.checkValues()
            if not retVal['OK']:
                self.log.error("Invalid accounting record ",
                               "%s -> %s" % (retVal['Message'], rD))
            else:
                dsClient.addRegister(acWMS)
                acWMSListAdded.append((recordSetup, acWMS))

        if self.retryOnce and self.retryValues:
            # Each retried record goes back to the client of the setup it was
            # created for; relying on the loop variable of the loop above
            # would fail when there were no rows or the last row was invalid,
            # and would otherwise mix records between setups.
            for retrySetup, acWMSCumulated in self.retryValues:
                retVal = acWMSCumulated.checkValues()
                if not retVal['OK']:
                    self.log.error("Invalid accounting record ",
                                   "%s" % (retVal['Message']))
                else:
                    self.__getDataStoreClient(retrySetup).addRegister(
                        acWMSCumulated)

        # NOTE(review): the retry flags below are shared by all setups, so
        # with several setups the final state depends on the iteration order
        # of self.dsClients - confirm whether per-setup retry is wanted.
        for setup in list(self.dsClients):
            self.log.info("Sending records for setup %s" % setup)
            result = self.dsClients[setup].commit()
            if not result['OK']:
                self.log.error(
                    "Couldn't commit wms history for setup %s" % setup,
                    result['Message'])
                # Re-creating the client: for new connection, and for avoiding accumulating too large of a backlog
                self.dsClients[setup] = self.__newDataStoreClient(setup)
                if not self.retryOnce:
                    self.log.info("Will try again at next cycle")
                    self.retryOnce = True
                    self.retryValues = acWMSListAdded
                else:
                    self.log.warn("Won't retry one more time")
                    self.retryOnce = False
                    self.retryValues = []
            else:
                self.log.info("Sent %s records for setup %s" %
                              (result['Value'], setup))
                self.retryOnce = False
        return S_OK()
Beispiel #38
0
class MightyOptimizer(AgentModule):
    """
      Agent that selects the jobs in the 'Received' and 'Checking' states and
      runs them through the chain of optimizer agents in-process, loading
      each optimizer module on first use.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """
    # Job states selected for optimization on each cycle
    __jobStates = ['Received', 'Checking']

    def initialize(self):
        """ Create the DB handles and the optimizer cache; set "PollingTime" to 30.

        :return: S_OK()
        """
        self.jobDB = JobDB()
        self.jobLoggingDB = JobLoggingDB()
        # optimizer name -> loaded and initialized optimizer instance
        self._optimizers = {}
        self.am_setOption("PollingTime", 30)
        return S_OK()

    def execute(self):
        """ The method call by AgentModule on each iteration

        Selects the jobs matching the configured states (optionally
        restricted by the "JobTypeRestriction" option) and calls
        optimizeJob() on each of them until it reports 'done' or fails.

        :return: S_OK(), or the failed result of the job selection /
                 attribute retrieval
        """
        jobTypeCondition = self.am_getOption("JobTypeRestriction", [])
        jobCond = {'Status': self.__jobStates}
        if jobTypeCondition:
            jobCond['JobType'] = jobTypeCondition
        result = self.jobDB.selectJobs(jobCond)
        if not result['OK']:
            return result
        jobsList = result['Value']
        self.log.info("Got %s jobs for this iteration" % len(jobsList))
        if not jobsList:
            return S_OK()
        result = self.jobDB.getAttributesForJobList(jobsList)
        if not result['OK']:
            return result
        jobsToProcess = result['Value']
        for jobId in jobsToProcess:
            self.log.info("== Processing job %s == " % jobId)
            jobAttrs = jobsToProcess[jobId]
            # No job definition yet; optimizeJob() fetches it when needed
            jobDef = False
            jobOptimized = False
            jobOK = True
            while not jobOptimized:
                result = self.optimizeJob(jobId, jobAttrs, jobDef)
                if not result['OK']:
                    # The loop variable is jobId; the former 'jobID' was an
                    # undefined name and raised NameError on this error path
                    self.log.error(
                        "Optimizer %s error" % jobAttrs['MinorStatus'],
                        "Job %s: %s" % (str(jobId), result['Message']))
                    jobOK = False
                    break
                optResult = result['Value']
                jobOptimized = optResult['done']
                if 'jobDef' in optResult:
                    jobDef = optResult['jobDef']
            if jobOK:
                self.log.info("Finished optimizing job %s" % jobId)
        return S_OK()

    def optimizeJob(self, jobId, jobAttrs, jobDef):
        """ The method call for each Job to be optimized

        Runs the next optimizer of the job once.

        :param jobId: identifier of the job
        :param jobAttrs: job attributes dictionary; 'Status' and
                         'MinorStatus' are updated in place when another
                         optimizer has to follow
        :param jobDef: job definition from a previous step, or a false value
                       to have it fetched through the optimizer
        :return: S_OK({'done': bool[, 'jobDef': jobDef]}) or the failed
                 result of the step that went wrong
        """
        # Get the next optimizer
        result = self._getNextOptimizer(jobAttrs)
        if not result['OK']:
            return result
        optimizer = result['Value']
        if not optimizer:
            return S_OK({'done': True})
        # If there's no job def then get it
        if not jobDef:
            result = optimizer.getJobDefinition(jobId, jobDef)
            if not result['OK']:
                optimizer.setFailedJob(jobId, result['Message'])
                return result
            jobDef = result['Value']
        # Does the optimizer require a proxy?
        shifterEnv = False
        shifterProxy = optimizer.am_getModuleParam('shifterProxy')
        if shifterProxy:
            shifterEnv = True
            result = setupShifterProxyInEnv(
                shifterProxy, optimizer.am_getShifterProxyLocation())
            if not result['OK']:
                return result
        try:
            # Call the initCycle function
            result = self.am_secureCall(optimizer.beginExecution,
                                        name="beginExecution")
            if not result['OK']:
                return result
            # Do the work
            result = optimizer.optimizeJob(jobId, jobDef['classad'])
            if not result['OK']:
                return result
            nextOptimizer = result['Value']
        finally:
            # If there was a shifter proxy, unset it - also on the failure
            # paths, so that it does not stay in the environment for the
            # optimizers that run afterwards
            if shifterEnv:
                os.environ.pop('X509_USER_PROXY', None)
        # Check if the JDL has changed
        newJDL = jobDef['classad'].asJDL()
        if newJDL != jobDef['jdl']:
            jobDef['jdl'] = newJDL
        # If there's a new optimizer set it!
        if nextOptimizer:
            jobAttrs['Status'] = 'Checking'
            jobAttrs['MinorStatus'] = nextOptimizer
            return S_OK({'done': False, 'jobDef': jobDef})
        return S_OK({'done': True, 'jobDef': jobDef})

    def _getNextOptimizer(self, jobAttrs):
        """ Determine next Optimizer in the Path

        'Received' jobs start with "JobPath"; otherwise the optimizer name is
        the job's 'MinorStatus'. Optimizers are loaded on first use and
        cached in self._optimizers.

        :param jobAttrs: job attributes dictionary ('Status', 'MinorStatus'
                         and 'JobID' are read)
        :return: S_OK(optimizer instance), S_OK(False) when the optimizer is
                 filtered out, or the failed result of the loading
        """
        if jobAttrs['Status'] == 'Received':
            nextOptimizer = "JobPath"
        else:
            nextOptimizer = jobAttrs['MinorStatus']
        # NOTE(review): the default is a plain string, so if the option comes
        # back as a string this is a substring test - confirm this is intended
        if nextOptimizer in self.am_getOption("FilteredOptimizers",
                                              "InputData, BKInputData"):
            return S_OK(False)
        gLogger.info("Next optimizer for job %s is %s" %
                     (jobAttrs['JobID'], nextOptimizer))
        if nextOptimizer not in self._optimizers:
            result = self.__loadOptimizer(nextOptimizer)
            if not result['OK']:
                return result
            self._optimizers[nextOptimizer] = result['Value']
        return S_OK(self._optimizers[nextOptimizer])

    @gOptimizerLoadSync
    def __loadOptimizer(self, optimizerName):
        """Need to load an optimizer

        Imports DIRAC.WorkloadManagementSystem.Agent.<optimizerName>Agent,
        instantiates the class of the same name and initializes it with this
        agent's JobDB and JobLoggingDB handles.

        :param optimizerName: optimizer name without the "Agent" suffix
        :return: S_OK(optimizer instance) or S_ERROR if the import,
                 instantiation or initialization fails
        """
        gLogger.info("Loading optimizer %s" % optimizerName)
        try:
            agentName = "%sAgent" % optimizerName
            # A non-empty fromlist makes __import__ return the leaf module
            optimizerModule = __import__(
                'DIRAC.WorkloadManagementSystem.Agent.%s' % agentName,
                globals(), locals(), [agentName])
            optimizerClass = getattr(optimizerModule, agentName)
            optimizer = optimizerClass("WorkloadManagement/%s" % agentName,
                                       self.am_getModuleParam('fullName'))
            result = optimizer.am_initialize(self.jobDB, self.jobLoggingDB)
            if not result['OK']:
                return S_ERROR("Can't initialize optimizer %s: %s" %
                               (optimizerName, result['Message']))
        except Exception as e:
            # Deliberately broad: any failure while loading is logged with
            # its traceback and reported to the caller as S_ERROR
            gLogger.exception("LOADERROR")
            return S_ERROR("Can't load optimizer %s: %s" %
                           (optimizerName, str(e)))
        return S_OK(optimizer)