Example #1
0
class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ Set up the collaborators used for matching.

        Every argument is optional: a falsy value (the default is None) is
        replaced by a newly constructed PilotAgentsDB, JobDB, TaskQueueDB,
        JobLoggingDB or Operations object respectively.
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    # The limiter works on the job DB and operations helper selected above
    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Select the job to hand to the requesting resource.

        The resource description is turned into a resource dictionary, the task
        queue DB is asked for a match, the matched job is checked to be in
        'Waiting' status, its status is reported as matched and the reply
        dictionary is assembled.

        :param resourceDescription: resource description passed on to _getResourceDict
        :param credDict: credentials dictionary passed on to _getResourceDict
        :return: dictionary holding 'JDL', 'JobID', 'DN', 'Group', 'PilotInfoReportedFlag'
                 and the job optimizer parameters when available; the failed result
                 structure when tqDB.matchAndGetJob or tqDB.deleteJob reports an error
        :raise RuntimeError: no match found, job attributes or JDL not retrievable,
                             or matched job not in 'Waiting' status
    """
    began = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    siteNegativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    matchResult = self.tqDB.matchAndGetJob( resourceDict, negativeCond = siteNegativeCond )
    if not matchResult['OK']:
      return matchResult
    matchInfo = matchResult['Value']
    if not matchInfo['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = matchInfo['jobId']
    attributes = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not attributes['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not attributes['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if attributes['Value']['Status'] != 'Waiting':
      # The task queue is out of sync with the job DB: drop the job from the TQ
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      deletion = self.tqDB.deleteJob( jobID )
      if not deletion[ 'OK' ]:
        return deletion
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    jdlResult = self.jobDB.getJobJDL( jobID )
    if not jdlResult['OK']:
      raise RuntimeError( "Failed to get the job JDL" )
    resultDict = { 'JDL': jdlResult['Value'], 'JobID': jobID }

    elapsed = time.time() - began
    self.log.info( "Match time: [%s]" % str( elapsed ) )
    gMonitor.addMark( "matchTime", elapsed )

    # Optimizer parameters, when available, are merged into the reply
    optParams = self.jobDB.getJobOptParameters( jobID )
    if optParams['OK']:
      resultDict.update( optParams['Value'] )
    owner = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not owner['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not owner['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    # Pilot info is only pushed when the pilot has not flagged it as already reported
    if not resourceDict.get( 'PilotInfoReportedFlag', False ):
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = owner['Value']['OwnerDN']
    resultDict['Group'] = owner['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ Build the resource dictionary used for matching.

        The description is normalised, constrained by the credentials and checked
        against the pilot version settings. When the site mask check fails the
        resource is restricted to jobs of type 'Test'. The final content is
        logged at verbose level.

        :param resourceDescription: input passed to _processResourceDescription
        :param credDict: credentials dictionary passed to _checkCredentials
        :return: the resource dictionary
    """
    processed = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( processed, credDict )
    self._checkPilotVersion( resourceDict )

    siteAllowed = self._checkMask( resourceDict )
    if not siteAllowed:
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key, value in resourceDict.items():
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), value ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
        It is either a JDL string, which is parsed with ClassAd, or a mapping.
        Only the fields named in singleValueDefFields and multiValueMatchFields,
        'JobID' and a fixed set of version / pilot keys are copied to the result.

        :param resourceDescription: JDL string or mapping describing the resource
        :return: dictionary with the retained fields
        :raise ValueError: when a JDL string is not accepted by ClassAd
    """
    # 'basestring' only exists in Python 2; fall back to 'str' so that the
    # string / mapping dispatch keeps working on other interpreters
    try:
      stringTypes = basestring
    except NameError:
      stringTypes = str

    resourceDict = {}
    if isinstance( resourceDescription, stringTypes ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          # CPUTime is the only single-value field read as an integer
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          # SubmitPool is the only multi-value field read as a list
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      # Mapping input: membership is tested with 'in' (dict.has_key is
      # deprecated and absent from Python 3 and from generic mappings)
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      # Check if a JobID is requested
      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Record the matched job as 'Matched' / 'Assigned' in the job DB and in the job logging DB.

        Failures are only logged: this method never fails because of them.

        :param resourceDict: resource dictionary, its 'Site' is stored with the job
        :param jobID: ID of the matched job
    """
    setResult = self.jobDB.setJobAttributes( jobID,
                                             ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'],
                                             ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] )
    if setResult['OK']:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )
    else:
      self.log.error( "Problem reporting job status",
                      "setJobAttributes, jobID = %s: %s" % ( jobID, setResult['Message'] ) )

    recordResult = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' )
    if recordResult['OK']:
      self.log.verbose( "Added logging record for jobID %s" % jobID )
    else:
      self.log.error( "Problem reporting job status",
                      "addLoggingRecord, jobID = %s: %s" % ( jobID, recordResult['Message'] ) )


  def _checkMask( self, resourceDict ):
    """ Tell whether the resource site is part of the active site mask,
        i.e. whether it is allowed to run normal jobs.

        FIXME: should we move to site OR SE?

        :param resourceDict: resource dictionary, must contain 'Site'
        :return: True when the site is in the active mask, False otherwise
        :raise RuntimeError: when 'Site' is missing or the site mask cannot be obtained
    """
    if 'Site' not in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    maskResult = self.jobDB.getSiteMask( siteState = 'Active' )
    if not maskResult['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % maskResult['Message'] )
      raise RuntimeError( "Internal error" )

    return resourceDict['Site'] in maskResult['Value']

  def _updatePilotInfo( self, resourceDict ):
    """ Set the pilot named by 'PilotReference' to 'Running' in the pilot agents DB,
        together with its site, CE and benchmark.

        Nothing is done when the resource carries no pilot reference; a failing
        update is only logged.

        :param resourceDict: resource dictionary
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if not pilotReference:
      return

    gridCE = resourceDict.get( 'GridCE', 'Unknown' )
    site = resourceDict.get( 'Site', 'Unknown' )
    benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
    self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference,
                                                                                            gridCE,
                                                                                            site,
                                                                                            benchmark ) )

    statusResult = self.pilotAgentsDB.setPilotStatus( pilotReference,
                                                      status = 'Running',
                                                      gridSite = site,
                                                      destination = gridCE,
                                                      benchmark = benchmark )
    if not statusResult['OK']:
      self.log.error( "Problem updating pilot information",
                      "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, statusResult['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Store in the pilot agents DB which job the pilot is now running.

        Nothing is done when the resource carries no 'PilotReference'; failing
        updates are only logged.

        :param resourceDict: resource dictionary
        :param jobID: ID of the matched job
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if not pilotReference:
      return

    currentResult = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
    if not currentResult['OK']:
      self.log.error( "Problem updating pilot information",
                      ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, currentResult['Message'] ) )

    mappingResult = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
    if not mappingResult['OK']:
      self.log.error( "Problem updating pilot information",
                      "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, mappingResult['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Restrict the resource dictionary according to the requester credentials.

        Depending on the properties found in credDict, 'OwnerGroup' and / or
        'OwnerDN' of the resource dictionary are set so that only jobs the
        requester is entitled to can be matched.

        :param resourceDict: resource dictionary, modified in place
        :param credDict: credentials dictionary with 'properties', 'group' and 'DN'
        :return: the resource dictionary
        :raise RuntimeError: when a Registry lookup fails
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      groupsResult = Registry.getGroupsForVO( vo )
      if not groupsResult[ 'OK' ]:
        raise RuntimeError( groupsResult['Message'] )
      resourceDict[ 'OwnerGroup' ] = groupsResult[ 'Value' ]

    elif Properties.PILOT in credDict[ 'properties' ]:
      # If it's a private pilot, the DN has to be the same
      self.log.notice( "Setting the resource DN to the credentials DN" )
      resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]

    elif Properties.JOB_SHARING in credDict[ 'properties' ]:
      # Job sharing: the group has to be the same, and a requested DN (if any)
      # has to belong to that group
      resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
      self.log.notice( "Setting the resource group to the credentials group" )
      if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
        ownerDN = resourceDict[ 'OwnerDN' ]
        dnGroups = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
        if not dnGroups[ 'OK' ]:
          raise RuntimeError( dnGroups['Message'] )
        if credDict[ 'group' ] not in dnGroups[ 'Value' ]:
          # The requested DN is outside the requester's group: fall back to the requester DN
          self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
          resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]

    else:
      # Nothing special, group and DN have to be the same
      resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Validate the version, and optionally the project, announced by the pilot.

        The check is only done when the "Pilot/CheckVersion" option is true.
        'ReleaseVersion' takes precedence over 'DIRACVersion'.

        :param resourceDict: resource dictionary
        :raise RuntimeError: when no version is provided, the version is not among
                             the "Pilot/Version" values, or the project differs from
                             "Pilot/Project"
    """
    if not self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      return

    if 'ReleaseVersion' in resourceDict:
      pilotVersion = resourceDict['ReleaseVersion']
    elif 'DIRACVersion' in resourceDict:
      pilotVersion = resourceDict['DIRACVersion']
    else:
      raise RuntimeError( 'Version check requested and not provided by Pilot' )

    # An empty list of valid versions accepts any version
    validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
    if validVersions and pilotVersion not in validVersions:
      raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                          ( pilotVersion, ",".join( validVersions ) ) )

    # Check project if requested
    validProject = self.opsHelper.getValue( "Pilot/Project", "" )
    if not validProject:
      return
    if 'ReleaseProject' not in resourceDict:
      raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
    receivedProject = resourceDict[ 'ReleaseProject' ]
    if receivedProject != validProject:
      raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                               receivedProject ) )
Example #2
0
class PilotStatusAgent(AgentModule):
    """
    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and declare the helper attributes.

        The helpers start as None; initialize() assigns the real objects.
        """
        super().__init__(*args, **kwargs)

        # Placeholders, replaced in initialize()
        self.jobDB = self.pilotDB = self.diracadmin = None

    #############################################################################
    def initialize(self):
        """Set the default options and create the DB and client helpers.

        :return: S_OK()
        """
        self.am_setOption("GridEnv", "")

        # Helper objects used by the other methods of the agent
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()

        # Delays handed to clearPilots() in execute()
        clearDelay = self.am_getOption("ClearPilotsDelay", 30)
        self.clearPilotsDelay = clearDelay
        abortedDelay = self.am_getOption("ClearAbortedPilotsDelay", 7)
        self.clearAbortedDelay = abortedDelay

        self.pilots = PilotManagerClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Reads the cycle options, declares pilots not updated for
        ``PilotStalledDays`` days as deleted (see handleOldPilots) and asks the
        pilot manager client to clear old pilots.

        :return: S_OK(), or the failed result when no DB connection can be obtained
        """

        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue(
                    "/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue(
                        "/Systems/WorkloadManagement/%s/GridEnv" % instance,
                        "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        # Now handle pilots not updated in the last N days and declare them Deleted.
        # The connection is released even when the handling raises.
        try:
            result = self.handleOldPilots(connection)
            if not result["OK"]:
                # Best effort: report the problem and carry on with the cycle
                self.log.warn("Failed to handle old pilots", result["Message"])
        finally:
            connection.close()

        result = self.pilots.clearPilots(self.clearPilotsDelay,
                                         self.clearAbortedDelay)
        if not result["OK"]:
            self.log.warn("Failed to clear old pilots in the PilotAgentsDB")

        return S_OK()

    def handleOldPilots(self, connection):
        """
        Select the pilots in a transient state whose LastUpdateTime is older
        than ``pilotStalledDays`` days, mark them as Deleted, account for them
        and kill them. Pilots with a recently updated job are left untouched.

        :param connection: DB connection handed to accountPilots
        :return: S_OK(), or the failed result of the pilot DB queries
        """
        staleBefore = TimeUtilities.toString(
            datetime.datetime.utcnow() -
            TimeUtilities.day * self.pilotStalledDays)
        selection = self.pilotDB.selectPilots(
            {"Status": PilotStatus.PILOT_TRANSIENT_STATES},
            older=staleBefore,
            timeStamp="LastUpdateTime")
        if not selection["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return selection
        if not selection["Value"]:
            return S_OK()

        infoResult = self.pilotDB.getPilotInfo(selection["Value"])
        if not infoResult["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return infoResult

        batch = {}
        for pRef, pilotInfo in infoResult["Value"].items():
            jobs = pilotInfo.get("Jobs")
            if jobs and self._checkJobLastUpdateTime(jobs,
                                                     self.pilotStalledDays):
                self.log.debug(
                    "%s should not be deleted since one job of %s is running."
                    % (str(pRef), str(jobs)))
                continue

            pilotInfo["Status"] = PilotStatus.DELETED
            pilotInfo["StatusDate"] = datetime.datetime.utcnow()
            batch[pRef] = pilotInfo

            # Flush as soon as more than 100 pilots are pending
            if len(batch) > 100:
                self.accountPilots(batch, connection)
                self._killPilots(batch)
                batch = {}

        # Remaining pilots of the last, possibly empty, batch
        self.accountPilots(batch, connection)
        self._killPilots(batch)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """Send accounting records for the given pilots and store their new status.

        When the "PilotAccountingEnabled" option is "yes" (case-insensitive),
        accounting records are built from the DB information of the pilots and
        committed; the status is only written to the pilot DB when the commit
        succeeded. When accounting is disabled the status is written directly.

        :param pilotsToAccount: dictionary pilot reference -> pilot dictionary with
                                at least "Status", "DestinationSite" and "StatusDate"
        :param connection: DB connection passed to the pilot DB calls
        :return: S_OK(), or the failed result of getPilotInfo /
                 _addPilotsAccountingReport
        """
        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        accountingFlag = pae.lower() == "yes"

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount),
                                               conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots",
                               retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref, dbInfo in dbData.items():
                if pref not in pilotsToAccount:
                    continue
                # Pilots already in a final state in the DB keep their DB values
                if dbInfo["Status"] in PilotStatus.PILOT_FINAL_STATES:
                    continue
                requested = pilotsToAccount[pref]
                dbInfo["Status"] = requested["Status"]
                dbInfo["DestinationSite"] = requested["DestinationSite"]
                dbInfo["LastUpdateTime"] = requested["StatusDate"]

            retVal = self._addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                self.log.error("Fail to add accounting records for pilots",
                               retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports",
                               retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" %
                              len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef, pDict in pilotsToAccount.items():
                self.log.verbose("Setting Status for %s to %s" %
                                 (pRef, pDict["Status"]))
                retVal = self.pilotDB.setPilotStatus(pRef,
                                                     pDict["Status"],
                                                     pDict["DestinationSite"],
                                                     pDict["StatusDate"],
                                                     conn=connection)
                if not retVal["OK"]:
                    # Keep going with the other pilots, but leave a trace
                    self.log.error("Failed to set pilot status",
                                   "%s : %s" % (pRef, retVal["Message"]))

        return S_OK()

    def _addPilotsAccountingReport(self, pilotsData):
        """Build a PilotAccounting record for every pilot and register it
        with gDataStoreClient.

        :param pilotsData: dictionary pilot reference -> pilot information
        :return: S_OK(), or the first failed result of gDataStoreClient.addRegister
        """
        for pData in pilotsData.values():
            record = PilotAccounting()
            record.setEndTime(pData["LastUpdateTime"])
            record.setStartTime(pData["SubmissionTime"])

            ownerDN = pData["OwnerDN"]
            userLookup = Registry.getUsernameForDN(ownerDN)
            if userLookup["OK"]:
                userName = userLookup["Value"]
            else:
                # Placeholder user name when the DN cannot be resolved
                userName = "******"
                self.log.error(
                    "Can't determine username for dn",
                    ": %s : %s" % (ownerDN, userLookup["Message"]),
                )
            record.setValueByKey("User", userName)
            record.setValueByKey("UserGroup", pData["OwnerGroup"])

            gridCE = pData["DestinationSite"]
            mapping = getCESiteMapping(gridCE)
            if mapping["OK"] and gridCE in mapping["Value"]:
                siteName = mapping["Value"][gridCE].strip()
            else:
                siteName = "Unknown"
            record.setValueByKey("Site", siteName)
            record.setValueByKey("GridCE", gridCE)
            record.setValueByKey("GridMiddleware", pData["GridType"])
            record.setValueByKey("GridResourceBroker", pData["Broker"])
            record.setValueByKey("GridStatus", pData["Status"])

            jobCount = len(pData["Jobs"]) if "Jobs" in pData else 0
            record.setValueByKey("Jobs", jobCount)

            self.log.verbose("Added accounting record for pilot %s" %
                             pData["PilotID"])
            addResult = gDataStoreClient.addRegister(record)
            if not addResult["OK"]:
                return addResult
        return S_OK()

    def _killPilots(self, acc):
        """Kill, through DiracAdmin, every pilot whose reference is a key of ``acc``.

        Pilots are handled in sorted reference order. A pilot is only killed when
        its info, including a "Status", can be retrieved; problems are logged.

        :param acc: dictionary keyed by pilot reference
        """
        for pilotRef in sorted(acc):
            infoResult = self.diracadmin.getPilotInfo(pilotRef)
            known = (infoResult["OK"] and pilotRef in infoResult["Value"]
                     and "Status" in infoResult["Value"][pilotRef])
            if not known:
                self.log.error("Failed to get pilot info",
                               "%s : %s" % (pilotRef, str(infoResult)))
                continue

            killResult = self.diracadmin.killPilot(str(pilotRef))
            if killResult["OK"]:
                self.log.info(
                    "Successfully deleted", ": %s (Status : %s)" %
                    (pilotRef, infoResult["Value"][pilotRef]["Status"]))
            else:
                self.log.error("Failed to delete pilot: ",
                               "%s : %s" % (pilotRef, killResult["Message"]))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """Tell whether at least one of the jobs was updated during the last
        ``StalledDays`` days.

        Jobs whose attributes cannot be read are logged and skipped.

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as a recently updated job is found, False otherwise
        """
        cutoff = datetime.datetime.utcnow() - TimeUtilities.day * StalledDays
        for jobID in joblist:
            attrResult = self.jobDB.getJobAttributes(int(jobID))
            if not attrResult["OK"]:
                self.log.error("Error taking job info from DB",
                               attrResult["Message"])
                continue
            if "LastUpdateTime" not in attrResult["Value"]:
                continue
            lastUpdateTime = attrResult["Value"]["LastUpdateTime"]
            if TimeUtilities.fromString(lastUpdateTime) > cutoff:
                self.log.debug(
                    "Since %s updates LastUpdateTime on %s this does not to need to be deleted."
                    % (str(jobID), str(lastUpdateTime)))
                return True
        return False
Example #3
0
class PilotStatusAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    queryStateList = ["Ready", "Submitted", "Running", "Waiting", "Scheduled"]
    finalStateList = ["Done", "Aborted", "Cleared", "Deleted", "Failed"]
    identityFieldsList = ["OwnerDN", "OwnerGroup", "GridType", "Broker"]
    eligibleGridTypes = ["gLite"]

    #############################################################################
    def initialize(self):
        """Set the default agent options and create the pilot DB helper.

        :return: S_OK()
        """
        defaults = (
            ("PollingTime", 120),
            ("GridEnv", ""),
            ("PilotStalledDays", 3),
        )
        for optionName, optionValue in defaults:
            self.am_setOption(optionName, optionValue)

        self.pilotDB = PilotAgentsDB()
        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Reads the cycle options, opens a pilot DB connection and runs the status
        update with it. The connection is closed on every exit path, including
        early returns and exceptions.

        :return: S_OK(), or the failed result of the first failing DB query
        """

        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        try:
            return self._executeWithConnection(connection)
        finally:
            connection.close()

    def _executeWithConnection(self, connection):
        """Update the status of the pilots of every eligible identity group.

        For each (ownerDN, ownerGroup, gridType, broker) group the pilot status is
        queried in chunks of MAX_JOBS_QUERY references; pilots in a final state
        are accounted, the others get their status updated in the pilot DB.
        Finally the pilots not updated for a long time are handled.

        :param connection: open pilot DB connection; it is not closed here
        :return: S_OK(), or the failed result of getPilotGroups / selectPilots
        """
        result = self.pilotDB.getPilotGroups(self.identityFieldsList, {"Status": self.queryStateList})
        if not result["OK"]:
            self.log.error("Fail to get identities Groups", result["Message"])
            return result
        if not result["Value"]:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result["Value"]:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose("Getting pilots for %s:%s @ %s %s" % (ownerDN, ownerGroup, gridType, broker))

            # Pilots reported as Done by the JobAgent
            condDict1 = {
                "Status": "Done",
                "StatusReason": "Report from JobAgent",
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            # Pilots in one of the states that are still queried
            condDict2 = {
                "Status": self.queryStateList,
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to clear Waiting Pilot Jobs")

                result = self.pilotDB.selectPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to get the Pilot Agents")
                    return result
                if not result["Value"]:
                    continue
                refList = result["Value"]

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret["OK"]:
                    self.log.error(ret["Message"])
                    self.log.error("Could not get proxy:", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret["Value"]

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" % (len(refList), ownerDN, ownerGroup)
                )

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index : start_index + MAX_JOBS_QUERY]
                    self.log.verbose(
                        "Querying %d pilots of %s starting at %d" % (len(refsToQuery), len(refList), start_index)
                    )
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result["OK"]:
                        if result["Message"] == "Broker not Available":
                            # No point in querying the remaining chunks of this broker
                            self.log.error("Broker %s not Available" % broker)
                            break
                        self.log.warn("Failed to get pilot status:")
                        self.log.warn("%s:%s @ %s" % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result["Value"]
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict["isParent"]:
                                self.log.verbose("Clear parametric parent %s" % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result["OK"]:
                                    self.log.warn(result["Message"])
                                else:
                                    self.log.info("Parametric parent removed: %s" % pRef)
                            if pDict["FinalStatus"]:
                                # Final states are stored later, by accountPilots
                                self.log.verbose("Marking Status for %s to %s" % (pRef, pDict["Status"]))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                                result = self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict["Status"],
                                    pDict["DestinationSite"],
                                    updateTime=pDict["StatusDate"],
                                    conn=connection,
                                )

                    # Flush as soon as more than 100 pilots are pending
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)
        if not result["OK"]:
            # Best effort: report the problem, the cycle itself is still successful
            self.log.warn("Failed to handle old pilots", result["Message"])

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """Set the pilots stuck in the Waiting state to Stalled.

        The pilots are selected with the identity fields of ``condDict`` and a
        time limit of MAX_WAITING_STATE_LENGTH hours before now.

        :param condDict: dictionary with "OwnerDN", "OwnerGroup", "GridType" and "Broker"
        :return: S_OK(), or the failed result of the pilot selection
        """

        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            "Status": "Waiting",
            "OwnerDN": condDict["OwnerDN"],
            "OwnerGroup": condDict["OwnerGroup"],
            "GridType": condDict["GridType"],
            "Broker": condDict["Broker"],
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result["OK"]:
            self.log.warn("Failed to get the Pilot Agents for Waiting state")
            return result
        if not result["Value"]:
            return S_OK()
        refList = result["Value"]

        for pilotRef in refList:
            # The message names the status that is really set below
            self.log.info("Setting Waiting pilot to Stalled: %s" % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, "Stalled", statusReason="Exceeded max waiting time")
            if not result["OK"]:
                # Keep going with the other pilots, but leave a trace
                self.log.warn("Failed to set pilot %s to Stalled: %s" % (pilotRef, result["Message"]))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """Clear the parametric parent job from the PilotAgentsDB.

        When at least one child is already in the database the parent is simply
        deleted. Otherwise the children are first added with the task queue and
        identity of the parent and given their status; the parent is deleted only
        if every child status could be set.

        :param pRef: pilot reference of the parent
        :param pDict: parent dictionary with "ChildRefs" and "ChildDicts"
        :param connection: DB connection passed to the pilot DB calls
        :return: result of the parent deletion, or the failed result / S_ERROR of
                 the step that failed
        """

        childList = pDict["ChildRefs"]

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result["OK"] and result["Value"]:
                children_ok = True
                # One child is enough, no need to query the others
                break

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose("Adding children for parent %s" % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result["OK"]:
            return result
        if pRef not in result["Value"]:
            return S_ERROR("No info found for parent pilot %s" % pRef)
        parentInfo = result["Value"][pRef]
        tqID = parentInfo["TaskQueueID"]
        ownerDN = parentInfo["OwnerDN"]
        ownerGroup = parentInfo["OwnerGroup"]
        broker = parentInfo["Broker"]
        gridType = parentInfo["GridType"]
        result = self.pilotDB.addPilotTQReference(
            childList, tqID, ownerDN, ownerGroup, broker=broker, gridType=gridType
        )
        if not result["OK"]:
            return result

        # Every child status is attempted, even after a failure
        children_added = True
        for chRef, chDict in pDict["ChildDicts"].items():
            result = self.pilotDB.setPilotStatus(
                chRef, chDict["Status"], destination=chDict["DestinationSite"], conn=connection
            )
            if not result["OK"]:
                children_added = False
        if not children_added:
            return S_ERROR("Failed to add children")

        # Report the real outcome of the deletion to the caller
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """ Declare as Deleted, and account for, the pilots in one of the
        queryStateList states selected with a LastUpdateTime cut-off of
        now minus self.pilotStalledDays days.

        :param connection: DB connection forwarded to accountPilots
        :return: S_OK() or the failed pilotDB result
        """
        cutOff = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        # The cut-off is passed explicitly as `older` (an earlier revision passed None here).
        selection = self.pilotDB.selectPilots(
            {"Status": self.queryStateList}, older=cutOff, timeStamp="LastUpdateTime"
        )
        if not selection["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return selection

        staleRefs = selection["Value"]
        if not staleRefs:
            return S_OK()

        infoResult = self.pilotDB.getPilotInfo(staleRefs)
        if not infoResult["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return infoResult

        batch = {}
        for ref, info in infoResult["Value"].items():
            info["Status"] = "Deleted"
            info["StatusDate"] = Time.dateTime()
            batch[ref] = info
            # Flush once the batch has grown past 100 entries
            if len(batch) > 100:
                self.accountPilots(batch, connection)
                batch = {}

        # Whatever is left over (possibly nothing)
        self.accountPilots(batch, connection)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ Send accounting records for the given pilots and store their status.

        When the 'PilotAccountingEnabled' option is "yes" (the default), the
        current DB information of the pilots is fetched, overridden with the
        new Status / DestinationSite / StatusDate for pilots that are not yet
        in a final state, and committed through gDataStoreClient. The new
        status is written to the PilotAgentsDB only if accounting is disabled
        or the accounting commit succeeded.

        :param dict pilotsToAccount: pilotRef -> dict with at least 'Status',
                                     'DestinationSite' and 'StatusDate'
        :param connection: DB connection passed as ``conn`` to the pilotDB calls
        :return: S_OK(), or the failed getPilotInfo / accounting-report result
        """
        accountingFlag = False
        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        if pae.lower() == "yes":
            accountingFlag = True

        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        accountingSent = False
        if accountingFlag:
            # Hand over a real list rather than a dict view.
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots", retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref in dbData:
                # Pilots already in a final state keep what the DB says.
                if pref in pilotsToAccount and dbData[pref]["Status"] not in self.finalStateList:
                    dbData[pref]["Status"] = pilotsToAccount[pref]["Status"]
                    dbData[pref]["DestinationSite"] = pilotsToAccount[pref]["DestinationSite"]
                    dbData[pref]["LastUpdateTime"] = pilotsToAccount[pref]["StatusDate"]

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                self.log.error("Failed to add accounting records for pilots", retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports", retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef, pDict in pilotsToAccount.items():
                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                result = self.pilotDB.setPilotStatus(
                    pRef, pDict["Status"], pDict["DestinationSite"], pDict["StatusDate"], conn=connection
                )
                # Keep going, but do not lose the failure silently.
                if not result["OK"]:
                    self.log.error("Failed to set pilot status", "%s : %s" % (pRef, result["Message"]))

        return S_OK()

    #############################################################################
    def getPilotStatus(self, proxy, gridType, pilotRefList):
        """ Get GRID job status information using the job's owner proxy and
        GRID job IDs.

        Runs 'edg-job-status' (gridType "LCG") or 'glite-wms-job-status'
        (gridType "gLite") through executeGridCommand and parses the output.

        :param proxy: owner proxy handed to executeGridCommand
        :param str gridType: "LCG" or "gLite"; any other value yields S_ERROR
        :param list pilotRefList: pilot references appended to the command line
        :return: S_OK( { pilotRef : statusDict } ) or S_ERROR. On a non-zero
                 return code only the pilots reported as unretrievable are
                 returned (as 'Deleted', and also set to Deleted in the
                 PilotAgentsDB); if there are none, S_ERROR is returned.
        """

        if gridType == "LCG":
            cmd = ["edg-job-status"]
        elif gridType == "gLite":
            cmd = ["glite-wms-job-status"]
        else:
            return S_ERROR()
        cmd.extend(pilotRefList)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        self.log.info("%s Job Status Execution Time for %d jobs:" % (gridType, len(pilotRefList)), time.time() - start)

        if not ret["OK"]:
            self.log.error("Failed to execute %s Job Status" % gridType, ret["Message"])
            return S_ERROR()

        returnCode = ret["Value"][0]
        stdout = ret["Value"][1]
        stderr = ret["Value"][2]
        resultDict = {}

        if returnCode != 0:
            deleted = 0
            status = "Deleted"
            deletedTemplate = {
                "Status": status,
                "DestinationSite": "Unknown",
                "StatusDate": Time.dateTime(),
                "isChild": False,
                "isParent": False,
                "ParentRef": False,
                "FinalStatus": status in self.finalStateList,
            }
            # Glite returns this error for Deleted jobs to std.err
            for job in List.fromChar(stderr, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                # Each pilot gets its own dict and its own ChildRefs list, so a
                # later change to one entry cannot leak into the others.
                resultDict[pRef] = dict(deletedTemplate, ChildRefs=[])
                self.pilotDB.setPilotStatus(pRef, "Deleted")
                deleted += 1
            # EDG returns a similar error for Deleted jobs to std.out
            for job in List.fromChar(stdout, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                # Plain substring tests: these are literal messages, not patterns.
                if "No such file or directory: no matching jobs found" in job:
                    resultDict[pRef] = dict(deletedTemplate, ChildRefs=[])
                    self.pilotDB.setPilotStatus(pRef, "Deleted")
                    deleted += 1
                # The former regex ended in an unescaped "()", i.e. an empty
                # group, so it matched exactly this text without parentheses.
                if "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect" in job:
                    # the Broker is not accessible
                    return S_ERROR("Broker not Available")
            if not deleted:
                self.log.error(
                    "Error executing %s Job Status:" % gridType, str(returnCode) + "\n".join(ret["Value"][1:3])
                )
                return S_ERROR()
            return S_OK(resultDict)

        for job in List.fromChar(stdout, "\nStatus info for the Job :")[1:]:
            pRef = List.fromChar(job, "\n")[0].strip()
            resultDict[pRef] = self.__parseJobStatus(job, gridType)

        return S_OK(resultDict)

    def __parseJobStatus(self, job, gridType):
        """ Parse the output of the grid pilot status command for one job.

        :param str job: the text block describing one job (and, for a parent,
                        its sub-jobs)
        :param str gridType: "LCG" (status date is parsed) or "gLite"
                             (submission date is parsed)
        :return: dict with 'Status', 'DestinationSite', 'StatusDate', 'isChild',
                 'isParent', 'ParentRef', 'FinalStatus', 'ChildRefs' and
                 'ChildDicts' (the latter filled recursively for parents)
        """

        # Raw strings: "\s", "\w", "\." and "\(" are regex escapes, not string escapes.
        statusRE = r"Current Status:\s*(\w*)"
        destinationRE = r"Destination:\s*([\w\.-]*)"
        statusDateLCGRE = r"reached on:\s*....(.*)"
        submittedDateRE = r"Submitted:\s*....(.*)"
        statusFailedRE = r"Current Status:.*\(Failed\)"

        status = None
        destination = "Unknown"
        statusDate = None
        submittedDate = None

        try:
            status = re.search(statusRE, job).group(1)
            if status == "Done" and re.search(statusFailedRE, job):
                status = "Failed"
            match = re.search(destinationRE, job)
            if match:
                destination = match.group(1)
            if gridType == "LCG":
                match = re.search(statusDateLCGRE, job)
                if match:
                    # Assigned only once fully converted, so a parse failure
                    # leaves None rather than a raw, unparsed date string.
                    statusDate = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.strptime(match.group(1), "%b %d %H:%M:%S %Y")
                    )
            if gridType == "gLite":
                match = re.search(submittedDateRE, job)
                if match:
                    submittedDate = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.strptime(match.group(1), "%b %d %H:%M:%S %Y %Z")
                    )
        except Exception:
            # Parsing is best effort; whatever was extracted so far is kept.
            self.log.exception("Error parsing %s Job Status output:\n" % gridType, job)

        isParent = bool(re.search("Nodes information", job))
        isChild = bool(re.search("Parent Job", job))

        if status == "Running":
            # Pilots can be in Running state for too long, due to bugs in the WMS
            if statusDate:
                statusTime = Time.fromString(statusDate)
                delta = Time.dateTime() - statusTime
                if delta > 4 * Time.day:
                    self.log.info("Setting pilot status to Deleted after 4 days in Running")
                    status = "Deleted"
                    statusDate = statusTime + 4 * Time.day
            elif submittedDate:
                statusTime = Time.fromString(submittedDate)
                delta = Time.dateTime() - statusTime
                if delta > 7 * Time.day:
                    self.log.info("Setting pilot status to Deleted more than 7 days after submission still in Running")
                    status = "Deleted"
                    statusDate = statusTime + 7 * Time.day

        childRefs = []
        childDicts = {}
        if isParent:
            # Sub-job sections of a parent are parsed with the same routine.
            for subjob in List.fromChar(job, " Status info for the Job :")[1:]:
                chRef = List.fromChar(subjob, "\n")[0].strip()
                childDict = self.__parseJobStatus(subjob, gridType)
                childRefs.append(chRef)
                childDicts[chRef] = childDict

        return {
            "Status": status,
            "DestinationSite": destination,
            "StatusDate": statusDate,
            "isChild": isChild,
            "isParent": isParent,
            "ParentRef": False,
            "FinalStatus": status in self.finalStateList,
            "ChildRefs": childRefs,
            "ChildDicts": childDicts,
        }

    def __addPilotsAccountingReport(self, pilotsData):
        """ Build one PilotAccounting record per pilot and register it with
        gDataStoreClient.

        :param dict pilotsData: pilotRef -> pilot information dict
        :return: S_OK(), or the first failed addRegister result
        """
        for record in pilotsData.values():
            report = PilotAccounting()
            report.setEndTime(record["LastUpdateTime"])
            report.setStartTime(record["SubmissionTime"])

            # Owner: fall back to a masked name when the DN cannot be resolved
            lookup = CS.getUsernameForDN(record["OwnerDN"])
            if lookup["OK"]:
                owner = lookup["Value"]
            else:
                owner = "******"
                self.log.error("Can't determine username for dn:", record["OwnerDN"])
            report.setValueByKey("User", owner)
            report.setValueByKey("UserGroup", record["OwnerGroup"])

            # Site: resolved from the CE, "Unknown" if the lookup fails or is blank
            siteLookup = getSiteForCE(record["DestinationSite"])
            siteName = siteLookup["Value"].strip() if siteLookup["OK"] else ""
            report.setValueByKey("Site", siteName or "Unknown")

            report.setValueByKey("GridCE", record["DestinationSite"])
            report.setValueByKey("GridMiddleware", record["GridType"])
            report.setValueByKey("GridResourceBroker", record["Broker"])
            report.setValueByKey("GridStatus", record["Status"])
            report.setValueByKey("Jobs", len(record["Jobs"]) if "Jobs" in record else 0)

            self.log.verbose("Added accounting record for pilot %s" % record["PilotID"])
            registered = gDataStoreClient.addRegister(report)
            if not registered["OK"]:
                return registered
        return S_OK()
Example #4
0
class PilotStatusAgent(AgentModule):
    """
      Agent that declares pilots not updated for PilotStalledDays as Deleted,
      sends the corresponding accounting records, kills those pilots and asks
      the WMSAdministrator service to clear old pilot records.

      Agent entry points implemented here:
        - initialize() for initial settings
        - execute() - the main method called in the agent cycle
    """

    # Pilot states selected by handleOldPilots()
    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    # Pilot states whose DB information is not overridden in accountPilots()
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

    def __init__(self, *args, **kwargs):
        """ c'tor: the DB objects and the admin client are created in initialize()
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None

    #############################################################################
    def initialize(self):
        """ Set the option defaults and create the DB objects and clients.

        :return: S_OK()
        """

        self.am_setOption('PollingTime', 120)
        self.am_setOption('GridEnv', '')
        self.am_setOption('PilotStalledDays', 3)
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay',
                                                   7)
        self.WMSAdministrator = RPCClient(
            'WorkloadManagement/WMSAdministrator')

        return S_OK()

    #############################################################################
    def execute(self):
        """ The PilotAgent execution method: handle the old pilots, then ask
        the WMSAdministrator to clear pilots.

        :return: S_OK(), or the failed _getConnection result
        """

        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance,
                        '')
        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        # handleOldPilots logs its own failures; the connection must be closed
        # even if it raises.
        try:
            self.handleOldPilots(connection)
        finally:
            connection.close()

        result = self.WMSAdministrator.clearPilots(self.clearPilotsDelay,
                                                   self.clearAbortedDelay)
        if not result['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """ Mark pilots that stayed in the Waiting state for too long as Stalled.

        :param dict condDict: must provide 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker'
        :return: S_OK() when done, or the failed selectPilots result
        """

        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            'Status': 'Waiting',
            'OwnerDN': condDict['OwnerDN'],
            'OwnerGroup': condDict['OwnerGroup'],
            'GridType': condDict['GridType'],
            'Broker': condDict['Broker']
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()
        refList = result['Value']

        for pilotRef in refList:
            # The status written is 'Stalled', so the log line says 'Stalled' too.
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            # Best effort: a failure for one pilot does not stop the others.
            self.pilotDB.setPilotStatus(
                pilotRef, 'Stalled', statusReason='Exceeded max waiting time')

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parametric parent job from the PilotAgentsDB.

        If at least one child is already in the DB the parent is deleted right
        away; otherwise the children are registered first, with the status and
        destination found in pDict['ChildDicts'].

        :param pRef: reference of the parent pilot
        :param dict pDict: parent description with 'ChildRefs' and, when the
                           children have to be added, 'ChildDicts'
        :param connection: DB connection passed as ``conn`` to the pilotDB calls
        :return: the deletePilot result on success, otherwise the first failing
                 result or S_ERROR
        """

        childList = pDict['ChildRefs']

        # Check that at least one child is in the database; one hit is enough.
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True
                break

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        # Do not index into a failed or empty answer.
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No information found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        tqID = parentInfo['TaskQueueID']
        ownerDN = parentInfo['OwnerDN']
        ownerGroup = parentInfo['OwnerGroup']
        broker = parentInfo['Broker']
        gridType = parentInfo['GridType']
        result = self.pilotDB.addPilotTQReference(childList,
                                                  tqID,
                                                  ownerDN,
                                                  ownerGroup,
                                                  broker=broker,
                                                  gridType=gridType)
        if not result['OK']:
            return result

        # Try every child even if one fails, then decide.
        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(
                chRef,
                chDict['Status'],
                destination=chDict['DestinationSite'],
                conn=connection)
            if not result['OK']:
                children_added = False
        if not children_added:
            return S_ERROR('Failed to add children')

        # Propagate the outcome of the delete, as the children_ok branch does.
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """ Select the pilots in one of the queryStateList states using a
        LastUpdateTime cut-off of now minus self.pilotStalledDays days, declare
        them Deleted, account for them and kill them.

        Pilots for which _checkJobLastUpdateTime reports a recently updated job
        are left alone.

        :param connection: DB connection forwarded to accountPilots
        :return: S_OK() or the failed pilotDB result
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() -
                                            Time.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                           older=timeLimitToConsider,
                                           timeStamp='LastUpdateTime')
        if not result['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return result
        if not result['Value']:
            return S_OK()

        refList = result['Value']
        result = self.pilotDB.getPilotInfo(refList)
        if not result['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return result

        pilotsDict = result['Value']

        for pRef in pilotsDict:
            if pilotsDict[pRef].get('Jobs') and self._checkJobLastUpdateTime(
                    pilotsDict[pRef]['Jobs'], self.pilotStalledDays):
                self.log.debug(
                    '%s should not be deleted since one job of %s is running.'
                    % (str(pRef), str(pilotsDict[pRef]['Jobs'])))
                continue
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict['Status'] = 'Deleted'
            deletedJobDict['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = deletedJobDict
            # Flush once the batch has grown past 100 entries
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        # Whatever is left over (possibly nothing)
        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ Send accounting records for the given pilots and store their status.

        The new status is written to the PilotAgentsDB only if accounting is
        disabled ('PilotAccountingEnabled' option not "yes") or the accounting
        commit succeeded.

        :param dict pilotsToAccount: pilotRef -> dict with at least 'Status',
                                     'DestinationSite' and 'StatusDate'
        :param connection: DB connection passed as ``conn`` to the pilotDB calls
        :return: S_OK(), or the failed getPilotInfo / accounting-report result
        """
        accountingFlag = False
        pae = self.am_getOption('PilotAccountingEnabled', 'yes')
        if pae.lower() == "yes":
            accountingFlag = True

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            # Hand over a real list rather than a dict view.
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount),
                                               conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots',
                               retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                # Pilots already in a final state keep what the DB says.
                if pref in pilotsToAccount and dbData[pref][
                        'Status'] not in self.finalStateList:
                    dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
                    dbData[pref]['DestinationSite'] = pilotsToAccount[pref][
                        'DestinationSite']
                    dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref][
                        'StatusDate']

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                self.log.error('Failed to add accounting records for pilots',
                               retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports",
                               retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" %
                              len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' %
                                 (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            pDict['StatusDate'],
                                            conn=connection)

        return S_OK()

    def __addPilotsAccountingReport(self, pilotsData):
        """ Build one PilotAccounting record per pilot and register it with
        gDataStoreClient.

        :param dict pilotsData: pilotRef -> pilot information dict
        :return: S_OK(), or the first failed addRegister result
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = CS.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                # Masked placeholder when the DN cannot be resolved
                userName = '******'
                self.log.error("Can't determine username for dn:",
                               pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])
            result = getSiteForCE(pData['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pData['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            if 'Jobs' not in pData:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pData['Jobs']))
            self.log.verbose("Added accounting record for pilot %s" %
                             pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """ Kill, through the DiracAdmin client, every pilot of acc for which
        pilot info with a 'Status' can be retrieved. Failures are only logged.

        :param dict acc: pilots to kill, keyed by pilot reference
        """
        for i in sorted(acc.keys()):
            result = self.diracadmin.getPilotInfo(i)
            if result['OK'] and i in result['Value'] and 'Status' in result[
                    'Value'][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret['OK']:
                    self.log.info("Successfully deleted: %s (Status : %s)" %
                                  (i, result['Value'][i]['Status']))
                else:
                    self.log.error("Failed to delete pilot: ",
                                   "%s : %s" % (i, ret['Message']))
            else:
                self.log.error("Failed to get pilot info",
                               "%s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """ Tell whether at least one job of joblist has a LastUpdateTime more
        recent than now minus StalledDays days.

        Jobs whose attributes cannot be retrieved are logged and skipped.

        :param joblist: iterable of job IDs (each converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as one recently updated job is found, else False
        """
        timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
        ret = False
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if result['OK']:
                if 'LastUpdateTime' in result['Value']:
                    lastUpdateTime = result['Value']['LastUpdateTime']
                    if Time.fromString(lastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug(
                            'Since %s updates LastUpdateTime on %s this does not to need to be deleted.'
                            % (str(jobID), str(lastUpdateTime)))
                        break
            else:
                self.log.error("Error taking job info from DB",
                               result['Message'])
        return ret
Example #5
0
class PilotStatusAgent(AgentModule):
  """
      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used
                   for the agent restart
  """

  queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
  finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

  def __init__(self, *args, **kwargs):
    """ Construct the agent through AgentModule and declare the attributes
        that initialize() fills in later.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # Placeholders only; the real objects are created in initialize()
    self.jobDB = self.pilotDB = self.diracadmin = None

  #############################################################################
  def initialize(self):
    """ Set the option defaults, then create the DB objects and the clients
        used by the agent.

        :return: S_OK()
    """
    defaults = (('PollingTime', 120),
                ('GridEnv', ''),
                ('PilotStalledDays', 3))
    for optionName, optionValue in defaults:
      self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    getOption = self.am_getOption
    self.clearPilotsDelay = getOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = getOption('ClearAbortedPilotsDelay', 7)

    self.WMSAdministrator = WMSAdministratorClient()

    return S_OK()

  #############################################################################
  def execute(self):
    """ The PilotAgent execution method: handle the old pilots, then ask the
        WMSAdministrator to clear pilots.

        :return: S_OK(), or the failed _getConnection result
    """

    self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
    self.gridEnv = self.am_getOption('GridEnv')
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue('/DIRAC/Setup', '')
      if setup:
        instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
        if instance:
          self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')
    result = self.pilotDB._getConnection()
    if not result['OK']:
      return result
    connection = result['Value']

    # Now handle pilots not updated in the last N days (most likely the Broker is no
    # longer available) and declare them Deleted.
    # handleOldPilots logs its own failures; the connection must be closed
    # even if it raises.
    try:
      self.handleOldPilots(connection)
    finally:
      connection.close()

    result = self.WMSAdministrator.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
    if not result['OK']:
      self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

    return S_OK()

  def clearWaitingPilots(self, condDict):
    """ Set to Stalled the Waiting pilots selected with the owner, group, grid
        type and broker of condDict and a cut-off of now minus
        MAX_WAITING_STATE_LENGTH hours.

        :param dict condDict: must provide 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker'
        :return: S_OK() when done, or the failed selectPilots result
    """
    cutOff = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour

    # Fixed status plus the identifying fields copied from the caller
    selection = {'Status': 'Waiting'}
    for field in ('OwnerDN', 'OwnerGroup', 'GridType', 'Broker'):
      selection[field] = condDict[field]

    found = self.pilotDB.selectPilots(selection, older=cutOff)
    if not found['OK']:
      self.log.warn('Failed to get the Pilot Agents for Waiting state')
      return found

    waitingRefs = found['Value']
    if waitingRefs:
      for ref in waitingRefs:
        self.log.info('Setting Waiting pilot to Stalled: %s' % ref)
        self.pilotDB.setPilotStatus(ref, 'Stalled', statusReason='Exceeded max waiting time')

    return S_OK()

  def clearParentJob(self, pRef, pDict, connection):
    """ Clear the parametric parent job from the PilotAgentsDB.

        If at least one child is already in the DB the parent is deleted right
        away; otherwise the children are registered first, with the status and
        destination found in pDict['ChildDicts'].

        :param pRef: reference of the parent pilot
        :param dict pDict: parent description with 'ChildRefs' and, when the
                           children have to be added, 'ChildDicts'
        :param connection: DB connection passed as ``conn`` to the pilotDB calls
        :return: the deletePilot result on success, otherwise the first failing
                 result or S_ERROR
    """

    childList = pDict['ChildRefs']

    # Check that at least one child is in the database; one hit is enough.
    children_ok = False
    for child in childList:
      result = self.pilotDB.getPilotInfo(child, conn=connection)
      if result['OK'] and result['Value']:
        children_ok = True
        break

    if children_ok:
      return self.pilotDB.deletePilot(pRef, conn=connection)

    self.log.verbose('Adding children for parent %s' % pRef)
    result = self.pilotDB.getPilotInfo(pRef)
    # Do not index into a failed or empty answer.
    if not result['OK']:
      return result
    if pRef not in result['Value']:
      return S_ERROR('No information found for parent pilot %s' % pRef)
    parentInfo = result['Value'][pRef]
    tqID = parentInfo['TaskQueueID']
    ownerDN = parentInfo['OwnerDN']
    ownerGroup = parentInfo['OwnerGroup']
    broker = parentInfo['Broker']
    gridType = parentInfo['GridType']
    result = self.pilotDB.addPilotTQReference(childList, tqID, ownerDN, ownerGroup,
                                              broker=broker, gridType=gridType)
    if not result['OK']:
      return result

    # Try every child even if one fails, then decide.
    children_added = True
    for chRef, chDict in pDict['ChildDicts'].items():
      result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                           destination=chDict['DestinationSite'],
                                           conn=connection)
      if not result['OK']:
        children_added = False
    if not children_added:
      return S_ERROR('Failed to add children')

    # Propagate the outcome of the delete, as the children_ok branch does.
    return self.pilotDB.deletePilot(pRef, conn=connection)

  def handleOldPilots(self, connection):
    """ Declare as Deleted, account for and kill the pilots in one of the
        queryStateList states selected with a LastUpdateTime cut-off of
        now minus self.pilotStalledDays days.

        Pilots for which _checkJobLastUpdateTime reports a recently updated job
        are skipped.

        :param connection: DB connection forwarded to accountPilots
        :return: S_OK() or the failed pilotDB result
    """
    cutOff = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
    selection = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                          older=cutOff,
                                          timeStamp='LastUpdateTime')
    if not selection['OK']:
      self.log.error('Failed to get the Pilot Agents')
      return selection

    staleRefs = selection['Value']
    if not staleRefs:
      return S_OK()

    infoResult = self.pilotDB.getPilotInfo(staleRefs)
    if not infoResult['OK']:
      self.log.error('Failed to get Info for Pilot Agents')
      return infoResult

    batch = {}
    for ref, info in infoResult['Value'].items():
      jobs = info.get('Jobs')
      if jobs and self._checkJobLastUpdateTime(jobs, self.pilotStalledDays):
        self.log.debug('%s should not be deleted since one job of %s is running.' %
                       (str(ref), str(jobs)))
        continue

      info['Status'] = 'Deleted'
      info['StatusDate'] = Time.dateTime()
      batch[ref] = info

      # Flush once the batch has grown past 100 entries
      if len(batch) > 100:
        self.accountPilots(batch, connection)
        self._killPilots(batch)
        batch = {}

    # Whatever is left over (possibly nothing)
    self.accountPilots(batch, connection)
    self._killPilots(batch)

    return S_OK()

  def accountPilots(self, pilotsToAccount, connection):
    """ Send accounting records for the given pilots, then store their status.

        When the 'PilotAccountingEnabled' option is "yes" (the default), the DB
        information of the pilots is retrieved, overridden with the new status for
        the pilots not yet in one of the self.finalStateList states, and committed
        through gDataStoreClient. The pilot status is written to the PilotAgentsDB
        only if accounting is disabled or the records were committed; a failing
        commit is logged and S_OK() is still returned.

        :param dict pilotsToAccount: pilot reference -> dictionary providing the
                                     'Status', 'DestinationSite' and 'StatusDate' keys
        :param connection: connection passed as `conn` to the PilotAgentsDB calls
        :return: S_OK(), or the error structure of getPilotInfo / of the report filling
    """
    pae = self.am_getOption('PilotAccountingEnabled', 'yes')
    accountingFlag = pae.lower() == "yes"

    if not pilotsToAccount:
      self.log.info('No pilots to Account')
      return S_OK()

    accountingSent = False
    if accountingFlag:
      retVal = self.pilotDB.getPilotInfo(pilotsToAccount.keys(), conn=connection)
      if not retVal['OK']:
        self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
        return retVal
      dbData = retVal['Value']
      for pref in dbData:
        if pref in pilotsToAccount:
          # Pilots already in a final state keep what the DB says about them
          if dbData[pref]['Status'] not in self.finalStateList:
            dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
            dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
            dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

      retVal = self.__addPilotsAccountingReport(dbData)
      if not retVal['OK']:
        # This used to repeat the getPilotInfo message, hiding which step failed
        self.log.error('Failed to add accounting records for pilots', retVal['Message'])
        return retVal

      self.log.info("Sending accounting records...")
      retVal = gDataStoreClient.commit()
      if not retVal['OK']:
        self.log.error("Can't send accounting reports", retVal['Message'])
      else:
        self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
        accountingSent = True

    if not accountingFlag or accountingSent:
      for pRef in pilotsToAccount:
        pDict = pilotsToAccount[pRef]
        self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
        # NOTE(review): 'StatusDate' is passed as the 4th positional argument here;
        # confirm against the setPilotStatus signature that this slot is the update time.
        self.pilotDB.setPilotStatus(pRef,
                                    pDict['Status'],
                                    pDict['DestinationSite'],
                                    pDict['StatusDate'],
                                    conn=connection)

    return S_OK()

  def __addPilotsAccountingReport(self, pilotsData):
    """ Create a PilotAccounting record for each pilot and register it in gDataStoreClient.

        :param dict pilotsData: pilot reference -> pilot information dictionary, read for the
                                'LastUpdateTime', 'SubmissionTime', 'OwnerDN', 'OwnerGroup',
                                'DestinationSite', 'GridType', 'Broker', 'Status', 'PilotID'
                                and (optional) 'Jobs' keys
        :return: S_OK(), or the error structure of the first failing addRegister call
    """
    for pData in pilotsData.values():
      record = PilotAccounting()
      record.setEndTime(pData['LastUpdateTime'])
      record.setStartTime(pData['SubmissionTime'])

      # A DN that cannot be resolved is accounted under a placeholder user name
      userResult = CS.getUsernameForDN(pData['OwnerDN'])
      if userResult['OK']:
        userName = userResult['Value']
      else:
        userName = '******'
        self.log.error("Can't determine username for dn:", pData['OwnerDN'])
      record.setValueByKey('User', userName)
      record.setValueByKey('UserGroup', pData['OwnerGroup'])

      # An unresolved or blank site name is reported as 'Unknown'
      siteResult = getSiteForCE(pData['DestinationSite'])
      siteName = siteResult['Value'].strip() if siteResult['OK'] else ''
      record.setValueByKey('Site', siteName or 'Unknown')

      record.setValueByKey('GridCE', pData['DestinationSite'])
      record.setValueByKey('GridMiddleware', pData['GridType'])
      record.setValueByKey('GridResourceBroker', pData['Broker'])
      record.setValueByKey('GridStatus', pData['Status'])
      record.setValueByKey('Jobs', len(pData['Jobs']) if 'Jobs' in pData else 0)

      self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
      registration = gDataStoreClient.addRegister(record)
      if not registration['OK']:
        return registration
    return S_OK()

  def _killPilots(self, acc):
    """ Ask self.diracadmin to kill each of the given pilots, in sorted reference order.

        A pilot is killed only if getPilotInfo() succeeds and returns a 'Status' for it.
        Every outcome is logged and nothing is returned.

        :param dict acc: dictionary keyed by pilot reference
    """
    for pilotRef in sorted(acc):
      infoResult = self.diracadmin.getPilotInfo(pilotRef)
      pilotKnown = (infoResult['OK'] and
                    pilotRef in infoResult['Value'] and
                    'Status' in infoResult['Value'][pilotRef])
      if not pilotKnown:
        self.log.error("Failed to get pilot info", "%s : %s" % (pilotRef, str(infoResult)))
        continue

      killResult = self.diracadmin.killPilot(str(pilotRef))
      if not killResult['OK']:
        self.log.error("Failed to delete pilot: ", "%s : %s" % (pilotRef, killResult['Message']))
      else:
        self.log.info("Successfully deleted: %s (Status : %s)" %
                      (pilotRef, infoResult['Value'][pilotRef]['Status']))
  def _checkJobLastUpdateTime(self, joblist, StalledDays):
    """ Tell whether at least one of the jobs was updated within the last StalledDays days.

        Jobs whose attributes cannot be retrieved are logged and skipped; jobs without
        a 'LastUpdateTime' attribute are ignored.

        :param joblist: iterable of job IDs (each one is converted with int())
        :param StalledDays: number of days, multiplied by Time.day to build the time limit
        :return: True as soon as one recently updated job is found, False otherwise
    """
    timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
    for jobID in joblist:
      attributes = self.jobDB.getJobAttributes(int(jobID))
      if not attributes['OK']:
        self.log.error("Error taking job info from DB", attributes['Message'])
        continue
      if 'LastUpdateTime' not in attributes['Value']:
        continue

      lastUpdateTime = attributes['Value']['LastUpdateTime']
      if Time.fromString(lastUpdateTime) > timeLimitToConsider:
        self.log.debug(
            'Since %s updates LastUpdateTime on %s this does not to need to be deleted.' %
            (str(jobID), str(lastUpdateTime)))
        return True
    return False
Example #6
0
class Matcher(object):
    """ Logic for matching
  """
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None):
        """ Constructor

        Each collaborator may be injected; any argument that is left out (or is
        falsy) is replaced by a freshly created default object.

        :param pilotAgentsDB: PilotAgentsDB-like object
        :param jobDB: JobDB-like object
        :param tqDB: TaskQueueDB-like object
        :param jlDB: JobLoggingDB-like object
        :param opsHelper: Operations-like helper
        """
        self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
        self.jobDB = jobDB if jobDB else JobDB()
        self.tqDB = tqDB if tqDB else TaskQueueDB()
        self.jlDB = jlDB if jlDB else JobLoggingDB()
        self.opsHelper = opsHelper if opsHelper else Operations()

        self.log = gLogger.getSubLogger("Matcher")

        # The limiter works on the very same JobDB and Operations helper
        self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """ Main job selection function to find the highest priority job matching the resource capacity

        :param dict resourceDescription: description of the resource asking for a job
        :param dict credDict: credentials of the requester
        :return: {} when no job matches, otherwise a dictionary holding 'JDL', 'JobID',
                 the job optimizer parameters (when available), 'DN', 'Group' and
                 'PilotInfoReportedFlag'
        :raises RuntimeError: when the matching fails, the job attributes or JDL cannot be
                              retrieved, or the matched job is not in Waiting state
        """
        startTime = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters: the original
        # MaxRAM / NumberOfProcessors values are shown instead of the tags made out of them
        toPrintDict = dict(resourceDict)
        for paramName in ("MaxRAM", "NumberOfProcessors"):
            if paramName in resourceDescription:
                toPrintDict[paramName] = resourceDescription[paramName]
        printableTags = [tag for tag in resourceDict.get('Tag', [])
                         if not tag.endswith(('GB', 'Processors'))]
        if printableTags:
            toPrintDict['Tag'] = printableTags
        else:
            toPrintDict.pop('Tag', None)
        gLogger.info('Resource description for matching',
                     printDict(toPrintDict))

        negativeCond = self.limiter.getNegativeCondForSite(resourceDict['Site'])
        matchResult = self.tqDB.matchAndGetJob(resourceDict,
                                               negativeCond=negativeCond)
        if not matchResult['OK']:
            raise RuntimeError(matchResult['Message'])
        matchInfo = matchResult['Value']
        if not matchInfo['matchFound']:
            self.log.info("No match found")
            return {}

        jobID = matchInfo['jobId']
        statusAtt = self.jobDB.getJobAttributes(
            jobID, ['OwnerDN', 'OwnerGroup', 'Status'])
        if not statusAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not statusAtt['Value']:
            raise RuntimeError("No attributes returned for job")
        if statusAtt['Value']['Status'] != 'Waiting':
            # The job is taken out of the task queue before giving up
            self.log.error('Job matched by the TQ is not in Waiting state',
                           str(jobID))
            deletion = self.tqDB.deleteJob(jobID)
            if not deletion['OK']:
                raise RuntimeError(deletion['Message'])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        jdlResult = self.jobDB.getJobJDL(jobID)
        if not jdlResult['OK']:
            raise RuntimeError("Failed to get the job JDL")

        resultDict = {'JDL': jdlResult['Value'], 'JobID': jobID}

        matchTime = time.time() - startTime
        self.log.info("Match time: [%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        resOpt = self.jobDB.getJobOptParameters(jobID)
        if resOpt['OK']:
            resultDict.update(resOpt['Value'])
        ownerAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
        if not ownerAtt['OK']:
            raise RuntimeError('Could not retrieve job attributes')
        if not ownerAtt['Value']:
            raise RuntimeError('No attributes returned for job')

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict['Site'], jobID)

        # The pilot info is reported only if the requester did not flag it as already done
        if not resourceDict.get('PilotInfoReportedFlag', False):
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict['DN'] = ownerAtt['Value']['OwnerDN']
        resultDict['Group'] = ownerAtt['Value']['OwnerGroup']
        resultDict['PilotInfoReportedFlag'] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """ from resourceDescription to resourceDict (just various mods)
    """
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict['JobType'] = 'Test'

        self.log.verbose("Resource description:")
        for key in resourceDict:
            self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.

        The fields listed in singleValueDefFields, multiValueMatchFields and
        tagMatchFields (plus their 'Required<name>' variants) are copied over,
        together with 'JobID' and a few pilot/release related keys. 'MaxRAM'
        (divided by 1000) and 'NumberOfProcessors' are expanded into a list of
        '<n>GB' / '<n>Processors' tags, n going from 2 to the value, as long as
        the value does not exceed 128.

        :param dict resourceDescription: description of the resource
        :return: dict with the resource description used for matching
        """
        resourceDict = {}
        for name in singleValueDefFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in multiValueMatchFields:
            if name in resourceDescription:
                resourceDict[name] = resourceDescription[name]

        for name in tagMatchFields:
            # Empty tag fields are not propagated
            if name in resourceDescription and resourceDescription[name]:
                resourceDict[name] = resourceDescription[name]
            rname = 'Required%s' % name
            if rname in resourceDescription:
                resourceDict[rname] = resourceDescription[rname]

        if 'JobID' in resourceDescription:
            resourceDict['JobID'] = resourceDescription['JobID']

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get('MaxRAM')
        if maxRAM:
            try:
                # Floor division keeps an integer under true division too:
                # a float here would make range() below raise TypeError
                maxRAM = int(maxRAM) // 1000
            except (ValueError, TypeError):
                maxRAM = None
        nProcessors = resourceDescription.get('NumberOfProcessors')
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (ValueError, TypeError):
                nProcessors = None
        for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]:
            if param and param <= 128:
                paramTags = ['%d%s' % (par, key) for par in range(2, param + 1)]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        # Get rid of duplicated tags
        if 'Tag' in resourceDict:
            resourceDict['Tag'] = list(set(resourceDict['Tag']))

        for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject',
                  'VirtualOrganization', 'PilotReference', 'PilotBenchmark',
                  'PilotInfoReportedFlag'):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
        attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
        attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
        result = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Set job attributes for jobID %s" % jobID)

        result = self.jlDB.addLoggingRecord(jobID,
                                            status='Matched',
                                            minor='Assigned',
                                            source='Matcher')
        if not result['OK']:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, result['Message']))
        else:
            self.log.verbose("Added logging record for jobID %s" % jobID)

    def _checkMask(self, resourceDict):
        """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
        if 'Site' not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict['Site'])
        if not result['OK']:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result['Message'])
            raise RuntimeError("Internal error")

        if resourceDict['Site'] not in result['Value']:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """ Update pilot information - do not fail if we don't manage to do it
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            gridCE = resourceDict.get('GridCE', 'Unknown')
            site = resourceDict.get('Site', 'Unknown')
            benchmark = resourceDict.get('PilotBenchmark', 0.0)
            self.log.verbose(
                'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f'
                % (pilotReference, gridCE, site, benchmark))

            result = self.pilotAgentsDB.setPilotStatus(pilotReference,
                                                       status='Running',
                                                       gridSite=site,
                                                       destination=gridCE,
                                                       benchmark=benchmark)
            if not result['OK']:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """ Update pilot to job mapping information
    """
        pilotReference = resourceDict.get('PilotReference', '')
        if pilotReference:
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result['OK']:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result['Message']))

    def _checkCredentials(self, resourceDict, credDict):
        """ Check if we can get a job given the passed credentials

        The owner DN and/or group constraints of the resource are set depending
        on the properties found in the credentials.

        :param dict resourceDict: resource dictionary, updated in place
        :param dict credDict: credentials, read for 'properties', 'group' and 'DN'
        :return: the updated resourceDict
        :raises RuntimeError: when the Registry lookups fail
        """
        properties = credDict['properties']

        if Properties.GENERIC_PILOT in properties:
            # You can only match groups in the same VO
            if credDict['group'] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get('VirtualOrganization', '')
            else:
                vo = Registry.getVOForGroup(credDict['group'])
            voGroups = Registry.getGroupsForVO(vo)
            if not voGroups['OK']:
                raise RuntimeError(voGroups['Message'])
            resourceDict['OwnerGroup'] = voGroups['Value']

        elif Properties.PILOT in properties:
            # If it's a private pilot, the DN has to be the same
            self.log.notice(
                "Setting the resource DN to the credentials DN")
            resourceDict['OwnerDN'] = credDict['DN']

        elif Properties.JOB_SHARING in properties:
            # If it's a job sharing. The group has to be the same and just check
            # that the DN (if any) belongs to the same group
            resourceDict['OwnerGroup'] = credDict['group']
            self.log.notice(
                "Setting the resource group to the credentials group")
            if 'OwnerDN' in resourceDict and resourceDict['OwnerDN'] != credDict['DN']:
                ownerDN = resourceDict['OwnerDN']
                dnGroups = Registry.getGroupsForDN(ownerDN)
                if not dnGroups['OK']:
                    raise RuntimeError(dnGroups['Message'])
                if credDict['group'] not in dnGroups['Value']:
                    # DN is not in the same group! bad boy.
                    self.log.notice(
                        "You cannot request jobs from DN %s. It does not belong to your group!"
                        % ownerDN)
                    resourceDict['OwnerDN'] = credDict['DN']

        else:
            # Nothing special, group and DN have to be the same
            resourceDict['OwnerDN'] = credDict['DN']
            resourceDict['OwnerGroup'] = credDict['group']

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """ Check the pilot DIRAC version
    """
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if 'ReleaseVersion' not in resourceDict:
                if 'DIRACVersion' not in resourceDict:
                    raise RuntimeError(
                        'Version check requested and not provided by Pilot')
                else:
                    pilotVersion = resourceDict['DIRACVersion']
            else:
                pilotVersion = resourceDict['ReleaseVersion']

            validVersions = self.opsHelper.getValue("Pilot/Version", [])
            if validVersions and pilotVersion not in validVersions:
                raise RuntimeError(
                    'Pilot version does not match the production version %s not in ( %s )'
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if 'ReleaseProject' not in resourceDict:
                    raise RuntimeError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict['ReleaseProject'] != validProject:
                    raise RuntimeError(
                        "Version check requested \
          but expected project %s != received %s" %
                        (validProject, resourceDict['ReleaseProject']))
Example #7
0
class PilotStatusAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']
    identityFieldsList = ['OwnerDN', 'OwnerGroup', 'GridType', 'Broker']
    eligibleGridTypes = ['gLite']

    #############################################################################
    def initialize(self):
        """Sets defaults

        Registers the default values of the agent options and creates the
        PilotAgentsDB object used by the agent.

        :return: S_OK()
        """
        defaultOptions = (('PollingTime', 120),
                          ('GridEnv', ''),
                          ('PilotStalledDays', 3))
        for optionName, optionValue in defaultOptions:
            self.am_setOption(optionName, optionValue)

        self.pilotDB = PilotAgentsDB()
        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Reads the agent options, opens a PilotAgentsDB connection and runs one
        status-update cycle with it. The connection is closed whatever the
        outcome of the cycle, early error returns included.

        :return: S_OK(), or the error structure of the failing step
        """
        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue(
                    '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue(
                        '/Systems/WorkloadManagement/%s/GridEnv' % instance,
                        '')

        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        try:
            return self._updatePilotsStatus(connection)
        finally:
            # Also reached on the early returns of the cycle
            connection.close()

    def _updatePilotsStatus(self, connection):
        """ Run one status-update cycle over all the pilot groups using the given connection.

        :param connection: open PilotAgentsDB connection; it is not closed here
        :return: S_OK(), or the error structure of getPilotGroups / selectPilots
        """
        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()
        pilotGroups = result['Value']

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in pilotGroups:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' %
                             (ownerDN, ownerGroup, gridType, broker))

            # Pilots declared Done by the JobAgent
            condDict1 = {
                'Status': 'Done',
                'StatusReason': 'Report from JobAgent',
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            # Pilots in one of the states still to be followed
            condDict2 = {
                'Status': self.queryStateList,
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromDIRACGroup(
                    ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error(
                        'Could not get proxy:',
                        'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" %
                    (len(refList), ownerDN, ownerGroup))

                # The pilots are queried in slices of at most MAX_JOBS_QUERY references
                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index:start_index +
                                          MAX_JOBS_QUERY]
                    self.log.verbose(
                        'Querying %d pilots of %s starting at %d' %
                        (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            # No point in querying the remaining slices
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' %
                                      (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict['isParent']:
                                self.log.verbose('Clear parametric parent %s' %
                                                 pRef)
                                result = self.clearParentJob(
                                    pRef, pDict, connection)
                                if not result['OK']:
                                    self.log.warn(result['Message'])
                                else:
                                    self.log.info(
                                        'Parametric parent removed: %s' % pRef)
                            if pDict['FinalStatus']:
                                # Final states are stored by accountPilots
                                self.log.verbose(
                                    'Marking Status for %s to %s' %
                                    (pRef, pDict['Status']))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose(
                                    'Setting Status for %s to %s' %
                                    (pRef, pDict['Status']))
                                result = self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict['Status'],
                                    pDict['DestinationSite'],
                                    updateTime=pDict['StatusDate'],
                                    conn=connection)

                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)
        if not result['OK']:
            # Best effort: the cycle itself is still considered successful
            self.log.warn('Failed to handle old pilots', result['Message'])

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """ Clear pilots in the faulty Waiting state

        Pilots matching the identity given in condDict that are selected as
        'Waiting' with a time limit of MAX_WAITING_STATE_LENGTH hours in the
        past are set to 'Stalled'.

        :param dict condDict: provides the 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker' keys
        :return: S_OK(), or the error structure returned by selectPilots
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            'Status': 'Waiting',
            'OwnerDN': condDict['OwnerDN'],
            'OwnerGroup': condDict['OwnerGroup'],
            'GridType': condDict['GridType'],
            'Broker': condDict['Broker']
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()
        refList = result['Value']

        for pilotRef in refList:
            # The message now names the status that is really set
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(
                pilotRef, 'Stalled', statusReason='Exceeded max waiting time')
            if not result['OK']:
                # Keep going with the other pilots, but leave a trace of the failure
                self.log.warn('Failed to set pilot %s to Stalled' % pilotRef,
                              result['Message'])

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parametric parent job from the PilotAgentsDB

        If at least one child is already in the database the parent is simply
        deleted. Otherwise the children are first added with the task queue and
        identity of the parent and given their status, and the parent is deleted
        only if every child status could be set.

        :param pRef: reference of the parent pilot
        :param dict pDict: status dictionary of the parent, providing 'ChildRefs' and 'ChildDicts'
        :param connection: connection passed as `conn` to the PilotAgentsDB calls
        :return: the result of deletePilot, or the error structure of the failing step
        """
        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True
                break  # one child is enough, no need to query the others

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        # A failed or empty lookup must not end up in a KeyError
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No info found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        tqID = parentInfo['TaskQueueID']
        ownerDN = parentInfo['OwnerDN']
        ownerGroup = parentInfo['OwnerGroup']
        broker = parentInfo['Broker']
        gridType = parentInfo['GridType']
        result = self.pilotDB.addPilotTQReference(childList,
                                                  tqID,
                                                  ownerDN,
                                                  ownerGroup,
                                                  broker=broker,
                                                  gridType=gridType)
        if not result['OK']:
            return result

        # Every child status update is attempted, even after a failure
        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(
                chRef,
                chDict['Status'],
                destination=chDict['DestinationSite'],
                conn=connection)
            if not result['OK']:
                children_added = False
        if not children_added:
            return S_ERROR('Failed to add children')

        # Report the outcome of the deletion, as done when the children already exist
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """
        Select all pilots that have not been updated in the last N days
        (N being self.pilotStalledDays), declare them Deleted and account for them.

        :param connection: connection handed over to accountPilots()
        :return: S_OK(), or the error structure returned by selectPilots / getPilotInfo
        """
        stalledLimit = Time.dateTime() - Time.day * self.pilotStalledDays
        selection = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                              older=Time.toString(stalledLimit),
                                              timeStamp='LastUpdateTime')
        if not selection['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return selection
        stalePilotRefs = selection['Value']
        if not stalePilotRefs:
            return S_OK()

        infoResult = self.pilotDB.getPilotInfo(stalePilotRefs)
        if not infoResult['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return infoResult
        pilotsDict = infoResult['Value']

        batch = {}
        for pRef in pilotsDict:
            pilotInfo = pilotsDict[pRef]
            pilotInfo['Status'] = 'Deleted'
            pilotInfo['StatusDate'] = Time.dateTime()
            batch[pRef] = pilotInfo

            # Flush as soon as the batch holds more than 100 pilots
            if len(batch) > 100:
                self.accountPilots(batch, connection)
                batch = {}

        # Whatever is left (possibly nothing) goes in a last round
        self.accountPilots(batch, connection)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """Send accounting records for the given pilots and store their status.

        When the 'PilotAccountingEnabled' option is 'yes' (the default), the
        DB info of the pilots is fetched, overridden with the reported status
        unless the DB already holds a final state, converted into accounting
        records and committed. The pilot statuses are written to the DB only
        if accounting is disabled or the records were sent successfully.

        :param pilotsToAccount: dict of pilot reference -> status dict; the
            keys 'Status', 'DestinationSite' and 'StatusDate' are read
        :param connection: DB connection forwarded to the PilotAgentsDB calls
        :return: S_OK() (also when the commit fails, which is only logged), or
            the failing result of the DB lookup / record creation
        """
        pae = self.am_getOption('PilotAccountingEnabled', 'yes')
        accountingFlag = pae.lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            # list() instead of .keys() so a real list is passed on Python 3 too
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount),
                                               conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots',
                               retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pRef in dbData:
                if pRef not in pilotsToAccount:
                    continue
                dbInfo = dbData[pRef]
                # A final state already stored in the DB wins over the report
                if dbInfo['Status'] in self.finalStateList:
                    continue
                reported = pilotsToAccount[pRef]
                dbInfo['Status'] = reported['Status']
                dbInfo['DestinationSite'] = reported['DestinationSite']
                dbInfo['LastUpdateTime'] = reported['StatusDate']

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                # This used to repeat the "retrieve Info" message, which
                # pointed at the wrong step when reading the logs.
                self.log.error('Fail to add accounting reports for pilots',
                               retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports",
                               retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" %
                              len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' %
                                 (pRef, pDict['Status']))
                result = self.pilotDB.setPilotStatus(pRef,
                                                     pDict['Status'],
                                                     pDict['DestinationSite'],
                                                     pDict['StatusDate'],
                                                     conn=connection)
                if not result['OK']:
                    # Best effort: keep going, but do not hide the failure
                    self.log.warn('Failed to set status for pilot %s' % pRef,
                                  result['Message'])

        return S_OK()

    #############################################################################
    def getPilotStatus(self, proxy, gridType, pilotRefList):
        """Get the grid WMS status of a list of pilots.

        Runs 'edg-job-status' (LCG) or 'glite-wms-job-status' (gLite) with the
        given proxy and parses its output.

        :param proxy: proxy passed to executeGridCommand
        :param gridType: 'LCG' or 'gLite'; anything else yields S_ERROR()
        :param pilotRefList: list of pilot references to query
        :return: S_OK(dict) mapping each pilot reference to a status dict with
            the keys 'Status', 'DestinationSite', 'StatusDate', 'isChild',
            'isParent', 'ParentRef', 'FinalStatus', 'ChildRefs' and
            'ChildDicts'; S_ERROR('Broker not Available') when the broker
            refuses the connection; S_ERROR() on any other failure
        """

        if gridType == 'LCG':
            cmd = ['edg-job-status']
        elif gridType == 'gLite':
            cmd = ['glite-wms-job-status']
        else:
            return S_ERROR()
        cmd.extend(pilotRefList)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        self.log.info(
            '%s Job Status Execution Time for %d jobs:' %
            (gridType, len(pilotRefList)),
            time.time() - start)

        if not ret['OK']:
            self.log.error('Failed to execute %s Job Status' % gridType,
                           ret['Message'])
            return S_ERROR()

        returnCode, stdout, stderr = ret['Value'][0:3]
        resultDict = {}

        if returnCode == 0:
            for job in List.fromChar(stdout,
                                     '\nStatus info for the Job :')[1:]:
                pRef = List.fromChar(job, '\n')[0].strip()
                resultDict[pRef] = self.__parseJobStatus(job, gridType)
            return S_OK(resultDict)

        # Non-zero exit code: look for pilots the WMS no longer knows about
        status = 'Deleted'
        statusDate = Time.dateTime()

        def deletedEntry():
            # Build a fresh dict (and fresh containers) per pilot: sharing
            # one mutable dict between all entries lets a later update of one
            # pilot silently change the others.
            return {
                'Status': status,
                'DestinationSite': 'Unknown',
                'StatusDate': statusDate,
                'isChild': False,
                'isParent': False,
                'ParentRef': False,
                'FinalStatus': status in self.finalStateList,
                'ChildRefs': [],
                'ChildDicts': {}
            }

        deleted = 0
        unableMarker = '\nUnable to retrieve the status for:'
        # Glite returns this error for Deleted jobs to std.err
        for job in List.fromChar(stderr, unableMarker)[1:]:
            pRef = List.fromChar(job, '\n')[0].strip()
            resultDict[pRef] = deletedEntry()
            self.pilotDB.setPilotStatus(pRef, "Deleted")
            deleted += 1
        # EDG returns a similar error for Deleted jobs to std.out
        for job in List.fromChar(stdout, unableMarker)[1:]:
            pRef = List.fromChar(job, '\n')[0].strip()
            # Plain substring tests: the texts are literals, not patterns
            if "No such file or directory: no matching jobs found" in job:
                resultDict[pRef] = deletedEntry()
                self.pilotDB.setPilotStatus(pRef, "Deleted")
                deleted += 1
            # The former regex ended in "()", an empty group, so it matched
            # exactly this text without requiring the parentheses.
            if "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect" in job:
                # the Broker is not accessible
                return S_ERROR('Broker not Available')
        if not deleted:
            self.log.error(
                'Error executing %s Job Status:' % gridType,
                str(returnCode) + '\n'.join([stdout, stderr]))
            return S_ERROR()
        return S_OK(resultDict)

    def __parseJobStatus(self, job, gridType):
        """Parse the output of the grid pilot status command for one job.

        :param job: text of the status report of a single job (for a parent
            job it also contains the reports of its children)
        :param gridType: 'LCG' or 'gLite'; selects which date field is parsed
        :return: dict with the keys 'Status', 'DestinationSite', 'StatusDate',
            'isChild', 'isParent', 'ParentRef', 'FinalStatus', 'ChildRefs' and
            'ChildDicts' (child reference -> dict of the same layout)
        """

        # Raw strings: '\s', '\w', '\(' are not valid string escapes
        statusRE = r'Current Status:\s*(\w*)'
        destinationRE = r'Destination:\s*([\w\.-]*)'
        statusDateLCGRE = r'reached on:\s*....(.*)'
        submittedDateRE = r'Submitted:\s*....(.*)'
        statusFailedRE = r'Current Status:.*\(Failed\)'
        dateFormat = '%Y-%m-%d %H:%M:%S'

        status = None
        destination = 'Unknown'
        statusDate = None
        submittedDate = None

        try:
            # A missing status line raises AttributeError, handled below
            status = re.search(statusRE, job).group(1)
            if status == 'Done' and re.search(statusFailedRE, job):
                status = 'Failed'
            destinationMatch = re.search(destinationRE, job)
            if destinationMatch:
                destination = destinationMatch.group(1)
            if gridType == 'LCG':
                dateMatch = re.search(statusDateLCGRE, job)
                if dateMatch:
                    # Assign only after a successful conversion, so that an
                    # unparsable date leaves None instead of the raw text
                    statusDate = time.strftime(
                        dateFormat,
                        time.strptime(dateMatch.group(1),
                                      '%b %d %H:%M:%S %Y'))
            if gridType == 'gLite':
                dateMatch = re.search(submittedDateRE, job)
                if dateMatch:
                    submittedDate = time.strftime(
                        dateFormat,
                        time.strptime(dateMatch.group(1),
                                      '%b %d %H:%M:%S %Y %Z'))
        except Exception:
            # Not a bare except: do not swallow KeyboardInterrupt/SystemExit
            self.log.exception(
                'Error parsing %s Job Status output:\n' % gridType, job)

        isParent = bool(re.search('Nodes information', job))
        isChild = bool(re.search('Parent Job', job))

        if status == "Running":
            # Pilots can be in Running state for too long, due to bugs in the WMS
            if statusDate:
                statusTime = Time.fromString(statusDate)
                delta = Time.dateTime() - statusTime
                if delta > 4 * Time.day:
                    self.log.info(
                        'Setting pilot status to Deleted after 4 days in Running'
                    )
                    status = "Deleted"
                    statusDate = statusTime + 4 * Time.day
            elif submittedDate:
                statusTime = Time.fromString(submittedDate)
                delta = Time.dateTime() - statusTime
                if delta > 7 * Time.day:
                    self.log.info(
                        'Setting pilot status to Deleted more than 7 days after submission still in Running'
                    )
                    status = "Deleted"
                    statusDate = statusTime + 7 * Time.day

        childRefs = []
        childDicts = {}
        if isParent:
            # Each child report is parsed recursively with the same logic
            for subjob in List.fromChar(job, ' Status info for the Job :')[1:]:
                chRef = List.fromChar(subjob, '\n')[0].strip()
                childRefs.append(chRef)
                childDicts[chRef] = self.__parseJobStatus(subjob, gridType)

        return {
            'Status': status,
            'DestinationSite': destination,
            'StatusDate': statusDate,
            'isChild': isChild,
            'isParent': isParent,
            'ParentRef': False,
            'FinalStatus': status in self.finalStateList,
            'ChildRefs': childRefs,
            'ChildDicts': childDicts
        }

    def __addPilotsAccountingReport(self, pilotsData):
        """Register one PilotAccounting record per pilot with gDataStoreClient.

        :param pilotsData: dict of pilot reference -> pilot info dict
        :return: S_OK(), or the first failing result of addRegister
        """
        for pilotRef in pilotsData:
            info = pilotsData[pilotRef]
            record = PilotAccounting()
            record.setEndTime(info['LastUpdateTime'])
            record.setStartTime(info['SubmissionTime'])

            # Resolve the owner; a placeholder is used when the lookup fails
            userResult = CS.getUsernameForDN(info['OwnerDN'])
            if userResult['OK']:
                userName = userResult['Value']
            else:
                userName = '******'
                self.log.error("Can't determine username for dn:",
                               info['OwnerDN'])
            record.setValueByKey('User', userName)
            record.setValueByKey('UserGroup', info['OwnerGroup'])

            # Map the CE to its site, defaulting to 'Unknown'
            siteResult = getSiteForCE(info['DestinationSite'])
            siteName = 'Unknown'
            if siteResult['OK'] and siteResult['Value'].strip():
                siteName = siteResult['Value'].strip()
            record.setValueByKey('Site', siteName)

            record.setValueByKey('GridCE', info['DestinationSite'])
            record.setValueByKey('GridMiddleware', info['GridType'])
            record.setValueByKey('GridResourceBroker', info['Broker'])
            record.setValueByKey('GridStatus', info['Status'])

            jobCount = len(info['Jobs']) if 'Jobs' in info else 0
            record.setValueByKey('Jobs', jobCount)

            self.log.verbose("Added accounting record for pilot %s" %
                             info['PilotID'])
            registered = gDataStoreClient.addRegister(record)
            if not registered['OK']:
                return registered
        return S_OK()
Example #8
0
class PilotStatusAgent( AgentModule ):
  """
      Agent that queries the grid WMS for the status of pilots, updates the
      PilotAgentsDB accordingly, sends pilot accounting records and cleans up
      pilots that were not updated for too long.

      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  # Pilot states that are still polled in the WMS
  queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
  # Pilot states after which a pilot is accounted instead of simply updated
  finalStateList = [ 'Done', 'Aborted', 'Cleared', 'Deleted', 'Failed' ]
  # Fields used to group the pilots before querying their status
  identityFieldsList = [ 'OwnerDN', 'OwnerGroup', 'GridType', 'Broker' ]
  # Only pilots of these grid types are processed by execute()
  eligibleGridTypes = [ 'gLite' ]

  #############################################################################
  def initialize( self ):
    """ Set the default agent options and create the DB / admin clients

        :return: S_OK()
    """
    defaultOptions = ( ( 'PollingTime', 120 ),
                       ( 'GridEnv', '' ),
                       ( 'PilotStalledDays', 3 ) )
    for optionName, optionValue in defaultOptions:
      self.am_setOption( optionName, optionValue )

    # Clients used by the other methods of the agent
    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    return S_OK()

  #############################################################################
  def execute( self ):
    """ The PilotAgent execution method.

        Reads the agent options, opens a PilotAgentsDB connection and polls
        the status of the pilots through _pollPilots. The connection is
        closed on every exit path, including early error returns.

        :return: S_OK(), or the failing result of a DB call
    """

    self.pilotStalledDays = self.am_getOption( 'PilotStalledDays', 3 )
    self.gridEnv = self.am_getOption( 'GridEnv' )
    if not self.gridEnv:
      # No specific option found, try a general one
      setup = gConfig.getValue( '/DIRAC/Setup', '' )
      if setup:
        instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
        if instance:
          self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

    result = self.pilotDB._getConnection()
    if not result['OK']:
      return result
    connection = result['Value']

    # try/finally: the connection used to leak whenever the method returned
    # before reaching the close() call at its very end
    try:
      return self._pollPilots( connection )
    finally:
      connection.close()

  def _pollPilots( self, connection ):
    """ Query the WMS status of all eligible pilots and record it (helper of execute)

        :param connection: open PilotAgentsDB connection, closed by the caller
        :return: S_OK(), or the failing result of a DB call
    """
    result = self.pilotDB.getPilotGroups( self.identityFieldsList,
                                          {'Status': self.queryStateList } )
    if not result['OK']:
      self.log.error( 'Fail to get identities Groups', result['Message'] )
      return result
    pilotGroups = result['Value']
    if not pilotGroups:
      return S_OK()

    pilotsToAccount = {}

    for ownerDN, ownerGroup, gridType, broker in pilotGroups:

      if gridType not in self.eligibleGridTypes:
        continue

      self.log.verbose( 'Getting pilots for %s:%s @ %s %s' % ( ownerDN, ownerGroup, gridType, broker ) )

      # Pilots already reported Done by the JobAgent ...
      condDict1 = {'Status':'Done',
                   'StatusReason':'Report from JobAgent',
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      # ... and pilots still in one of the polled states
      condDict2 = {'Status':self.queryStateList,
                   'OwnerDN':ownerDN,
                   'OwnerGroup':ownerGroup,
                   'GridType':gridType,
                   'Broker':broker}

      for condDict in [ condDict1, condDict2 ]:
        result = self.clearWaitingPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to clear Waiting Pilot Jobs' )

        result = self.pilotDB.selectPilots( condDict )
        if not result['OK']:
          self.log.warn( 'Failed to get the Pilot Agents' )
          return result
        if not result['Value']:
          continue
        refList = result['Value']

        ret = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup )
        if not ret['OK']:
          self.log.error( ret['Message'] )
          self.log.error( 'Could not get proxy:', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
          continue
        proxy = ret['Value']

        self.log.verbose( "Getting status for %s pilots for owner %s and group %s" % ( len( refList ),
                                                                                      ownerDN, ownerGroup ) )

        # Query the WMS in chunks of at most MAX_JOBS_QUERY references
        for start_index in range( 0, len( refList ), MAX_JOBS_QUERY ):
          refsToQuery = refList[ start_index : start_index + MAX_JOBS_QUERY ]
          self.log.verbose( 'Querying %d pilots of %s starting at %d' %
                            ( len( refsToQuery ), len( refList ), start_index ) )
          result = self.getPilotStatus( proxy, gridType, refsToQuery )
          if not result['OK']:
            if result['Message'] == 'Broker not Available':
              # No point in querying the remaining chunks of this broker
              self.log.error( 'Broker %s not Available' % broker )
              break
            self.log.warn( 'Failed to get pilot status:' )
            self.log.warn( '%s:%s @ %s' % ( ownerDN, ownerGroup, gridType ) )
            continue

          statusDict = result[ 'Value' ]
          for pRef in statusDict:
            pDict = statusDict[ pRef ]
            if not pDict:
              continue
            if pDict['isParent']:
              self.log.verbose( 'Clear parametric parent %s' % pRef )
              result = self.clearParentJob( pRef, pDict, connection )
              if not result['OK']:
                self.log.warn( result['Message'] )
              else:
                self.log.info( 'Parametric parent removed: %s' % pRef )
            if pDict[ 'FinalStatus' ]:
              # Final states are stored by accountPilots, after accounting
              self.log.verbose( 'Marking Status for %s to %s' % ( pRef, pDict['Status'] ) )
              pilotsToAccount[ pRef ] = pDict
            else:
              self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
              result = self.pilotDB.setPilotStatus( pRef,
                                                    pDict['Status'],
                                                    pDict['DestinationSite'],
                                                    updateTime = pDict['StatusDate'],
                                                    conn = connection )
              if not result['OK']:
                self.log.warn( 'Failed to set status for pilot %s' % pRef, result['Message'] )

          if len( pilotsToAccount ) > 100:
            self.accountPilots( pilotsToAccount, connection )
            pilotsToAccount = {}

    self.accountPilots( pilotsToAccount, connection )
    # Now handle pilots not updated in the last N days (most likely the Broker is no
    # longer available) and declare them Deleted.
    result = self.handleOldPilots( connection )
    if not result['OK']:
      self.log.warn( 'Failed to handle old pilots', result['Message'] )

    return S_OK()

  def clearWaitingPilots( self, condDict ):
    """ Clear pilots in the faulty Waiting state

        Pilots of the given identity that are in 'Waiting' state and older
        than MAX_WAITING_STATE_LENGTH hours are set to 'Stalled'.

        :param condDict: dict providing 'OwnerDN', 'OwnerGroup', 'GridType'
                         and 'Broker' (other keys are ignored)
        :return: S_OK(), or the failing result of the pilot selection
    """

    last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
    clearDict = {'Status':'Waiting',
                 'OwnerDN':condDict['OwnerDN'],
                 'OwnerGroup':condDict['OwnerGroup'],
                 'GridType':condDict['GridType'],
                 'Broker':condDict['Broker']}
    result = self.pilotDB.selectPilots( clearDict, older = last_update )
    if not result['OK']:
      self.log.warn( 'Failed to get the Pilot Agents for Waiting state' )
      return result
    if not result['Value']:
      return S_OK()
    refList = result['Value']

    for pilotRef in refList:
      # The message used to announce 'Aborted' although 'Stalled' is set
      self.log.info( 'Setting Waiting pilot to Stalled: %s' % pilotRef )
      setResult = self.pilotDB.setPilotStatus( pilotRef, 'Stalled', statusReason = 'Exceeded max waiting time' )
      if not setResult['OK']:
        # Best effort: report the failure and carry on with the next pilot
        self.log.warn( 'Failed to set pilot %s to Stalled' % pilotRef, setResult['Message'] )

    return S_OK()

  def clearParentJob( self, pRef, pDict, connection ):
    """ Clear the parametric parent job from the PilotAgentsDB

        If at least one child is already known to the DB the parent is simply
        deleted. Otherwise the children are first registered with the
        identity and task queue of the parent and given their current status.

        :param pRef: reference of the parent pilot
        :param pDict: status dict of the parent; 'ChildRefs' and 'ChildDicts'
                      are read
        :param connection: DB connection forwarded to the PilotAgentsDB calls
        :return: result of deleting the parent, or the S_ERROR describing why
                 the children could not be registered
    """

    childList = pDict['ChildRefs']

    # Check that at least one child is in the database
    children_ok = False
    for child in childList:
      result = self.pilotDB.getPilotInfo( child, conn = connection )
      if result['OK'] and result['Value']:
        children_ok = True
        # One known child is enough, no need to query the others
        break

    if children_ok:
      return self.pilotDB.deletePilot( pRef, conn = connection )

    self.log.verbose( 'Adding children for parent %s' % pRef )
    result = self.pilotDB.getPilotInfo( pRef, conn = connection )
    # The result used to be indexed blindly, raising KeyError on failure
    if not result['OK']:
      return result
    if pRef not in result['Value']:
      return S_ERROR( 'No info found for parent %s' % pRef )
    parentInfo = result['Value'][pRef]

    result = self.pilotDB.addPilotTQReference( childList,
                                               parentInfo['TaskQueueID'],
                                               parentInfo['OwnerDN'],
                                               parentInfo['OwnerGroup'],
                                               broker = parentInfo['Broker'],
                                               gridType = parentInfo['GridType'] )
    if not result['OK']:
      return result

    children_added = True
    for chRef, chDict in pDict['ChildDicts'].items():
      result = self.pilotDB.setPilotStatus( chRef, chDict['Status'],
                                            destination = chDict['DestinationSite'],
                                            conn = connection )
      if not result['OK']:
        children_added = False
    if not children_added:
      return S_ERROR( 'Failed to add children' )

    # Report a failed deletion instead of discarding its result, as the
    # branch for already known children does
    return self.pilotDB.deletePilot( pRef, conn = connection )

  def handleOldPilots( self, connection ):
    """ Select all pilots that have not been updated in the last N days,
        declare them Deleted, account for them and kill them.

        Pilots with at least one job updated within the same period are left
        alone (see _checkJobLastUpdateTime).

        :param connection: DB connection forwarded to accountPilots
        :return: S_OK(), or the failing result of one of the DB queries
    """
    pilotsToAccount = {}
    timeLimitToConsider = Time.toString( Time.dateTime() - Time.day * self.pilotStalledDays )
    # Filter on LastUpdateTime using the limit computed above (an earlier
    # version passed older=None, which was flagged as a bug)
    result = self.pilotDB.selectPilots( { 'Status':self.queryStateList} ,
                                        older = timeLimitToConsider,
                                        timeStamp = 'LastUpdateTime' )
    if not result['OK']:
      self.log.error( 'Failed to get the Pilot Agents' )
      return result
    if not result['Value']:
      return S_OK()

    refList = result['Value']
    result = self.pilotDB.getPilotInfo( refList )
    if not result['OK']:
      self.log.error( 'Failed to get Info for Pilot Agents' )
      return result

    pilotsDict = result['Value']

    for pRef in pilotsDict:
      pilotInfo = pilotsDict[pRef]
      # .get() and truthiness replace has_key() / len() > 0; has_key() no
      # longer exists on Python 3 dictionaries
      jobs = pilotInfo.get( 'Jobs' )
      if jobs and self._checkJobLastUpdateTime( jobs, self.pilotStalledDays ):
        self.log.debug('%s should not be deleted since one job of %s is running.' % ( str(pRef) , str(jobs) ) )
        continue
      pilotInfo['Status'] = 'Deleted'
      pilotInfo['StatusDate'] = Time.dateTime()
      pilotsToAccount[ pRef ] = pilotInfo
      # Flush once the batch holds more than 100 pilots
      if len( pilotsToAccount ) > 100:
        self.accountPilots( pilotsToAccount, connection )
        self._killPilots( pilotsToAccount )
        pilotsToAccount = {}

    # Handle whatever is left in the last (possibly empty) batch
    self.accountPilots( pilotsToAccount, connection )
    self._killPilots( pilotsToAccount )

    return S_OK()

  def accountPilots( self, pilotsToAccount, connection ):
    """ Send accounting records for the given pilots and store their status

        When the 'PilotAccountingEnabled' option is 'yes' (the default), the
        DB info of the pilots is fetched, overridden with the reported status
        unless the DB already holds a final state, converted into accounting
        records and committed. The pilot statuses are written to the DB only
        if accounting is disabled or the records were sent successfully.

        :param pilotsToAccount: dict of pilot reference -> status dict; the
                                keys 'Status', 'DestinationSite' and
                                'StatusDate' are read
        :param connection: DB connection forwarded to the PilotAgentsDB calls
        :return: S_OK() (also when the commit fails, which is only logged), or
                 the failing result of the DB lookup / record creation
    """
    pae = self.am_getOption( 'PilotAccountingEnabled', 'yes' )
    accountingFlag = pae.lower() == "yes"

    if not pilotsToAccount:
      self.log.info( 'No pilots to Account' )
      return S_OK()

    accountingSent = False
    if accountingFlag:
      # list() instead of .keys() so a real list is passed on Python 3 too
      retVal = self.pilotDB.getPilotInfo( list( pilotsToAccount ), conn = connection )
      if not retVal['OK']:
        self.log.error( 'Fail to retrieve Info for pilots', retVal['Message'] )
        return retVal
      dbData = retVal[ 'Value' ]
      for pRef in dbData:
        if pRef not in pilotsToAccount:
          continue
        dbInfo = dbData[pRef]
        # A final state already stored in the DB wins over the report
        if dbInfo[ 'Status' ] in self.finalStateList:
          continue
        reported = pilotsToAccount[pRef]
        dbInfo[ 'Status' ] = reported[ 'Status' ]
        dbInfo[ 'DestinationSite' ] = reported[ 'DestinationSite' ]
        dbInfo[ 'LastUpdateTime' ] = reported[ 'StatusDate' ]

      retVal = self.__addPilotsAccountingReport( dbData )
      if not retVal['OK']:
        # This used to repeat the "retrieve Info" message, which pointed at
        # the wrong step when reading the logs
        self.log.error( 'Fail to add accounting reports for pilots', retVal['Message'] )
        return retVal

      self.log.info( "Sending accounting records..." )
      retVal = gDataStoreClient.commit()
      if not retVal[ 'OK' ]:
        self.log.error( "Can't send accounting reports", retVal[ 'Message' ] )
      else:
        self.log.info( "Accounting sent for %s pilots" % len( pilotsToAccount ) )
        accountingSent = True

    if not accountingFlag or accountingSent:
      for pRef in pilotsToAccount:
        pDict = pilotsToAccount[pRef]
        self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
        result = self.pilotDB.setPilotStatus( pRef,
                                              pDict['Status'],
                                              pDict['DestinationSite'],
                                              pDict['StatusDate'],
                                              conn = connection )
        if not result['OK']:
          # Best effort: keep going, but do not hide the failure
          self.log.warn( 'Failed to set status for pilot %s' % pRef, result['Message'] )

    return S_OK()

  #############################################################################
  def getPilotStatus( self, proxy, gridType, pilotRefList ):
    """ Get the grid WMS status of a list of pilots

        Runs 'edg-job-status' (LCG) or 'glite-wms-job-status' (gLite) with the
        given proxy and parses its output.

        :param proxy: proxy passed to executeGridCommand
        :param gridType: 'LCG' or 'gLite'; anything else yields S_ERROR()
        :param pilotRefList: list of pilot references to query
        :return: S_OK( dict ) mapping each pilot reference to a status dict
                 with the keys 'Status', 'DestinationSite', 'StatusDate',
                 'isChild', 'isParent', 'ParentRef', 'FinalStatus',
                 'ChildRefs' and 'ChildDicts';
                 S_ERROR( 'Broker not Available' ) when the broker refuses
                 the connection; S_ERROR() on any other failure
    """

    if gridType == 'LCG':
      cmd = [ 'edg-job-status' ]
    elif gridType == 'gLite':
      cmd = [ 'glite-wms-job-status' ]
    else:
      return S_ERROR()
    cmd.extend( pilotRefList )

    start = time.time()
    ret = executeGridCommand( proxy, cmd, self.gridEnv )
    self.log.info( '%s Job Status Execution Time for %d jobs:' %
                   ( gridType, len( pilotRefList ) ), time.time() - start )

    if not ret['OK']:
      self.log.error( 'Failed to execute %s Job Status' % gridType, ret['Message'] )
      return S_ERROR()

    returnCode, stdout, stderr = ret['Value'][0:3]
    resultDict = {}

    if returnCode == 0:
      for job in List.fromChar( stdout, '\nStatus info for the Job :' )[1:]:
        pRef = List.fromChar( job, '\n' )[0].strip()
        resultDict[pRef] = self.__parseJobStatus( job, gridType )
      return S_OK( resultDict )

    # Non-zero exit code: look for pilots the WMS no longer knows about
    status = 'Deleted'
    statusDate = Time.dateTime()

    def deletedEntry():
      # Build a fresh dict (and fresh containers) per pilot: sharing one
      # mutable dict between all entries lets a later update of one pilot
      # silently change the others
      return { 'Status': status,
               'DestinationSite': 'Unknown',
               'StatusDate': statusDate,
               'isChild': False,
               'isParent': False,
               'ParentRef': False,
               'FinalStatus' : status in self.finalStateList,
               'ChildRefs' : [],
               'ChildDicts' : {} }

    deleted = 0
    unableMarker = '\nUnable to retrieve the status for:'
    # Glite returns this error for Deleted jobs to std.err
    for job in List.fromChar( stderr, unableMarker )[1:]:
      pRef = List.fromChar( job, '\n' )[0].strip()
      resultDict[pRef] = deletedEntry()
      self.pilotDB.setPilotStatus( pRef, "Deleted" )
      deleted += 1
    # EDG returns a similar error for Deleted jobs to std.out
    for job in List.fromChar( stdout, unableMarker )[1:]:
      pRef = List.fromChar( job, '\n' )[0].strip()
      # Plain substring tests: the texts are literals, not patterns
      if "No such file or directory: no matching jobs found" in job:
        resultDict[pRef] = deletedEntry()
        self.pilotDB.setPilotStatus( pRef, "Deleted" )
        deleted += 1
      # The former regex ended in "()", an empty group, so it matched exactly
      # this text without requiring the parentheses
      if "edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect" in job:
        # the Broker is not accessible
        return S_ERROR( 'Broker not Available' )
    if not deleted:
      self.log.error( 'Error executing %s Job Status:' %
                      gridType, str( returnCode ) + '\n'.join( [ stdout, stderr ] ) )
      return S_ERROR()
    return S_OK( resultDict )

  def __parseJobStatus( self, job, gridType ):
    """ Parse the output of the grid pilot status command for one job

        :param job: text of the status report of a single job (for a parent
                    job it also contains the reports of its children)
        :param gridType: 'LCG' or 'gLite'; selects which date field is parsed
        :return: dict with the keys 'Status', 'DestinationSite', 'StatusDate',
                 'isChild', 'isParent', 'ParentRef', 'FinalStatus',
                 'ChildRefs' and 'ChildDicts' (child reference -> dict of the
                 same layout)
    """

    # Raw strings: '\s', '\w', '\(' are not valid string escapes
    statusRE = r'Current Status:\s*(\w*)'
    destinationRE = r'Destination:\s*([\w\.-]*)'
    statusDateLCGRE = r'reached on:\s*....(.*)'
    submittedDateRE = r'Submitted:\s*....(.*)'
    statusFailedRE = r'Current Status:.*\(Failed\)'
    dateFormat = '%Y-%m-%d %H:%M:%S'

    status = None
    destination = 'Unknown'
    statusDate = None
    submittedDate = None

    try:
      # A missing status line raises AttributeError, handled below
      status = re.search( statusRE, job ).group( 1 )
      if status == 'Done' and re.search( statusFailedRE, job ):
        status = 'Failed'
      destinationMatch = re.search( destinationRE, job )
      if destinationMatch:
        destination = destinationMatch.group( 1 )
      if gridType == 'LCG':
        dateMatch = re.search( statusDateLCGRE, job )
        if dateMatch:
          # Assign only after a successful conversion, so that an unparsable
          # date leaves None instead of the raw text
          statusDate = time.strftime( dateFormat,
                                      time.strptime( dateMatch.group( 1 ), '%b %d %H:%M:%S %Y' ) )
      if gridType == 'gLite':
        dateMatch = re.search( submittedDateRE, job )
        if dateMatch:
          submittedDate = time.strftime( dateFormat,
                                         time.strptime( dateMatch.group( 1 ), '%b %d %H:%M:%S %Y %Z' ) )
    except Exception:
      # Not a bare except: do not swallow KeyboardInterrupt/SystemExit
      self.log.exception( 'Error parsing %s Job Status output:\n' % gridType, job )

    isParent = bool( re.search( 'Nodes information', job ) )
    isChild = bool( re.search( 'Parent Job', job ) )

    if status == "Running":
      # Pilots can be in Running state for too long, due to bugs in the WMS
      if statusDate:
        statusTime = Time.fromString( statusDate )
        delta = Time.dateTime() - statusTime
        if delta > 4 * Time.day:
          self.log.info( 'Setting pilot status to Deleted after 4 days in Running' )
          status = "Deleted"
          statusDate = statusTime + 4 * Time.day
      elif submittedDate:
        statusTime = Time.fromString( submittedDate )
        delta = Time.dateTime() - statusTime
        if delta > 7 * Time.day:
          self.log.info( 'Setting pilot status to Deleted more than 7 days after submission still in Running' )
          status = "Deleted"
          statusDate = statusTime + 7 * Time.day

    childRefs = []
    childDicts = {}
    if isParent:
      # Each child report is parsed recursively with the same logic
      for subjob in List.fromChar( job, ' Status info for the Job :' )[1:]:
        chRef = List.fromChar( subjob, '\n' )[0].strip()
        childRefs.append( chRef )
        childDicts[chRef] = self.__parseJobStatus( subjob, gridType )

    return { 'Status': status,
             'DestinationSite': destination,
             'StatusDate': statusDate,
             'isChild': isChild,
             'isParent': isParent,
             'ParentRef': False,
             'FinalStatus' : status in self.finalStateList,
             'ChildRefs' : childRefs,
             'ChildDicts' : childDicts }

  def __addPilotsAccountingReport( self, pilotsData ):
    """ Register one PilotAccounting record per pilot with gDataStoreClient

        :param pilotsData: dict of pilot reference -> pilot info dict
        :return: S_OK(), or the first failing result of addRegister
    """
    for pilotRef in pilotsData:
      info = pilotsData[pilotRef]
      record = PilotAccounting()
      record.setEndTime( info[ 'LastUpdateTime' ] )
      record.setStartTime( info[ 'SubmissionTime' ] )

      # Resolve the owner; a placeholder is used when the lookup fails
      userResult = CS.getUsernameForDN( info[ 'OwnerDN' ] )
      if userResult[ 'OK' ]:
        userName = userResult[ 'Value' ]
      else:
        userName = '******'
        self.log.error( "Can't determine username for dn:", info[ 'OwnerDN' ] )
      record.setValueByKey( 'User', userName )
      record.setValueByKey( 'UserGroup', info[ 'OwnerGroup' ] )

      # Map the CE to its site, defaulting to 'Unknown'
      siteResult = getSiteForCE( info[ 'DestinationSite' ] )
      siteName = 'Unknown'
      if siteResult['OK'] and siteResult[ 'Value' ].strip():
        siteName = siteResult['Value'].strip()
      record.setValueByKey( 'Site', siteName )

      record.setValueByKey( 'GridCE', info[ 'DestinationSite' ] )
      record.setValueByKey( 'GridMiddleware', info[ 'GridType' ] )
      record.setValueByKey( 'GridResourceBroker', info[ 'Broker' ] )
      record.setValueByKey( 'GridStatus', info[ 'Status' ] )

      jobCount = len( info['Jobs'] ) if 'Jobs' in info else 0
      record.setValueByKey( 'Jobs', jobCount )

      self.log.verbose( "Added accounting record for pilot %s" % info[ 'PilotID' ] )
      registered = gDataStoreClient.addRegister( record )
      if not registered[ 'OK' ]:
        return registered
    return S_OK()

  def _killPilots( self, acc ):
    """ Kill, through DiracAdmin, every pilot referenced in acc

        A pilot is killed only if its info can be retrieved and contains a
        'Status'; every outcome is logged and nothing is returned.

        :param acc: dict keyed by pilot reference (the values are not used)
    """
    for pilotRef in sorted( acc ):
      result = self.diracadmin.getPilotInfo( pilotRef )
      # 'in' replaces has_key(), which no longer exists on Python 3 dicts
      if result['OK'] and pilotRef in result['Value'] and 'Status' in result['Value'][pilotRef]:
        ret = self.diracadmin.killPilot( str( pilotRef ) )
        if ret['OK']:
          self.log.info("Successfully deleted: %s (Status : %s)" % (pilotRef, result['Value'][pilotRef]['Status'] ) )
        else:
          self.log.error("Failed to delete %s : %s"  % ( pilotRef, ret['Message']))
      else:
        self.log.error("Failed to get info. of %s : %s" % ( pilotRef, str(result)))

  def _checkJobLastUpdateTime( self, joblist , StalledDays ):
    """ Tell whether any of the given jobs was updated recently

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining "recently"
        :return: True as soon as one job has a 'LastUpdateTime' newer than
                 now - StalledDays days, False otherwise
    """
    timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
    for jobID in joblist:
      result = self.jobDB.getJobAttributes( int( jobID ) )
      if not result['OK']:
        self.log.error("Error taking job info. from DB:%s" % str( result['Message'] ) )
        continue
      # 'in' replaces has_key(), which no longer exists on Python 3 dicts
      if 'LastUpdateTime' not in result['Value']:
        continue
      lastUpdateTime = result['Value']['LastUpdateTime']
      parsedTime = Time.fromString( lastUpdateTime )
      # NOTE(review): presumably fromString yields a false value for an
      # unparsable timestamp - confirm; the guard avoids comparing it with
      # a datetime
      if parsedTime and parsedTime > timeLimitToConsider:
        self.log.debug( 'Since %s updates LastUpdateTime on %s, this does not to need to be deleted.' %
                        ( str( jobID ), str( lastUpdateTime ) ) )
        return True
    return False
Example #9
0
class Matcher( object ):
  """ Logic for matching
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ Constructor

        Every argument that is missing (or evaluates to False) is replaced by a
        newly created object of the corresponding default class.

        :param pilotAgentsDB: pilot agents DB object, PilotAgentsDB() by default
        :param jobDB: job DB object, JobDB() by default
        :param tqDB: task queue DB object, TaskQueueDB() by default
        :param jlDB: job logging DB object, JobLoggingDB() by default
        :param opsHelper: operations helper, Operations() by default
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    # The limiter works on the job DB and operations helper selected above
    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )


  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity
    """

    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

    if not result['OK']:
      return result
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if not resAtt['Value']['Status'] == 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        return result
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      for key, value in resOpt['Value'].items():
        resultDict[key] = value
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict


  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
        It is either a JDL string, which is parsed with ClassAd, or a dictionary.
        Only the fields listed in singleValueDefFields and multiValueMatchFields, JobID and
        the version/VO keys are copied; the dictionary form also carries over
        PilotReference, PilotBenchmark and PilotInfoReportedFlag.

        :param resourceDescription: JDL string or dictionary describing the resource
        :return: dictionary with the recognised fields
        :raises ValueError: if the JDL string is not accepted by ClassAd
    """
    versionKeys = ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' )

    resourceDict = {}
    # isinstance() also accepts subclasses of the string types, unlike a lookup of type()
    if isinstance( resourceDescription, StringTypes ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          # CPUTime is the only single value field read as an integer
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          # SubmitPool is the only multi value field read as a list
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in versionKeys:
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      # "in" replaces dict.has_key(), which does not exist any more in Python 3
      for fieldNames in ( singleValueDefFields, multiValueMatchFields ):
        for name in fieldNames:
          if name in resourceDescription:
            resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in versionKeys + ( 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict



  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID,
                                         status = 'Matched',
                                         minor = 'Assigned',
                                         source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )


  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
    """
    if not 'Site' in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    if resourceDict['Site'] not in maskList:
      return False

    return True

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials

        The OwnerDN / OwnerGroup entries of resourceDict are set according to the
        properties found in the credentials.

        :param resourceDict: resource dictionary, modified in place
        :param credDict: credentials dictionary; the keys properties, group and DN are read
        :return: the same resourceDict object
        :raises RuntimeError: with the message of a failed Registry query
    """
    properties = credDict[ 'properties' ]

    if Properties.GENERIC_PILOT in properties:
      # Generic pilot: you can only match groups in the same VO
      vo = Registry.getVOForGroup( credDict[ 'group' ] )
      voGroups = Registry.getGroupsForVO( vo )
      if not voGroups[ 'OK' ]:
        raise RuntimeError( voGroups['Message'] )
      resourceDict[ 'OwnerGroup' ] = voGroups[ 'Value' ]

    elif Properties.PILOT in properties:
      # If it's a private pilot, the DN has to be the same
      self.log.notice( "Setting the resource DN to the credentials DN" )
      resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]

    elif Properties.JOB_SHARING in properties:
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
      self.log.notice( "Setting the resource group to the credentials group" )
      if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
        ownerDN = resourceDict[ 'OwnerDN' ]
        dnGroups = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
        if not dnGroups[ 'OK' ]:
          raise RuntimeError( dnGroups['Message'] )
        if credDict[ 'group' ] not in dnGroups[ 'Value' ]:
          # DN is not in the same group! bad boy.
          self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
          resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]

    else:
      # Nothing special, group and DN have to be the same
      resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' not in resourceDict:
        if not 'DIRACVersion' in resourceDict:
          raise RuntimeError( 'Version check requested and not provided by Pilot' )
        else:
          pilotVersion = resourceDict['DIRACVersion']
      else:
        pilotVersion = resourceDict['ReleaseVersion']

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \
                            ( pilotVersion, ",".join( validVersions ) ) )
      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                   resourceDict[ 'ReleaseProject' ] ) )
Example #10
0
class Matcher(object):
    """Logic for matching"""
    def __init__(self,
                 pilotAgentsDB=None,
                 jobDB=None,
                 tqDB=None,
                 jlDB=None,
                 opsHelper=None,
                 pilotRef=None):
        """Set up the databases, helpers and loggers used for matching.

        Every database or helper argument that is missing (or evaluates to
        False) is replaced by a new object of the corresponding default class.

        :param pilotAgentsDB: pilot agents DB object, PilotAgentsDB() by default
        :param jobDB: job DB object, JobDB() by default
        :param tqDB: task queue DB object, TaskQueueDB() by default
        :param jlDB: job logging DB object, JobLoggingDB() by default
        :param opsHelper: operations helper, Operations() by default
        :param pilotRef: when set, it prefixes the logger name used by this
                         object and by the four DB objects; it is also passed
                         to the Limiter
        """
        self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
        self.jobDB = jobDB or JobDB()
        self.tqDB = tqDB or TaskQueueDB()
        self.jlDB = jlDB or JobLoggingDB()
        self.opsHelper = opsHelper or Operations()

        if pilotRef:
            loggerName = "[%s]Matcher" % pilotRef
            self.log = gLogger.getSubLogger(loggerName)
            # Each DB gets its own sub logger request, under the same name
            for db in (self.pilotAgentsDB, self.jobDB, self.tqDB, self.jlDB):
                db.log = gLogger.getSubLogger(loggerName)
        else:
            self.log = gLogger.getSubLogger("Matcher")

        self.limiter = Limiter(jobDB=self.jobDB,
                               opsHelper=self.opsHelper,
                               pilotRef=pilotRef)

        self.siteClient = SiteStatus()

    def selectJob(self, resourceDescription, credDict):
        """Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: description of the resource, handed over
                                    to _getResourceDict; its MaxRAM and
                                    NumberOfProcessors entries, if present,
                                    are added to the logged summary
        :param credDict: credentials dictionary, handed over to
                         _getResourceDict
        :return: empty dictionary when no job matches, otherwise a dictionary
                 with the keys JDL, JobID, DN, Group and PilotInfoReportedFlag
                 plus the job optimizer parameters when they could be
                 retrieved
        :raises RuntimeError: if the task queue query fails, if the job
                              attributes or the JDL cannot be retrieved, or if
                              the matched job is not in Waiting state
        """
        tStart = time.time()

        resourceDict = self._getResourceDict(resourceDescription, credDict)

        # Make a nice print of the resource matching parameters
        summary = dict(resourceDict)
        for param in ("MaxRAM", "NumberOfProcessors"):
            if param in resourceDescription:
                summary[param] = resourceDescription[param]
        # Tags ending in "GB" or "Processors" are left out of the summary
        shownTags = []
        if "Tag" in resourceDict:
            shownTags = [
                tag for tag in resourceDict["Tag"]
                if not tag.endswith(("GB", "Processors"))
            ]
        if shownTags:
            summary["Tag"] = shownTags
        else:
            summary.pop("Tag", None)
        self.log.info("Resource description for matching", printDict(summary))

        siteCondition = self.limiter.getNegativeCondForSite(
            resourceDict["Site"], resourceDict.get("GridCE"))
        matchResult = self.tqDB.matchAndGetJob(resourceDict,
                                               negativeCond=siteCondition)
        if not matchResult["OK"]:
            raise RuntimeError(matchResult["Message"])

        matchInfo = matchResult["Value"]
        if not matchInfo["matchFound"]:
            self.log.info("No match found")
            return {}
        jobID = matchInfo["jobId"]

        jobAttributes = self.jobDB.getJobAttributes(
            jobID, ["OwnerDN", "OwnerGroup", "Status"])
        if not jobAttributes["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not jobAttributes["Value"]:
            raise RuntimeError("No attributes returned for job")
        if jobAttributes["Value"]["Status"] != "Waiting":
            # The task queue handed out a job that should not be there:
            # drop it from the TQ
            self.log.error("Job matched by the TQ is not in Waiting state",
                           str(jobID))
            removal = self.tqDB.deleteJob(jobID)
            if not removal["OK"]:
                raise RuntimeError(removal["Message"])
            raise RuntimeError("Job %s is not in Waiting state" % str(jobID))

        self._reportStatus(resourceDict, jobID)

        jdlResult = self.jobDB.getJobJDL(jobID)
        if not jdlResult["OK"]:
            raise RuntimeError("Failed to get the job JDL")
        resultDict = {"JDL": jdlResult["Value"], "JobID": jobID}

        matchTime = time.time() - tStart
        self.log.verbose("Match time", "[%s]" % str(matchTime))
        gMonitor.addMark("matchTime", matchTime)

        # Get some extra stuff into the response returned
        optParameters = self.jobDB.getJobOptParameters(jobID)
        if optParameters["OK"]:
            resultDict.update(optParameters["Value"].items())
        ownerAttributes = self.jobDB.getJobAttributes(
            jobID, ["OwnerDN", "OwnerGroup"])
        if not ownerAttributes["OK"]:
            raise RuntimeError("Could not retrieve job attributes")
        if not ownerAttributes["Value"]:
            raise RuntimeError("No attributes returned for job")

        if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            self.limiter.updateDelayCounters(resourceDict["Site"], jobID)

        # The pilot info is reported only if the resource did not flag it as
        # already reported
        if not resourceDict.get("PilotInfoReportedFlag", False):
            self._updatePilotInfo(resourceDict)
        self._updatePilotJobMapping(resourceDict, jobID)

        resultDict["DN"] = ownerAttributes["Value"]["OwnerDN"]
        resultDict["Group"] = ownerAttributes["Value"]["OwnerGroup"]
        resultDict["PilotInfoReportedFlag"] = True

        return resultDict

    def _getResourceDict(self, resourceDescription, credDict):
        """from resourceDescription to resourceDict (just various mods)"""
        resourceDict = self._processResourceDescription(resourceDescription)
        resourceDict = self._checkCredentials(resourceDict, credDict)
        self._checkPilotVersion(resourceDict)
        if not self._checkMask(resourceDict):
            # Banned destinations can only take Test jobs
            resourceDict["JobType"] = "Test"

        self.log.verbose("Resource description")
        for key in resourceDict:
            self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key]))

        return resourceDict

    def _processResourceDescription(self, resourceDescription):
        """Check and form the resource description dictionary

        The input dictionary is not modified: a Tag list given by the caller
        is copied before the computed tags are added to it.

        :param resourceDescription: a ceDict coming from a JobAgent,
                                    for example.
        :return: updated dictionary of resource description parameters
        """

        resourceDict = {}
        for fieldNames in (singleValueDefFields, multiValueMatchFields):
            for name in fieldNames:
                if name in resourceDescription:
                    resourceDict[name] = resourceDescription[name]

        if resourceDescription.get("Tag"):
            tags = resourceDescription["Tag"]
            if isinstance(tags, list):
                # Copy: the list is extended below and must not alias the
                # caller's own list
                resourceDict["Tag"] = list(tags)
            else:
                # String form such as '["a", "b"]': strip brackets and quotes
                resourceDict["Tag"] = list(
                    {tag.strip("\"' ")
                     for tag in tags.strip("[]").split(",")})
            # RequiredTag is only looked at when Tag is given
            if "RequiredTag" in resourceDescription:
                requiredTags = resourceDescription["RequiredTag"]
                if isinstance(requiredTags, str):
                    requiredTags = list({
                        tag.strip("\"' ")
                        for tag in requiredTags.strip("[]").split(",")
                    })
                resourceDict["RequiredTag"] = requiredTags

        if "JobID" in resourceDescription:
            resourceDict["JobID"] = resourceDescription["JobID"]

        # Convert MaxRAM and NumberOfProcessors parameters into a list of tags
        maxRAM = resourceDescription.get("MaxRAM")
        if maxRAM:
            try:
                # float() lets numeric strings through; a value that cannot be
                # converted is ignored instead of raising TypeError on the
                # division
                maxRAM = int(float(maxRAM) / 1000)
            except (TypeError, ValueError, OverflowError):
                maxRAM = None
        nProcessors = resourceDescription.get("NumberOfProcessors")
        if nProcessors:
            try:
                nProcessors = int(nProcessors)
            except (TypeError, ValueError):
                nProcessors = None
        for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]:
            # One tag per value from 2 up to the parameter, e.g. 2GB, 3GB, ...
            if param and param <= 1024:
                paramTags = ["%d%s" % (par, key) for par in range(2, param + 1)]
                if paramTags:
                    resourceDict.setdefault("Tag", []).extend(paramTags)

        # Add 'MultiProcessor' to the list of tags
        if nProcessors and nProcessors > 1:
            resourceDict.setdefault("Tag", []).append("MultiProcessor")

        # Add 'WholeNode' to the list of tags
        if "WholeNode" in resourceDescription:
            resourceDict.setdefault("Tag", []).append("WholeNode")

        # Remove duplicated tags
        if "Tag" in resourceDict:
            resourceDict["Tag"] = list(set(resourceDict["Tag"]))
        if "RequiredTag" in resourceDict:
            resourceDict["RequiredTag"] = list(set(
                resourceDict["RequiredTag"]))

        for k in (
                "DIRACVersion",
                "ReleaseVersion",
                "ReleaseProject",
                "VirtualOrganization",
                "PilotReference",
                "PilotBenchmark",
                "PilotInfoReportedFlag",
        ):
            if k in resourceDescription:
                resourceDict[k] = resourceDescription[k]

        return resourceDict

    def _reportStatus(self, resourceDict, jobID):
        """Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here: a failed update is only logged.

        :param resourceDict: resource dictionary, its Site entry is stored
                             with the job
        :param jobID: ID of the matched job
        """
        newAttributes = [
            ("Status", "Matched"),
            ("MinorStatus", "Assigned"),
            ("ApplicationStatus", "Unknown"),
            ("Site", resourceDict["Site"]),
        ]
        attNames = [name for name, _ in newAttributes]
        attValues = [value for _, value in newAttributes]

        outcome = self.jobDB.setJobAttributes(jobID, attNames, attValues)
        if outcome["OK"]:
            self.log.verbose("Set job attributes for jobID", jobID)
        else:
            self.log.error(
                "Problem reporting job status",
                "setJobAttributes, jobID = %s: %s" %
                (jobID, outcome["Message"]))

        outcome = self.jlDB.addLoggingRecord(jobID,
                                             status=JobStatus.MATCHED,
                                             minorStatus="Assigned",
                                             source="Matcher")
        if outcome["OK"]:
            self.log.verbose("Added logging record for jobID", jobID)
        else:
            self.log.error(
                "Problem reporting job status",
                "addLoggingRecord, jobID = %s: %s" %
                (jobID, outcome["Message"]))

    def _checkMask(self, resourceDict):
        """Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?
        """
        if "Site" not in resourceDict:
            self.log.error("Missing Site Name in Resource JDL")
            raise RuntimeError("Missing Site Name in Resource JDL")

        # Check if site is allowed
        result = self.siteClient.getUsableSites(resourceDict["Site"])
        if not result["OK"]:
            self.log.error("Internal error",
                           "siteClient.getUsableSites: %s" % result["Message"])
            raise RuntimeError("Internal error")

        if resourceDict["Site"] not in result["Value"]:
            return False

        return True

    def _updatePilotInfo(self, resourceDict):
        """Update pilot information - do not fail if we don't manage to do it"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            gridCE = resourceDict.get("GridCE", "Unknown")
            site = resourceDict.get("Site", "Unknown")
            benchmark = resourceDict.get("PilotBenchmark", 0.0)
            self.log.verbose(
                "Reporting pilot info",
                "for %s: gridCE=%s, site=%s, benchmark=%f" %
                (pilotReference, gridCE, site, benchmark),
            )

            result = self.pilotAgentsDB.setPilotStatus(
                pilotReference,
                status=PilotStatus.RUNNING,
                gridSite=site,
                destination=gridCE,
                benchmark=benchmark)
            if not result["OK"]:
                self.log.warn(
                    "Problem updating pilot information",
                    "; setPilotStatus. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _updatePilotJobMapping(self, resourceDict, jobID):
        """Update pilot to job mapping information"""
        pilotReference = resourceDict.get("PilotReference", "")
        if pilotReference and pilotReference != "Unknown":
            result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    ";setCurrentJobID. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )
            result = self.pilotAgentsDB.setJobForPilot(jobID,
                                                       pilotReference,
                                                       updateStatus=False)
            if not result["OK"]:
                self.log.error(
                    "Problem updating pilot information",
                    "; setJobForPilot. pilotReference: %s; %s" %
                    (pilotReference, result["Message"]),
                )

    def _checkCredentials(self, resourceDict, credDict):
        """Check if we can get a job given the passed credentials

        The OwnerDN / OwnerGroup entries of resourceDict are set according to
        the properties found in the credentials.

        :param resourceDict: resource dictionary, modified in place
        :param credDict: credentials dictionary; the keys properties, group
                         and DN are read
        :return: the same resourceDict object
        :raises RuntimeError: with the message of a failed Registry query
        """
        properties = credDict["properties"]

        if Properties.GENERIC_PILOT in properties:
            # You can only match groups in the same VO
            if credDict["group"] == "hosts":
                # for the host case the VirtualOrganization parameter
                # is mandatory in resourceDict
                vo = resourceDict.get("VirtualOrganization", "")
            else:
                vo = Registry.getVOForGroup(credDict["group"])
            # An OwnerGroup already present in the resource is left untouched
            if "OwnerGroup" not in resourceDict:
                voGroups = Registry.getGroupsForVO(vo)
                if not voGroups["OK"]:
                    raise RuntimeError(voGroups["Message"])
                resourceDict["OwnerGroup"] = voGroups["Value"]

        elif Properties.PILOT in properties:
            # If it's a private pilot, the DN has to be the same
            self.log.notice("Setting the resource DN to the credentials DN")
            resourceDict["OwnerDN"] = credDict["DN"]

        elif Properties.JOB_SHARING in properties:
            # If it's a job sharing. The group has to be the same and just check that the DN (if any)
            # belongs to the same group
            resourceDict["OwnerGroup"] = credDict["group"]
            self.log.notice(
                "Setting the resource group to the credentials group")
            if ("OwnerDN" in resourceDict
                    and resourceDict["OwnerDN"] != credDict["DN"]):
                ownerDN = resourceDict["OwnerDN"]
                dnGroups = Registry.getGroupsForDN(resourceDict["OwnerDN"])
                if not dnGroups["OK"]:
                    raise RuntimeError(dnGroups["Message"])
                if credDict["group"] not in dnGroups["Value"]:
                    # DN is not in the same group! bad boy.
                    self.log.warn(
                        "You cannot request jobs from this DN, as it does not belong to your group!",
                        "(%s)" % ownerDN,
                    )
                    resourceDict["OwnerDN"] = credDict["DN"]

        else:
            # Nothing special, group and DN have to be the same
            resourceDict["OwnerDN"] = credDict["DN"]
            resourceDict["OwnerGroup"] = credDict["group"]

        return resourceDict

    def _checkPilotVersion(self, resourceDict):
        """Check the pilot DIRAC version"""
        if self.opsHelper.getValue("Pilot/CheckVersion", True):
            if "ReleaseVersion" not in resourceDict:
                if "DIRACVersion" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested and not provided by Pilot")
                else:
                    pilotVersion = resourceDict["DIRACVersion"]
            else:
                pilotVersion = resourceDict["ReleaseVersion"]

            validVersions = [
                convertToPy3VersionNumber(newStyleVersion)
                for newStyleVersion in self.opsHelper.getValue(
                    "Pilot/Version", [])
            ]
            if validVersions and convertToPy3VersionNumber(
                    pilotVersion) not in validVersions:
                raise PilotVersionError(
                    "Pilot version does not match the production version: %s not in ( %s )"
                    % (pilotVersion, ",".join(validVersions)))
            # Check project if requested
            validProject = self.opsHelper.getValue("Pilot/Project", "")
            if validProject:
                if "ReleaseProject" not in resourceDict:
                    raise PilotVersionError(
                        "Version check requested but expected project %s not received"
                        % validProject)
                if resourceDict["ReleaseProject"] != validProject:
                    raise PilotVersionError(
                        "Version check requested but expected project %s != received %s"
                        % (validProject, resourceDict["ReleaseProject"]))