Code Example #1
    def _checkLoggingInfo(self, jobID, jobDict):
        """Get info from JobLogging"""
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result["OK"]:
            logList = result["Value"]

        startTime = jobDict["StartExecTime"]
        if not startTime or startTime == "None":
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == "Running":
                    startTime = items[3]
                    break
            if not startTime or startTime == "None":
                startTime = jobDict["SubmissionTime"]

        if isinstance(startTime, six.string_types):
            startTime = fromString(startTime)
            if startTime is None:
                self.log.error("Wrong timestamp in DB", items[3])
                startTime = dateTime()

        endTime = dateTime()
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == "Stalled":
                endTime = fromString(items[3])
        if endTime is None:
            self.log.error("Wrong timestamp in DB", items[3])
            endTime = dateTime()

        return startTime, endTime
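
The __checkLoggingInfo variants in this listing all follow the same pattern: read a timestamp string from the job record, parse it, and fall back to the current time when the value is empty, the literal string 'None', or unparseable. Below is a minimal standalone sketch of that pattern using only the standard library; parse_db_timestamp and start_time_with_fallback are illustrative stand-ins for DIRAC's fromString and dateTime helpers, not part of any DIRAC API.

from datetime import datetime


def parse_db_timestamp(value):
    """Parse a DB timestamp string; return None for empty, 'None', or unparseable values."""
    if not value or value == "None":
        return None
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None


def start_time_with_fallback(job_dict, log_list):
    """StartExecTime, else the first 'Running' logging entry, else SubmissionTime, else now."""
    candidates = [job_dict.get("StartExecTime")]
    # logging tuples are (status, minor, app, stime, source), as in the snippets above
    candidates += [items[3] for items in log_list if items[0] == "Running"][:1]
    candidates.append(job_dict.get("SubmissionTime"))
    for value in candidates:
        parsed = parse_db_timestamp(value)
        if parsed is not None:
            return parsed
    return datetime.utcnow()  # same "use now" fallback the agent applies on a bad timestamp
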
Code Example #2
File: FTSAgent.py (Project: IgorPelevanyuk/DIRAC)
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS"

    # accountingDict['RegistrationTime'] = 0
    # accountingDict['RegistrationOK'] = 0
    # accountingDict['RegistrationTotal'] = 0

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status == "Finished" ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    transferTime = dt.days * 86400 + dt.seconds
    accountingDict["TransferTime"] = transferTime
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()
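
A detail that recurs in these accounting snippets is the conversion of a timedelta into whole seconds with dt.days * 86400 + dt.seconds. A tiny standard-library illustration with made-up submit/update times; the result is equivalent to int(dt.total_seconds()) when microseconds are ignored.

from datetime import datetime

submit_time = datetime(2024, 1, 1, 12, 0, 0)    # hypothetical ftsJob.SubmitTime
last_update = datetime(2024, 1, 1, 12, 42, 30)  # hypothetical ftsJob.LastUpdate

dt = last_update - submit_time
transfer_time = dt.days * 86400 + dt.seconds    # 2550 seconds
assert transfer_time == int(dt.total_seconds())
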
Code Example #3
  def __checkLoggingInfo( self, jobID, jobDict ):
    """ Get info from JobLogging
"""
    logList = []
    result = self.logDB.getJobLoggingInfo( jobID )
    if result['OK']:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
      # status, minor, app, stime, source
      for items in logList:
        if items[0] == 'Running':
          startTime = items[3]
          break
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      startTime = fromString( startTime )
      if startTime == None:
        self.log.error( 'Wrong timestamp in DB', items[3] )
        startTime = dateTime()


    endTime = dateTime()
    # status, minor, app, stime, source
    for items in logList:
      if items[0] == 'Stalled':
        endTime = fromString( items[3] )
    if endTime == None:
      self.log.error( 'Wrong timestamp in DB', items[3] )
      endTime = dateTime()

    return startTime, endTime
Code Example #4
  def __getLatestUpdateTime( self, job ):
    """ Returns the most recent of HeartBeatTime and LastUpdateTime
"""
    result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] )
    if not result['OK']:
      self.log.error( 'Failed to get job attributes', result['Message'] )
    if not result['OK'] or not result['Value']:
      self.log.error( 'Could not get attributes for job', '%s' % job )
      return S_ERROR( 'Could not get attributes for job' )

    self.log.verbose( result )
    latestUpdate = 0
    if not result['Value']['HeartBeatTime'] or result['Value']['HeartBeatTime'] == 'None':
      self.log.verbose( 'HeartBeatTime is null for job %s' % job )
    else:
      latestUpdate = toEpoch( fromString( result['Value']['HeartBeatTime'] ) )

    if not result['Value']['LastUpdateTime'] or result['Value']['LastUpdateTime'] == 'None':
      self.log.verbose( 'LastUpdateTime is null for job %s' % job )
    else:
      lastUpdate = toEpoch( fromString( result['Value']['LastUpdateTime'] ) )
      if latestUpdate < lastUpdate:
        latestUpdate = lastUpdate

    if not latestUpdate:
      return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job )
    else:
      self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) )
      return S_OK( latestUpdate )
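
The __getLatestUpdateTime variants reduce to a single idea: parse two nullable DB timestamps, convert each to epoch seconds, and keep the larger one, with 0 meaning both are null. A minimal standard-library sketch of that logic; to_epoch here is an illustrative stand-in for DIRAC's toEpoch(fromString(...)) combination.

import calendar
from datetime import datetime


def to_epoch(value):
    """'YYYY-MM-DD HH:MM:SS' -> epoch seconds (treated as UTC); empty or 'None' -> 0."""
    if not value or value == "None":
        return 0
    return calendar.timegm(datetime.strptime(value, "%Y-%m-%d %H:%M:%S").timetuple())


def latest_update_epoch(attributes):
    """Most recent of HeartBeatTime and LastUpdateTime, or 0 if both are null."""
    return max(to_epoch(attributes.get("HeartBeatTime")),
               to_epoch(attributes.get("LastUpdateTime")))


# Example: the heartbeat is more recent, so its epoch value wins.
print(latest_update_epoch({"HeartBeatTime": "2024-01-01 12:05:00",
                           "LastUpdateTime": "2024-01-01 12:00:00"}))
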
Code Example #5
File: StalledJobAgent.py (Project: acasajus/DIRAC)
    def __checkLoggingInfo(self, jobID, jobDict):
        """ Get info from JobLogging
"""
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result['OK']:
            logList = result['Value']

        startTime = jobDict['StartExecTime']
        if not startTime or startTime == 'None':
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == 'Running':
                    startTime = items[3]
                    break
            if not startTime or startTime == 'None':
                startTime = jobDict['SubmissionTime']

        if type(startTime) in types.StringTypes:
            startTime = fromString(startTime)
            if startTime == None:
                self.log.error('Wrong timestamp in DB', items[3])
                startTime = dateTime()

        endTime = dateTime()
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == 'Stalled':
                endTime = fromString(items[3])
        if endTime == None:
            self.log.error('Wrong timestamp in DB', items[3])
            endTime = dateTime()

        return startTime, endTime
Code Example #6
File: StalledJobAgent.py (Project: acasajus/DIRAC)
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime
"""
        result = self.jobDB.getJobAttributes(
            job, ['HeartBeatTime', 'LastUpdateTime'])
        if not result['OK']:
            self.log.error(result['Message'])
        if not result['OK'] or not result['Value']:
            return S_ERROR('Could not get attributes for job %s' % job)

        self.log.verbose(result)
        latestUpdate = 0
        if not result['Value']['HeartBeatTime'] or result['Value'][
                'HeartBeatTime'] == 'None':
            self.log.verbose('HeartBeatTime is null for job %s' % job)
        else:
            latestUpdate = toEpoch(fromString(
                result['Value']['HeartBeatTime']))

        if not result['Value']['LastUpdateTime'] or result['Value'][
                'LastUpdateTime'] == 'None':
            self.log.verbose('LastUpdateTime is null for job %s' % job)
        else:
            lastUpdate = toEpoch(fromString(result['Value']['LastUpdateTime']))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate

        if not latestUpdate:
            return S_ERROR(
                'LastUpdate and HeartBeat times are null for job %s' % job)
        else:
            self.log.verbose('Latest update time from epoch for job %s is %s' %
                             (job, latestUpdate))
            return S_OK(latestUpdate)
Code Example #7
File: StalledJobAgent.py (Project: DIRACGrid/DIRAC)
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime
"""
        result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
        if not result["OK"]:
            self.log.error("Failed to get job attributes", result["Message"])
        if not result["OK"] or not result["Value"]:
            self.log.error("Could not get attributes for job", "%s" % job)
            return S_ERROR("Could not get attributes for job")

        self.log.verbose(result)
        latestUpdate = 0
        if not result["Value"]["HeartBeatTime"] or result["Value"]["HeartBeatTime"] == "None":
            self.log.verbose("HeartBeatTime is null for job %s" % job)
        else:
            latestUpdate = toEpoch(fromString(result["Value"]["HeartBeatTime"]))

        if not result["Value"]["LastUpdateTime"] or result["Value"]["LastUpdateTime"] == "None":
            self.log.verbose("LastUpdateTime is null for job %s" % job)
        else:
            lastUpdate = toEpoch(fromString(result["Value"]["LastUpdateTime"]))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate

        if not latestUpdate:
            return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
        else:
            self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate))
            return S_OK(latestUpdate)
Code Example #8
  def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service

    :return: S_OK(token) or S_ERROR
    """

    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    authArgs = {}
    if user and password:
      authDict = {'auth': {"passwordCredentials": {"username": user,
                                                   "password": password}
                           }
                  }
      if self.project:
        authDict['auth']['tenantName'] = self.project
    elif self.parameters.get('Auth') == "voms":
      authDict = {'auth': {'voms': True}}
      if self.project:
        authDict['auth']['tenantName'] = self.project

      if self.parameters.get('Proxy'):
        authArgs['cert'] = self.parameters.get('Proxy')

    try:
      result = requests.post("%s/tokens" % self.url,
                             headers={"Content-Type": "application/json"},
                             json=authDict,
                             verify=self.caPath,
                             **authArgs)
    except Exception as exc:
      return S_ERROR('Exception getting keystone token: %s' % str(exc))

    output = result.json()

    if result.status_code in [400, 401]:
      message = "None"
      if 'error' in output:
        message = output['error'].get('message')
      return S_ERROR('Authorization error: %s' % message)

    self.token = str(output['access']['token']['id'])
    expires = fromString(str(output['access']['token']['expires']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(output['access']['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    self.expires = dateTime() + (expires - issued)

    self.projectID = output['access']['token']['tenant']['id']

    for endpoint in output['access']['serviceCatalog']:
      if endpoint['type'] == 'compute':
        self.computeURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'image':
        self.imageURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'network':
        self.networkURL = str(endpoint['endpoints'][0]['publicURL'])
    return S_OK(self.token)
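
Both Keystone snippets compute the local token expiry as dateTime() + (expires - issued): the validity window reported by Keystone is added to the local clock, so a skew between the local host and the Keystone server neither shortens nor extends the effective token lifetime. A standard-library sketch of that calculation; the helper names and timestamps are illustrative, not part of the KeystoneClient API.

from datetime import datetime


def _parse_iso(ts):
    """Parse a Keystone-style ISO timestamp after stripping 'T' and 'Z'."""
    ts = ts.replace("T", " ").replace("Z", "")
    fmt = "%Y-%m-%d %H:%M:%S.%f" if "." in ts else "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(ts, fmt)


def local_expiry(issued_at, expires_at, now=None):
    """Local datetime at which the token should be treated as expired."""
    lifetime = _parse_iso(expires_at) - _parse_iso(issued_at)
    return (now or datetime.utcnow()) + lifetime


print(local_expiry("2024-01-01T10:00:00Z", "2024-01-01T11:00:00Z"))  # roughly "now + 1 hour"
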
Code Example #9
File: FTS3Agent.py (Project: DIRACGrid/DIRAC)
  def __sendAccounting(ftsJob):
    """ prepare and send DataOperation to AccountingDB

        :param ftsJob: the FTS3Job from which we send the accounting info
    """

    dataOp = DataOperation()
    dataOp.setStartTime(fromString(ftsJob.submitTime))
    dataOp.setEndTime(fromString(ftsJob.lastUpdate))

    dataOp.setValuesFromDict(ftsJob.accountingDict)
    dataOp.delayedCommit()
Code Example #10
File: FTS3Agent.py (Project: hamzazafar/DIRAC)
  def __sendAccounting(ftsJob):
    """ prepare and send DataOperation to AccountingDB

        :param ftsJob: the FTS3Job from which we send the accounting info
    """

    dataOp = DataOperation()
    dataOp.setStartTime(fromString(ftsJob.submitTime))
    dataOp.setEndTime(fromString(ftsJob.lastUpdate))

    dataOp.setValuesFromDict(ftsJob.accountingDict)
    dataOp.delayedCommit()
Code Example #11
File: FTSRequest.py (Project: caitriana/DIRAC)
 def __sendAccounting( self, regSuc, regTotal, regTime, transEndTime ):
   transSuc = 0
   transSize = 0
   missingSize = []
   for lfn in self.fileDict.keys():
     if self.fileDict[lfn].get( 'Status' ) == 'Finished':
       transSuc += 1
       if not self.catalogMetadata.has_key( lfn ):
         missingSize.append( lfn )
   if missingSize:
     self.__updateMetadataCache( missingSize )
   for lfn in self.fileDict.keys():
     if self.fileDict[lfn].get( 'Status' ) == 'Finished':
       transSize += self.catalogMetadata[lfn]['Size']
   transTotal = 0
   for state in ( self.statusSummary.keys() ):
     transTotal += self.statusSummary[state]
   submitTime = fromString( self.submitTime )
   endTime = fromString( transEndTime )
   oAccounting = DataOperation()
   #oAccounting.setEndTime(endTime)
   oAccounting.setEndTime( transEndTime )
   oAccounting.setStartTime( submitTime )
   accountingDict = {}
   accountingDict['OperationType'] = 'replicateAndRegister'
   accountingDict['User'] = '******'
   accountingDict['Protocol'] = 'FTS'
   accountingDict['RegistrationTime'] = regTime
   accountingDict['RegistrationOK'] = regSuc
   accountingDict['RegistrationTotal'] = regTotal
   accountingDict['TransferOK'] = transSuc
   accountingDict['TransferTotal'] = transTotal
   accountingDict['TransferSize'] = transSize
   accountingDict['FinalStatus'] = self.requestStatus
   accountingDict['Source'] = self.sourceSE
   accountingDict['Destination'] = self.targetSE
   c = transEndTime - submitTime
   transferTime = c.days * 86400 + c.seconds
   accountingDict['TransferTime'] = transferTime
   oAccounting.setValuesFromDict( accountingDict )
   gLogger.verbose( "Attempting to commit accounting message..." )
   oAccounting.commit()
   gLogger.verbose( "...committed." )
   return S_OK()
Code Example #12
 def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime):
     transSuc = 0
     transSize = 0
     missingSize = []
     for lfn in self.fileDict.keys():
         if self.fileDict[lfn].get('Status') == 'Finished':
             transSuc += 1
             if not self.catalogMetadata.has_key(lfn):
                 missingSize.append(lfn)
     if missingSize:
         self.__updateMetadataCache(missingSize)
     for lfn in self.fileDict.keys():
         if self.fileDict[lfn].get('Status') == 'Finished':
             transSize += self.catalogMetadata[lfn]['Size']
     transTotal = 0
     for state in (self.statusSummary.keys()):
         transTotal += self.statusSummary[state]
     submitTime = fromString(self.submitTime)
     endTime = fromString(transEndTime)
     oAccounting = DataOperation()
     #oAccounting.setEndTime(endTime)
     oAccounting.setEndTime(transEndTime)
     oAccounting.setStartTime(submitTime)
     accountingDict = {}
     accountingDict['OperationType'] = 'replicateAndRegister'
     accountingDict['User'] = '******'
     accountingDict['Protocol'] = 'FTS'
     accountingDict['RegistrationTime'] = regTime
     accountingDict['RegistrationOK'] = regSuc
     accountingDict['RegistrationTotal'] = regTotal
     accountingDict['TransferOK'] = transSuc
     accountingDict['TransferTotal'] = transTotal
     accountingDict['TransferSize'] = transSize
     accountingDict['FinalStatus'] = self.requestStatus
     accountingDict['Source'] = self.sourceSE
     accountingDict['Destination'] = self.targetSE
     c = transEndTime - submitTime
     transferTime = c.days * 86400 + c.seconds
     accountingDict['TransferTime'] = transferTime
     oAccounting.setValuesFromDict(accountingDict)
     gLogger.verbose("Attempting to commit accounting message...")
     oAccounting.commit()
     gLogger.verbose("...committed.")
     return S_OK()
Code Example #13
    def export_checkComponentLog(self, component):
        """ Check component log for errors
    """
        componentList = []
        if '*' in component:
            if component == '*':
                result = InstallTools.getSetupComponents()
                if result['OK']:
                    for ctype in ['Services', 'Agents']:
                        if ctype in result['Value']:
                            for sname in result['Value'][ctype]:
                                for cname in result['Value'][ctype][sname]:
                                    componentList.append('/'.join(
                                        [sname, cname]))
        elif type(component) in StringTypes:
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for c in componentList:
            if not '/' in c:
                continue
            system, cname = c.split('/')

            startDir = InstallTools.startDir
            currentLog = startDir + '/' + system + '_' + cname + '/log/current'
            logFile = file(currentLog, 'r')
            logLines = logFile.readlines()
            logFile.close()

            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ''
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    timeStamp = fromString(fields[0] + ' ' + fields[1])
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split('ERROR:')[-1].strip()

            resultDict[c] = {
                'ErrorsHour': errors_1,
                'ErrorsDay': errors_24,
                'LastError': lastError
            }

        return S_OK(resultDict)
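
The export_checkComponentLog variants scan a component's current log file, count ERROR lines that fall inside the last hour and the last day, and remember the most recent error message. A compact standard-library sketch of that scan; the assumption that the timestamp sits in the first two whitespace-separated fields with second resolution matches the snippets above, everything else is illustrative.

from datetime import datetime, timedelta

HOUR = timedelta(hours=1)
DAY = timedelta(days=1)


def count_recent_errors(log_lines, now=None):
    """Return ErrorsHour/ErrorsDay counters and the last recent ERROR message."""
    now = now or datetime.utcnow()
    errors_hour = errors_day = 0
    last_error = ""
    for line in log_lines:
        if "ERROR:" not in line:
            continue
        fields = line.split()
        try:
            stamp = datetime.strptime(fields[0] + " " + fields[1], "%Y-%m-%d %H:%M:%S")
        except (IndexError, ValueError):
            continue  # line has no usable timestamp
        if now - stamp < HOUR:
            errors_hour += 1
        if now - stamp < DAY:  # the last hour is inside the last day
            errors_day += 1
            last_error = line.split("ERROR:")[-1].strip()
    return {"ErrorsHour": errors_hour, "ErrorsDay": errors_day, "LastError": last_error}
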
Code Example #14
    def __sendAccounting(ftsJob, ownerDN):
        """ prepare and send DataOperation to AccouringDB """

        dataOp = DataOperation()
        dataOp.setStartTime(fromString(ftsJob.SubmitTime))
        dataOp.setEndTime(fromString(ftsJob.LastUpdate))

        accountingDict = dict()
        accountingDict["OperationType"] = "ReplicateAndRegister"

        username = getUsernameForDN(ownerDN)
        if not username["OK"]:
            username = ownerDN
        else:
            username = username["Value"]

        accountingDict["User"] = username
        accountingDict[
            "Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower(
            ) else 'FTS'
        accountingDict['ExecutionSite'] = ftsJob.FTSServer

        accountingDict['RegistrationTime'] = ftsJob._regTime
        accountingDict['RegistrationOK'] = ftsJob._regSuccess
        accountingDict['RegistrationTotal'] = ftsJob._regTotal

        accountingDict["TransferOK"] = len(
            [f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES])
        accountingDict["TransferTotal"] = len(ftsJob)
        accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
        accountingDict["FinalStatus"] = ftsJob.Status
        accountingDict["Source"] = ftsJob.SourceSE
        accountingDict["Destination"] = ftsJob.TargetSE

        dt = ftsJob.LastUpdate - ftsJob.SubmitTime
        transferTime = dt.days * 86400 + dt.seconds
        accountingDict["TransferTime"] = transferTime
        # accountingDict['TransferTime'] = sum( [f._duration for f in ftsJob])
        dataOp.setValuesFromDict(accountingDict)
        dataOp.commit()
Code Example #15
    def export_checkComponentLog(self, component):
        """ Check component log for errors
    """
        componentList = []
        if "*" in component:
            if component == "*":
                result = InstallTools.getSetupComponents()
                if result["OK"]:
                    for ctype in ["Services", "Agents"]:
                        if ctype in result["Value"]:
                            for sname in result["Value"][ctype]:
                                for cname in result["Value"][ctype][sname]:
                                    componentList.append("/".join([sname, cname]))
        elif type(component) in StringTypes:
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for c in componentList:
            if not "/" in c:
                continue
            system, cname = c.split("/")

            startDir = InstallTools.startDir
            currentLog = startDir + "/" + system + "_" + cname + "/log/current"
            logFile = file(currentLog, "r")
            logLines = logFile.readlines()
            logFile.close()

            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ""
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    timeStamp = fromString(fields[0] + " " + fields[1])
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split("ERROR:")[-1].strip()

            resultDict[c] = {"ErrorsHour": errors_1, "ErrorsDay": errors_24, "LastError": lastError}

        return S_OK(resultDict)
Code Example #16
  def export_checkComponentLog( self, component ):
    """ Check component log for errors
    """
    componentList = []
    if '*' in component:
      if component == '*':
        result = InstallTools.getSetupComponents()
        if result['OK']:
          for ctype in ['Services', 'Agents']:
            if ctype in result['Value']:
              for sname in result['Value'][ctype]:
                for cname in result['Value'][ctype][sname]:
                  componentList.append( '/'.join( [sname, cname] ) )
    elif type( component ) in StringTypes:
      componentList = [component]
    else:
      componentList = component

    resultDict = {}
    for c in componentList:
      if not '/' in c:
        continue
      system, cname = c.split( '/' )

      startDir = InstallTools.startDir
      currentLog = startDir + '/' + system + '_' + cname + '/log/current'
      logFile = file( currentLog, 'r' )
      logLines = logFile.readlines()
      logFile.close()

      errors_1 = 0
      errors_24 = 0
      now = dateTime()
      lastError = ''
      for line in logLines:
        if "ERROR:" in line:
          fields = line.split()
          recent = False
          timeStamp = fromString( fields[0] + ' ' + fields[1] )
          if ( now - timeStamp ) < hour:
            errors_1 += 1
            recent = True
          if ( now - timeStamp ) < day:
            errors_24 += 1
            recent = True
          if recent:
            lastError = line.split( 'ERROR:' )[-1].strip()

      resultDict[c] = {'ErrorsHour':errors_1, 'ErrorsDay':errors_24, 'LastError':lastError}

    return S_OK( resultDict )
Code Example #17
File: FTSAgent.py (Project: JanEbbing/DIRAC)
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation to AccouringDB """

    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )

    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"

    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      username = ownerDN
    else:
      username = username["Value"]

    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer

    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal

    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE

    # dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    # transferTime = dt.days * 86400 + dt.seconds
    # accountingDict["TransferTime"] = transferTime
    accountingDict['TransferTime'] = sum( [int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()
Code Example #18
File: FTSRequest.py (Project: SimonBidwell/DIRAC)
  def __sendAccounting( self, regSuc, regTotal, regTime, transEndTime, transDict ):
    """ send accounting record

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding counters for files being transferred, their sizes and successful transfers
    """

    submitTime = fromString( self.submitTime )
    oAccounting = DataOperation()
    dt = transEndTime - submitTime
    transferTime = dt.days * 86400 + dt.seconds
    if 'fts3' in self.ftsServer and transferTime < 0:
      import datetime
      while transferTime < 0:
        # Shift by one hour until transfer time is positive (ugly fix for FTS3 bug)
        transferTime += 3600
        submitTime -= datetime.timedelta( 0, 3600 )
      self.log.verbose( 'Fixed UTC submit time... Submit: %s, end: %s' % ( submitTime, transEndTime ) )
    oAccounting.setEndTime( transEndTime )
    oAccounting.setStartTime( submitTime )

    accountingDict = {}
    accountingDict['OperationType'] = 'replicateAndRegister'
    result = getProxyInfo()
    if not result['OK']:
      userName = '******'
    else:
      userName = result['Value'].get( 'username', 'unknown' )
    accountingDict['User'] = userName
    accountingDict['Protocol'] = 'FTS' if 'fts3' not in self.ftsServer else 'FTS3'
    accountingDict['RegistrationTime'] = regTime
    accountingDict['RegistrationOK'] = regSuc
    accountingDict['RegistrationTotal'] = regTotal
    accountingDict['TransferOK'] = transDict['transOK']
    accountingDict['TransferTotal'] = transDict['transTotal']
    accountingDict['TransferSize'] = transDict['transSize']
    accountingDict['FinalStatus'] = self.requestStatus
    accountingDict['Source'] = self.sourceSE
    accountingDict['Destination'] = self.targetSE
    accountingDict['TransferTime'] = transferTime
    oAccounting.setValuesFromDict( accountingDict )
    self.log.verbose( "Attempting to commit accounting message..." )
    oAccounting.commit()
    self.log.verbose( "...committed." )
    return S_OK()
Code Example #19
    def _getLatestUpdateTime(self, job):
        """Returns the most recent of HeartBeatTime and LastUpdateTime"""
        result = self.jobDB.getJobAttributes(
            job, ["HeartBeatTime", "LastUpdateTime"])
        if not result["OK"] or not result["Value"]:
            self.log.error(
                "Failed to get job attributes",
                "for job %d: %s" %
                (job, result["Message"] if "Message" in result else "empty"),
            )
            return S_ERROR("Could not get attributes for job")

        latestUpdate = 0
        if not result["Value"]["HeartBeatTime"] or result["Value"][
                "HeartBeatTime"] == "None":
            self.log.verbose("HeartBeatTime is null", "for job %s" % job)
        else:
            latestUpdate = toEpoch(fromString(
                result["Value"]["HeartBeatTime"]))

        if not result["Value"]["LastUpdateTime"] or result["Value"][
                "LastUpdateTime"] == "None":
            self.log.verbose("LastUpdateTime is null", "for job %s" % job)
        else:
            latestUpdate = max(
                latestUpdate,
                toEpoch(fromString(result["Value"]["LastUpdateTime"])))

        if not latestUpdate:
            return S_ERROR(
                "LastUpdate and HeartBeat times are null for job %s" % job)
        else:
            self.log.verbose(
                "", "Latest update time from epoch for job %s is %s" %
                (job, latestUpdate))
            return S_OK(latestUpdate)
Code Example #20
File: FTSAgent.py (Project: sposs/DIRAC)
    def __sendAccounting(ftsJob, ownerDN):
        """ prepare and send DataOperation to AccouringDB """

        dataOp = DataOperation()
        dataOp.setStartTime(fromString(ftsJob.SubmitTime))
        dataOp.setEndTime(fromString(ftsJob.LastUpdate))

        accountingDict = dict()
        accountingDict["OperationType"] = "ReplicateAndRegister"

        username = getUsernameForDN(ownerDN)
        if not username["OK"]:
            username = ownerDN
        else:
            username = username["Value"]

        accountingDict["User"] = username
        accountingDict["Protocol"] = "FTS"

        # accountingDict['RegistrationTime'] = 0
        # accountingDict['RegistrationOK'] = 0
        # accountingDict['RegistrationTotal'] = 0

        accountingDict["TransferOK"] = len(
            [f for f in ftsJob if f.Status == "Finished"])
        accountingDict["TransferTotal"] = len(ftsJob)
        accountingDict["TransferSize"] = ftsJob.Size
        accountingDict["FinalStatus"] = ftsJob.Status
        accountingDict["Source"] = ftsJob.SourceSE
        accountingDict["Destination"] = ftsJob.TargetSE

        dt = ftsJob.LastUpdate - ftsJob.SubmitTime
        transferTime = dt.days * 86400 + dt.seconds
        accountingDict["TransferTime"] = transferTime
        dataOp.setValuesFromDict(accountingDict)
        dataOp.commit()
Code Example #21
    def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime,
                         transDict):
        """ send accounting record

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration 
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding counters for files being transferred, their sizes and successful transfers
    """

        submitTime = fromString(self.submitTime)
        oAccounting = DataOperation()
        oAccounting.setEndTime(transEndTime)
        oAccounting.setStartTime(submitTime)

        accountingDict = {}
        accountingDict['OperationType'] = 'replicateAndRegister'
        accountingDict['User'] = '******'
        accountingDict['Protocol'] = 'FTS'
        accountingDict['RegistrationTime'] = regTime
        accountingDict['RegistrationOK'] = regSuc
        accountingDict['RegistrationTotal'] = regTotal
        accountingDict['TransferOK'] = transDict['transOK']
        accountingDict['TransferTotal'] = transDict['transTotal']
        accountingDict['TransferSize'] = transDict['transSize']
        accountingDict['FinalStatus'] = self.requestStatus
        accountingDict['Source'] = self.sourceSE
        accountingDict['Destination'] = self.targetSE
        dt = transEndTime - submitTime
        transferTime = dt.days * 86400 + dt.seconds
        accountingDict['TransferTime'] = transferTime
        oAccounting.setValuesFromDict(accountingDict)
        self.log.verbose("Attempting to commit accounting message...")
        oAccounting.commit()
        self.log.verbose("...committed.")
        return S_OK()
Code Example #22
File: FTSRequest.py (Project: msapunov/DIRAC)
  def __sendAccounting( self, regSuc, regTotal, regTime, transEndTime, transDict ):
    """ send accounting record

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration 
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding counters for files being transferred, their sizes and successful transfers
    """

    submitTime = fromString( self.submitTime )
    oAccounting = DataOperation()
    oAccounting.setEndTime( transEndTime )
    oAccounting.setStartTime( submitTime )

    accountingDict = {}
    accountingDict['OperationType'] = 'replicateAndRegister'
    accountingDict['User'] = '******'
    accountingDict['Protocol'] = 'FTS'
    accountingDict['RegistrationTime'] = regTime
    accountingDict['RegistrationOK'] = regSuc
    accountingDict['RegistrationTotal'] = regTotal
    accountingDict['TransferOK'] = transDict['transOK']
    accountingDict['TransferTotal'] = transDict['transTotal']
    accountingDict['TransferSize'] = transDict['transSize']
    accountingDict['FinalStatus'] = self.requestStatus
    accountingDict['Source'] = self.sourceSE
    accountingDict['Destination'] = self.targetSE
    dt = transEndTime - submitTime
    transferTime = dt.days * 86400 + dt.seconds
    accountingDict['TransferTime'] = transferTime
    oAccounting.setValuesFromDict( accountingDict )
    self.log.verbose( "Attempting to commit accounting message..." )
    oAccounting.commit()
    self.log.verbose( "...committed." )
    return S_OK()
Code Example #23
File: KeystoneClient.py (Project: DIRACGrid/DIRAC)
    def __getToken3(self):
        """Get the Keystone token for the version v3 of the keystone service

        :return: S_OK(token) or S_ERROR
        """

        domain = self.parameters.get("Domain", "Default")
        user = self.parameters.get("User")
        password = self.parameters.get("Password")
        appcred_file = self.parameters.get("Appcred")
        authDict = {}
        authArgs = {}
        if user and password:
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["password"],
                        "password": {
                            "user": {
                                "name": user,
                                "domain": {
                                    "name": domain
                                },
                                "password": password
                            }
                        },
                    }
                }
            }
        elif self.parameters.get("Auth") == "voms":
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["mapped"],
                        "mapped": {
                            "voms": True,
                            "identity_provider": "egi.eu",
                            "protocol": "mapped"
                        },
                    }
                }
            }
            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")
        elif appcred_file:
            # The application credentials are stored in a file of the format:
            # id secret
            ac_fd = open(appcred_file, "r")
            auth_info = ac_fd.read()
            auth_info = auth_info.strip()
            ac_id, ac_secret = auth_info.split(" ", 1)
            ac_fd.close()
            authDict = {
                "auth": {
                    "identity": {
                        "methods": ["application_credential"],
                        "application_credential": {
                            "id": ac_id,
                            "secret": ac_secret
                        },
                    }
                }
            }
        else:
            return S_ERROR("No valid credentials provided")

        # appcred includes the project scope binding in the credential itself
        if self.project and not appcred_file:
            authDict["auth"]["scope"] = {
                "project": {
                    "domain": {
                        "name": domain
                    },
                    "name": self.project
                }
            }

        gLogger.debug("Request token with auth arguments: %s and body %s" %
                      (str(authArgs), str(authDict)))

        url = "%s/auth/tokens" % self.url
        try:
            result = requests.post(url,
                                   headers={
                                       "Content-Type": "application/json",
                                       "Accept": "application/json",
                                   },
                                   json=authDict,
                                   verify=self.caPath,
                                   **authArgs)

        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        if result.status_code not in [200, 201, 202, 203, 204]:
            return S_ERROR("Failed to get keystone token: %s" % result.text)

        try:
            self.token = result.headers["X-Subject-Token"]
        except Exception as exc:
            return S_ERROR("Failed to get keystone token: %s" % str(exc))

        output = result.json()

        expires = fromString(
            str(output["token"]["expires_at"]).replace("T",
                                                       " ").replace("Z", ""))
        issued = fromString(
            str(output["token"]["issued_at"]).replace("T",
                                                      " ").replace("Z", ""))
        self.expires = dateTime() + (expires - issued)

        if "project" in output["token"]:
            if output["token"]["project"]["name"] == self.project:
                self.projectID = output["token"]["project"]["id"]

        if "catalog" in output["token"]:
            for service in output["token"]["catalog"]:
                if service["type"] == "compute":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.computeURL = str(endpoint["url"])

                elif service["type"] == "image":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.imageURL = str(endpoint["url"])

                elif service["type"] == "network":
                    for endpoint in service["endpoints"]:
                        if endpoint["interface"] == "public":
                            self.networkURL = str(endpoint["url"])

        return S_OK(self.token)
Code Example #24
    def export_checkComponentLog(self, component):
        """Check component log for errors"""
        componentList = []
        if "*" in component:
            if component == "*":
                result = gComponentInstaller.getSetupComponents()
                if result["OK"]:
                    for ctype in ["Services", "Agents", "Executors"]:
                        if ctype in result["Value"]:
                            for sname in result["Value"][ctype]:
                                for cname in result["Value"][ctype][sname]:
                                    componentList.append("/".join(
                                        [sname, cname]))
        elif isinstance(component, six.string_types):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for comp in componentList:
            if "/" not in comp:
                continue
            system, cname = comp.split("/")

            startDir = gComponentInstaller.startDir
            currentLog = startDir + "/" + system + "_" + cname + "/log/current"
            try:
                with open(currentLog, "r") as logFile:
                    logLines = logFile.readlines()
            except IOError as err:
                gLogger.error("File does not exists:", currentLog)
                resultDict[comp] = {
                    "ErrorsHour": -1,
                    "ErrorsDay": -1,
                    "LastError": currentLog + "::" + repr(err)
                }
                continue

            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ""
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    if len(fields) < 2:  # if the line contains only one word
                        lastError = line.split("ERROR:")[-1].strip()
                        continue
                    timeStamp = fromString(fields[0] + " " + fields[1])
                    if not timeStamp:  # if the timestamp is missing in the log
                        lastError = line.split("ERROR:")[-1].strip()
                        continue
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split("ERROR:")[-1].strip()

            resultDict[comp] = {
                "ErrorsHour": errors_1,
                "ErrorsDay": errors_24,
                "LastError": lastError
            }

        return S_OK(resultDict)
Code Example #25
  def __getToken3(self):
    """Get the Keystone token for the version v3 of the keystone service

    :return: S_OK(token) or S_ERROR
    """

    domain = self.parameters.get('Domain', "Default")
    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    appcred_file = self.parameters.get('Appcred')
    authDict = {}
    authArgs = {}
    if user and password:
      authDict = {'auth': {"identity": {"methods": ["password"],
                                        "password": {"user": {"name": user,
                                                              "domain": {"name": domain},
                                                              "password": password
                                                              }
                                                     }
                                        }
                           }
                  }
    elif self.parameters.get('Auth') == "voms":
      authDict = {"auth": {"identity": {"methods": ["mapped"],
                                        "mapped": {'voms': True,
                                                   'identity_provider': 'egi.eu',
                                                   "protocol": 'mapped'}}}}
      if self.parameters.get('Proxy'):
        authArgs['cert'] = self.parameters.get('Proxy')
    elif appcred_file:
      # The application credentials are stored in a file of the format:
      # id secret
      ac_fd = open(appcred_file, 'r')
      auth_info = ac_fd.read()
      auth_info = auth_info.strip()
      ac_id, ac_secret = auth_info.split(" ", 1)
      ac_fd.close()
      authDict = {'auth': {"identity": {"methods": ["application_credential"],
                                        "application_credential": {"id": ac_id,
                                                                   "secret": ac_secret}}}}
    else:
      return S_ERROR("No valid credentials provided")

    # appcred includes the project scope binding in the credential itself
    if self.project and not appcred_file:
      authDict['auth']['scope'] = {"project": {"domain": {"name": domain},
                                               "name": self.project
                                               }
                                   }

    gLogger.debug('Request token with auth arguments: %s and body %s' %
                  (str(authArgs), str(authDict)))

    url = "%s/auth/tokens" % self.url
    try:
      result = requests.post(url,
                             headers={"Content-Type": "application/json",
                                      "Accept": "application/json",
                                      },
                             json=authDict,
                             verify=self.caPath,
                             **authArgs)

    except Exception as exc:
      return S_ERROR('Exception getting keystone token: %s' % str(exc))

    if result.status_code not in [200, 201, 202, 203, 204]:
      return S_ERROR('Failed to get keystone token: %s' % result.text)

    try:
      self.token = result.headers['X-Subject-Token']
    except Exception as exc:
      return S_ERROR('Failed to get keystone token: %s' % str(exc))

    output = result.json()

    expires = fromString(str(output['token']['expires_at']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(output['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    self.expires = dateTime() + (expires - issued)

    if 'project' in output['token']:
      if output['token']['project']['name'] == self.project:
        self.projectID = output['token']['project']['id']

    if 'catalog' in output['token']:
      for service in output['token']['catalog']:
        if service['type'] == 'compute':
          for endpoint in service['endpoints']:
            if endpoint['interface'] == 'public':
              self.computeURL = str(endpoint['url'])

        elif service['type'] == 'image':
          for endpoint in service['endpoints']:
            if endpoint['interface'] == 'public':
              self.imageURL = str(endpoint['url'])

        elif service['type'] == 'network':
          for endpoint in service['endpoints']:
            if endpoint['interface'] == 'public':
              self.networkURL = str(endpoint['url'])

    return S_OK(self.token)
Code Example #26
                    pollingtime = line.split(':')[4].split(' ')[1].split(
                        '.')[0]
                except BaseException:
                    try:
                        pollingtime = line.split(':')[7].split(' ')[1].split(
                            '.')[0]
                    except BaseException:
                        write_log("    wrong format for Polling Time : " +
                                  line)
                        break
            else:
                lastLine = line

    lastLineList = lastLine.split(' ')
    try:
        lastupdate = fromString(lastLineList[0] + ' ' + lastLineList[1])
    except BaseException:
        write_log('    EXCEPT : ' + dirname)
        write_log('   last line is ' + str(lastLineList))

    if isinstance(pollingtime, int):
        if int(pollingtime) < 59:
            pollingtime = 120

        interval = timeInterval(lastupdate, second * int(pollingtime))
        if not interval.includes(now):
            write_log("    the PollingTime is : " + str(pollingtime) + " s")
            write_log('    last update for ' + dirname + ' was : ' +
                      str(lastupdate))
            write_log('    Polling Time is ' + str(pollingtime) + ' s')
            write_log('    last known status' + result + '\n')
Code Example #27
File: JobScheduling.py (Project: corionma/DIRAC)
  def optimizeJob( self, jid, jobState ):
    #Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ValueError:
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    #Get site requirements
    result = self.__getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    #Get active and banned sites from DIRAC
    result = self.__jobDB.getSiteMask( 'Active' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve active sites from JobDB" )
    wmsActiveSites = result[ 'Value' ]
    result = self.__jobDB.getSiteMask( 'Banned' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = result[ 'Value' ]

    #If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      result = jobState.getAttribute( "JobType" )
      if not result[ 'OK' ]:
        return S_ERROR( "Could not retrieve job type" )
      jobType = result[ 'Value' ]
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        sites = self.__applySiteFilter( userSites, wmsActiveSites, wmsBannedSites )
        if not sites:
          if len( userSites ) > 1:
            return self.__holdJob( jobState, "Requested sites %s are inactive" % ",".join( userSites ) )
          else:
            return self.__holdJob( jobState, "Requested site %s is inactive" % userSites[0] )

    #Get the Input data
    # Third, check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      #No input data? Generate requirements and next
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    inputData = result[ 'Value' ]

    self.jobLog.verbose( 'Has an input data requirement' )
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info: %s" % result[ 'Message' ] )
      return S_ERROR( "File Catalog Access Failure" )
    opData = result[ 'Value' ]
    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    #Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    siteCandidates = self.__applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    #Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    #Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    #Is any site active?
    stageSites = self.__applySiteFilter( siteCandidates, wmsActiveSites, wmsBannedSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    #If no staging is required send to TQ
    if not stageRequired:
      #Use siteCandidates and not stageSites because active and banned sites
      #will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    #Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      if not self.__checkStageAllowed( jobState ):
        return S_ERROR( "Stage not allowed" )

    #Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    #Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    #Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    result = self.__requestStaging( jobState, stageSite, opData )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
    #Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self.__setJobSite( jobState, stageSites )
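
The reschedule back-off at the top of optimizeJob works like this: after the Nth reschedule the job stays on hold until at least delays[min(N, len(delays) - 1)] seconds have passed since RescheduleTime. A standard-library sketch of that check; the function name is illustrative and the default delays simply mirror the RescheduleDelays option in the snippet above.

import calendar
import time
from datetime import datetime


def seconds_still_on_hold(reschedules, reschedule_time, delays=(60, 180, 300, 600)):
    """How many more seconds the job should stay on hold; 0 means it can be matched."""
    if reschedules == 0:
        return 0
    delay = delays[min(reschedules, len(delays) - 1)]
    rescheduled_at = calendar.timegm(
        datetime.strptime(reschedule_time, "%Y-%m-%d %H:%M:%S").timetuple())
    waited = time.time() - rescheduled_at
    return max(0, int(delay - waited))
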
Code Example #28
    def _sendAccounting(self, jobID):
        """
        Send WMS accounting data for the given job.

        Run inside thread.
        """
        try:
            accountingReport = Job()
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"

            result = self.jobDB.getJobAttributes(jobID)
            if not result["OK"]:
                return result
            jobDict = result["Value"]

            startTime, endTime = self._checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self._checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            result = JobMonitoringClient().getJobParameter(
                jobID, "CPUNormalizationFactor")
            if not result["OK"] or not result["Value"]:
                self.log.error(
                    "Error getting Job Parameter CPUNormalizationFactor, setting 0",
                    result.get("Message", "No such value"),
                )
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(
                    result["Value"].get("CPUNormalizationFactor"))

        except Exception as e:
            self.log.exception(
                "Exception in _sendAccounting",
                "for job=%s: endTime=%s, lastHBTime=%s" %
                (str(jobID), str(endTime), str(lastHeartBeatTime)),
                lException=e,
            )
            return S_ERROR("Exception")
        processingType = self._getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            "Site": jobDict["Site"],
            "User": jobDict["Owner"],
            "UserGroup": jobDict["OwnerGroup"],
            "JobGroup": jobDict["JobGroup"],
            "JobType": jobDict["JobType"],
            "JobClass": jobDict["JobSplitType"],
            "ProcessingType": processingType,
            "FinalMajorStatus": JobStatus.FAILED,
            "FinalMinorStatus": JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
            "CPUTime": lastCPUTime,
            "NormCPUTime": lastCPUTime * cpuNormalization,
            "ExecTime": lastWallTime,
            "InputDataSize": 0.0,
            "OutputDataSize": 0.0,
            "InputDataFiles": 0,
            "OutputDataFiles": 0,
            "DiskSpace": 0.0,
            "InputSandBoxSize": 0.0,
            "OutputSandBoxSize": 0.0,
            "ProcessedEvents": 0,
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData["ExecTime"]:
            acData["ExecTime"] = acData["CPUTime"]
        elif acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result["OK"]:
            self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        else:
            self.log.error(
                "Failed to send accounting report",
                "Job: %d, Error: %s" % (int(jobID), result["Message"]))
        return result
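
One easy-to-miss step near the end of _sendAccounting is the clamp on ExecTime: for accidentally stopped jobs the wall-clock time can be missing or smaller than the reported CPU time, so the report never carries ExecTime below CPUTime. The same clamp expressed in one line, with made-up values:

last_cpu_time, last_wall_time = 3600.0, 0.0             # hypothetical values
exec_time = max(last_wall_time or 0.0, last_cpu_time)   # 3600.0
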
Code Example #29
  def export_checkComponentLog(self, component):
    """ Check component log for errors
    """
    componentList = []
    if '*' in component:
      if component == '*':
        result = gComponentInstaller.getSetupComponents()
        if result['OK']:
          for ctype in ['Services', 'Agents', 'Executors']:
            if ctype in result['Value']:
              for sname in result['Value'][ctype]:
                for cname in result['Value'][ctype][sname]:
                  componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
      componentList = [component]
    else:
      componentList = component

    resultDict = {}
    for comp in componentList:
      if '/' not in comp:
        continue
      system, cname = comp.split('/')

      startDir = gComponentInstaller.startDir
      currentLog = startDir + '/' + system + '_' + cname + '/log/current'
      try:
        logFile = file(currentLog, 'r')
      except IOError as err:
        gLogger.error("File does not exist:", currentLog)
        resultDict[comp] = {'ErrorsHour': -1, 'ErrorsDay': -1, 'LastError': currentLog + '::' + repr(err)}
        continue

      logLines = logFile.readlines()
      logFile.close()

      errors_1 = 0
      errors_24 = 0
      now = dateTime()
      lastError = ''
      for line in logLines:
        if "ERROR:" in line:
          fields = line.split()
          recent = False
          if len(fields) < 2:  # if the line contains only one word
            lastError = line.split('ERROR:')[-1].strip()
            continue
          timeStamp = fromString(fields[0] + ' ' + fields[1])
          if not timeStamp:  # if the timestamp is missing in the log
            lastError = line.split('ERROR:')[-1].strip()
            continue
          if (now - timeStamp) < hour:
            errors_1 += 1
            recent = True
          if (now - timeStamp) < day:
            errors_24 += 1
            recent = True
          if recent:
            lastError = line.split('ERROR:')[-1].strip()

      resultDict[comp] = {'ErrorsHour': errors_1, 'ErrorsDay': errors_24, 'LastError': lastError}

    return S_OK(resultDict)
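
export_checkComponentLog counts ERROR lines inside rolling one-hour and one-day windows with DIRAC's fromString, hour and day helpers; the same windowing can be sketched with only the standard library (the timestamp layout below is an assumption, the real log format may differ):

from datetime import datetime, timedelta

def count_recent_errors(log_lines, now=None):
    """Count ERROR lines seen in the last hour and last day and keep the latest message."""
    now = now or datetime.utcnow()
    errors_hour = errors_day = 0
    last_error = ''
    for line in log_lines:
        if "ERROR:" not in line:
            continue
        message = line.split("ERROR:")[-1].strip()
        fields = line.split()
        try:
            # assumed layout: the line starts with "YYYY-MM-DD HH:MM:SS"
            stamp = datetime.strptime(fields[0] + " " + fields[1], "%Y-%m-%d %H:%M:%S")
        except (IndexError, ValueError):
            last_error = message
            continue
        if now - stamp < timedelta(hours=1):
            errors_hour += 1
        if now - stamp < timedelta(days=1):
            errors_day += 1
            last_error = message
    return {"ErrorsHour": errors_hour, "ErrorsDay": errors_day, "LastError": last_error}
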
Code example #30
0
File: StalledJobAgent.py Project: bmb/DIRAC
  def __sendAccounting( self, jobID ):
    """ Send WMS accounting data for the given job
    """

    accountingReport = Job()

    result = self.jobDB.getJobAttributes( jobID )
    if not result['OK']:
      return result
    jobDict = result['Value']

    startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )

    lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )

    if lastHeartBeatTime and fromString( lastHeartBeatTime ) > endTime:
      endTime = fromString( lastHeartBeatTime )

    cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
    if not cpuNormalization['OK'] or not cpuNormalization['Value']:
      cpuNormalization = 0.0
    else:
      cpuNormalization = float( cpuNormalization['Value'] )

    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    #Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : processingType,
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : lastCPUTime * cpuNormalization,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % ( int( jobID ), result['Message'] ) )
    return result
Code example #31
0
  def execute(self):
    """ The main agent execution method
    """
    limitDate = date() - self._period
    tableList = ["MessageRepository", "FixedTextMessages", "Systems",
                 "SubSystems"]
    columnsList = ["SystemName", "SubSystemName", "count(*) as entries",
                   "FixedTextString"]
    cmd = "SELECT " + ', '.join(columnsList) + " FROM " \
          + " NATURAL JOIN ".join(tableList) \
          + " WHERE MessageTime > '%s'" % limitDate \
          + " AND LogLevel in ('ERROR','FATAL','EXCEPT')" \
          + " GROUP BY FixedTextID,SystemName,SubSystemName HAVING entries > %s" % self._threshold \
          + " ORDER BY entries DESC LIMIT %i;" % self._limit

    result = self.systemLoggingDB._query(cmd)
    if not result['OK']:
      return result

    messageList = result['Value']

    if messageList == 'None' or messageList == ():
      self.log.warn('The DB query returned an empty result')
      return S_OK()

    mailBody = '\n'
    for message in messageList:
      mailBody = mailBody + "Count: " + str(message[2]) + "\tError: '"\
          + message[3] + "'\tSystem: '" + message[0]\
          + "'\tSubsystem: '" + message[1] + "'\n"

    mailBody = mailBody + "\n\n-------------------------------------------------------\n"\
        + "Please do not reply to this mail. It was automatically\n"\
        + "generated by a Dirac Agent.\n"

    result = self.systemLoggingDB._getDataFromAgentTable(self.agentName)
    self.log.debug(result)
    if not result['OK']:
      errorString = "Could not get the date when the last mail was sent"
      self.log.error(errorString)
      return S_ERROR(errorString)
    else:
      if result['Value']:
        self.log.debug("date value: %s" % fromString(result['Value'][0][0][1:-1]))
        lastMailSentDate = fromString(result['Value'][0][0][1:-1])
      else:
        lastMailSentDate = limitDate - 1 * day
        result = self.systemLoggingDB._insertDataIntoAgentTable(self.agentName, lastMailSentDate)
        if not result['OK']:
          errorString = "Could not insert data into the DB"
          self.log.error(errorString, result['Message'])
          return S_ERROR(errorString + ": " + result['Message'])

    self.log.debug("limitDate: %s\t" % limitDate + "lastMailSentDate: %s\n" % lastMailSentDate)
    if lastMailSentDate > limitDate:
      self.log.info("The previous report was sent less " + " than %s days ago" % self.__days)
      return S_OK()

    dateSent = toString(date())
    self.log.info("The list with the top errors has been sent")

    result = self.systemLoggingDB._insertDataIntoAgentTable(self.agentName, dateSent)
    if not result['OK']:
      errorString = "Could not insert data into the DB"
      self.log.error(errorString, result['Message'])
      return S_ERROR(errorString + ": " + result['Message'])

    result = self.notification.sendMail(self._mailAddress, self._subject,
                                        mailBody)
    if not result['OK']:
      self.log.warn("The notification could not be sent")
      return S_OK()

    return S_OK("The list with the top errors has been sent")
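
The query above is built by interpolating limitDate, the threshold and the limit straight into the SQL string. For comparison, a hedged sketch of the same aggregation written against a plain DB-API cursor with the %s paramstyle (e.g. PyMySQL); DIRAC's own _query wrapper is not assumed to accept parameters:

def top_errors(cursor, limit_date, threshold, limit):
    """Return (SystemName, SubSystemName, entries, FixedTextString) rows for frequent recent errors."""
    sql = (
        "SELECT SystemName, SubSystemName, count(*) AS entries, FixedTextString "
        "FROM MessageRepository NATURAL JOIN FixedTextMessages "
        "NATURAL JOIN Systems NATURAL JOIN SubSystems "
        "WHERE MessageTime > %s AND LogLevel IN ('ERROR','FATAL','EXCEPT') "
        "GROUP BY FixedTextID, SystemName, SubSystemName "
        "HAVING entries > %s ORDER BY entries DESC LIMIT %s"
    )
    cursor.execute(sql, (limit_date, threshold, int(limit)))
    return cursor.fetchall()
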
Code example #32
0
File: StalledJobAgent.py Project: acasajus/DIRAC
    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job
"""
        try:
            accountingReport = Job()
            endTime = 'Unknown'
            lastHeartBeatTime = 'Unknown'

            result = self.jobDB.getJobAttributes(jobID)
            if not result['OK']:
                return result
            jobDict = result['Value']

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(
                jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            cpuNormalization = self.jobDB.getJobParameter(
                jobID, 'CPUNormalizationFactor')
            if not cpuNormalization['OK'] or not cpuNormalization['Value']:
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(cpuNormalization['Value'])
        except Exception:
            self.log.exception(
                "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
                % (str(jobID), str(endTime), str(lastHeartBeatTime)), '',
                False)
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        #Fill the accounting data
        acData = {
            'Site': jobDict['Site'],
            'User': jobDict['Owner'],
            'UserGroup': jobDict['OwnerGroup'],
            'JobGroup': jobDict['JobGroup'],
            'JobType': jobDict['JobType'],
            'JobClass': jobDict['JobSplitType'],
            'ProcessingType': processingType,
            'FinalMajorStatus': 'Failed',
            'FinalMinorStatus': 'Stalled',
            'CPUTime': lastCPUTime,
            'NormCPUTime': lastCPUTime * cpuNormalization,
            'ExecTime': lastWallTime,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': 0,
            'OutputDataFiles': 0,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0
        }
        self.log.verbose('Accounting Report is:')
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result['OK']:
            self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
        else:
            self.log.error(
                'Failed to send accounting report',
                'Job: %d, Error: %s' % (int(jobID), result['Message']))
        return result
Code example #33
0
  def __sendAccounting( self, jobID ):
    """ Send WMS accounting data for the given job
"""
    try:
      accountingReport = Job()
      endTime = 'Unknown'
      lastHeartBeatTime = 'Unknown'

      result = self.jobDB.getJobAttributes( jobID )
      if not result['OK']:
        return result
      jobDict = result['Value']

      startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )
      lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )
      lastHeartBeatTime = fromString( lastHeartBeatTime )
      if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
        endTime = lastHeartBeatTime

      cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
      if not cpuNormalization['OK'] or not cpuNormalization['Value']:
        cpuNormalization = 0.0
      else:
        cpuNormalization = float( cpuNormalization['Value'] )
    except Exception:
      self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" % ( str( jobID ), str( endTime ), str( lastHeartBeatTime ) ), '' , False )
      return S_ERROR( "Exception" )
    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    #Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : processingType,
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : lastCPUTime * cpuNormalization,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    
    # For accidentally stopped jobs ExecTime can be not set
    if not acData['ExecTime']:
      acData['ExecTime'] = acData['CPUTime']
    elif acData['ExecTime'] < acData['CPUTime']:
      acData['ExecTime'] = acData['CPUTime']
    
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % ( int( jobID ), result['Message'] ) )
    return result
Code example #34
0
File: StalledJobAgent.py Project: zhangxiaomei/DIRAC
  def sendAccounting( self, jobID ):
    """Send WMS accounting data for the given job
    """

    accountingReport = Job()

    result = self.jobDB.getJobAttributes( jobID )
    if not result['OK']:
      return result
    jobDict = result['Value']

    result = self.logDB.getJobLoggingInfo( jobID )
    if not result['OK']:
      logList = []
    else:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    endTime = ''

    if not startTime or startTime == 'None':
      for status, minor, app, stime, source in logList:
        if status == 'Running':
          startTime = stime
          break
      for status, minor, app, stime, source in logList:
        if status == 'Stalled':
          endTime = stime
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      startTime = fromString( startTime )


    result = self.logDB.getJobLoggingInfo( jobID )
    if not result['OK']:
      endTime = dateTime()
    else:
      for status, minor, app, stime, source in result['Value']:
        if status == 'Stalled':
          endTime = stime
          break
    if not endTime:
      endTime = dateTime()

    if type( endTime ) in types.StringTypes:
      endTime = fromString( endTime )

    result = self.jobDB.getHeartBeatData( jobID )

    lastCPUTime = 0
    lastWallTime = 0
    lastHeartBeatTime = jobDict['StartExecTime']
    if result['OK']:
      for name, value, heartBeatTime in result['Value']:
        if 'CPUConsumed' == name:
          try:
            value = int( float( value ) )
            if value > lastCPUTime:
              lastCPUTime = value
          except:
            pass
        if 'WallClockTime' == name:
          try:
            value = int( float( value ) )
            if value > lastWallTime:
              lastWallTime = value
          except:
            pass
        if heartBeatTime > lastHeartBeatTime:
          lastHeartBeatTime = heartBeatTime

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    #Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : 'unknown',
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : 0.0,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.warn( 'Failed to send accounting report for job %d' % int( jobID ) )
      self.log.error( result['Message'] )
    return result
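
The heartbeat loop in this older sendAccounting is what the later variants hide behind __checkHeartBeat: it keeps the largest CPUConsumed and WallClockTime values and the latest heartbeat timestamp. A condensed standalone sketch over the same (name, value, time) tuples (the helper name is made up for illustration):

from datetime import datetime

def reduce_heartbeats(records, start_time):
    """Return (lastCPUTime, lastWallTime, lastHeartBeatTime) from (name, value, time) tuples."""
    last_cpu, last_wall, last_beat = 0, 0, start_time
    for name, value, beat_time in records:
        if name in ("CPUConsumed", "WallClockTime"):
            try:
                numeric = int(float(value))
            except (ValueError, TypeError):
                numeric = None
            if numeric is not None:
                if name == "CPUConsumed":
                    last_cpu = max(last_cpu, numeric)
                else:
                    last_wall = max(last_wall, numeric)
        if beat_time > last_beat:
            last_beat = beat_time
    return last_cpu, last_wall, last_beat

# example
print(reduce_heartbeats([("CPUConsumed", "120.5", datetime(2023, 1, 1, 10)),
                         ("WallClockTime", "300", datetime(2023, 1, 1, 11))],
                        datetime(2023, 1, 1, 9)))
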
Code example #35
0
File: JobSchedulingAgent.py Project: zenglzh/DIRAC
    def checkJob(self, job, classAdJob):
        """This method controls the checking of the job.
    """
        self.log.verbose('Job %s will be processed' % (job))

        # Check if the job was recently rescheduled
        result = self.jobDB.getJobAttributes(
            job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            self.log.error(result['Message'])
            return S_ERROR('Can not get job attributes from JobDB')
        jobDict = result['Value']
        reCounter = int(jobDict['RescheduleCounter'])
        if reCounter != 0:
            reTime = fromString(jobDict['RescheduleTime'])
            delta = toEpoch() - toEpoch(reTime)
            delay = self.maxRescheduleDelay
            if reCounter <= len(self.rescheduleDelaysList):
                delay = self.rescheduleDelaysList[reCounter - 1]
            if delta < delay:
                if jobDict['ApplicationStatus'].find(
                        'On Hold: after rescheduling') == -1:
                    result = self.jobDB.setJobStatus(
                        job,
                        application='On Hold: after rescheduling #%d' %
                        reCounter)
                return S_OK()

        # First, get Site and BannedSites from the Job

        result = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = result['BannedSites']
        userSites = result['Sites']

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                msg = 'Impossible Site Requirement'
                return S_ERROR(msg)

        # Second, get the Active and Banned sites from the WMS

        wmsSites = self.jobDB.getSiteMask('Active')
        wmsBannedSites = self.jobDB.getSiteMask('Banned')
        if not (wmsSites['OK'] and wmsBannedSites['OK']):
            if not wmsSites['OK']:
                self.log.error(wmsSites['Message'])
            if not wmsBannedSites['OK']:
                self.log.error(wmsBannedSites['Message'])
            return S_ERROR('Can not get Active and Banned Sites from JobDB')

        wmsSites = wmsSites['Value']
        wmsBannedSites = wmsBannedSites['Value']

        if userSites:
            sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
            if not sites:
                # Put on Hold only non-excluded job types
                jobType = classAdJob.getAttributeString('JobType')
                if not jobType in self.excludedOnHoldJobTypes:
                    msg = 'On Hold: Requested site is Banned or not Active'
                    self.log.info(msg)
                    result = self.jobDB.setJobStatus(job, application=msg)
                    return S_OK()

        # Third, check if there is input data
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobDB for %s' % (job))
            self.log.error(result['Message'])
            return S_ERROR('Failed to get input data from JobDB')

        if not result['Value']:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        hasInputData = False
        inputData = []
        for lfn in result['Value']:
            if lfn:
                inputData.append(lfn)
                hasInputData = True

        if not hasInputData:
            #With no input data requirement, job can proceed directly to task queue
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                             userBannedSites)

        self.log.verbose('Job %s has an input data requirement ' % (job))

        # Fourth, Check all optimizer information
        result = self.__checkOptimizerInfo(job)
        if not result['OK']:
            return result

        optInfo = result['Value']

        #Compare site candidates with current mask
        optSites = optInfo['SiteCandidates'].keys()
        self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
        # Check that it is compatible with user requirements
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            msg = 'Impossible Site + InputData Requirement'
            return S_ERROR(msg)

        sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
        if not sites:
            msg = 'On Hold: InputData Site is Banned or not Active'
            self.log.info(msg)
            result = self.jobDB.setJobStatus(job, application=msg)
            return S_OK()

        #Set stager request as necessary, optimize for smallest #files on tape if
        #more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                     optInfo['SiteCandidates'])
        if not checkStaging['OK']:
            return checkStaging

        destinationSites = checkStaging['SiteCandidates']
        if not destinationSites:
            return S_ERROR('No destination sites available')

        stagingFlag = checkStaging['Value']
        if stagingFlag:
            #Single site candidate chosen and staging required
            self.log.verbose('Job %s requires staging of input data' % (job))
            # set all LFN to disk for the selected site
            stagingSite = destinationSites[0]
            siteDict = optInfo['SiteCandidates'][stagingSite]
            siteDict['disk'] = siteDict['disk'] + siteDict['tape']
            siteDict['tape'] = 0

            optInfo['SiteCandidates'][stagingSite] = siteDict
            self.log.verbose(
                'Updating %s Optimizer Info for Job %s:' %
                (self.dataAgentName, job), optInfo)
            result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
            if not result['OK']:
                return result

            # Site is selected for staging, report it
            self.log.verbose('Staging site candidate for job %s is %s' %
                             (job, stagingSite))

            result = self.__getStagingSites(stagingSite, destinationSites)
            if not result['OK']:
                stagingSites = [stagingSite]
            else:
                stagingSites = result['Value']

            if len(stagingSites) == 1:
                self.jobDB.setJobAttribute(job, 'Site', stagingSite)
            else:
                # Get the name of the site group
                result = self.__getSiteGroup(stagingSites)
                if result['OK']:
                    groupName = result['Value']
                    if groupName:
                        self.jobDB.setJobAttribute(job, 'Site', groupName)
                    else:
                        self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
                else:
                    self.jobDB.setJobAttribute(job, 'Site', 'Multiple')

            stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
            if not stagerDict['OK']:
                return stagerDict
            self.__updateOtherSites(job, stagingSite, stagerDict['Value'],
                                    optInfo)
            return S_OK()
        else:
            #No staging required, can proceed to task queue agent and then waiting status
            self.log.verbose('Job %s does not require staging of input data' %
                             (job))
        #Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                         userBannedSites)
Code example #36
0
File: JobSchedulingAgent.py Project: closier/DIRAC
    def checkJob(self, job, classAdJob):
        """This method controls the checking of the job.
    """
        self.log.verbose("Job %s will be processed" % (job))

        # Check if the job was recently rescheduled
        result = self.jobDB.getJobAttributes(job, ["RescheduleCounter", "RescheduleTime", "ApplicationStatus"])
        if not result["OK"]:
            self.log.error(result["Message"])
            return S_ERROR("Can not get job attributes from JobDB")
        jobDict = result["Value"]
        reCounter = int(jobDict["RescheduleCounter"])
        if reCounter != 0:
            reTime = fromString(jobDict["RescheduleTime"])
            delta = toEpoch() - toEpoch(reTime)
            delay = self.maxRescheduleDelay
            if reCounter <= len(self.rescheduleDelaysList):
                delay = self.rescheduleDelaysList[reCounter - 1]
            if delta < delay:
                if jobDict["ApplicationStatus"].find("On Hold: after rescheduling") == -1:
                    result = self.jobDB.setJobStatus(job, application="On Hold: after rescheduling #%d" % reCounter)
                return S_OK()

        # First, get Site and BannedSites from the Job

        result = self.__getJobSiteRequirement(job, classAdJob)
        userBannedSites = result["BannedSites"]
        userSites = result["Sites"]

        if userSites:
            userSites = applySiteRequirements(userSites, [], userBannedSites)
            if not userSites:
                msg = "Impossible Site Requirement"
                return S_ERROR(msg)

        # Second, get the Active and Banned sites from the WMS

        wmsSites = self.jobDB.getSiteMask("Active")
        wmsBannedSites = self.jobDB.getSiteMask("Banned")
        if not (wmsSites["OK"] and wmsBannedSites["OK"]):
            if not wmsSites["OK"]:
                self.log.error(wmsSites["Message"])
            if not wmsBannedSites["OK"]:
                self.log.error(wmsBannedSites["Message"])
            return S_ERROR("Can not get Active and Banned Sites from JobDB")

        wmsSites = wmsSites["Value"]
        wmsBannedSites = wmsBannedSites["Value"]

        if userSites:
            sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
            if not sites:
                # Put on Hold only non-excluded job types
                jobType = classAdJob.getAttributeString("JobType")
                if not jobType in self.excludedOnHoldJobTypes:
                    msg = "On Hold: Requested site is Banned or not Active"
                    self.log.info(msg)
                    result = self.jobDB.setJobStatus(job, application=msg)
                    return S_OK()

        # Third, check if there is input data
        result = self.jobDB.getInputData(job)
        if not result["OK"]:
            self.log.warn("Failed to get input data from JobDB for %s" % (job))
            self.log.error(result["Message"])
            return S_ERROR("Failed to get input data from JobDB")

        if not result["Value"]:
            return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

        hasInputData = False
        inputData = []
        for lfn in result["Value"]:
            if lfn:
                inputData.append(lfn)
                hasInputData = True

        if not hasInputData:
            # With no input data requirement, job can proceed directly to task queue
            self.log.verbose("Job %s has no input data requirement" % (job))
            return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

        self.log.verbose("Job %s has an input data requirement " % (job))

        # Fourth, Check all optimizer information
        result = self.__checkOptimizerInfo(job)
        if not result["OK"]:
            return result

        optInfo = result["Value"]

        # Compare site candidates with current mask
        optSites = optInfo["SiteCandidates"].keys()
        self.log.info("Input Data Site Candidates: %s" % (", ".join(optSites)))
        # Check that it is compatible with user requirements
        optSites = applySiteRequirements(optSites, userSites, userBannedSites)
        if not optSites:
            msg = "Impossible Site + InputData Requirement"
            return S_ERROR(msg)

        sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
        if not sites:
            msg = "On Hold: InputData Site is Banned or not Active"
            self.log.info(msg)
            result = self.jobDB.setJobStatus(job, application=msg)
            return S_OK()

        # Set stager request as necessary, optimize for smallest #files on tape if
        # more than one site candidate left at this point
        checkStaging = self.__resolveSitesForStaging(job, sites, inputData, optInfo["SiteCandidates"])
        if not checkStaging["OK"]:
            return checkStaging

        destinationSites = checkStaging["SiteCandidates"]
        if not destinationSites:
            return S_ERROR("No destination sites available")

        stagingFlag = checkStaging["Value"]
        if stagingFlag:
            # Single site candidate chosen and staging required
            self.log.verbose("Job %s requires staging of input data" % (job))
            # set all LFN to disk for the selected site
            stagingSite = destinationSites[0]
            siteDict = optInfo["SiteCandidates"][stagingSite]
            siteDict["disk"] = siteDict["disk"] + siteDict["tape"]
            siteDict["tape"] = 0

            optInfo["SiteCandidates"][stagingSite] = siteDict
            result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
            if not result["OK"]:
                return result

            # Site is selected for staging, report it
            self.log.verbose("Staging site candidate for job %s is %s" % (job, stagingSite))
            if len(destinationSites) == 1:
                self.jobDB.setJobAttribute(job, "Site", stagingSite)
            else:
                self.jobDB.setJobAttribute(job, "Site", "Multiple")

            stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
            if not stagerDict["OK"]:
                return stagerDict
            self.__updateOtherSites(job, stagingSite, stagerDict["Value"], optInfo)
            return S_OK()
        else:
            # No staging required, can proceed to task queue agent and then waiting status
            self.log.verbose("Job %s does not require staging of input data" % (job))
        # Finally send job to TaskQueueAgent
        return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)
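
Throughout these checkJob variants, applySiteRequirements intersects the candidate sites with an active list (when one is given) and removes banned sites. The real helper lives in DIRAC's WMS utilities; the stand-in below only mimics the behaviour these call sites rely on:

def apply_site_requirements(sites, active_sites=None, banned_sites=None):
    """Keep sites that appear in the active list (when provided) and are not banned."""
    selected = list(sites)
    if active_sites:
        selected = [site for site in selected if site in active_sites]
    if banned_sites:
        selected = [site for site in selected if site not in banned_sites]
    return selected

# user requested two sites, one of which is banned by the WMS
print(apply_site_requirements(["LCG.CERN.ch", "LCG.CNAF.it"],
                              active_sites=["LCG.CERN.ch", "LCG.CNAF.it"],
                              banned_sites=["LCG.CNAF.it"]))
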
Code example #37
0
File: JobScheduling.py Project: mesmith75/DIRAC
  def optimizeJob( self, jid, jobState ):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary
    """
    # Reschedule delay
    result = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not result[ 'OK' ]:
      return result
    attDict = result[ 'Value' ]
    try:
      reschedules = int( attDict[ 'RescheduleCounter' ] )
    except ( ValueError, KeyError ):
      return S_ERROR( "RescheduleCounter has to be an integer" )
    if reschedules != 0:
      delays = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
      delay = delays[ min( reschedules, len( delays ) - 1 ) ]
      waited = toEpoch() - toEpoch( fromString( attDict[ 'RescheduleTime' ] ) )
      if waited < delay:
        return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % reschedules, delay )

    # Get site requirements
    result = self.__getSitesRequired( jobState )
    if not result[ 'OK' ]:
      return result
    userSites, userBannedSites = result[ 'Value' ]

    # Get job type
    result = jobState.getAttribute( "JobType" )
    if not result[ 'OK' ]:
      return S_ERROR( "Could not retrieve job type" )
    jobType = result[ 'Value' ]

    # Get banned sites from DIRAC
    result = self.__jobDB.getSiteMask( 'Banned' )
    if not result[ 'OK' ]:
      return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = result[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
        result = self.__jobDB.getUserSitesTuple( userSites )
        if not result[ 'OK' ]:
          return S_ERROR( "Problem checking userSites for tuple of active/banned/invalid sites" )

        userSites, bannedSites, invalidSites = result['Value']
        if invalidSites:
          self.jobLog.debug( "Invalid site(s) requested: %s" % ','.join( invalidSites ) )
          if not self.ex_getOption( 'AllowInvalidSites', True ):
            return self.__holdJob( jobState, "Requested site(s) %s are invalid" % ",".join( invalidSites ) )
        if bannedSites:
          self.jobLog.debug( "Banned site(s) %s ignored" % ",".join( bannedSites ) )
          if not userSites:
            return self.__holdJob( jobState, "Requested site(s) %s are inactive" % ",".join( bannedSites ) )

        if not userSites:
          return self.__holdJob( jobState, "No requested site(s) are active/valid" )
        userSites = list(userSites)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error( "Cannot get input data %s" % ( result['Message'] ) )
      return S_ERROR( "Failed to get input data from JobDB" )

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ( jobState, userSites, userBannedSites )

    self.jobLog.verbose( "Has an input data requirement" )
    inputData = result[ 'Value' ]

    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    if jobType in Operations().getValue( 'Transformations/DataProcessing', [] ):
      self.jobLog.info( "Production job: sending to TQ, but first checking if staging is requested" )

      userName = jobState.getAttribute( 'Owner' )
      if not userName[ 'OK' ]:
        return userName
      userName = userName['Value']

      userGroup = jobState.getAttribute( 'OwnerGroup' )
      if not userGroup[ 'OK' ]:
        return userGroup
      userGroup = userGroup['Value']

      res = getFilesToStage( inputData, proxyUserName = userName, proxyUserGroup = userGroup ) #pylint: disable=unexpected-keyword-arg

      if not res['OK']:
        return self.__holdJob( jobState, res['Message'] )
      stageLFNs = res['Value']['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed( jobState )
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR( "Stage not allowed" )
        self.__requestStaging( jobState, stageLFNs )
        return S_OK()
      else:
        return self.__sendToTQ( jobState, userSites, userBannedSites )

    # From now on we know it's a user job with input data

    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    result = self.retrieveOptimizerParam( idAgent )
    if not result['OK']:
      self.jobLog.error( "Could not retrieve input data info", result[ 'Message' ] )
      return S_ERROR( "Could not retrieve input data info" )
    opData = result[ 'Value' ]

    if 'SiteCandidates' not in opData:
      return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )

    if userSites:
      siteCandidates = list( set( siteCandidates ) & set( userSites ) )

    siteCandidates = self._applySiteFilter( siteCandidates, banned = userBannedSites )
    if not siteCandidates:
      return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = {}
    for site in siteCandidates:
      idSites[ site ] = opData[ 'SiteCandidates' ][ site ]

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    errorSites = set()
    for site in idSites:
      if numData != idSites[ site ][ 'disk' ] + idSites[ site ][ 'tape' ]:
        self.jobLog.error( "Site candidate %s does not have all the input data" % site )
        errorSites.add( site )
    for site in errorSites:
      idSites.pop( site )
    if not idSites:
      return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
      return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self._applySiteFilter( siteCandidates, banned = wmsBannedSites )
    if not stageSites:
      return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
      res = self.__checkStageAllowed( jobState )
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    stageRequest = self.__preRequestStaging( jobState, stageSite, opData )
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging( jobState, stageLFNs )
    if not result[ 'OK' ]:
      return result
    stageLFNs = result[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )
    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    result = self.storeOptimizerParam( idAgent, opData )
    if not result[ 'OK' ]:
      return result

    return self.__setJobSite( jobState, stageSites )
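
The reschedule hold at the top of optimizeJob is a capped back-off: the n-th reschedule picks the n-th entry of RescheduleDelays and reuses the last entry once the list is exhausted. A small sketch of just that lookup, using the default delay list shown above (everything else is illustrative):

from datetime import datetime, timedelta

RESCHEDULE_DELAYS = [60, 180, 300, 600]  # seconds, the RescheduleDelays default above

def reschedule_hold(reschedules, rescheduled_at, now=None):
    """Return the remaining hold time in seconds, or 0 if the job may proceed."""
    if reschedules == 0:
        return 0
    delay = RESCHEDULE_DELAYS[min(reschedules, len(RESCHEDULE_DELAYS) - 1)]
    waited = ((now or datetime.utcnow()) - rescheduled_at).total_seconds()
    return max(0, int(delay - waited))

# third reschedule, rescheduled two minutes ago -> held for roughly another 480 s
print(reschedule_hold(3, datetime.utcnow() - timedelta(minutes=2)))
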
Code example #38
0
  def checkJob( self, job, classAdJob ):
    """This method controls the checking of the job.
    """
    self.log.verbose( 'Job %s will be processed' % ( job ) )

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes( job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'] )
    if not result['OK']:
      self.log.error( result['Message'] )
      return S_ERROR( 'Can not get job attributes from JobDB' )
    jobDict = result['Value']
    reCounter = int( jobDict['RescheduleCounter'] )
    if reCounter != 0 :
      reTime = fromString( jobDict['RescheduleTime'] )
      delta = toEpoch() - toEpoch( reTime )
      delay = self.maxRescheduleDelay
      if reCounter <= len( self.rescheduleDelaysList ):
        delay = self.rescheduleDelaysList[reCounter - 1]
      if delta < delay:
        if jobDict['ApplicationStatus'].find( 'On Hold: after rescheduling' ) == -1:
          result = self.jobDB.setJobStatus( job, application = 'On Hold: after rescheduling #%d' % reCounter )
        return S_OK()

    # First, get Site and BannedSites from the Job

    result = self.__getJobSiteRequirement( job, classAdJob )
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
      userSites = applySiteRequirements( userSites, [], userBannedSites )
      if not userSites:
        msg = 'Impossible Site Requirement'
        return S_ERROR( msg )

    # Second, get the Active and Banned sites from the WMS

    wmsSites = self.jobDB.getSiteMask( 'Active' )
    wmsBannedSites = self.jobDB.getSiteMask( 'Banned' )
    if not ( wmsSites['OK'] and wmsBannedSites['OK'] ):
      if not wmsSites['OK']:
        self.log.error( wmsSites['Message'] )
      if not wmsBannedSites['OK']:
        self.log.error( wmsBannedSites['Message'] )
      return S_ERROR( 'Can not get Active and Banned Sites from JobDB' )

    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']

    if userSites:
      sites = applySiteRequirements( userSites, wmsSites, wmsBannedSites )
      if not sites:
        # Put on Hold only non-excluded job types
        jobType = classAdJob.getAttributeString( 'JobType' )
        if not jobType in self.excludedOnHoldJobTypes:
          msg = 'On Hold: Requested site is Banned or not Active'
          self.log.info( msg )
          result = self.jobDB.setJobStatus( job, application = msg )
          return S_OK()


    # Third, check if there is input data
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobDB for %s' % ( job ) )
      self.log.error( result['Message'] )
      return S_ERROR( 'Failed to get input data from JobDB' )

    if not result['Value']:
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    hasInputData = False
    inputData = []
    for lfn in result['Value']:
      if lfn:
        inputData.append( lfn )
        hasInputData = True

    if not hasInputData:
      #With no input data requirement, job can proceed directly to task queue
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.__sendJobToTaskQueue( job, classAdJob, userSites, userBannedSites )

    self.log.verbose( 'Job %s has an input data requirement ' % ( job ) )

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo( job )
    if not result['OK']:
      return result

    optInfo = result['Value']

    #Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info( 'Input Data Site Candidates: %s' % ( ', '.join( optSites ) ) )
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements( optSites, userSites, userBannedSites )
    if not optSites:
      msg = 'Impossible Site + InputData Requirement'
      return S_ERROR( msg )

    sites = applySiteRequirements( optSites, wmsSites, wmsBannedSites )
    if not sites:
      msg = 'On Hold: InputData Site is Banned or not Active'
      self.log.info( msg )
      result = self.jobDB.setJobStatus( job, application = msg )
      return S_OK()

    #Set stager request as necessary, optimize for smallest #files on tape if
    #more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging( job, sites, inputData, optInfo['SiteCandidates'] )
    if not checkStaging['OK']:
      return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
      return S_ERROR( 'No destination sites available' )

    stagingFlag = checkStaging['Value']
    if stagingFlag:
      #Single site candidate chosen and staging required
      self.log.verbose( 'Job %s requires staging of input data' % ( job ) )
      # set all LFN to disk for the selected site
      stagingSite = destinationSites[0]
      siteDict = optInfo['SiteCandidates'][stagingSite]
      siteDict['disk'] = siteDict['disk'] + siteDict['tape']
      siteDict['tape'] = 0

      optInfo['SiteCandidates'][stagingSite] = siteDict
      self.log.verbose( 'Updating %s Optimizer Info for Job %s:' % ( self.dataAgentName, job ), optInfo )
      result = self.setOptimizerJobInfo( job, self.dataAgentName, optInfo )
      if not result['OK']:
        return result

      # Site is selected for staging, report it
      self.log.verbose( 'Staging site candidate for job %s is %s' % ( job, stagingSite ) )

      result = self.__getStagingSites(stagingSite,destinationSites)
      if not result['OK']:
        stagingSites = [stagingSite]
      else:
        stagingSites = result['Value']  

      if len( stagingSites ) == 1:
        self.jobDB.setJobAttribute( job, 'Site', stagingSite )
      else:
        # Get the name of the site group
        result = self.__getSiteGroup(stagingSites)
        if result['OK']:
          groupName = result['Value']
          if groupName:
            self.jobDB.setJobAttribute( job, 'Site', groupName )
          else:    
            self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )
        else:
          self.jobDB.setJobAttribute( job, 'Site', 'Multiple' )    

      stagerDict = self.__setStagingRequest( job, stagingSite, optInfo )
      if not stagerDict['OK']:
        return stagerDict
      self.__updateOtherSites( job, stagingSite, stagerDict['Value'], optInfo )
      return S_OK()
    else:
      #No staging required, can proceed to task queue agent and then waiting status
      self.log.verbose( 'Job %s does not require staging of input data' % ( job ) )
    #Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue( job, classAdJob, destinationSites, userBannedSites )
Code example #40
0
File: StalledJobAgent.py Project: DIRACGrid/DIRAC
    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job
"""
        try:
            accountingReport = Job()
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"

            result = self.jobDB.getJobAttributes(jobID)
            if not result["OK"]:
                return result
            jobDict = result["Value"]

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            cpuNormalization = self.jobDB.getJobParameter(jobID, "CPUNormalizationFactor")
            if not cpuNormalization["OK"] or not cpuNormalization["Value"]:
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(cpuNormalization["Value"])
        except Exception:
            self.log.exception(
                "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
                % (str(jobID), str(endTime), str(lastHeartBeatTime)),
                "",
                False,
            )
            return S_ERROR("Exception")
        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            "Site": jobDict["Site"],
            "User": jobDict["Owner"],
            "UserGroup": jobDict["OwnerGroup"],
            "JobGroup": jobDict["JobGroup"],
            "JobType": jobDict["JobType"],
            "JobClass": jobDict["JobSplitType"],
            "ProcessingType": processingType,
            "FinalMajorStatus": "Failed",
            "FinalMinorStatus": "Stalled",
            "CPUTime": lastCPUTime,
            "NormCPUTime": lastCPUTime * cpuNormalization,
            "ExecTime": lastWallTime,
            "InputDataSize": 0.0,
            "OutputDataSize": 0.0,
            "InputDataFiles": 0,
            "OutputDataFiles": 0,
            "DiskSpace": 0.0,
            "InputSandBoxSize": 0.0,
            "OutputSandBoxSize": 0.0,
            "ProcessedEvents": 0,
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData["ExecTime"]:
            acData["ExecTime"] = acData["CPUTime"]
        elif acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)

        result = accountingReport.commit()
        if result["OK"]:
            self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        else:
            self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"]))
        return result
Code example #41
0
File: KeystoneClient.py Project: DIRACGrid/DIRAC
    def __getToken2(self):
        """Get the Keystone token for the version v2 of the keystone service

        :return: S_OK(token) or S_ERROR
        """

        user = self.parameters.get("User")
        password = self.parameters.get("Password")
        authArgs = {}
        if user and password:
            authDict = {
                "auth": {
                    "passwordCredentials": {
                        "username": user,
                        "password": password
                    }
                }
            }
            if self.project:
                authDict["auth"]["tenantName"] = self.project
        elif self.parameters.get("Auth") == "voms":
            authDict = {"auth": {"voms": True}}
            if self.project:
                authDict["auth"]["tenantName"] = self.project

            if self.parameters.get("Proxy"):
                authArgs["cert"] = self.parameters.get("Proxy")

        try:
            result = requests.post(
                "%s/tokens" % self.url,
                headers={"Content-Type": "application/json"},
                json=authDict,
                verify=self.caPath,
                **authArgs)
        except Exception as exc:
            return S_ERROR("Exception getting keystone token: %s" % str(exc))

        output = result.json()

        if result.status_code in [400, 401]:
            message = "None"
            if "error" in output:
                message = output["error"].get("message")
            return S_ERROR("Authorization error: %s" % message)

        self.token = str(output["access"]["token"]["id"])
        expires = fromString(
            str(output["access"]["token"]["expires"]).replace("T",
                                                              " ").replace(
                                                                  "Z", ""))
        issued = fromString(
            str(output["access"]["token"]["issued_at"]).replace("T",
                                                                " ").replace(
                                                                    "Z", ""))
        self.expires = dateTime() + (expires - issued)

        self.projectID = output["access"]["token"]["tenant"]["id"]

        for endpoint in output["access"]["serviceCatalog"]:
            if endpoint["type"] == "compute":
                self.computeURL = str(endpoint["endpoints"][0]["publicURL"])
            elif endpoint["type"] == "image":
                self.imageURL = str(endpoint["endpoints"][0]["publicURL"])
            elif endpoint["type"] == "network":
                self.networkURL = str(endpoint["endpoints"][0]["publicURL"])
        return S_OK(self.token)
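
__getToken2 derives the token expiry relative to the local clock: it takes the server-side lifetime (expires minus issued_at) and adds it to the local time, which keeps the check independent of any offset between the local and the Keystone clocks. A minimal sketch of that computation on plain ISO 8601 strings (field names mirror the Keystone v2 payload; parsing is simplified):

from datetime import datetime

def local_expiry(issued_at, expires):
    """Convert a server-reported token lifetime into an expiry on the local clock."""
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    lifetime = datetime.strptime(expires, fmt) - datetime.strptime(issued_at, fmt)
    return datetime.utcnow() + lifetime

print(local_expiry("2023-01-01T12:00:00Z", "2023-01-01T13:00:00Z"))
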
Code example #42
0
File: JobScheduling.py Project: Eo300/DIRAC
    def optimizeJob(self, jid, jobState):
        """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary
    """
        # Reschedule delay
        result = jobState.getAttributes(
            ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            return result
        attDict = result['Value']
        try:
            reschedules = int(attDict['RescheduleCounter'])
        except (ValueError, KeyError):
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
            delay = delays[min(reschedules, len(delays) - 1)]
            waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
            if waited < delay:
                return self.__holdJob(
                    jobState, 'On Hold: after rescheduling %s' % reschedules,
                    delay)

        # Get the job manifest for the later checks
        result = jobState.getManifest()
        if not result['OK']:
            return S_ERROR("Could not retrieve job manifest: %s" %
                           result['Message'])
        jobManifest = result['Value']

        # Get site requirements
        result = self.__getSitesRequired(jobManifest)
        if not result['OK']:
            return result
        userSites, userBannedSites = result['Value']

        # Get job type
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']

        # Get banned sites from DIRAC
        result = self.siteClient.getSites('Banned')
        if not result['OK']:
            return S_ERROR("Cannot retrieve banned sites from JobDB")
        wmsBannedSites = result['Value']

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):

                result = self.siteClient.getUsableSites(userSites)
                if not result['OK']:
                    return S_ERROR(
                        "Problem checking userSites for tuple of active/banned/invalid sites"
                    )
                usableSites = set(result['Value'])
                bannedSites = []
                invalidSites = []
                for site in userSites:
                    if site in wmsBannedSites:
                        bannedSites.append(site)
                    elif site not in usableSites:
                        invalidSites.append(site)

                if invalidSites:
                    self.jobLog.debug("Invalid site(s) requested: %s" %
                                      ','.join(invalidSites))
                    if not self.ex_getOption('AllowInvalidSites', True):
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are invalid" %
                            ",".join(invalidSites))
                if bannedSites:
                    self.jobLog.debug("Banned site(s) %s ignored" %
                                      ",".join(bannedSites))
                    if not usableSites:
                        return self.__holdJob(
                            jobState, "Requested site(s) %s are inactive" %
                            ",".join(bannedSites))

                if not usableSites:
                    return self.__holdJob(
                        jobState, "No requested site(s) are active/valid")
                userSites = list(usableSites)

        checkPlatform = self.ex_getOption('CheckPlatform', False)
        jobPlatform = jobManifest.getOption("Platform", None)
        # First check that the platform is valid (in OSCompatibility list)
        if checkPlatform and jobPlatform:
            result = gConfig.getOptionsDict(
                '/Resources/Computing/OSCompatibility')
            if not result['OK']:
                return S_ERROR("Unable to get OSCompatibility list")
            allPlatforms = result['Value']
            if jobPlatform not in allPlatforms:
                self.jobLog.error("Platform not supported", jobPlatform)
                return S_ERROR("Platform %s is not supported" % jobPlatform)

        # Filter the userSites by the platform selection (if there is one)
        if checkPlatform and userSites:
            if jobPlatform:
                result = self.__filterByPlatform(jobPlatform, userSites)
                if not result['OK']:
                    self.jobLog.error("Failed to filter job sites by platform",
                                      result['Message'])
                    return S_ERROR("Failed to filter job sites by platform")
                userSites = result['Value']
                if not userSites:
                    # No sites left after filtering -> Invalid platform/sites combination
                    self.jobLog.error("No selected sites match platform",
                                      jobPlatform)
                    return S_ERROR("No selected sites match platform '%s'" %
                                   jobPlatform)

        # Check if there is input data
        result = jobState.getInputData()
        if not result['OK']:
            self.jobLog.error("Cannot get input data", result['Message'])
            return S_ERROR("Failed to get input data from JobDB")

        if not result['Value']:
            # No input data? Just send to TQ
            return self.__sendToTQ(jobState, jobManifest, userSites,
                                   userBannedSites)

        self.jobLog.verbose("Has an input data requirement")
        inputData = result['Value']

        # ===================================================================================
        # Production jobs are sent to TQ, but first we have to verify if staging is necessary
        # ===================================================================================
        if jobType in Operations().getValue('Transformations/DataProcessing',
                                            []):
            self.jobLog.info(
                "Production job: sending to TQ, but first checking if staging is requested"
            )

            res = getFilesToStage(inputData,
                                  jobState=jobState,
                                  checkOnlyTapeSEs=self.ex_getOption(
                                      'CheckOnlyTapeSEs', True),
                                  jobLog=self.jobLog)

            if not res['OK']:
                return self.__holdJob(jobState, res['Message'])
            if res['Value']['absentLFNs']:
                # Some files do not exist at all... set the job Failed
                # Reverse errors
                reasons = {}
                for lfn, reason in res['Value']['absentLFNs'].iteritems():
                    reasons.setdefault(reason, []).append(lfn)
                for reason, lfns in reasons.iteritems():
                    # Some files are missing in the FC or in SEs, fail the job
                    self.jobLog.error(reason, ','.join(lfns))
                error = ','.join(reasons)
                return S_ERROR(error)

            if res['Value']['failedLFNs']:
                return self.__holdJob(
                    jobState, "Couldn't get storage metadata of some files")
            stageLFNs = res['Value']['offlineLFNs']
            if stageLFNs:
                res = self.__checkStageAllowed(jobState)
                if not res['OK']:
                    return res
                if not res['Value']:
                    return S_ERROR("Stage not allowed")
                self.__requestStaging(jobState, stageLFNs)
                return S_OK()
            else:
                # No staging required
                onlineSites = res['Value']['onlineSites']
                if onlineSites:
                    # Set the online site(s) first
                    userSites = set(userSites)
                    onlineSites &= userSites
                    userSites = list(onlineSites) + list(userSites -
                                                         onlineSites)
                return self.__sendToTQ(jobState,
                                       jobManifest,
                                       userSites,
                                       userBannedSites,
                                       onlineSites=onlineSites)

        # ===================================================
        # From now on we know it's a user job with input data
        # ===================================================

        idAgent = self.ex_getOption('InputDataAgent', 'InputData')
        result = self.retrieveOptimizerParam(idAgent)
        if not result['OK']:
            self.jobLog.error("Could not retrieve input data info",
                              result['Message'])
            return S_ERROR("Could not retrieve input data info")
        opData = result['Value']

        if 'SiteCandidates' not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData['SiteCandidates'])
        self.jobLog.info("Site candidates are %s" % siteCandidates)

        if userSites:
            siteCandidates = list(set(siteCandidates) & set(userSites))

        siteCandidates = self._applySiteFilter(siteCandidates,
                                               banned=userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData['SiteCandidates'][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]['disk'] + idSites[site]['tape']:
                self.jobLog.error(
                    "Site candidate %s does not have all the input data" %
                    site)
                errorSites.add(site)
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates,
                                           banned=wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, jobManifest, siteCandidates,
                                   userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be %s" % (stageSite))
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData['disk'] += stageData['tape']
        stageData['tape'] = 0
        # Set the site info back to the original dict to save afterwards
        opData['SiteCandidates'][stageSite] = stageData

        stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
        if not stageRequest['OK']:
            return stageRequest
        stageLFNs = stageRequest['Value']
        result = self.__requestStaging(jobState, stageLFNs)
        if not result['OK']:
            return result
        stageLFNs = result['Value']
        self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
        result = self.storeOptimizerParam(idAgent, opData)
        if not result['OK']:
            return result

        return self.__setJobSite(jobState, stageSites)
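
The reschedule handling at the top of optimizeJob is a capped back-off: after the n-th reschedule the job is held until delays[min(n, len(delays) - 1)] seconds have passed since RescheduleTime. A small self-contained illustration of that rule follows; the function name and the epoch-based arguments are ours, not part of the DIRAC executor.

# Illustrative back-off rule only; names and arguments are hypothetical.
import time


def reschedule_hold(reschedule_counter, reschedule_epoch, delays=(60, 180, 300, 600)):
    """Seconds the job must still wait, or 0 if it may be optimized again.

    The delay grows with the reschedule counter and is capped at the last
    entry of delays, mirroring the RescheduleDelays option used above.
    """
    if reschedule_counter == 0:
        return 0
    delay = delays[min(reschedule_counter, len(delays) - 1)]
    waited = time.time() - reschedule_epoch
    return max(0, int(delay - waited))


# Third reschedule, two minutes after the last one: roughly 480 s still to wait.
print(reschedule_hold(3, time.time() - 120))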
Code example #43
    def optimizeJob(self, jid, jobState):
        # Reschedule delay
        result = jobState.getAttributes(
            ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
        if not result['OK']:
            return result
        attDict = result['Value']
        try:
            reschedules = int(attDict['RescheduleCounter'])
        except ValueError:
            return S_ERROR("RescheduleCounter has to be an integer")
        if reschedules != 0:
            delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
            delay = delays[min(reschedules, len(delays) - 1)]
            waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
            if waited < delay:
                return self.__holdJob(
                    jobState, 'On Hold: after rescheduling %s' % reschedules,
                    delay)

        # Get site requirements
        result = self._getSitesRequired(jobState)
        if not result['OK']:
            return result
        userSites, userBannedSites = result['Value']

        # Get active and banned sites from DIRAC
        result = self.__jobDB.getSiteMask('Active')
        if not result['OK']:
            return S_ERROR("Cannot retrieve active sites from JobDB")
        wmsActiveSites = result['Value']
        result = self.__jobDB.getSiteMask('Banned')
        if not result['OK']:
            return S_ERROR("Cannot retrieve banned sites from JobDB")
        wmsBannedSites = result['Value']

        # If the user has selected any site, filter them and hold the job if not able to run
        if userSites:
            result = jobState.getAttribute("JobType")
            if not result['OK']:
                return S_ERROR("Could not retrieve job type")
            jobType = result['Value']
            if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
                sites = self._applySiteFilter(userSites, wmsActiveSites,
                                              wmsBannedSites)
                if not sites:
                    return self.__holdJob(
                        jobState, "Sites %s are inactive or banned" %
                        ", ".join(userSites))

        # Get the Input data
        # Third, check if there is input data
        result = jobState.getInputData()
        if not result['OK']:
            self.jobLog.error("Cannot get input data %s" % (result['Message']))
            return S_ERROR('Failed to get input data from JobDB')

        if not result['Value']:
            # No input data? Generate requirements and next
            return self.__sendToTQ(jobState, userSites, userBannedSites)

        inputData = result['Value']

        self.jobLog.verbose('Has an input data requirement')
        idAgent = self.ex_getOption('InputDataAgent', 'InputData')
        result = self.retrieveOptimizerParam(idAgent)
        if not result['OK']:
            self.jobLog.error("Could not retrieve input data info: %s" %
                              result['Message'])
            return S_ERROR("File Catalog Access Failure")
        opData = result['Value']
        if 'SiteCandidates' not in opData:
            return S_ERROR("No possible site candidates")

        # Filter input data sites with user requirement
        siteCandidates = list(opData['SiteCandidates'])
        self.jobLog.info("Site candidates are %s" % siteCandidates)

        siteCandidates = self._applySiteFilter(siteCandidates, userSites,
                                               userBannedSites)
        if not siteCandidates:
            return S_ERROR("Impossible InputData * Site requirements")

        idSites = {}
        for site in siteCandidates:
            idSites[site] = opData['SiteCandidates'][site]

        # Check if sites have correct count of disk+tape replicas
        numData = len(inputData)
        errorSites = set()
        for site in idSites:
            if numData != idSites[site]['disk'] + idSites[site]['tape']:
                self.jobLog.error(
                    "Site candidate %s does not have all the input data" %
                    site)
                errorSites.add(site)
        for site in errorSites:
            idSites.pop(site)
        if not idSites:
            return S_ERROR("Site candidates do not have all the input data")

        # Check if staging is required
        stageRequired, siteCandidates = self.__resolveStaging(
            jobState, inputData, idSites)
        if not siteCandidates:
            return S_ERROR("No destination sites available")

        # Is any site active?
        stageSites = self._applySiteFilter(siteCandidates, wmsActiveSites,
                                           wmsBannedSites)
        if not stageSites:
            return self.__holdJob(
                jobState,
                "Sites %s are inactive or banned" % ", ".join(siteCandidates))

        # If no staging is required send to TQ
        if not stageRequired:
            # Use siteCandidates and not stageSites because active and banned sites
            # will be taken into account on matching time
            return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

        # Check if the user is allowed to stage
        if self.ex_getOption("RestrictDataStage", False):
            if not self.__checkStageAllowed(jobState):
                return S_ERROR("Stage not allowed")

        # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
        stageSite = stageSites[0]
        self.jobLog.verbose(" Staging site will be %s" % (stageSite))
        stageData = idSites[stageSite]
        # Set as if everything has already been staged
        stageData['disk'] += stageData['tape']
        stageData['tape'] = 0
        # Set the site info back to the original dict to save afterwards
        opData['SiteCandidates'][stageSite] = stageData

        result = self.__requestStaging(jobState, stageSite, opData)
        if not result['OK']:
            return result
        stageLFNs = result['Value']
        self._updateSharedSESites(stageSite, stageLFNs, opData)
        # Save the optimizer data again
        self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
        result = self.storeOptimizerParam(idAgent, opData)
        if not result['OK']:
            return result

        return self._setJobSite(jobState, stageSites)
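
This older variant of optimizeJob narrows the destination list with plain set arithmetic: the user's sites are intersected with the WMS active mask and the banned sites are subtracted, first to validate the user's choice and again when picking staging sites. A hypothetical stand-in for that filter (not DIRAC's _applySiteFilter) shows the idea:

# Hypothetical helper illustrating the active/banned filtering above.
def filter_sites(candidates, active=None, banned=None):
    """Keep the candidates that are in the active mask (if given) and not banned."""
    sites = set(candidates)
    if active is not None:
        sites &= set(active)
    if banned:
        sites -= set(banned)
    return sorted(sites)


userSites = ["LCG.CERN.ch", "LCG.CNAF.it", "LCG.PIC.es"]
wmsActiveSites = ["LCG.CERN.ch", "LCG.PIC.es"]
wmsBannedSites = ["LCG.PIC.es"]
print(filter_sites(userSites, wmsActiveSites, wmsBannedSites))  # ['LCG.CERN.ch']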
Code example #44
    def export_checkComponentLog(self, component):
        """ Check component log for errors
    """
        componentList = []
        if '*' in component:
            if component == '*':
                result = gComponentInstaller.getSetupComponents()
                if result['OK']:
                    for ctype in ['Services', 'Agents', 'Executors']:
                        if ctype in result['Value']:
                            for sname in result['Value'][ctype]:
                                for cname in result['Value'][ctype][sname]:
                                    componentList.append('/'.join(
                                        [sname, cname]))
        elif isinstance(component, basestring):
            componentList = [component]
        else:
            componentList = component

        resultDict = {}
        for comp in componentList:
            if '/' not in comp:
                continue
            system, cname = comp.split('/')

            startDir = gComponentInstaller.startDir
            currentLog = startDir + '/' + system + '_' + cname + '/log/current'
            try:
                logFile = file(currentLog, 'r')
            except IOError as err:
                gLogger.error("File does not exists:", currentLog)
                resultDict[comp] = {
                    'ErrorsHour': -1,
                    'ErrorsDay': -1,
                    'LastError': currentLog + '::' + repr(err)
                }
                continue

            logLines = logFile.readlines()
            logFile.close()

            errors_1 = 0
            errors_24 = 0
            now = dateTime()
            lastError = ''
            for line in logLines:
                if "ERROR:" in line:
                    fields = line.split()
                    recent = False
                    if len(fields) < 2:  # if the line contains only one word
                        lastError = line.split('ERROR:')[-1].strip()
                        continue
                    timeStamp = fromString(fields[0] + ' ' + fields[1])
                    if not timeStamp:  # if the timestamp is missing in the log
                        lastError = line.split('ERROR:')[-1].strip()
                        continue
                    if (now - timeStamp) < hour:
                        errors_1 += 1
                        recent = True
                    if (now - timeStamp) < day:
                        errors_24 += 1
                        recent = True
                    if recent:
                        lastError = line.split('ERROR:')[-1].strip()

            resultDict[comp] = {
                'ErrorsHour': errors_1,
                'ErrorsDay': errors_24,
                'LastError': lastError
            }

        return S_OK(resultDict)
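
export_checkComponentLog scans the component's log/current file, parses the timestamp at the start of every line containing "ERROR:", counts how many fall inside the last hour and the last day, and keeps the most recent message. The same counting loop written against the standard library; the timestamp format and the sample line are assumptions about how the component logs look.

# Standard-library version of the counting loop; format and sample are assumed.
from datetime import datetime, timedelta


def count_recent_errors(lines, now=None, fmt="%Y-%m-%d %H:%M:%S"):
    """Count ERROR lines of the last hour and day, keeping the latest message."""
    now = now or datetime.utcnow()
    errors_hour = errors_day = 0
    last_error = ""
    for line in lines:
        if "ERROR:" not in line:
            continue
        fields = line.split()
        try:
            stamp = datetime.strptime(fields[0] + " " + fields[1], fmt)
        except (IndexError, ValueError):
            # No usable timestamp on the line: keep the message, do not count it.
            last_error = line.split("ERROR:")[-1].strip()
            continue
        if now - stamp < timedelta(hours=1):
            errors_hour += 1
        if now - stamp < timedelta(days=1):
            errors_day += 1
            last_error = line.split("ERROR:")[-1].strip()
    return {"ErrorsHour": errors_hour, "ErrorsDay": errors_day, "LastError": last_error}


sample = ["2024-05-01 10:00:00 UTC Framework ERROR: connection refused"]
print(count_recent_errors(sample, now=datetime(2024, 5, 1, 10, 30)))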
Code example #45
File: JobScheduling.py  Project: andresailer/DIRAC
  def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
      return result
    attDict = result['Value']
    try:
      reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError):
      return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
      delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
      delay = delays[min(reschedules, len(delays) - 1)]
      waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
      if waited < delay:
        return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
      return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
      return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
      return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
      return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
      if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):

        result = self.siteClient.getUsableSites(userSites)
        if not result['OK']:
          return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
        usableSites = set(result['Value'])
        bannedSites = []
        invalidSites = []
        for site in userSites:
          if site in wmsBannedSites:
            bannedSites.append(site)
          elif site not in usableSites:
            invalidSites.append(site)

        if invalidSites:
          self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
          if not self.ex_getOption('AllowInvalidSites', True):
            return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
        if bannedSites:
          self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
          if not usableSites:
            return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

        if not usableSites:
          return self.__holdJob(jobState, "No requested site(s) are active/valid")
        userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
      result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
      if not result['OK']:
        return S_ERROR("Unable to get OSCompatibility list")
      allPlatforms = result['Value']
      if jobPlatform not in allPlatforms:
        self.jobLog.error("Platform %s is not supported" % jobPlatform)
        return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
      if jobPlatform:
        result = self.__filterByPlatform(jobPlatform, userSites)
        if not result['OK']:
          self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
          return S_ERROR("Failed to filter job sites by platform")
        userSites = result['Value']
        if not userSites:
          # No sites left after filtering -> Invalid platform/sites combination
          self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
          return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
      self.jobLog.error("Cannot get input data %s" % (result['Message']))
      return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
      # No input data? Just send to TQ
      return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
      self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

      res = getFilesToStage(inputData,
                            jobState=jobState,
                            checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                            jobLog=self.jobLog)

      if not res['OK']:
        return self.__holdJob(jobState, res['Message'])
      if res['Value']['absentLFNs']:
        # Some files do not exist at all... set the job Failed
        # Reverse errors
        reasons = {}
        for lfn, reason in res['Value']['absentLFNs'].iteritems():
          reasons.setdefault(reason, []).append(lfn)
        for reason, lfns in reasons.iteritems():
          # Some files are missing in the FC or in SEs, fail the job
          self.jobLog.error(reason, ','.join(lfns))
        error = ','.join(reasons)
        return S_ERROR(error)

      if res['Value']['failedLFNs']:
        return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
      stageLFNs = res['Value']['offlineLFNs']
      if stageLFNs:
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
          return res
        if not res['Value']:
          return S_ERROR("Stage not allowed")
        self.__requestStaging(jobState, stageLFNs)
        return S_OK()
      else:
        # No staging required
        onlineSites = res['Value']['onlineSites']
        if onlineSites:
          # Set the online site(s) first
          userSites = set(userSites)
          onlineSites &= userSites
          userSites = list(onlineSites) + list(userSites - onlineSites)
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
      self.jobLog.error("Could not retrieve input data info", result['Message'])
      return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
      return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
      siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
      return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
      idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
      if numData != idSites[site]['disk'] + idSites[site]['tape']:
        self.jobLog.error("Site candidate %s does not have all the input data" % site)
        errorSites.add(site)
    for site in errorSites:
      idSites.pop(site)
    if not idSites:
      return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
      return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
      return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
      # Use siteCandidates and not stageSites because active and banned sites
      # will be taken into account on matching time
      return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
      res = self.__checkStageAllowed(jobState)
      if not res['OK']:
        return res
      if not res['Value']:
        return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
      return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
      return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
      return result

    return self.__setJobSite(jobState, stageSites)
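
A check common to all optimizeJob variants above is that a site remains a candidate only if its disk plus tape replica counts cover every input file; sites with partial data are dropped before staging is resolved. Stripped of the surrounding DIRAC state, the rule looks like this (helper name and data are illustrative, the real counts come from the SiteCandidates optimizer parameter):

# Illustrative helper and data; not part of the DIRAC optimizer itself.
def sites_with_all_data(site_replicas, n_input_files):
    """Keep only the sites whose disk + tape replica counts cover every input file."""
    return {site: counts for site, counts in site_replicas.items()
            if counts["disk"] + counts["tape"] == n_input_files}


candidates = {
    "LCG.CERN.ch": {"disk": 2, "tape": 1},   # all three files reachable
    "LCG.CNAF.it": {"disk": 1, "tape": 1},   # one file missing
}
print(sites_with_all_data(candidates, 3))    # only LCG.CERN.ch survives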